File size: 2,900 Bytes
4ec3e55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7bac9bd
4ec3e55
 
 
7bac9bd
4ec3e55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from PyPDF2 import PdfReader
import openai
import fitz  # PyMuPDF
import logging

# Logging setup: records are appended to a local file using a detailed,
# timestamped format.
logging.basicConfig(
    format='[%(asctime)s] [%(levelname)s] [%(filename)s] [%(lineno)s:%(funcName)s()] %(message)s',
    datefmt='%Y-%b-%d %H:%M:%S',
    filename='extract_date.log',  # Log file name; change it here if needed
    filemode='a',
)
LOGGER = logging.getLogger(__name__)

# Name of the desired verbosity; edit this value to change the log level.
log_level_env = 'INFO'

# Maps each supported level name to the matching logging-module constant.
log_level_dict = {
    level_name: getattr(logging, level_name)
    for level_name in ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL')
}

# An unrecognised level name falls back to INFO.
log_level = log_level_dict.get(log_level_env, log_level_dict['INFO'])
LOGGER.setLevel(log_level)

class ExtractDateAndDuration:
    """
    Extract dates and durations from contract text or from a PDF document.

    Text analysis is delegated to an OpenAI completion call; PDF text is
    obtained page by page through PdfReader.
    """

    def __init__(self):
        """
        Initialize the ExtractDateAndDuration class.

        The class keeps no instance state.
        """
        pass

    def get_date_and_duration(self, contract_text: str) -> str:
        """
        Extract dates and durations from the provided contract text.

        Args:
            contract_text (str): The text of the contract to analyze.

        Returns:
            str: The stripped text of the first completion choice. An empty
            string is returned when ``contract_text`` is empty, blank or
            None (no API call is made in that case), or when the API call
            fails (the error is logged).
        """
        # Nothing to analyze (e.g. a page with no extractable text): skip the
        # API call instead of prompting the model with an empty contract.
        if not contract_text or not contract_text.strip():
            return ""

        try:
            response = openai.Completion.create(
                engine="text-davinci-003",
                prompt=f"""Your task is Identify Dates and Durations Mentioned in the contract and extract that date and duration in key-value pair.
                ```contract: {contract_text}```
                """,
                max_tokens=300,
                temperature=0
            )
            return response.choices[0].text.strip()

        except Exception as e:
            LOGGER.error(f"An error occurred during text analysis: {str(e)}")
            # Always hand back a string so callers can concatenate results
            # safely; previously the method fell through and returned None.
            return ""

    def itrate_each_page(self, pdf_file_path: str) -> str:
        """
        Extract text from each page of a PDF document and process it.

        Args:
            pdf_file_path (str): The path to the PDF document. A
                ``str``/``os.PathLike`` path is used directly; any other
                object is expected to expose the path through a ``name``
                attribute (the only form the earlier implementation
                accepted).

        Returns:
            str: The dates and durations found on each page, one page result
            per line. Pages that yield nothing are skipped. An empty string
            is returned when the document cannot be processed (the error is
            logged).
        """
        import os  # Local import: only needed to recognise path-like arguments.

        try:
            # Accept a plain path as documented, while still supporting
            # objects that carry the path in ``.name``.
            if isinstance(pdf_file_path, (str, os.PathLike)):
                pdf_source = pdf_file_path
            else:
                pdf_source = pdf_file_path.name

            # Open the multi-page PDF using PdfReader
            pdf = PdfReader(pdf_source)

            page_results = []
            for page in pdf.pages:
                # Extract the page text and run the date/duration analysis on it
                page_result = self.get_date_and_duration(page.extract_text())
                if page_result:
                    page_results.append(page_result)

            # Separate page results with a newline so the last pair of one
            # page is not fused with the first pair of the next.
            return "\n".join(page_results)

        except Exception as e:
            LOGGER.error(f"An error occurred while processing the PDF document: {str(e)}")
            return ""