File size: 3,670 Bytes
a2780b1
 
4ec3e55
a2780b1
0c0ecb2
a2780b1
4ec3e55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2780b1
 
f0ab5f2
a2780b1
 
 
1fe0ea3
a2780b1
 
 
 
 
 
 
 
 
 
 
 
4ec3e55
a2780b1
 
893ed50
a2780b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ec3e55
a2780b1
 
 
4ec3e55
a2780b1
 
 
 
 
 
 
 
 
 
4ec3e55
a2780b1
 
afcf7a1
a2780b1
 
 
 
 
 
 
 
 
 
 
 
 
 
4ec3e55
a2780b1
 
4ec3e55
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os
import PyPDF2
import logging
from langchain import PromptTemplate, LLMChain
from langchain_openai import AzureChatOpenAI

# Logging setup: records are appended to a log file using a detailed line format.
logging.basicConfig(
    filename='pdftojson.log',  # Log file name; change it here if needed
    filemode='a',
    format='[%(asctime)s] [%(levelname)s] [%(filename)s] [%(lineno)s:%(funcName)s()] %(message)s',
    datefmt='%Y-%b-%d %H:%M:%S'
)
LOGGER = logging.getLogger(__name__)

log_level_env = 'INFO'  # Name of the desired log level; change it here if needed

# Accepted level names mapped to the matching constants of the logging module.
log_level_dict = {
    level_name: getattr(logging, level_name)
    for level_name in ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL')
}

# An unrecognised level name falls back to INFO.
log_level = log_level_dict.get(log_level_env, log_level_dict['INFO'])
LOGGER.setLevel(log_level)

class PdftoJson:
    """
    Turn the text of a PDF into topic/content JSON produced by a chat model.

    Text is extracted page by page with PyPDF2 and each page is sent to an
    Azure OpenAI chat deployment through a LangChain ``LLMChain``.
    """

    def __init__(self):
        """
        Create a converter instance.

        No state is stored; the language-model client is built inside
        ``_get_json`` on every call.
        """

    def _get_json(self, input_text: str) -> str:
        """
        Generate JSON result by analyzing and splitting input text into topics and content.

        Args:
            input_text (str): Text to be analyzed.

        Returns:
            str: The model output for the prompt (requested as JSON containing
            topics and content). None if the model call failed; the error is
            logged with its traceback instead of being raised.
        """
        try:
            LOGGER.info("Generating JSON result by analyzing input text...")

            # Chat model client bound to the "GPT-3" Azure deployment
            llm = AzureChatOpenAI(azure_deployment="GPT-3")

            # Prompt instructing the model to split the text into topics and content
            template = """
            Your task is Get the text and analyse and split it into Topics and Content in json format.Give Proper Name to Topic dont give any Numbers and Dont Give any empty Contents.The Output Format Should Be very good.

            {text}
            """
            prompt = PromptTemplate(template=template, input_variables=["text"])

            # Chain the prompt and the model, then run it on the given text
            llm_chain = LLMChain(prompt=prompt, llm=llm)
            json_result = llm_chain.run(input_text)

            LOGGER.info("Generated JSON result successfully.")
            return json_result

        except Exception as e:
            # Best-effort contract: log (with traceback) and signal failure with None
            LOGGER.exception("Error occurred while generating JSON result: %s", e)
            return None

    def extract_text_from_pdf(self, pdf_path: str):
        """
        Extract text from every page of a PDF and generate a JSON result per page.

        Args:
            pdf_path (str): Path to the PDF file. An object exposing the path
                through a ``name`` attribute (the only form the previous
                implementation worked with) is accepted as well.

        Returns:
            str: The per-page results joined by newlines, in page order; for a
            single-page PDF this is exactly that page's result. None if no
            page produced a result or an error occurred (errors are logged,
            not raised).
        """
        try:
            # NOTE: log wording kept from the original; nothing is written to disk here.
            LOGGER.info("Extracting text from PDF, generating JSON result, and saving to a file...")

            # Real paths are used directly; any other object is expected to carry
            # the path in `.name`. The isinstance check matters because
            # pathlib.Path also has a `.name`, which is only the final component.
            if isinstance(pdf_path, (str, bytes, os.PathLike)):
                source_path = pdf_path
            else:
                source_path = pdf_path.name

            # Read all page text first so the file is closed before any model call
            with open(source_path, "rb") as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

            page_results = []
            for page_number, page_text in enumerate(page_texts, start=1):
                if not page_text.strip():
                    # Nothing to analyse on this page (e.g. no extractable text)
                    LOGGER.warning("Page %d has no extractable text; skipping.", page_number)
                    continue

                page_json = self._get_json(page_text)
                if page_json is None:
                    # _get_json already logged the failure; keep the other pages
                    LOGGER.warning("No JSON result for page %d; skipping.", page_number)
                    continue
                page_results.append(page_json)

            if not page_results:
                LOGGER.warning("No JSON result could be generated for the PDF.")
                return None

            LOGGER.info("Extraction, JSON generation, and saving completed.")
            return "\n".join(page_results)

        except Exception as e:
            # Best-effort contract: log (with traceback) and signal failure with None
            LOGGER.exception("Error occurred during extraction and processing: %s", e)
            return None