File size: 4,223 Bytes
3c65a2f
a2780b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# import openai
import json
import os
from typing import Dict
from typing import List
from typing import Optional

from langchain.chains.mapreduce import MapReduceChain
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter


class Extractor:

    """
    Summarize a PDF document with a LangChain "refine" summarization chain.

    The pipeline has three stages: load the PDF into documents, split those
    documents into token-bounded chunks, and summarize the chunks iteratively.

    Every stage is best-effort: on failure it prints a diagnostic message and
    returns ``None`` instead of raising, so callers must check for ``None``.
    The class keeps no instance state.
    """

    # Prompt used to summarize the first chunk. Built from adjacent literals
    # so that source-code indentation cannot leak into the prompt text.
    _QUESTION_TEMPLATE = (
        "Write a concise summary of the following:\n"
        "{text}\n"
        "CONCISE SUMMARY:"
    )

    # Prompt used to fold each subsequent chunk into the running summary.
    # Each literal ends with an explicit separator (space or newline) so that
    # neighbouring sentences are not fused together when concatenated.
    _REFINE_TEMPLATE = (
        "Your job is to produce a final summary\n"
        "We have provided an existing summary up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing summary "
        "(only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary.\n"
        "If the context isn't useful, return the original summary."
    )

    def __init__(self):
        """
        Initialize the Extractor class.

        Nothing is configured here. The OpenAI API key is expected to be
        supplied by the environment (e.g. ``OPENAI_API_KEY``) before
        ``_refine_summary`` is called.
        """

    @staticmethod
    def _resolve_pdf_path(pdf_file_path) -> str:
        """
        Return the filesystem path for the given PDF reference.

        Args:
            pdf_file_path: A ``str`` path, an ``os.PathLike`` object, or an
                upload-style object exposing the path as ``.name``.

        Returns:
            str: The path to hand to the PDF loader.

        Raises:
            AttributeError: If the argument is neither path-like nor has a
                ``name`` attribute.
        """
        # Path-like values must be checked first: pathlib.Path also has a
        # ``.name`` attribute, but it holds only the basename, not the path.
        if isinstance(pdf_file_path, (str, os.PathLike)):
            return os.fspath(pdf_file_path)
        return pdf_file_path.name

    def _document_loader(self, pdf_file_path) -> Optional[List]:
        """
        Load the PDF and split it into documents.

        Args:
            pdf_file_path: Path to the PDF, or an object whose ``.name`` is
                that path (see ``_resolve_pdf_path``).

        Returns:
            Optional[List]: The documents produced by
            ``PyPDFLoader.load_and_split()``, or ``None`` if loading failed.
        """
        try:
            loader = PyPDFLoader(self._resolve_pdf_path(pdf_file_path))
            return loader.load_and_split()

        except Exception as e:
            # Best-effort boundary: report the problem and signal failure
            # with None rather than propagating the exception.
            print(f"Error while loading and splitting the document: {str(e)}")
            return None

    def _document_text_spilliter(self, pdf_file_path) -> Optional[List]:
        """
        Split the document text into smaller chunks.

        Args:
            pdf_file_path: Path to the PDF, or an object whose ``.name`` is
                that path.

        Returns:
            Optional[List]: The chunked documents (1000-token chunks with a
            200-token overlap), or ``None`` if loading or splitting failed.
        """
        # Load the document texts
        docs = self._document_loader(pdf_file_path)
        if docs is None:
            # The loader already reported the failure; do not pile a second,
            # misleading error on top of it.
            return None

        try:
            # Initialize the text splitter with specified chunk size and overlap
            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
                chunk_size=1000, chunk_overlap=200
            )

            # Split the documents into chunks
            return text_splitter.split_documents(docs)

        except Exception as e:
            print(f"Error while splitting document text: {str(e)}")
            return None

    def _refine_summary(self, pdf_file_path) -> Optional[str]:
        """
        Generate a refined summary of the document using language models.

        Args:
            pdf_file_path: Path to the PDF, or an object whose ``.name`` is
                that path.

        Returns:
            Optional[str]: The final summary text, or ``None`` if any stage
            failed or the document produced no text chunks.
        """
        # Split documents into chunks for efficient processing
        split_docs = self._document_text_spilliter(pdf_file_path)
        if split_docs is None:
            # An earlier stage failed and has already printed its error.
            return None
        if not split_docs:
            # The refine chain needs at least one chunk to seed the summary.
            print("Error while generating refined summary: no text found in the document")
            return None

        try:
            prompt = PromptTemplate.from_template(self._QUESTION_TEMPLATE)
            refine_prompt = PromptTemplate.from_template(self._REFINE_TEMPLATE)

            # Load the summarization chain using the ChatOpenAI language model
            chain = load_summarize_chain(
                llm=ChatOpenAI(temperature=0),
                chain_type="refine",
                question_prompt=prompt,
                refine_prompt=refine_prompt,
                return_intermediate_steps=True,
                input_key="input_documents",
                output_key="output_text",
            )

            # Generate the refined summary using the loaded summarization chain
            result = chain({"input_documents": split_docs}, return_only_outputs=True)

            return result["output_text"]

        except Exception as e:
            print(f"Error while generating refined summary: {str(e)}")
            return None