File size: 4,221 Bytes
a2780b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import json
import os
from typing import Dict
from typing import List
from typing import Optional

import openai
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter


class Extractor:
    """
    Summarize a PDF document with a LangChain "refine" summarization chain.

    The pipeline has three stages, each implemented by one private method:
    load the PDF into documents, split those documents into token-bounded
    chunks, and run a refine-style summarization chain over the chunks.

    Every stage is best-effort: a failure is printed and reported to the
    caller as ``None`` instead of being raised. A stage that receives
    ``None`` from the previous stage returns ``None`` immediately, so a
    single root cause produces a single error message.

    The instance holds no state; the input file is passed to each method.
    """

    # Chunking parameters (measured in tiktoken tokens) used by the splitter.
    CHUNK_SIZE = 1000
    CHUNK_OVERLAP = 200

    # Prompt used to summarize the first chunk. The leading spaces on the
    # continuation lines are kept so the prompt text sent to the model is
    # unchanged.
    SUMMARY_TEMPLATE = (
        "Write a concise summary of the following:\n"
        "            {text}\n"
        "            CONCISE SUMMARY:"
    )

    # Prompt used to fold each subsequent chunk into the running summary.
    # Adjacent literals are joined verbatim, so every fragment must end with
    # its own separator (space or newline).
    REFINE_TEMPLATE = (
        "Your job is to produce a final summary\n"
        "We have provided an existing summary up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing summary "
        "(only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary.\n"
        "If the context isn't useful, return the original summary."
    )

    def __init__(self):
        """
        Initialize the Extractor class.

        Nothing is configured here. NOTE(review): the OpenAI API key is
        presumably expected in the ``OPENAI_API_KEY`` environment variable
        before ``_refine_summary`` is called - confirm against deployment.
        """

    @staticmethod
    def _resolve_pdf_path(pdf_file_path) -> str:
        """
        Return the filesystem path for ``pdf_file_path``.

        Args:
            pdf_file_path: Either a path (``str`` or ``os.PathLike``) or an
                upload-style object exposing the path as ``.name``.

        Returns:
            str: The path to hand to the PDF loader.

        Raises:
            AttributeError: If the argument is neither a path nor an object
                with a ``name`` attribute (for example ``None``).
        """
        # Paths are checked first: pathlib.Path also has ``.name``, but there
        # it is only the final component, not the full path.
        if isinstance(pdf_file_path, (str, os.PathLike)):
            return os.fspath(pdf_file_path)
        return pdf_file_path.name

    def _document_loader(self, pdf_file_path) -> Optional[List]:
        """
        Load the PDF and split it into documents.

        Args:
            pdf_file_path: Path to the PDF, or an object whose ``.name`` is
                that path.

        Returns:
            Optional[List]: The documents returned by
            ``PyPDFLoader.load_and_split()``, or ``None`` if loading failed
            (the error is printed).
        """
        try:
            loader = PyPDFLoader(self._resolve_pdf_path(pdf_file_path))
            return loader.load_and_split()
        except Exception as e:
            print(f"Error while loading and splitting the document: {e}")
            return None

    def _document_text_spilliter(self, pdf_file_path) -> Optional[List]:
        """
        Split the document into smaller, overlapping chunks.

        The misspelled method name is kept so existing callers keep working.

        Args:
            pdf_file_path: Path to the PDF, or an object whose ``.name`` is
                that path.

        Returns:
            Optional[List]: The chunked documents, or ``None`` if loading or
            splitting failed (the error is printed).
        """
        # _document_loader reports its own failures; do not pile a second,
        # misleading "splitting" error on top of a loading error.
        docs = self._document_loader(pdf_file_path)
        if docs is None:
            return None

        try:
            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
                chunk_size=self.CHUNK_SIZE, chunk_overlap=self.CHUNK_OVERLAP
            )
            return text_splitter.split_documents(docs)
        except Exception as e:
            print(f"Error while splitting document text: {e}")
            return None

    def _refine_summary(self, pdf_file_path) -> Optional[str]:
        """
        Generate a refined summary of the document using language models.

        Args:
            pdf_file_path: Path to the PDF, or an object whose ``.name`` is
                that path.

        Returns:
            Optional[str]: The chain's ``output_text``, or ``None`` if any
            stage failed (the error is printed).
        """
        # An upstream failure has already been reported; stop here instead of
        # handing None to the chain.
        split_docs = self._document_text_spilliter(pdf_file_path)
        if split_docs is None:
            return None

        try:
            prompt = PromptTemplate.from_template(self.SUMMARY_TEMPLATE)
            refine_prompt = PromptTemplate.from_template(self.REFINE_TEMPLATE)

            # Build the refine chain: the first chunk is summarized with
            # `prompt`, later chunks update that summary via `refine_prompt`.
            chain = load_summarize_chain(
                llm=ChatOpenAI(temperature=0),
                chain_type="refine",
                question_prompt=prompt,
                refine_prompt=refine_prompt,
                return_intermediate_steps=True,
                input_key="input_documents",
                output_key="output_text",
            )

            result = chain({"input_documents": split_docs}, return_only_outputs=True)
            return result["output_text"]
        except Exception as e:
            print(f"Error while generating refined summary: {e}")
            return None