File size: 4,255 Bytes
d4cef3c
a2780b1
 
 
43df042
970b086
a2780b1
 
 
 
970b086
a2780b1
43df042
a2780b1
 
 
 
 
 
 
 
 
970b086
a2780b1
 
 
 
2788315
a2780b1
 
 
 
 
 
 
 
 
6e417dd
a2780b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
970b086
a2780b1
 
 
 
970b086
a2780b1
 
970b086
a2780b1
 
 
 
 
 
 
9eb59d5
970b086
a2780b1
 
 
 
 
 
970b086
a2780b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
970b086
a2780b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import json
from typing import Dict
from typing import List
from typing import Optional
# NOTE: keep this install ahead of the langchain_openai import that follows.
os.system("pip install langchain-openai")
from langchain_openai import AzureChatOpenAI
from langchain.chains.mapreduce import MapReduceChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import UnstructuredFileLoader


class Extractor:

    """
    Load a document, split it into chunks and produce a refined summary.

    Every pipeline step is best-effort: failures are reported with ``print``
    and signalled to the caller by returning ``None`` instead of raising.

    Attributes:
        azure_deployment (str): Deployment name passed to ``AzureChatOpenAI``.
        chunk_size (int): ``chunk_size`` passed to the text splitter.
        chunk_overlap (int): ``chunk_overlap`` passed to the text splitter.
    """

    # Class-level defaults; ``__init__`` only overrides the ones it is given,
    # so instances created without running ``__init__`` still behave as before.
    azure_deployment = "ChatGPT"
    chunk_size = 1000
    chunk_overlap = 200

    # Prompt for the first chunk. The leading spaces before ``{text}`` and
    # ``CONCISE SUMMARY:`` are kept exactly as in the original prompt so the
    # text sent to the model is unchanged.
    _QUESTION_TEMPLATE = (
        "Write a concise summary of the following:\n"
        "            {text}\n"
        "            CONCISE SUMMARY:"
    )

    # Prompt for every following chunk. Adjacent literals are concatenated
    # verbatim, so each one carries its own trailing space or newline.
    _REFINE_TEMPLATE = (
        "Your job is to produce a final summary\n"
        "We have provided an existing summary up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing summary "
        "(only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary.\n"
        "If the context isn't useful, return the original summary."
    )

    def __init__(
        self,
        azure_deployment: Optional[str] = None,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
    ):
        """
        Initialize the Extractor class.

        Args:
            azure_deployment: Azure OpenAI deployment name; ``None`` keeps the
                class default (``"ChatGPT"``).
            chunk_size: Splitter chunk size; ``None`` keeps the default (1000).
            chunk_overlap: Splitter chunk overlap; ``None`` keeps the default
                (200).
        """
        if azure_deployment is not None:
            self.azure_deployment = azure_deployment
        if chunk_size is not None:
            self.chunk_size = chunk_size
        if chunk_overlap is not None:
            self.chunk_overlap = chunk_overlap

    @staticmethod
    def _resolve_path(pdf_file_path) -> str:
        """
        Turn the caller's file reference into a filesystem path string.

        Args:
            pdf_file_path: A ``str``/``bytes``/``os.PathLike`` path, or any
                object exposing the path through a ``name`` attribute (for
                example an uploaded temporary-file wrapper).

        Returns:
            str: The path to hand to the document loader.

        Raises:
            TypeError: If no path can be derived from ``pdf_file_path``.
        """
        path_types = (str, bytes, os.PathLike)

        # Real paths are checked first: ``pathlib.Path.name`` is only the
        # basename, so falling through to ``.name`` would lose the directory.
        if isinstance(pdf_file_path, path_types):
            return os.fsdecode(pdf_file_path)

        name = getattr(pdf_file_path, "name", None)
        if isinstance(name, path_types):
            return os.fsdecode(name)

        raise TypeError(
            "expected a path or an object with a 'name' path attribute, "
            f"got {type(pdf_file_path).__name__}"
        )

    def _document_loader(self, pdf_file_path) -> Optional[List]:
        """
        Load the document and split it with the loader's default splitter.

        Args:
            pdf_file_path: Path, or file-like object with a ``name`` attribute,
                identifying the document.

        Returns:
            Optional[List]: The documents returned by
            ``UnstructuredFileLoader.load_and_split()``, or ``None`` if loading
            failed (the error is printed).
        """
        try:
            loader = UnstructuredFileLoader(self._resolve_path(pdf_file_path))
            return loader.load_and_split()

        except Exception as e:
            # Deliberate best-effort boundary: report and signal with None.
            print(f"Error while loading and splitting the document: {str(e)}")
            return None

    def _document_text_spilliter(self, pdf_file_path) -> Optional[List]:
        """
        Split the document text into smaller chunks.

        Args:
            pdf_file_path: Path, or file-like object with a ``name`` attribute,
                identifying the document.

        Returns:
            Optional[List]: The chunked documents, or ``None`` if the document
            could not be loaded or split (the error is printed).
        """
        try:
            # Load the document texts
            docs = self._document_loader(pdf_file_path)
            if docs is None:
                # The loader already reported its failure; do not pile a
                # second, misleading error on top of it.
                return None

            # Initialize the text splitter with the configured size and overlap
            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
                chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
            )

            # Split the documents into chunks
            return text_splitter.split_documents(docs)

        except Exception as e:
            print(f"Error while splitting document text: {str(e)}")
            return None

    def _refine_summary(self, pdf_file_path) -> Optional[str]:
        """
        Generate a refined summary of the document using language models.

        Args:
            pdf_file_path: Path, or file-like object with a ``name`` attribute,
                identifying the document.

        Returns:
            Optional[str]: The chain's ``output_text``, or ``None`` if any step
            failed or there was nothing to summarize (the error is printed).
        """
        try:
            # Split documents into chunks for efficient processing
            split_docs = self._document_text_spilliter(pdf_file_path)
            if not split_docs:
                # Covers both an upstream failure (None) and an empty document.
                print(
                    "Error while generating refined summary: "
                    "no document text available to summarize"
                )
                return None

            # Prompt for the initial summary and for each refinement step
            prompt = PromptTemplate.from_template(self._QUESTION_TEMPLATE)
            refine_prompt = PromptTemplate.from_template(self._REFINE_TEMPLATE)

            # Load the summarization chain using the Azure OpenAI chat model
            chain = load_summarize_chain(
                llm=AzureChatOpenAI(azure_deployment=self.azure_deployment),
                chain_type="refine",
                question_prompt=prompt,
                refine_prompt=refine_prompt,
                return_intermediate_steps=True,
                input_key="input_documents",
                output_key="output_text",
            )

            # Generate the refined summary using the loaded summarization chain
            result = chain({"input_documents": split_docs}, return_only_outputs=True)

            return result["output_text"]

        except Exception as e:
            print(f"Error while generating refined summary: {str(e)}")
            return None