import json
import os
from typing import Any, Dict, List, Optional

import openai
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_openai import AzureChatOpenAI


class Extractor:
    """
    Summarize a document with an Azure-hosted chat model.

    The class loads a file, splits it into token-bounded chunks and runs a
    LangChain "refine" summarization chain over those chunks.

    All methods are best-effort: on failure they print an error message and
    return ``None`` instead of raising.
    """

    # (source variable, target variable) pairs copied by ``__init__``.
    # NOTE(review): the target names are presumably the ones AzureChatOpenAI
    # reads its credentials from -- confirm against the installed version.
    _ENV_MAPPING = (
        ("api_key", "AZURE_OPENAI_API_KEY"),
        ("api_base", "AZURE_OPENAI_ENDPOINT"),
        ("api_version", "OPENAI_API_VERSION"),
    )

    def __init__(self):
        """
        Initialize the Extractor class.

        Reads the ``api_type``, ``api_key``, ``api_base`` and ``api_version``
        environment variables and republishes them under the names listed in
        ``_ENV_MAPPING``. A source variable that is not set is skipped, so
        target variables that were exported directly are left untouched.
        """
        # os.getenv is a function: it must be called, not subscripted.
        api_type = os.getenv("api_type")
        if api_type is not None:
            openai.api_type = api_type

        for source_name, target_name in self._ENV_MAPPING:
            value = os.getenv(source_name)
            # os.environ only accepts strings; assigning None would raise.
            if value is not None:
                os.environ[target_name] = value

    def _document_loader(self, pdf_file_path) -> Optional[List[Any]]:
        """
        Load the document and split it with the loader's default splitter.

        Args:
            pdf_file_path: Path of the file handed to UnstructuredFileLoader.

        Returns:
            The list produced by ``loader.load_and_split()``, or ``None`` if
            loading failed (the error is printed).
        """
        try:
            loader = UnstructuredFileLoader(pdf_file_path)
            return loader.load_and_split()
        except Exception as e:
            print(f"Error while loading and splitting the document: {str(e)}")
            return None

    def _document_text_spilliter(self, pdf_file_path) -> Optional[List[Any]]:
        """
        Split the document text into smaller chunks.

        Args:
            pdf_file_path: Path of the file to load and split.

        Returns:
            The chunks produced by the text splitter, or ``None`` if loading
            or splitting failed (the error is printed).
        """
        try:
            # Load the document texts
            docs = self._document_loader(pdf_file_path)
            if docs is None:
                # The loader already reported the underlying error; stop here
                # rather than feeding None to the splitter.
                print(
                    "Error while splitting document text: "
                    "document could not be loaded"
                )
                return None

            # Initialize the text splitter with specified chunk size and
            # overlap (sizes are passed to the tiktoken-based constructor)
            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
                chunk_size=1000, chunk_overlap=200
            )

            # Split the documents into chunks
            return text_splitter.split_documents(docs)
        except Exception as e:
            print(f"Error while splitting document text: {str(e)}")
            return None

    def refine_summary(self, pdf_file_path) -> Optional[str]:
        """
        Generate a refined summary of the document using language models.

        Args:
            pdf_file_path: Path of the file to summarize.

        Returns:
            The chain's ``output_text``, or ``None`` if any step failed or the
            document produced no chunks (the error is printed).
        """
        try:
            # Split documents into chunks for efficient processing
            split_docs = self._document_text_spilliter(pdf_file_path)
            if not split_docs:
                # Covers both an upstream failure (None) and an empty document.
                print(
                    "Error while generating refined summary: "
                    "no document text to summarize"
                )
                return None

            # Prepare the prompt template for summarization
            prompt_template = """Write a concise summary of the following: {text} CONCISE SUMMARY:"""
            prompt = PromptTemplate.from_template(prompt_template)

            # Prepare the template for refining the summary with additional
            # context. Adjacent literals are concatenated verbatim, so each
            # fragment carries its own trailing separator.
            refine_template = (
                "Your job is to produce a final summary\n"
                "We have provided an existing summary up to a certain point: {existing_answer}\n"
                "We have the opportunity to refine the existing summary "
                "(only if needed) with some more context below.\n"
                "------------\n"
                "{text}\n"
                "------------\n"
                "Given the new context, refine the original summary. "
                "If the context isn't useful, return the original summary."
            )
            refine_prompt = PromptTemplate.from_template(refine_template)

            # Load the summarization chain using the Azure chat model
            chain = load_summarize_chain(
                llm=AzureChatOpenAI(azure_deployment="ChatGPT"),
                chain_type="refine",
                question_prompt=prompt,
                refine_prompt=refine_prompt,
                return_intermediate_steps=True,
                input_key="input_documents",
                output_key="output_text",
            )

            # Generate the refined summary using the loaded summarization
            # chain; invoke() replaces the deprecated direct call on the chain.
            result = chain.invoke(
                {"input_documents": split_docs}, return_only_outputs=True
            )
            return result["output_text"]
        except Exception as e:
            print(f"Error while generating refined summary: {str(e)}")
            return None