# Spaces: Running
import json
import os
from typing import Dict
from typing import List
from typing import Optional

# os.system("pip install langchain-openai")
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_openai import AzureChatOpenAI
class Extractor:
    """Load a document, split it into chunks and summarise it with an LLM.

    The methods below keep no instance state: the input file is passed to
    each call. Every step is best-effort: a failure is printed and reported
    to the caller as ``None`` instead of being raised.
    """

    # Prompt applied to the first chunk to produce the initial summary.
    _SUMMARY_TEMPLATE = (
        "Write a concise summary of the following:\n"
        "{text}\n"
        "CONCISE SUMMARY:"
    )

    # Prompt applied to each later chunk to fold it into the running summary.
    # Adjacent literals are concatenated verbatim, so every fragment must end
    # with its own separator (space or newline).
    _REFINE_TEMPLATE = (
        "Your job is to produce a final summary\n"
        "We have provided an existing summary up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing summary "
        "(only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary.\n"
        "If the context isn't useful, return the original summary."
    )

    def __init__(self):
        """Initialize the Extractor; there is no state to set up."""

    @staticmethod
    def _resolve_path(pdf_file_path) -> str:
        """Return the filesystem path designated by *pdf_file_path*.

        Args:
            pdf_file_path: Either a path (``str`` or ``os.PathLike``) or an
                object exposing the path as a ``name`` attribute (for example
                an open or uploaded file object).

        Returns:
            The path to hand to the document loader.

        Raises:
            AttributeError: If the argument is neither a path nor an object
                with a ``name`` attribute.
        """
        # Paths are checked first: ``pathlib.Path.name`` is only the final
        # component, so falling back to ``.name`` would drop the directory.
        if isinstance(pdf_file_path, (str, os.PathLike)):
            return os.fspath(pdf_file_path)
        return pdf_file_path.name

    def _document_loader(self, pdf_file_path) -> Optional[List]:
        """Load the input file and split it with the loader's defaults.

        Args:
            pdf_file_path: Path, or file-like object with a ``name``
                attribute, identifying the document to load.

        Returns:
            The list produced by ``UnstructuredFileLoader.load_and_split()``,
            or ``None`` if loading failed (the error is printed).
        """
        try:
            loader = UnstructuredFileLoader(self._resolve_path(pdf_file_path))
            return loader.load_and_split()
        except Exception as e:
            print(f"Error while loading and splitting the document: {str(e)}")
            return None

    def _document_text_spilliter(
        self,
        pdf_file_path,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ) -> Optional[List]:
        """Load the document and re-split it into token-sized chunks.

        Args:
            pdf_file_path: Path, or file-like object with a ``name``
                attribute, identifying the document to load.
            chunk_size: Chunk size passed to the tiktoken-based splitter.
            chunk_overlap: Overlap between consecutive chunks.

        Returns:
            The list of split documents, or ``None`` if loading or splitting
            failed (the error is printed).
        """
        docs = self._document_loader(pdf_file_path)
        if docs is None:
            # The loader has already reported its failure; splitting ``None``
            # would only add a second, unrelated error message.
            return None
        try:
            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
                chunk_size=chunk_size, chunk_overlap=chunk_overlap
            )
            return text_splitter.split_documents(docs)
        except Exception as e:
            print(f"Error while splitting document text: {str(e)}")
            return None

    def _refine_summary(
        self, pdf_file_path, azure_deployment: str = "GPT-3"
    ) -> Optional[str]:
        """Summarise the document with a "refine" summarization chain.

        The first chunk is summarised with ``_SUMMARY_TEMPLATE``; each later
        chunk is merged into the running summary with ``_REFINE_TEMPLATE``.

        Args:
            pdf_file_path: Path, or file-like object with a ``name``
                attribute, identifying the document to summarise.
            azure_deployment: Name of the Azure OpenAI chat deployment to use.

        Returns:
            The final summary text, or ``None`` if there was nothing to
            summarise or any step failed (the error is printed).
        """
        split_docs = self._document_text_spilliter(pdf_file_path)
        if not split_docs:
            # Covers both an upstream failure (None) and an empty document.
            print(
                "Error while generating refined summary: "
                "no document text to summarise"
            )
            return None
        try:
            chain = load_summarize_chain(
                llm=AzureChatOpenAI(azure_deployment=azure_deployment),
                chain_type="refine",
                question_prompt=PromptTemplate.from_template(self._SUMMARY_TEMPLATE),
                refine_prompt=PromptTemplate.from_template(self._REFINE_TEMPLATE),
                return_intermediate_steps=True,
                input_key="input_documents",
                output_key="output_text",
            )
            # ``invoke`` replaces the deprecated ``chain(...)`` call form.
            result = chain.invoke(
                {"input_documents": split_docs}, return_only_outputs=True
            )
            return result["output_text"]
        except Exception as e:
            print(f"Error while generating refined summary: {str(e)}")
            return None