Spaces:
Running
Running
File size: 4,326 Bytes
d4cef3c a2780b1 8db4a21 970b086 a2780b1 95a02ef 43df042 a2780b1 970b086 a2780b1 2788315 a2780b1 b65834e a2780b1 970b086 a2780b1 970b086 a2780b1 970b086 a2780b1 9eb59d5 970b086 a2780b1 970b086 a2780b1 1b2c45f a2780b1 b5b29b6 a2780b1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import json
import logging
import os
from typing import Any
from typing import Dict
from typing import List
from typing import Optional

# os.system("pip install langchain-openai")
from langchain_openai import AzureChatOpenAI
from langchain.chains.mapreduce import MapReduceChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
# from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.document_loaders import PyPDFLoader
class Extractor:
    """Summarize a PDF contract with an Azure OpenAI "refine" chain.

    The pipeline has three stages, each exposed as its own method:

    1. ``_document_loader`` reads the PDF into documents.
    2. ``_document_text_spilliter`` cuts those documents into chunks.
    3. ``_refine_summary`` runs a refine-style summarization over the chunks.

    Every stage is best-effort: a failure is logged and reported to the
    caller as ``None`` instead of being raised.

    Attributes:
        chunk_size (int): ``chunk_size`` handed to the text splitter.
        chunk_overlap (int): ``chunk_overlap`` handed to the text splitter.
        azure_deployment (str): Azure OpenAI deployment name used for the LLM.
    """

    # Defaults live on the class so that a subclass which overrides
    # ``__init__`` without calling ``super().__init__()`` keeps working.
    chunk_size: int = 1000
    chunk_overlap: int = 200
    azure_deployment: str = "GPT-3"

    # Prompt used for the first chunk.
    _QUESTION_TEMPLATE = (
        "Write a concise summary of the following Contract:\n"
        "Contract : {text}\n"
        "CONCISE SUMMARY:"
    )

    # Prompt used for every following chunk. Adjacent literals are joined
    # without a separator, so each fragment carries its own trailing
    # space/newline.
    _REFINE_TEMPLATE = (
        "Your job is to produce a final summary\n"
        "We have provided an existing summary up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing summary "
        "(only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary. "
        "If the context isn't useful, return the original summary."
    )

    _log = logging.getLogger(__name__)

    def __init__(
        self,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
        azure_deployment: Optional[str] = None,
    ):
        """Initialize the Extractor class.

        Args:
            chunk_size: Override for the splitter chunk size; ``None`` keeps
                the class default (1000).
            chunk_overlap: Override for the splitter chunk overlap; ``None``
                keeps the class default (200).
            azure_deployment: Override for the Azure OpenAI deployment name;
                ``None`` keeps the class default ("GPT-3").
        """
        if chunk_size is not None:
            self.chunk_size = chunk_size
        if chunk_overlap is not None:
            self.chunk_overlap = chunk_overlap
        if azure_deployment is not None:
            self.azure_deployment = azure_deployment

    @staticmethod
    def _resolve_pdf_path(pdf_file_path: Any) -> str:
        """Return the filesystem path for *pdf_file_path*.

        Args:
            pdf_file_path: Either a path (``str`` / ``os.PathLike``) or an
                object exposing the path as ``.name`` (for example an
                uploaded temporary file).

        Returns:
            The path to hand to the PDF loader.

        Raises:
            AttributeError: If the argument is neither a path nor an object
                with a ``name`` attribute.
        """
        # Check for real paths first: ``pathlib.Path.name`` is only the
        # basename, so falling through to ``.name`` would lose the directory.
        if isinstance(pdf_file_path, (str, os.PathLike)):
            return os.fspath(pdf_file_path)
        return pdf_file_path.name

    def _document_loader(self, pdf_file_path) -> Optional[List[Any]]:
        """Load the PDF and split it into documents.

        Args:
            pdf_file_path: Path to the PDF, or a file-like object whose
                ``.name`` is the path.

        Returns:
            The documents produced by ``PyPDFLoader.load_and_split()``, or
            ``None`` if loading failed (the error is logged).
        """
        try:
            path = self._resolve_pdf_path(pdf_file_path)
            loader = PyPDFLoader(path)
            return loader.load_and_split()
        except Exception as exc:  # third-party boundary: keep best-effort
            self._log.exception(
                "Error while loading and splitting the document: %s", exc
            )
            return None

    def _document_text_spilliter(self, pdf_file_path) -> Optional[List[Any]]:
        """Split the document text into smaller chunks.

        Args:
            pdf_file_path: Path to the PDF, or a file-like object whose
                ``.name`` is the path.

        Returns:
            The chunked documents, or ``None`` if loading or splitting
            failed (the error is logged).
        """
        docs = self._document_loader(pdf_file_path)
        if docs is None:
            # The loader already logged the real cause; do not pile a
            # misleading splitter error on top of it.
            return None

        try:
            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap,
            )
            return text_splitter.split_documents(docs)
        except Exception as exc:  # third-party boundary: keep best-effort
            self._log.exception("Error while splitting document text: %s", exc)
            return None

    def _refine_summary(self, pdf_file_path) -> Optional[str]:
        """Generate a refined summary of the document using a language model.

        Args:
            pdf_file_path: Path to the PDF, or a file-like object whose
                ``.name`` is the path.

        Returns:
            The final summary text, or ``None`` if the document could not be
            loaded, contained no text, or summarization failed (the error
            is logged).
        """
        split_docs = self._document_text_spilliter(pdf_file_path)
        if split_docs is None:
            return None  # an earlier stage failed and already logged why
        if not split_docs:
            # The refine chain needs at least one document to seed the
            # summary; say so plainly instead of failing inside the chain.
            self._log.error(
                "Error while generating refined summary: %s",
                "the document produced no text chunks",
            )
            return None

        try:
            chain = load_summarize_chain(
                llm=AzureChatOpenAI(azure_deployment=self.azure_deployment),
                chain_type="refine",
                question_prompt=PromptTemplate.from_template(self._QUESTION_TEMPLATE),
                refine_prompt=PromptTemplate.from_template(self._REFINE_TEMPLATE),
                return_intermediate_steps=True,
                input_key="input_documents",
                output_key="output_text",
            )
            # ``invoke`` replaces the deprecated ``chain(...)`` call style.
            result = chain.invoke(
                {"input_documents": split_docs}, return_only_outputs=True
            )
            return result["output_text"]
        except Exception as exc:  # third-party boundary: keep best-effort
            self._log.exception("Error while generating refined summary: %s", exc)
            return None