import logging
from typing import List

from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import (
    LLMChain,
    MapReduceDocumentsChain,
    ReduceDocumentsChain,
    StuffDocumentsChain,
)

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Constants
DB_FAISS_PATH = 'vectorstore/db_faiss'
EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
DEFAULT_MODEL = "facebook/bart-large-cnn"
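# bart-large-cnn is a seq2seq model fine-tuned for summarization; its encoder
# accepts roughly 1024 tokens, which is why PDFs are split into chunks below.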
# Default model parameters
DEFAULT_PARAMS = {
"temperature": 0.7,
"max_length": 1024,
"num_beams": 4,
"top_p": 0.95,
"repetition_penalty": 1.2,
}
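# These values are forwarded to the transformers pipeline and applied at generation
# time; note that temperature/top_p only take effect when do_sample=True is also set.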
def get_default_value(param_name: str, default: float) -> float:
    """Safely get a float value from DEFAULT_PARAMS."""
    value = DEFAULT_PARAMS.get(param_name, default)
    if isinstance(value, list):
        # Some UIs pass parameters as single-element lists; unwrap if possible.
        return float(value[0]) if value else default
    return float(value)
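# e.g. get_default_value("temperature", 0.5) -> 0.7; missing keys fall back to `default`.
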
def load_embeddings():
    """Load the embedding model."""
    try:
        # HuggingFaceEmbeddings wraps sentence-transformers and exposes the
        # embed_documents/embed_query interface that LangChain's FAISS store
        # expects; a raw SentenceTransformer instance would not work here.
        return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    except Exception as e:
        logger.error(f"Failed to load embeddings: {e}")
        raise

def load_llm(model_name, custom_params=None):
    """Load the summarization model and wrap it as a LangChain-compatible LLM."""
    try:
        params = custom_params or DEFAULT_PARAMS
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        pipe = pipeline("summarization", model=model, tokenizer=tokenizer, **params)
        # Wrap the transformers pipeline so it can be used inside LangChain chains.
        return HuggingFacePipeline(pipeline=pipe)
    except Exception as e:
        logger.error(f"Failed to load LLM: {e}")
        raise

def process_pdf(file_path: str) -> List[Document]:
    """Load the PDF and split it into chunks the summarizer can handle."""
    try:
        loader = PyPDFLoader(file_path=file_path)
        documents = loader.load()  # Load each page as a separate Document
        # Split pages into smaller chunks so each fits the model's input window.
        splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        return splitter.split_documents(documents)
    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        raise

def create_vector_store(documents: List[Document], embeddings):
    """Create and save the vector store."""
    try:
        db = FAISS.from_documents(documents, embeddings)
        db.save_local(DB_FAISS_PATH)
        # The index can be reloaded later via FAISS.load_local(DB_FAISS_PATH, embeddings);
        # recent langchain_community releases also require allow_dangerous_deserialization=True.
        return db
    except Exception as e:
        logger.error(f"Error creating vector store: {e}")
        raise

def summarize_report(documents: List[Document], llm) -> str:
    """Summarize the report using a map-reduce approach."""
    try:
        # Limit the number of chunks to process
        max_chunks = 50  # Adjust this value based on your needs
        if len(documents) > max_chunks:
            logger.warning(f"Document is very large. Summarizing first {max_chunks} chunks only.")
            documents = documents[:max_chunks]

        # Map step: summarize each chunk independently
        map_template = """Summarize the following text:\n\n{text}\n\nSummary:"""
        map_prompt = PromptTemplate.from_template(map_template)
        map_chain = LLMChain(llm=llm, prompt=map_prompt)

        # Reduce step: combine the per-chunk summaries into one final summary
        reduce_template = """Combine these summaries into a final summary:\n\nSummary:\n{doc_summaries}\n\nFinal Summary:"""
        reduce_prompt = PromptTemplate.from_template(reduce_template)
        reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

        # ReduceDocumentsChain expects a documents chain rather than a bare
        # LLMChain, so the reduce prompt is wrapped in a StuffDocumentsChain.
        combine_documents_chain = StuffDocumentsChain(
            llm_chain=reduce_chain,
            document_variable_name="doc_summaries",
        )
        chain = MapReduceDocumentsChain(
            llm_chain=map_chain,
            reduce_documents_chain=ReduceDocumentsChain(
                combine_documents_chain=combine_documents_chain
            ),
            document_variable_name="text",
        )
        return chain.run(documents)
    except Exception as e:
        logger.error(f"Error summarizing report: {e}")
        raise

def main(pdf_path: str, model_name: str = DEFAULT_MODEL):
    """Main function to summarize the PDF report."""
    try:
        # Load models and embeddings
        embeddings = load_embeddings()
        llm = load_llm(model_name)
        # Process the PDF
        documents = process_pdf(pdf_path)
        # Build and persist the vector store; it is not used by the summary itself
        # but is kept on disk for later retrieval-style queries.
        create_vector_store(documents, embeddings)
        # Generate summary
        summary = summarize_report(documents, llm)
        print("Structured Summary:\n", summary)
    except Exception as e:
        logger.error(f"Failed to summarize the report: {e}")

if __name__ == "__main__":
    pdf_path = "path/to/your/report.pdf"  # Replace with the path to your PDF
    main(pdf_path)
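
# Usage sketch (file name and alternate model are illustrative, not part of this repo):
#   python summarize.py                       # runs main() on the path above
# or from another module:
#   from summarize import main
#   main("report.pdf", model_name="google/pegasus-xsum")  # any seq2seq summarizer works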