import logging
from typing import List
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constants
DB_FAISS_PATH = 'vectorstore/db_faiss'
EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
DEFAULT_MODEL = "facebook/bart-large-cnn"

# Default model parameters
DEFAULT_PARAMS = {
    "do_sample": True,  # temperature/top_p only take effect when sampling is enabled
    "temperature": 0.7,
    "max_length": 1024,
    "num_beams": 4,
    "top_p": 0.95,
    "repetition_penalty": 1.2,
}

def get_default_value(param_name: str, default: float) -> float:
    """Safely get a float value from DEFAULT_PARAMS."""
    value = DEFAULT_PARAMS.get(param_name, default)
    if isinstance(value, list):
        return float(value[0]) if value else default
    return float(value)
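
# Illustrative usage: get_default_value("temperature", 1.0) returns 0.7 from
# DEFAULT_PARAMS, and falls back to the supplied default when the key is
# missing or maps to an empty list.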

def load_embeddings():
    """Load the embedding model, wrapped for use with FAISS."""
    try:
        # HuggingFaceEmbeddings wraps sentence-transformers behind the
        # LangChain Embeddings interface (embed_documents/embed_query)
        # that FAISS.from_documents expects; a bare SentenceTransformer
        # does not expose those methods.
        return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    except Exception as e:
        logger.error(f"Failed to load embeddings: {e}")
        raise

def load_llm(model_name, custom_params=None):
    """Load the language model with specific parameters."""
    try:
        params = custom_params or DEFAULT_PARAMS
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
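        # Keyword arguments beyond model/tokenizer are forwarded by the
        # pipeline to model.generate() as default generation parameters.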
        return pipeline("summarization", model=model, tokenizer=tokenizer, **params)
    except Exception as e:
        logger.error(f"Failed to load LLM: {e}")
        raise
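
# Note: the returned summarization pipeline is called directly on raw text and
# yields a list of dicts, e.g. llm("long passage ...")[0]["summary_text"].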

def process_pdf(file) -> List[Document]:
    """Process the PDF and convert it into a list of Document objects."""
    try:
        loader = PyPDFLoader(file_path=file)
        documents = loader.load()  # Load each page as a separate Document
        return documents
    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        raise

def create_vector_store(documents: List[Document], embeddings):
    """Create and save the vector store."""
    try:
        db = FAISS.from_documents(documents, embeddings)
        db.save_local(DB_FAISS_PATH)
        return db
    except Exception as e:
        logger.error(f"Error creating vector store: {e}")
        raise
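
# A minimal sketch of reloading the persisted index in a later session. It
# assumes the same embedding model is used and a langchain_community version
# whose FAISS.load_local takes the allow_dangerous_deserialization flag
# (required by recent releases before unpickling a locally saved index).
def load_vector_store(embeddings):
    """Reload the FAISS index saved by create_vector_store."""
    return FAISS.load_local(
        DB_FAISS_PATH,
        embeddings,
        allow_dangerous_deserialization=True,  # assumption: recent API
    )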

def summarize_report(documents: List[Document], llm) -> str:
    """Summarize the report with a simple map-reduce pass over the pages."""
    try:
        # Limit the number of chunks to process
        max_chunks = 50  # Adjust this value based on your needs
        if len(documents) > max_chunks:
            logger.warning(f"Document is very large. Summarizing first {max_chunks} chunks only.")
            documents = documents[:max_chunks]

        # Map step: summarize each page independently. The summarization
        # pipeline returns a list like [{"summary_text": "..."}]; inputs
        # longer than the model's context are truncated by the tokenizer.
        chunk_summaries = []
        for doc in documents:
            text = doc.page_content.strip()
            if not text:
                continue
            result = llm(text, truncation=True)
            chunk_summaries.append(result[0]["summary_text"])

        if not chunk_summaries:
            return ""

        # Reduce step: concatenate the per-page summaries and summarize
        # them once more into a single final summary.
        combined = "\n".join(chunk_summaries)
        final = llm(combined, truncation=True)
        return final[0]["summary_text"]

    except Exception as e:
        logger.error(f"Error summarizing report: {e}")
        raise

def main(pdf_path: str, model_name: str = DEFAULT_MODEL):
    """Main function to summarize the PDF report."""
    try:
        # Load models and embeddings
        embeddings = load_embeddings()
        llm = load_llm(model_name)

        # Process the PDF
        documents = process_pdf(pdf_path)

        # Create vector store
        create_vector_store(documents, embeddings)

        # Generate summary
        summary = summarize_report(documents, llm)

        print("Structured Summary:\n", summary)
    except Exception as e:
        logger.error(f"Failed to summarize the report: {e}")

if __name__ == "__main__":
    pdf_path = "path/to/your/report.pdf"  # Replace with the path to your PDF
    main(pdf_path)