Kathirsci committed on
Commit
ef0dde7
·
verified ·
1 Parent(s): a119da5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -119
app.py CHANGED
@@ -1,131 +1,48 @@
1
- import os
2
- import logging
3
- import tempfile
4
- from typing import List
5
- from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
6
- from sentence_transformers import SentenceTransformer
7
- from langchain_community.vectorstores import FAISS
8
- from langchain_community.document_loaders import PyPDFLoader
9
  from langchain.prompts import PromptTemplate
10
- from langchain.schema import Document
11
- from langchain.text_splitter import CharacterTextSplitter
12
- from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
13
-
14
- # Set up logging
15
- logging.basicConfig(level=logging.INFO)
16
- logger = logging.getLogger(__name__)
17
-
18
- # Constants
19
- DB_FAISS_PATH = 'vectorstore/db_faiss'
20
- EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
21
- DEFAULT_MODEL = "facebook/bart-large-cnn"
22
-
23
- # Default model parameters
24
- DEFAULT_PARAMS = {
25
- "temperature": 0.7,
26
- "max_length": 1024,
27
- "num_beams": 4,
28
- "top_p": 0.95,
29
- "repetition_penalty": 1.2,
30
- }
31
-
32
- def get_default_value(param_name: str, default: float) -> float:
33
- """Safely get a float value from DEFAULT_PARAMS."""
34
- value = DEFAULT_PARAMS.get(param_name, default)
35
- return float(value) if not isinstance(value, list) else float(value[0]) if value else default
36
-
37
- def load_embeddings():
38
- """Load and cache the embedding model."""
39
- try:
40
- return SentenceTransformer(EMBEDDING_MODEL)
41
- except Exception as e:
42
- logger.error(f"Failed to load embeddings: {e}")
43
- raise
44
 
45
- def load_llm(model_name, custom_params=None):
46
- """Load the language model with specific parameters."""
47
- try:
48
- params = custom_params or DEFAULT_PARAMS
49
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
50
- tokenizer = AutoTokenizer.from_pretrained(model_name)
51
- return pipeline("summarization", model=model, tokenizer=tokenizer, **params)
52
- except Exception as e:
53
- logger.error(f"Failed to load LLM: {e}")
54
- raise
55
 
56
- def process_pdf(file) -> List[Document]:
57
- """Process the PDF and convert it into a list of Document objects."""
58
  try:
59
- loader = PyPDFLoader(file_path=file)
60
- documents = loader.load() # Load each page as a separate Document
61
- return documents
62
- except Exception as e:
63
- logger.error(f"Error processing PDF: {e}")
64
- raise
65
-
66
- def create_vector_store(documents: List[Document], embeddings):
67
- """Create and save the vector store."""
68
- try:
69
- db = FAISS.from_documents(documents, embeddings)
70
- db.save_local(DB_FAISS_PATH)
71
- return db
72
- except Exception as e:
73
- logger.error(f"Error creating vector store: {e}")
74
- raise
75
 
76
- def summarize_report(documents: List[Document], llm) -> str:
77
- """Summarize the report using a map-reduce approach."""
78
- try:
79
- # Limit the number of chunks to process
80
- max_chunks = 50 # Adjust this value based on your needs
81
- if len(documents) > max_chunks:
82
- logger.warning(f"Document is very large. Summarizing first {max_chunks} chunks only.")
83
- documents = documents[:max_chunks]
84
-
85
- # Map prompt
86
  map_template = """Summarize the following text:\n\n{text}\n\nSummary:"""
87
  map_prompt = PromptTemplate.from_template(map_template)
88
-
89
- # Reduce prompt
90
- reduce_template = """Combine these summaries into a final summary:\n\nSummary:\n{doc_summaries}\n\nFinal Summary:"""
91
  reduce_prompt = PromptTemplate.from_template(reduce_template)
92
 
93
- # Create the chains
94
- map_chain = MapReduceDocumentsChain(
95
- llm_chain=lambda text: llm(text=map_prompt.format(text=text)),
96
- reduce_documents_chain=ReduceDocumentsChain(
97
- combine_documents_chain=lambda summaries: llm(text=reduce_prompt.format(doc_summaries=summaries))
98
- ),
99
  )
100
-
101
- summary = map_chain.run(documents)
102
-
103
  return summary
104
-
105
- except Exception as e:
106
- logger.error(f"Error summarizing report: {e}")
107
- raise
108
-
109
- def main(pdf_path: str, model_name: str = DEFAULT_MODEL):
110
- """Main function to summarize the PDF report."""
111
- try:
112
- # Load models and embeddings
113
- embeddings = load_embeddings()
114
- llm = load_llm(model_name)
115
-
116
- # Process the PDF
117
- documents = process_pdf(pdf_path)
118
-
119
- # Create vector store
120
- create_vector_store(documents, embeddings)
121
-
122
- # Generate summary
123
- summary = summarize_report(documents, llm)
124
-
125
- print("Structured Summary:\n", summary)
126
  except Exception as e:
127
- logger.error(f"Failed to summarize the report: {e}")
128
-
129
- if __name__ == "__main__":
130
- pdf_path = "path/to/your/report.pdf" # Replace with the path to your PDF
131
- main(pdf_path)
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from langchain.document_loaders import PyPDFLoader
3
+ from langchain.vectorstores import FAISS
4
+ from langchain.llms import HuggingFaceLLM
5
+ from langchain.chains import MapReduceChain
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain.embeddings import HuggingFaceEmbeddings
 
8
  from langchain.prompts import PromptTemplate
9
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
# Load model and embeddings.
# HuggingFaceEmbeddings has no `device` field; the target device is supplied
# through `model_kwargs`, which the class hands to the underlying
# sentence-transformers model when it is constructed.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)
# NOTE(review): "your-model" is a placeholder model id, and `HuggingFaceLLM`
# does not look like a class that `langchain.llms` provides - confirm the
# intended wrapper (presumably HuggingFacePipeline) and a real model id.
llm = HuggingFaceLLM.from_pretrained("your-model")
 
 
 
 
 
 
 
14
 
15
def process_pdf_and_summarize(file):
    """Summarize an uploaded PDF with a map-reduce summarization chain.

    The PDF is loaded page by page, split into overlapping chunks, each chunk
    is summarized with the map prompt, and the partial summaries are merged
    with the reduce prompt.

    Args:
        file: The upload handed over by the UI. Either a filesystem path
            (``str`` / ``os.PathLike``) or an object exposing the path as
            ``.name``.

    Returns:
        The summary text on success; otherwise a string starting with
        ``"Error processing PDF: "``. Failures are returned instead of raised
        so that they show up in the text output.
    """
    if file is None:
        return "Error processing PDF: no file was uploaded."

    try:
        # Local import: keeps this function self-contained without touching
        # the file-level import list.
        from langchain.chains.summarize import load_summarize_chain

        # Accept both a plain path and a file-like wrapper carrying `.name`.
        if isinstance(file, (str, os.PathLike)):
            pdf_path = os.fspath(file)
        else:
            pdf_path = file.name

        # Load PDF document (one Document per page).
        documents = PyPDFLoader(pdf_path).load()

        # Split before summarizing so each map call gets a bounded chunk.
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = splitter.split_documents(documents)
        if not chunks:
            return "Error processing PDF: no extractable text found."

        # Summarize the document
        map_template = """Summarize the following text:\n\n{text}\n\nSummary:"""
        map_prompt = PromptTemplate.from_template(map_template)
        reduce_template = """Combine these summaries into a final summary:\n\nSummaries: {doc_summaries}\n\nFinal Summary:"""
        reduce_prompt = PromptTemplate.from_template(reduce_template)

        chain = load_summarize_chain(
            llm,
            chain_type="map_reduce",
            map_prompt=map_prompt,
            combine_prompt=reduce_prompt,
            # The reduce prompt names its placeholder `doc_summaries`, not the
            # chain's default `text`, so the variable name is passed explicitly.
            combine_document_variable_name="doc_summaries",
        )
        return chain.run(chunks)
    except Exception as e:
        return f"Error processing PDF: {str(e)}"
38
+
39
# Gradio interface: a single file upload in, the summary text out.
# `gr.File` is the top-level upload component; the older `gr.inputs` namespace
# was deprecated in Gradio 3 and is absent from Gradio 4.
interface = gr.Interface(
    fn=process_pdf_and_summarize,
    inputs=gr.File(label="Upload PDF"),
    outputs="text",
    title="PDF Summarizer",
    description="Upload a PDF document to generate a summary.",
)

interface.launch()