Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,19 +1,17 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
import tempfile
|
3 |
import os
|
4 |
import logging
|
5 |
import subprocess
|
|
|
6 |
from typing import List
|
7 |
-
from
|
8 |
-
from
|
9 |
-
from
|
10 |
-
from langchain.
|
11 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
12 |
from langchain.prompts import PromptTemplate
|
|
|
13 |
from langchain.text_splitter import CharacterTextSplitter
|
|
|
14 |
from langchain.runnables import RunnableMap, RunnableLambda
|
15 |
-
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
|
16 |
-
|
17 |
|
18 |
# Set up logging
|
19 |
logging.basicConfig(level=logging.INFO)
|
@@ -22,14 +20,15 @@ logger = logging.getLogger(__name__)
|
|
22 |
# Constants
|
23 |
DB_FAISS_PATH = 'vectorstore/db_faiss'
|
24 |
EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
|
25 |
-
DEFAULT_MODEL = "
|
26 |
|
27 |
# Default model parameters
|
28 |
DEFAULT_PARAMS = {
|
29 |
"temperature": 0.7,
|
30 |
-
"
|
31 |
-
"
|
32 |
-
"
|
|
|
33 |
}
|
34 |
|
35 |
def get_default_value(param_name: str, default: float) -> float:
|
@@ -37,42 +36,34 @@ def get_default_value(param_name: str, default: float) -> float:
|
|
37 |
value = DEFAULT_PARAMS.get(param_name, default)
|
38 |
return float(value) if not isinstance(value, list) else float(value[0]) if value else default
|
39 |
|
40 |
-
@st.cache_resource
|
41 |
def load_embeddings():
|
42 |
"""Load and cache the embedding model."""
|
43 |
try:
|
44 |
-
return
|
45 |
except Exception as e:
|
46 |
logger.error(f"Failed to load embeddings: {e}")
|
47 |
-
|
48 |
-
return None
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
"""Load and cache the Hugging Face model and tokenizer."""
|
53 |
try:
|
54 |
-
|
55 |
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
|
56 |
-
|
57 |
-
return
|
58 |
except Exception as e:
|
59 |
logger.error(f"Failed to load LLM: {e}")
|
60 |
-
|
61 |
-
return None
|
62 |
|
63 |
def process_pdf(file) -> List[Document]:
|
|
|
64 |
try:
|
65 |
-
|
66 |
-
|
67 |
-
temp_file_path = temp_file.name
|
68 |
-
loader = PyPDFLoader(file_path=temp_file_path)
|
69 |
-
documents = loader.load() # This loads each page as a separate Document
|
70 |
-
os.unlink(temp_file_path) # Clean up the temporary file
|
71 |
return documents
|
72 |
except Exception as e:
|
73 |
logger.error(f"Error processing PDF: {e}")
|
74 |
-
|
75 |
-
return []
|
76 |
|
77 |
def create_vector_store(documents: List[Document], embeddings):
|
78 |
"""Create and save the vector store."""
|
@@ -82,101 +73,65 @@ def create_vector_store(documents: List[Document], embeddings):
|
|
82 |
return db
|
83 |
except Exception as e:
|
84 |
logger.error(f"Error creating vector store: {e}")
|
85 |
-
|
86 |
-
return None
|
87 |
|
88 |
-
def summarize_report(documents: List[Document],
|
89 |
"""Summarize the report using a map-reduce approach."""
|
90 |
try:
|
91 |
# Limit the number of chunks to process
|
92 |
max_chunks = 50 # Adjust this value based on your needs
|
93 |
if len(documents) > max_chunks:
|
94 |
-
|
95 |
documents = documents[:max_chunks]
|
96 |
|
97 |
# Map prompt
|
98 |
-
|
99 |
-
|
100 |
-
return summary
|
101 |
|
102 |
# Reduce prompt
|
103 |
-
|
104 |
-
|
105 |
-
final_summary = summarizer(combined_text, max_length=300, min_length=100, do_sample=False)[0]['summary_text']
|
106 |
-
return final_summary
|
107 |
|
108 |
-
#
|
109 |
map_chain = RunnableMap(
|
110 |
-
llm_chain=lambda text:
|
111 |
)
|
112 |
|
|
|
113 |
reduce_chain = RunnableLambda(
|
114 |
-
llm_chain=lambda doc_summaries:
|
115 |
)
|
116 |
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
summary = reduce_chain.run({"doc_summaries": summaries})
|
121 |
|
122 |
return summary
|
123 |
|
124 |
except Exception as e:
|
125 |
logger.error(f"Error summarizing report: {e}")
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
custom_params = {
|
149 |
-
"temperature": custom_temp,
|
150 |
-
"top_p": custom_top_p,
|
151 |
-
"num_ctx": custom_num_ctx,
|
152 |
-
"repeat_penalty": custom_repeat_penalty
|
153 |
-
}
|
154 |
-
|
155 |
-
uploaded_file = st.sidebar.file_uploader("Upload your Report", type="pdf")
|
156 |
-
|
157 |
-
summarizer = load_llm(model_option)
|
158 |
-
embeddings = load_embeddings()
|
159 |
-
|
160 |
-
if not summarizer or not embeddings:
|
161 |
-
return
|
162 |
-
|
163 |
-
if uploaded_file:
|
164 |
-
with st.spinner("Processing PDF..."):
|
165 |
-
documents = process_pdf(uploaded_file)
|
166 |
-
|
167 |
-
if documents:
|
168 |
-
with st.spinner("Creating vector store..."):
|
169 |
-
db = create_vector_store(documents, embeddings)
|
170 |
-
|
171 |
-
if db and st.button("Summarize"):
|
172 |
-
with st.spinner(f"Generating structured summary using {model_option}..."):
|
173 |
-
summary = summarize_report(documents, summarizer)
|
174 |
-
|
175 |
-
if summary:
|
176 |
-
st.subheader("Structured Summary:")
|
177 |
-
st.markdown(summary)
|
178 |
-
else:
|
179 |
-
st.warning("Failed to generate summary. Please try again.")
|
180 |
|
181 |
if __name__ == "__main__":
|
182 |
-
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import logging
|
3 |
import subprocess
|
4 |
+
import tempfile
|
5 |
from typing import List
|
6 |
+
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
|
7 |
+
from sentence_transformers import SentenceTransformer
|
8 |
+
from langchain.vectorstores import FAISS
|
9 |
+
from langchain.document_loaders import PyPDFLoader
|
|
|
10 |
from langchain.prompts import PromptTemplate
|
11 |
+
from langchain.schema import Document
|
12 |
from langchain.text_splitter import CharacterTextSplitter
|
13 |
+
from langchain.chains import MapReduceDocumentsChain
|
14 |
from langchain.runnables import RunnableMap, RunnableLambda
|
|
|
|
|
15 |
|
16 |
# Set up logging
|
17 |
logging.basicConfig(level=logging.INFO)
|
|
|
20 |
# Constants
DB_FAISS_PATH = 'vectorstore/db_faiss'
# Model name handed to SentenceTransformer by load_embeddings().
EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
# Checkpoint used by main() when no model name is supplied.
DEFAULT_MODEL = "facebook/bart-large-cnn"

# Default model parameters
# Used by load_llm() when the caller supplies no custom parameters.
DEFAULT_PARAMS = dict(
    temperature=0.7,
    max_length=1024,
    num_beams=4,
    top_p=0.95,
    repetition_penalty=1.2,
)
|
33 |
|
34 |
def get_default_value(param_name: str, default: float) -> float:
    """Look up ``param_name`` in DEFAULT_PARAMS and return it as a float.

    A missing key yields ``float(default)``. When the stored value is a
    list, its first element is converted; an empty list yields ``default``
    unchanged.
    """
    stored = DEFAULT_PARAMS.get(param_name, default)
    if not isinstance(stored, list):
        return float(stored)
    if stored:
        return float(stored[0])
    return default
|
38 |
|
|
|
39 |
def load_embeddings():
    """Instantiate the SentenceTransformer named by EMBEDDING_MODEL.

    Nothing is cached in this function: each call constructs a new model
    object. A failure is logged and then re-raised to the caller.
    """
    try:
        embedding_model = SentenceTransformer(EMBEDDING_MODEL)
    except Exception as e:
        logger.error(f"Failed to load embeddings: {e}")
        raise
    return embedding_model
|
|
|
46 |
|
47 |
+
def load_llm(model_name, custom_params=None):
    """Build a Hugging Face summarization pipeline for ``model_name``.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the seq2seq model and its tokenizer.
        custom_params: Keyword arguments forwarded to ``pipeline()``.
            ``None`` selects DEFAULT_PARAMS; any dict that is passed
            explicitly, including an empty one, is used as given.

    Returns:
        The object returned by ``pipeline("summarization", ...)``.

    Raises:
        Exception: Whatever the model, tokenizer or pipeline construction
            raised, re-raised after being logged.
    """
    try:
        # Compare against None rather than relying on truthiness:
        # `custom_params or DEFAULT_PARAMS` would silently replace a
        # deliberately empty dict with the defaults.
        params = DEFAULT_PARAMS if custom_params is None else custom_params
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return pipeline("summarization", model=model, tokenizer=tokenizer, **params)
    except Exception as e:
        logger.error(f"Failed to load LLM: {e}")
        raise
|
|
|
57 |
|
58 |
def process_pdf(file) -> List[Document]:
    """Read a PDF with PyPDFLoader and return the Documents it produces.

    ``file`` is passed to PyPDFLoader as its ``file_path`` argument.
    A failure is logged and then re-raised to the caller.
    """
    try:
        # Load each page as a separate Document
        pages = PyPDFLoader(file_path=file).load()
    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        raise
    return pages
|
|
|
67 |
|
68 |
def create_vector_store(documents: List[Document], embeddings):
|
69 |
"""Create and save the vector store."""
|
|
|
73 |
return db
|
74 |
except Exception as e:
|
75 |
logger.error(f"Error creating vector store: {e}")
|
76 |
+
raise
|
|
|
77 |
|
78 |
+
def summarize_report(documents: List[Document], llm) -> str:
    """Summarize the report using a map-reduce approach.

    Each document is summarized on its own (map step); the per-document
    summaries are then joined and summarized once more (reduce step).

    Args:
        documents: Documents to summarize. Only the first 50 are used; a
            warning is logged when the input is longer.
        llm: Callable that takes a text string and returns a list whose
            first item is a dict with a ``'summary_text'`` key, which is
            the output shape of the Hugging Face summarization pipeline
            built by ``load_llm``. It must accept ``truncation=True``.

    Returns:
        The final summary text, or an empty string when ``documents`` is
        empty.

    Raises:
        Exception: Any error from prompt formatting or the model call,
            re-raised after being logged.
    """
    try:
        if not documents:
            # Nothing to map over; avoid calling the model on an empty prompt.
            logger.warning("No documents to summarize.")
            return ""

        # Limit the number of chunks to process
        max_chunks = 50  # Adjust this value based on your needs
        if len(documents) > max_chunks:
            logger.warning(f"Document is very large. Summarizing first {max_chunks} chunks only.")
            documents = documents[:max_chunks]

        # Map prompt
        map_template = """Summarize the following text:\n\n{text}\n\nSummary:"""
        map_prompt = PromptTemplate.from_template(map_template)

        # Reduce prompt
        reduce_template = """Combine these summaries into a final summary with the following structure:\n\nSummary:\n{doc_summaries}\n\nFinal Summary:"""
        reduce_prompt = PromptTemplate.from_template(reduce_template)

        def _run_llm(prompt_text: str) -> str:
            # The pipeline takes its input positionally. truncation=True
            # keeps over-long prompts (notably the combined reduce prompt)
            # within the model's maximum input length.
            return llm(prompt_text, truncation=True)[0]["summary_text"]

        # Map step: one summary per document
        summaries = [
            _run_llm(map_prompt.format(text=doc.page_content))
            for doc in documents
        ]

        # Reduce step: join as plain text so the prompt does not contain a
        # Python list repr, then summarize the combined text.
        combined = "\n\n".join(summaries)
        summary = _run_llm(reduce_prompt.format(doc_summaries=combined))

        return summary

    except Exception as e:
        logger.error(f"Error summarizing report: {e}")
        raise
|
114 |
+
|
115 |
+
def main(pdf_path: str, model_name: str = DEFAULT_MODEL):
    """Run the full pipeline on one PDF and print the resulting summary.

    Loads the embedding model and the LLM, reads the PDF, builds the
    vector store, summarizes the documents and prints the result. Any
    exception raised along the way is logged here and not propagated.
    """
    try:
        # Load models and embeddings
        embedding_model = load_embeddings()
        summarizer = load_llm(model_name)

        # Process the PDF
        pages = process_pdf(pdf_path)

        # Create vector store (its return value is not used here)
        create_vector_store(pages, embedding_model)

        # Generate summary
        print("Structured Summary:\n", summarize_report(pages, summarizer))
    except Exception as e:
        logger.error(f"Failed to summarize the report: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
|
135 |
if __name__ == "__main__":
    import sys

    # Use the first command-line argument as the PDF path when one is
    # given; otherwise fall back to the original placeholder path.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "path/to/your/report.pdf"  # Replace with the path to your PDF
    main(pdf_path)
|