Spaces:
Sleeping
Sleeping
File size: 6,990 Bytes
1e615dd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
import streamlit as st
import tempfile
import os
import logging
import subprocess
from typing import List
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.runnables import RunnableMap, RunnableLambda
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
# --- Logging ---------------------------------------------------------------
# Configure logging once at import time; the rest of the module logs through
# the module-level `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Paths and model identifiers -------------------------------------------
DB_FAISS_PATH = "vectorstore/db_faiss"  # where the FAISS index is saved
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEFAULT_MODEL = "google/flan-t5-large"  # Replace with your preferred Hugging Face model

# --- Default model parameters ----------------------------------------------
# Initial values for the "Advanced Model Parameters" widgets in the sidebar.
DEFAULT_PARAMS = dict(
    temperature=0.7,
    top_p=1.0,
    num_ctx=4096,
    repeat_penalty=1.1,
)
def get_default_value(param_name: str, default: float) -> float:
    """Return the configured default for ``param_name`` as a float.

    The value is looked up in ``DEFAULT_PARAMS``. If the entry is a list or
    tuple, its first element is used; an empty sequence yields ``default``.
    A value that cannot be converted with ``float()`` (for example ``None``
    or a non-numeric string) also yields ``default`` instead of raising.

    Args:
        param_name: Key to look up in ``DEFAULT_PARAMS``.
        default: Fallback used when the key is missing or unusable.

    Returns:
        The parameter value as a float, or ``default``.
    """
    value = DEFAULT_PARAMS.get(param_name, default)

    # A sequence entry contributes its first element only.
    if isinstance(value, (list, tuple)):
        if not value:
            return default
        value = value[0]

    try:
        return float(value)
    except (TypeError, ValueError):
        # Fall back rather than crash the UI on a malformed config entry.
        return default
@st.cache_resource
def _load_embeddings_cached():
    """Build the embedding model; raises on failure so errors are never cached."""
    return HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={"device": "cpu"},
    )


def load_embeddings():
    """Load and cache the embedding model.

    Only a successfully built model is cached. The error handling lives
    outside the cached helper: ``st.cache_resource`` stores whatever the
    decorated function returns, so returning ``None`` from inside it would
    pin a transient failure until the cache is cleared.

    Returns:
        The ``HuggingFaceEmbeddings`` instance, or ``None`` if loading failed
        (an error is logged and shown in the UI in that case).
    """
    try:
        return _load_embeddings_cached()
    except Exception as e:
        logger.exception("Failed to load embeddings: %s", e)
        st.error("Failed to load the embedding model. Please try again later.")
        return None


# Keep the cache-clearing hook that the decorated function used to expose.
load_embeddings.clear = _load_embeddings_cached.clear
@st.cache_resource
def _load_llm_cached(model_name: str):
    """Build the summarization pipeline; raises on failure so errors are never cached."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return pipeline("summarization", model=model, tokenizer=tokenizer)


def load_llm(model_name: str):
    """Load and cache the Hugging Face model and tokenizer.

    Only a successfully built pipeline is cached (keyed by ``model_name``).
    The error handling lives outside the cached helper: ``st.cache_resource``
    stores whatever the decorated function returns, so returning ``None``
    from inside it would pin a failed load for that model name until the
    cache is cleared.

    Args:
        model_name: Hugging Face model identifier of a seq2seq model.

    Returns:
        A ``transformers`` summarization pipeline, or ``None`` if loading
        failed (an error is logged and shown in the UI in that case).
    """
    try:
        return _load_llm_cached(model_name)
    except Exception as e:
        logger.exception("Failed to load LLM: %s", e)
        st.error(f"Failed to load the model {model_name}. Please check the model name and try again.")
        return None


# Keep the cache-clearing hook that the decorated function used to expose.
load_llm.clear = _load_llm_cached.clear
def process_pdf(file) -> List[Document]:
    """Load an uploaded PDF into one ``Document`` per page.

    The upload is written to a temporary ``.pdf`` file because
    ``PyPDFLoader`` is given a file path. The temporary file is removed in
    a ``finally`` block, so it is cleaned up even if writing or parsing fails.

    Args:
        file: Uploaded file object exposing ``getvalue()`` that returns the
            PDF bytes.

    Returns:
        The loaded documents, or an empty list if processing failed (an
        error is logged and shown in the UI in that case).
    """
    temp_file_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            # Record the path before writing so a failed write is still cleaned up.
            temp_file_path = temp_file.name
            temp_file.write(file.getvalue())
        # Parse after the handle is closed so the loader can reopen the file.
        loader = PyPDFLoader(file_path=temp_file_path)
        return loader.load()  # This loads each page as a separate Document
    except Exception as e:
        logger.exception("Error processing PDF: %s", e)
        st.error("Failed to process the PDF. Please make sure it's a valid PDF file.")
        return []
    finally:
        if temp_file_path is not None:
            try:
                os.unlink(temp_file_path)  # Clean up the temporary file
            except OSError as cleanup_error:
                # A failed cleanup should not discard documents already loaded.
                logger.warning("Could not remove temporary file %s: %s", temp_file_path, cleanup_error)
def create_vector_store(documents: List[Document], embeddings):
    """Build a FAISS index from ``documents`` and save it to ``DB_FAISS_PATH``.

    Args:
        documents: Documents to index.
        embeddings: Embedding model passed to ``FAISS.from_documents``.

    Returns:
        The FAISS vector store, or ``None`` if building or saving failed
        (an error is logged and shown in the UI in that case).
    """
    try:
        vector_store = FAISS.from_documents(documents, embeddings)
        vector_store.save_local(DB_FAISS_PATH)
    except Exception as exc:
        logger.error(f"Error creating vector store: {exc}")
        st.error("Failed to create the vector store. Please try again.")
        return None
    return vector_store
def summarize_report(documents: List[Document], summarizer) -> str:
    """Summarize the report using a map-reduce approach.

    Map step: each document's ``page_content`` is summarized on its own.
    Reduce step: the partial summaries are joined with spaces and summarized
    once more to produce the final text.

    The steps call ``summarizer`` directly. The previous version wrapped them
    in LangChain runnables constructed with unsupported arguments and invoked
    through a ``.run`` method, so it always ended in the error path.

    Args:
        documents: Documents to summarize; only the first 50 are used.
        summarizer: Callable with the ``transformers`` summarization pipeline
            interface, returning ``[{"summary_text": ...}]``.

    Returns:
        The final summary, or ``""`` if there is no text to summarize or
        summarization failed (an error is logged and shown in the UI).
    """
    max_chunks = 50  # Adjust this value based on your needs

    def summarize_text(text: str, max_length: int, min_length: int) -> str:
        # truncation=True keeps inputs longer than the model's limit (e.g. the
        # joined partial summaries) from raising inside the pipeline.
        result = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )
        return result[0]["summary_text"]

    try:
        # Limit the number of chunks to process
        if len(documents) > max_chunks:
            st.warning(f"Document is very large. Summarizing first {max_chunks} chunks only.")
            documents = documents[:max_chunks]

        # Blank pages (e.g. scanned images without text) have nothing to summarize.
        texts = [
            doc.page_content
            for doc in documents
            if doc.page_content and doc.page_content.strip()
        ]
        if not texts:
            return ""

        with st.spinner("Generating summary..."):
            # Map: one partial summary per page.
            partial_summaries = [summarize_text(text, 150, 40) for text in texts]
            # Reduce: condense the partial summaries into the final summary.
            return summarize_text(" ".join(partial_summaries), 300, 100)
    except Exception as e:
        logger.exception("Error summarizing report: %s", e)
        st.error("Failed to summarize the report. Please try again.")
        return ""
def main():
    """Render the Streamlit UI and run the upload -> index -> summarize flow.

    Draws the sidebar controls (model name, advanced parameters, PDF
    uploader), loads the summarizer and embedding model, and, once a PDF is
    uploaded and the "Summarize" button is pressed, shows the summary.
    Returns early whenever a prerequisite step yields nothing.
    """
    st.title("Report Summarizer ")
    model_option = st.sidebar.text_input("Enter Hugging Face model name", value=DEFAULT_MODEL)

    # Advanced options
    with st.sidebar.expander("Advanced Model Parameters"):
        temperature = st.slider(
            "Temperature", 0.0, 1.0,
            value=get_default_value("temperature", 0.7),
            step=0.01,
        )
        top_p = st.slider(
            "Top P", 0.0, 1.0,
            value=get_default_value("top_p", 1.0),
            step=0.01,
        )
        num_ctx = st.number_input(
            "Context Window", 1024, 8192,
            value=int(get_default_value("num_ctx", 4096)),
        )
        repeat_penalty = st.slider(
            "Repeat Penalty", 1.0, 2.0,
            value=get_default_value("repeat_penalty", 1.1),
            step=0.01,
        )

    # NOTE(review): these values are collected but not read again in this
    # function, so the widgets currently do not influence the summary.
    custom_params = {
        "temperature": temperature,
        "top_p": top_p,
        "num_ctx": num_ctx,
        "repeat_penalty": repeat_penalty,
    }

    uploaded_file = st.sidebar.file_uploader("Upload your Report", type="pdf")

    summarizer = load_llm(model_option)
    embeddings = load_embeddings()
    if not (summarizer and embeddings):
        return

    if not uploaded_file:
        return

    with st.spinner("Processing PDF..."):
        documents = process_pdf(uploaded_file)
    if not documents:
        return

    with st.spinner("Creating vector store..."):
        db = create_vector_store(documents, embeddings)

    # The button is only rendered once the vector store exists.
    if not db or not st.button("Summarize"):
        return

    with st.spinner(f"Generating structured summary using {model_option}..."):
        summary = summarize_report(documents, summarizer)

    if not summary:
        st.warning("Failed to generate summary. Please try again.")
        return

    st.subheader("Structured Summary:")
    st.markdown(summary)


if __name__ == "__main__":
    main()
|