Spaces:

Kathirsci
/

mistreal

Sleeping

App Files Files Community

Kathirsci committed on Aug 19, 2024

Commit

a5b6905

verified ·

1 Parent(s): 8e690b1

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -106

app.py CHANGED Viewed

@@ -1,19 +1,17 @@
-import streamlit as st
-import tempfile
 import os
 import logging
 import subprocess
 from typing import List
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_huggingface import HuggingFaceEmbeddings
-from langchain_community.vectorstores import FAISS
-from langchain.schema import Document
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.prompts import PromptTemplate
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.runnables import RunnableMap, RunnableLambda
-from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -22,14 +20,15 @@ logger = logging.getLogger(__name__)
 # Constants
 DB_FAISS_PATH = 'vectorstore/db_faiss'
 EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
-DEFAULT_MODEL = "google/flan-t5-large"  # Replace with your preferred Hugging Face model
 # Default model parameters
 DEFAULT_PARAMS = {
     "temperature": 0.7,
-    "top_p": 1.0,
-    "num_ctx": 4096,
-    "repeat_penalty": 1.1,
 }
 def get_default_value(param_name: str, default: float) -> float:
@@ -37,42 +36,34 @@ def get_default_value(param_name: str, default: float) -> float:
     value = DEFAULT_PARAMS.get(param_name, default)
     return float(value) if not isinstance(value, list) else float(value[0]) if value else default
-@st.cache_resource
 def load_embeddings():
     """Load and cache the embedding model."""
     try:
-        return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL, model_kwargs={'device': 'cpu'})
     except Exception as e:
         logger.error(f"Failed to load embeddings: {e}")
-        st.error("Failed to load the embedding model. Please try again later.")
-        return None
-@st.cache_resource
-def load_llm(model_name: str):
-    """Load and cache the Hugging Face model and tokenizer."""
     try:
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-        summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
-        return summarizer
     except Exception as e:
         logger.error(f"Failed to load LLM: {e}")
-        st.error(f"Failed to load the model {model_name}. Please check the model name and try again.")
-        return None
 def process_pdf(file) -> List[Document]:
     try:
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
-            temp_file.write(file.getvalue())
-            temp_file_path = temp_file.name
-        loader = PyPDFLoader(file_path=temp_file_path)
-        documents = loader.load()  # This loads each page as a separate Document
-        os.unlink(temp_file_path)  # Clean up the temporary file
         return documents
     except Exception as e:
         logger.error(f"Error processing PDF: {e}")
-        st.error("Failed to process the PDF. Please make sure it's a valid PDF file.")
-        return []
 def create_vector_store(documents: List[Document], embeddings):
     """Create and save the vector store."""
@@ -82,101 +73,65 @@ def create_vector_store(documents: List[Document], embeddings):
         return db
     except Exception as e:
         logger.error(f"Error creating vector store: {e}")
-        st.error("Failed to create the vector store. Please try again.")
-        return None
-def summarize_report(documents: List[Document], summarizer) -> str:
     """Summarize the report using a map-reduce approach."""
     try:
         # Limit the number of chunks to process
         max_chunks = 50  # Adjust this value based on your needs
         if len(documents) > max_chunks:
-            st.warning(f"Document is very large. Summarizing first {max_chunks} chunks only.")
             documents = documents[:max_chunks]
         # Map prompt
-        def map_fn(text):
-            summary = summarizer(text, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
-            return summary
         # Reduce prompt
-        def reduce_fn(summaries):
-            combined_text = " ".join(summaries)
-            final_summary = summarizer(combined_text, max_length=300, min_length=100, do_sample=False)[0]['summary_text']
-            return final_summary
-        # RunnableSequence replaces the deprecated LLMChain
         map_chain = RunnableMap(
-            llm_chain=lambda text: map_fn(text)
         )
         reduce_chain = RunnableLambda(
-            llm_chain=lambda doc_summaries: reduce_fn(doc_summaries)
         )
-        with st.spinner("Generating summary..."):
-            # Run map-reduce sequence
-            summaries = map_chain.run([doc.page_content for doc in documents])
-            summary = reduce_chain.run({"doc_summaries": summaries})
         return summary
     except Exception as e:
         logger.error(f"Error summarizing report: {e}")
-        st.error("Failed to summarize the report. Please try again.")
-        return ""
-def main():
-    st.title("Report Summarizer ")
-    model_option = st.sidebar.text_input("Enter Hugging Face model name", value=DEFAULT_MODEL)
-    # Advanced options
-    with st.sidebar.expander("Advanced Model Parameters"):
-        custom_temp = st.slider("Temperature", 0.0, 1.0,
-                                 value=get_default_value("temperature", 0.7),
-                                 step=0.01)
-        custom_top_p = st.slider("Top P", 0.0, 1.0,
-                                  value=get_default_value("top_p", 1.0),
-                                  step=0.01)
-        custom_num_ctx = st.number_input("Context Window", 1024, 8192,
-                                          value=int(get_default_value("num_ctx", 4096)))
-        custom_repeat_penalty = st.slider("Repeat Penalty", 1.0, 2.0,
-                                           value=get_default_value("repeat_penalty", 1.1),
-                                           step=0.01)
-    custom_params = {
-        "temperature": custom_temp,
-        "top_p": custom_top_p,
-        "num_ctx": custom_num_ctx,
-        "repeat_penalty": custom_repeat_penalty
-    }
-    uploaded_file = st.sidebar.file_uploader("Upload your Report", type="pdf")
-    summarizer = load_llm(model_option)
-    embeddings = load_embeddings()
-    if not summarizer or not embeddings:
-        return
-    if uploaded_file:
-        with st.spinner("Processing PDF..."):
-            documents = process_pdf(uploaded_file)
-        if documents:
-            with st.spinner("Creating vector store..."):
-                db = create_vector_store(documents, embeddings)
-            if db and st.button("Summarize"):
-                with st.spinner(f"Generating structured summary using {model_option}..."):
-                    summary = summarize_report(documents, summarizer)
-                    if summary:
-                        st.subheader("Structured Summary:")
-                        st.markdown(summary)
-                    else:
-                        st.warning("Failed to generate summary. Please try again.")
 if __name__ == "__main__":
-    main()

 import os
 import logging
 import subprocess
+import tempfile
 from typing import List
+from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
+from sentence_transformers import SentenceTransformer
+from langchain.vectorstores import FAISS
+from langchain.document_loaders import PyPDFLoader
 from langchain.prompts import PromptTemplate
+from langchain.schema import Document
 from langchain.text_splitter import CharacterTextSplitter
+from langchain.chains import MapReduceDocumentsChain
 from langchain.runnables import RunnableMap, RunnableLambda
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 # Constants
 # Presumably the on-disk location of the FAISS index; the code that uses it
 # is not visible in this listing -- confirm against create_vector_store().
 DB_FAISS_PATH = 'vectorstore/db_faiss'
 # Model name handed to SentenceTransformer by load_embeddings().
 EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
 # Model name main() falls back to when the caller does not pass one.
+DEFAULT_MODEL = "facebook/bart-large-cnn"
 # Default model parameters
 # load_llm() forwards this dict as keyword arguments to pipeline() when no
 # custom parameters are supplied; get_default_value() reads single entries.
 DEFAULT_PARAMS = {
     "temperature": 0.7,
+    "max_length": 1024,
+    "num_beams": 4,
+    "top_p": 0.95,
+    "repetition_penalty": 1.2,
 }
 def get_default_value(param_name: str, default: float) -> float:
     """Look up a numeric default for ``param_name`` in ``DEFAULT_PARAMS``.

     Falls back to ``default`` when the key is absent. A list-valued entry
     contributes its first element; an empty list yields ``default`` exactly
     as it was passed in (no float coercion).
     """
     configured = DEFAULT_PARAMS.get(param_name, default)
     if isinstance(configured, list):
         if not configured:
             return default
         return float(configured[0])
     return float(configured)
 def load_embeddings():
     """Instantiate the sentence-embedding model named by ``EMBEDDING_MODEL``.

     Returns the ``SentenceTransformer`` instance. Any failure is logged and
     then re-raised unchanged so the caller decides how to react.
     """
     try:
         encoder = SentenceTransformer(EMBEDDING_MODEL)
     except Exception as e:
         logger.error(f"Failed to load embeddings: {e}")
         raise
     return encoder
def load_llm(model_name, custom_params=None):
    """Build a Hugging Face summarization pipeline for ``model_name``.

    ``custom_params`` is forwarded as keyword arguments to ``pipeline``; when
    it is ``None`` or empty, ``DEFAULT_PARAMS`` is forwarded instead. Any
    failure while loading is logged and re-raised.
    """
    try:
        pipeline_kwargs = custom_params if custom_params else DEFAULT_PARAMS
        seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        seq2seq_tokenizer = AutoTokenizer.from_pretrained(model_name)
        summarizer = pipeline(
            "summarization",
            model=seq2seq_model,
            tokenizer=seq2seq_tokenizer,
            **pipeline_kwargs,
        )
    except Exception as e:
        logger.error(f"Failed to load LLM: {e}")
        raise
    return summarizer
 def process_pdf(file) -> List[Document]:
     """Read a PDF through ``PyPDFLoader`` and return the loaded documents.

     ``file`` is handed to the loader as its ``file_path``. Errors are logged
     and re-raised.
     """
     try:
         # Per the original author's note, the loader yields one Document per page.
         pages = PyPDFLoader(file_path=file).load()
     except Exception as e:
         logger.error(f"Error processing PDF: {e}")
         raise
     return pages
 def create_vector_store(documents: List[Document], embeddings):
     """Create and save the vector store."""
         return db
     except Exception as e:
         logger.error(f"Error creating vector store: {e}")
+        raise
def _summary_text(result) -> str:
    """Pull the summary string out of whatever a summarizer call returned."""
    if isinstance(result, str):
        return result
    if isinstance(result, dict):
        return str(result.get("summary_text", ""))
    if isinstance(result, (list, tuple)):
        # A summarization pipeline answers with a list of dicts.
        return " ".join(_summary_text(item) for item in result)
    return str(result)


def summarize_report(documents: "List[Document]", llm) -> str:
    """Summarize the report using a map-reduce approach.

    Each document is summarized on its own (map); the partial summaries are
    then merged into a single prompt and summarized once more (reduce).

    Args:
        documents: Objects exposing a ``page_content`` string. Only the first
            50 are processed; a warning is logged when more are supplied.
        llm: Callable that takes the prompt string as its positional
            argument, such as the pipeline returned by ``load_llm``.

    Returns:
        The final summary text, or an empty string when ``documents`` is empty.

    Raises:
        Exception: Any error raised while summarizing is logged and re-raised.
    """
    try:
        # Limit the number of chunks to process
        max_chunks = 50  # Adjust this value based on your needs
        if len(documents) > max_chunks:
            logger.warning(f"Document is very large. Summarizing first {max_chunks} chunks only.")
            documents = documents[:max_chunks]
        if not documents:
            # Nothing to summarize; skip the model calls entirely.
            return ""
        # Plain str.format is enough here: each template has one placeholder.
        map_template = """Summarize the following text:\n\n{text}\n\nSummary:"""
        reduce_template = """Combine these summaries into a final summary with the following structure:\n\nSummary:\n{doc_summaries}\n\nFinal Summary:"""
        # Map step: one model call per document.
        partial_summaries = [
            _summary_text(llm(map_template.format(text=doc.page_content)))
            for doc in documents
        ]
        # Reduce step: a single model call over the joined partial summaries.
        reduce_prompt = reduce_template.format(
            doc_summaries="\n".join(partial_summaries)
        )
        return _summary_text(llm(reduce_prompt))
    except Exception as e:
        logger.error(f"Error summarizing report: {e}")
        raise
def main(pdf_path: str, model_name: str = DEFAULT_MODEL):
    """Run the whole pipeline for one PDF and print the resulting summary.

    Steps, in order: load the embedding model, load the summarizer, read the
    PDF, build the vector store, summarize. An exception from any step is
    logged and suppressed, so nothing propagates to the caller.
    """
    try:
        embedder = load_embeddings()
        summarizer = load_llm(model_name)
        pages = process_pdf(pdf_path)
        # The returned store is not used below; the call is kept for its effect.
        create_vector_store(pages, embedder)
        report_summary = summarize_report(pages, summarizer)
        print("Structured Summary:\n", report_summary)
    except Exception as e:
        logger.error(f"Failed to summarize the report: {e}")
 if __name__ == "__main__":
     # Take the PDF path from the first command-line argument when one is
     # given; otherwise fall back to the placeholder path below.
     import sys

     pdf_path = sys.argv[1] if len(sys.argv) > 1 else "path/to/your/report.pdf"  # Replace with the path to your PDF
     main(pdf_path)