File size: 6,990 Bytes
1e615dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import streamlit as st
import tempfile
import os
import logging
import subprocess
from typing import List
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.runnables import RunnableMap, RunnableLambda
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer


# Logging: configure the root logger at INFO and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# On-disk location of the FAISS index and the default model identifiers.
DB_FAISS_PATH = 'vectorstore/db_faiss'
EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
DEFAULT_MODEL = "google/flan-t5-large"  # Replace with your preferred Hugging Face model

# Starting values for the "Advanced Model Parameters" sidebar widgets.
DEFAULT_PARAMS = dict(
    temperature=0.7,
    top_p=1.0,
    num_ctx=4096,
    repeat_penalty=1.1,
)

def get_default_value(param_name: str, default: float) -> float:
    """Look up ``param_name`` in DEFAULT_PARAMS and coerce it to a float.

    A missing key falls back to ``default``, which is then converted like any
    other value. A list-valued entry contributes its first element; an empty
    list yields ``default`` exactly as passed in (no conversion).
    """
    configured = DEFAULT_PARAMS.get(param_name, default)
    if isinstance(configured, list):
        if not configured:
            return default
        return float(configured[0])
    return float(configured)

@st.cache_resource
def _load_embeddings_cached():
    """Build the embedding model; raises on failure so no failure is cached."""
    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL, model_kwargs={'device': 'cpu'})


def load_embeddings():
    """Return the cached embedding model, or None if it cannot be loaded.

    Only the successful construction is cached. The error handling lives
    outside the cached helper on purpose: a cached function that returns None
    from its ``except`` branch would have that None stored by
    ``st.cache_resource``, so later calls would never attempt the load again
    despite the "try again later" message.

    Returns:
        The HuggingFaceEmbeddings instance for EMBEDDING_MODEL (CPU), or None
        after logging the error and reporting it in the Streamlit UI.
    """
    try:
        return _load_embeddings_cached()
    except Exception as e:
        logger.error(f"Failed to load embeddings: {e}")
        st.error("Failed to load the embedding model. Please try again later.")
        return None

@st.cache_resource
def _load_llm_cached(model_name: str):
    """Build the summarization pipeline; raises on failure so no failure is cached."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return pipeline("summarization", model=model, tokenizer=tokenizer)


def load_llm(model_name: str):
    """Return a cached summarization pipeline for ``model_name``, or None.

    The tokenizer and seq2seq model are loaded once per model name and reused.
    Error handling is kept outside the cached helper: returning None from
    inside a ``st.cache_resource`` function would store that None for the
    given model name, so a transient failure (for example a network error
    during download) could not be retried.

    Args:
        model_name: Hugging Face model identifier to load.

    Returns:
        The summarization pipeline, or None after logging the error and
        reporting it in the Streamlit UI.
    """
    try:
        return _load_llm_cached(model_name)
    except Exception as e:
        logger.error(f"Failed to load LLM: {e}")
        st.error(f"Failed to load the model {model_name}. Please check the model name and try again.")
        return None

def process_pdf(file) -> List[Document]:
    """Load an uploaded PDF into a list of Documents, one per page.

    The upload is written to a temporary ``.pdf`` file because the loader is
    given a file path. The temporary file is removed in ``finally`` so it does
    not leak when writing or parsing fails.

    Args:
        file: Uploaded file object exposing ``getvalue()`` that returns the
            raw PDF bytes.

    Returns:
        The loaded Documents, or an empty list if anything goes wrong (the
        error is logged and shown in the Streamlit UI).
    """
    temp_file_path = None
    try:
        # delete=False: the file must outlive the ``with`` block so the loader
        # can open it by path after it has been closed.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            # Record the path before writing so a failed write is still cleaned up.
            temp_file_path = temp_file.name
            temp_file.write(file.getvalue())
        loader = PyPDFLoader(file_path=temp_file_path)
        return loader.load()  # This loads each page as a separate Document
    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        st.error("Failed to process the PDF. Please make sure it's a valid PDF file.")
        return []
    finally:
        if temp_file_path is not None:
            try:
                os.unlink(temp_file_path)
            except OSError as cleanup_error:
                # Best effort: a leftover temp file must not mask the result.
                logger.warning(f"Could not remove temporary file {temp_file_path}: {cleanup_error}")

def create_vector_store(documents: List[Document], embeddings):
    """Embed ``documents`` into a FAISS index and persist it at DB_FAISS_PATH.

    Returns the index on success. Any exception is logged, reported in the
    Streamlit UI, and turned into a None result.
    """
    try:
        vector_store = FAISS.from_documents(documents, embeddings)
        vector_store.save_local(DB_FAISS_PATH)
    except Exception as err:
        logger.error(f"Error creating vector store: {err}")
        st.error("Failed to create the vector store. Please try again.")
        return None
    return vector_store

def summarize_report(documents: List[Document], summarizer) -> str:
    """Summarize the report using a map-reduce approach.

    Map step: each document's text is summarized on its own. Reduce step: the
    partial summaries are joined and summarized once more into the final text.
    The summarizer is called directly; no chain wrapper objects are involved.

    Args:
        documents: Documents whose ``page_content`` is summarized. Only the
            first 50 are used; a warning is shown when the input is longer.
        summarizer: Callable taking a text plus generation keyword arguments
            and returning a list whose first item has a ``'summary_text'`` key
            (the shape produced by the summarization pipeline built in this
            file).

    Returns:
        The final summary, or an empty string when there is no text to
        summarize or when summarization fails (the failure is logged and shown
        in the Streamlit UI).
    """
    try:
        # Limit the number of chunks to process
        max_chunks = 50  # Adjust this value based on your needs
        if len(documents) > max_chunks:
            st.warning(f"Document is very large. Summarizing first {max_chunks} chunks only.")
            documents = documents[:max_chunks]

        # Skip documents with no text (e.g. blank pages): there is nothing to
        # summarize in them.
        texts = [
            doc.page_content
            for doc in documents
            if doc.page_content and doc.page_content.strip()
        ]
        if not texts:
            st.warning("No extractable text was found in the document.")
            return ""

        def run_summarizer(text: str, max_length: int, min_length: int) -> str:
            # truncation=True asks the pipeline to cut over-long inputs instead
            # of passing more tokens than the model accepts.
            result = summarizer(
                text,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                truncation=True,
            )
            return result[0]['summary_text']

        with st.spinner("Generating summary..."):
            # Map: one short summary per document.
            partial_summaries = [run_summarizer(text, 150, 40) for text in texts]
            # Reduce: condense the concatenated partial summaries.
            combined_text = " ".join(partial_summaries)
            summary = run_summarizer(combined_text, 300, 100)

        return summary

    except Exception as e:
        logger.error(f"Error summarizing report: {e}")
        st.error("Failed to summarize the report. Please try again.")
        return ""
        
def main():
    """Render the Streamlit UI and drive the upload -> index -> summarize flow.

    Sidebar: model name input, advanced parameter widgets, and a PDF uploader.
    Main area: once a PDF is uploaded and processed, a "Summarize" button
    produces and displays the summary. Returns early if either the
    summarization pipeline or the embedding model failed to load.
    """
    st.title("Report Summarizer ")
    
    model_option = st.sidebar.text_input("Enter Hugging Face model name", value=DEFAULT_MODEL)

    # Advanced options: widgets are seeded from DEFAULT_PARAMS via get_default_value.
    with st.sidebar.expander("Advanced Model Parameters"):
        custom_temp = st.slider("Temperature", 0.0, 1.0,
                                 value=get_default_value("temperature", 0.7),
                                 step=0.01)
        custom_top_p = st.slider("Top P", 0.0, 1.0,
                                  value=get_default_value("top_p", 1.0),
                                  step=0.01)
        custom_num_ctx = st.number_input("Context Window", 1024, 8192,
                                          value=int(get_default_value("num_ctx", 4096)))
        custom_repeat_penalty = st.slider("Repeat Penalty", 1.0, 2.0,
                                           value=get_default_value("repeat_penalty", 1.1),
                                           step=0.01)

    # NOTE(review): custom_params is assembled here but never read again in
    # this function, so the widget values do not reach load_llm or
    # summarize_report as written.
    custom_params = {
        "temperature": custom_temp,
        "top_p": custom_top_p,
        "num_ctx": custom_num_ctx,
        "repeat_penalty": custom_repeat_penalty
    }

    uploaded_file = st.sidebar.file_uploader("Upload your Report", type="pdf")

    # Both loaders return None on failure (after reporting the error).
    summarizer = load_llm(model_option)
    embeddings = load_embeddings()

    if not summarizer or not embeddings:
        return

    if uploaded_file:
        with st.spinner("Processing PDF..."):
            documents = process_pdf(uploaded_file)

        # process_pdf returns [] on failure, which skips everything below.
        if documents:
            with st.spinner("Creating vector store..."):
                db = create_vector_store(documents, embeddings)

            # db only gates the button here; the summary is built from
            # `documents`, and db itself is not queried in this function.
            if db and st.button("Summarize"):
                with st.spinner(f"Generating structured summary using {model_option}..."):
                    summary = summarize_report(documents, summarizer)

                    # summarize_report returns "" when it could not produce a summary.
                    if summary:
                        st.subheader("Structured Summary:")
                        st.markdown(summary)
                    else:
                        st.warning("Failed to generate summary. Please try again.")

# Run the app only when this file is executed as the main module.
if __name__ == "__main__":
    main()