Spaces:

aaporosh
/

SmartPDF_Q_A

Running

App Files Community

aaporosh committed 5 days ago

Commit

afc3005

verified ·

1 Parent(s): 7c6674a

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -53

app.py CHANGED Viewed

@@ -3,12 +3,17 @@ import logging
 import os
 from io import BytesIO
 import pdfplumber
 from langchain.text_splitter import CharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
 from datasets import load_dataset
 import re
 # Set up logging for Spaces
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -34,7 +39,7 @@ def load_qa_pipeline():
             fine_tuned_pipeline = fine_tune_qa_model(dataset)
             if fine_tuned_pipeline:
                 return fine_tuned_pipeline
-        return pipeline("text2text-generation", model="google/flan-t5-small", max_length=300)
     except Exception as e:
         logger.error(f"QA model load error: {str(e)}")
         st.error(f"QA model error: {str(e)}")
@@ -51,19 +56,19 @@ def load_summary_pipeline():
         return None
 # Load and prepare dataset (e.g., SQuAD)
-@st.cache_resource(ttl=3600)
 def load_and_prepare_dataset(dataset_name="squad", max_samples=1000):
     logger.info(f"Loading dataset: {dataset_name}")
     try:
-        dataset = load_dataset(dataset_name, split="train")
-        dataset = dataset.shuffle(seed=42).select(range(max_samples))
         def preprocess(examples):
             inputs = [f"question: {q} context: {c}" for q, c in zip(examples['question'], examples['context'])]
             targets = examples['answers']['text']
             return {'input_text': inputs, 'target_text': [t[0] if t else "" for t in targets]}
-        dataset = dataset.map(preprocess, batched=True)
         return dataset
     except Exception as e:
         logger.error(f"Dataset load error: {str(e)}")
@@ -74,7 +79,7 @@ def load_and_prepare_dataset(dataset_name="squad", max_samples=1000):
 def fine_tune_qa_model(dataset):
     logger.info("Starting fine-tuning")
     try:
-        model_name = "google/flan-t5-small"
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
@@ -84,17 +89,17 @@ def fine_tune_qa_model(dataset):
             model_inputs["labels"] = labels["input_ids"]
             return model_inputs
-        tokenized_dataset = dataset.map(tokenize_function, batched=True)
         training_args = TrainingArguments(
             output_dir="./fine_tuned_model",
-            num_train_epochs=1,
             per_device_train_batch_size=4,
             save_steps=500,
             logging_steps=100,
             evaluation_strategy="no",
-            learning_rate=5e-5,
-            fp16=False,
         )
         trainer = Trainer(
@@ -116,18 +121,18 @@ def fine_tune_qa_model(dataset):
 def augment_vector_store(vector_store, dataset_name="squad", max_samples=500):
     logger.info(f"Augmenting vector store with dataset: {dataset_name}")
     try:
-        dataset = load_dataset(dataset_name, split="train").select(range(max_samples))
         chunks = [f"Context: {c}\nAnswer: {a['text'][0]}" for c, a in zip(dataset['context'], dataset['answers'])]
         embeddings_model = load_embeddings_model()
         if embeddings_model and vector_store:
-            embeddings = embeddings_model.encode(chunks)
             vector_store.add_embeddings(zip(chunks, embeddings))
         return vector_store
     except Exception as e:
         logger.error(f"Vector store augmentation error: {str(e)}")
         return vector_store
-# Process PDF with enhanced extraction
 def process_pdf(uploaded_file):
     logger.info("Processing PDF with enhanced extraction")
     try:
@@ -136,6 +141,12 @@ def process_pdf(uploaded_file):
         with pdfplumber.open(BytesIO(uploaded_file.getvalue())) as pdf:
             for page in pdf.pages[:20]:
                 extracted = page.extract_text(layout=False)
                 if extracted:
                     text += extracted + "\n"
                 for char in page.chars:
@@ -157,35 +168,34 @@ def process_pdf(uploaded_file):
         if not text:
             raise ValueError("No text extracted from PDF")
-        text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=500, chunk_overlap=100, keep_separator=True)
-        text_chunks = text_splitter.split_text(text)[:50]
-        code_chunks = text_splitter.split_text(code_text)[:25] if code_text else []
         embeddings_model = load_embeddings_model()
         if not embeddings_model:
             return None, None, text, code_text
         text_vector_store = FAISS.from_embeddings(
-            zip(text_chunks, [embeddings_model.encode(chunk) for chunk in text_chunks]),
             embeddings_model.encode
         ) if text_chunks else None
         code_vector_store = FAISS.from_embeddings(
-            zip(code_chunks, [embeddings_model.encode(chunk) for chunk in code_chunks]),
             embeddings_model.encode
         ) if code_chunks else None
-        # Augment text vector store with dataset
         if text_vector_store:
             text_vector_store = augment_vector_store(text_vector_store)
-        logger.info("PDF processed successfully with enhanced extraction")
         return text_vector_store, code_vector_store, text, code_text
     except Exception as e:
         logger.error(f"PDF processing error: {str(e)}")
         st.error(f"PDF error: {str(e)}")
         return None, None, "", ""
-# Summarize PDF
 def summarize_pdf(text):
     logger.info("Generating summary")
     try:
@@ -193,24 +203,28 @@ def summarize_pdf(text):
         if not summary_pipeline:
             return "Summary model unavailable."
-        text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=500, chunk_overlap=50)
         chunks = text_splitter.split_text(text)[:2]
         summaries = []
         for chunk in chunks:
-            summary = summary_pipeline(chunk[:500], max_length=100, min_length=30, do_sample=False)[0]['summary_text']
             summaries.append(summary.strip())
         combined_summary = " ".join(summaries)
         if len(combined_summary.split()) > 150:
             combined_summary = " ".join(combined_summary.split()[:150])
-        logger.info("Summary generated")
-        return f"Sure, here's a concise summary of the PDF:\n{combined_summary}"
     except Exception as e:
         logger.error(f"Summary error: {str(e)}")
         return f"Oops, something went wrong summarizing: {str(e)}"
-# Answer question with improved response
 def answer_question(text_vector_store, code_vector_store, query):
     logger.info(f"Processing query: {query}")
     try:
@@ -223,18 +237,27 @@ def answer_question(text_vector_store, code_vector_store, query):
         is_code_query = any(keyword in query.lower() for keyword in ["code", "script", "function", "programming", "give me code", "show code"])
         if is_code_query and code_vector_store:
-            return f"Here's the code from the PDF:\n```python\n{st.session_state.code_text}\n```"
         vector_store = text_vector_store
         if not vector_store:
             return "No relevant content found for your query."
-        docs = vector_store.similarity_search(query, k=5)
-        context = "\n".join(doc.page_content for doc in docs)
-        prompt = f"Context: {context}\nQuestion: {query}\nProvide a detailed, accurate answer based on the context, prioritizing relevant information. Respond as a helpful assistant:"
         response = qa_pipeline(prompt)[0]['generated_text']
         logger.info("Answer generated")
-        return f"Got it! Here's a detailed answer:\n{response.strip()}"
     except Exception as e:
         logger.error(f"Query error: {str(e)}")
         return f"Sorry, something went wrong: {str(e)}"
@@ -245,22 +268,24 @@ try:
     st.markdown("""
         <style>
         .main { max-width: 900px; margin: 0 auto; padding: 20px; }
-        .sidebar { background-color: #f8f9fa; padding: 10px; border-radius: 5px; }
-        .chat-container { border: 1px solid #ddd; border-radius: 10px; padding: 10px; height: 60vh; overflow-y: auto; margin-top: 20px; }
-        .stChatMessage { border-radius: 10px; padding: 10px; margin: 5px; max-width: 70%; }
-        .user { background-color: #e6f3ff; align-self: flex-end; }
-        .assistant { background-color: #f0f0f0; }
-        .dark .user { background-color: #2a2a72; color: #fff; }
-        .dark .assistant { background-color: #2e2e2e; color: #fff; }
-        .stButton>button { background-color: #4CAF50; color: white; border: none; padding: 8px 16px; border-radius: 5px; }
-        .stButton>button:hover { background-color: #45a049; }
-        pre { background-color: #f8f8f8; padding: 10px; border-radius: 5px; overflow-x: auto; }
-        .header { background: linear-gradient(90deg, #4CAF50, #81C784); color: white; padding: 10px; border-radius: 5px; text-align: center; }
         </style>
     """, unsafe_allow_html=True)
     st.markdown('<div class="header"><h1>Smart PDF Q&A</h1></div>', unsafe_allow_html=True)
-    st.markdown("Upload a PDF to ask questions, summarize (~150 words), or extract code with 'give me code'. Fast and friendly responses!")
     # Initialize session state
     if "messages" not in st.session_state:
@@ -274,20 +299,26 @@ try:
     if "code_text" not in st.session_state:
         st.session_state.code_text = ""
-    # Sidebar with toggle and dataset options
     with st.sidebar:
         st.markdown('<div class="sidebar">', unsafe_allow_html=True)
         theme = st.radio("Theme", ["Light", "Dark"], index=0)
         dataset_name = st.selectbox("Select Dataset for Fine-Tuning", ["squad", "cnn_dailymail", "bigcode/the-stack"], index=0)
         if st.button("Fine-Tune Model"):
-            with st.spinner("Fine-tuning model..."):
-                dataset = load_and_prepare_dataset(dataset_name=dataset_name)
-                if dataset:
-                    fine_tuned_pipeline = fine_tune_qa_model(dataset)
-                    if fine_tuned_pipeline:
-                        st.success("Model fine-tuned successfully!")
-                    else:
-                        st.error("Fine-tuning failed.")
         st.markdown('</div>', unsafe_allow_html=True)
     # PDF upload and processing
@@ -295,7 +326,11 @@ try:
     col1, col2 = st.columns([1, 1])
     with col1:
         if st.button("Process PDF"):
             with st.spinner("Processing PDF..."):
                 st.session_state.text_vector_store, st.session_state.code_vector_store, st.session_state.pdf_text, st.session_state.code_text = process_pdf(uploaded_file)
                 if st.session_state.text_vector_store or st.session_state.code_vector_store:
                     st.success("PDF processed! Ask away or summarize.")
@@ -304,7 +339,11 @@ try:
                     st.error("Failed to process PDF.")
     with col2:
         if st.button("Summarize PDF") and st.session_state.pdf_text:
             with st.spinner("Summarizing..."):
                 summary = summarize_pdf(st.session_state.pdf_text)
                 st.session_state.messages.append({"role": "assistant", "content": summary})
                 st.markdown(summary, unsafe_allow_html=True)
@@ -318,7 +357,11 @@ try:
             with st.chat_message("user"):
                 st.markdown(prompt)
             with st.chat_message("assistant"):
-                with st.spinner('<div class="spinner">⏳</div>'):
                     answer = answer_question(st.session_state.text_vector_store, st.session_state.code_vector_store, prompt)
                 st.markdown(answer, unsafe_allow_html=True)
             st.session_state.messages.append({"role": "assistant", "content": answer})

 import os
 from io import BytesIO
 import pdfplumber
+from PIL import Image
+import pytesseract
 from langchain.text_splitter import CharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
 from datasets import load_dataset
+from rank_bm25 import BM25Okapi
+from rouge_score import rouge_scorer
 import re
+import time
 # Set up logging for Spaces
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
             fine_tuned_pipeline = fine_tune_qa_model(dataset)
             if fine_tuned_pipeline:
                 return fine_tuned_pipeline
+        return pipeline("text2text-generation", model="google/flan-t5-base", max_length=300)
     except Exception as e:
         logger.error(f"QA model load error: {str(e)}")
         st.error(f"QA model error: {str(e)}")
         return None
 # Load and prepare dataset (e.g., SQuAD)
+@st.cache_data(ttl=3600)
 def load_and_prepare_dataset(dataset_name="squad", max_samples=1000):
     logger.info(f"Loading dataset: {dataset_name}")
     try:
+        dataset = load_dataset(dataset_name, split="train[:80%]")
+        dataset = dataset.shuffle(seed=42).select(range(min(max_samples, len(dataset))))
         def preprocess(examples):
             inputs = [f"question: {q} context: {c}" for q, c in zip(examples['question'], examples['context'])]
             targets = examples['answers']['text']
             return {'input_text': inputs, 'target_text': [t[0] if t else "" for t in targets]}
+        dataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)
         return dataset
     except Exception as e:
         logger.error(f"Dataset load error: {str(e)}")
 def fine_tune_qa_model(dataset):
     logger.info("Starting fine-tuning")
     try:
+        model_name = "google/flan-t5-base"
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
             model_inputs["labels"] = labels["input_ids"]
             return model_inputs
+        tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['input_text', 'target_text'])
         training_args = TrainingArguments(
             output_dir="./fine_tuned_model",
+            num_train_epochs=2,
             per_device_train_batch_size=4,
             save_steps=500,
             logging_steps=100,
             evaluation_strategy="no",
+            learning_rate=3e-5,
+            fp16=False,  # Set True if GPU available
         )
         trainer = Trainer(
 def augment_vector_store(vector_store, dataset_name="squad", max_samples=500):
     """Add context/answer examples from a dataset to an existing vector store.

     Loads the ``train`` split of ``dataset_name``, keeps at most
     ``max_samples`` rows, formats each row as ``"Context: ...\\nAnswer: ..."``,
     embeds the resulting strings with the model returned by
     ``load_embeddings_model()`` and appends them to ``vector_store`` through
     its ``add_embeddings`` method.

     Args:
         vector_store: Store to extend; a falsy value is returned as-is.
         dataset_name: Name passed to ``load_dataset``. Rows are read through
             their ``context`` and ``answers`` columns.
         max_samples: Upper bound on the number of rows taken from the split.

     Returns:
         The ``vector_store`` object that was passed in. Any exception is
         logged and the same object is returned, so augmentation is
         best-effort and never raises to the caller.
     """
     logger.info(f"Augmenting vector store with dataset: {dataset_name}")
     try:
         # Nothing to augment, or nothing to embed with: skip the dataset load.
         if not vector_store:
             return vector_store
         embeddings_model = load_embeddings_model()
         if not embeddings_model:
             return vector_store
         # Bind the split first; the previous one-liner read len(dataset) inside
         # the expression that assigned it, which raised UnboundLocalError and
         # made every augmentation attempt fail silently.
         dataset = load_dataset(dataset_name, split="train")
         dataset = dataset.select(range(min(max_samples, len(dataset))))
         # Rows without answers get an empty answer instead of an IndexError,
         # matching how preprocess() in load_and_prepare_dataset handles them.
         chunks = [
             f"Context: {c}\nAnswer: {a['text'][0] if a['text'] else ''}"
             for c, a in zip(dataset['context'], dataset['answers'])
         ]
         if chunks:
             embeddings = embeddings_model.encode(chunks, batch_size=32, show_progress_bar=False)
             vector_store.add_embeddings(zip(chunks, embeddings))
         return vector_store
     except Exception as e:
         logger.error(f"Vector store augmentation error: {str(e)}")
         return vector_store
+# Process PDF with enhanced extraction and OCR fallback
 def process_pdf(uploaded_file):
     logger.info("Processing PDF with enhanced extraction")
     try:
         with pdfplumber.open(BytesIO(uploaded_file.getvalue())) as pdf:
             for page in pdf.pages[:20]:
                 extracted = page.extract_text(layout=False)
+                if not extracted:  # OCR fallback for scanned PDFs
+                    try:
+                        img = page.to_image(resolution=150).original
+                        extracted = pytesseract.image_to_string(img, config='--psm 6')
+                    except Exception as ocr_e:
+                        logger.warning(f"OCR failed: {str(ocr_e)}")
                 if extracted:
                     text += extracted + "\n"
                 for char in page.chars:
         if not text:
             raise ValueError("No text extracted from PDF")
+        text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=400, chunk_overlap=80, keep_separator=True)
+        text_chunks = text_splitter.split_text(text)[:80]
+        code_chunks = text_splitter.split_text(code_text)[:40] if code_text else []
         embeddings_model = load_embeddings_model()
         if not embeddings_model:
             return None, None, text, code_text
         text_vector_store = FAISS.from_embeddings(
+            zip(text_chunks, [embeddings_model.encode(chunk, show_progress_bar=False) for chunk in text_chunks]),
             embeddings_model.encode
         ) if text_chunks else None
         code_vector_store = FAISS.from_embeddings(
+            zip(code_chunks, [embeddings_model.encode(chunk, show_progress_bar=False) for chunk in code_chunks]),
             embeddings_model.encode
         ) if code_chunks else None
         if text_vector_store:
             text_vector_store = augment_vector_store(text_vector_store)
+        logger.info("PDF processed successfully")
         return text_vector_store, code_vector_store, text, code_text
     except Exception as e:
         logger.error(f"PDF processing error: {str(e)}")
         st.error(f"PDF error: {str(e)}")
         return None, None, "", ""
+# Summarize PDF with ROUGE metrics
 def summarize_pdf(text):
     logger.info("Generating summary")
     try:
         if not summary_pipeline:
             return "Summary model unavailable."
+        text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=400, chunk_overlap=50)
         chunks = text_splitter.split_text(text)[:2]
         summaries = []
         for chunk in chunks:
+            summary = summary_pipeline(chunk[:400], max_length=100, min_length=30, do_sample=False)[0]['summary_text']
             summaries.append(summary.strip())
         combined_summary = " ".join(summaries)
         if len(combined_summary.split()) > 150:
             combined_summary = " ".join(combined_summary.split()[:150])
+        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
+        scores = scorer.score(text[:400], combined_summary)
+        logger.info(f"ROUGE scores: {scores}")
+        return f"**Summary**:\n{combined_summary}\n\n**ROUGE-1**: {scores['rouge1'].fmeasure:.2f}"
     except Exception as e:
         logger.error(f"Summary error: {str(e)}")
         return f"Oops, something went wrong summarizing: {str(e)}"
+# Answer question with hybrid search
 def answer_question(text_vector_store, code_vector_store, query):
     logger.info(f"Processing query: {query}")
     try:
         is_code_query = any(keyword in query.lower() for keyword in ["code", "script", "function", "programming", "give me code", "show code"])
         if is_code_query and code_vector_store:
+            docs = code_vector_store.similarity_search(query, k=3)
+            code = "\n".join(doc.page_content for doc in docs)
+            explanation = qa_pipeline(f"Explain this code: {code[:500]}")[0]['generated_text']
+            return f"**Code**:\n```python\n{code}\n```\n**Explanation**:\n{explanation}"
         vector_store = text_vector_store
         if not vector_store:
             return "No relevant content found for your query."
+        # Hybrid search: FAISS + BM25
+        text_chunks = [doc.page_content for doc in vector_store.similarity_search(query, k=10)]
+        bm25 = BM25Okapi([chunk.split() for chunk in text_chunks])
+        bm25_docs = bm25.get_top_n(query.split(), text_chunks, n=5)
+        faiss_docs = vector_store.similarity_search(query, k=5)
+        combined_docs = list(set(bm25_docs + [doc.page_content for doc in faiss_docs]))[:5]
+        context = "\n".join(combined_docs)
+        prompt = f"Use the following PDF content to answer the question accurately and concisely. Avoid speculation and focus on the provided context:\n\n{context}\n\nQuestion: {query}\nAnswer:"
         response = qa_pipeline(prompt)[0]['generated_text']
         logger.info("Answer generated")
+        return f"**Answer**:\n{response.strip()}\n\n**Source Context**:\n{context[:500]}..."
     except Exception as e:
         logger.error(f"Query error: {str(e)}")
         return f"Sorry, something went wrong: {str(e)}"
     st.markdown("""
         <style>
         .main { max-width: 900px; margin: 0 auto; padding: 20px; }
+        .sidebar { background-color: #f8f9fa; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
+        .chat-container { border: 1px solid #ddd; border-radius: 12px; padding: 15px; height: 60vh; overflow-y: auto; margin-top: 20px; background-color: #fafafa; }
+        .stChatMessage { border-radius: 12px; padding: 12px; margin: 8px; max-width: 75%; transition: all 0.3s ease; }
+        .user { background-color: #e6f3ff; align-self: flex-end; border: 1px solid #b3d4fc; }
+        .assistant { background-color: #f0f0f0; border: 1px solid #ccc; }
+        .dark .user { background-color: #2a2a72; color: #fff; border: 1px solid #4a4ab2; }
+        .dark .assistant { background-color: #2e2e2e; color: #fff; border: 1px solid #4a4a4a; }
+        .stButton>button { background-color: #4CAF50; color: white; border: none; padding: 10px 20px; border-radius: 8px; font-weight: bold; }
+        .stButton>button:hover { background-color: #45a049; transform: scale(1.05); }
+        pre { background-color: #f8f8f8; padding: 12px; border-radius: 8px; overflow-x: auto; }
+        .header { background: linear-gradient(90deg, #4CAF50, #81C784); color: white; padding: 15px; border-radius: 8px; text-align: center; box-shadow: 0 2px 4px rgba(0,0,0,0.2); }
+        .progress-bar { background-color: #e0e0e0; border-radius: 5px; height: 10px; }
+        .progress-fill { background-color: #4CAF50; height: 100%; border-radius: 5px; transition: width 0.5s ease; }
         </style>
     """, unsafe_allow_html=True)
     st.markdown('<div class="header"><h1>Smart PDF Q&A</h1></div>', unsafe_allow_html=True)
+    st.markdown("Upload a PDF to ask questions, summarize (~150 words), or extract code with 'give me code'. Fast, accurate, and smooth!")
     # Initialize session state
     if "messages" not in st.session_state:
     if "code_text" not in st.session_state:
         st.session_state.code_text = ""
+    # Sidebar with controls
     with st.sidebar:
         st.markdown('<div class="sidebar">', unsafe_allow_html=True)
         theme = st.radio("Theme", ["Light", "Dark"], index=0)
         dataset_name = st.selectbox("Select Dataset for Fine-Tuning", ["squad", "cnn_dailymail", "bigcode/the-stack"], index=0)
         if st.button("Fine-Tune Model"):
+            progress_bar = st.progress(0)
+            for i in range(100):
+                time.sleep(0.02)
+                progress_bar.progress(i + 1)
+            dataset = load_and_prepare_dataset(dataset_name=dataset_name)
+            if dataset:
+                fine_tuned_pipeline = fine_tune_qa_model(dataset)
+                if fine_tuned_pipeline:
+                    st.success("Model fine-tuned successfully!")
+                else:
+                    st.error("Fine-tuning failed.")
+        if st.button("Clear Chat"):
+            st.session_state.messages = []
+            st.experimental_rerun()
         st.markdown('</div>', unsafe_allow_html=True)
     # PDF upload and processing
     col1, col2 = st.columns([1, 1])
     with col1:
         if st.button("Process PDF"):
+            progress_bar = st.progress(0)
             with st.spinner("Processing PDF..."):
+                for i in range(100):
+                    time.sleep(0.05)
+                    progress_bar.progress(i + 1)
                 st.session_state.text_vector_store, st.session_state.code_vector_store, st.session_state.pdf_text, st.session_state.code_text = process_pdf(uploaded_file)
                 if st.session_state.text_vector_store or st.session_state.code_vector_store:
                     st.success("PDF processed! Ask away or summarize.")
                     st.error("Failed to process PDF.")
     with col2:
         if st.button("Summarize PDF") and st.session_state.pdf_text:
+            progress_bar = st.progress(0)
             with st.spinner("Summarizing..."):
+                for i in range(100):
+                    time.sleep(0.02)
+                    progress_bar.progress(i + 1)
                 summary = summarize_pdf(st.session_state.pdf_text)
                 st.session_state.messages.append({"role": "assistant", "content": summary})
                 st.markdown(summary, unsafe_allow_html=True)
             with st.chat_message("user"):
                 st.markdown(prompt)
             with st.chat_message("assistant"):
+                progress_bar = st.progress(0)
+                with st.spinner('<div class="spinner">⏳ Processing...</div>'):
+                    for i in range(100):
+                        time.sleep(0.01)
+                        progress_bar.progress(i + 1)
                     answer = answer_question(st.session_state.text_vector_store, st.session_state.code_vector_store, prompt)
                 st.markdown(answer, unsafe_allow_html=True)
             st.session_state.messages.append({"role": "assistant", "content": answer})