Update app.py
app.py
CHANGED
@@ -6,46 +6,54 @@ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import re
 import torch
 
+# -------------------------
 # Load models
+# -------------------------
 embedder = SentenceTransformer("all-MiniLM-L6-v2")
-qa_pipeline = pipeline("question-answering",
-                       model="distilbert-base-cased-distilled-squad",
-                       device=0 if torch.cuda.is_available() else -1)
 
-
+qa_pipeline = pipeline(
+    "question-answering",
+    model="distilbert-base-cased-distilled-squad",
+    device=0 if torch.cuda.is_available() else -1
+)
+
+# GPT model (using GPT-2 here – replace with better model if you have)
 gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
 gpt_model = AutoModelForCausalLM.from_pretrained("gpt2")
 gpt_model.eval()
 
+# -------------------------
+# Helper functions
+# -------------------------
 def extract_text(file):
+    """Extract text from PDF or DOCX"""
     if file.name.endswith(".pdf"):
         text = "\n".join([page.extract_text() or "" for page in PdfReader(file).pages])
     elif file.name.endswith(".docx"):
         text = "\n".join([p.text for p in docx.Document(file).paragraphs])
     else:
         return ""
-
-    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
+    text = re.sub(r'\s+', ' ', text)  # clean whitespace
     return text.strip()
 
 def chunk_text(text, chunk_size=500, overlap=100):
+    """Split text into overlapping chunks"""
     sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
-    chunks = []
-
-
+    chunks, current_chunk = [], ""
+
     for sent in sentences:
         if len(current_chunk) + len(sent) < chunk_size:
             current_chunk += sent + " "
         else:
             chunks.append(current_chunk.strip())
-            # Keep some overlap between chunks for context
             current_chunk = current_chunk[-overlap:] + sent + " "
-
+
     if current_chunk:
         chunks.append(current_chunk.strip())
     return chunks
 
 def generate_with_gpt(prompt, max_length=150):
+    """Generate text with GPT model"""
     inputs = gpt_tokenizer(prompt, return_tensors="pt")
     with torch.no_grad():
         outputs = gpt_model.generate(
@@ -60,65 +68,20 @@ def generate_with_gpt(prompt, max_length=150):
         )
     return gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-def
-
-
-
-
-
-
-
-
-    if not chunks:
-        return "No meaningful text chunks could be created.", history
-
-    # Initialize answer as None
-    answer = None
-
-    try:
-        # Normalize question for better matching
-        normalized_question = question.lower().strip(" ?")
-
-        # First try to find direct definitions
-        if "artificial system" in normalized_question:
-            answer = extract_direct_definition(text, "artificial system")
-        elif "natural system" in normalized_question:
-            answer = extract_direct_definition(text, "natural system")
-        elif "component" in normalized_question:
-            answer = extract_direct_definition(text, "component")
-
-        # If no direct definition found, use semantic search
-        if not answer:
-            emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
-            emb_question = embedder.encode(question, convert_to_tensor=True)
-            scores = util.pytorch_cos_sim(emb_question, emb_chunks)[0]
-            best_idx = scores.argmax().item()
-            best_chunk = chunks[best_idx]
-
-            if scores[best_idx] < 0.3:  # Low confidence
-                top_k = min(3, len(chunks))
-                best_indices = scores.topk(top_k).indices.tolist()
-                best_chunk = " ".join([chunks[i] for i in best_indices])
-
-            result = qa_pipeline(question=question, context=best_chunk)
-            if result["score"] > 0.1 and len(result["answer"].split()) >= 2:
-                answer = result["answer"]
-
-        # Final fallback if no answer found
-        if not answer:
-            answer = "Sorry, I couldn't find a clear answer in the document."
-
-    except Exception as e:
-        answer = f"An error occurred: {str(e)}"
-
-    history.append((question, answer))
-    return "", history
+def refine_answer_with_gpt(context, question, answer):
+    """Ask GPT to refine the QA model answer"""
+    prompt = (
+        f"Context: {context}\n\n"
+        f"Question: {question}\n\n"
+        f"Answer: {answer}\n\n"
+        f"Please provide a clearer and more complete answer in simple language."
+    )
+    return generate_with_gpt(prompt, max_length=120)
 
 def extract_direct_definition(text, term):
-    """
+    """Find a direct definition of a term in the text"""
     sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
     term = term.lower()
-
     candidates = []
     for sent in sentences:
         lower_sent = sent.lower()
@@ -126,11 +89,13 @@ def extract_direct_definition(text, term):
         if (" is " in lower_sent or " are " in lower_sent or
                 " refers to " in lower_sent or " defined as " in lower_sent):
             candidates.append(sent)
-
     if candidates:
         return candidates[0]
     return None
 
+# -------------------------
+# Main QA function
+# -------------------------
 def ask_question(file, question, history):
     if not file:
         return "Please upload a file.", history
@@ -143,11 +108,12 @@ def ask_question(file, question, history):
     if not chunks:
         return "No meaningful text chunks could be created.", history
 
-    #
+    # Initialize answer
+    answer = None
     normalized_question = question.lower().strip(" ?")
 
     try:
-        #
+        # Try direct definition
         if "artificial system" in normalized_question:
             answer = extract_direct_definition(text, "artificial system")
         elif "natural system" in normalized_question:
@@ -155,7 +121,7 @@ def ask_question(file, question, history):
         elif "component" in normalized_question:
            answer = extract_direct_definition(text, "component")
 
-        # If no direct definition
+        # If no direct definition, do semantic search + QA
         if not answer:
            emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
            emb_question = embedder.encode(question, convert_to_tensor=True)
@@ -163,32 +129,32 @@ def ask_question(file, question, history):
             best_idx = scores.argmax().item()
             best_chunk = chunks[best_idx]
 
-            #
+            # Low confidence → merge top chunks
             if scores[best_idx] < 0.3:
                 top_k = min(3, len(chunks))
                 best_indices = scores.topk(top_k).indices.tolist()
                 best_chunk = " ".join([chunks[i] for i in best_indices])
 
-            # Get initial answer from QA model
             result = qa_pipeline(question=question, context=best_chunk)
             answer = result["answer"] if result["score"] > 0.1 else None
 
-            # Refine answer with GPT if available
             if answer and len(answer.split()) > 2:
                 answer = refine_answer_with_gpt(best_chunk, question, answer)
 
-        # Final fallback
         if not answer:
             answer = "Sorry, I couldn't find a clear answer in the document."
-
+
     except Exception as e:
         answer = f"An error occurred: {str(e)}"
 
     history.append((question, answer))
     return "", history
 
+# -------------------------
+# Gradio Interface
+# -------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("## Enhanced Document QA with GPT Integration")
+    gr.Markdown("## 📘 Enhanced Document QA with GPT Integration")
     with gr.Row():
         file_input = gr.File(label="Upload PDF or Word", file_types=[".pdf", ".docx"])
     with gr.Row():
@@ -196,7 +162,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         question = gr.Textbox(label="Ask your question", placeholder="Type your question here...")
     state = gr.State([])
-
+
     question.submit(
         ask_question,
        [file_input, question, state],
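For reviewers who want to exercise this change without launching the Space, below is a minimal sketch of the semantic-search, extractive-QA, and GPT-2 refinement path that this commit wires into ask_question. It is illustrative only: the toy chunks list, the question, and the inline refinement stand in for the real extract_text / chunk_text / refine_answer_with_gpt calls, while the 0.1 QA-score cut-off and the prompt wording are taken from the diff above.

# Illustrative sketch only (not part of app.py): mirrors the semantic-search ->
# extractive-QA -> GPT-2 refinement flow added in this commit.
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

embedder = SentenceTransformer("all-MiniLM-L6-v2")
qa = pipeline("question-answering",
              model="distilbert-base-cased-distilled-squad",
              device=0 if torch.cuda.is_available() else -1)
gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt_model = AutoModelForCausalLM.from_pretrained("gpt2")
gpt_model.eval()

# Toy stand-in for the extracted and chunked upload.
chunks = [
    "An artificial system is a system designed and built by humans.",
    "A natural system arises without human intervention.",
]
question = "What is an artificial system?"

# Semantic search: pick the chunk most similar to the question.
emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
emb_question = embedder.encode(question, convert_to_tensor=True)
scores = util.pytorch_cos_sim(emb_question, emb_chunks)[0]
best_chunk = chunks[scores.argmax().item()]

# Extractive QA on the selected chunk (same 0.1 score cut-off as ask_question).
result = qa(question=question, context=best_chunk)
answer = result["answer"] if result["score"] > 0.1 else None

# GPT-2 refinement, using the same prompt shape as refine_answer_with_gpt.
if answer:
    prompt = (f"Context: {best_chunk}\n\n"
              f"Question: {question}\n\n"
              f"Answer: {answer}\n\n"
              "Please provide a clearer and more complete answer in simple language.")
    inputs = gpt_tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = gpt_model.generate(**inputs, max_new_tokens=60,
                                     pad_token_id=gpt_tokenizer.eos_token_id)
    answer = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)

print(answer)

One behavioural note that applies both to this sketch and to generate_with_gpt in the diff: decoding outputs[0] returns the prompt plus the continuation, since a causal LM's generate output includes the input tokens, so callers that want only the refined answer would need to slice the prompt off.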