Spaces: Running
Update app.py
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
 from PyPDF2 import PdfReader
 import docx
 from sentence_transformers import SentenceTransformer, util
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import re
 import torch

@@ -12,160 +12,222 @@ import torch
 embedder = SentenceTransformer("all-MiniLM-L6-v2")

 qa_pipeline = pipeline(
-    "question-answering",
     model="distilbert-base-cased-distilled-squad",
     device=0 if torch.cuda.is_available() else -1
 )

-# GPT model (using GPT-2 here – replace with a better model if you have one)
-gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
-gpt_model = AutoModelForCausalLM.from_pretrained("gpt2")
-gpt_model.eval()
-
 # -------------------------
-# ...
 # -------------------------
-def extract_text(file):
-    if file.name.endswith(".pdf"):
         text = "\n".join([page.extract_text() or "" for page in PdfReader(file).pages])
-    elif file.name.endswith(".docx"):
         text = "\n".join([p.text for p in docx.Document(file).paragraphs])
     else:
         return ""
-    return text.strip()

-def ...  # chunking helper; body largely unrecoverable
-    for ...:
-        if len(...):
-            ...
         else:
-            ...
     return chunks

-def generate_with_gpt(prompt, max_new_tokens=...):
-    ...
-    outputs = gpt_model.generate(
-        ...
-        num_return_sequences=1,
-        no_repeat_ngram_size=2,
-        do_sample=True,
-        top_k=50,
-        top_p=0.95,
-        temperature=0.7
-    )
-    return gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-def refine_answer_with_gpt(context, question, answer):
-    """Ask GPT to refine the QA model answer"""
-    prompt = (
-        f"Context: {context}\n\n"
-        f"Question: {question}\n\n"
-        f"Answer: {answer}\n\n"
-        f"Please provide a clearer and more complete answer in simple language."
-    )
-    return generate_with_gpt(prompt, max_new_tokens=120)
-
-def extract_direct_definition(text, term):
-    """Find a direct definition of a term in the text"""
-    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
-    term = term.lower()
-    candidates = []
-    for sent in sentences:
-        lower_sent = sent.lower()
-        if term in lower_sent:
-            if (" is " in lower_sent or " are " in lower_sent or
-                " refers to " in lower_sent or " defined as " in lower_sent):
-                candidates.append(sent)
-    if candidates:
-        return candidates[0]
-    return None

 # -------------------------
-# ...
 # -------------------------
-def ask_question(file, question, history):
     if not file:
         return "Please upload a file.", history

     text = extract_text(file)
     if not text:
         return "Could not extract text from the file.", history
-
-    answer = None
-    normalized_question = question.lower().strip(" ?")
-
     try:
-        # ...
-        if "...
-            ...
-        # ...
         if not answer:
-            ...
-            if scores[best_idx] < 0.3:
-                top_k = min(3, len(chunks))
-                best_indices = scores.topk(top_k).indices.tolist()
-                best_chunk = " ".join([chunks[i] for i in best_indices])
-
-            result = qa_pipeline(question=question, context=best_chunk)
-            answer = result["answer"] if result["score"] > 0.1 else None
-
-        if answer and len(answer.split()) > 2:
-            answer = refine_answer_with_gpt(best_chunk, question, answer)
-
         if not answer:
-            answer = "Sorry, I couldn't find a clear answer in the document."
-
     except Exception as e:
         answer = f"An error occurred: {str(e)}"
-
     history.append((question, answer))
     return "", history

 # -------------------------
-# ...
 # -------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("## 📘 ...")
     with gr.Row():
         file_input = gr.File(label="Upload PDF or Word", file_types=[".pdf", ".docx"])
     with gr.Row():
-        chatbot = gr.Chatbot(height=...)
     with gr.Row():
-        question = gr.Textbox(label="Ask your question", placeholder="...")
     state = gr.State([])

     question.submit(
-        ask_question,
-        [file_input, question, state],
         [question, chatbot]
     )
 from PyPDF2 import PdfReader
 import docx
 from sentence_transformers import SentenceTransformer, util
+from transformers import pipeline
 import re
 import torch

 embedder = SentenceTransformer("all-MiniLM-L6-v2")

 qa_pipeline = pipeline(
+    "question-answering",
     model="distilbert-base-cased-distilled-squad",
     device=0 if torch.cuda.is_available() else -1
 )

 # -------------------------
+# Text utilities
 # -------------------------
+SENT_SPLIT_RE = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')
+
+def normalize_ws(text: str) -> str:
+    text = re.sub(r'[ \t]+', ' ', text)
+    text = re.sub(r'\s*\n\s*', '\n', text)
+    return text.strip()
+
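A quick check of the whitespace and sentence-split helpers (illustrative inputs, not from the app):

    normalize_ws("a  b\t c \n  d\n\n e")   # -> "a b c\nd\ne"
    SENT_SPLIT_RE.split("Dr. Smith arrived. It works! Done?")
    # -> ["Dr. Smith arrived.", "It works!", "Done?"]  (the "Dr." abbreviation does not split)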
+def extract_text(file) -> str:
+    if file.name.lower().endswith(".pdf"):
         text = "\n".join([page.extract_text() or "" for page in PdfReader(file).pages])
+    elif file.name.lower().endswith(".docx"):
         text = "\n".join([p.text for p in docx.Document(file).paragraphs])
     else:
         return ""
+    return normalize_ws(text)

+def split_sentences(text: str):
+    parts = SENT_SPLIT_RE.split(text)
+    # Merge very short fragments with neighbors
+    out = []
+    buf = ""
+    for p in parts:
+        if len(p.strip()) < 40:
+            buf += (" " if buf else "") + p.strip()
+        else:
+            if buf:
+                out.append(buf.strip())
+                buf = ""
+            out.append(p.strip())
+    if buf:
+        out.append(buf.strip())
+    return [s for s in out if s]

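How the splitter behaves on a toy passage: fragments under 40 characters buffer until the next long sentence flushes them, so consecutive shorts merge together:

    split_sentences("OK. A system is a set of interrelated components. Inputs flow in. Outputs flow out.")
    # -> ["OK.", "A system is a set of interrelated components.",
    #     "Inputs flow in. Outputs flow out."]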
+def chunk_by_chars(sentences, chunk_char_limit=900, overlap_sents=1):
+    chunks, cur, cur_len = [], [], 0
+    for s in sentences:
+        if cur_len + len(s) + 1 <= chunk_char_limit:
+            cur.append(s); cur_len += len(s) + 1
+        else:
+            if cur:
+                chunks.append(" ".join(cur))
+                # overlap for context
+                cur = cur[-overlap_sents:] + [s]
+                cur_len = sum(len(x) + 1 for x in cur)
+            else:
+                # extremely long sentence, hard cut
+                chunks.append(s[:chunk_char_limit])
+                cur, cur_len = [], 0
+    if cur:
+        chunks.append(" ".join(cur))
     return chunks

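A small chunking example (limits shrunk from the app's 900 characters so the overlap is visible):

    chunk_by_chars(["one two three.", "four five six.", "seven eight nine."],
                   chunk_char_limit=32, overlap_sents=1)
    # -> ["one two three. four five six.",
    #     "four five six. seven eight nine."]   (last sentence repeats for context)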
+def clean_answer(text: str) -> str:
+    # Remove obvious footer/contact lines or emails/urls/phones
+    text = re.sub(r'\bIf you have any questions.*', '', text, flags=re.IGNORECASE)
+    text = re.sub(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}', '', text)
+    text = re.sub(r'https?://\S+|www\.\S+', '', text)
+    text = re.sub(r'\b(?:Tel|Phone|Cell|Contact)\b.*', '', text, flags=re.IGNORECASE)
+    return normalize_ws(text)

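The scrubber in action (made-up string): the email is stripped by the address pattern, then the Contact rule removes the rest of that line:

    clean_answer("A system has inputs and outputs. Contact admin@example.com for help.")
    # -> "A system has inputs and outputs."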
 # -------------------------
+# Definition finder (for "what is / define" questions)
 # -------------------------
+DEF_PATTERNS = [
+    r'\b(system)\s+is\s+(?:an?|the)\s+[^.]+?\.',         # "system is a/the ..."
+    r'\b(system)\s+refers\s+to\s+[^.]+?\.',              # "system refers to ..."
+    r'\b(system)\s+can\s+be\s+defined\s+as\s+[^.]+?\.',  # "system can be defined as ..."
+    r'\b(system)\s+consists\s+of\s+[^.]+?\.',            # "system consists of ..."
+]
+
+KEYWORDS_BONUS = {"interrelated", "components", "objective", "objectives",
+                  "environment", "inputs", "outputs", "communication", "function"}
+
+def find_definition_sentences(text: str, term: str = "system"):
+    sentences = split_sentences(text)
+    cand = []
+    term_lc = term.lower()
+    for s in sentences:
+        s_lc = s.lower()
+        if term_lc not in s_lc:
+            continue
+        matched = any(re.search(pat.replace("system", term_lc), s_lc) for pat in DEF_PATTERNS)
+        if matched:
+            score = sum(1 for k in KEYWORDS_BONUS if k in s_lc)
+            cand.append((score, s.strip()))
+    if not cand:
+        # fallback: sentences with term + several keywords
+        for s in sentences:
+            s_lc = s.lower()
+            if term_lc in s_lc:
+                score = sum(1 for k in KEYWORDS_BONUS if k in s_lc)
+                if score >= 2:
+                    cand.append((score, s.strip()))
+    if not cand:
+        return None
+    cand.sort(key=lambda x: (-x[0], len(x[1])))
+    return cand[0][1]
+
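On a toy passage the first pattern ("system is a/the ...") fires, and the keyword bonus ranks candidates (more KEYWORDS_BONUS hits win; shorter sentences break ties):

    doc = ("A system is a set of interrelated components working toward a common "
           "objective. Systems have inputs and outputs. The weather was nice.")
    find_definition_sentences(doc, term="system")
    # -> "A system is a set of interrelated components working toward a common objective."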
+# -------------------------
+# Retrieval helpers
+# -------------------------
+def select_top_chunks(chunks, question, top_k=3):
+    emb_chunks = embedder.encode(chunks, convert_to_tensor=True, normalize_embeddings=True)
+    emb_q = embedder.encode([question], convert_to_tensor=True, normalize_embeddings=True)
+    sims = util.cos_sim(emb_q, emb_chunks)[0]  # shape [num_chunks]
+    top_k = min(top_k, len(chunks))
+    top_idx = torch.topk(sims, k=top_k).indices.tolist()
+    return [chunks[i] for i in top_idx], sims.max().item()
+
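Because both encodings pass normalize_embeddings=True, cos_sim reduces to a dot product, and the returned maximum is a rough confidence in [-1, 1]. A sketch, assuming chunks came from chunk_by_chars:

    top, best_sim = select_top_chunks(chunks, "What is a system?", top_k=3)
    # top[0] is the most similar chunk; best_sim near 0 signals a weak match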
+# -------------------------
+# Main QA logic
+# -------------------------
+def answer_from_chunks(question: str, chunks: list, strict_extractive=True):
+    """
+    Try QA over the best chunk(s). If strict_extractive, return the extractive span only.
+    We'll query the best chunk first; if low score, concatenate top-3 chunks and retry.
+    """
+    if not chunks:
+        return None
+
+    # Best single chunk first
+    result = qa_pipeline(question=question, context=chunks[0])
+    best_answer, best_score = result.get("answer", ""), result.get("score", 0.0)
+
+    # If weak, try merged top chunks
+    if best_score < 0.25 and len(chunks) > 1:
+        merged = " ".join(chunks)
+        result2 = qa_pipeline(question=question, context=merged)
+        if result2.get("score", 0.0) > best_score:
+            best_answer, best_score = result2["answer"], result2["score"]
+
+    if best_score < 0.15 or len(best_answer.strip()) < 2:
+        return None
+
+    ans = best_answer.strip()
+    # keep it extractive and clean
+    ans = clean_answer(ans)
+    if strict_extractive:
+        # ensure it's a concise span (avoid run-on junk)
+        ans = re.split(r'[\n\r]', ans)[0].strip()
+    return ans or None
+
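Two thresholds drive the flow: below 0.25 the merged-context retry kicks in, and below 0.15 the function returns None rather than a weak span. A sketch (output illustrative):

    ans = answer_from_chunks("What is a system?", top_chunks)
    # -> an extractive span such as "a set of interrelated components", or None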
+# -------------------------
+# Gradio callback
+# -------------------------
+def ask_question(file, question, history, strict_extractive=True):
     if not file:
         return "Please upload a file.", history
+    if not question or not question.strip():
+        return "Please type a question.", history

     text = extract_text(file)
     if not text:
         return "Could not extract text from the file.", history
+
+    sentences = split_sentences(text)
+    chunks = chunk_by_chars(sentences, chunk_char_limit=900, overlap_sents=1)
+
+    q_norm = question.lower().strip(" ?!")
+
     try:
+        # 1) Prefer a definition for "what is/define ..." style questions
+        if re.search(r'\b(what\s+is|define|definition of)\b', q_norm) and "system" in q_norm:
+            defin = find_definition_sentences(text, term="system")
+            if defin:
+                answer = clean_answer(defin)
+                history.append((question, answer))
+                return "", history
+
+        # 2) Retrieval + extractive QA
+        top_chunks, max_sim = select_top_chunks(chunks, question, top_k=3)
+        answer = answer_from_chunks(question, top_chunks, strict_extractive=strict_extractive)
+
+        # 3) If still nothing, try a simpler sentence retrieval: pick the most relevant sentence
         if not answer:
+            emb_sents = embedder.encode(sentences, convert_to_tensor=True, normalize_embeddings=True)
+            emb_q = embedder.encode([question], convert_to_tensor=True, normalize_embeddings=True)
+            sims = util.cos_sim(emb_q, emb_sents)[0]
+            best_i = int(torch.argmax(sims).item())
+            if sims[best_i].item() > 0.2:
+                answer = clean_answer(sentences[best_i])
+
         if not answer:
+            answer = "Sorry, I couldn't find a clear, grounded answer in the document."
+
     except Exception as e:
         answer = f"An error occurred: {str(e)}"
+
     history.append((question, answer))
     return "", history

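The definition route fires only when the trigger regex matches and the question mentions "system"; everything else falls through to retrieval plus extractive QA:

    re.search(r'\b(what\s+is|define|definition of)\b', "what is a system")  # match
    re.search(r'\b(what\s+is|define|definition of)\b', "explain systems")   # None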
 # -------------------------
+# UI
 # -------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("## 📘 Document QA — Strict Extractive (No Hallucinations)")
     with gr.Row():
         file_input = gr.File(label="Upload PDF or Word", file_types=[".pdf", ".docx"])
     with gr.Row():
+        chatbot = gr.Chatbot(height=420)
     with gr.Row():
+        question = gr.Textbox(label="Ask your question", placeholder="e.g., What is a system?")
+        strict = gr.Checkbox(value=True, label="Strict extractive only (recommended)")
     state = gr.State([])

     question.submit(
+        ask_question,
+        [file_input, question, state, strict],
         [question, chatbot]
     )
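A minimal smoke test of the pipeline pieces outside the UI (assumes a local sample.pdf; the filename is hypothetical):

    doc = extract_text(open("sample.pdf", "rb"))
    sents = split_sentences(doc)
    top, sim = select_top_chunks(chunk_by_chars(sents), "What is a system?")
    print(answer_from_chunks("What is a system?", top))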