Update app.py

app.py CHANGED
@@ -1,234 +1,56 @@
Removed — previous app.py (the page extraction lost several spans; lines filled back in from context are flagged with "reconstructed" comments and should not be read as the verbatim original):

import gradio as gr
from PyPDF2 import PdfReader
import docx
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import re
import torch

# -------------------------
# Load models
# -------------------------
embedder = SentenceTransformer("all-MiniLM-L6-v2")
qa_pipeline = pipeline(                            # reconstructed: only the closing ")" survived;
    "question-answering",                          # task and model assumed to match the new version below
    model="distilbert-base-cased-distilled-squad",
)

SENT_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')       # reconstructed: a typical sentence splitter; exact pattern lost

def normalize_ws(text: str) -> str:
    text = re.sub(r'\s+', ' ', text)               # reconstructed: body lost except for the return below
    return text.strip()

def extract_text(file) -> str:
    if file.name.lower().endswith(".pdf"):
        text = "\n".join([page.extract_text() or "" for page in PdfReader(file).pages])
    elif file.name.lower().endswith(".docx"):
        text = "\n".join([p.text for p in docx.Document(file).paragraphs])
    else:
        return ""
    return normalize_ws(text)

def split_sentences(text: str):
    parts = SENT_SPLIT_RE.split(text)
    # Merge very short fragments with neighbors
    out = []
    buf = ""
    for p in parts:
        if len(p.strip()) < 40:
            buf += (" " if buf else "") + p.strip()
        else:
            if buf:
                out.append(buf.strip())
                buf = ""
            out.append(p.strip())
    if buf:
        out.append(buf.strip())
    return [s for s in out if s]

def chunk_by_chars(sentences, chunk_char_limit=900, overlap_sents=1):
    chunks, cur, cur_len = [], [], 0
    for s in sentences:
        if cur_len + len(s) + 1 <= chunk_char_limit:
            cur.append(s); cur_len += len(s) + 1
        else:
            if cur:                                                         # reconstructed: flush the
                chunks.append(" ".join(cur))                                # reconstructed: current chunk
            if len(s) <= chunk_char_limit:                                  # reconstructed
                cur = cur[-overlap_sents:] + [s] if overlap_sents else [s]  # reconstructed: keep overlap
                cur_len = sum(len(x) + 1 for x in cur)
            else:
                # extremely long sentence, hard cut
                chunks.append(s[:chunk_char_limit])
                cur, cur_len = [], 0
    if cur:
        chunks.append(" ".join(cur))
    return chunks

def clean_answer(text: str) -> str:               # reconstructed signature: the "def" line was truncated
    # Remove obvious footer/contact lines or emails/urls/phones
    text = re.sub(r'\bIf you have any questions.*', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\b(?:Tel|Phone|Cell|Contact)\b.*', '', text, flags=re.IGNORECASE)
    return normalize_ws(text)

# -------------------------
# Definition finder (for "what is / define" questions)
# -------------------------
DEF_PATTERNS = [
    r'\b(system)\s+is\s+(?:an?|the)\s+[^.]+?\.',           # "system is a/the ..."
    r'\b(system)\s+refers\s+to\s+[^.]+?\.',                # "system refers to ..."
    r'\b(system)\s+can\s+be\s+defined\s+as\s+[^.]+?\.',    # "system can be defined as ..."
    r'\b(system)\s+consists\s+of\s+[^.]+?\.',              # "system consists of ..."
]

KEYWORDS_BONUS = {"interrelated", "components", "objective", "objectives",
                  "environment", "inputs", "outputs", "communication", "function"}

def find_definition_sentences(text: str, term: str = "system"):
    sentences = split_sentences(text)
    cand = []
    term_lc = term.lower()
    for s in sentences:
        s_lc = s.lower()
        if term_lc not in s_lc:
            continue
        matched = any(re.search(pat.replace("system", term_lc), s_lc) for pat in DEF_PATTERNS)
        if matched:
            score = sum(1 for k in KEYWORDS_BONUS if k in s_lc)
            cand.append((score, s.strip()))
    if not cand:
        # fallback: sentences with term + several keywords
        for s in sentences:
            s_lc = s.lower()
            if term_lc in s_lc:
                score = sum(1 for k in KEYWORDS_BONUS if k in s_lc)
                if score >= 2:
                    cand.append((score, s.strip()))
    if not cand:
        return None
    cand.sort(key=lambda x: (-x[0], len(x[1])))
    return cand[0][1]

# -------------------------
# Retrieval helpers
# -------------------------
def select_top_chunks(chunks, question, top_k=3):
    emb_chunks = embedder.encode(chunks, convert_to_tensor=True, normalize_embeddings=True)
    emb_q = embedder.encode([question], convert_to_tensor=True, normalize_embeddings=True)
    sims = util.cos_sim(emb_q, emb_chunks)[0]  # shape [num_chunks]
    top_k = min(top_k, len(chunks))
    top_idx = torch.topk(sims, k=top_k).indices.tolist()
    return [chunks[i] for i in top_idx], sims.max().item()

# -------------------------
# Main QA logic
# -------------------------
def answer_from_chunks(question: str, chunks: list, strict_extractive=True):
    """
    Try QA over the best chunk(s). If strict_extractive, return the extractive span only.
    We'll query the best chunk first; if low score, concatenate top-3 chunks and retry.
    """
    if not chunks:
        return None

    # Best single chunk first
    result = qa_pipeline(question=question, context=chunks[0])
    best_answer, best_score = result.get("answer", ""), result.get("score", 0.0)

    # If weak, try merged top chunks
    if best_score < 0.25 and len(chunks) > 1:
        merged = " ".join(chunks)
        result2 = qa_pipeline(question=question, context=merged)
        if result2.get("score", 0.0) > best_score:
            best_answer, best_score = result2["answer"], result2["score"]

    if best_score < 0.15 or len(best_answer.strip()) < 2:
        return None

    ans = best_answer.strip()
    # keep it extractive and clean
    ans = clean_answer(ans)
    if strict_extractive:
        # ensure it's a concise span (avoid run-on junk)
        ans = re.split(r'[\n\r]', ans)[0].strip()
    return ans or None

# -------------------------
# Gradio callback
# -------------------------
def ask_question(file, question, history, strict_extractive=True):
    if not file:
        return "Please upload a file.", history
    if not question or not question.strip():
        return "Please type a question.", history

    text = extract_text(file)
    q_norm = question.lower().strip()        # reconstructed: q_norm, sentences and chunks are all
    sentences = split_sentences(text)        # reconstructed: referenced below, but their defining
    chunks = chunk_by_chars(sentences)       # reconstructed: lines were lost in extraction

    try:
        # 1) Prefer a definition for "what is/define ..." style questions
        if re.search(r'\b(what\s+is|define|definition of)\b', q_norm) and "system" in q_norm:
            defin = find_definition_sentences(text, term="system")
            if defin:
                answer = clean_answer(defin)
                history.append((question, answer))
                return "", history

        # 2) Retrieval + extractive QA
        top_chunks, max_sim = select_top_chunks(chunks, question, top_k=3)
        answer = answer_from_chunks(question, top_chunks, strict_extractive=strict_extractive)

        # 3) If still nothing, try a simpler sentence retrieval: pick the most relevant sentence
        if not answer:
            emb_sents = embedder.encode(sentences, convert_to_tensor=True, normalize_embeddings=True)
            emb_q = embedder.encode([question], convert_to_tensor=True, normalize_embeddings=True)
            sims = util.cos_sim(emb_q, emb_sents)[0]
            best_i = int(torch.argmax(sims).item())
            if sims[best_i].item() > 0.2:
                answer = clean_answer(sentences[best_i])

        if not answer:
            answer = "Sorry, I couldn't find a clear, grounded answer in the document."

    except Exception as e:
        answer = f"An error occurred: {str(e)}"

    history.append((question, answer))
    return "", history

# -------------------------
# UI
# -------------------------
with gr.Blocks() as demo:
    gr.Markdown("## Document QA")            # reconstructed: the heading string was truncated in extraction
    file_input = gr.File(label="Upload PDF or Word", file_types=[".pdf", ".docx"])  # reconstructed: referenced below
    chatbot = gr.Chatbot(height=420)
    with gr.Row():
        question = gr.Textbox(label="Ask your question", placeholder="e.g., What is a system?")
        strict = gr.Checkbox(value=True, label="Strict extractive only (recommended)")
    state = gr.State([])

    question.submit(                         # reconstructed: only the arguments and closing ")" survived
        ask_question,
        [file_input, question, state, strict],
        [question, chatbot]
    )

demo.launch()
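For context on the removed definition finder: the DEF_PATTERNS regexes key on copular phrasings ("X is a/the ...", "X refers to ..."), with the literal "system" swapped for the query term at runtime. A standalone check (an illustrative sketch only, not part of either version of the file) shows the kind of sentence they accept:

import re

# Two of the removed patterns, verbatim (illustration only).
DEF_PATTERNS = [
    r'\b(system)\s+is\s+(?:an?|the)\s+[^.]+?\.',
    r'\b(system)\s+refers\s+to\s+[^.]+?\.',
]

sentence = "A system is a set of interrelated components working toward a common objective."
print(any(re.search(p, sentence.lower()) for p in DEF_PATTERNS))  # True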
Added — new app.py:

import gradio as gr
from PyPDF2 import PdfReader
import docx
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

# Load models
embedder = SentenceTransformer("all-MiniLM-L6-v2")
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

def extract_text(file):
    if file.name.endswith(".pdf"):
        return "\n".join([page.extract_text() or "" for page in PdfReader(file).pages])
    elif file.name.endswith(".docx"):
        return "\n".join([p.text for p in docx.Document(file).paragraphs])
    return ""

def chunk_text(text, chunk_size=500):
    sentences = text.split(". ")
    chunks, buffer = [], ""
    for sent in sentences:
        if len(buffer) + len(sent) < chunk_size:
            buffer += sent + ". "
        else:
            chunks.append(buffer.strip())
            buffer = sent + ". "
    if buffer:
        chunks.append(buffer.strip())
    return chunks

def ask_question(file, question, history):
    if not file:
        return "Please upload a file.", history

    text = extract_text(file)
    chunks = chunk_text(text)
    emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
    emb_question = embedder.encode(question, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(emb_question, emb_chunks)[0]
    best_chunk = chunks[scores.argmax().item()]

    result = qa_pipeline(question=question, context=best_chunk)
    answer = result["answer"] if result["score"] > 0.1 else "Sorry, not found."

    history.append((question, answer))
    return "", history

with gr.Blocks() as demo:
    gr.Markdown("## Document QA with Smart Retrieval")
    file_input = gr.File(label="Upload PDF or Word", file_types=[".pdf", ".docx"])
    chatbot = gr.Chatbot()
    question = gr.Textbox(label="Ask your question")
    state = gr.State([])
    question.submit(ask_question, [file_input, question, state], [question, chatbot])

demo.launch()
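A minimal smoke test for the simplified chunker, assuming the listing above has been pasted into (or imported by) the same Python session; the synthetic text and sizes are illustrative only, not part of the commit:

# Sanity-check chunk_text on synthetic text (illustrative sketch).
text = "Systems accept inputs. Systems produce outputs. Systems use feedback. " * 30
chunks = chunk_text(text, chunk_size=500)
print(len(chunks))                           # a handful of chunks
print(all(len(c) <= 520 for c in chunks))    # True for this input; note a single very
                                             # long sentence could still overflow chunk_size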