Spaces:

NaimaAqeel
/

Chatbot

Running

App Files Files Community

NaimaAqeel committed 18 days ago

Commit

f06cc93

verified ·

1 Parent(s): 32d52a8

Update app.py

Browse files

Files changed (1) hide show

app.py +141 -81

app.py CHANGED Viewed

@@ -1,128 +1,188 @@
 import gradio as gr
-import torch
 from PyPDF2 import PdfReader
 import docx
 from sentence_transformers import SentenceTransformer, util
 from transformers import pipeline
-import re
 # Load models
 embedder = SentenceTransformer("all-MiniLM-L6-v2")
-qa_pipeline = pipeline("question-answering",
-                      model="distilbert-base-cased-distilled-squad",
-                      device=0 if torch.cuda.is_available() else -1)
 def extract_text(file):
     if file.name.endswith(".pdf"):
-        text = "\n".join([page.extract_text() or "" for page in PdfReader(file).pages])
     elif file.name.endswith(".docx"):
         text = "\n".join([p.text for p in docx.Document(file).paragraphs])
     else:
         return ""
-    # Clean up text
-    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
     return text.strip()
-def chunk_text(text, chunk_size=500, overlap=100):
-    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
     chunks = []
     current_chunk = ""
-    for sent in sentences:
-        if len(current_chunk) + len(sent) < chunk_size:
-            current_chunk += sent + " "
         else:
             chunks.append(current_chunk.strip())
-            # Keep some overlap between chunks for context
-            current_chunk = current_chunk[-overlap:] + sent + " "
     if current_chunk:
         chunks.append(current_chunk.strip())
     return chunks
 def ask_question(file, question, history):
     if not file:
         return "Please upload a file.", history
     text = extract_text(file)
     if not text:
-        return "Could not extract text from the file.", history
-    chunks = chunk_text(text)
     if not chunks:
-        return "No meaningful text chunks could be created.", history
-    # Normalize question for better matching
-    normalized_question = question.lower().strip(" ?")
-    try:
-        emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
-        emb_question = embedder.encode(question, convert_to_tensor=True)
-        scores = util.pytorch_cos_sim(emb_question, emb_chunks)[0]
-        best_idx = scores.argmax().item()
-        best_chunk = chunks[best_idx]
-        # If the best score is too low, try with more chunks
-        if scores[best_idx] < 0.3:  # Lower similarity threshold
-            # Combine top 3 chunks for more context
-            top_k = min(3, len(chunks))
-            best_indices = scores.topk(top_k).indices.tolist()
-            best_chunk = " ".join([chunks[i] for i in best_indices])
-        result = qa_pipeline(question=question, context=best_chunk)
-        # More sophisticated answer validation
-        answer = result["answer"]
-        if result["score"] < 0.1 or len(answer.split()) < 2:  # Require longer answers
-            # Try alternative approach - look for direct matches
-            if "artificial system" in normalized_question:
-                answer = extract_direct_definition(text, "artificial system")
-            elif "natural system" in normalized_question:
-                answer = extract_direct_definition(text, "natural system")
-            elif "component" in normalized_question:
-                answer = extract_direct_definition(text, "component")
-            else:
-                answer = "Sorry, I couldn't find a clear answer in the document."
-    except Exception as e:
-        answer = f"An error occurred: {str(e)}"
     history.append((question, answer))
     return "", history
-def extract_direct_definition(text, term):
-    """Try to find a sentence that directly defines the term"""
-    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
-    term = term.lower()
-    # Look for sentences that contain the term and seem like definitions
-    candidates = []
-    for sent in sentences:
-        lower_sent = sent.lower()
-        if term in lower_sent:
-            # Look for patterns like "X is Y" or "X refers to Y"
-            if (" is " in lower_sent or " are " in lower_sent or
-                " refers to " in lower_sent or " defined as " in lower_sent):
-                candidates.append(sent)
-    if candidates:
-        return candidates[0]  # Return first definition found
-    return f"Information about {term} not found in the document."
 with gr.Blocks() as demo:
-    gr.Markdown("## Enhanced Document QA with Smart Retrieval")
     with gr.Row():
-        file_input = gr.File(label="Upload PDF or Word", file_types=[".pdf", ".docx"])
     with gr.Row():
         chatbot = gr.Chatbot(height=400)
     with gr.Row():
         question = gr.Textbox(label="Ask your question", placeholder="Type your question here...")
-    state = gr.State([])
-    question.submit(
-        ask_question,
-        [file_input, question, state],
-        [question, chatbot]
-    )
 demo.launch()

 import gradio as gr
 from PyPDF2 import PdfReader
 import docx
+import re
+import torch
 from sentence_transformers import SentenceTransformer, util
 from transformers import pipeline
+from nltk.tokenize import sent_tokenize
+import nltk
+# Download NLTK data (run once)
+nltk.download('punkt')
 # Load models
 embedder = SentenceTransformer("all-MiniLM-L6-v2")
+qa_pipeline = pipeline(
+    "question-answering",
+    model="deepset/roberta-base-squad2",
+    device=0 if torch.cuda.is_available() else -1
+)
+# Canned answers keyed by lowercase phrase; check_keywords() returns the first one whose key is a substring of the question.
+KEYWORD_RESPONSES = {
+    "what is a system": """
+    A system is a collection of interrelated components designed to perform specific functions. Key characteristics:
+    - Composed of multiple components that work together
+    - Has defined objectives/purpose
+    - Operates within an environment
+    - Components communicate with each other
+    Example from document: A car is made up of an engine, wheels, brakes and other related items that work together.
+    """,
+    "types of a system": """
+    The document clearly states there are two main types:
+    1. Natural Systems - exist independently without human involvement (e.g., ecosystems)
+    2. Artificial Systems - created by humans (e.g., computer systems, transportation systems)
+    """,
+    "what is an artificial system": """
+    Artificial Systems are human-created systems designed for specific purposes. Key points:
+    - Created by people to solve problems or perform tasks
+    - Three main categories mentioned in the document:
+      • Knowledge Systems (math, databases)
+      • Engineering Systems (civil, mechanical)
+      • Social Systems (governments, organizations)
+    Example: A computer system processes data to perform tasks.
+    """,
+    "what is a natural system": """
+    Natural Systems exist independently without human involvement. The document specifies:
+    - Governed by natural laws and processes
+    - Four subtypes:
+      1. Physical (planets, atoms)
+      2. Chemical (chemical reactions)
+      3. Biological (living organisms)
+      4. Psychological (human mind/behavior)
+    Example: An ecosystem where species interact naturally.
+    """,
+    "components of a system": """
+    The document describes system components as:
+    1. Fundamental building blocks that work together
+    2. Each component has a specific role
+    3. Must communicate effectively
+    4. Examples given:
+       - In computers: CPU, memory, I/O devices
+       - In cars: engine, wheels, brakes
+    The exact components vary by system type.
+    """
+}
 def extract_text(file):
     if file.name.endswith(".pdf"):
+        text = ""
+        for page in PdfReader(file).pages:
+            text += page.extract_text() or ""
     elif file.name.endswith(".docx"):
         text = "\n".join([p.text for p in docx.Document(file).paragraphs])
     else:
         return ""
+    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
+    text = re.sub(r'\[.*?\]', '', text)  # Remove [comments]
     return text.strip()
+def chunk_text(text, chunk_size=500, overlap=50):
+    sentences = sent_tokenize(text)
     chunks = []
     current_chunk = ""
+    for sentence in sentences:
+        if len(current_chunk) + len(sentence) < chunk_size:
+            current_chunk += sentence + " "
         else:
             chunks.append(current_chunk.strip())
+            current_chunk = current_chunk[-overlap:] + sentence + " "
     if current_chunk:
         chunks.append(current_chunk.strip())
     return chunks
+def get_relevant_chunks(text, question, embedder, top_k=3):
+    chunks = chunk_text(text)
+    if not chunks:
+        return []
+    emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
+    emb_question = embedder.encode(question, convert_to_tensor=True)
+    scores = util.pytorch_cos_sim(emb_question, emb_chunks)[0]
+    top_indices = scores.topk(top_k).indices.tolist()
+    return [chunks[i] for i in top_indices]
+def extract_direct_definition(text, keyword):
+    sentences = sent_tokenize(text)
+    keyword = keyword.lower()
+    for sentence in sentences:
+        if keyword in sentence.lower():
+            if " is " in sentence or " are " in sentence or " defined as " in sentence:
+                return sentence
+    return None
+def clean_answer(answer):
+    answer = re.sub(r'\[\d+\]', '', answer)  # Remove citations
+    sentences = list(dict.fromkeys(sent_tokenize(answer)))  # Remove duplicates
+    return " ".join(sentences).strip()
+def check_keywords(question):
+    question_lower = question.lower()
+    for keyword, response in KEYWORD_RESPONSES.items():
+        if keyword in question_lower:
+            return response
+    return None
 def ask_question(file, question, history):
     if not file:
         return "Please upload a file.", history
+    # Check for predefined answers first
+    predefined_answer = check_keywords(question)
+    if predefined_answer:
+        history.append((question, predefined_answer))
+        return "", history
     text = extract_text(file)
     if not text:
+        return "Could not extract text from the document.", history
+    # Get relevant context
+    chunks = get_relevant_chunks(text, question, embedder)
     if not chunks:
+        return "No relevant information found in document.", history
+    context = " ".join(chunks)
+    # Try QA pipeline
+    result = qa_pipeline(question=question, context=context)
+    if result["score"] > 0.2:  # Confidence threshold
+        answer = clean_answer(result["answer"])
+    else:
+        # Fallback to keyword extraction
+        keywords = ["system", "natural", "artificial", "component", "objective"]
+        for keyword in keywords:
+            if keyword in question.lower():
+                answer = extract_direct_definition(text, keyword)
+                if answer:
+                    break
+        else:
+            answer = "Sorry, I couldn't find a clear answer in the document."
     history.append((question, answer))
     return "", history
 with gr.Blocks() as demo:
+    gr.Markdown("## Enhanced Document QA System")
     with gr.Row():
+        file_input = gr.File(label="Upload PDF or DOCX", file_types=[".pdf", ".docx"])
     with gr.Row():
         chatbot = gr.Chatbot(height=400)
     with gr.Row():
         question = gr.Textbox(label="Ask your question", placeholder="Type your question here...")
+    with gr.Row():
+        gr.Button("👍"), gr.Button("👎")
+    state = gr.State([])
+    question.submit(ask_question, [file_input, question, state], [question, chatbot])
 demo.launch()