Spaces:

NaimaAqeel
/

Chatbot

Running

App Files Files Community

NaimaAqeel committed 11 days ago

Commit

b1bb0b3

verified ·

1 Parent(s): 2ecc80b

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -134

app.py CHANGED Viewed

@@ -1,188 +1,166 @@
 import gradio as gr
 from PyPDF2 import PdfReader
 import docx
 import re
 import torch
-from sentence_transformers import SentenceTransformer, util
-from transformers import pipeline
-from nltk.tokenize import sent_tokenize
-import nltk
-# Download NLTK data (run once)
-nltk.download('punkt')
 # Load models
 embedder = SentenceTransformer("all-MiniLM-L6-v2")
-qa_pipeline = pipeline(
-    "question-answering",
-    model="deepset/roberta-base-squad2",
-    device=0 if torch.cuda.is_available() else -1
-)
-# Predefined answers for key questions
-KEYWORD_RESPONSES = {
-    "what is a system": """
-    A system is a collection of interrelated components designed to perform specific functions. Key characteristics:
-    - Composed of multiple components that work together
-    - Has defined objectives/purpose
-    - Operates within an environment
-    - Components communicate with each other
-    Example from document: A car is made up of an engine, wheels, brakes and other related items that work together.
-    """,
-    "types of a system": """
-    The document clearly states there are two main types:
-    1. Natural Systems - exist independently without human involvement (e.g., ecosystems)
-    2. Artificial Systems - created by humans (e.g., computer systems, transportation systems)
-    """,
-    "what is an artificial system": """
-    Artificial Systems are human-created systems designed for specific purposes. Key points:
-    - Created by people to solve problems or perform tasks
-    - Three main categories mentioned in the document:
-      • Knowledge Systems (math, databases)
-      • Engineering Systems (civil, mechanical)
-      • Social Systems (governments, organizations)
-    Example: A computer system processes data to perform tasks.
-    """,
-    "what is a natural system": """
-    Natural Systems exist independently without human involvement. The document specifies:
-    - Governed by natural laws and processes
-    - Four subtypes:
-      1. Physical (planets, atoms)
-      2. Chemical (chemical reactions)
-      3. Biological (living organisms)
-      4. Psychological (human mind/behavior)
-    Example: An ecosystem where species interact naturally.
-    """,
-    "components of a system": """
-    The document describes system components as:
-    1. Fundamental building blocks that work together
-    2. Each component has a specific role
-    3. Must communicate effectively
-    4. Examples given:
-       - In computers: CPU, memory, I/O devices
-       - In cars: engine, wheels, brakes
-    The exact components vary by system type.
-    """
-}
 def extract_text(file):
     if file.name.endswith(".pdf"):
-        text = ""
-        for page in PdfReader(file).pages:
-            text += page.extract_text() or ""
     elif file.name.endswith(".docx"):
         text = "\n".join([p.text for p in docx.Document(file).paragraphs])
     else:
         return ""
-    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
-    text = re.sub(r'\[.*?\]', '', text)  # Remove [comments]
     return text.strip()
-def chunk_text(text, chunk_size=500, overlap=50):
-    sentences = sent_tokenize(text)
     chunks = []
     current_chunk = ""
-    for sentence in sentences:
-        if len(current_chunk) + len(sentence) < chunk_size:
-            current_chunk += sentence + " "
         else:
             chunks.append(current_chunk.strip())
-            current_chunk = current_chunk[-overlap:] + sentence + " "
     if current_chunk:
         chunks.append(current_chunk.strip())
     return chunks
-def get_relevant_chunks(text, question, embedder, top_k=3):
-    chunks = chunk_text(text)
-    if not chunks:
-        return []
-    emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
-    emb_question = embedder.encode(question, convert_to_tensor=True)
-    scores = util.pytorch_cos_sim(emb_question, emb_chunks)[0]
-    top_indices = scores.topk(top_k).indices.tolist()
-    return [chunks[i] for i in top_indices]
-def extract_direct_definition(text, keyword):
-    sentences = sent_tokenize(text)
-    keyword = keyword.lower()
-    for sentence in sentences:
-        if keyword in sentence.lower():
-            if " is " in sentence or " are " in sentence or " defined as " in sentence:
-                return sentence
-    return None
-def clean_answer(answer):
-    answer = re.sub(r'\[\d+\]', '', answer)  # Remove citations
-    sentences = list(dict.fromkeys(sent_tokenize(answer)))  # Remove duplicates
-    return " ".join(sentences).strip()
-def check_keywords(question):
-    question_lower = question.lower()
-    for keyword, response in KEYWORD_RESPONSES.items():
-        if keyword in question_lower:
-            return response
     return None
 def ask_question(file, question, history):
     if not file:
         return "Please upload a file.", history
-    # Check for predefined answers first
-    predefined_answer = check_keywords(question)
-    if predefined_answer:
-        history.append((question, predefined_answer))
-        return "", history
     text = extract_text(file)
     if not text:
-        return "Could not extract text from the document.", history
-    # Get relevant context
-    chunks = get_relevant_chunks(text, question, embedder)
     if not chunks:
-        return "No relevant information found in document.", history
-    context = " ".join(chunks)
-    # Try QA pipeline
-    result = qa_pipeline(question=question, context=context)
-    if result["score"] > 0.2:  # Confidence threshold
-        answer = clean_answer(result["answer"])
-    else:
-        # Fallback to keyword extraction
-        keywords = ["system", "natural", "artificial", "component", "objective"]
-        for keyword in keywords:
-            if keyword in question.lower():
-                answer = extract_direct_definition(text, keyword)
-                if answer:
-                    break
-        else:
             answer = "Sorry, I couldn't find a clear answer in the document."
     history.append((question, answer))
     return "", history
 with gr.Blocks() as demo:
-    gr.Markdown("## Enhanced Document QA System")
     with gr.Row():
-        file_input = gr.File(label="Upload PDF or DOCX", file_types=[".pdf", ".docx"])
     with gr.Row():
         chatbot = gr.Chatbot(height=400)
     with gr.Row():
         question = gr.Textbox(label="Ask your question", placeholder="Type your question here...")
-    with gr.Row():
-        gr.Button("👍"), gr.Button("👎")
     state = gr.State([])
-    question.submit(ask_question, [file_input, question, state], [question, chatbot])
 demo.launch()

 import gradio as gr
 from PyPDF2 import PdfReader
 import docx
+from sentence_transformers import SentenceTransformer, util
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import re
 import torch
 # Load models
 # All models below are instantiated once at import time and shared by every request.
 # Sentence-embedding model; ask_question uses it to embed the document chunks and
 # the question before ranking chunks by cosine similarity.
 embedder = SentenceTransformer("all-MiniLM-L6-v2")
 # Extractive question-answering pipeline; device=0 selects the first GPU when CUDA
 # is available, -1 selects the CPU.
+qa_pipeline = pipeline("question-answering",
+                      model="distilbert-base-cased-distilled-squad",
+                      device=0 if torch.cuda.is_available() else -1)
+# Load GPT model (using GPT-2 as example - replace with GPT-3/4 if available)
+gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
+gpt_model = AutoModelForCausalLM.from_pretrained("gpt2")
 # Switch the causal LM to evaluation mode; it is only used for generation here.
+gpt_model.eval()
 def extract_text(file):
     """Return the whitespace-normalised text of an uploaded PDF or DOCX file.

     Args:
         file: Either an object exposing a ``name`` attribute (the upload
             wrapper) or a plain path string.

     Returns:
         The extracted text with every run of whitespace collapsed to a single
         space and the ends stripped, or ``""`` for unsupported extensions.
     """
     # Accept both an upload wrapper and a bare path so the handler does not
     # break when the UI component is configured to deliver file paths.
     path = file if isinstance(file, str) else file.name
     # Compare case-insensitively so "REPORT.PDF" is treated like "report.pdf".
     suffix_source = path.lower()
     if suffix_source.endswith(".pdf"):
         # extract_text() may yield None for pages without a text layer.
         text = "\n".join([page.extract_text() or "" for page in PdfReader(file).pages])
     elif suffix_source.endswith(".docx"):
         text = "\n".join([p.text for p in docx.Document(file).paragraphs])
     else:
         return ""
     # Clean up text: replace multiple whitespace with a single space.
     text = re.sub(r'\s+', ' ', text)
     return text.strip()
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into sentence-aligned chunks of roughly *chunk_size* characters.

    Sentences are accumulated until adding the next one would reach
    ``chunk_size``; the chunk is then closed and the next chunk is seeded with
    the last ``overlap`` characters of the closed one so context carries over.
    A single sentence longer than ``chunk_size`` is kept whole, not split.

    Args:
        text: Document text to split.
        chunk_size: Soft upper bound on chunk length, in characters.
        overlap: Number of trailing characters of a closed chunk repeated at
            the start of the next one; ``0`` (or less) disables the overlap.

    Returns:
        A list of non-empty, stripped chunk strings; empty for blank input.
    """
    # Break on whitespace that follows "." or "?"; the two negative
    # lookbehinds skip dotted abbreviations and short capitalised titles.
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    chunks = []
    current_chunk = ""
    for sent in sentences:
        if len(current_chunk) + len(sent) < chunk_size:
            current_chunk += sent + " "
            continue
        # Close the current chunk, but never emit an empty one (this happens
        # when the very first sentence already exceeds chunk_size).
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)
        # Keep some overlap between chunks for context. Guard overlap == 0:
        # s[-0:] is the WHOLE string, which would duplicate the entire chunk.
        tail = current_chunk[-overlap:] if overlap > 0 else ""
        current_chunk = tail + sent + " "
    last = current_chunk.strip()
    if last:
        chunks.append(last)
    return chunks
def generate_with_gpt(prompt, max_length=150):
    """Sample a continuation of *prompt* from the module-level GPT model.

    Args:
        prompt: Text the model should continue.
        max_length: Maximum number of NEW tokens to generate. Counting only
            new tokens keeps a long prompt from using up the whole budget
            and leaving nothing to generate.

    Returns:
        The generated continuation only (the prompt is not echoed back),
        with surrounding whitespace stripped.
    """
    inputs = gpt_tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Keep prompt + generated tokens inside the model's context window.
    # NOTE(review): n_positions is the GPT-2 config field; 1024 is the GPT-2
    # default used as a fallback - confirm if the model is swapped out.
    context_window = getattr(gpt_model.config, "n_positions", 1024)
    max_prompt_tokens = max(1, context_window - max_length)
    if input_ids.shape[1] > max_prompt_tokens:
        # Drop the oldest tokens; the end of the prompt carries the cue the
        # model is expected to continue from.
        input_ids = input_ids[:, -max_prompt_tokens:]
        attention_mask = attention_mask[:, -max_prompt_tokens:]
    prompt_tokens = input_ids.shape[1]

    with torch.no_grad():
        outputs = gpt_model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            # GPT-2 defines no pad token; reuse EOS so generate() has one.
            pad_token_id=gpt_tokenizer.eos_token_id,
        )
    # Slice off the prompt so callers receive only the newly generated text.
    generated = outputs[0][prompt_tokens:]
    return gpt_tokenizer.decode(generated, skip_special_tokens=True).strip()
def refine_answer_with_gpt(context, question, initial_answer):
    """Ask the GPT helper to rewrite *initial_answer* using *context*.

    Builds a fixed-layout prompt from the context, the question and the
    initial answer, and returns whatever ``generate_with_gpt`` produces for it.
    """
    # The leading empty line, the four-space indents and the trailing indent
    # are part of the prompt text and are kept exactly.
    prompt_lines = [
        "",
        "    Based on the following context, refine the answer to make it more clear and complete:",
        f"    Context: {context}",
        f"    Question: {question}",
        f"    Initial Answer: {initial_answer}",
        "    Improved Answer:",
        "    ",
    ]
    return generate_with_gpt("\n".join(prompt_lines))
def extract_direct_definition(text, term):
    """Return the first sentence of *text* that looks like a definition of *term*.

    A sentence qualifies when it contains *term* (case-insensitively) together
    with one of the phrases " is ", " are ", " refers to " or " defined as ".

    Args:
        text: Document text to search.
        term: Term whose definition is wanted.

    Returns:
        The matching sentence with its original casing, or ``None`` when no
        sentence qualifies or *term* is blank.
    """
    term = term.lower()
    # A blank term is a substring of every sentence and would "match" anything.
    if not term.strip():
        return None
    definition_markers = (" is ", " are ", " refers to ", " defined as ")
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    for sent in sentences:
        lower_sent = sent.lower()
        if term in lower_sent and any(marker in lower_sent for marker in definition_markers):
            # Only the first hit is ever used, so stop scanning here.
            return sent
    return None
 def ask_question(file, question, history):
     """Answer *question* about the uploaded document and record the exchange.

     Strategy:
       1. For a few known terms, look for a sentence that directly defines
          the term.
       2. Otherwise rank the document chunks by similarity to the question,
          run the extractive QA pipeline on the best chunk(s) and, for
          answers longer than two words, pass the result to the GPT
          refinement step.
       3. Fall back to an apology message.

     Args:
         file: Uploaded file as delivered by the file input component.
         question: The user's question.
         history: List of ``(question, answer)`` tuples; appended to in place.

     Returns:
         A ``(textbox_value, history)`` pair: an empty string after a
         completed turn, or a status message when no answer was attempted.
     """
     if not file:
         return "Please upload a file.", history
     text = extract_text(file)
     if not text:
         return "Could not extract text from the file.", history
     chunks = chunk_text(text)
     if not chunks:
         return "No meaningful text chunks could be created.", history
     # Normalize question for better matching
     normalized_question = question.lower().strip(" ?")
     # Must be bound before the checks below: when the question mentions none
     # of the known terms, no branch assigns it and "if not answer" would
     # raise UnboundLocalError (previously surfaced as "An error occurred").
     answer = None
     try:
         # First try to find direct definitions; first matching term wins.
         for term in ("artificial system", "natural system", "component"):
             if term in normalized_question:
                 answer = extract_direct_definition(text, term)
                 break
         # If no direct definition found, use semantic search
         if not answer:
             emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
             emb_question = embedder.encode(question, convert_to_tensor=True)
             scores = util.pytorch_cos_sim(emb_question, emb_chunks)[0]
             best_idx = scores.argmax().item()
             best_chunk = chunks[best_idx]
             # Combine top chunks if confidence is low
             if scores[best_idx] < 0.3:
                 top_k = min(3, len(chunks))
                 best_indices = scores.topk(top_k).indices.tolist()
                 best_chunk = " ".join([chunks[i] for i in best_indices])
             # Get initial answer from QA model
             result = qa_pipeline(question=question, context=best_chunk)
             answer = result["answer"] if result["score"] > 0.1 else None
             # Refine with GPT only when the extracted span is more than two words.
             if answer and len(answer.split()) > 2:
                 answer = refine_answer_with_gpt(best_chunk, question, answer)
         # Final fallback
         if not answer:
             answer = "Sorry, I couldn't find a clear answer in the document."
     except Exception as e:
         # UI boundary: report the failure in the chat instead of crashing the handler.
         answer = f"An error occurred: {str(e)}"
     history.append((question, answer))
     return "", history
 # Build the UI: a title, a file upload, the chat display and a question box,
 # each placed in its own row.
 with gr.Blocks() as demo:
+    gr.Markdown("## Enhanced Document QA with GPT Integration")
     with gr.Row():
         # Upload is restricted to the two extensions extract_text understands.
+        file_input = gr.File(label="Upload PDF or Word", file_types=[".pdf", ".docx"])
     with gr.Row():
         chatbot = gr.Chatbot(height=400)
     with gr.Row():
         question = gr.Textbox(label="Ask your question", placeholder="Type your question here...")
     # Conversation history handed to ask_question as its third argument.
     # NOTE(review): state is an input but not an output of the submit event, so
     # history only persists if the in-place append is retained - confirm.
     state = gr.State([])
     # On submit, ask_question's first return value goes to the textbox and the
     # second to the chat display. A completed turn returns "" (clearing the box);
     # the early-return status messages are therefore written into the textbox.
+    question.submit(
+        ask_question,
+        [file_input, question, state],
+        [question, chatbot]
+    )
 # Start the web server for the interface.
 demo.launch()