Spaces:

Shriharsh
/

Web_Content_QA

Sleeping

App Files Community

Shriharsh committed on Mar 21

Commit

681b2fa

verified ·

1 Parent(s): e637b24

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -5

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # Web Content Q&A Tool for Hugging Face Spaces
-# Optimized for memory constraints (2GB RAM) and 24-hour timeline
 # Features: Ingest up to 3 URLs, ask questions, get concise answers using DistilBERT with PyTorch
 import gradio as gr
 from bs4 import BeautifulSoup
@@ -12,6 +12,7 @@ import torch
 from huggingface_hub import hf_hub_download, HfFolder
 from huggingface_hub.utils import configure_http_backend
 import requests as hf_requests
 # Configure Hugging Face Hub to use a custom session with increased timeout and retries
 def create_custom_session():
@@ -56,6 +57,36 @@ model = torch.quantization.quantize_dynamic(
 # Create the QA pipeline with PyTorch
 qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1)  # device=-1 for CPU
 def ingest_urls(urls):
     """
     Ingest up to 3 URLs, scrape content, and compute embeddings.
@@ -108,6 +139,7 @@ def answer_question(question):
     Answer a question using retrieved context and DistilBERT QA (PyTorch).
     Retrieves top 3 paragraphs to improve answer accuracy.
     If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
     """
     global corpus, embeddings, sources_list
     if not corpus or embeddings is None:
@@ -118,7 +150,7 @@ def answer_question(question):
     # Compute cosine similarity with stored embeddings
     cos_scores = util.cos_sim(question_embedding, embeddings)[0]
-    top_k = min(1, len(corpus))  # Get top 3 paragraphs to improve accuracy
     top_indices = np.argsort(-cos_scores)[:top_k]
     # Retrieve context (top 3 paragraphs)
@@ -132,9 +164,17 @@ def answer_question(question):
     answer = result['answer']
     confidence = result['score']
-    # Format response with answer, confidence, and sources
-    sources_str = "\n".join(set(sources))  # Unique sources
-    return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
 def clear_all():
     """Clear all inputs and outputs for a fresh start."""

 # Web Content Q&A Tool for Hugging Face Spaces
 # Features: Ingest up to 3 URLs, ask questions, get concise answers using DistilBERT with PyTorch
+# Includes keyword search fallback for low-confidence QA answers
 import gradio as gr
 from bs4 import BeautifulSoup
 from huggingface_hub import hf_hub_download, HfFolder
 from huggingface_hub.utils import configure_http_backend
 import requests as hf_requests
+import re
 # Configure Hugging Face Hub to use a custom session with increased timeout and retries
 def create_custom_session():
 # Create the QA pipeline with PyTorch
 qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1)  # device=-1 for CPU
+# Keyword search function for fallback
def keyword_search(question, corpus, sources_list):
    """
    Return the paragraph sharing the most distinct keywords with the question.

    This is the fallback used when the QA model's confidence is low: the
    question is reduced to its non-stop-word tokens and each paragraph is
    scored by how many of those distinct tokens it contains.

    Args:
        question: The user's question as a string.
        corpus: Sequence of paragraph strings to search.
        sources_list: Sequence parallel to ``corpus``; the i-th entry is the
            source of the i-th paragraph. Paragraphs without a matching
            entry are not considered.

    Returns:
        A ``(paragraph, source)`` tuple for the best match (the earliest
        paragraph wins ties). When the question has no usable keywords, or no
        paragraph contains any of them, a ``(message, None)`` tuple is
        returned instead.
    """
    stop_words = {
        "what", "is", "the", "a", "an", "in", "on", "at", "for", "with",
        "and", "or", "but", "not", "this", "that", "these", "those", "to",
        "of", "it", "by", "as", "if", "when", "where", "who", "which",
        "how", "why",
    }

    def tokenize(text):
        # Extract word tokens instead of deleting punctuation: deleting it
        # fuses neighbours ("cost/price" -> "costprice") so they never match.
        # [^\W_] also keeps digits and non-ASCII letters, so keywords such as
        # "2020" or "café" survive. An inner apostrophe stays inside the word
        # so contractions like "don't" remain a single token.
        return re.findall(r"[^\W_]+(?:['’][^\W_]+)*", text.lower())

    # A set, so a word repeated in the question is only counted once.
    keywords = {token for token in tokenize(question) if token not in stop_words}
    if not keywords:
        return "No keywords found for search.", None

    best_paragraph = None
    best_source = None
    best_count = 0
    for para, source in zip(corpus, sources_list):
        count = len(keywords.intersection(tokenize(para)))
        # Strict comparison keeps the first paragraph on ties and skips
        # paragraphs with zero matches.
        if count > best_count:
            best_count = count
            best_paragraph = para
            best_source = source

    if best_paragraph is None:
        return "No relevant paragraph found.", None
    return best_paragraph, best_source
 def ingest_urls(urls):
     """
     Ingest up to 3 URLs, scrape content, and compute embeddings.
     Answer a question using retrieved context and DistilBERT QA (PyTorch).
     Retrieves top 3 paragraphs to improve answer accuracy.
     If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
+    If QA confidence is below 0.4, falls back to keyword search.
     """
     global corpus, embeddings, sources_list
     if not corpus or embeddings is None:
     # Compute cosine similarity with stored embeddings
     cos_scores = util.cos_sim(question_embedding, embeddings)[0]
+    top_k = min(2, len(corpus))  # Get top 3 paragraphs to improve accuracy
     top_indices = np.argsort(-cos_scores)[:top_k]
     # Retrieve context (the top_k most similar paragraphs)
     answer = result['answer']
     confidence = result['score']
+    if confidence >= 0.4:
+        # Format response with answer, confidence, and sources
+        sources_str = "\n".join(set(sources))  # Unique sources
+        return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
+    else:
+        # Perform keyword search
+        kw_answer, kw_source = keyword_search(question, corpus, sources_list)
+        if kw_source:
+            return f"Answer: {kw_answer} (from keyword search, as QA confidence was {confidence:.2f})\nSource: {kw_source}"
+        else:
+            return "No relevant answer found from keyword search."
 def clear_all():
     """Clear all inputs and outputs for a fresh start."""