Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
# Web Content Q&A Tool for Hugging Face Spaces
|
2 |
-
# Optimized for memory constraints (2GB RAM) and 24-hour timeline
|
3 |
# Features: Ingest up to 3 URLs, ask questions, get concise answers using DistilBERT with PyTorch
|
|
|
4 |
|
5 |
import gradio as gr
|
6 |
from bs4 import BeautifulSoup
|
@@ -12,6 +12,7 @@ import torch
|
|
12 |
from huggingface_hub import hf_hub_download, HfFolder
|
13 |
from huggingface_hub.utils import configure_http_backend
|
14 |
import requests as hf_requests
|
|
|
15 |
|
16 |
# Configure Hugging Face Hub to use a custom session with increased timeout and retries
|
17 |
def create_custom_session():
|
@@ -56,6 +57,36 @@ model = torch.quantization.quantize_dynamic(
|
|
56 |
# Create the QA pipeline with PyTorch
|
57 |
qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1) # device=-1 for CPU
|
58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
def ingest_urls(urls):
|
60 |
"""
|
61 |
Ingest up to 3 URLs, scrape content, and compute embeddings.
|
@@ -108,6 +139,7 @@ def answer_question(question):
|
|
108 |
Answer a question using retrieved context and DistilBERT QA (PyTorch).
|
109 |
Retrieves top 3 paragraphs to improve answer accuracy.
|
110 |
If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
|
|
|
111 |
"""
|
112 |
global corpus, embeddings, sources_list
|
113 |
if not corpus or embeddings is None:
|
@@ -118,7 +150,7 @@ def answer_question(question):
|
|
118 |
|
119 |
# Compute cosine similarity with stored embeddings
|
120 |
cos_scores = util.cos_sim(question_embedding, embeddings)[0]
|
121 |
-
top_k = min(
|
122 |
top_indices = np.argsort(-cos_scores)[:top_k]
|
123 |
|
124 |
# Retrieve context (top 3 paragraphs)
|
@@ -132,9 +164,17 @@ def answer_question(question):
|
|
132 |
answer = result['answer']
|
133 |
confidence = result['score']
|
134 |
|
135 |
-
|
136 |
-
|
137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
|
139 |
def clear_all():
|
140 |
"""Clear all inputs and outputs for a fresh start."""
|
|
|
1 |
# Web Content Q&A Tool for Hugging Face Spaces
|
|
|
2 |
# Features: Ingest up to 3 URLs, ask questions, get concise answers using DistilBERT with PyTorch
|
3 |
+
# Includes keyword search fallback for low-confidence QA answers
|
4 |
|
5 |
import gradio as gr
|
6 |
from bs4 import BeautifulSoup
|
|
|
12 |
from huggingface_hub import hf_hub_download, HfFolder
|
13 |
from huggingface_hub.utils import configure_http_backend
|
14 |
import requests as hf_requests
|
15 |
+
import re
|
16 |
|
17 |
# Configure Hugging Face Hub to use a custom session with increased timeout and retries
|
18 |
def create_custom_session():
|
|
|
57 |
# Create the QA pipeline with PyTorch
|
58 |
qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1) # device=-1 for CPU
|
59 |
|
60 |
+
# Keyword search function for fallback
|
61 |
+
# Question words and other fillers that carry no search signal.
_KEYWORD_STOP_WORDS = frozenset({
    "what", "is", "the", "a", "an", "in", "on", "at", "for", "with",
    "and", "or", "but", "not", "this", "that", "these", "those", "to",
    "of", "it", "by", "as", "if", "when", "where", "who", "which",
    "how", "why",
})

# Punctuation/symbols: anything that is not a letter, digit or whitespace.
# ``\w`` is Unicode-aware for ``str`` patterns, so digits and non-ASCII
# letters survive cleaning; the underscore (part of ``\w``) is stripped
# explicitly so it is still treated as punctuation.
_KEYWORD_NOISE_RE = re.compile(r"[^\w\s]|_")


def keyword_search(question, corpus, sources_list):
    """Return the paragraph sharing the most keywords with *question*.

    Keywords are the distinct, lower-cased, punctuation-free words of the
    question that are not stop words. Each paragraph is scored by how many
    of those keywords it contains as whole words; on a tie the earliest
    paragraph wins.

    Args:
        question: The user's question. ``None`` or an empty string is
            treated as a question without keywords.
        corpus: Iterable of paragraph strings to search.
        sources_list: Iterable of sources, parallel to ``corpus``. If the
            two differ in length, only the paired-up entries are searched.

    Returns:
        A ``(text, source)`` tuple. On success ``text`` is the best
        paragraph (unmodified) and ``source`` its matching entry from
        ``sources_list``. When the question has no keywords or no
        paragraph contains any of them, ``text`` is an explanatory
        message and ``source`` is ``None``.
    """

    def tokenize(text):
        # A set gives O(1) membership tests and de-duplicates words, so a
        # keyword repeated in the question is not counted more than once.
        return set(_KEYWORD_NOISE_RE.sub("", text or "").lower().split())

    keywords = tokenize(question) - _KEYWORD_STOP_WORDS
    if not keywords:
        return "No keywords found for search.", None

    best_paragraph = None
    best_source = None
    best_count = 0

    # zip() keeps paragraph and source paired without index arithmetic and
    # cannot raise IndexError when the two sequences differ in length.
    for para, source in zip(corpus, sources_list):
        count = len(keywords & tokenize(para))
        if count > best_count:  # strict '>' keeps the first of equal scores
            best_count = count
            best_paragraph = para
            best_source = source

    if best_paragraph is None:
        return "No relevant paragraph found.", None

    return best_paragraph, best_source
|
89 |
+
|
90 |
def ingest_urls(urls):
|
91 |
"""
|
92 |
Ingest up to 3 URLs, scrape content, and compute embeddings.
|
|
|
139 |
Answer a question using retrieved context and DistilBERT QA (PyTorch).
|
140 |
Retrieves top 3 paragraphs to improve answer accuracy.
|
141 |
If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
|
142 |
+
If QA confidence is below 0.4, falls back to keyword search.
|
143 |
"""
|
144 |
global corpus, embeddings, sources_list
|
145 |
if not corpus or embeddings is None:
|
|
|
150 |
|
151 |
# Compute cosine similarity with stored embeddings
|
152 |
cos_scores = util.cos_sim(question_embedding, embeddings)[0]
|
153 |
+
top_k = min(2, len(corpus)) # Get top 3 paragraphs to improve accuracy
|
154 |
top_indices = np.argsort(-cos_scores)[:top_k]
|
155 |
|
156 |
# Retrieve context (top 3 paragraphs)
|
|
|
164 |
answer = result['answer']
|
165 |
confidence = result['score']
|
166 |
|
167 |
+
if confidence >= 0.4:
|
168 |
+
# Format response with answer, confidence, and sources
|
169 |
+
sources_str = "\n".join(set(sources)) # Unique sources
|
170 |
+
return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
|
171 |
+
else:
|
172 |
+
# Perform keyword search
|
173 |
+
kw_answer, kw_source = keyword_search(question, corpus, sources_list)
|
174 |
+
if kw_source:
|
175 |
+
return f"Answer: {kw_answer} (from keyword search, as QA confidence was {confidence:.2f})\nSource: {kw_source}"
|
176 |
+
else:
|
177 |
+
return "No relevant answer found from keyword search."
|
178 |
|
179 |
def clear_all():
|
180 |
"""Clear all inputs and outputs for a fresh start."""
|