Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
# Web Content Q&A Tool for Hugging Face Spaces
|
2 |
-
#
|
|
|
3 |
# Includes keyword search fallback for low-confidence QA answers
|
4 |
|
5 |
import gradio as gr
|
@@ -57,6 +58,16 @@ model = torch.quantization.quantize_dynamic(
|
|
57 |
# Create the QA pipeline with PyTorch
|
58 |
qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1) # device=-1 for CPU
|
59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
# Keyword search function for fallback
|
61 |
def keyword_search(question, corpus, sources_list):
|
62 |
stop_words = set(["what", "is", "the", "a", "an", "in", "on", "at", "for", "with", "and", "or", "but", "not", "this", "that", "these", "those", "to", "of", "it", "by", "as", "if", "when", "where", "who", "which", "how", "why"])
|
@@ -85,6 +96,8 @@ def keyword_search(question, corpus, sources_list):
|
|
85 |
if best_paragraph is None:
|
86 |
return "No relevant paragraph found.", None
|
87 |
|
|
|
|
|
88 |
return best_paragraph, best_source
|
89 |
|
90 |
def ingest_urls(urls):
|
@@ -140,6 +153,7 @@ def answer_question(question):
|
|
140 |
Retrieves the top 2 paragraphs to improve answer accuracy.
|
141 |
If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
|
142 |
If QA confidence is below 0.4, falls back to keyword search.
|
|
|
143 |
"""
|
144 |
global corpus, embeddings, sources_list
|
145 |
if not corpus or embeddings is None:
|
@@ -153,7 +167,7 @@ def answer_question(question):
|
|
153 |
top_k = min(2, len(corpus)) # Get top 2 paragraphs to improve accuracy
|
154 |
top_indices = np.argsort(-cos_scores)[:top_k]
|
155 |
|
156 |
-
# Retrieve context (top
|
157 |
contexts = [corpus[i] for i in top_indices]
|
158 |
context = " ".join(contexts) # Concatenate with space
|
159 |
sources = [sources_list[i] for i in top_indices]
|
@@ -165,7 +179,11 @@ def answer_question(question):
|
|
165 |
confidence = result['score']
|
166 |
|
167 |
if confidence >= 0.4:
|
168 |
-
#
|
|
|
|
|
|
|
|
|
169 |
sources_str = "\n".join(set(sources)) # Unique sources
|
170 |
return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
|
171 |
else:
|
|
|
1 |
# Web Content Q&A Tool for Hugging Face Spaces
|
2 |
+
# Optimized for memory constraints (2GB RAM) and 24-hour timeline
|
3 |
+
# Features: Ingest up to 3 URLs, ask questions, get concise one-line answers using DistilBERT with PyTorch
|
4 |
# Includes keyword search fallback for low-confidence QA answers
|
5 |
|
6 |
import gradio as gr
|
|
|
58 |
# Create the QA pipeline with PyTorch
|
59 |
qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1) # device=-1 for CPU
|
60 |
|
61 |
+
# Utility function to truncate text to one line
|
62 |
+
def truncate_to_one_line(text, max_length=100):
    """Reduce *text* to its first sentence, rendered on a single line.

    Whitespace (including newlines) is collapsed to single spaces so the
    result never spans multiple lines. A sentence ends at a run of ``.``,
    ``!`` or ``?`` that is followed by whitespace or the end of the text, so
    punctuation inside tokens such as ``3.14`` or ``example.com`` does not
    cut the answer short. The terminating punctuation is not included.

    Args:
        text: The text to shorten. ``None`` and empty/blank values are
            treated as "no content".
        max_length: Maximum number of characters kept from the first
            sentence before an ellipsis is appended. The ``"..."`` suffix is
            added on top of this limit.

    Returns:
        The first non-empty sentence, cut back to the last space within
        ``max_length`` characters (with ``"..."`` appended) when it is too
        long, or ``"No answer available."`` when there is no usable content.
    """
    fallback = "No answer available."
    if not text:
        return fallback

    # Collapse every whitespace run (newlines, tabs) so the output is one line.
    flat = " ".join(str(text).split())

    # Only treat punctuation as a sentence end when whitespace or the end of
    # the text follows it; skip empty leading pieces (e.g. "? What now").
    pieces = re.split(r'[.!?]+(?:\s+|$)', flat)
    first_sentence = next((p.strip() for p in pieces if p.strip()), "")
    if not first_sentence:
        return fallback

    if len(first_sentence) > max_length:
        # Cut at the last space inside the limit to avoid ending mid-word.
        first_sentence = first_sentence[:max_length].rsplit(' ', 1)[0] + "..."
    return first_sentence
|
70 |
+
|
71 |
# Keyword search function for fallback
|
72 |
def keyword_search(question, corpus, sources_list):
|
73 |
stop_words = set(["what", "is", "the", "a", "an", "in", "on", "at", "for", "with", "and", "or", "but", "not", "this", "that", "these", "those", "to", "of", "it", "by", "as", "if", "when", "where", "who", "which", "how", "why"])
|
|
|
96 |
if best_paragraph is None:
|
97 |
return "No relevant paragraph found.", None
|
98 |
|
99 |
+
# Truncate the paragraph to one line
|
100 |
+
best_paragraph = truncate_to_one_line(best_paragraph)
|
101 |
return best_paragraph, best_source
|
102 |
|
103 |
def ingest_urls(urls):
|
|
|
153 |
Retrieves the top 2 paragraphs to improve answer accuracy.
|
154 |
If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
|
155 |
If QA confidence is below 0.4, falls back to keyword search.
|
156 |
+
Ensures answers are one line (first sentence, cut to at most 100 chars plus a trailing "..." when truncated).
|
157 |
"""
|
158 |
global corpus, embeddings, sources_list
|
159 |
if not corpus or embeddings is None:
|
|
|
167 |
top_k = min(2, len(corpus)) # Get top 2 paragraphs to improve accuracy
|
168 |
top_indices = np.argsort(-cos_scores)[:top_k]
|
169 |
|
170 |
+
# Retrieve context (top 2 paragraphs)
|
171 |
contexts = [corpus[i] for i in top_indices]
|
172 |
context = " ".join(contexts) # Concatenate with space
|
173 |
sources = [sources_list[i] for i in top_indices]
|
|
|
179 |
confidence = result['score']
|
180 |
|
181 |
if confidence >= 0.4:
|
182 |
+
# Truncate QA answer to one line
|
183 |
+
answer = truncate_to_one_line(answer)
|
184 |
+
# Ensure at least one line
|
185 |
+
if not answer:
|
186 |
+
answer = "No answer available."
|
187 |
sources_str = "\n".join(set(sources)) # Unique sources
|
188 |
return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
|
189 |
else:
|