Shriharsh committed on
Commit
681b2fa
·
verified ·
1 Parent(s): e637b24

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -5
app.py CHANGED
@@ -1,6 +1,6 @@
1
  # Web Content Q&A Tool for Hugging Face Spaces
2
- # Optimized for memory constraints (2GB RAM) and 24-hour timeline
3
  # Features: Ingest up to 3 URLs, ask questions, get concise answers using DistilBERT with PyTorch
 
4
 
5
  import gradio as gr
6
  from bs4 import BeautifulSoup
@@ -12,6 +12,7 @@ import torch
12
  from huggingface_hub import hf_hub_download, HfFolder
13
  from huggingface_hub.utils import configure_http_backend
14
  import requests as hf_requests
 
15
 
16
  # Configure Hugging Face Hub to use a custom session with increased timeout and retries
17
  def create_custom_session():
@@ -56,6 +57,36 @@ model = torch.quantization.quantize_dynamic(
56
  # Create the QA pipeline with PyTorch
57
  qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1) # device=-1 for CPU
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  def ingest_urls(urls):
60
  """
61
  Ingest up to 3 URLs, scrape content, and compute embeddings.
@@ -108,6 +139,7 @@ def answer_question(question):
108
  Answer a question using retrieved context and DistilBERT QA (PyTorch).
109
  Retrieves top 3 paragraphs to improve answer accuracy.
110
  If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
 
111
  """
112
  global corpus, embeddings, sources_list
113
  if not corpus or embeddings is None:
@@ -118,7 +150,7 @@ def answer_question(question):
118
 
119
  # Compute cosine similarity with stored embeddings
120
  cos_scores = util.cos_sim(question_embedding, embeddings)[0]
121
- top_k = min(1, len(corpus)) # Get top 3 paragraphs to improve accuracy
122
  top_indices = np.argsort(-cos_scores)[:top_k]
123
 
124
  # Retrieve context (top 3 paragraphs)
@@ -132,9 +164,17 @@ def answer_question(question):
132
  answer = result['answer']
133
  confidence = result['score']
134
 
135
- # Format response with answer, confidence, and sources
136
- sources_str = "\n".join(set(sources)) # Unique sources
137
- return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
 
 
 
 
 
 
 
 
138
 
139
  def clear_all():
140
  """Clear all inputs and outputs for a fresh start."""
 
1
  # Web Content Q&A Tool for Hugging Face Spaces
 
2
  # Features: Ingest up to 3 URLs, ask questions, get concise answers using DistilBERT with PyTorch
3
+ # Includes keyword search fallback for low-confidence QA answers
4
 
5
  import gradio as gr
6
  from bs4 import BeautifulSoup
 
12
  from huggingface_hub import hf_hub_download, HfFolder
13
  from huggingface_hub.utils import configure_http_backend
14
  import requests as hf_requests
15
+ import re
16
 
17
  # Configure Hugging Face Hub to use a custom session with increased timeout and retries
18
  def create_custom_session():
 
57
  # Create the QA pipeline with PyTorch
58
  qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1) # device=-1 for CPU
59
 
60
+ # Keyword search function for fallback
61
# Common question/function words that carry no retrieval signal.
_STOP_WORDS = frozenset({
    "what", "is", "the", "a", "an", "in", "on", "at", "for", "with",
    "and", "or", "but", "not", "this", "that", "these", "those", "to",
    "of", "it", "by", "as", "if", "when", "where", "who", "which",
    "how", "why",
})

# Apostrophes are dropped (not split on) so "don't" stays one token: "dont".
_APOSTROPHES = str.maketrans("", "", "'\u2019")

# A token is a run of Unicode letters/digits; any other character separates
# tokens, so "state-of-the-art" yields four words instead of one fused word.
_TOKEN_RE = re.compile(r"[^\W_]+")


def _tokenize(text):
    """Return the case-folded alphanumeric tokens found in *text*."""
    return _TOKEN_RE.findall(text.translate(_APOSTROPHES).casefold())


def keyword_search(question, corpus, sources_list):
    """Find the paragraph sharing the most distinct keywords with a question.

    Keywords are the question's tokens minus a small stop-word list.
    Digits and non-ASCII letters are kept, punctuation separates tokens,
    and a keyword repeated in the question counts only once.

    Args:
        question: The user's question text.
        corpus: Sequence of paragraph strings to search.
        sources_list: Sequence parallel to ``corpus``; ``sources_list[i]``
            is the source of ``corpus[i]``.

    Returns:
        A ``(text, source)`` tuple. On success ``text`` is the best-matching
        paragraph (the earliest one wins ties) and ``source`` is its entry
        from ``sources_list``. When the question has no keywords or no
        paragraph matches any keyword, ``text`` is an explanatory message
        and ``source`` is ``None``.
    """
    keywords = set(_tokenize(question)) - _STOP_WORDS
    if not keywords:
        return "No keywords found for search.", None

    best_paragraph = None
    best_source = None
    best_count = 0

    for i, para in enumerate(corpus):
        # Number of distinct keywords that occur in this paragraph.
        count = len(keywords.intersection(_tokenize(para)))
        # Strict '>' keeps the earliest paragraph on ties and skips zero hits.
        if count > best_count:
            best_count = count
            best_paragraph = para
            best_source = sources_list[i]

    if best_paragraph is None:
        return "No relevant paragraph found.", None

    return best_paragraph, best_source
89
+
90
  def ingest_urls(urls):
91
  """
92
  Ingest up to 3 URLs, scrape content, and compute embeddings.
 
139
  Answer a question using retrieved context and DistilBERT QA (PyTorch).
140
  Retrieves top 3 paragraphs to improve answer accuracy.
141
  If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
142
+ If QA confidence is below 0.4, falls back to keyword search.
143
  """
144
  global corpus, embeddings, sources_list
145
  if not corpus or embeddings is None:
 
150
 
151
  # Compute cosine similarity with stored embeddings
152
  cos_scores = util.cos_sim(question_embedding, embeddings)[0]
153
+ top_k = min(2, len(corpus)) # Get top 3 paragraphs to improve accuracy
154
  top_indices = np.argsort(-cos_scores)[:top_k]
155
 
156
  # Retrieve context (top 3 paragraphs)
 
164
  answer = result['answer']
165
  confidence = result['score']
166
 
167
+ if confidence >= 0.4:
168
+ # Format response with answer, confidence, and sources
169
+ sources_str = "\n".join(set(sources)) # Unique sources
170
+ return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
171
+ else:
172
+ # Perform keyword search
173
+ kw_answer, kw_source = keyword_search(question, corpus, sources_list)
174
+ if kw_source:
175
+ return f"Answer: {kw_answer} (from keyword search, as QA confidence was {confidence:.2f})\nSource: {kw_source}"
176
+ else:
177
+ return "No relevant answer found from keyword search."
178
 
179
  def clear_all():
180
  """Clear all inputs and outputs for a fresh start."""