Shriharsh committed on
Commit 9121798 · verified · 1 Parent(s): 681b2fa

Update app.py

Files changed (1)
  1. app.py +21 -3
app.py CHANGED
@@ -1,5 +1,6 @@
  # Web Content Q&A Tool for Hugging Face Spaces
- # Features: Ingest up to 3 URLs, ask questions, get concise answers using DistilBERT with PyTorch
+ # Optimized for memory constraints (2GB RAM) and 24-hour timeline
+ # Features: Ingest up to 3 URLs, ask questions, get concise one-line answers using DistilBERT with PyTorch
  # Includes keyword search fallback for low-confidence QA answers

  import gradio as gr
@@ -57,6 +58,16 @@ model = torch.quantization.quantize_dynamic(
  # Create the QA pipeline with PyTorch
  qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1) # device=-1 for CPU

+ # Utility function to truncate text to one line
+ def truncate_to_one_line(text):
+     # Split by sentence-ending punctuation and take the first sentence
+     sentences = re.split(r'[.!?]+', text.strip())
+     first_sentence = sentences[0].strip() if sentences else text.strip()
+     # If the sentence is too long, truncate to 100 characters
+     if len(first_sentence) > 100:
+         first_sentence = first_sentence[:100].rsplit(' ', 1)[0] + "..."
+     return first_sentence if first_sentence else "No answer available."
+
  # Keyword search function for fallback
  def keyword_search(question, corpus, sources_list):
      stop_words = set(["what", "is", "the", "a", "an", "in", "on", "at", "for", "with", "and", "or", "but", "not", "this", "that", "these", "those", "to", "of", "it", "by", "as", "if", "when", "where", "who", "which", "how", "why"])
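As a quick illustration of the new helper: the standalone sketch below copies truncate_to_one_line from the hunk above and calls it on invented inputs; it assumes the re module is imported in app.py, since the helper depends on it.

import re

def truncate_to_one_line(text):
    # Keep only the first sentence, splitting on ., ! or ?
    sentences = re.split(r'[.!?]+', text.strip())
    first_sentence = sentences[0].strip() if sentences else text.strip()
    # Cap overly long sentences at 100 characters, cutting on a word boundary
    if len(first_sentence) > 100:
        first_sentence = first_sentence[:100].rsplit(' ', 1)[0] + "..."
    return first_sentence if first_sentence else "No answer available."

# Invented example inputs:
print(truncate_to_one_line("Gradio is a Python library. It wraps models in a web UI."))
# -> Gradio is a Python library
print(truncate_to_one_line("word " * 40))
# -> the first 100 characters, cut at the last space, with "..." appended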
@@ -85,6 +96,8 @@ def keyword_search(question, corpus, sources_list):
      if best_paragraph is None:
          return "No relevant paragraph found.", None

+     # Truncate the paragraph to one line
+     best_paragraph = truncate_to_one_line(best_paragraph)
      return best_paragraph, best_source

  def ingest_urls(urls):
@@ -140,6 +153,7 @@ def answer_question(question):
      Retrieves top 3 paragraphs to improve answer accuracy.
      If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
      If QA confidence is below 0.4, falls back to keyword search.
+     Ensures answers are one line (max 100 chars).
      """
      global corpus, embeddings, sources_list
      if not corpus or embeddings is None:
@@ -153,7 +167,7 @@ def answer_question(question):
      top_k = min(2, len(corpus)) # Get top 3 paragraphs to improve accuracy
      top_indices = np.argsort(-cos_scores)[:top_k]

-     # Retrieve context (top 3 paragraphs)
+     # Retrieve context (top 2 paragraphs)
      contexts = [corpus[i] for i in top_indices]
      context = " ".join(contexts) # Concatenate with space
      sources = [sources_list[i] for i in top_indices]
@@ -165,7 +179,11 @@ def answer_question(question):
      confidence = result['score']

      if confidence >= 0.4:
-         # Format response with answer, confidence, and sources
+         # Truncate QA answer to one line
+         answer = truncate_to_one_line(answer)
+         # Ensure at least one line
+         if not answer:
+             answer = "No answer available."
          sources_str = "\n".join(set(sources)) # Unique sources
          return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
      else:
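Taken together, both answer paths now end in a one-line answer: above the 0.4 confidence threshold the extracted QA answer is truncated directly, and below it the keyword-search fallback returns an already-truncated paragraph. The sketch below is a rough consolidation of that control flow, not code from app.py: one_line_answer is a hypothetical name, it relies on qa_model, truncate_to_one_line, keyword_search, corpus, and sources_list as defined in the file, and the formatting of the fallback branch is an assumption based on the docstring.

def one_line_answer(question, context, sources):
    # Extractive QA step, as in answer_question
    result = qa_model(question=question, context=context)
    answer, confidence = result["answer"], result["score"]
    if confidence >= 0.4:
        # High confidence: shorten the extracted span to a single line
        answer = truncate_to_one_line(answer)
        if not answer:
            answer = "No answer available."
    else:
        # Low confidence: fall back to keyword search, which now also
        # truncates the best matching paragraph before returning it
        answer, _fallback_source = keyword_search(question, corpus, sources_list)
    sources_str = "\n".join(set(sources))  # Unique sources
    return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"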
 