Spaces:

Shriharsh
/

Web_Content_QA

Running

App Files Files Community

Shriharsh committed on Mar 21

Commit

9121798

verified ·

1 Parent(s): 681b2fa

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -3

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 # Web Content Q&A Tool for Hugging Face Spaces
-# Features: Ingest up to 3 URLs, ask questions, get concise answers using DistilBERT with PyTorch
 # Includes keyword search fallback for low-confidence QA answers
 import gradio as gr
@@ -57,6 +58,16 @@ model = torch.quantization.quantize_dynamic(
 # Create the QA pipeline with PyTorch
 qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1)  # device=-1 for CPU
 # Keyword search function for fallback
 def keyword_search(question, corpus, sources_list):
     stop_words = set(["what", "is", "the", "a", "an", "in", "on", "at", "for", "with", "and", "or", "but", "not", "this", "that", "these", "those", "to", "of", "it", "by", "as", "if", "when", "where", "who", "which", "how", "why"])
@@ -85,6 +96,8 @@ def keyword_search(question, corpus, sources_list):
     if best_paragraph is None:
         return "No relevant paragraph found.", None
     return best_paragraph, best_source
 def ingest_urls(urls):
@@ -140,6 +153,7 @@ def answer_question(question):
     Retrieves top 3 paragraphs to improve answer accuracy.
     If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
     If QA confidence is below 0.4, falls back to keyword search.
     """
     global corpus, embeddings, sources_list
     if not corpus or embeddings is None:
@@ -153,7 +167,7 @@ def answer_question(question):
     top_k = min(2, len(corpus))  # Get top 3 paragraphs to improve accuracy
     top_indices = np.argsort(-cos_scores)[:top_k]
-    # Retrieve context (top 3 paragraphs)
     contexts = [corpus[i] for i in top_indices]
     context = " ".join(contexts)  # Concatenate with space
     sources = [sources_list[i] for i in top_indices]
@@ -165,7 +179,11 @@ def answer_question(question):
     confidence = result['score']
     if confidence >= 0.4:
-        # Format response with answer, confidence, and sources
         sources_str = "\n".join(set(sources))  # Unique sources
         return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
     else:

 # Web Content Q&A Tool for Hugging Face Spaces
+# Optimized for memory constraints (2GB RAM) and 24-hour timeline
+# Features: Ingest up to 3 URLs, ask questions, get concise one-line answers using DistilBERT with PyTorch
 # Includes keyword search fallback for low-confidence QA answers
 import gradio as gr
 # Create the QA pipeline with PyTorch
 qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1)  # device=-1 for CPU
# Utility function to truncate text to one line
def truncate_to_one_line(text, max_length=100):
    """Return the first sentence of *text* as a single, length-limited line.

    Args:
        text: Raw answer or paragraph text. ``None`` and empty strings are
            accepted and yield the fallback message.
        max_length: Maximum length of the returned line, including the
            trailing ``"..."`` added when the sentence is cut. Intended for
            values greater than 3; smaller values still keep at least one
            character of text before the ellipsis.

    Returns:
        The first non-empty sentence with all whitespace runs (including
        newlines) collapsed to single spaces, or ``"No answer available."``
        when no usable text is found.
    """
    fallback = "No answer available."
    if not text:
        return fallback

    # Collapse newlines, tabs and repeated spaces so the result is truly one line.
    flattened = " ".join(str(text).split())

    # Split only where terminal punctuation is followed by whitespace or the
    # end of the text, so decimals and version numbers ("3.11") stay intact.
    segments = re.split(r"[.!?]+(?:\s+|$)", flattened)

    # Skip empty leading segments (text that starts with punctuation) instead
    # of reporting that no answer exists.
    first_sentence = next((seg.strip() for seg in segments if seg.strip()), "")
    if not first_sentence:
        return fallback

    if len(first_sentence) > max_length:
        ellipsis = "..."
        # Reserve room for the ellipsis so the final string honours max_length.
        budget = max(max_length - len(ellipsis), 1)
        clipped = first_sentence[:budget]
        # Prefer cutting at a word boundary; fall back to a hard cut when the
        # clipped text contains no space at all.
        head = clipped.rsplit(" ", 1)[0] if " " in clipped else clipped
        first_sentence = head.rstrip() + ellipsis

    return first_sentence
 # Keyword search function for fallback
 def keyword_search(question, corpus, sources_list):
     stop_words = set(["what", "is", "the", "a", "an", "in", "on", "at", "for", "with", "and", "or", "but", "not", "this", "that", "these", "those", "to", "of", "it", "by", "as", "if", "when", "where", "who", "which", "how", "why"])
     if best_paragraph is None:
         return "No relevant paragraph found.", None
+    # Truncate the paragraph to one line
+    best_paragraph = truncate_to_one_line(best_paragraph)
     return best_paragraph, best_source
 def ingest_urls(urls):
     Retrieves top 3 paragraphs to improve answer accuracy.
     If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
     If QA confidence is below 0.4, falls back to keyword search.
+    Ensures answers are one line (max 100 chars).
     """
     global corpus, embeddings, sources_list
     if not corpus or embeddings is None:
     top_k = min(2, len(corpus))  # Get top 3 paragraphs to improve accuracy
     top_indices = np.argsort(-cos_scores)[:top_k]
+    # Retrieve context (top 2 paragraphs)
     contexts = [corpus[i] for i in top_indices]
     context = " ".join(contexts)  # Concatenate with space
     sources = [sources_list[i] for i in top_indices]
     confidence = result['score']
     if confidence >= 0.4:
+        # Truncate QA answer to one line
+        answer = truncate_to_one_line(answer)
+        # Ensure at least one line
+        if not answer:
+            answer = "No answer available."
         sources_str = "\n".join(set(sources))  # Unique sources
         return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
     else: