Update app.py

app.py CHANGED
@@ -1,6 +1,8 @@
+# this is the app.py code without keyword searching
+
 # Web Content Q&A Tool for Hugging Face Spaces
 # Optimized for memory constraints (2GB RAM) and 24-hour timeline
-# Features: Ingest up to 3 URLs, ask questions, get concise
+# Features: Ingest up to 3 URLs, ask questions, get concise answers using DistilBERT with PyTorch
 
 import gradio as gr
 from bs4 import BeautifulSoup
@@ -12,7 +14,6 @@ import torch
 from huggingface_hub import hf_hub_download, HfFolder
 from huggingface_hub.utils import configure_http_backend
 import requests as hf_requests
-import re
 
 # Configure Hugging Face Hub to use a custom session with increased timeout and retries
 def create_custom_session():
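The body of create_custom_session() falls outside the hunks shown in this diff. For context, a minimal sketch of such a session factory, assuming the usual requests retry/timeout pattern; the retry counts, the 30-second timeout, and the TimeoutSession name are illustrative, not taken from the file:

import requests as hf_requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from huggingface_hub.utils import configure_http_backend

class TimeoutSession(hf_requests.Session):
    """requests.Session that applies a default timeout to every request."""
    def request(self, *args, **kwargs):
        kwargs.setdefault("timeout", 30)  # seconds; illustrative value
        return super().request(*args, **kwargs)

def create_custom_session():
    # Retry transient server errors with exponential backoff.
    session = TimeoutSession()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session

# All huggingface_hub downloads then go through the session returned by the factory.
configure_http_backend(backend_factory=create_custom_session)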
@@ -57,16 +58,6 @@ model = torch.quantization.quantize_dynamic(
 # Create the QA pipeline with PyTorch
 qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1)  # device=-1 for CPU
 
-# Utility function to truncate text to one line
-def truncate_to_one_line(text):
-    # Split by sentence-ending punctuation and take the first sentence
-    sentences = re.split(r'[.!?]+', text.strip())
-    first_sentence = sentences[0].strip() if sentences else text.strip()
-    # If the sentence is too long, truncate to 100 characters
-    if len(first_sentence) > 100:
-        first_sentence = first_sentence[:100].rsplit(' ', 1)[0] + "..."
-    return first_sentence if first_sentence else "No answer available."
-
 def ingest_urls(urls):
     """
     Ingest up to 3 URLs, scrape content, and compute embeddings.
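The hunk header above references torch.quantization.quantize_dynamic, but the model-loading lines themselves are not part of this diff. A hedged sketch of how the quantized DistilBERT QA pipeline is typically built; the checkpoint name distilbert-base-uncased-distilled-squad is an assumption, not visible here:

import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Assumed checkpoint; the diff does not show which model name the app downloads.
model_name = "distilbert-base-uncased-distilled-squad"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Dynamic quantization rewrites the Linear layers as int8 kernels, roughly halving
# the model's memory footprint on CPU with little accuracy loss.
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer,
                    framework="pt", device=-1)  # device=-1 keeps inference on CPU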
@@ -117,9 +108,8 @@ def ingest_urls(urls):
 def answer_question(question):
     """
     Answer a question using retrieved context and DistilBERT QA (PyTorch).
-    Retrieves top
+    Retrieves top 3 paragraphs to improve answer accuracy.
     If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
-    Ensures answers are one line (max 100 chars).
     """
     global corpus, embeddings, sources_list
     if not corpus or embeddings is None:
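The lines that compute question_embedding sit between the hunks and are not shown. A self-contained sketch of the retrieval setup, assuming a sentence-transformers encoder; the all-MiniLM-L6-v2 checkpoint and the toy inputs are illustrative only:

from sentence_transformers import SentenceTransformer, util

# Assumed encoder; the checkpoint actually loaded by the app is not visible in this diff.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

corpus = ["Paragraph about pricing ...", "Paragraph about installation ..."]  # filled by ingest_urls()
question = "How do I install the tool?"

embeddings = embedder.encode(corpus, convert_to_tensor=True)
question_embedding = embedder.encode(question, convert_to_tensor=True)

# One cosine-similarity score per stored paragraph, exactly what the hunk below consumes.
cos_scores = util.cos_sim(question_embedding, embeddings)[0]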
@@ -130,10 +120,10 @@ def answer_question(question):
 
     # Compute cosine similarity with stored embeddings
     cos_scores = util.cos_sim(question_embedding, embeddings)[0]
-    top_k = min(
+    top_k = min(3, len(corpus))  # Get top 3 paragraphs to improve accuracy
     top_indices = np.argsort(-cos_scores)[:top_k]
 
-    # Retrieve context (top
+    # Retrieve context (top 3 paragraphs)
     contexts = [corpus[i] for i in top_indices]
     context = " ".join(contexts)  # Concatenate with space
     sources = [sources_list[i] for i in top_indices]
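To make the top-k selection concrete: negating the scores before np.argsort yields indices in descending order of similarity. A tiny worked example with made-up scores:

import numpy as np

# Toy similarity scores for five paragraphs, standing in for cos_scores above.
cos_scores = np.array([0.12, 0.87, 0.34, 0.91, 0.05])

top_k = min(3, len(cos_scores))
top_indices = np.argsort(-cos_scores)[:top_k]  # negate so the sort runs high-to-low
print(top_indices)  # [3 1 2] -> the three most similar paragraphs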
@@ -144,12 +134,6 @@
     answer = result['answer']
     confidence = result['score']
 
-    # Truncate answer to one line
-    answer = truncate_to_one_line(answer)
-    # Ensure at least one line
-    if not answer:
-        answer = "No answer available."
-
     # Format response with answer, confidence, and sources
     sources_str = "\n".join(set(sources))  # Unique sources
     return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
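The call that produces result is elided just above this hunk; with a transformers question-answering pipeline it returns a dict holding the extracted span and a confidence score, roughly like this (the example question and context are made up):

# Continuing from the qa_model pipeline sketched earlier.
result = qa_model(
    question="What is the tool optimized for?",
    context="The tool is optimized for memory constraints (2GB RAM) and a 24-hour timeline.",
)
answer = result["answer"]      # extracted span, e.g. "memory constraints (2GB RAM)"
confidence = result["score"]   # float in [0, 1]
print(f"Answer: {answer}\nConfidence: {confidence:.2f}")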
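The Gradio interface itself is not touched by this diff. For orientation, a hypothetical Blocks layout wiring the two functions above; every component label and the wiring are assumptions, not part of the file shown here:

import gradio as gr

with gr.Blocks(title="Web Content Q&A Tool") as demo:
    url_box = gr.Textbox(label="URLs (one per line, up to 3)", lines=3)
    ingest_btn = gr.Button("Ingest")
    status_box = gr.Textbox(label="Ingestion status")

    question_box = gr.Textbox(label="Question")
    ask_btn = gr.Button("Ask")
    answer_box = gr.Textbox(label="Answer")

    ingest_btn.click(ingest_urls, inputs=url_box, outputs=status_box)
    ask_btn.click(answer_question, inputs=question_box, outputs=answer_box)

demo.launch()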