Spaces:

Shriharsh
/

Web_Content_QA

Running

App Files Files Community

Shriharsh committed on Mar 20

Commit

581df5c

verified ·

1 Parent(s): d97eaef

Create app.py

Browse files

Files changed (1) hide show

app.py +141 -0

app.py ADDED Viewed

	@@ -0,0 +1,141 @@

+# Web Content Q&A Tool for Hugging Face Spaces
+# Optimized for memory constraints (2GB RAM) and 24-hour timeline
+# Features: Ingest up to 3 URLs, ask questions, get concise answers using DistilBERT
+import gradio as gr
+from bs4 import BeautifulSoup
+import requests
+from sentence_transformers import SentenceTransformer, util
+import numpy as np
+from transformers import pipeline
+# Module-level in-memory state shared by the handler functions in this file (reset on app restart)
+corpus = []  # Scraped text blocks; one entry per ingested paragraph
+embeddings = None  # Embeddings of `corpus` computed by ingest_urls; None until content is ingested
+sources_list = []  # Parallel to `corpus`: sources_list[i] is the URL that corpus[i] came from
+# Load both models once at startup so every request reuses them (author's estimate: ~340MB total)
+# Retrieval model: all-MiniLM-L6-v2 (~80MB, 384-dim embeddings), used to embed paragraphs and questions
+retriever = SentenceTransformer('all-MiniLM-L6-v2')
+# QA model: DistilBERT fine-tuned on SQuAD (~260MB), used to extract an answer span from retrieved text
+qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
+def ingest_urls(urls):
+    """
+    Ingest up to 3 URLs, scrape content, and compute embeddings.
+    Limits: 100 paragraphs per URL to manage memory (~0.5MB embeddings total).
+    """
+    global corpus, embeddings, sources_list
+    # Clear previous data
+    corpus.clear()
+    sources_list.clear()
+    embeddings = None
+    # Parse URLs from input (one per line, max 3)
+    url_list = [url.strip() for url in urls.split("\n") if url.strip()][:3]
+    if not url_list:
+        return "Error: Please enter at least one valid URL."
+    # Headers to mimic browser and avoid blocking
+    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
+    # Scrape each URL
+    for url in url_list:
+        try:
+            response = requests.get(url, headers=headers, timeout=5)
+            response.raise_for_status()  # Raise exception for bad status codes
+            soup = BeautifulSoup(response.text, 'html.parser')
+            # Extract content from <p> and <div> tags for broader coverage
+            elements = soup.find_all(['p', 'div'])
+            paragraph_count = 0
+            for elem in elements:
+                text = elem.get_text().strip()
+                # Filter short or empty text
+                if text and len(text) > 20 and paragraph_count < 100:
+                    corpus.append(text)
+                    sources_list.append(url)
+                    paragraph_count += 1
+            if paragraph_count == 0:
+                return f"Warning: No usable content found at {url}."
+        except Exception as e:
+            return f"Error ingesting {url}: {str(e)}. Check URL and try again."
+    # Compute embeddings if content was ingested
+    if corpus:
+        # Embeddings: ~1.5KB per paragraph, ~450KB for 300 paragraphs
+        embeddings = retriever.encode(corpus, convert_to_tensor=True, show_progress_bar=False)
+        return f"Success: Ingested {len(corpus)} paragraphs from {len(set(url_list))} URLs."
+    return "Error: No valid content ingested."
+def answer_question(question):
+    """
+    Answer a question using retrieved context and DistilBERT QA.
+    Retrieves top 3 paragraphs to provide broader context for cross-questioning.
+    If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
+    """
+    global corpus, embeddings, sources_list
+    if not corpus or embeddings is None:
+        return "Error: Please ingest URLs first."
+    # Encode question into embedding
+    question_embedding = retriever.encode(question, convert_to_tensor=True)
+    # Compute cosine similarity with stored embeddings
+    cos_scores = util.cos_sim(question_embedding, embeddings)[0]
+    top_k = min(3, len(corpus))  # Get top 3 or less if fewer paragraphs
+    top_indices = np.argsort(-cos_scores)[:top_k]
+    # Retrieve context (top 3 paragraphs)
+    contexts = [corpus[i] for i in top_indices]
+    context = " ".join(contexts)  # Concatenate with space
+    sources = [sources_list[i] for i in top_indices]
+    # Extract answer with DistilBERT
+    # Note: If total tokens exceed 512, it will be truncated automatically
+    result = qa_model(question=question, context=context)
+    answer = result['answer']
+    confidence = result['score']
+    # Format response with answer, confidence, and sources
+    sources_str = "\n".join(set(sources))  # Unique sources
+    return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
+def clear_all():
+    """Clear all inputs and outputs for a fresh start."""
+    global corpus, embeddings, sources_list
+    corpus.clear()
+    embeddings = None
+    sources_list.clear()
+    return "", "", ""
+# Gradio UI: a Blocks layout with a header, a URL/ingestion section, and a question/answer section
+with gr.Blocks(title="Web Content Q&A Tool") as demo:
+    gr.Markdown(
+        """
+        # Web Content Q&A Tool
+        Enter up to 3 URLs (one per line), ingest their content, and ask questions.
+        Answers are generated using only the ingested data. Note: Data resets on app restart.
+        """
+    )
+    # URL row: multi-line URL textbox beside a column with the Ingest and Clear buttons; read-only status box follows
+    with gr.Row():
+        url_input = gr.Textbox(label="Enter URLs (one per line, max 3)", lines=3, placeholder="https://example.com")
+        with gr.Column():
+            ingest_btn = gr.Button("Ingest URLs")
+            clear_btn = gr.Button("Clear All")
+    ingest_output = gr.Textbox(label="Ingestion Status", interactive=False)
+    # Question row: question textbox beside the Ask button; read-only answer box follows
+    with gr.Row():
+        question_input = gr.Textbox(label="Ask a question", placeholder="What is this about?")
+        ask_btn = gr.Button("Ask")
+    answer_output = gr.Textbox(label="Answer", lines=5, interactive=False)
+    # Wire buttons to handlers; clear_all's three return values map to url_input, ingest_output, answer_output (question_input is not reset)
+    ingest_btn.click(fn=ingest_urls, inputs=url_input, outputs=ingest_output)
+    ask_btn.click(fn=answer_question, inputs=question_input, outputs=answer_output)
+    clear_btn.click(fn=clear_all, inputs=None, outputs=[url_input, ingest_output, answer_output])
+# Launch the app bound to 0.0.0.0 (HF Spaces expects port 7860)
+demo.launch(server_name="0.0.0.0", server_port=7860)