# Web Content Q&A Tool for Hugging Face Spaces
# Optimized for memory constraints (2GB RAM) and a 24-hour timeline
# Features: ingest up to 3 URLs, ask questions, get concise answers using DistilBERT

import gradio as gr
from bs4 import BeautifulSoup
import requests
from sentence_transformers import SentenceTransformer, util
import numpy as np
from transformers import pipeline

# Global variables for in-memory storage (reset on app restart)
corpus = []          # List of paragraphs from URLs
embeddings = None    # Precomputed embeddings for retrieval
sources_list = []    # Source URL for each paragraph

# Load models at startup (memory: ~340MB total)
# Retrieval model: all-MiniLM-L6-v2 (~80MB, 384-dim embeddings)
retriever = SentenceTransformer('all-MiniLM-L6-v2')
# QA model: DistilBERT fine-tuned on SQuAD (~260MB)
qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
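
# The retrieval step relies on cosine similarity between MiniLM embeddings.
# A minimal sketch of that mechanism on toy data (not called by the app; the
# documents and query below are made-up examples):
def _retrieval_sketch():
    docs = ["Paris is the capital of France.", "The Nile flows through Egypt."]
    doc_embs = retriever.encode(docs, convert_to_tensor=True)    # shape: (2, 384)
    query_emb = retriever.encode("What is France's capital?", convert_to_tensor=True)
    scores = util.cos_sim(query_emb, doc_embs)[0]                # one score per doc
    return docs[int(scores.argmax())]                            # -> the Paris sentence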

def ingest_urls(urls):
    """
    Ingest up to 3 URLs, scrape their content, and compute embeddings.
    Limit: 100 paragraphs per URL to manage memory (~0.5MB of embeddings total).
    """
    global corpus, embeddings, sources_list
    # Clear previous data
    corpus.clear()
    sources_list.clear()
    embeddings = None

    # Parse URLs from the input (one per line, max 3)
    url_list = [url.strip() for url in urls.split("\n") if url.strip()][:3]
    if not url_list:
        return "Error: Please enter at least one valid URL."

    # Headers that mimic a browser, to avoid simple bot blocking
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

    # Scrape each URL
    for url in url_list:
        try:
            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()  # Raise an exception for bad status codes
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract content from <p> and <div> tags for broader coverage.
            # Note: a <div> that wraps <p> tags repeats the nested text, so some
            # paragraphs may be ingested twice; acceptable for this prototype.
            elements = soup.find_all(['p', 'div'])
            paragraph_count = 0
            for elem in elements:
                text = elem.get_text().strip()
                # Filter out short or empty text
                if text and len(text) > 20 and paragraph_count < 100:
                    corpus.append(text)
                    sources_list.append(url)
                    paragraph_count += 1
            if paragraph_count == 0:
                return f"Warning: No usable content found at {url}."
        except Exception as e:
            return f"Error ingesting {url}: {str(e)}. Check URL and try again."

    # Compute embeddings if content was ingested
    if corpus:
        # Embeddings: ~1.5KB per paragraph, ~450KB for 300 paragraphs
        embeddings = retriever.encode(corpus, convert_to_tensor=True, show_progress_bar=False)
        return f"Success: Ingested {len(corpus)} paragraphs from {len(set(url_list))} URLs."
    return "Error: No valid content ingested."
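
# A minimal sketch of the extraction/filtering logic above, run against an
# inline HTML string instead of a live URL (not called by the app; the HTML
# is a made-up example):
def _scrape_sketch():
    html = "<p>Too short.</p><p>This paragraph easily clears the 20-character filter.</p>"
    soup = BeautifulSoup(html, 'html.parser')
    return [el.get_text().strip() for el in soup.find_all(['p', 'div'])
            if len(el.get_text().strip()) > 20]
    # -> ["This paragraph easily clears the 20-character filter."]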

def answer_question(question):
    """
    Answer a question using retrieved context and DistilBERT QA.
    Retrieves the top 3 paragraphs to give broader context for follow-up questions.
    If the combined context exceeds DistilBERT's 512-token limit, the pipeline
    truncates it automatically.
    """
    global corpus, embeddings, sources_list
    if not corpus or embeddings is None:
        return "Error: Please ingest URLs first."

    # Encode the question into an embedding
    question_embedding = retriever.encode(question, convert_to_tensor=True)

    # Compute cosine similarity against the stored paragraph embeddings
    cos_scores = util.cos_sim(question_embedding, embeddings)[0]
    top_k = min(3, len(corpus))  # Top 3, or fewer if the corpus is small
    # Move scores to CPU/NumPy before sorting so this also works if the
    # tensors ever end up on a GPU
    top_indices = np.argsort(-cos_scores.cpu().numpy())[:top_k]

    # Retrieve the top paragraphs and concatenate them into a single context
    contexts = [corpus[i] for i in top_indices]
    context = " ".join(contexts)
    sources = [sources_list[i] for i in top_indices]

    # Extract the answer with DistilBERT
    result = qa_model(question=question, context=context)
    answer = result['answer']
    confidence = result['score']

    # Format the response with the answer, confidence, and unique sources
    sources_str = "\n".join(set(sources))
    return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
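
# A minimal sketch of the extractive QA step above on a made-up question and
# context (not called by the app); DistilBERT returns a span copied verbatim
# from the context, plus a confidence score:
def _qa_sketch():
    result = qa_model(question="Where is the Eiffel Tower?",
                      context="The Eiffel Tower is a landmark in Paris, France.")
    return result["answer"], result["score"]  # e.g. ("Paris, France", ~0.9)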

def clear_all():
    """Clear all inputs and outputs for a fresh start."""
    global corpus, embeddings, sources_list
    corpus.clear()
    embeddings = None
    sources_list.clear()
    return "", "", ""

# Gradio UI with a minimal, user-friendly layout
with gr.Blocks(title="Web Content Q&A Tool") as demo:
    gr.Markdown(
        """
        # Web Content Q&A Tool
        Enter up to 3 URLs (one per line), ingest their content, and ask questions.
        Answers are generated using only the ingested data. Note: data resets on app restart.
        """
    )
    # URL input and ingestion
    with gr.Row():
        url_input = gr.Textbox(label="Enter URLs (one per line, max 3)", lines=3,
                               placeholder="https://example.com")
        with gr.Column():
            ingest_btn = gr.Button("Ingest URLs")
            clear_btn = gr.Button("Clear All")
    ingest_output = gr.Textbox(label="Ingestion Status", interactive=False)

    # Question input and answer
    with gr.Row():
        question_input = gr.Textbox(label="Ask a question", placeholder="What is this about?")
        ask_btn = gr.Button("Ask")
    answer_output = gr.Textbox(label="Answer", lines=5, interactive=False)

    # Bind functions to buttons
    ingest_btn.click(fn=ingest_urls, inputs=url_input, outputs=ingest_output)
    ask_btn.click(fn=answer_question, inputs=question_input, outputs=answer_output)
    clear_btn.click(fn=clear_all, inputs=None, outputs=[url_input, ingest_output, answer_output])

# Launch the app (HF Spaces expects port 7860; share=True is unnecessary on Spaces,
# where a public URL is provided automatically)
demo.launch(server_name="0.0.0.0", server_port=7860)
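
# Optional local smoke test (commented out; the URL is a placeholder). It runs
# the two core functions directly, without the UI:
# print(ingest_urls("https://example.com"))
# print(answer_question("What is this page about?"))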