# Web Content Q&A Tool for Hugging Face Spaces
# Optimized for memory constraints (2GB RAM) and a 24-hour build timeline
# Features: ingest up to 3 URLs, ask questions, and get concise one-line answers using RoBERTa with PyTorch
import gradio as gr
from bs4 import BeautifulSoup
import requests
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
import torch
from huggingface_hub.utils import configure_http_backend
import re
# Configure the Hugging Face Hub to use a custom session with retries.
# Note: requests does not honor a `timeout` attribute set on a Session object,
# and huggingface_hub passes its own per-request timeouts, so only retries are added here.
def create_custom_session():
    session = requests.Session()
    adapter = requests.adapters.HTTPAdapter(max_retries=3)  # Retry up to 3 times on connection failures
    session.mount("https://", adapter)
    return session

# Register the custom session factory with the Hugging Face Hub
configure_http_backend(backend_factory=create_custom_session)
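
# If a true per-request default timeout is wanted, one option (a sketch, an
# assumption rather than part of the original app) is to subclass Session so
# every request gets a default timeout unless the caller overrides it:
class TimeoutSession(requests.Session):
    """Hypothetical Session applying a default 30s timeout to every request."""
    def request(self, *args, **kwargs):
        kwargs.setdefault("timeout", 30)  # Default; explicit timeouts still win
        return super().request(*args, **kwargs)
# Usage would be: configure_http_backend(backend_factory=TimeoutSession)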
# Global variables for in-memory storage (reset on app restart)
corpus = []  # List of paragraphs from the ingested URLs
embeddings = None  # Precomputed embeddings for retrieval
sources_list = []  # Source URL for each paragraph
# Load models at startup. Both must fit in the Space's 2GB RAM alongside the app;
# at fp32 the two models are roughly 0.4-0.5GB each, hence the quantization below.
# Retrieval model: multi-qa-mpnet-base-dot-v1 (~110M parameters, 768-dim embeddings)
retriever = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
# Reader model for QA: deepset/roberta-base-squad2 (~125M parameters)
try:
    model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
    tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
except Exception as e:
    print(f"Error loading model: {str(e)}. Retrying with force_download=True...")
    # Force a re-download in case the cache is corrupted
    model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2", force_download=True)
    tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2", force_download=True)
# Set the model to evaluation mode
model.eval()
# Apply dynamic quantization (int8 weights for Linear layers) for faster CPU inference
model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
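
# Optional sanity check (a sketch, not part of the original app): measure the
# serialized size of the model before and after quantization.
import io

def state_dict_size_mb(m):
    buf = io.BytesIO()
    torch.save(m.state_dict(), buf)
    return buf.getbuffer().nbytes / 1e6
# e.g. print(f"Quantized reader: {state_dict_size_mb(model):.0f} MB")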
# Create the QA pipeline with PyTorch
qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1)  # device=-1 for CPU
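# For reference, the pipeline returns a dict of this shape (values illustrative):
# qa_model(question="Who released RoBERTa?", context="RoBERTa was released by Facebook AI.")
#   -> {'score': 0.97, 'start': 24, 'end': 35, 'answer': 'Facebook AI'}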
# Utility function to truncate text to one line
def truncate_to_one_line(text):
    # Take the first sentence (re.split always returns at least one element)
    first_sentence = re.split(r'[.!?]+', text.strip())[0].strip()
    # If the sentence is still too long, cut at the last word boundary before 100 chars
    if len(first_sentence) > 100:
        first_sentence = first_sentence[:100].rsplit(' ', 1)[0] + "..."
    return first_sentence if first_sentence else "No answer available."
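# Illustrative behavior:
# truncate_to_one_line("Paris is the capital of France. It is also home to the Louvre.")
#   -> "Paris is the capital of France"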
def ingest_urls(urls):
    """
    Ingest up to 3 URLs, scrape their content, and compute embeddings.
    Limits: 100 paragraphs per URL to manage memory
    (300 paragraphs x 768 floats x 4 bytes ~= 0.9MB of embeddings).
    """
    global corpus, embeddings, sources_list
    # Clear previous data
    corpus.clear()
    sources_list.clear()
    embeddings = None
    # Parse URLs from the input (one per line, max 3)
    url_list = [url.strip() for url in urls.split("\n") if url.strip()][:3]
    if not url_list:
        return "Error: Please enter at least one valid URL."
    # Headers to mimic a browser and avoid simple bot blocking
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
    # Scrape each URL; collect per-URL problems instead of aborting the whole batch
    warnings = []
    for url in url_list:
        try:
            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()  # Raise for 4xx/5xx status codes
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract content from <p> and <div> tags for broader coverage
            elements = soup.find_all(['p', 'div'])
            paragraph_count = 0
            for elem in elements:
                text = elem.get_text().strip()
                # Skip short or empty fragments; cap at 100 paragraphs per URL
                if text and len(text) > 20 and paragraph_count < 100:
                    corpus.append(text)
                    sources_list.append(url)
                    paragraph_count += 1
            if paragraph_count == 0:
                warnings.append(f"Warning: No usable content found at {url}.")
        except Exception as e:
            warnings.append(f"Error ingesting {url}: {str(e)}. Check the URL and try again.")
    # Compute embeddings if any content was ingested
    if corpus:
        # Embeddings: ~3KB per paragraph (768 floats), ~0.9MB for 300 paragraphs
        embeddings = retriever.encode(corpus, convert_to_tensor=True, show_progress_bar=False)
        status = f"Success: Ingested {len(corpus)} paragraphs from {len(set(sources_list))} URL(s)."
        if warnings:
            status += "\n" + "\n".join(warnings)
        return status
    if warnings:
        return "Error: No valid content ingested.\n" + "\n".join(warnings)
    return "Error: No valid content ingested."
def answer_question(question):
    """
    Answer a question using retrieved context and RoBERTa QA (PyTorch).
    Retrieves the top 3 paragraphs to improve answer accuracy.
    Contexts beyond the model's 512-token limit are split by the QA pipeline
    into overlapping chunks, each scored separately.
    Rejects answers with confidence below 0.3; answers are truncated to one line (max 100 chars).
    """
    global corpus, embeddings, sources_list
    if not corpus or embeddings is None:
        return "Error: Please ingest URLs first."
    # Encode the question into an embedding
    question_embedding = retriever.encode(question, convert_to_tensor=True)
    # Compute cosine similarity against the stored embeddings
    cos_scores = util.cos_sim(question_embedding, embeddings)[0]
    top_k = min(3, len(corpus))  # Top 3 paragraphs, or fewer for a small corpus
    top_indices = torch.topk(cos_scores, k=top_k).indices.tolist()
    # Retrieve the top paragraphs and their sources
    contexts = [corpus[i] for i in top_indices]
    context = " ".join(contexts)  # Concatenate with spaces
    sources = [sources_list[i] for i in top_indices]
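    # Alternative scoring (an assumption, not in the original): this retriever was
    # trained with a dot-product objective, so sentence_transformers' dot score
    # could be used in place of cosine similarity:
    # cos_scores = util.dot_score(question_embedding, embeddings)[0]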
    # Extract the answer with RoBERTa (PyTorch)
    with torch.no_grad():  # Redundant safeguard; the pipeline disables gradients itself
        result = qa_model(question=question, context=context)
    answer = result['answer']
    confidence = result['score']
    # Reject low-confidence answers
    if confidence < 0.3:
        return f"No confident answer found (confidence {confidence:.2f} below 0.3)."
    # Truncate the answer to one line (falls back to "No answer available." if empty)
    answer = truncate_to_one_line(answer)
    # Format the response with the answer, confidence, and unique sources
    sources_str = "\n".join(set(sources))
    return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
def clear_all():
    """Clear all inputs and outputs for a fresh start."""
    global corpus, embeddings, sources_list
    corpus.clear()
    embeddings = None
    sources_list.clear()
    return "", "", ""
# Gradio UI with a minimal, user-friendly design
with gr.Blocks(title="Web Content Q&A Tool") as demo:
    gr.Markdown(
        """
        # Web Content Q&A Tool
        Enter up to 3 URLs (one per line), ingest their content, and ask questions.
        Answers are generated using only the ingested data. Note: data resets on app restart.
        """
    )
    # URL input and ingestion controls
    with gr.Row():
        url_input = gr.Textbox(label="Enter URLs (one per line, max 3)", lines=3, placeholder="https://example.com")
        with gr.Column():
            ingest_btn = gr.Button("Ingest URLs")
            clear_btn = gr.Button("Clear All")
    ingest_output = gr.Textbox(label="Ingestion Status", interactive=False)
    # Question input and answer display
    with gr.Row():
        question_input = gr.Textbox(label="Ask a question", placeholder="What is this about?")
        ask_btn = gr.Button("Ask")
    answer_output = gr.Textbox(label="Answer", lines=5, interactive=False)
    # Bind functions to the buttons
    ingest_btn.click(fn=ingest_urls, inputs=url_input, outputs=ingest_output)
    ask_btn.click(fn=answer_question, inputs=question_input, outputs=answer_output)
    clear_btn.click(fn=clear_all, inputs=None, outputs=[url_input, ingest_output, answer_output])

# Launch the app (HF Spaces expects port 7860)
demo.launch(server_name="0.0.0.0", server_port=7860)
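# Note (an assumption, not part of the original): if concurrent users are expected,
# Gradio's request queue keeps CPU inference calls from piling up:
# demo.queue().launch(server_name="0.0.0.0", server_port=7860)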