Create app.py
app.py
ADDED
@@ -0,0 +1,141 @@
# Web Content Q&A Tool for Hugging Face Spaces
# Optimized for memory constraints (2GB RAM) and a 24-hour development timeline
# Features: ingest up to 3 URLs, ask questions, and get concise answers using DistilBERT

import gradio as gr
from bs4 import BeautifulSoup
import requests
from sentence_transformers import SentenceTransformer, util
import numpy as np
from transformers import pipeline

# Global variables for in-memory storage (reset on app restart)
corpus = []  # List of paragraphs from the ingested URLs
embeddings = None  # Precomputed embeddings for retrieval
sources_list = []  # Source URL for each paragraph

# Load models at startup (memory: ~340MB total)
# Retrieval model: all-MiniLM-L6-v2 (~80MB, 384-dim embeddings)
retriever = SentenceTransformer('all-MiniLM-L6-v2')
# QA model: DistilBERT fine-tuned on SQuAD (~260MB)
qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

def ingest_urls(urls):
    """
    Ingest up to 3 URLs, scrape their content, and compute embeddings.
    Limits: 100 paragraphs per URL to manage memory (~0.5MB of embeddings total).
    """
    global corpus, embeddings, sources_list
    # Clear previous data
    corpus.clear()
    sources_list.clear()
    embeddings = None

    # Parse URLs from the input (one per line, max 3)
    url_list = [url.strip() for url in urls.split("\n") if url.strip()][:3]
    if not url_list:
        return "Error: Please enter at least one valid URL."

    # Headers to mimic a browser and avoid being blocked
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

    # Scrape each URL
    for url in url_list:
        try:
            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()  # Raise an exception for bad status codes
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract content from <p> and <div> tags for broader coverage
            elements = soup.find_all(['p', 'div'])
            paragraph_count = 0
            for elem in elements:
                text = elem.get_text().strip()
                # Filter out short or empty text
                if text and len(text) > 20 and paragraph_count < 100:
                    corpus.append(text)
                    sources_list.append(url)
                    paragraph_count += 1
            if paragraph_count == 0:
                return f"Warning: No usable content found at {url}."
        except Exception as e:
            return f"Error ingesting {url}: {str(e)}. Check the URL and try again."

    # Compute embeddings if content was ingested
    if corpus:
        # Embeddings: ~1.5KB per paragraph, ~450KB for 300 paragraphs
        embeddings = retriever.encode(corpus, convert_to_tensor=True, show_progress_bar=False)
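        # embeddings is a tensor of shape (len(corpus), 384) for this MiniLM model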
        return f"Success: Ingested {len(corpus)} paragraphs from {len(set(url_list))} URLs."
    return "Error: No valid content ingested."

def answer_question(question):
    """
    Answer a question using retrieved context and DistilBERT QA.
    Retrieves the top 3 paragraphs to give broader context for follow-up questions.
    If the combined context exceeds 512 tokens (DistilBERT's max length), the pipeline handles the overflow automatically.
    """
    global corpus, embeddings, sources_list
    if not corpus or embeddings is None:
        return "Error: Please ingest URLs first."

    # Encode the question into an embedding
    question_embedding = retriever.encode(question, convert_to_tensor=True)

    # Compute cosine similarity against the stored embeddings
    cos_scores = util.cos_sim(question_embedding, embeddings)[0]
    top_k = min(3, len(corpus))  # Top 3, or fewer if the corpus is smaller
    # Move scores to NumPy explicitly so argsort also works if the models run on GPU
    top_indices = np.argsort(-cos_scores.cpu().numpy())[:top_k]

    # Retrieve the context (top paragraphs)
    contexts = [corpus[i] for i in top_indices]
    context = " ".join(contexts)  # Concatenate with spaces
    sources = [sources_list[i] for i in top_indices]

    # Extract the answer with DistilBERT
    # Note: if the total tokens exceed 512, the pipeline handles the overflow automatically
    result = qa_model(question=question, context=context)
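    # result is a dict of the form {'answer': str, 'score': float, 'start': int, 'end': int},
    # where 'start'/'end' are character offsets of the answer span within `context`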
    answer = result['answer']
    confidence = result['score']

    # Format the response with answer, confidence, and sources
    sources_str = "\n".join(set(sources))  # Unique sources
    return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"

def clear_all():
    """Clear all inputs and outputs for a fresh start."""
    global corpus, embeddings, sources_list
    corpus.clear()
    embeddings = None
    sources_list.clear()
    return "", "", ""

# Gradio UI with a minimal, user-friendly design
with gr.Blocks(title="Web Content Q&A Tool") as demo:
    gr.Markdown(
        """
        # Web Content Q&A Tool
        Enter up to 3 URLs (one per line), ingest their content, and ask questions.
        Answers are generated using only the ingested data. Note: data resets on app restart.
        """
    )

    # URL input and ingestion
    with gr.Row():
        url_input = gr.Textbox(label="Enter URLs (one per line, max 3)", lines=3, placeholder="https://example.com")
        with gr.Column():
            ingest_btn = gr.Button("Ingest URLs")
            clear_btn = gr.Button("Clear All")
    ingest_output = gr.Textbox(label="Ingestion Status", interactive=False)

    # Question input and answer
    with gr.Row():
        question_input = gr.Textbox(label="Ask a question", placeholder="What is this about?")
        ask_btn = gr.Button("Ask")
    answer_output = gr.Textbox(label="Answer", lines=5, interactive=False)

    # Bind functions to buttons
    ingest_btn.click(fn=ingest_urls, inputs=url_input, outputs=ingest_output)
    ask_btn.click(fn=answer_question, inputs=question_input, outputs=answer_output)
    clear_btn.click(fn=clear_all, inputs=None, outputs=[url_input, ingest_output, answer_output])

# Launch the app (HF Spaces expects port 7860)
demo.launch(server_name="0.0.0.0", server_port=7860)
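
Note: app.py alone will not build on Spaces without its dependencies. A minimal requirements.txt sketch, assuming standard PyPI package names for the imports above (the file and any version pins are not part of this commit):

gradio
requests
beautifulsoup4
sentence-transformers
transformers
torch
numpy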