Shriharsh committed on
Commit
581df5c
·
verified ·
1 Parent(s): d97eaef

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +141 -0
app.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Web Content Q&A Tool for Hugging Face Spaces
2
+ # Optimized for memory constraints (2GB RAM) and 24-hour timeline
3
+ # Features: Ingest up to 3 URLs, ask questions, get concise answers using DistilBERT
4
+
5
+ import gradio as gr
6
+ from bs4 import BeautifulSoup
7
+ import requests
8
+ from sentence_transformers import SentenceTransformer, util
9
+ import numpy as np
10
+ from transformers import pipeline
11
+
12
# In-memory state shared by the request handlers. Nothing is persisted, so
# everything here is lost whenever the app restarts.
corpus = []  # Paragraph texts scraped from the ingested URLs
sources_list = []  # sources_list[i] is the URL that corpus[i] came from
embeddings = None  # Retrieval embeddings for `corpus`; None until an ingest succeeds

# Both models are loaded once, at import time (~340MB in total).
# Retriever: all-MiniLM-L6-v2 sentence encoder (~80MB, 384-dim embeddings).
retriever = SentenceTransformer('all-MiniLM-L6-v2')
# Reader: DistilBERT fine-tuned on SQuAD for extractive QA (~260MB).
qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
22
+
23
def _extract_paragraphs(html, limit=100, min_chars=20):
    """Return up to *limit* distinct text passages found in *html*.

    Only innermost <p>/<div> elements are used. A wrapper element's text is
    the concatenation of every block nested inside it, so ingesting wrappers
    would fill the quota with duplicated, page-sized blobs. The trade-off is
    that loose text sitting directly in a wrapper (next to nested blocks) is
    not collected.
    """
    soup = BeautifulSoup(html, 'html.parser')
    passages = []
    seen = set()
    for elem in soup.find_all(['p', 'div']):
        # Skip containers; their nested <p>/<div> children are visited on
        # their own later in document order.
        if elem.find(['p', 'div']) is not None:
            continue
        # Collapse runs of whitespace/newlines so identical passages compare
        # equal and the stored context stays compact.
        text = " ".join(elem.get_text().split())
        if len(text) <= min_chars or text in seen:
            continue
        seen.add(text)
        passages.append(text)
        if len(passages) >= limit:
            break  # Quota reached; no point scanning the rest of the page
    return passages


def ingest_urls(urls):
    """
    Ingest up to 3 URLs, scrape their content, and compute embeddings.

    Args:
        urls: Newline-separated URLs. Blank lines and duplicates are
            ignored; only the first 3 distinct URLs are used.

    Returns:
        A status string for the UI. A failing or empty URL no longer aborts
        the whole run: the remaining URLs are still ingested and every
        problem is reported on its own line after the success message.
        If nothing could be ingested, only the problem lines are returned.

    Limits: 100 paragraphs per URL to manage memory (~0.5MB embeddings total).
    Previously ingested data is always discarded first. The module-level
    state is repopulated only after embeddings were computed, so `corpus`
    and `embeddings` never disagree.
    """
    global corpus, embeddings, sources_list
    # Clear previous data
    corpus.clear()
    sources_list.clear()
    embeddings = None

    # Parse URLs from input (one per line, de-duplicated in order, max 3)
    candidates = (line.strip() for line in (urls or "").splitlines())
    url_list = list(dict.fromkeys(u for u in candidates if u))[:3]
    if not url_list:
        return "Error: Please enter at least one valid URL."

    # Headers to mimic browser and avoid blocking
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

    new_corpus = []
    new_sources = []
    problems = []  # One human-readable line per URL that could not be used

    for url in url_list:
        try:
            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()  # Raise exception for bad status codes
        except requests.RequestException as e:
            # Covers malformed URLs, connection errors, timeouts and HTTP errors
            problems.append(f"Error ingesting {url}: {e}. Check URL and try again.")
            continue

        paragraphs = _extract_paragraphs(response.text)
        if not paragraphs:
            problems.append(f"Warning: No usable content found at {url}.")
            continue
        new_corpus.extend(paragraphs)
        new_sources.extend([url] * len(paragraphs))

    if not new_corpus:
        return "\n".join(problems) or "Error: No valid content ingested."

    # Embeddings: ~1.5KB per paragraph, ~450KB for 300 paragraphs
    new_embeddings = retriever.encode(new_corpus, convert_to_tensor=True, show_progress_bar=False)

    # Commit only now, so a failure above leaves the state consistently empty
    corpus.extend(new_corpus)
    sources_list.extend(new_sources)
    embeddings = new_embeddings

    status = f"Success: Ingested {len(corpus)} paragraphs from {len(set(new_sources))} URLs."
    return "\n".join([status] + problems)
69
+
70
def answer_question(question):
    """
    Answer a question using retrieved context and DistilBERT QA.

    The 3 paragraphs most similar to the question (fewer if the corpus is
    smaller) are joined into a single context and passed to the QA pipeline.

    Args:
        question: The user's question as free text.

    Returns:
        A formatted string with the extracted answer, the model's confidence
        score and the source URLs of the retrieved paragraphs (best match
        first), or an "Error: ..." string when nothing has been ingested or
        the question is empty.

    NOTE(review): no explicit length limit is applied to the context here.
    Per the transformers question-answering pipeline documentation, an
    over-long context is split into overlapping chunks (`max_seq_len`,
    `doc_stride`) rather than truncated - confirm for the installed version.
    """
    if not corpus or embeddings is None:
        return "Error: Please ingest URLs first."

    # The QA pipeline rejects empty questions, so fail with a readable message
    question = (question or "").strip()
    if not question:
        return "Error: Please enter a question."

    # Encode question into embedding
    question_embedding = retriever.encode(question, convert_to_tensor=True)

    # Compute cosine similarity with stored embeddings
    cos_scores = util.cos_sim(question_embedding, embeddings)[0]
    top_k = min(3, len(corpus))  # Get top 3 or less if fewer paragraphs
    # Rank on the tensor itself: unlike np.argsort this also works when the
    # embeddings live on a GPU, and it yields plain Python ints for indexing.
    top_indices = cos_scores.topk(top_k).indices.tolist()

    # Retrieve context (top paragraphs, best match first)
    context = " ".join(corpus[i] for i in top_indices)
    sources = [sources_list[i] for i in top_indices]

    # Extract answer with DistilBERT
    result = qa_model(question=question, context=context)
    answer = result['answer']
    confidence = result['score']

    # Unique sources in rank order (a set would shuffle them between runs)
    sources_str = "\n".join(dict.fromkeys(sources))
    return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
102
+
103
def clear_all():
    """Drop all ingested data and return blank values for the UI.

    Returns:
        Three empty strings, one for each Gradio component wired to the
        "Clear All" button (URL input, ingestion status, answer).
    """
    global corpus, embeddings, sources_list
    # Empty the lists in place so they stay the same objects
    del corpus[:]
    del sources_list[:]
    embeddings = None
    return ("",) * 3
110
+
111
# Gradio UI with minimal, user-friendly design.
# `demo` is built at import time; the server is started only when this file
# is executed as a script (see the guard at the bottom).
with gr.Blocks(title="Web Content Q&A Tool") as demo:
    gr.Markdown(
        """
        # Web Content Q&A Tool
        Enter up to 3 URLs (one per line), ingest their content, and ask questions.
        Answers are generated using only the ingested data. Note: Data resets on app restart.
        """
    )

    # URL input and ingestion
    with gr.Row():
        url_input = gr.Textbox(label="Enter URLs (one per line, max 3)", lines=3, placeholder="https://example.com")
        with gr.Column():
            ingest_btn = gr.Button("Ingest URLs")
            clear_btn = gr.Button("Clear All")
    ingest_output = gr.Textbox(label="Ingestion Status", interactive=False)

    # Question input and answer
    with gr.Row():
        question_input = gr.Textbox(label="Ask a question", placeholder="What is this about?")
        ask_btn = gr.Button("Ask")
    answer_output = gr.Textbox(label="Answer", lines=5, interactive=False)

    # Bind functions to buttons
    ingest_btn.click(fn=ingest_urls, inputs=url_input, outputs=ingest_output)
    ask_btn.click(fn=answer_question, inputs=question_input, outputs=answer_output)
    # clear_all returns one empty string per component listed here
    clear_btn.click(fn=clear_all, inputs=None, outputs=[url_input, ingest_output, answer_output])

# Launch the app (HF Spaces expects port 7860). Guarded so that importing
# this module, e.g. from a test or another script, does not start a server.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)