File size: 10,132 Bytes
581df5c
9121798
 
681b2fa
581df5c
 
 
 
 
 
1bb4299
 
df6464c
 
 
681b2fa
df6464c
 
 
 
 
 
 
 
 
 
 
 
581df5c
 
 
 
 
 
f1dc219
 
 
aaaa3f2
1bb4299
 
df6464c
 
 
 
 
 
 
 
1bb4299
 
 
 
f5630fa
1bb4299
 
5a35f4a
1bb4299
 
 
581df5c
9121798
 
 
 
 
 
 
 
 
 
681b2fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9121798
 
681b2fa
 
581df5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1dc219
581df5c
 
 
 
 
 
1bb4299
f1dc219
581df5c
681b2fa
9121798
581df5c
 
 
 
 
 
 
 
 
 
681b2fa
581df5c
 
9121798
581df5c
 
 
 
1bb4299
 
 
581df5c
 
 
681b2fa
9121798
 
 
 
 
681b2fa
 
 
 
 
 
 
 
 
581df5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1bb4299
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# Web Content Q&A Tool for Hugging Face Spaces
# Optimized for memory constraints (2GB RAM) and 24-hour timeline
# Features: Ingest up to 3 URLs, ask questions, get concise one-line answers using DistilBERT with PyTorch
# Includes keyword search fallback for low-confidence QA answers

import gradio as gr
from bs4 import BeautifulSoup
import requests
from sentence_transformers import SentenceTransformer, util
import numpy as np
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
import torch
from huggingface_hub import hf_hub_download, HfFolder
from huggingface_hub.utils import configure_http_backend
import requests as hf_requests
import re

# Configure Hugging Face Hub to use a custom session with increased timeout and retries
def create_custom_session():
    """Build the ``requests`` session used for Hugging Face Hub traffic.

    Returns:
        A session whose adapters retry failed connections up to 3 times and
        apply a 30-second timeout to any request sent without an explicit one.
    """
    default_timeout = 30  # seconds

    class _TimeoutHTTPAdapter(hf_requests.adapters.HTTPAdapter):
        """HTTPAdapter that fills in a timeout when the caller passes none."""

        def send(self, request, **kwargs):
            # requests never reads a ``timeout`` attribute set on the Session,
            # so the default has to be injected where the request is sent.
            if kwargs.get("timeout") is None:
                kwargs["timeout"] = default_timeout
            return super().send(request, **kwargs)

    session = hf_requests.Session()
    adapter = _TimeoutHTTPAdapter(max_retries=3)  # Retry 3 times on failure
    # Mount on both schemes so plain-HTTP endpoints get the same policy.
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session

# Register create_custom_session as the session factory for Hugging Face Hub.
# This is done before the models are loaded further down, presumably so that
# their downloads go through the custom session — confirm against huggingface_hub docs.
configure_http_backend(backend_factory=create_custom_session)

# Global variables for in-memory storage (reset on app restart).
# These are shared by the Gradio callbacks: ingest_urls fills them,
# answer_question reads them, and clear_all empties them.
# corpus[i] and sources_list[i] always describe the same paragraph.
corpus = []  # List of paragraph texts scraped from the ingested URLs
embeddings = None  # Embeddings computed from corpus by ingest_urls; None until content is ingested
sources_list = []  # Source URL for each paragraph, parallel to corpus

# Load models once at import time so every request reuses the same instances.
# NOTE(review): the size figures quoted below are the original author's estimates
# (memory: ~370MB total) — confirm against the actual checkpoints.
# Retrieval model: all-mpnet-base-v2 (~110MB, 768-dim embeddings)
retriever = SentenceTransformer('all-mpnet-base-v2')

# Load PyTorch model for QA
# Model: distilbert-base-uncased-distilled-squad (~260MB)
# The first attempt uses whatever is available locally or can be downloaded;
# any failure triggers exactly one retry that forces a fresh download.
# If the retry also fails, the exception propagates and the app does not start.
try:
    model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
except Exception as e:
    print(f"Error loading model: {str(e)}. Retrying with force_download=True...")
    # Force re-download in case of corrupted cache
    model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad", force_download=True)
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad", force_download=True)

# Set model to evaluation mode (inference only; must happen before quantization below)
model.eval()

# Apply dynamic quantization: the model's torch.nn.Linear layers are converted
# to int8 (qint8) for faster inference on CPU. The quantized model replaces the original.
model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

# Create the QA pipeline with PyTorch, wrapping the quantized model and its tokenizer.
# answer_question calls qa_model(question=..., context=...) and reads 'answer' and 'score'.
qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1)  # device=-1 for CPU

# Utility function to truncate text to one line
def truncate_to_one_line(text, max_chars=100):
    """Reduce *text* to its first sentence on a single line.

    Args:
        text: Arbitrary text; may contain newlines or be empty/None.
        max_chars: Length at which the sentence is cut. The cut backs up to
            the previous space and "..." is appended, so the result can be up
            to ``max_chars + 3`` characters long.

    Returns:
        The first non-empty sentence with all whitespace runs collapsed to
        single spaces, or "No answer available." when there is no content.
    """
    fallback = "No answer available."
    if not text:
        return fallback

    # Collapse newlines/tabs first so the result really is a single line.
    flattened = " ".join(text.split())

    # Only treat punctuation as a sentence end when whitespace or the end of
    # the text follows, so decimals ("3.14") and dotted names stay intact.
    segments = re.split(r'[.!?]+(?=\s|$)', flattened)

    # Skip empty leading segments (text that starts with punctuation).
    first_sentence = next((seg.strip() for seg in segments if seg.strip()), "")

    # If the sentence is too long, cut at max_chars and back up to a word boundary
    if len(first_sentence) > max_chars:
        first_sentence = first_sentence[:max_chars].rsplit(' ', 1)[0] + "..."
    return first_sentence if first_sentence else fallback

# Keyword search function for fallback
def keyword_search(question, corpus, sources_list):
    """Find the paragraph that shares the most keywords with the question.

    Keywords are the distinct words of the question after punctuation is
    removed, text is lower-cased, and common stop words are dropped. Digits
    and non-ASCII letters are kept so questions about years, numbers, or
    non-English terms still produce keywords.

    Args:
        question: The user's question.
        corpus: Paragraph texts to search.
        sources_list: Source of each paragraph, parallel to ``corpus``.

    Returns:
        A ``(text, source)`` tuple. On a match, ``text`` is the best
        paragraph shortened by ``truncate_to_one_line`` and ``source`` is its
        entry from ``sources_list``. Otherwise ``text`` is an explanatory
        message and ``source`` is ``None``.
    """
    stop_words = frozenset((
        "what", "is", "the", "a", "an", "in", "on", "at", "for", "with",
        "and", "or", "but", "not", "this", "that", "these", "those", "to",
        "of", "it", "by", "as", "if", "when", "where", "who", "which",
        "how", "why",
    ))

    def clean_text(text):
        # \w is Unicode-aware for str patterns, so letters of any script and
        # digits survive; only punctuation/symbols are stripped.
        return re.sub(r'[^\w\s]', '', text).lower()

    # A set, so a word repeated in the question is not counted twice per paragraph.
    keywords = {word for word in clean_text(question).split() if word not in stop_words}
    if not keywords:
        return "No keywords found for search.", None

    best_paragraph = None
    best_count = 0
    best_source = None

    for para, source in zip(corpus, sources_list):
        count = len(keywords.intersection(clean_text(para).split()))
        # Strict ">" keeps the earliest paragraph on ties and ignores zero-match paragraphs.
        if count > best_count:
            best_count = count
            best_paragraph = para
            best_source = source

    if best_paragraph is None:
        return "No relevant paragraph found.", None

    # Truncate the paragraph to one line
    return truncate_to_one_line(best_paragraph), best_source

def ingest_urls(urls):
    """
    Scrape up to 3 URLs and index their paragraphs for question answering.

    Previously ingested data is always discarded first. Each URL is fetched
    independently: a URL that fails or yields no usable text is reported in
    the status message but does not prevent the remaining URLs from being
    ingested. At most 100 paragraphs are kept per URL.

    Args:
        urls: Newline-separated URLs. Blank lines and repeated URLs are
            ignored; only the first 3 distinct URLs are used.

    Returns:
        A status string for the UI. It starts with "Success: ..." when at
        least one paragraph was ingested (followed by one line per URL that
        had a problem), otherwise it contains the error/warning message(s).
    """
    global corpus, embeddings, sources_list
    # Clear previous data
    corpus.clear()
    sources_list.clear()
    embeddings = None

    max_urls = 3
    max_paragraphs_per_url = 100
    min_text_length = 20  # text must be strictly longer than this

    # Parse URLs from input (one per line); dict.fromkeys drops repeats but keeps order
    stripped_lines = (line.strip() for line in (urls or "").split("\n"))
    url_list = list(dict.fromkeys(u for u in stripped_lines if u))[:max_urls]
    if not url_list:
        return "Error: Please enter at least one valid URL."

    # Headers to mimic browser and avoid blocking
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

    problems = []  # one message per URL that failed or had no usable content
    seen_texts = set()  # exact-duplicate filter across all URLs

    for url in url_list:
        try:
            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()  # Raise exception for bad status codes
            soup = BeautifulSoup(response.text, 'html.parser')
            paragraph_count = 0
            for elem in soup.find_all(['p', 'div']):
                if paragraph_count >= max_paragraphs_per_url:
                    break
                # A container's get_text() repeats the text of every nested
                # <p>/<div>, so only leaf elements are stored; otherwise outer
                # wrappers duplicate content and use up the per-URL cap.
                if elem.find(['p', 'div']) is not None:
                    continue
                text = elem.get_text().strip()
                if len(text) <= min_text_length or text in seen_texts:
                    continue
                seen_texts.add(text)
                corpus.append(text)
                sources_list.append(url)
                paragraph_count += 1
            if paragraph_count == 0:
                problems.append(f"Warning: No usable content found at {url}.")
        except Exception as e:
            # UI boundary: report the failure for this URL and keep going.
            problems.append(f"Error ingesting {url}: {str(e)}. Check URL and try again.")

    if not corpus:
        return "\n".join(problems) if problems else "Error: No valid content ingested."

    # Compute embeddings for everything that was ingested
    embeddings = retriever.encode(corpus, convert_to_tensor=True, show_progress_bar=False)
    # Count only the URLs that actually contributed paragraphs.
    status = f"Success: Ingested {len(corpus)} paragraphs from {len(set(sources_list))} URLs."
    if problems:
        status += "\n" + "\n".join(problems)
    return status

def answer_question(question):
    """
    Answer a question from the ingested content.

    The 2 paragraphs most similar to the question (cosine similarity of the
    retriever embeddings) are joined into one context and passed to the QA
    pipeline. If the pipeline's score is at least 0.4 its answer is returned;
    otherwise the function falls back to ``keyword_search`` over the whole
    corpus. Answers are shortened with ``truncate_to_one_line``.

    Args:
        question: The user's question text.

    Returns:
        A display string: the answer with confidence and sources, the
        keyword-search answer with its source, or an error/"not found" message.
    """
    if not corpus or embeddings is None:
        return "Error: Please ingest URLs first."
    if not question or not question.strip():
        return "Error: Please enter a question."

    # Encode question into embedding
    question_embedding = retriever.encode(question, convert_to_tensor=True)

    # Compute cosine similarity with stored embeddings
    cos_scores = util.cos_sim(question_embedding, embeddings)[0]
    top_k = min(2, len(corpus))
    # topk returns indices in descending score order and works on the tensor's
    # own device, unlike a NumPy argsort over a torch tensor.
    top_indices = torch.topk(cos_scores, k=top_k).indices.tolist()

    # Retrieve context (top paragraphs, best match first)
    context = " ".join(corpus[i] for i in top_indices)
    # dict.fromkeys removes duplicate URLs while keeping retrieval-rank order.
    sources = list(dict.fromkeys(sources_list[i] for i in top_indices))

    # Extract answer with DistilBERT (PyTorch)
    with torch.no_grad():  # Disable gradient computation for faster inference
        result = qa_model(question=question, context=context)
    confidence = result['score']

    if confidence < 0.4:
        # Low confidence: fall back to keyword search
        kw_answer, kw_source = keyword_search(question, corpus, sources_list)
        if kw_source:
            return f"Answer: {kw_answer} (from keyword search, as QA confidence was {confidence:.2f})\nSource: {kw_source}"
        return "No relevant answer found from keyword search."

    # truncate_to_one_line always returns non-empty text (it has its own fallback).
    answer = truncate_to_one_line(result['answer'])
    sources_str = "\n".join(sources)
    return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"

def clear_all():
    """Reset the in-memory store and blank the three UI text boxes.

    Returns:
        Three empty strings, one for each output component bound to the
        "Clear All" button.
    """
    global corpus, embeddings, sources_list
    # Empty both parallel lists in place so other references stay valid.
    for store in (corpus, sources_list):
        store.clear()
    embeddings = None
    return ("",) * 3

# Gradio UI with minimal, user-friendly design.
# Components are created in display order inside the Blocks context.
with gr.Blocks(title="Web Content Q&A Tool") as demo:
    # Static header with usage instructions
    gr.Markdown(
        """
        # Web Content Q&A Tool
        Enter up to 3 URLs (one per line), ingest their content, and ask questions.
        Answers are generated using only the ingested data. Note: Data resets on app restart.
        """
    )
    
    # URL input and ingestion: URL box on the left, action buttons stacked in a column
    with gr.Row():
        url_input = gr.Textbox(label="Enter URLs (one per line, max 3)", lines=3, placeholder="https://example.com")
        with gr.Column():
            ingest_btn = gr.Button("Ingest URLs")
            clear_btn = gr.Button("Clear All")
    # Read-only box that shows the status string returned by ingest_urls
    ingest_output = gr.Textbox(label="Ingestion Status", interactive=False)
    
    # Question input and answer
    with gr.Row():
        question_input = gr.Textbox(label="Ask a question", placeholder="What is this about?")
        ask_btn = gr.Button("Ask")
    # Read-only box that shows the string returned by answer_question
    answer_output = gr.Textbox(label="Answer", lines=5, interactive=False)
    
    # Bind functions to buttons
    ingest_btn.click(fn=ingest_urls, inputs=url_input, outputs=ingest_output)
    ask_btn.click(fn=answer_question, inputs=question_input, outputs=answer_output)
    # clear_all returns three strings, one per listed output; the question box is not among them
    clear_btn.click(fn=clear_all, inputs=None, outputs=[url_input, ingest_output, answer_output])

# Launch the app (HF Spaces expects port 7860); binds on all interfaces
demo.launch(server_name="0.0.0.0", server_port=7860)