tags for broader coverage elements = soup.find_all(['p', 'div']) paragraph_count = 0 for elem in elements: text = elem.get_text().strip() # Filter short or empty text if text and len(text) > 20 and paragraph_count < 100: corpus.append(text) sources_list.append(url) paragraph_count += 1 if paragraph_count == 0: return f"Warning: No usable content found at {url}." except Exception as e: return f"Error ingesting {url}: {str(e)}. Check URL and try again." # Compute embeddings if content was ingested if corpus: # Embeddings: ~3KB per paragraph, ~900KB for 300 paragraphs (768-dim) embeddings = retriever.encode(corpus, convert_to_tensor=True, show_progress_bar=False) return f"Success: Ingested {len(corpus)} paragraphs from {len(set(url_list))} URLs." return "Error: No valid content ingested." def answer_question(question): """ Answer a question using retrieved context and DistilBERT QA (PyTorch). Retrieves top 2 paragraphs to improve answer accuracy. If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically. Ensures answers are one line (max 100 chars). """ global corpus, embeddings, sources_list if not corpus or embeddings is None: return "Error: Please ingest URLs first." # Encode question into embedding question_embedding = retriever.encode(question, convert_to_tensor=True) # Compute cosine similarity with stored embeddings cos_scores = util.cos_sim(question_embedding, embeddings)[0] top_k = min(2, len(corpus)) # Get top 2 paragraphs to improve accuracy top_indices = np.argsort(-cos_scores)[:top_k] # Retrieve context (top 2 paragraphs) contexts = [corpus[i] for i in top_indices] context = " ".join(contexts) # Concatenate with space sources = [sources_list[i] for i in top_indices] # Extract answer with DistilBERT (PyTorch) with torch.no_grad(): # Disable gradient computation for faster inference result = qa_model(question=question, context=context) answer = result['answer'] confidence = result['score'] # Truncate answer to one line answer = truncate_to_one_line(answer) # Ensure at least one line if not answer: answer = "No answer available." # Format response with answer, confidence, and sources sources_str = "\n".join(set(sources)) # Unique sources return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}" def clear_all(): """Clear all inputs and outputs for a fresh start.""" global corpus, embeddings, sources_list corpus.clear() embeddings = None sources_list.clear() return "", "", "" # Gradio UI with minimal, user-friendly design with gr.Blocks(title="Web Content Q&A Tool") as demo: gr.Markdown( """ # Web Content Q&A Tool Enter up to 3 URLs (one per line), ingest their content, and ask questions. Answers are generated using only the ingested data. Note: Data resets on app restart. """ ) # URL input and ingestion with gr.Row(): url_input = gr.Textbox(label="Enter URLs (one per line, max 3)", lines=3, placeholder="https://example.com") with gr.Column(): ingest_btn = gr.Button("Ingest URLs") clear_btn = gr.Button("Clear All") ingest_output = gr.Textbox(label="Ingestion Status", interactive=False) # Question input and answer with gr.Row(): question_input = gr.Textbox(label="Ask a question", placeholder="What is this about?") ask_btn = gr.Button("Ask") answer_output = gr.Textbox(label="Answer", lines=5, interactive=False) # Bind functions to buttons ingest_btn.click(fn=ingest_urls, inputs=url_input, outputs=ingest_output) ask_btn.click(fn=answer_question, inputs=question_input, outputs=answer_output) clear_btn.click(fn=clear_all, inputs=None, outputs=[url_input, ingest_output, answer_output]) # Launch the app (HF Spaces expects port 7860) demo.launch(server_name="0.0.0.0", server_port=7860)