Shriharsh committed · verified
Commit 1bb4299 · 1 Parent(s): bc9fd78

Update app.py

Files changed (1)
  1. app.py +32 -21
app.py CHANGED
@@ -1,15 +1,14 @@
  # Web Content Q&A Tool for Hugging Face Spaces
  # Optimized for memory constraints (2GB RAM) and 24-hour timeline
- # Features: Ingest up to 3 URLs, ask questions, get concise answers using DistilBERT with ONNX
+ # Features: Ingest up to 3 URLs, ask questions, get concise answers using DistilBERT with PyTorch

  import gradio as gr
  from bs4 import BeautifulSoup
  import requests
  from sentence_transformers import SentenceTransformer, util
  import numpy as np
- from optimum.onnxruntime import ORTModelForQuestionAnswering
- from transformers import AutoTokenizer
- from optimum.pipelines import pipeline
+ from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
+ import torch

  # Global variables for in-memory storage (reset on app restart)
  corpus = [] # List of paragraphs from URLs
@@ -20,16 +19,28 @@ sources_list = [] # Source URLs for each paragraph
  # Retrieval model: all-MiniLM-L6-v2 (~80MB, 384-dim embeddings)
  retriever = SentenceTransformer('all-MiniLM-L6-v2')

- # Load ONNX model for QA using optimum.onnxruntime
- # Model: Xenova/distilbert-base-uncased-distilled-squad (~260MB)
- # Specify file_name="model.onnx" to select the correct ONNX file
- model = ORTModelForQuestionAnswering.from_pretrained(
-     "Xenova/distilbert-base-uncased-distilled-squad",
-     file_name="onnx/model.onnx",
-     provider="CPUExecutionProvider"
+ # Load PyTorch model for QA
+ # Model: distilbert-base-uncased-distilled-squad (~260MB)
+ model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")
+ tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
+
+ # Set model to evaluation mode
+ model.eval()
+
+ # Compile the model with torch.compile for faster inference (PyTorch 2.0+)
+ # Use backend="inductor" for CPU optimization
+ try:
+     model = torch.compile(model, backend="inductor")
+ except Exception as e:
+     print(f"Warning: torch.compile failed with error: {str(e)}. Proceeding without compilation.")
+
+ # Apply quantization to the model for additional speedup on CPU
+ model = torch.quantization.quantize_dynamic(
+     model, {torch.nn.Linear}, dtype=torch.qint8
  )
- tokenizer = AutoTokenizer.from_pretrained("Xenova/distilbert-base-uncased-distilled-squad")
- qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="ort", device=0)
+
+ # Create the QA pipeline with PyTorch
+ qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1) # device=-1 for CPU

  def ingest_urls(urls):
      """
@@ -80,8 +91,8 @@ def ingest_urls(urls):

  def answer_question(question):
      """
-     Answer a question using retrieved context and DistilBERT QA (ONNX).
-     Retrieves top 3 paragraphs to provide broader context for cross-questioning.
+     Answer a question using retrieved context and DistilBERT QA (PyTorch).
+     Retrieves top 1 paragraph to reduce inference time.
      If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
      """
      global corpus, embeddings, sources_list
@@ -93,17 +104,17 @@ def answer_question(question):

      # Compute cosine similarity with stored embeddings
      cos_scores = util.cos_sim(question_embedding, embeddings)[0]
-     top_k = min(2, len(corpus)) # Get top 2 or less if fewer paragraphs
+     top_k = min(1, len(corpus)) # Get top 1 paragraph to speed up inference
      top_indices = np.argsort(-cos_scores)[:top_k]

-     # Retrieve context (top 3 paragraphs)
+     # Retrieve context (top 1 paragraph)
      contexts = [corpus[i] for i in top_indices]
      context = " ".join(contexts) # Concatenate with space
      sources = [sources_list[i] for i in top_indices]

-     # Extract answer with DistilBERT (ONNX)
-     # Note: If total tokens exceed 512, it will be truncated automatically
-     result = qa_model(question=question, context=context)
+     # Extract answer with DistilBERT (PyTorch)
+     with torch.no_grad(): # Disable gradient computation for faster inference
+         result = qa_model(question=question, context=context)
      answer = result['answer']
      confidence = result['score']

@@ -149,4 +160,4 @@ with gr.Blocks(title="Web Content Q&A Tool") as demo:
      clear_btn.click(fn=clear_all, inputs=None, outputs=[url_input, ingest_output, answer_output])

  # Launch the app (HF Spaces expects port 7860)
- demo.launch(share = True, server_name="0.0.0.0", server_port=7860)
+ demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
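Second, the retrieval step that the commit narrows from top-2 to top-1: a minimal sketch with the same all-MiniLM-L6-v2 retriever and a toy two-paragraph corpus invented for illustration.

# Sketch: embed a toy corpus, score it against the question with cosine
# similarity, and keep only the single best paragraph (top_k = 1).
import numpy as np
from sentence_transformers import SentenceTransformer, util

retriever = SentenceTransformer('all-MiniLM-L6-v2')
corpus = ["Paris is the capital of France.", "The Nile flows through Egypt."]  # toy data
embeddings = retriever.encode(corpus, convert_to_tensor=True)

question_embedding = retriever.encode("What is the capital of France?", convert_to_tensor=True)
cos_scores = util.cos_sim(question_embedding, embeddings)[0]

top_k = min(1, len(corpus))  # top 1 paragraph, as in the updated app.py
top_indices = np.argsort(-cos_scores.cpu().numpy())[:top_k]
print(corpus[top_indices[0]])  # expected: the Paris paragraph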
 
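Third, the torch.no_grad() wrapper around the pipeline call: recent transformers pipelines already run their forward pass with gradients disabled, so the explicit context manager is mostly belt-and-braces, but it is harmless and documents intent. A usage sketch, assuming the qa_model pipeline defined above and an invented question/context pair:

import torch

question = "How many URLs can be ingested?"  # invented example inputs
context = "The tool ingests up to 3 URLs and answers questions about their content."

with torch.no_grad():  # no autograd bookkeeping during inference
    result = qa_model(question=question, context=context)

print(result["answer"], round(result["score"], 3))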
 
 
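Finally, on the 512-token remark in the docstring: in recent transformers releases the question-answering pipeline handles long contexts by splitting them into overlapping windows rather than hard truncation, and the windowing can be tuned per call. A sketch assuming the qa_model pipeline and the question/context variables from answer_question; the parameter values are illustrative.

result = qa_model(
    question=question,
    context=context,
    max_seq_len=384,    # tokens per window (question + context chunk)
    doc_stride=128,     # overlap between consecutive windows
    max_answer_len=30,  # cap on the returned answer span length
)
print(result["answer"], result["score"])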