Update app.py
app.py
CHANGED
@@ -30,9 +30,9 @@ corpus = [] # List of paragraphs from URLs
 embeddings = None # Precomputed embeddings for retrieval
 sources_list = [] # Source URLs for each paragraph
 
-# Load models at startup (memory: ~
-# Retrieval model: all-
-retriever = SentenceTransformer('all-
+# Load models at startup (memory: ~370MB total)
+# Retrieval model: all-mpnet-base-v2 (~110MB, 768-dim embeddings)
+retriever = SentenceTransformer('all-mpnet-base-v2')
 
 # Load PyTorch model for QA
 # Model: distilbert-base-uncased-distilled-squad (~260MB)
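For context, a minimal sketch of the startup block this hunk modifies; the `pipeline` import and the `qa_pipeline` name are assumptions, since the diff only shows the retriever line and the QA model comments:

from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Retrieval model: all-mpnet-base-v2 (~110MB, 768-dim embeddings)
retriever = SentenceTransformer('all-mpnet-base-v2')

# QA model: distilbert-base-uncased-distilled-squad (~260MB, PyTorch)
# 'qa_pipeline' is an assumed name; only the comments above appear in the diff
qa_pipeline = pipeline('question-answering', model='distilbert-base-uncased-distilled-squad')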
@@ -98,7 +98,7 @@ def ingest_urls(urls):
 
     # Compute embeddings if content was ingested
     if corpus:
-        # Embeddings: ~
+        # Embeddings: ~3KB per paragraph, ~900KB for 300 paragraphs (768-dim)
         embeddings = retriever.encode(corpus, convert_to_tensor=True, show_progress_bar=False)
         return f"Success: Ingested {len(corpus)} paragraphs from {len(set(url_list))} URLs."
     return "Error: No valid content ingested."
@@ -106,7 +106,7 @@ def ingest_urls(urls):
 def answer_question(question):
     """
     Answer a question using retrieved context and DistilBERT QA (PyTorch).
-    Retrieves top
+    Retrieves top 3 paragraphs to improve answer accuracy.
     If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
     """
     global corpus, embeddings, sources_list
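The 512-token ceiling mentioned in the docstring is DistilBERT's maximum sequence length; a quick check, assuming the standard transformers tokenizer for this model:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad')
print(tokenizer.model_max_length)  # 512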
@@ -118,10 +118,10 @@ def answer_question(question):
 
     # Compute cosine similarity with stored embeddings
     cos_scores = util.cos_sim(question_embedding, embeddings)[0]
-    top_k = min(
+    top_k = min(3, len(corpus)) # Get top 3 paragraphs to improve accuracy
     top_indices = np.argsort(-cos_scores)[:top_k]
 
-    # Retrieve context (top
+    # Retrieve context (top 3 paragraphs)
     contexts = [corpus[i] for i in top_indices]
     context = " ".join(contexts) # Concatenate with space
     sources = [sources_list[i] for i in top_indices]
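Putting the retrieval step together, here is a self-contained sketch of the top-k lookup this hunk implements; the corpus contents and question are illustrative, and the explicit .cpu().numpy() conversion is added so np.argsort operates on a plain array:

import numpy as np
from sentence_transformers import SentenceTransformer, util

retriever = SentenceTransformer('all-mpnet-base-v2')
corpus = [
    "Paris is the capital of France.",
    "The Eiffel Tower is in Paris.",
    "Berlin is the capital of Germany.",
]
embeddings = retriever.encode(corpus, convert_to_tensor=True)

question_embedding = retriever.encode("Where is the Eiffel Tower?", convert_to_tensor=True)
cos_scores = util.cos_sim(question_embedding, embeddings)[0].cpu().numpy()

top_k = min(3, len(corpus))
top_indices = np.argsort(-cos_scores)[:top_k]  # indices sorted by descending similarity
contexts = [corpus[i] for i in top_indices]
context = " ".join(contexts)  # concatenated context passed to the QA model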