Spaces: Running on Zero
Commit 9719dbf · Parent(s): d52fd55
Implement caching for query results and enhance logging in log_query_and_results function

app.py CHANGED
@@ -2,6 +2,8 @@ import datetime
 import json
 import os
 import uuid
+import hashlib
+import pickle
 
 import gradio as gr
 import pandas as pd
@@ -54,6 +56,24 @@ dataset = None
 embedding_model = None
 reasoning_model = None
 
+# Define a cache file path
+cache_file = os.path.join(data_path, "query_cache.pkl")
+
+# Load cache from file if it exists
+if os.path.exists(cache_file):
+    with open(cache_file, "rb") as f:
+        query_cache = pickle.load(f)
+else:
+    query_cache = {}
+
+def hash_query(query: str) -> str:
+    """Generate a unique hash for the query."""
+    return hashlib.sha256(query.encode("utf-8")).hexdigest()
+
+def save_cache():
+    """Save the cache to a file."""
+    with open(cache_file, "wb") as f:
+        pickle.dump(query_cache, f)
 
 def init_embedding_model(
     model_name_or_path: str, model_revision: str = None, hf_token: str = None
@@ -367,20 +387,22 @@ Return only the JSON object. All key names and string values must be in double q
 # })
 
 
-def log_query_and_results(query_id: str, query: str, results: list[dict]):
-    """Log the query and results to a file."""
+def log_query_and_results(query_id: str, query: str, results: list[dict], cache_hit: bool = False):
+    """Log the query and results to a file, including whether it was a cache hit."""
     log_entry = {
         "timestamp": datetime.datetime.now().isoformat(),
         "query_id": query_id,
         "query": query,
         "results": results,
+        "cache_hit": cache_hit,
     }
     log_file = os.path.join(data_path, "query_results_log.jsonl")
     with open(log_file, "a") as f:
         f.write(json.dumps(log_entry) + "\n")
-
-#
-
+
+    # Print a short summary of the log entry with timestamp
+    cache_status = "Cache Hit" if cache_hit else "Cache Miss"
+    print(f"[{log_entry['timestamp']}] Query ID: {query_id}, Results Count: {len(results)}, Status: {cache_status}")
 
 
 def find_synergistic_papers(abstract: str, limit=25) -> list[dict]:
@@ -392,26 +414,31 @@ def find_synergistic_papers(abstract: str, limit=25) -> list[dict]:
 
     # Normalize the abstract for cosine similarity
     abstract = abstract.replace("\n", " ")
-    # Replace multiple whitespaces with a single space
     abstract = " ".join(abstract.split())
-    # Remove leading/trailing whitespace
     abstract = abstract.strip()
-    # Check if the abstract is empty
     if not abstract:
         raise ValueError("Abstract is empty. Please provide a valid abstract.")
 
-    #
+    # Hash the query to use as a cache key
+    query_hash = hash_query(abstract)
+
+    # Check if the query result is in the cache
+    if query_hash in query_cache:
+        print("Cache hit for query")
+        log_query_and_results(query_id, abstract, query_cache[query_hash], cache_hit=True)  # Log cache hit details
+        return query_cache[query_hash]
+
+    # Generate embedding for the query abstract
     abstract_embedding = embed_text(abstract)
 
     # Access the dataset's train split from the DatasetManager instance
     train_dataset = dataset.dataset["train"]
 
-    # Search for similar papers using FAISS
+    # Search for similar papers using FAISS
     scores, examples = train_dataset.get_nearest_examples("embedding", abstract_embedding, k=limit)
 
     papers = []
     for i in range(len(scores)):
-        # With cosine similarity, higher scores are better (closer to 1)
         paper_dict = {
             "id": examples["id"][i],
             "title": examples["title"][i],
@@ -419,13 +446,17 @@ def find_synergistic_papers(abstract: str, limit=25) -> list[dict]:
             "categories": examples["categories"][i],
             "abstract": examples["abstract"][i],
             "update_date": examples["update_date"][i],
-            "synergy_score": float(scores[i]),
+            "synergy_score": float(scores[i]),
         }
         papers.append(paper_dict)
 
     # Log the query and results
     log_query_and_results(query_id, abstract, papers)
 
+    # Store the result in the cache
+    query_cache[query_hash] = papers
+    save_cache()
+
    return papers
 
 
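Note: as context for this commit, below is a minimal, self-contained sketch of the caching pattern it introduces in app.py: results are keyed by a SHA-256 hash of the normalized query text, held in an in-memory dict, and persisted with pickle so hits survive restarts. The expensive_search stub and the query_cache.pkl path are illustrative stand-ins for the FAISS lookup and data_path handling in the Space, not code from this commit.

import hashlib
import os
import pickle

CACHE_FILE = "query_cache.pkl"  # illustrative path; app.py builds it from data_path

# Load the cache from disk if it exists, otherwise start empty.
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "rb") as f:
        query_cache = pickle.load(f)
else:
    query_cache = {}


def hash_query(query: str) -> str:
    """Key the cache by a SHA-256 digest of the query text."""
    return hashlib.sha256(query.encode("utf-8")).hexdigest()


def save_cache() -> None:
    """Persist the in-memory cache so hits survive restarts."""
    with open(CACHE_FILE, "wb") as f:
        pickle.dump(query_cache, f)


def expensive_search(query: str) -> list[dict]:
    # Hypothetical stand-in for the embedding + FAISS nearest-neighbour search.
    return [{"query": query, "rank": 1}]


def cached_search(query: str) -> list[dict]:
    key = hash_query(query)
    if key in query_cache:
        # Cache hit: return the stored results without recomputing.
        return query_cache[key]
    results = expensive_search(query)
    query_cache[key] = results  # cache miss: compute, store, persist
    save_cache()
    return results


if __name__ == "__main__":
    print(cached_search("graph neural networks for molecules"))
    print(cached_search("graph neural networks for molecules"))  # second call is served from the cache

Because find_synergistic_papers normalizes the abstract before hashing, submissions that differ only in whitespace map to the same cache key; writing the pickle file on every miss is simple, though it may not be safe if several workers update the cache file concurrently.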