Commit 9719dbf by nomadicsynth (parent: d52fd55)

Implement caching for query results and enhance logging in log_query_and_results function

Files changed (1): app.py (+43 −12)
app.py CHANGED

```diff
@@ -2,6 +2,8 @@ import datetime
 import json
 import os
 import uuid
+import hashlib
+import pickle
 
 import gradio as gr
 import pandas as pd
```
```diff
@@ -54,6 +56,24 @@ dataset = None
 embedding_model = None
 reasoning_model = None
 
+# Define a cache file path
+cache_file = os.path.join(data_path, "query_cache.pkl")
+
+# Load cache from file if it exists
+if os.path.exists(cache_file):
+    with open(cache_file, "rb") as f:
+        query_cache = pickle.load(f)
+else:
+    query_cache = {}
+
+def hash_query(query: str) -> str:
+    """Generate a unique hash for the query."""
+    return hashlib.sha256(query.encode("utf-8")).hexdigest()
+
+def save_cache():
+    """Save the cache to a file."""
+    with open(cache_file, "wb") as f:
+        pickle.dump(query_cache, f)
 
 def init_embedding_model(
     model_name_or_path: str, model_revision: str = None, hf_token: str = None
```
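The cache added here is a plain pickled dict keyed by a SHA-256 digest of the query text. Below is a minimal, self-contained sketch of the same pattern; the temp-directory path and the sample query/result are illustrative placeholders, not values from the app.

```python
# Sketch of the pickle-backed query cache pattern used in this commit.
# The cache path and sample entry are placeholders, not the app's values.
import hashlib
import os
import pickle
import tempfile

cache_file = os.path.join(tempfile.gettempdir(), "query_cache_demo.pkl")

def hash_query(query: str) -> str:
    """Stable cache key: SHA-256 hex digest of the query text."""
    return hashlib.sha256(query.encode("utf-8")).hexdigest()

# Load an existing cache if a previous run persisted one, else start empty
if os.path.exists(cache_file):
    with open(cache_file, "rb") as f:
        query_cache = pickle.load(f)
else:
    query_cache = {}

# Cache a result under the query's hash and persist the whole dict
query_cache[hash_query("transformers for time series")] = [
    {"id": "0000.00000", "synergy_score": 0.91}
]
with open(cache_file, "wb") as f:
    pickle.dump(query_cache, f)

# A later run reloads the file and finds the cached entry again
with open(cache_file, "rb") as f:
    assert hash_query("transformers for time series") in pickle.load(f)
```

One trade-off worth noting: save_cache() rewrites the entire dict on every miss and pickle offers no concurrent-write safety, so a shared Gradio app under load might eventually prefer an atomic write (write to a temp file, then os.replace) or a small sqlite3 store.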
```diff
@@ -367,20 +387,22 @@ Return only the JSON object. All key names and string values must be in double q
 # })
 
 
-def log_query_and_results(query_id: str, query: str, results: list[dict]):
-    """Log the query and results to a file."""
+def log_query_and_results(query_id: str, query: str, results: list[dict], cache_hit: bool = False):
+    """Log the query and results to a file, including whether it was a cache hit."""
     log_entry = {
         "timestamp": datetime.datetime.now().isoformat(),
         "query_id": query_id,
         "query": query,
         "results": results,
+        "cache_hit": cache_hit,
     }
     log_file = os.path.join(data_path, "query_results_log.jsonl")
     with open(log_file, "a") as f:
         f.write(json.dumps(log_entry) + "\n")
-
-    # print a short summary of the log entry with timestamp
-    print(f"[{log_entry['timestamp']}] Query ID: {query_id}, Results Count: {len(results)}")
+
+    # Print a short summary of the log entry with timestamp
+    cache_status = "Cache Hit" if cache_hit else "Cache Miss"
+    print(f"[{log_entry['timestamp']}] Query ID: {query_id}, Results Count: {len(results)}, Status: {cache_status}")
 
 
 def find_synergistic_papers(abstract: str, limit=25) -> list[dict]:
```
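Since every query now records a cache_hit flag, the JSONL log can be replayed to measure how often the cache actually helps. A rough sketch; the path assumes the query_results_log.jsonl written above, and entries logged before this commit simply lack the key, so they default to misses.

```python
# Sketch: compute the cache hit rate from the query log.
# Path is an assumption; in the app the file lives under data_path.
import json

hits = total = 0
with open("query_results_log.jsonl") as f:
    for line in f:
        entry = json.loads(line)
        total += 1
        # Older entries predate the cache_hit field; treat them as misses
        hits += 1 if entry.get("cache_hit", False) else 0

if total:
    print(f"{hits}/{total} queries served from cache ({hits / total:.0%})")
```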
```diff
@@ -392,26 +414,31 @@ def find_synergistic_papers(abstract: str, limit=25) -> list[dict]:
 
     # Normalize the abstract for cosine similarity
     abstract = abstract.replace("\n", " ")
-    # Replace multiple whitespaces with a single space
     abstract = " ".join(abstract.split())
-    # Remove leading/trailing whitespace
     abstract = abstract.strip()
-    # Check if the abstract is empty
     if not abstract:
         raise ValueError("Abstract is empty. Please provide a valid abstract.")
 
-    # Generate embedding for the query abstract (normalized for cosine similarity)
+    # Hash the query to use as a cache key
+    query_hash = hash_query(abstract)
+
+    # Check if the query result is in the cache
+    if query_hash in query_cache:
+        print("Cache hit for query")
+        log_query_and_results(query_id, abstract, query_cache[query_hash], cache_hit=True)  # Log cache hit details
+        return query_cache[query_hash]
+
+    # Generate embedding for the query abstract
     abstract_embedding = embed_text(abstract)
 
     # Access the dataset's train split from the DatasetManager instance
     train_dataset = dataset.dataset["train"]
 
-    # Search for similar papers using FAISS with inner product (cosine similarity for normalized vectors)
+    # Search for similar papers using FAISS
     scores, examples = train_dataset.get_nearest_examples("embedding", abstract_embedding, k=limit)
 
     papers = []
     for i in range(len(scores)):
-        # With cosine similarity, higher scores are better (closer to 1)
         paper_dict = {
             "id": examples["id"][i],
             "title": examples["title"][i],
@@ -419,13 +446,17 @@ def find_synergistic_papers(abstract: str, limit=25) -> list[dict]:
             "categories": examples["categories"][i],
             "abstract": examples["abstract"][i],
             "update_date": examples["update_date"][i],
-            "synergy_score": float(scores[i]),  # Convert to float for serialization
+            "synergy_score": float(scores[i]),
         }
         papers.append(paper_dict)
 
     # Log the query and results
     log_query_and_results(query_id, abstract, papers)
 
+    # Store the result in the cache
+    query_cache[query_hash] = papers
+    save_cache()
+
     return papers
```
462