Commit 9719dbf by nomadicsynth (parent: d52fd55)

Implement caching for query results and enhance logging in log_query_and_results function

Files changed (1): app.py (+43 −12)
app.py CHANGED

```diff
@@ -2,6 +2,8 @@ import datetime
 import json
 import os
 import uuid
+import hashlib
+import pickle
 
 import gradio as gr
 import pandas as pd
```
```diff
@@ -54,6 +56,24 @@ dataset = None
 embedding_model = None
 reasoning_model = None
 
+# Define a cache file path
+cache_file = os.path.join(data_path, "query_cache.pkl")
+
+# Load cache from file if it exists
+if os.path.exists(cache_file):
+    with open(cache_file, "rb") as f:
+        query_cache = pickle.load(f)
+else:
+    query_cache = {}
+
+def hash_query(query: str) -> str:
+    """Generate a unique hash for the query."""
+    return hashlib.sha256(query.encode("utf-8")).hexdigest()
+
+def save_cache():
+    """Save the cache to a file."""
+    with open(cache_file, "wb") as f:
+        pickle.dump(query_cache, f)
 
 def init_embedding_model(
     model_name_or_path: str, model_revision: str = None, hf_token: str = None
```
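The cache added here is a plain pickled dict keyed by a SHA-256 digest of the query text. Below is a minimal, self-contained sketch of the same pattern; the temp-directory path and the sample query/result are illustrative placeholders, not values from the app.

```python
# Sketch of the pickle-backed query cache pattern used in this commit.
# The cache path and sample entry are placeholders, not the app's values.
import hashlib
import os
import pickle
import tempfile

cache_file = os.path.join(tempfile.gettempdir(), "query_cache_demo.pkl")

def hash_query(query: str) -> str:
    """Stable cache key: SHA-256 hex digest of the query text."""
    return hashlib.sha256(query.encode("utf-8")).hexdigest()

# Load an existing cache if a previous run persisted one, else start empty
if os.path.exists(cache_file):
    with open(cache_file, "rb") as f:
        query_cache = pickle.load(f)
else:
    query_cache = {}

# Cache a result under the query's hash and persist the whole dict
query_cache[hash_query("transformers for time series")] = [
    {"id": "0000.00000", "synergy_score": 0.91}
]
with open(cache_file, "wb") as f:
    pickle.dump(query_cache, f)

# A later run reloads the file and finds the cached entry again
with open(cache_file, "rb") as f:
    assert hash_query("transformers for time series") in pickle.load(f)
```

One trade-off worth noting: save_cache() rewrites the entire dict on every miss and pickle offers no concurrent-write safety, so a shared Gradio app under load might eventually prefer an atomic write (write to a temp file, then os.replace) or a small sqlite3 store.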
```diff
@@ -367,20 +387,22 @@ Return only the JSON object. All key names and string values must be in double q
 # })
 
 
-def log_query_and_results(query_id: str, query: str, results: list[dict]):
-    """Log the query and results to a file."""
+def log_query_and_results(query_id: str, query: str, results: list[dict], cache_hit: bool = False):
+    """Log the query and results to a file, including whether it was a cache hit."""
     log_entry = {
         "timestamp": datetime.datetime.now().isoformat(),
         "query_id": query_id,
         "query": query,
         "results": results,
+        "cache_hit": cache_hit,
     }
     log_file = os.path.join(data_path, "query_results_log.jsonl")
     with open(log_file, "a") as f:
         f.write(json.dumps(log_entry) + "\n")
-
-    # print a short summary of the log entry with timestamp
-    print(f"[{log_entry['timestamp']}] Query ID: {query_id}, Results Count: {len(results)}")
+
+    # Print a short summary of the log entry with timestamp
+    cache_status = "Cache Hit" if cache_hit else "Cache Miss"
+    print(f"[{log_entry['timestamp']}] Query ID: {query_id}, Results Count: {len(results)}, Status: {cache_status}")
 
 
 def find_synergistic_papers(abstract: str, limit=25) -> list[dict]:
```
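Since every query now records a cache_hit flag, the JSONL log can be replayed to measure how often the cache actually helps. A rough sketch; the path assumes the query_results_log.jsonl written above, and entries logged before this commit simply lack the key, so they default to misses.

```python
# Sketch: compute the cache hit rate from the query log.
# Path is an assumption; in the app the file lives under data_path.
import json

hits = total = 0
with open("query_results_log.jsonl") as f:
    for line in f:
        entry = json.loads(line)
        total += 1
        # Older entries predate the cache_hit field; treat them as misses
        hits += 1 if entry.get("cache_hit", False) else 0

if total:
    print(f"{hits}/{total} queries served from cache ({hits / total:.0%})")
```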
```diff
@@ -392,26 +414,31 @@ def find_synergistic_papers(abstract: str, limit=25) -> list[dict]:
 
     # Normalize the abstract for cosine similarity
     abstract = abstract.replace("\n", " ")
-    # Replace multiple whitespaces with a single space
     abstract = " ".join(abstract.split())
-    # Remove leading/trailing whitespace
     abstract = abstract.strip()
-    # Check if the abstract is empty
     if not abstract:
         raise ValueError("Abstract is empty. Please provide a valid abstract.")
 
-    # Generate embedding for the query abstract (normalized for cosine similarity)
+    # Hash the query to use as a cache key
+    query_hash = hash_query(abstract)
+
+    # Check if the query result is in the cache
+    if query_hash in query_cache:
+        print("Cache hit for query")
+        log_query_and_results(query_id, abstract, query_cache[query_hash], cache_hit=True)  # Log cache hit details
+        return query_cache[query_hash]
+
+    # Generate embedding for the query abstract
     abstract_embedding = embed_text(abstract)
 
     # Access the dataset's train split from the DatasetManager instance
     train_dataset = dataset.dataset["train"]
 
-    # Search for similar papers using FAISS with inner product (cosine similarity for normalized vectors)
+    # Search for similar papers using FAISS
     scores, examples = train_dataset.get_nearest_examples("embedding", abstract_embedding, k=limit)
 
     papers = []
     for i in range(len(scores)):
-        # With cosine similarity, higher scores are better (closer to 1)
         paper_dict = {
             "id": examples["id"][i],
             "title": examples["title"][i],
@@ -419,13 +446,17 @@ def find_synergistic_papers(abstract: str, limit=25) -> list[dict]:
             "categories": examples["categories"][i],
             "abstract": examples["abstract"][i],
             "update_date": examples["update_date"][i],
-            "synergy_score": float(scores[i]),  # Convert to float for serialization
+            "synergy_score": float(scores[i]),
         }
         papers.append(paper_dict)
 
     # Log the query and results
     log_query_and_results(query_id, abstract, papers)
 
+    # Store the result in the cache
+    query_cache[query_hash] = papers
+    save_cache()
+
     return papers
```
462