nomadicsynth committed
Commit c241b7f · 1 Parent(s): aa83efc

Refactor dataset management and improve dataset update functionality

Files changed (2):
  1. app.py +23 -17
  2. dataset_utils.py +42 -38
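
With this refactor, app.py no longer carries the dataset constants; DatasetManager now ships sensible defaults and only insists on an embedding model. A minimal usage sketch of the new constructor, based on the signature introduced in this commit (the actual call site in app.py lies outside the hunks below):

from sentence_transformers import SentenceTransformer

from dataset_utils import DatasetManager

# The embedding model is mandatory: __init__ raises ValueError without one.
embedding_model = SentenceTransformer(
    "nomadicsynth/research-compass-arxiv-abstracts-embedding-model"
)

# dataset_name and hf_token now default inside DatasetManager
# (hf_token falls back to the HF_TOKEN environment variable).
dataset = DatasetManager(embedding_model=embedding_model)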
app.py CHANGED

@@ -1,19 +1,15 @@
 import json
 import os
 
-import faiss
 import gradio as gr
 import pandas as pd
 import spaces
 import torch
-from datasets import load_dataset
-from huggingface_hub import InferenceClient, hf_hub_download
-from huggingface_hub import login as hf_hub_login
-from huggingface_hub import upload_file
+from huggingface_hub import InferenceClient
 from sentence_transformers import SentenceTransformer
 
 from arxiv_stuff import ARXIV_CATEGORIES_FLAT
-from dataset_utils import DatasetManager
+from dataset_utils import DatasetManager, dataset_name
 
 # Get HF_TOKEN from environment variables
 HF_TOKEN = os.getenv("HF_TOKEN")
@@ -24,11 +20,6 @@ if persistent_storage:
     # Use persistent storage
     print("Using persistent storage")
 
-# Dataset details
-dataset_name = "nomadicsynth/arxiv-dataset-abstract-embeddings"
-dataset_revision = "v1.0.0"
-local_index_path = "arxiv_faiss_index.faiss"
-
 # Embedding model details
 embedding_model_name = "nomadicsynth/research-compass-arxiv-abstracts-embedding-model"
 embedding_model_revision = "2025-01-28_23-06-17-1epochs-12batch-32eval-512embed-final"
@@ -57,7 +48,18 @@ embedding_model = None
 reasoning_model = None
 
 
-def init_embedding_model(model_name_or_path: str, model_revision: str = None, hf_token: str = None) -> SentenceTransformer:
+def init_embedding_model(
+    model_name_or_path: str, model_revision: str = None, hf_token: str = None
+) -> SentenceTransformer:
+    """
+    Initialize the embedding model with the specified model name or path and revision.
+    Args:
+        model_name_or_path (str): The name or path of the model.
+        model_revision (str): The revision of the model.
+        hf_token (str): The Hugging Face token for authentication.
+    Returns:
+        SentenceTransformer: The initialized embedding model.
+    """
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     embedding_model = SentenceTransformer(
         model_name_or_path,
@@ -71,6 +73,13 @@ def init_embedding_model(model_name_or_path: str, model_revision: str = None, hf
 
 @spaces.GPU
 def embed_text(text: str | list[str]) -> torch.Tensor:
+    """
+    Generate embeddings for the given text using the embedding model.
+    Args:
+        text (str | list[str]): The text or list of texts to embed.
+    Returns:
+        torch.Tensor: The generated embeddings.
+    """
     global embedding_model
 
     # Strip any leading/trailing whitespace
@@ -393,12 +402,9 @@ def find_synergistic_papers(abstract: str, limit=25) -> list[dict]:
 
 def format_search_results_json(abstract: str) -> str:
     """Format search results as JSON for display"""
-    # Find papers synergistic with the given abstract
-    papers = find_synergistic_papers(abstract)
-
-    # Convert to JSON for display
+    papers = find_synergistic_papers(abstract, limit=10)
     json_output = json.dumps(papers, indent=2)
-    print(f"JSON output: {json_output}")
+
     return json_output
 
 
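The docstrings added above make the embedding path explicit: init_embedding_model builds a SentenceTransformer on CUDA when available, and embed_text wraps it behind @spaces.GPU. find_synergistic_papers itself is unchanged and not shown in these hunks; a hedged sketch of how its lookup step could use the FAISS index that dataset_utils.py builds on the "embedding" column (dataset_manager and k are illustrative names, not part of this commit):

# Illustrative only - the real find_synergistic_papers in app.py is not part of this diff.
query_embedding = embed_text(abstract).cpu().numpy()

# datasets' FAISS integration: query the index created by add_faiss_index(column="embedding")
scores, examples = dataset_manager.dataset["train"].get_nearest_examples(
    "embedding", query_embedding, k=10
)

# examples is a dict of columns; reshape it into one dict per paper
papers = [dict(zip(examples.keys(), row)) for row in zip(*examples.values())]
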
dataset_utils.py CHANGED
@@ -1,17 +1,18 @@
-from datasets import load_dataset
-from huggingface_hub import HfApi, hf_hub_download
-import faiss
-import os
 import datetime
+import os
 import time
 
-# from embedding_model import EmbeddingModel
-# from app import EmbeddingModel
-from arxiv_stuff import retrieve_arxiv_papers, ARXIV_CATEGORIES_FLAT
+import faiss
+from datasets import load_dataset
+from huggingface_hub import HfApi, hf_hub_download, upload_file
 from sentence_transformers import SentenceTransformer
 
+from arxiv_stuff import ARXIV_CATEGORIES_FLAT, retrieve_arxiv_papers
+
 # Dataset details
-dataset_name = "nomadicsynth/arxiv-dataset-abstract-embeddings"
+default_dataset_revision = "v1.0.0"
+local_index_path = "arxiv_faiss_index.faiss"
+
 HF_TOKEN = os.getenv("HF_TOKEN")
 
 
@@ -19,8 +20,8 @@ class DatasetManager:
 
     def __init__(
         self,
-        dataset_name: str,
-        embedding_model: SentenceTransformer,
+        dataset_name: str = "nomadicsynth/arxiv-dataset-abstract-embeddings",
+        embedding_model: SentenceTransformer = None,
         hf_token: str = None,
     ):
         """
@@ -33,16 +34,23 @@ class DatasetManager:
         self.dataset_name = dataset_name
         self.hf_token = hf_token
         self.embedding_model = embedding_model
-        self.dataset = None
+        self.revision = self.get_latest_revision()
+
+        if self.hf_token is None:
+            self.hf_token = HF_TOKEN
+        if self.embedding_model is None:
+            raise ValueError("Embedding model must be provided.")
 
+        self.dataset = None
         self.setup_dataset()
 
-    def get_revision_name(self):
+    def generate_revision_name(self):
         """Generate a timestamp-based revision name."""
         return datetime.datetime.now().strftime("v%Y-%m-%d")
 
     def get_latest_revision(self):
         """Return the latest timestamp-based revision."""
+        global default_dataset_revision
         api = HfApi()
         print(f"Fetching revisions for dataset: {self.dataset_name}")
 
@@ -58,8 +66,8 @@ class DatasetManager:
         ]
 
         if not timestamp_tags:
-            print("No valid timestamp-based revisions found. Using `v1.0.0` as default.")
-            return "v1.0.0"
+            print(f"No valid timestamp-based revisions found. Using `{default_dataset_revision}` as default.")
+            return default_dataset_revision
         print(f"Valid timestamp-based revisions: {timestamp_tags}")
 
         # Sort and return the most recent tag
@@ -71,22 +79,20 @@ class DatasetManager:
         """Load dataset with FAISS index."""
         print("Loading dataset from Hugging Face...")
 
-        # Fetch the latest revision dynamically
-        latest_revision = self.get_latest_revision()
-
         # Load dataset
         dataset = load_dataset(
-            dataset_name,
-            revision=latest_revision,
+            self.dataset_name,
+            revision=self.revision,
+            token=self.hf_token,
         )
 
         # Try to load the index from the Hub
        try:
             print("Downloading pre-built FAISS index...")
             index_path = hf_hub_download(
-                repo_id=dataset_name,
-                filename="arxiv_faiss_index.faiss",
-                revision=latest_revision,
+                repo_id=self.dataset_name,
+                filename=local_index_path,
+                revision=self.revision,
                 token=self.hf_token,
                 repo_type="dataset",
             )
@@ -173,6 +179,15 @@ class DatasetManager:
             }
         )
 
+        # Save the updated dataset to the Hub with a new revision
+        new_revision = self.generate_revision_name()
+        self.dataset.push_to_hub(
+            repo_id=self.dataset_name,
+            token=self.hf_token,
+            commit_message=f"Update dataset with new papers ({new_revision})",
+            revision=new_revision,
+        )
+
         # Update the FAISS index
         self.dataset["train"].add_faiss_index(
             column="embedding",
@@ -181,38 +196,27 @@ class DatasetManager:
         )
 
         # Save the FAISS index to the Hub
-        self.save_faiss_index_to_hub()
-
-        # Save the updated dataset to the Hub with a new revision
-        new_revision = self.get_revision_name()
-        self.dataset.push_to_hub(
-            repo_id=self.dataset_name,
-            token=self.hf_token,
-            commit_message=f"Update dataset with new papers ({new_revision})",
-            revision=new_revision,
-        )
+        self.save_faiss_index_to_hub(new_revision)
 
         print(f"Dataset updated and saved to the Hub with revision {new_revision}.")
 
-    def save_faiss_index_to_hub(self):
+    def save_faiss_index_to_hub(self, revision: str):
         """Save the FAISS index to the Hub for easy access"""
-        local_index_path = "arxiv_faiss_index.faiss"
+        global local_index_path
 
         # 1. Save the index to a local file
         self.dataset["train"].save_faiss_index("embedding", local_index_path)
         print(f"FAISS index saved locally to {local_index_path}")
 
         # 2. Upload the index file to the Hub
-        from huggingface_hub import upload_file
-
         remote_path = upload_file(
             path_or_fileobj=local_index_path,
             path_in_repo=local_index_path,  # Same name on the Hub
             repo_id=self.dataset_name,  # Use your dataset repo
             token=self.hf_token,
             repo_type="dataset",  # This is a dataset file
-            revision=self.get_revision_name(),  # Use the current revision
-            commit_message="Add FAISS index",  # Commit message
+            revision=revision,  # Use the new revision
+            commit_message=f"Add FAISS index for dataset revision {revision}",
         )
 
         print(f"FAISS index uploaded to Hub at {remote_path}")