thechaiexperiment committed
Commit 2d991dc · 1 Parent(s): 7f09c10

Update app.py

Files changed (1)
  1. app.py +21 -74
app.py CHANGED
@@ -20,6 +20,7 @@ import pandas as pd
 import subprocess
 from typing import Dict, Optional
 import codecs
+from huggingface_hub import hf_hub_download
 
 try:
     subprocess.run(['git', 'lfs', 'pull'], check=True)
@@ -93,86 +94,32 @@ def load_models():
         return False
 
 
-class LFSEmbeddingsUnpickler(pickle.Unpickler):
-    def persistent_load(self, pid):
-        # Ensure persistent ID is ASCII string
-        if isinstance(pid, bytes):
-            return pid.decode('ascii')
-        return str(pid)
-
-def load_embeddings(embeddings_path: str = 'embeddings.pkl') -> Optional[Dict[str, np.ndarray]]:
-    """
-    Load embeddings from a pickle file with support for Git LFS and protocol 0 requirements.
-
-    Args:
-        embeddings_path (str): Path to the pickle file containing embeddings
-
-    Returns:
-        Optional[Dict[str, np.ndarray]]: Dictionary of embeddings or None if loading fails
-    """
-    if not os.path.exists(embeddings_path):
-        print(f"Error: {embeddings_path} not found")
-        return None
-
+def load_embeddings(repo_id: str) -> Optional[Dict[str, np.ndarray]]:
+    """Load embeddings using HuggingFace Hub"""
     try:
-        # Open file in binary mode with buffering
-        with open(embeddings_path, 'rb', buffering=1024*1024) as f:
-            # Check if it's a Git LFS pointer file
-            first_line = f.peek(100)[:100].decode('utf-8', errors='ignore')
-            if 'version https://git-lfs.github.com/spec/' in first_line:
-                print("Warning: This appears to be a Git LFS pointer file.")
-                print("Please ensure you've properly downloaded the actual embeddings file using Git LFS")
-                return None
-
-            # Use custom unpickler with ASCII string handling
-            unpickler = LFSEmbeddingsUnpickler(f)
-
-            # Set encoding for protocol 0 compatibility
-            if hasattr(unpickler, 'encoding'):
-                unpickler.encoding = 'ascii'
-
-            try:
-                embeddings = unpickler.load()
-            except UnicodeDecodeError:
-                # If ASCII decode fails, try UTF-8
-                f.seek(0)
-                unpickler = pickle.Unpickler(f)
-                embeddings = unpickler.load()
+        # Download file from HF Hub
+        file_path = hf_hub_download(
+            repo_id=repo_id,
+            filename="embeddings.pkl",
+            repo_type="space"
+        )
+
+        # Load with custom unpickler
+        with open(file_path, 'rb') as f:
+            unpickler = pickle.Unpickler(f)
+            unpickler.encoding = 'ascii'
+            embeddings = unpickler.load()
 
-            # Validate the loaded data
-            if not isinstance(embeddings, dict):
-                print(f"Error: Expected dict, got {type(embeddings)}")
-                return None
+        if not isinstance(embeddings, dict):
+            return None
+
+        # Convert to numpy arrays
+        return {k: np.array(v, dtype=np.float32) for k, v in embeddings.items()}
 
-            # Convert values to numpy arrays
-            processed_embeddings = {}
-            for key, value in embeddings.items():
-                try:
-                    # Handle various input types
-                    if isinstance(value, np.ndarray):
-                        processed_embeddings[key] = value
-                    else:
-                        processed_embeddings[key] = np.array(value, dtype=np.float32)
-                except Exception as e:
-                    print(f"Warning: Could not process embedding for {key}: {e}")
-                    continue
-
-            if processed_embeddings:
-                sample_key = next(iter(processed_embeddings))
-                print(f"Data type: {type(processed_embeddings)}")
-                print(f"Total embeddings loaded: {len(processed_embeddings)}")
-                print(f"Sample embedding shape: {processed_embeddings[sample_key].shape}")
-                return processed_embeddings
-            else:
-                print("Error: No valid embeddings were processed")
-                return None
-
     except Exception as e:
-        print(f"Error loading embeddings: {str(e)}")
-        print("If using Git LFS, ensure you've run 'git lfs pull' to download the actual file")
+        print(f"Error loading embeddings: {e}")
         return None
 
-
 def load_documents_data():
     """Load document data with error handling"""
     try:
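
For context, a minimal sketch of how the reworked loader might be wired up elsewhere in app.py. The commit does not show the call site, so the Space id, the variable names, and the fallback printout below are placeholders, not part of the change; only load_embeddings (defined in the diff above) is taken from the commit.

# Sketch only: calling the new HF Hub based loader at startup.
from typing import Dict, Optional
import numpy as np

REPO_ID = "thechaiexperiment/<space-name>"  # placeholder; actual Space id not shown in this commit

embeddings: Optional[Dict[str, np.ndarray]] = load_embeddings(REPO_ID)
if embeddings is None:
    # Illustrative fallback; the real app may handle this differently.
    print("Embeddings could not be loaded; retrieval features disabled")
else:
    sample_key = next(iter(embeddings))
    # Each value is a float32 numpy array keyed by document id.
    print(f"Loaded {len(embeddings)} embeddings, sample shape {embeddings[sample_key].shape}")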