Spaces:

Michaeldavidstein
/

michael_stein_reports_qna

Sleeping

Michaeldavidstein committed on Jun 25, 2024

Commit

a4e00f7

verified ·

1 Parent(s): 87ec5ca

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -12,17 +12,10 @@ model = SentenceTransformer('all-MiniLM-L6-v2')
 # Initialize the ChromaDB client
 client = chromadb.Client()
-# Function to build the database from CSV
 def build_database():
     # Read the CSV file
     df = pd.read_csv('collection_data.csv')
-    # Drop unnecessary columns if they exist
-    if 'uris' in df.columns:
-        df.drop('uris', axis=1, inplace=True)
-    if 'data' in df.columns:
-        df.drop('data', axis=1, inplace=True)
     # Create a collection
     collection_name = 'Dataset-10k-companies'
@@ -33,12 +26,21 @@ def build_database():
     # Create a new collection
     collection = client.create_collection(name=collection_name)
     # Add the data from the DataFrame to the collection
     collection.add(
         documents=df['documents'].tolist(),
         ids=df['ids'].tolist(),
         metadatas=df['metadatas'].apply(eval).tolist(),
-        embeddings=df['embeddings'].apply(lambda x: eval(x.replace(',,', ','))).tolist()
     )
     return collection

 # Initialize the ChromaDB client
 client = chromadb.Client()
 def build_database():
     # Read the CSV file
     df = pd.read_csv('collection_data.csv')
     # Create a collection
     collection_name = 'Dataset-10k-companies'
     # Create a new collection
     collection = client.create_collection(name=collection_name)
+    # Function to safely process embeddings
+    def process_embedding(x):
+        if isinstance(x, str):
+            return eval(x.replace(',,', ','))
+        elif isinstance(x, float):
+            return []  # or some default value
+        else:
+            return x
     # Add the data from the DataFrame to the collection
     collection.add(
         documents=df['documents'].tolist(),
         ids=df['ids'].tolist(),
         metadatas=df['metadatas'].apply(eval).tolist(),
+        embeddings=df['embeddings'].apply(process_embedding).tolist()
     )
     return collection