Update app.py
Browse files
app.py
CHANGED
@@ -12,17 +12,10 @@ model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
12 |
# Initialize the ChromaDB client
|
13 |
client = chromadb.Client()
|
14 |
|
15 |
-
# Function to build the database from CSV
|
16 |
def build_database():
|
17 |
# Read the CSV file
|
18 |
df = pd.read_csv('collection_data.csv')
|
19 |
|
20 |
-
# Drop unnecessary columns if they exist
|
21 |
-
if 'uris' in df.columns:
|
22 |
-
df.drop('uris', axis=1, inplace=True)
|
23 |
-
if 'data' in df.columns:
|
24 |
-
df.drop('data', axis=1, inplace=True)
|
25 |
-
|
26 |
# Create a collection
|
27 |
collection_name = 'Dataset-10k-companies'
|
28 |
|
@@ -33,12 +26,21 @@ def build_database():
|
|
33 |
# Create a new collection
|
34 |
collection = client.create_collection(name=collection_name)
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
# Add the data from the DataFrame to the collection
|
37 |
collection.add(
|
38 |
documents=df['documents'].tolist(),
|
39 |
ids=df['ids'].tolist(),
|
40 |
metadatas=df['metadatas'].apply(eval).tolist(),
|
41 |
-
embeddings=df['embeddings'].apply(
|
42 |
)
|
43 |
|
44 |
return collection
|
|
|
12 |
# Initialize the ChromaDB client
|
13 |
client = chromadb.Client()
|
14 |
|
|
|
15 |
def build_database():
|
16 |
# Read the CSV file
|
17 |
df = pd.read_csv('collection_data.csv')
|
18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
# Create a collection
|
20 |
collection_name = 'Dataset-10k-companies'
|
21 |
|
|
|
26 |
# Create a new collection
|
27 |
collection = client.create_collection(name=collection_name)
|
28 |
|
29 |
+
# Function to safely process embeddings
|
30 |
+
def process_embedding(x):
    """Convert one cell of the ``embeddings`` column into a Python value.

    Args:
        x: Raw cell value. A string is parsed as a Python literal
            (for example ``"[0.1, 0.2]"``); any other value is handled
            as described under Returns.

    Returns:
        The parsed literal when ``x`` is a string, an empty list when
        ``x`` is a float, and ``x`` itself otherwise.

    Raises:
        ValueError: If a string cell is not a plain Python literal.
        SyntaxError: If a string cell is not syntactically valid.
    """
    # Function-scope import so this helper does not depend on the
    # file's top-level import block.
    import ast

    if isinstance(x, str):
        # Collapse doubled commas before parsing; a literal such as
        # "[1,,2]" is otherwise a syntax error.
        cleaned = x.replace(',,', ',').strip()
        # SECURITY: this previously used eval(), which executes any
        # expression found in the CSV. literal_eval only accepts
        # literals (numbers, strings, lists, tuples, dicts, ...), so a
        # crafted cell can no longer run code.
        return ast.literal_eval(cleaned)
    if isinstance(x, float):
        # NOTE(review): every float maps to an empty list; presumably
        # this targets NaN from empty CSV cells -- confirm, and confirm
        # that an empty embedding is acceptable to the collection.
        return []
    return x
|
37 |
+
|
38 |
# Add the data from the DataFrame to the collection
|
39 |
collection.add(
|
40 |
documents=df['documents'].tolist(),
|
41 |
ids=df['ids'].tolist(),
|
42 |
metadatas=df['metadatas'].apply(eval).tolist(),
|
43 |
+
embeddings=df['embeddings'].apply(process_embedding).tolist()
|
44 |
)
|
45 |
|
46 |
return collection
|