TAPAS_WTQ_Chunking / weaviate_utils.py
jskinner215's picture
Update weaviate_utils.py
f8ac089
raw
history blame
4.51 kB
import weaviate
import streamlit as st
from weaviate.embedded import EmbeddedOptions
from weaviate import Client
import pandas as pd # <-- Add this import
from io import StringIO # <-- Add this import
import pandas as pd
def hybrid_search_weaviate(client, selected_class, query):
    """
    Search a Weaviate class for objects whose textual properties match ``query``.

    The match is a ``Like`` filter applied to every ``text``/``string`` property
    of the class (``*`` and ``?`` in ``query`` act as wildcards). Despite the
    function name, no vector or keyword ranking is requested.

    Args:
        client: Weaviate client exposing ``schema.get`` and ``query.get``.
        selected_class: Name of the class to search.
        query: Pattern handed to the ``Like`` operator unchanged.

    Returns:
        A list of dictionaries, one per matching object, keyed by property
        name. Empty when the class is unknown, has no textual properties, or
        nothing matches.
    """
    # GraphQL has no "*" wildcard for selected fields or filter paths, so the
    # real property names have to come from the class schema.
    try:
        class_schema = client.schema.get(selected_class) or {}
    except weaviate.exceptions.UnexpectedStatusCodeException:
        # Unknown class: there is nothing to search.
        return []
    properties = class_schema.get("properties") or []
    property_names = [prop["name"] for prop in properties]

    # Like only applies to textual properties, and the value key depends on
    # which of the two textual data types the property uses.
    conditions = []
    for prop in properties:
        data_types = prop.get("dataType") or []
        if "text" in data_types:
            value_key = "valueText"
        elif "string" in data_types:
            value_key = "valueString"
        else:
            continue
        conditions.append(
            {"path": [prop["name"]], "operator": "Like", value_key: query}
        )
    if not conditions:
        return []

    # with_where takes the filter itself, not a {"where": ...} wrapper.
    if len(conditions) == 1:
        where_filter = conditions[0]
    else:
        where_filter = {"operator": "Or", "operands": conditions}

    results = (
        client.query.get(selected_class, property_names)
        .with_where(where_filter)
        .do()
    )

    # Results are keyed by the class name as stored in the schema, which may
    # differ in capitalisation from what the caller passed in.
    stored_name = class_schema.get("class", selected_class)
    payload = (results or {}).get("data") or {}
    per_class = payload.get("Get") or {}
    return per_class.get(stored_name) or []
def convert_to_tapas_format(data):
    """
    Convert Weaviate search results into a header-plus-rows table.

    Args:
        data: Either a list of record dictionaries (what
            ``hybrid_search_weaviate`` returns) or a raw GraphQL response of
            the form ``{"data": {"Get": {<name>: [...]}}}``. Records wrapped
            as ``{"thing": {...}}`` are unwrapped.

    Returns:
        A list of lists: the first row holds the column names, the remaining
        rows hold the values. An empty input yields ``[[]]``.
    """
    if isinstance(data, dict):
        # Raw response: gather the objects listed under every key of "Get".
        per_class = (data.get('data') or {}).get('Get') or {}
        records = [obj for objects in per_class.values() for obj in (objects or [])]
    else:
        # Already a list of records (None is treated as "no results").
        records = list(data or [])

    # Unwrap the legacy {"thing": {...}} envelope when present.
    rows = [
        obj['thing'] if isinstance(obj, dict) and isinstance(obj.get('thing'), dict) else obj
        for obj in records
    ]

    df = pd.DataFrame(rows)
    return [df.columns.tolist()] + df.values.tolist()
def initialize_weaviate_client():
    """
    Build a Weaviate client configured with default ``EmbeddedOptions``.

    Returns:
        The newly constructed ``weaviate.Client``.
    """
    embedded = EmbeddedOptions()
    return weaviate.Client(embedded_options=embedded)
def class_exists(client, class_name):
    """
    Report whether ``class_name`` can be fetched from the Weaviate schema.

    Args:
        client: Weaviate client exposing ``schema.get``.
        class_name: Name of the class to look up.

    Returns:
        True when the schema lookup returns a class definition; False when
        the lookup fails or returns nothing.
    """
    try:
        # schema.get(class_name) is the per-class lookup; an empty/None result
        # is treated the same as a failed lookup.
        return bool(client.schema.get(class_name))
    except Exception:
        # Best-effort check: any lookup error means "not available". Only
        # Exception is caught so Ctrl-C / SystemExit still propagate.
        return False
def map_dtype_to_weaviate(dtype):
    """
    Map a pandas/NumPy dtype to a Weaviate property data type name.

    Uses the pandas dtype predicates rather than substring matching on the
    dtype name, so nullable extension dtypes such as ``Int64`` / ``Float64``
    are recognised and names that merely contain "int" (for example
    ``interval[int64]``) are not misclassified.

    Args:
        dtype: A dtype object or dtype name, e.g. an entry of
            ``DataFrame.dtypes``.

    Returns:
        ``"boolean"``, ``"int"``, ``"number"``, or ``"string"`` for anything
        else.
    """
    dtype_checks = pd.api.types
    if dtype_checks.is_bool_dtype(dtype):
        return "boolean"
    if dtype_checks.is_integer_dtype(dtype):
        return "int"
    if dtype_checks.is_float_dtype(dtype):
        return "number"
    return "string"
def create_new_class_schema(client, class_name, class_description):
    """
    Create a class with no properties in the Weaviate schema.

    The outcome is reported through Streamlit: a success banner when the
    schema call goes through, an error banner carrying the exception text
    otherwise. Nothing is returned and no exception is propagated from the
    schema call.

    Args:
        client: Weaviate client exposing ``schema.create``.
        class_name: Name of the class to create.
        class_description: Description stored with the class.
    """
    schema_payload = {
        "classes": [
            {
                "class": class_name,
                "description": class_description,
                "properties": [],
            }
        ]
    }
    try:
        client.schema.create(schema_payload)
        st.success(f"Class {class_name} created successfully!")
    except Exception as err:
        st.error(f"Error creating class: {err}")
def ingest_data_to_weaviate(client, csv_file, selected_class):
    """
    Load an uploaded CSV into a Weaviate class and preview it in Streamlit.

    When the class has no properties yet, one property per CSV column is
    created (type chosen by ``map_dtype_to_weaviate``). Otherwise the CSV
    columns must match the existing property names exactly. Each row is then
    stored as one data object; per-row failures are shown via ``st.error``
    and do not stop the remaining rows.

    Args:
        client: Weaviate client exposing ``schema.property.create`` and
            ``data_object.create``.
        csv_file: Binary file-like object whose ``read()`` returns UTF-8
            encoded CSV bytes.
        selected_class: Name of the target class.

    Returns:
        The parsed DataFrame, or None when the CSV columns do not match the
        schema of an existing, non-empty class.
    """
    # utf-8-sig decodes plain UTF-8 too, but strips a leading BOM that would
    # otherwise become part of the first column name.
    raw_text = csv_file.read().decode("utf-8-sig")
    dataframe = pd.read_csv(StringIO(raw_text))

    # Fetch the schema for the selected class
    class_schema = get_class_schema(client, selected_class)
    existing_properties = (class_schema or {}).get("properties") or []

    if not existing_properties:
        # Empty schema: derive the properties from the CSV columns.
        for column_name, data_type in zip(dataframe.columns, dataframe.dtypes):
            property_schema = {
                "name": column_name,
                "description": f"Property for {column_name}",
                "dataType": [map_dtype_to_weaviate(data_type)]
            }
            try:
                client.schema.property.create(selected_class, property_schema)
            except weaviate.exceptions.SchemaValidationException:
                # Property might already exist, so we can continue
                pass
            except weaviate.exceptions.UnexpectedStatusCodeException as e:
                # The server rejected the property (e.g. invalid name);
                # report it instead of aborting the whole upload.
                st.error(f"Error creating property '{column_name}': {e}")
    else:
        # Non-empty schema: the CSV must provide exactly the same columns.
        schema_columns = [prop["name"] for prop in existing_properties]
        if set(dataframe.columns) != set(schema_columns):
            st.error("The columns in the uploaded CSV do not match the schema of the selected class. Please check and upload the correct CSV or create a new class.")
            return

    # Ingest the data into Weaviate, one object per row.
    records = dataframe.to_dict(orient="records")
    failed = 0
    for record in records:
        # Blank CSV cells are NaN after read_csv; NaN is not valid JSON, so
        # leave those properties out of the object.
        cleaned = {key: value for key, value in record.items() if pd.notna(value)}
        try:
            client.data_object.create(cleaned, selected_class)
        except Exception as e:
            failed += 1
            st.error(f"Error ingesting record: {e}")

    # Only claim success when every row actually went in.
    if failed:
        st.warning(f"{failed} of {len(records)} records could not be ingested into the class '{selected_class}'.")
    else:
        st.write(f"Your CSV was successfully integrated into the vector database under the class '{selected_class}'")
    st.write(dataframe.head())  # Display the first few rows of the dataframe as a preview

    # Return the dataframe for preview
    return dataframe
def get_class_schema(client, class_name):
    """
    Find the definition of ``class_name`` in the full Weaviate schema.

    Args:
        client: Weaviate client exposing ``schema.get``.
        class_name: Exact class name to look for.

    Returns:
        The class definition dictionary whose ``"class"`` entry equals
        ``class_name``, or None when no class matches or a
        ``SchemaValidationException`` is raised during the lookup.
    """
    try:
        known_classes = client.schema.get()["classes"]
        matches = (entry for entry in known_classes if entry["class"] == class_name)
        # First match wins; None when the generator is exhausted.
        return next(matches, None)
    except weaviate.exceptions.SchemaValidationException:
        return None