Spaces:

jskinner215
/

TAPAS_WTQ_Chunking

Build error

File size: 4,511 Bytes

736842d
b1a798e
736842d
 
b1a798e
 
13778dd
 
 
 
 
 
 
955af6a
f9e10ad
69983c5
 
 
 
 
f9e10ad
69983c5
faa37d1
2fbdaee
13778dd
faa37d1
 
 
 
 
13778dd
c2c40bf
2fbdaee
f8ac089
13778dd
 
 
 
 
475a4a4
 
 
 
 
 
13778dd
 
736842d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ae17c8
340cc83
 
 
3ae17c8
340cc83
 
 
3ae17c8
 
 
1cb0871
3ae17c8
1cb0871
3ae17c8
 
41b5bdf
1cb0871
3ae17c8
1cb0871
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ae17c8
1cb0871
 
 
 
 
3ceb12a
 
 
 
 
 
1cb0871
 
 
 
 
093848b
 
41b5bdf
 
 
5e4315c
 
 
 
41b5bdf
5e4315c

import weaviate
import streamlit as st
from weaviate.embedded import EmbeddedOptions
from weaviate import Client
import pandas as pd  # <-- Add this import
from io import StringIO  # <-- Add this import
import pandas as pd

def hybrid_search_weaviate(client, selected_class, query):
    """
    Search a Weaviate class with a ``Like`` filter and return the matches.

    Despite the name, this issues a plain ``where`` filter rather than a
    combined keyword/vector search.

    Args:
        client: Weaviate client exposing ``query.get(...).with_where(...).do()``.
        selected_class: Name of the class to query.
        query: Pattern used as the ``valueString`` of the ``Like`` filter.

    Returns:
        The list of objects found under ``data -> Get -> <selected_class>`` in
        the response, or an empty list when that entry is missing or empty
        (for example when the response only carries ``errors``).
    """
    # with_where() takes the filter itself; it must not be nested under an
    # additional "where" key.
    where_filter = {
        "path": ["*"],
        "operator": "Like",
        "valueString": query,
    }

    # NOTE(review): "*" is used both as the property selection and as the
    # filter path; confirm the server accepts a wildcard in these positions,
    # otherwise explicit property names have to be supplied here.
    results = client.query.get(selected_class, "*").with_where(where_filter).do()

    # Get results are keyed by the name of the queried class.
    get_section = ((results or {}).get("data") or {}).get("Get") or {}
    return get_section.get(selected_class) or []





def convert_to_tapas_format(data):
    """
    Convert Weaviate records into a table made of a header row and data rows.

    Args:
        data: Either a list of record dictionaries (the shape returned by
            ``hybrid_search_weaviate``), or a raw query response dictionary
            shaped like ``{"data": {"Get": {<name>: [records...]}}}``.

    Returns:
        A list of lists: the first row holds the column names, every
        following row holds the values of one record. Empty input yields
        ``[[]]`` (an empty header and no data rows).
    """
    if isinstance(data, dict):
        # Raw response: gather the records of every entry under "Get".
        get_section = (data.get("data") or {}).get("Get") or {}
        data_objects = [
            obj for objs in get_section.values() for obj in (objs or [])
        ]
    else:
        data_objects = list(data or [])

    # Earlier callers passed records wrapped in a "thing" entry; keep
    # honouring that wrapper when it is present, otherwise use the record.
    records = [
        obj["thing"]
        if isinstance(obj, dict) and isinstance(obj.get("thing"), dict)
        else obj
        for obj in data_objects
    ]

    df = pd.DataFrame(records)
    return [df.columns.tolist()] + df.values.tolist()

def initialize_weaviate_client():
    """Build and return a Weaviate client that uses default embedded options."""
    embedded = EmbeddedOptions()
    client = weaviate.Client(embedded_options=embedded)
    return client

def class_exists(client, class_name):
    """
    Report whether a class named ``class_name`` is present in the schema.

    Args:
        client: Weaviate client exposing ``schema.get()``.
        class_name: Class name to look for (compared exactly).

    Returns:
        ``True`` when the schema lists a class with that name, ``False``
        when it does not or when the schema could not be fetched.
    """
    try:
        # Same schema call that get_class_schema relies on.
        schema = client.schema.get()
    except Exception:
        # Best-effort check: an unreachable or failing server is reported as
        # "class not found" instead of propagating. Unlike a bare ``except``,
        # this lets KeyboardInterrupt/SystemExit through.
        return False

    classes = (schema or {}).get("classes") or []
    return any(cls.get("class") == class_name for cls in classes)

def map_dtype_to_weaviate(dtype):
    """
    Map a column dtype to a Weaviate property data type name.

    The decision is made on the dtype's string form, compared
    case-insensitively so that capitalised dtype names such as ``Int64``,
    ``UInt8`` or ``Float64`` are recognised as well.

    Args:
        dtype: A dtype object or dtype name; only ``str(dtype)`` is inspected.

    Returns:
        ``"int"``, ``"number"`` or ``"boolean"`` for integer, float and
        boolean dtypes respectively, and ``"string"`` for everything else.
    """
    dtype_name = str(dtype).lower()
    if "int" in dtype_name:
        return "int"
    if "float" in dtype_name:
        return "number"
    if "bool" in dtype_name:
        return "boolean"
    return "string"

def create_new_class_schema(client, class_name, class_description):
    """
    Create a class without properties in Weaviate and report the outcome.

    A success or error message is shown through Streamlit; nothing is
    returned and failures are not re-raised.
    """
    schema_payload = {
        "classes": [
            {
                "class": class_name,
                "description": class_description,
                "properties": [],
            },
        ],
    }
    try:
        client.schema.create(schema_payload)
        st.success(f"Class {class_name} created successfully!")
    except Exception as e:
        st.error(f"Error creating class: {e}")
        
def ingest_data_to_weaviate(client, csv_file, selected_class):
    """
    Load an uploaded CSV into Weaviate as objects of ``selected_class``.

    When the class has no properties yet, one property per CSV column is
    created, typed via ``map_dtype_to_weaviate``. Otherwise the CSV column
    names must equal the existing property names (compared as sets); on a
    mismatch an error is shown and nothing is ingested.

    Args:
        client: Weaviate client.
        csv_file: Binary file-like object whose ``read()`` returns UTF-8
            encoded CSV bytes.
        selected_class: Name of the target class.

    Returns:
        The parsed DataFrame, or ``None`` when the CSV columns do not match
        the existing schema.
    """
    # "utf-8-sig" decodes plain UTF-8 and additionally strips a leading BOM,
    # which would otherwise become part of the first column name.
    data = csv_file.read().decode("utf-8-sig")
    dataframe = pd.read_csv(StringIO(data))

    # A missing class (None) or a class without a "properties" entry is
    # treated the same as an empty property list.
    class_schema = get_class_schema(client, selected_class)
    existing_properties = (class_schema or {}).get("properties") or []

    if not existing_properties:
        # Empty schema: derive one property per CSV column.
        for column_name, data_type in zip(dataframe.columns, dataframe.dtypes):
            property_schema = {
                "name": column_name,
                "description": f"Property for {column_name}",
                "dataType": [map_dtype_to_weaviate(data_type)],
            }
            try:
                client.schema.property.create(selected_class, property_schema)
            except (
                weaviate.exceptions.SchemaValidationException,
                weaviate.exceptions.UnexpectedStatusCodeException,
            ) as e:
                # Best effort: the property might already exist or be
                # rejected by the server; report it and keep going.
                st.warning(f"Could not create property '{column_name}': {e}")
    else:
        # Existing schema: the CSV must provide exactly the same columns.
        schema_columns = {prop["name"] for prop in existing_properties}
        if set(dataframe.columns) != schema_columns:
            st.error("The columns in the uploaded CSV do not match the schema of the selected class. Please check and upload the correct CSV or create a new class.")
            return None

    # Ingest the data into Weaviate, one object per CSV row.
    failed = 0
    for record in dataframe.to_dict(orient="records"):
        # Empty CSV cells are read as NaN, which is not valid JSON; leave
        # those fields out of the object instead of sending them.
        properties = {
            key: value for key, value in record.items() if pd.notna(value)
        }
        try:
            client.data_object.create(properties, selected_class)
        except Exception as e:
            failed += 1
            st.error(f"Error ingesting record: {e}")

    # Only claim success when every record was actually created.
    if failed:
        st.warning(
            f"{failed} of {len(dataframe)} records could not be ingested "
            f"into the class '{selected_class}'."
        )
    else:
        st.write(f"Your CSV was successfully integrated into the vector database under the class '{selected_class}'")
    st.write(dataframe.head())  # Preview of the first few rows

    return dataframe

def get_class_schema(client, class_name):
    """
    Look up the schema entry of a single class.

    Fetches the full schema from the client and returns the first entry
    whose ``"class"`` value equals ``class_name``. Returns ``None`` when no
    entry matches or when a ``SchemaValidationException`` is raised.
    """
    try:
        return next(
            (
                entry
                for entry in client.schema.get()["classes"]
                if entry["class"] == class_name
            ),
            None,
        )
    except weaviate.exceptions.SchemaValidationException:
        return None