File size: 5,502 Bytes
736842d
b1a798e
736842d
 
b1a798e
 
13778dd
 
 
 
 
 
 
955af6a
f9e10ad
69983c5
 
 
 
 
f9e10ad
69983c5
faa37d1
2fbdaee
13778dd
faa37d1
 
 
 
 
13778dd
c2c40bf
2fbdaee
f8ac089
13778dd
 
 
 
 
475a4a4
 
 
 
 
 
13778dd
 
736842d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ae17c8
340cc83
 
 
3ae17c8
340cc83
 
 
3ae17c8
 
 
1cb0871
3ae17c8
1cb0871
3ae17c8
 
41b5bdf
77a6da5
 
1233a10
 
 
 
77a6da5
1233a10
 
 
 
 
77a6da5
1cb0871
3ae17c8
1cb0871
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ae17c8
1cb0871
 
 
 
 
3ceb12a
 
 
 
 
 
1cb0871
 
 
 
 
093848b
 
41b5bdf
 
 
5e4315c
 
 
 
41b5bdf
5e4315c
77a6da5
 
 
 
bda63f0
 
 
 
 
 
 
 
77a6da5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import hashlib
from io import StringIO

import pandas as pd
import streamlit as st
import weaviate
from weaviate import Client
from weaviate.embedded import EmbeddedOptions

def hybrid_search_weaviate(client, selected_class, query):
    """
    Search the objects of ``selected_class`` for ``query`` and return the hits.

    Despite the function name, the request built here is a ``Like``
    where-filter, not a hybrid (keyword + vector) query.

    Args:
        client: Weaviate client exposing ``query.get(...).with_where(...).do()``.
        selected_class: Name of the class to query; also the key under which
            the matching objects are looked up in the response.
        query: Text placed in the filter's ``valueString``.

    Returns:
        The list of objects found under ``data -> Get -> <selected_class>`` in
        the response, or an empty list when that path is missing or empty.
    """
    # The filter mapping is handed to with_where() directly: the builder needs
    # to see "path"/"operator" at the top level, not nested under "where".
    like_filter = {
        "path": ["*"],
        "operator": "Like",
        "valueString": query,
    }

    # NOTE(review): "*" is used both as the property selection and as the
    # filter path -- confirm the server accepts a wildcard here; otherwise
    # substitute the real property names of the class.
    results = client.query.get(selected_class, "*").with_where(like_filter).do()

    # Results are keyed by the queried class name; "data"/"Get" may be absent
    # or None when the request failed, so fall back to empty containers.
    get_section = (results.get('data') or {}).get('Get') or {}
    return get_section.get(selected_class) or []





def convert_to_tapas_format(data):
    """
    Convert Weaviate result objects into a header-plus-rows table.

    Args:
        data: Either a list of property dictionaries (the shape returned by
            ``hybrid_search_weaviate``) or a raw response dictionary shaped
            like ``{"data": {"Get": {<class name>: [objects...]}}}``.

    Returns:
        A list of lists: the first inner list holds the column names, each
        following list holds one row. Empty input yields ``[[]]``.
    """
    if isinstance(data, dict):
        # Raw response: collect the objects of every class listed under Get.
        per_class = (data.get('data') or {}).get('Get') or {}
        records = []
        for objects in per_class.values():
            records.extend(objects or [])
    else:
        records = list(data or [])

    # Objects may carry their properties under a 'thing' key (the only shape
    # this function originally accepted); unwrap those, use the rest as-is.
    rows = [
        obj['thing'] if isinstance(obj.get('thing'), dict) else obj
        for obj in records
    ]

    df = pd.DataFrame(rows)
    return [df.columns.tolist()] + df.values.tolist()

def initialize_weaviate_client():
    """Return a ``weaviate.Client`` built with default ``EmbeddedOptions``."""
    embedded = EmbeddedOptions()
    return weaviate.Client(embedded_options=embedded)

def class_exists(client, class_name):
    """Report whether ``class_name`` can be fetched from the client's schema.

    Args:
        client: Object exposing ``schema.get_class(class_name)``.
        class_name: Name of the class to look up.

    Returns:
        True when the lookup returns without raising, False when it raises
        any ``Exception``.
    """
    # NOTE(review): confirm the installed client really exposes
    # ``schema.get_class``; a missing attribute raises AttributeError, which
    # is caught below and reported as "does not exist".
    try:
        client.schema.get_class(class_name)
    except Exception:
        # Deliberately broad: any lookup failure counts as "not there".
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return False
    return True

def map_dtype_to_weaviate(dtype):
    """Map a dtype (or its name) to a Weaviate data type string.

    The decision is made on the lower-cased string form of ``dtype``, so
    capitalised dtype names such as ``Int64`` or ``Float64`` are recognised
    as well as ``int64`` / ``float64``.

    Args:
        dtype: Anything whose ``str()`` names the dtype, e.g. a pandas/numpy
            dtype object or a plain string.

    Returns:
        ``"int"``, ``"number"`` or ``"boolean"`` when the name contains
        ``int``, ``float`` or ``bool`` respectively (checked in that order);
        ``"string"`` for everything else.
    """
    dtype_name = str(dtype).lower()
    if "int" in dtype_name:
        return "int"
    if "float" in dtype_name:
        return "number"
    if "bool" in dtype_name:
        return "boolean"
    return "string"

def create_new_class_schema(client, class_name, class_description):
    """Create a class with no properties in the Weaviate schema.

    The outcome is reported through Streamlit: a success message when the
    schema call returns, an error message carrying the exception text when
    anything in the attempt raises. Nothing is returned.

    Args:
        client: Object exposing ``schema.create(schema_dict)``.
        class_name: Name of the class to create.
        class_description: Description stored with the class.
    """
    payload = {
        "classes": [
            {
                "class": class_name,
                "description": class_description,
                "properties": [],
            },
        ],
    }
    try:
        client.schema.create(payload)
        st.success(f"Class {class_name} created successfully!")
    except Exception as exc:
        st.error(f"Error creating class: {exc}")
        
def ingest_data_to_weaviate(client, csv_file, selected_class):
    """Load an uploaded CSV into Weaviate under ``selected_class``.

    Steps, in order:
      1. Decode the upload as UTF-8 and parse it with pandas.
      2. Store one whole-table object holding the table embedding, the CSV
         text and an MD5 fingerprint of that text.
      3. If the class has no properties yet, create one property per CSV
         column; otherwise require the CSV columns to equal the existing
         property names.
      4. Create one data object per CSV row, reporting each failure in the UI.
      5. Show a summary and a preview of the data in Streamlit.

    Args:
        client: Weaviate client exposing ``data_object.create``,
            ``schema.property.create`` and ``schema.get``.
        csv_file: Binary file-like object whose ``read()`` returns the CSV
            as UTF-8 encoded bytes.
        selected_class: Name of the target class.

    Returns:
        The parsed DataFrame, or ``None`` when the CSV columns do not match
        the existing schema (an error is shown and no rows are ingested).
    """
    # Parse the upload.
    csv_text = csv_file.read().decode("utf-8")
    dataframe = pd.read_csv(StringIO(csv_text))

    # NOTE(review): `tapas_utils` is not imported in the visible part of this
    # file, so this call raises NameError unless the name is provided
    # elsewhere -- TODO confirm the intended module and import it.
    embedded_table = tapas_utils.embed_table(dataframe)

    # Serialize once: the same text is fingerprinted and stored as content.
    table_csv = dataframe.to_csv(index=False)
    # MD5 serves only as a content fingerprint here, not as a security measure.
    table_id = hashlib.md5(table_csv.encode()).hexdigest()

    # NOTE(review): "id" is sent as an ordinary property and this object is
    # written before the schema check below -- confirm both are intended.
    client.data_object.create({
        "id": table_id,
        "embeddedTable": embedded_table.tolist(),
        "content": table_csv,
    }, selected_class)

    # Fetch the schema for the selected class.
    class_schema = get_class_schema(client, selected_class)

    if not class_schema or not class_schema.get("properties"):
        # No properties yet: derive them from the CSV columns.
        for column_name, data_type in zip(dataframe.columns, dataframe.dtypes):
            property_schema = {
                "name": column_name,
                "description": f"Property for {column_name}",
                "dataType": [map_dtype_to_weaviate(data_type)],
            }
            try:
                client.schema.property.create(selected_class, property_schema)
            except weaviate.exceptions.SchemaValidationException:
                # Best effort: the property might already exist, so continue.
                pass
    else:
        # Existing properties: the CSV must provide exactly the same columns.
        schema_columns = {prop["name"] for prop in class_schema["properties"]}
        if set(dataframe.columns) != schema_columns:
            st.error("The columns in the uploaded CSV do not match the schema of the selected class. Please check and upload the correct CSV or create a new class.")
            return None

    # Ingest the rows one by one so a bad record does not abort the rest.
    records = dataframe.to_dict(orient="records")
    failed = 0
    for record in records:
        try:
            client.data_object.create(record, selected_class)
        except Exception as e:
            failed += 1
            st.error(f"Error ingesting record: {e}")

    # Only claim success when every row made it in.
    if failed:
        st.warning(f"{failed} of {len(records)} records could not be ingested into the class '{selected_class}'")
    else:
        st.write(f"Your CSV was successfully integrated into the vector database under the class '{selected_class}'")
    st.write(dataframe.head())  # Preview of the first few rows

    return dataframe

def get_class_schema(client, class_name):
    """Return the schema entry whose ``"class"`` equals ``class_name``.

    Args:
        client: Object exposing ``schema.get()``, which returns a mapping
            with a ``"classes"`` list.
        class_name: Name of the class to look up.

    Returns:
        The first matching class dictionary, or ``None`` when no entry
        matches or a ``SchemaValidationException`` is raised.
    """
    try:
        all_classes = client.schema.get()["classes"]
        return next(
            (entry for entry in all_classes if entry["class"] == class_name),
            None,
        )
    except weaviate.exceptions.SchemaValidationException:
        return None

def retrieve_relevant_table(client, selected_class, question_embedding):
    """Fetch the best-matching stored table for a question as a DataFrame.

    Args:
        client: Weaviate client exposing
            ``query.get(...).with_near_text(...).with_limit(...).do()``.
        selected_class: Name of the class holding the stored tables; also the
            key under which hits are looked up in the response.
        question_embedding: Value handed unchanged to ``with_near_text``.

    Returns:
        A DataFrame parsed from the ``content`` property (CSV text) of the
        first hit.

    Raises:
        IndexError: If the response contains no objects for ``selected_class``.
    """
    # NOTE(review): the argument name suggests a vector, yet it is passed to
    # with_near_text -- confirm what callers supply and whether a near-vector
    # search is what is actually wanted.
    results = (
        client.query
        .get(selected_class, ["content"])
        .with_near_text(question_embedding)
        .with_limit(1)  # only the top hit is used
        .do()
    )

    # Hits are keyed by the queried class name; "data"/"Get" may be absent or
    # None when the request failed.
    get_section = (results.get('data') or {}).get('Get') or {}
    hits = get_section.get(selected_class) or []
    if not hits:
        raise IndexError(
            f"No table found in class '{selected_class}' for the given question"
        )

    # The stored content is the table's CSV text; parse it back.
    table_content = hits[0].get('content')
    return pd.read_csv(StringIO(table_content))