Spaces:
Build error
Weaviate schema based on user input and csv upload
Browse files
UI Input for Class and Description:
We'll add an input field in the Streamlit app where users can define the class name and description of the CSV they're uploading.
Auto-Populate Schema from CSV:
Once the CSV is uploaded, we'll read its headers to determine the column names. We can then use simple heuristics to determine the data type of each column (e.g., if a column contains only numbers, it's likely a float or int, if it matches date patterns, it's a date, otherwise, it's a string). We'll also use the column names as descriptions for simplicity, but this can be enhanced further if needed.
Create Schema in Weaviate:
Using the class name, description, and the auto-populated properties, we'll define the schema and create it in Weaviate.
Ingest Data:
Once the schema is created, we can then ingest the data from the CSV into Weaviate.
@@ -16,20 +16,46 @@ client = weaviate.Client(
|
|
16 |
embedded_options=EmbeddedOptions()
|
17 |
)
|
18 |
|
19 |
-
|
20 |
-
|
21 |
-
for
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
obj = {
|
23 |
-
"class":
|
24 |
-
"id": str(index),
|
25 |
"properties": row.to_dict()
|
26 |
}
|
27 |
-
|
|
|
28 |
|
29 |
-
# Function to query data from Weaviate
|
30 |
def query_weaviate(question):
|
31 |
# This is a basic example; adapt the query based on the question
|
32 |
-
results = client.query.get(
|
33 |
return results
|
34 |
|
35 |
def ask_llm_chunk(chunk, questions):
|
@@ -89,20 +115,25 @@ def summarize_map_reduce(data, questions):
|
|
89 |
all_answers.extend(chunk_answers)
|
90 |
return all_answers
|
91 |
|
92 |
-
st.title("TAPAS Table Question Answering with Weaviate")
|
|
|
|
|
|
|
|
|
93 |
|
94 |
# Upload CSV data
|
95 |
csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
|
96 |
if csv_file is not None:
|
97 |
data = csv_file.read().decode("utf-8")
|
98 |
dataframe = pd.read_csv(StringIO(data))
|
99 |
-
|
100 |
-
# Ingest data into Weaviate
|
101 |
-
ingest_data_to_weaviate(dataframe)
|
102 |
-
|
103 |
st.write("CSV Data Preview:")
|
104 |
st.write(dataframe.head())
|
105 |
|
|
|
|
|
|
|
|
|
|
|
106 |
# Input for questions
|
107 |
questions = st.text_area("Enter your questions (one per line)")
|
108 |
questions = questions.split("\n") # split questions by line
|
@@ -110,14 +141,7 @@ if csv_file is not None:
|
|
110 |
|
111 |
if st.button("Submit"):
|
112 |
if data and questions:
|
113 |
-
|
114 |
-
relevant_data = query_weaviate(questions[0]) # Example: using the first question
|
115 |
-
# Convert the relevant data to a DataFrame (you might need to adjust this based on the Weaviate response format)
|
116 |
-
relevant_df = pd.DataFrame(relevant_data)
|
117 |
-
|
118 |
-
# Pass the relevant data to TAPAS
|
119 |
-
answers = summarize_map_reduce(relevant_df, questions)
|
120 |
-
|
121 |
st.write("Answers:")
|
122 |
for q, a in zip(questions, answers):
|
123 |
st.write(f"Question: {q}")
|
|
|
16 |
embedded_options=EmbeddedOptions()
|
17 |
)
|
18 |
|
19 |
+
def ingest_data_to_weaviate(dataframe, class_name, class_description):
    """Create a Weaviate class from a DataFrame's columns and load its rows.

    One property is declared per column, using the column name as both the
    property name and its description. The class is registered through
    ``client.schema.create`` and every row is then added as one object.

    Args:
        dataframe: pandas DataFrame holding the uploaded CSV.
        class_name: Name of the Weaviate class to create.
        class_description: Free-text description stored on the class.

    Raises:
        ValueError: If ``class_name`` is empty or only whitespace.
    """
    if not class_name or not class_name.strip():
        raise ValueError("class_name must be a non-empty string")

    properties = [
        {
            "name": column,
            "description": column,
            "dataType": [_weaviate_data_type(dataframe[column])],
        }
        for column in dataframe.columns
    ]

    schema = {
        "classes": [
            {
                "class": class_name,
                "description": class_description,
                "properties": properties,
            }
        ]
    }

    # Create Schema in Weaviate
    client.schema.create(schema)

    # Ingest Data
    # to_dict(orient="records") keeps each column's own type; iterrows()
    # would coerce every row to a single dtype (ints become floats), which
    # can disagree with the "int" properties declared above.
    records = dataframe.to_dict(orient="records")

    # The batch context manager sends any pending objects when it exits.
    with client.batch as batch:
        for record in records:
            # Missing cells are omitted: NaN cannot be encoded as valid JSON.
            data_object = {
                key: value for key, value in record.items() if pd.notna(value)
            }
            batch.add_data_object(
                data_object=data_object,
                class_name=class_name,
            )


def _weaviate_data_type(series):
    """Return the Weaviate property data type matching a pandas column."""
    if pd.api.types.is_bool_dtype(series):
        return "boolean"
    if pd.api.types.is_integer_dtype(series):
        return "int"
    if pd.api.types.is_float_dtype(series):
        # Weaviate calls floating-point properties "number"; "float" is not
        # an accepted data type.
        return "number"
    return "string"
|
55 |
|
|
|
56 |
def query_weaviate(question):
    """Run a near-text search for ``question`` against the current class.

    The class is taken from the module-level ``class_name`` value entered in
    the Streamlit UI. All properties declared on that class are requested.

    Args:
        question: Natural-language text to search for.

    Returns:
        The response returned by the Weaviate client for the query.
    """
    # This is a basic example; adapt the query based on the question
    # A Get query must name the fields to return, so they are read back
    # from the stored class definition.
    class_schema = client.schema.get(class_name)
    property_names = [
        prop["name"] for prop in class_schema.get("properties", [])
    ]

    # nearText takes a mapping with a "concepts" list, not a bare string.
    # NOTE(review): nearText presumably needs a text vectorizer module to be
    # enabled on the Weaviate instance -- confirm for the embedded setup.
    results = (
        client.query.get(class_name, property_names)
        .with_near_text({"concepts": [question]})
        .do()
    )
    return results
|
60 |
|
61 |
def ask_llm_chunk(chunk, questions):
|
|
|
115 |
all_answers.extend(chunk_answers)
|
116 |
return all_answers
|
117 |
|
118 |
+
st.title("TAPAS Table Question Answering with Weaviate Integration")
|
119 |
+
|
120 |
+
# UI Input for Class and Description
|
121 |
+
class_name = st.text_input("Enter the class name for your CSV data:")
|
122 |
+
class_description = st.text_input("Enter a description for your class:")
|
123 |
|
124 |
# Upload CSV data
|
125 |
csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
|
126 |
if csv_file is not None:
|
127 |
data = csv_file.read().decode("utf-8")
|
128 |
dataframe = pd.read_csv(StringIO(data))
|
|
|
|
|
|
|
|
|
129 |
st.write("CSV Data Preview:")
|
130 |
st.write(dataframe.head())
|
131 |
|
132 |
+
# Ingest data to Weaviate
|
133 |
+
if st.button("Ingest to Weaviate"):
|
134 |
+
ingest_data_to_weaviate(dataframe, class_name, class_description)
|
135 |
+
st.write("Data ingested successfully!")
|
136 |
+
|
137 |
# Input for questions
|
138 |
questions = st.text_area("Enter your questions (one per line)")
|
139 |
questions = questions.split("\n") # split questions by line
|
|
|
141 |
|
142 |
if st.button("Submit"):
|
143 |
if data and questions:
|
144 |
+
answers = summarize_map_reduce(data, questions)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
st.write("Answers:")
|
146 |
for q, a in zip(questions, answers):
|
147 |
st.write(f"Question: {q}")
|