jskinner215 committed on
Commit
b9d05c0
·
1 Parent(s): 46ad3c2

Weaviate schema based on user input and csv upload

Browse files

UI Input for Class and Description:
We'll add an input field in the Streamlit app where users can define the class name and description of the CSV they're uploading.

Auto-Populate Schema from CSV:
Once the CSV is uploaded, we'll read its headers to determine the column names. We can then use simple heuristics to determine the data type of each column (e.g., if a column contains only numbers, it's likely a float or an int; if it matches date patterns, it's a date; otherwise, it's a string). For simplicity, we'll also use the column names as the property descriptions, but this can be enhanced later if needed.

Create Schema in Weaviate:
Using the class name, description, and the auto-populated properties, we'll define the schema and create it in Weaviate.

Ingest Data:
Once the schema is created, we can then ingest the data from the CSV into Weaviate.

Files changed (1) hide show
  1. app.py +45 -21
app.py CHANGED
@@ -16,20 +16,46 @@ client = weaviate.Client(
16
  embedded_options=EmbeddedOptions()
17
  )
18
 
19
- # Function to ingest data into Weaviate
20
- def ingest_data_to_weaviate(dataframe):
21
- for index, row in dataframe.iterrows():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  obj = {
23
- "class": "YourClassName",
24
- "id": str(index),
25
  "properties": row.to_dict()
26
  }
27
- client.data_object.create(obj)
 
28
 
29
- # Function to query data from Weaviate
30
  def query_weaviate(question):
31
  # This is a basic example; adapt the query based on the question
32
- results = client.query.get('YourClassName').with_near_text(question).do()
33
  return results
34
 
35
  def ask_llm_chunk(chunk, questions):
@@ -89,20 +115,25 @@ def summarize_map_reduce(data, questions):
89
  all_answers.extend(chunk_answers)
90
  return all_answers
91
 
92
- st.title("TAPAS Table Question Answering with Weaviate")
 
 
 
 
93
 
94
  # Upload CSV data
95
  csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
96
  if csv_file is not None:
97
  data = csv_file.read().decode("utf-8")
98
  dataframe = pd.read_csv(StringIO(data))
99
-
100
- # Ingest data into Weaviate
101
- ingest_data_to_weaviate(dataframe)
102
-
103
  st.write("CSV Data Preview:")
104
  st.write(dataframe.head())
105
 
 
 
 
 
 
106
  # Input for questions
107
  questions = st.text_area("Enter your questions (one per line)")
108
  questions = questions.split("\n") # split questions by line
@@ -110,14 +141,7 @@ if csv_file is not None:
110
 
111
  if st.button("Submit"):
112
  if data and questions:
113
- # Query Weaviate to get relevant data
114
- relevant_data = query_weaviate(questions[0]) # Example: using the first question
115
- # Convert the relevant data to a DataFrame (you might need to adjust this based on the Weaviate response format)
116
- relevant_df = pd.DataFrame(relevant_data)
117
-
118
- # Pass the relevant data to TAPAS
119
- answers = summarize_map_reduce(relevant_df, questions)
120
-
121
  st.write("Answers:")
122
  for q, a in zip(questions, answers):
123
  st.write(f"Question: {q}")
 
16
  embedded_options=EmbeddedOptions()
17
  )
18
 
19
def ingest_data_to_weaviate(dataframe, class_name, class_description):
    """Create a Weaviate class from a DataFrame and batch-import its rows.

    One schema property is generated per DataFrame column; the column name is
    used as both the property name and its description. The property type is
    inferred from the column's pandas dtype.

    Args:
        dataframe: pandas DataFrame holding the uploaded CSV data.
        class_name: Name of the Weaviate class to create and import into.
        class_description: Free-text description stored on the class.

    Raises:
        ValueError: If ``class_name`` is empty or only whitespace.

    Uses the module-level ``client``. Errors raised by the client while
    creating the schema or importing objects are not caught here.
    """
    if not class_name or not class_name.strip():
        raise ValueError("class_name must be a non-empty string")

    # Map pandas dtypes to Weaviate data types. Weaviate has no "float" type;
    # floating-point columns must be declared as "number".
    properties = []
    for column in dataframe.columns:
        dtype = dataframe[column].dtype
        if pd.api.types.is_bool_dtype(dtype):
            data_type = "boolean"
        elif pd.api.types.is_integer_dtype(dtype):
            data_type = "int"
        elif pd.api.types.is_float_dtype(dtype):
            data_type = "number"
        else:
            data_type = "string"
        properties.append({
            "name": column,
            "description": column,
            "dataType": [data_type],
        })
    # NOTE(review): CSV headers are used verbatim as property names; headers
    # containing spaces or punctuation may be rejected by Weaviate - confirm
    # whether they need sanitizing before schema creation.

    schema = {
        "classes": [
            {
                "class": class_name,
                "description": class_description,
                "properties": properties,
            }
        ]
    }

    # Create Schema in Weaviate
    client.schema.create(schema)

    # Ingest Data: the batch context manager flushes pending objects on exit.
    with client.batch as batch:
        for record in dataframe.to_dict(orient="records"):
            # Missing cells come back from pandas as NaN, which is not valid
            # JSON; omit those properties instead of sending them.
            cleaned = {
                key: value for key, value in record.items() if pd.notna(value)
            }
            batch.add_data_object(cleaned, class_name)
55
 
 
56
def query_weaviate(question):
    """Run a nearText search for ``question`` against the user's class.

    Reads the module-level ``client`` and ``class_name`` (the class name the
    user typed into the Streamlit text input).

    Args:
        question: Natural-language question used as the nearText concept.

    Returns:
        The raw response returned by the Weaviate query's ``do()`` call.
    """
    # This is a basic example; adapt the query based on the question
    # A Get query must list the properties to return, so read them back from
    # the class definition stored in Weaviate.
    class_schema = client.schema.get(class_name)
    property_names = [prop["name"] for prop in class_schema.get("properties", [])]

    # with_near_text expects a dict with a "concepts" list, not a bare string.
    # NOTE(review): nearText presumably needs a text vectorizer module enabled
    # on the Weaviate instance - confirm for the embedded setup.
    results = (
        client.query.get(class_name, property_names)
        .with_near_text({"concepts": [question]})
        .do()
    )
    return results
60
 
61
  def ask_llm_chunk(chunk, questions):
 
115
  all_answers.extend(chunk_answers)
116
  return all_answers
117
 
118
+ st.title("TAPAS Table Question Answering with Weaviate Integration")
119
+
120
+ # UI Input for Class and Description
121
+ class_name = st.text_input("Enter the class name for your CSV data:")
122
+ class_description = st.text_input("Enter a description for your class:")
123
 
124
  # Upload CSV data
125
  csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
126
  if csv_file is not None:
127
  data = csv_file.read().decode("utf-8")
128
  dataframe = pd.read_csv(StringIO(data))
 
 
 
 
129
  st.write("CSV Data Preview:")
130
  st.write(dataframe.head())
131
 
132
+ # Ingest data to Weaviate
133
+ if st.button("Ingest to Weaviate"):
134
+ ingest_data_to_weaviate(dataframe, class_name, class_description)
135
+ st.write("Data ingested successfully!")
136
+
137
  # Input for questions
138
  questions = st.text_area("Enter your questions (one per line)")
139
  questions = questions.split("\n") # split questions by line
 
141
 
142
  if st.button("Submit"):
143
  if data and questions:
144
+ answers = summarize_map_reduce(data, questions)
 
 
 
 
 
 
 
145
  st.write("Answers:")
146
  for q, a in zip(questions, answers):
147
  st.write(f"Question: {q}")