Spaces:

jskinner215
/

TAPAS_WTQ_Chunking

Build error

jskinner215 committed on Sep 10, 2023

Commit

b9d05c0

1 Parent(s): 46ad3c2

Weaviate schema based on user input and CSV upload

UI Input for Class and Description:
We'll add an input field in the Streamlit app where users can define the class name and description of the CSV they're uploading.

Auto-Populate Schema from CSV:
Once the CSV is uploaded, we'll read its headers to determine the column names. We can then use simple heuristics to determine the data type of each column (e.g., a column that contains only numbers is likely a float or an int; a column that matches date patterns is a date; anything else is a string). For simplicity, we'll also use the column names as the property descriptions; this can be enhanced later if needed.

Create Schema in Weaviate:
Using the class name, description, and the auto-populated properties, we'll define the schema and create it in Weaviate.

Ingest Data:
Once the schema is created, we can then ingest the data from the CSV into Weaviate.

Files changed (1) hide show

app.py +45 -21

app.py CHANGED Viewed

@@ -16,20 +16,46 @@ client = weaviate.Client(
   embedded_options=EmbeddedOptions()
 )
-# Function to ingest data into Weaviate
-def ingest_data_to_weaviate(dataframe):
-    for index, row in dataframe.iterrows():
         obj = {
-            "class": "YourClassName",
-            "id": str(index),
             "properties": row.to_dict()
         }
-        client.data_object.create(obj)
-# Function to query data from Weaviate
 def query_weaviate(question):
     # This is a basic example; adapt the query based on the question
-    results = client.query.get('YourClassName').with_near_text(question).do()
     return results
 def ask_llm_chunk(chunk, questions):
@@ -89,20 +115,25 @@ def summarize_map_reduce(data, questions):
         all_answers.extend(chunk_answers)
     return all_answers
-st.title("TAPAS Table Question Answering with Weaviate")
 # Upload CSV data
 csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
 if csv_file is not None:
     data = csv_file.read().decode("utf-8")
     dataframe = pd.read_csv(StringIO(data))
-    # Ingest data into Weaviate
-    ingest_data_to_weaviate(dataframe)
     st.write("CSV Data Preview:")
     st.write(dataframe.head())
     # Input for questions
     questions = st.text_area("Enter your questions (one per line)")
     questions = questions.split("\n")  # split questions by line
@@ -110,14 +141,7 @@ if csv_file is not None:
     if st.button("Submit"):
         if data and questions:
-            # Query Weaviate to get relevant data
-            relevant_data = query_weaviate(questions[0])  # Example: using the first question
-            # Convert the relevant data to a DataFrame (you might need to adjust this based on the Weaviate response format)
-            relevant_df = pd.DataFrame(relevant_data)
-            # Pass the relevant data to TAPAS
-            answers = summarize_map_reduce(relevant_df, questions)
             st.write("Answers:")
             for q, a in zip(questions, answers):
                 st.write(f"Question: {q}")

   embedded_options=EmbeddedOptions()
 )
+def ingest_data_to_weaviate(dataframe, class_name, class_description):
+    properties = []
+    for column in dataframe.columns:
+        data_type = "string"
+        if dataframe[column].dtype == "float64":
+            data_type = "float"
+        elif dataframe[column].dtype == "int64":
+            data_type = "int"
+        properties.append({
+            "name": column,
+            "description": column,
+            "dataType": [data_type]
+        })
+    schema = {
+        "classes": [
+            {
+                "class": class_name,
+                "description": class_description,
+                "properties": properties
+            }
+        ]
+    }
+    # Create Schema in Weaviate
+    client.schema.create(schema)
+    # Ingest Data
+    batch_request = weaviate.ObjectsBatchRequest()
+    for _, row in dataframe.iterrows():
         obj = {
+            "class": class_name,
             "properties": row.to_dict()
         }
+        batch_request.add(obj)
+    client.batch.create(batch_request)
 def query_weaviate(question):
     # This is a basic example; adapt the query based on the question
+    results = client.query.get(class_name).with_near_text(question).do()
     return results
 def ask_llm_chunk(chunk, questions):
         all_answers.extend(chunk_answers)
     return all_answers
+st.title("TAPAS Table Question Answering with Weaviate Integration")
+# UI Input for Class and Description
+class_name = st.text_input("Enter the class name for your CSV data:")
+class_description = st.text_input("Enter a description for your class:")
 # Upload CSV data
 csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
 if csv_file is not None:
     data = csv_file.read().decode("utf-8")
     dataframe = pd.read_csv(StringIO(data))
     st.write("CSV Data Preview:")
     st.write(dataframe.head())
+    # Ingest data to Weaviate
+    if st.button("Ingest to Weaviate"):
+        ingest_data_to_weaviate(dataframe, class_name, class_description)
+        st.write("Data ingested successfully!")
     # Input for questions
     questions = st.text_area("Enter your questions (one per line)")
     questions = questions.split("\n")  # split questions by line
     if st.button("Submit"):
         if data and questions:
+            answers = summarize_map_reduce(data, questions)
             st.write("Answers:")
             for q, a in zip(questions, answers):
                 st.write(f"Question: {q}")