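# NOTE: the lines that stood here were file-viewer page chrome captured together
# with the source, not program text. Preserved as comments so the module parses:
#   author: Ari | commit: "Update app.py" (17fbb60, verified)
#   viewer links: raw / history / blame | file size: 9.33 kB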
import os
import streamlit as st
import pandas as pd
import sqlite3
from langchain import OpenAI, LLMChain, PromptTemplate
from transformers import LlamaForCausalLM, LlamaTokenizer
import torch
import sqlparse
import logging
# Initialize conversation history. The membership test keeps messages that are
# already stored in the session instead of overwriting them with an empty list.
if 'history' not in st.session_state:
    st.session_state.history = []

# OpenAI API key (read from the environment so it is never stored in the source)
openai_api_key = os.getenv("OPENAI_API_KEY")

# Check if the API key is set; without it the OpenAI-backed chains cannot be
# built, so report the problem and halt this script run.
if not openai_api_key:
    st.error("OpenAI API key is not set. Please set the OPENAI_API_KEY environment variable.")
    st.stop()

# Load the LLaMA model and tokenizer
model_name = "llama3-70b-8192"  # Replace with the actual LLaMA model name you want to use
device = "cuda" if torch.cuda.is_available() else "cpu"


@st.cache_resource
def _load_llama(name, target_device):
    """Load the LLaMA tokenizer and model, cached across script reruns.

    Streamlit re-executes the script on user interaction; without caching the
    weights would be loaded from scratch each time. ``st.cache_resource`` keys
    the cached objects on the arguments, so a different ``name`` or
    ``target_device`` triggers a fresh load.

    Args:
        name: Model identifier passed to ``from_pretrained``.
        target_device: Device string the model is moved to ("cuda" or "cpu").

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = LlamaTokenizer.from_pretrained(name)
    model = LlamaForCausalLM.from_pretrained(name).to(target_device)
    return tokenizer, model


llama_tokenizer, llama_model = _load_llama(model_name, device)
# Function to generate responses using LLaMA
def generate_llama_response(prompt):
    """Generate a text continuation for ``prompt`` with the module-level LLaMA model.

    Args:
        prompt: Text the generation is conditioned on.

    Returns:
        The decoded continuation with special tokens removed. The prompt
        itself is not included in the returned string.
    """
    encoded = llama_tokenizer(prompt, return_tensors="pt").to(device)
    prompt_token_count = encoded.input_ids.shape[-1]
    # max_new_tokens limits only the continuation. The earlier max_length=200
    # also counted the prompt tokens, so a long prompt left little or no room
    # for the model to generate anything.
    generated = llama_model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_new_tokens=200,
    )
    # The returned sequence starts with the prompt tokens; slice them off so
    # the caller does not display its own prompt back to the user.
    continuation = generated[0][prompt_token_count:]
    return llama_tokenizer.decode(continuation, skip_special_tokens=True)
# Step 1: Upload CSV data file (or use default)
st.title("Natural Language to SQL Query App with Enhanced Insights")
st.write("Upload a CSV file to get started, or use the default dataset.")
csv_file = st.file_uploader("Upload your CSV file", type=["csv"])
if csv_file is None:
    data = pd.read_csv("default_data.csv")  # Ensure this file exists in your working directory
    st.write("Using default_data.csv file.")
    table_name = "default_table"
else:
    data = pd.read_csv(csv_file)
    # The table name is handed to the LLM and ends up unquoted in generated SQL,
    # so reduce the file stem to characters that are valid in a bare SQL
    # identifier (ASCII letters, digits, underscore) and never start with a digit.
    file_stem = csv_file.name.split('.')[0]
    table_name = ''.join(
        ch if (ch.isascii() and ch.isalnum()) or ch == '_' else '_'
        for ch in file_stem
    )
    if not table_name or table_name[0].isdigit():
        table_name = f"t_{table_name}"
    st.write(f"Data Preview ({csv_file.name}):")
    # NOTE(review): the preview table is assumed to belong to the upload branch
    # together with its label; the source had lost its indentation - confirm.
    st.dataframe(data.head())
# Step 2: Load CSV data into a persistent SQLite database
db_file = 'my_database.db'
# The connection is used again later from the text-input callback. sqlite3
# refuses cross-thread use of a connection by default, and Streamlit does not
# promise that the callback runs on the thread that executed this line, so the
# same-thread check is disabled. NOTE(review): access is assumed to be
# sequential (one script run at a time per session) - confirm.
conn = sqlite3.connect(db_file, check_same_thread=False)
# Replace any table left over from a previous run so the schema always matches
# the DataFrame currently loaded.
data.to_sql(table_name, conn, index=False, if_exists='replace')
# SQL table metadata (for validation and schema)
valid_columns = list(data.columns)
st.write(f"Valid columns: {valid_columns}")
# Step 3: Set up the LLM Chains (SQL generation with OpenAI, insights with LLaMA)
# Prompt for the SQL generation chain. It has three placeholders - {question},
# {table_name} and {columns} - and tells the model to answer "NO_SQL" when the
# question is not a data-retrieval request.
sql_template = (
    "\n"
    "You are an expert data scientist. Given a natural language question, the name of the table, and a list of valid columns, generate a valid SQL query that answers the question.\n"
    "Ensure that:\n"
    "- You only use the columns provided.\n"
    "- When performing string comparisons in the WHERE clause, make them case-insensitive by using 'COLLATE NOCASE' or the LOWER() function.\n"
    "- Do not use 'COLLATE NOCASE' in ORDER BY clauses unless sorting a string column.\n"
    "- Do not apply 'COLLATE NOCASE' to numeric columns.\n"
    'If the question is vague or open-ended and does not pertain to specific data retrieval, respond with "NO_SQL" to indicate that a SQL query should not be generated.\n'
    "Question: {question}\n"
    "Table name: {table_name}\n"
    "Valid columns: {columns}\n"
    "SQL Query:\n"
)
# temperature=0 is passed so the SQL output is as repeatable as the model allows.
sql_llm = OpenAI(openai_api_key=openai_api_key, temperature=0, max_tokens=180)
sql_prompt = PromptTemplate(
    input_variables=['question', 'table_name', 'columns'],
    template=sql_template,
)
sql_generation_chain = LLMChain(prompt=sql_prompt, llm=sql_llm)
# General Insights and Recommendations Chain with LLaMA
def generate_insights_llama(question, data_summary):
    """Ask the LLaMA model for insights and recommendations about the dataset.

    Builds a prompt from the user's question and a textual dataset summary and
    forwards it to ``generate_llama_response``.

    Args:
        question: The user's question, inserted verbatim into the prompt.
        data_summary: Description of the dataset, inserted verbatim into the prompt.

    Returns:
        Whatever ``generate_llama_response`` returns for the assembled prompt.
    """
    # The prompt starts and ends with a newline, exactly as the original
    # triple-quoted template did; joining the lines keeps the text free of the
    # function's own indentation.
    prompt_lines = [
        "",
        "You are an expert data scientist. Based on the user's question and the dataset summary provided below, generate concise data insights and actionable recommendations.",
        f"User's Question: {question}",
        "Dataset Summary:",
        f"{data_summary}",
        "Concise Insights and Recommendations:",
        "",
    ]
    return generate_llama_response("\n".join(prompt_lines))
# Optional: Clean up function to remove incorrect COLLATE NOCASE usage
def clean_sql_query(query):
    """Remove ``COLLATE NOCASE`` found at the top level of each SQL statement.

    Only the direct child tokens of every parsed statement are inspected;
    anything sqlparse has grouped into a nested token list is copied through
    unchanged. NOTE(review): sqlparse appears to group WHERE clauses into such
    nested lists, which would leave case-insensitive comparisons intact -
    confirm against the installed sqlparse version.

    Args:
        query: SQL text, possibly containing several statements.

    Returns:
        The statements re-serialised and joined with a single space, with each
        top-level ``COLLATE`` keyword and the ``NOCASE`` that follows it dropped.
        Whitespace preceding the removed clause is kept.
    """
    cleaned_statements = []
    for statement in sqlparse.parse(query):
        tokens = statement.tokens
        kept = []
        position = 0
        while position < len(tokens):
            token = tokens[position]
            if token.ttype is sqlparse.tokens.Keyword and token.value.upper() == 'COLLATE':
                # Walk over any run of whitespace/newline tokens instead of
                # assuming exactly one token separates COLLATE from NOCASE.
                lookahead = position + 1
                while (lookahead < len(tokens)
                       and tokens[lookahead].ttype in sqlparse.tokens.Whitespace):
                    lookahead += 1
                if lookahead < len(tokens) and tokens[lookahead].value.upper() == 'NOCASE':
                    # Resume after NOCASE, dropping the keyword, the gap and the name.
                    position = lookahead + 1
                    continue
            kept.append(token)
            position += 1
        cleaned_statements.append(''.join(str(tok) for tok in kept))
    return ' '.join(cleaned_statements)
# Function to classify user query
def classify_query(question):
    """Classify the user query as either 'SQL' or 'INSIGHTS'.

    Sends a classification prompt through an ``LLMChain`` built on the
    module-level ``sql_llm`` and interprets the reply.

    Args:
        question: The user's question, inserted into the prompt.

    Returns:
        ``'SQL'`` when the normalised reply starts with "SQL", otherwise
        ``'INSIGHTS'`` (which is therefore also the fallback for any
        unexpected reply).
    """
    classification_template = (
        "\n"
        "You are an AI assistant that classifies user queries into two categories: 'SQL' for specific data retrieval queries and 'INSIGHTS' for general analytical or recommendation queries.\n"
        "Determine the appropriate category for the following user question.\n"
        'Question: "{question}"\n'
        "Category (SQL/INSIGHTS):\n"
    )
    classification_prompt = PromptTemplate(
        template=classification_template,
        input_variables=['question'],
    )
    classification_chain = LLMChain(llm=sql_llm, prompt=classification_prompt)
    raw_answer = classification_chain.run({'question': question})
    # The prompt writes the labels in quotes, so the model may echo them as
    # 'SQL' or "SQL"; drop surrounding quotes, backticks and full stops before
    # comparing, otherwise a quoted SQL answer would be read as INSIGHTS.
    category = raw_answer.strip().strip("'\"`.").strip().upper()
    return 'SQL' if category.startswith('SQL') else 'INSIGHTS'
# Function to generate dataset summary
def generate_dataset_summary(data):
    """Generate a one-line summary of the dataset for general insights.

    Args:
        data: Tabular object supporting ``len()`` and exposing ``columns``
            (the caller in this file passes a pandas DataFrame).

    Returns:
        A string giving the record count, the column count and the list of
        column names.
    """
    column_names = list(data.columns)
    return (
        f"Number of records: {len(data)}, "
        f"Number of columns: {len(column_names)}, "
        f"Columns: {column_names}"
    )
# Define the callback function
def _append_assistant_message(content):
    """Store an assistant reply (text or DataFrame) in the conversation history."""
    st.session_state.history.append({"role": "assistant", "content": content})


def _answer_with_sql(user_prompt):
    """Generate SQL for ``user_prompt``, run it, and return the reply to show.

    Returns:
        A DataFrame with the query result, or a string explaining why no
        result is available (no SQL generated, query refused, execution
        error, or empty result).
    """
    generated_sql = sql_generation_chain.run({
        'question': user_prompt,
        'table_name': table_name,
        'columns': ', '.join(map(str, valid_columns)),
    }).strip()
    # The generation prompt shows the sentinel in double quotes, so accept it
    # with or without surrounding quotes.
    if generated_sql.strip('"\'` ').upper() == "NO_SQL":
        return "No SQL query could be generated."

    cleaned_sql = clean_sql_query(generated_sql)
    logging.info(f"Generated SQL Query: {cleaned_sql}")

    # Best-effort safety net: the SQL text comes from a model driven by user
    # input, so only statements that start like a read query are executed.
    leading_words = cleaned_sql.split(None, 1)
    first_word = leading_words[0].upper() if leading_words else ""
    if first_word not in ("SELECT", "WITH"):
        return "Only read-only SELECT queries can be executed."

    # Attempt to execute SQL query and handle exceptions
    try:
        result = pd.read_sql_query(cleaned_sql, conn)
    except Exception as e:
        logging.error(f"An error occurred during SQL execution: {e}")
        return f"Error executing SQL query: {e}"
    if result.empty:
        return "The query returned no results. Please try a different question."
    return result


def process_input():
    """Handle a submitted chat message (``on_change`` callback of the text input).

    Reads the text stored under ``st.session_state['user_input']``, records it
    in the history, produces one assistant reply and finally clears the input.
    The reply is chosen as follows:

    * the message mentions "columns" (case-insensitive): list the columns;
    * ``classify_query`` returns ``'SQL'``: generate and execute a query;
    * otherwise: ask the LLaMA model for insights on a dataset summary.

    Any exception is logged and reported to the user as an assistant message
    rather than propagated.
    """
    user_prompt = st.session_state['user_input']
    if not user_prompt:
        return

    try:
        # Append user message to history
        st.session_state.history.append({"role": "user", "content": user_prompt})

        if "COLUMNS" in user_prompt.upper():
            # Answered locally, so the LLM classification call is skipped.
            reply = f"The columns are: {', '.join(map(str, valid_columns))}"
        else:
            category = classify_query(user_prompt)
            logging.info(f"User query classified as: {category}")
            if category == 'SQL':
                reply = _answer_with_sql(user_prompt)
            else:  # INSIGHTS category
                reply = generate_insights_llama(user_prompt, generate_dataset_summary(data))
        _append_assistant_message(reply)
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        _append_assistant_message(f"Error: {e}")

    # Reset the user_input in session state
    st.session_state['user_input'] = ''
# Display the conversation history. Entries are dicts with a 'role' and a
# 'content'; assistant content is either text or a DataFrame of query results.
for message in st.session_state.history:
    role, content = message['role'], message['content']
    if role == 'user':
        st.markdown(f"**User:** {content}")
    elif role == 'assistant':
        if isinstance(content, pd.DataFrame):
            # Tabular results get a caption plus an interactive table.
            st.markdown("**Assistant:** Query Results:")
            st.dataframe(content)
        else:
            st.markdown(f"**Assistant:** {content}")

# Place the input field at the bottom with the callback
st.text_input("Enter your message:", key='user_input', on_change=process_input)