Spaces:
Running
Running
File size: 5,040 Bytes
c1a0d97 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import functools
import os

from dotenv import load_dotenv

# Populate the process environment from .env before the API key is read.
load_dotenv()
api = os.getenv("groq_api_key")

from sentence_transformers import SentenceTransformer
import gradio as gr
from sklearn.metrics.pairwise import cosine_similarity
from groq import Groq
@functools.lru_cache(maxsize=1)
def create_metadata_embeddings():
    """Embed the three table descriptions and return them with the encoder.

    The table descriptions are static, so the SentenceTransformer model is
    loaded and the descriptions are encoded only once per process; later
    calls return the same cached tuple instead of reloading the model on
    every request.

    Returns:
        A 5-tuple ``(embeddings, model, student, employee, course)`` where
        ``embeddings`` is the result of ``model.encode`` over the
        descriptions in the order student, employee, course; ``model`` is
        the ``SentenceTransformer`` instance; and the last three items are
        the description strings themselves. The tuple is shared between
        callers, so treat ``embeddings`` as read-only.
    """
    # The description bodies are kept flush-left so that no source
    # indentation leaks into the text that is embedded and later sent to
    # the language model.
    student = """
Table: student
Columns:
- student_id: an integer representing the unique ID of a student.
- first_name: a string containing the first name of the student.
- last_name: a string containing the last name of the student.
- date_of_birth: a date representing the student's birthdate.
- email: a string for the student's email address.
- phone_number: a string for the student's contact number.
- major: a string representing the student's major field of study.
- year_of_enrollment: an integer for the year the student enrolled.
"""
    employee = """
Table: employee
Columns:
- employee_id: an integer representing the unique ID of an employee.
- first_name: a string containing the first name of the employee.
- last_name: a string containing the last name of the employee.
- email: a string for the employee's email address.
- department: a string for the department the employee works in.
- position: a string representing the employee's job title.
- salary: a float representing the employee's salary.
- date_of_joining: a date for when the employee joined the college.
"""
    course = """
Table: course_info
Columns:
- course_id: an integer representing the unique ID of the course.
- course_name: a string containing the course's name.
- course_code: a string for the course's unique code.
- instructor_id: an integer for the ID of the instructor teaching the course.
- department: a string for the department offering the course.
- credits: an integer representing the course credits.
- semester: a string for the semester when the course is offered.
"""
    # Order matters: find_best_fit maps row 0/1/2 of the embeddings back to
    # student/employee/course respectively.
    metadata_list = [student, employee, course]
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(metadata_list)
    return embeddings, model, student, employee, course
def find_best_fit(embeddings, model, user_query, student, employee, course):
    """Return the table description most similar to the user's query.

    Args:
        embeddings: Encoded table descriptions, compared against the query.
        model: Encoder exposing ``encode``; used to embed ``user_query``.
        user_query: The natural-language request typed by the user.
        student: Description returned when the best-scoring index is 0.
        employee: Description returned when the best-scoring index is 1.
        course: Description returned for any other index.

    Returns:
        One of ``student``, ``employee`` or ``course``.
    """
    query_vector = model.encode([user_query])
    scores = cosine_similarity(query_vector, embeddings)
    # argmax() is taken over the whole similarity matrix; the winning
    # position selects which description to hand back.
    winner = scores.argmax()
    if winner == 0:
        return student
    if winner == 1:
        return employee
    return course
def create_prompt(user_query, table_metadata):
    """Build the system and user prompts for the SQL-generating model.

    Args:
        user_query: The natural-language request to translate into SQL.
        table_metadata: Description of the table the query should target.

    Returns:
        A tuple ``(system_prompt, user_prompt)``. The system prompt is a
        fixed instruction block; the user prompt embeds both arguments.
    """
    # Fixed instructions; the text is sent to the model verbatim, so it is
    # kept flush-left and unmodified.
    instructions = """
You are a SQL query generator specialized in generating SELECT queries for a single table at a time. Your task is to accurately convert natural language queries into SQL SELECT statements based on the user's intent and the provided table metadata.
Rules:
Focus on SELECT Queries: Only generate SELECT queries. Do not generate INSERT, UPDATE, DELETE, or multi-table JOINs.
Single Table Only: Assume all queries are related to a single table provided in the metadata. Ignore any references to other tables.
Metadata-Based Validation: Always ensure the generated query matches the table name, columns, and data types provided in the metadata.
User Intent: Accurately capture the user's requirements, such as filters, sorting, or aggregations, as expressed in natural language.
SQL Syntax: Use standard SQL syntax that is compatible with most relational database systems.
Input Format:
User Query: The user's natural language request.
Table Metadata: The structure of the relevant table, including the table name, column names, and data types.
Output Format:
SQL Query: A valid SELECT query formatted for readability.
Do not output anything else except the SQL query.Not even a single word extra.Ouput the whole query in a single line only.
You are ready to generate SQL queries based on the user input and table metadata.
"""
    request = "\nUser Query: {}\nTable Metadata: {}\n".format(
        user_query, table_metadata
    )
    return instructions, request
def generate_output(system_prompt, user_prompt):
    """Ask the Groq chat model for a SQL query and validate the reply.

    Args:
        system_prompt: Instructions sent with the ``system`` role.
        user_prompt: The request sent with the ``user`` role.

    Returns:
        The model's reply, stripped of surrounding whitespace, when it
        begins with ``SELECT`` (case-insensitive); otherwise the fixed
        message ``"Can't perform the task at the moment."``.
    """
    client = Groq(api_key=api)
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        model="llama3-70b-8192",
    )
    # The message content is optional in the SDK, and models often emit a
    # leading newline or space; normalise both so a valid query is not
    # rejected and a missing reply does not raise on slicing.
    reply = (chat_completion.choices[0].message.content or "").strip()
    if reply[:6].lower() == "select":
        return reply
    return "Can't perform the task at the moment."
def response(user_query):
    """Turn a natural-language request into a SQL query string.

    Runs the full pipeline: embed the table descriptions, choose the
    description closest to ``user_query``, build the prompts, and ask the
    model for the query.

    Args:
        user_query: The natural-language request entered by the user.

    Returns:
        Whatever ``generate_output`` returns for the constructed prompts.
    """
    (
        vectors,
        encoder,
        student_doc,
        employee_doc,
        course_doc,
    ) = create_metadata_embeddings()
    chosen_table = find_best_fit(
        vectors, encoder, user_query, student_doc, employee_doc, course_doc
    )
    system_text, user_text = create_prompt(user_query, chosen_table)
    return generate_output(system_text, user_text)
# Gradio UI: a single text box in, the generated SQL query out.
demo = gr.Interface(
    fn=response,
    inputs=gr.Textbox(label="Please provide the natural language query"),
    outputs=gr.Textbox(label="SQL Query"),
    title="SQL Query generator",
)
# `share` is a boolean flag; pass True rather than the string "True".
demo.launch(share=True)