ankush-003 committed
Commit 10757ec · 1 Parent(s): cff415c
.Dockerfile ADDED
@@ -0,0 +1,36 @@
+ # Use Python 3.11 slim as the base image
+ FROM python:3.11-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends \
+     build-essential \
+     python3-dev \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create and activate virtual environment
+ ENV VIRTUAL_ENV=/opt/venv
+ RUN python3 -m venv $VIRTUAL_ENV
+ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+ # Copy requirements first to leverage Docker cache
+ COPY requirements.txt /app/requirements.txt
+
+ # Install dependencies in virtual environment
+ RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
+
+ # Copy the rest of the application
+ COPY . .
+
+ ENV PYTHONPATH=/app
+ ENV PYTHONUNBUFFERED=1
+
+ # Expose the port the app listens on (7860 matches the CMD below and the Hugging Face Spaces convention)
+ EXPOSE 7860
+
+ # Command to run the application
+ # CMD . /opt/venv/bin/activate && exec chainlit run app.py --port 8000
+ CMD ["chainlit", "run", "app.py", "--port", "7860"]
.Dockerignore ADDED
@@ -0,0 +1,19 @@
+ # python venv
+ llm
+
+ # envs
+ .env
+
+ # idx
+ .idx
+ .vscode
+
+ # database
+ elections.db
+
+ # __pycache__
+ __pycache__
+
+ # Chainlit files
+ .chainlit
+ .files
app.py ADDED
@@ -0,0 +1,118 @@
+ import chainlit as cl
+ from utils import load_details_dataset, load_election_dataset, load_maha_election_dataset
+ from sqlite3 import connect
+ from typing import cast
+ from utils.load_llm import load_llm
+ from dotenv import load_dotenv
+ from utils.query_generator import sql_generator, sql_formatter, analyze_results
+ from langchain.schema.runnable import Runnable
+ from utils.sql_runtime import SQLRuntime
+
+ load_dotenv()
+ # global variables
+ db_path = './data/elections.db'
+ sql_runtime = SQLRuntime(dbname=db_path)
+
+ # Load the dataset
+ @cl.action_callback("Load Datasets")
+ async def on_action(action: cl.Action):
+     print("Loading datasets...")
+
+     # save the datasets as tables
+     conn = connect('./data/elections.db')
+
+     load_details_dataset.load_data_from_csv_to_db('./data/details_of_assembly_segment_2019.csv', conn)
+     load_election_dataset.load_data_from_csv_to_db('./data/eci_data_2024.csv', conn)
+     load_maha_election_dataset.load_data_from_csv_to_db('./data/maha_results_2019.csv', conn)
+
+     return "Datasets loaded successfully."
+
+ @cl.action_callback("Execute Query")
+ async def on_action(action: cl.Action):
+     res = await cl.AskUserMessage(content="Enter Query to run Manually", timeout=20).send()
+     actions = [
+         cl.Action(name="Execute Query", description="Execute the query on the dataset", value="Execute Query")
+     ]
+     if res:
+         query = res['output']
+         res = sql_runtime.execute(query)
+         print(res)
+         if res["code"] == 0:
+             data = ""
+             if res["data"]:
+                 for row in res["data"]:
+                     data += str(row) + "\n"
+
+             elements = [
+                 cl.Text(name="Result", content=data, display="inline"),
+             ]
+             await cl.Message(
+                 content=f"Query: {query}",
+                 elements=elements,
+                 actions=actions,
+             ).send()
+         else:
+             error = res["msg"]["traceback"]
+             elements = [
+                 cl.Text(name="Error", content=error, display="inline"),
+             ]
+             await cl.Message(
+                 content=f"Query: {query}",
+                 elements=elements,
+                 actions=actions,
+             ).send()
+
+     # return "Query executed successfully."
+
+ @cl.on_chat_start
+ async def start():
+     # Sending an action button within a chatbot message
+     actions = [
+         cl.Action(name="Load Datasets", description="Load the datasets into the database", value="Load Datasets")
+     ]
+
+     chain = sql_generator | sql_formatter | analyze_results
+
+     cl.user_session.set("chain", chain)
+     cl.user_session.set("db_path", './data/elections.db')
+
+     await cl.Message(content="I am your personal political expert. I can help you analyze the election data. Click the button below to load the datasets.", actions=actions).send()
+
+ @cl.on_message
+ async def on_message(message: cl.Message):
+     chain = cast(Runnable, cl.user_session.get("chain"))
+     db_path = cl.user_session.get("db_path")
+
+     actions = [
+         cl.Action(name="Execute Query", description="Execute the query on the dataset", value="Execute Query")
+     ]
+
+     print(message.content)
+
+     try:
+         res = chain.invoke({
+             "query": message.content,
+             "db_path": db_path
+         })
+     except Exception as e:
+         print(e)
+         await cl.Message(content="An error occurred while processing the query. Please try again.").send()
+         return
+
+     queries = "\n".join(res.queries)
+
+     errors = "".join(res.errors)
+
+     elements = [
+         cl.Text(name='results', content=res.summary, display="inline"),
+         cl.Text(name="queries", content=queries, display="inline"),
+     ]
+
+     if errors:
+         elements.append(cl.Text(name="errors", content=errors, display="inline"))
+
+     await cl.Message(
+         content="Let's analyze the results of the query",
+         elements=elements,
+         actions=actions
+     ).send()
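Note: the chain that on_chat_start stores in the user session can also be exercised outside Chainlit for quick testing. A minimal sketch, assuming the utils package is importable from the project root, ./data/elections.db has already been populated, and the LLM API key is present in .env (the question string is only an illustration):

    from dotenv import load_dotenv
    from utils.query_generator import sql_generator, sql_formatter, analyze_results

    load_dotenv()  # pick up the LLM API key from .env

    # same pipeline that app.py stores in the Chainlit user session
    chain = sql_generator | sql_formatter | analyze_results

    # analyze_results returns a QuerySummary with .summary, .errors and .queries
    res = chain.invoke({
        "query": "Which party won the most constituencies in 2024?",  # illustrative question
        "db_path": "./data/elections.db"
    })
    print(res.summary)
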
chainlit.md ADDED
@@ -0,0 +1,5 @@
+ ## Welcome screen
+
+ # MahaNeta
+
+ **Your Own Personal Political Assistant**
data/details_of_assembly_segment_2019.csv ADDED
The diff for this file is too large to render.
 
data/eci_data_2024.csv ADDED
The diff for this file is too large to render.
 
data/maha_results_2019.csv ADDED
The diff for this file is too large to render.
 
docs/lab_session1_25oct2024.pdf ADDED
Binary file (853 kB).
 
docs/pes_lab_session1.pdf ADDED
Binary file (236 kB).
 
requirements.txt ADDED
@@ -0,0 +1,75 @@
+ aiohappyeyeballs==2.4.3
+ aiohttp==3.10.10
+ aiosignal==1.3.1
+ altair==5.4.1
+ annotated-types==0.7.0
+ anyio==4.6.2.post1
+ attrs==24.2.0
+ blinker==1.8.2
+ cachetools==5.5.0
+ certifi==2024.8.30
+ charset-normalizer==3.4.0
+ click==8.1.7
+ distro==1.9.0
+ frozenlist==1.5.0
+ gitdb==4.0.11
+ GitPython==3.1.43
+ greenlet==3.1.1
+ groq==0.11.0
+ h11==0.14.0
+ httpcore==1.0.6
+ httpx==0.27.2
+ idna==3.10
+ Jinja2==3.1.4
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ langchain==0.3.4
+ langchain-core==0.3.13
+ langchain-groq==0.2.0
+ langchain-google-genai
+ langchain-text-splitters==0.3.0
+ langsmith==0.1.137
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ mdurl==0.1.2
+ multidict==6.1.0
+ narwhals==1.12.1
+ numpy==1.26.4
+ orjson==3.10.10
+ packaging==24.1
+ pandas==2.2.3
+ pillow==10.4.0
+ propcache==0.2.0
+ protobuf==5.28.3
+ pyarrow==18.0.0
+ pydantic==2.9.2
+ pydantic_core==2.23.4
+ pydeck==0.9.1
+ Pygments==2.18.0
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.0.1
+ pytz==2024.2
+ PyYAML==6.0.2
+ referencing==0.35.1
+ requests==2.32.3
+ requests-toolbelt==1.0.0
+ rich==13.9.3
+ rpds-py==0.20.0
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ SQLAlchemy==2.0.36
+ streamlit==1.39.0
+ tenacity==9.0.0
+ toml==0.10.2
+ tornado==6.4.1
+ typing_extensions==4.12.2
+ tzdata==2024.2
+ urllib3==2.2.3
+ watchdog==5.0.3
+ yarl==1.17.0
+
+ # frontend
+ chainlit
utils/__init__.py ADDED
File without changes
utils/cot.py ADDED
@@ -0,0 +1,30 @@
+ """
+ This example shows Chain of Thought (CoT) prompting
+ """
+
+ from langchain_core.prompts import PromptTemplate
+ from load_llm import load_llm
+
+ template = """Answer the question based on the context below. If the
+ question cannot be answered using the information provided answer
+ with "I don't know".
+
+ Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can contains 3 tennis balls. How many
+ tennis balls does he have now?
+ A: Roger started with 5 balls. 2 cans of 3 tennis balls is 6 tennis balls. 5+6 = 11. The answer is 11.
+
+ Q: The cafeteria has 23 apples. If they used 20 apples for lunch and bought 6 more, how many apples do they have?
+
+ """
+
+
+ prompt_template = PromptTemplate(
+     input_variables=[],
+     template=template
+ )
+
+ prompt = prompt_template.format(
+ )
+
+ llm = load_llm()
+ print(llm.invoke(prompt).content)
utils/few_shot.py ADDED
@@ -0,0 +1,105 @@
+ """
+ This example shows how to load an LLM, use a prompt and retrieve results
+ We illustrate the use of LangChain for few-shot inferencing
+
+ The dynamic number of examples is important because the max length of our prompt and completion output is limited.
+ This limitation is measured by the maximum context window.
+
+ context_window = input_tokens + output_tokens
+ At the same time, we want to maximize the number of examples given to the model for few-shot learning.
+
+ Considering this, we need to balance the number of examples included and our prompt size.
+ Our hard limit is the maximum context size, but we must also consider the cost of processing more tokens through the LLM.
+ Fewer tokens mean a cheaper service and faster completions from the LLM.
+ """
+
+ from load_llm import load_llm
+ from langchain_core.prompts import PromptTemplate, FewShotPromptTemplate
+ from langchain_core.example_selectors import LengthBasedExampleSelector
+
+ # create our examples
+ examples = [
+     {
+         "query": "How are you?",
+         "answer": "I can't complain but sometimes I still do."
+     }, {
+         "query": "What time is it?",
+         "answer": "It's time to get a watch."
+     }, {
+         "query": "What is the meaning of life?",
+         "answer": "42"
+     }, {
+         "query": "What is the weather like today?",
+         "answer": "Cloudy with a chance of memes."
+     }, {
+         "query": "What is your favorite movie?",
+         "answer": "Terminator"
+     }, {
+         "query": "Who is your best friend?",
+         "answer": "Siri. We have spirited debates about the meaning of life."
+     }, {
+         "query": "What should I do today?",
+         "answer": "Stop talking to chatbots on the internet and go outside."
+     }
+ ]
+
+ # create an example template
+ example_template = """
+ User: {query}
+ AI: {answer}
+ """
+
+ # create a prompt example from above template
+ example_prompt = PromptTemplate(
+     input_variables=["query", "answer"],
+     template=example_template
+ )
+
+
+ example_selector = LengthBasedExampleSelector(
+     examples=examples,
+     example_prompt=example_prompt,
+     max_length=50  # this sets the max length that examples should be
+ )
+
+ # now break our previous prompt into a prefix and suffix
+ # the prefix is our instructions
+ prefix = """The following are excerpts from conversations with an AI
+ assistant. The assistant is typically sarcastic and witty, producing
+ creative and funny responses to the user's questions. Here are some
+ examples:
+ """
+ # and the suffix is our user input and output indicator
+ suffix = """
+ User: {query}
+ AI: """
+
+ # now create the few shot prompt template
+ dynamic_prompt_template = FewShotPromptTemplate(
+     example_selector=example_selector,  # use example_selector instead of examples
+     example_prompt=example_prompt,
+     prefix=prefix,
+     suffix=suffix,
+     input_variables=["query"],
+     example_separator="\n"
+ )
+
+ print(dynamic_prompt_template.format(query="How do birds fly?"))
+ print("-------- Longer query will select fewer examples in order to preserve the context ----------")
+
+ query = """If I am in America, and I want to call someone in another country, I'm
+ thinking maybe Europe, possibly western Europe like France, Germany, or the UK,
+ what is the best way to do that?"""
+
+ prompt = dynamic_prompt_template.format(query=query)
+ print(prompt)
+
+ print("-------- Shorter query for LLM ----------")
+ query = "How is the weather in your city today?"
+ prompt = dynamic_prompt_template.format(query=query)
+ print(prompt)
+
+ llm = load_llm()
+ print(
+     llm.invoke(prompt).content
+ )
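Note: the context-window budget described in the module docstring can be made concrete with a toy calculation; the numbers below are purely illustrative, not the limits of any particular model:

    # context_window = input_tokens + output_tokens
    context_window = 8192   # illustrative model limit
    output_budget = 1024    # tokens reserved for the completion
    prompt_budget = context_window - output_budget
    print(prompt_budget)    # 7168 tokens left for the prefix, examples and the query
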
utils/get_completion_client.py ADDED
@@ -0,0 +1,51 @@
+ import json
+ from openai import OpenAI
+
+ # Point to the local server
+ client1 = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")  # html to json
+ model = r"lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF/Meta-Llama-3-8B-Instruct-Q8_0.gguf"
+
+ # model = "MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF/Mistral-7B-Instruct-v0.3.Q4_K_M.gguf:2"
+
+
+ def get_completion(prompt, client=client1, model=model):
+     """
+     given the prompt, obtain the response from the LLM hosted by LM Studio as a server
+     :param prompt: prompt to be sent to LLM server
+     :return: response from the LLM
+     """
+     prompt = [
+         {"role": "user", "content": prompt}
+     ]
+     completion = client.chat.completions.create(
+         model=model,
+         messages=prompt,
+         temperature=0.0,
+         stream=True,
+     )
+
+     new_message = {"role": "assistant", "content": ""}
+
+     for chunk in completion:
+         if chunk.choices[0].delta.content:
+             # print(chunk.choices[0].delta.content, end="", flush=True)
+             val = chunk.choices[0].delta.content
+             new_message["content"] += val
+
+     # print(type()
+     val = new_message["content"]  # .split("<end_of_turn>")[0]
+
+     return val
+
+
+ if __name__ == '__main__':
+     prompt = """
+     You are a political leader and your party is trying to win the general elections in India.
+     You are given an LLM that can provide you the analytics using the past historical data given to it.
+     In particular the LLM has been provided data on which party won each constituency out of 545 and which assembly segment within the main constituency is more favorable.
+     It also has details of votes polled by every candidate.
+     Tell me 10 questions that you want to ask the LLM.
+     """
+     results = get_completion(prompt)
+     print(results)
+
utils/load_details_dataset.py ADDED
@@ -0,0 +1,75 @@
+ import json
+ import sqlite3
+ import pandas as pd
+ import csv
+
+
+ def load_data_from_csv(name, end=58925):
+     data = []
+     keys = None
+     with open(name, "r", encoding="utf-8", errors="ignore") as f:
+         csv_data = csv.reader(f)
+         for i, line in enumerate(csv_data):
+             if i == 0:
+                 keys = line
+                 continue
+             item = {}
+             for key, val in zip(keys, line):
+                 item[key] = val
+             data.append(item)
+     return data
+
+
+ def load_data_from_csv_to_db(name, conn, col_names=None):
+
+     # read the dataset from csv file and create a pandas dataframe
+     df = pd.read_csv(open(name, "r", encoding="utf-8", errors="ignore"))
+
+     df.columns = [
+         'state', 'parliamentary_constituency', 'constituency', 'nota_votes', 'candidate_name', 'party_name', 'total_votes'
+     ]
+
+     # removing extra whitespace
+     string_columns = df.select_dtypes(include=['object']).columns
+     for col in string_columns:
+         df[col] = df[col].astype(str).str.strip()
+
+     df['constituency'] = df['constituency'].str.replace(r'\s*-\s*\d+$', '', regex=True)
+
+     # Remove any parenthetical suffixes like (SC) or (ST)
+     df['constituency'] = df['constituency'].str.replace(r'\s*\([^)]*\)', '', regex=True)
+
+     # save the dataframe as a database table, name of table is: elections_2019
+     result = df.to_sql("elections_2019", conn, if_exists="replace")
+
+     return result
+
+
+ def query_sql(conn, query):
+     cursor = conn.cursor()
+     cursor.execute(query)
+     result = cursor.fetchall()
+     field_names = [r[0] for r in cursor.description]
+     print(field_names)
+     return result
+
+
+ if __name__ == '__main__':
+     # create a connection to sql db called elections.db
+     conn = sqlite3.connect('../data/elections.db')
+
+     filename = r"../data/details_of_assembly_segment_2019.csv"
+
+     data = load_data_from_csv(filename, end=5)
+
+     res = load_data_from_csv_to_db(filename, conn)
+
+     query = "SELECT * FROM elections_2019 LIMIT 5;"
+     results = query_sql(conn, query)
+     print(results)
+
+     # keys = data.keys()
+     # for i, item in enumerate(data):
+     #     print(data[item])
+     # jdata = json.loads(data.to_json())
+     # print(jdata)
utils/load_election_dataset.py ADDED
@@ -0,0 +1,72 @@
+ import json
+ import sqlite3
+ import pandas as pd
+ import csv
+
+
+ def load_data_from_csv(name, end=58925):
+     data = []
+     keys = None
+     with open(name, "r", encoding="utf-8", errors="ignore") as f:
+         csv_data = csv.reader(f)
+         for i, line in enumerate(csv_data):
+             if i == 0:
+                 keys = line
+                 continue
+             item = {}
+             for key, val in zip(keys, line):
+                 item[key] = val
+             data.append(item)
+     return data
+
+
+ def load_data_from_csv_to_db(name, conn, col_names=None):
+
+     # read the dataset from csv file and create a pandas dataframe
+     df = pd.read_csv(open(name, "r", encoding="utf-8", errors="ignore"))
+
+     df.columns = [
+         'sn', 'candidate_name', 'party_name', 'evm_votes', 'postal_votes', 'total_votes', 'vote_percentage', 'state', 'constituency'
+     ]
+
+     df['constituency'] = df['constituency'].str.replace(r'\s*-\s*\d+$', '', regex=True)
+
+     # Remove any parenthetical suffixes like (SC) or (ST)
+     df['constituency'] = df['constituency'].str.replace(r'\s*\([^)]*\)', '', regex=True)
+
+     # save the dataframe as a database table, name of table is: elections_2024
+     result = df.to_sql("elections_2024", conn, if_exists="replace")
+
+     return result
+
+
+ def query_sql(conn, query):
+     cursor = conn.cursor()
+     cursor.execute(query)
+     result = cursor.fetchall()
+     field_names = [r[0] for r in cursor.description]
+     print(field_names)
+     return result
+
+
+ if __name__ == '__main__':
+     # create a connection to sql db called elections.db
+     conn = sqlite3.connect('../data/elections.db')
+
+     filename = r"../data/eci_data_2024.csv"
+
+
+     data = load_data_from_csv(filename, end=5)
+
+     res = load_data_from_csv_to_db(filename, conn)
+
+     query = "SELECT count(*) FROM elections_2024 WHERE constituency='Amalapuram';"
+     results = query_sql(conn, query)
+     print(results)
+
+     # keys = data.keys()
+     # for i, item in enumerate(data):
+     #     print(data[item])
+     # jdata = json.loads(data.to_json())
+     # print(jdata)
+
utils/load_llm.py ADDED
@@ -0,0 +1,37 @@
+ """
+ This module loads the LLM used by the app (currently Gemini via langchain-google-genai; a Groq option is commented out)
+ Modify this file if you need to use some other model from Hugging Face or OpenAI/ChatGPT
+ """
+
+ # from langchain.llms import CTransformers
+ # from langchain_openai import OpenAI
+ from langchain_groq import ChatGroq
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from dotenv import load_dotenv
+ import os
+
+ model_name = 'gemma2-9b-it'
+
+
+ def load_llm(model_name=model_name):
+     # llm = ChatGroq(
+     #     temperature=0,
+     #     model=model_name,
+     # )
+
+     llm = ChatGoogleGenerativeAI(
+         model="gemini-1.5-flash",
+         temperature=0,
+         max_tokens=None,
+         timeout=None,
+         max_retries=2,
+     )
+
+     return llm
+
+
+ if __name__ == '__main__':
+     load_dotenv()
+     llm = load_llm()
+     result = llm.invoke("Provide a short answer: What is machine learning?")
+     print(result.content)
utils/load_maha_election_dataset.py ADDED
@@ -0,0 +1,95 @@
+ """
+ Loads Maharashtra assembly 2019 dataset
+ """
+ import json
+ import sqlite3
+ import pandas as pd
+ import csv
+
+ def load_data_from_csv(name, end=58925):
+     data = []
+     keys = None
+     with open(name, "r", encoding="utf-8") as f:
+         csv_data = csv.reader(f)
+         for i, line in enumerate(csv_data):
+             found = False
+             if i == 0:
+                 keys = line
+                 continue
+             for field in line:
+                 if field.strip() == "TURNOUT":
+                     found = True
+                     break
+             if found:
+                 # print("TURNOUT found, skipping")
+                 continue
+             item = {}
+             # print(line)
+             for key, val in zip(keys, line):
+                 item[key] = val
+             data.append(item)
+     return data
+
+ def clean_dataframe(df):
+     # Strip leading and trailing spaces from column names (without changing them)
+     df.columns = df.columns.str.strip()
+
+     # Strip spaces from the text columns
+     for col in df.select_dtypes(include='object').columns:
+         df[col] = df[col].str.strip()
+
+     # Fill null values with 0
+     df.fillna(0, inplace=True)
+
+     return df
+
+ def load_data_from_csv_to_db(name, conn):
+
+     # read the dataset from csv file and create a pandas dataframe
+     df = pd.read_csv(open(name, "r", encoding="utf-8"))
+
+     # clean the dataframe
+     df = clean_dataframe(df)
+
+     df.columns = [
+         'state', 'constituency_number', 'constituency', 'candidate_name', 'sex', 'age',
+         'category', 'party_name', 'party_symbol', 'evm_votes', 'postal_votes', 'total_votes',
+         'vote_percentage', 'total_electors'
+     ]
+
+     # save the dataframe as a database table, name of table is: maha_2019
+     result = df.to_sql("maha_2019", conn, if_exists='replace', index=False)
+
+     return result
+
+
+ def query_sql(conn, query):
+     cursor = conn.cursor()
+     cursor.execute(query)
+     result = cursor.fetchall()
+     field_names = [r[0] for r in cursor.description]
+     print(field_names)
+     return result
+
+
+ if __name__ == '__main__':
+     # create a connection to sql db called elections.db
+     conn = sqlite3.connect('../data/elections.db')
+
+     filename = r"../data/maha_results_2019.csv"
+     data = load_data_from_csv(filename, end=5)
+     # print(data)
+
+     res = load_data_from_csv_to_db(filename, conn)
+     # print(res)
+
+     query = "SELECT * FROM maha_2019 LIMIT 5;"
+     results = query_sql(conn, query)
+     print(results)
+
+     # keys = data.keys()
+     # for i, item in enumerate(data):
+     #     print(data[item])
+     # jdata = json.loads(data.to_json())
+     # print(jdata)
+
utils/prompts.py ADDED
@@ -0,0 +1,41 @@
+ from langchain_core.prompts import PromptTemplate
+ from langchain import hub
+
+ # react prompt
+ react_prompt = hub.pull("hwchase17/react")
+
+ # prompt to generate sql queries
+
+ sql_query_prompt = PromptTemplate.from_template(
+     """
+     You are a SQL Query Agent who has access to a database with the schema:
+     {db_schema},
+     For the given input: {input},
+
+     Generate SQL queries by analyzing the schema and the input. Make sure to answer all the questions in the input.
+     Generate multiple queries so that a detailed analysis can be done. Make sure the queries are valid and safe.
+
+     If there is no relevant query to generate, just generate a query to view the schema of the tables.
+     """
+ )
+
+ # prompt to summarize the SQL query results
+ sql_query_summary_prompt = PromptTemplate.from_template(
+     """
+     You are a Political Expert who is analyzing the results of the SQL queries executed on the election database.
+     The initial query: {query},
+     You are provided with the sql queries and their results. Analyze the results and summarize the key insights and answer the initial query.
+     If there are any errors in the execution of queries, analyze the errors and provide insights on the issues.
+     {results}
+     """
+ )
+
+ sql_query_visualization_prompt = PromptTemplate.from_template(
+     """
+     You are a Data Scientist who is visualizing the results of the SQL queries executed on the election database.
+     The initial query: {query},
+     You are provided with the sql queries and their results. Visualize the results and provide insights on the data using appropriate visualizations and formatting.
+     If there are any errors in the execution of queries, analyze the errors and provide insights on the issues.
+     {results}
+     """
+ )
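Note: sql_query_prompt is filled in by utils/query_generator.py with the schema dictionary returned by SQLRuntime.get_schemas(). A minimal sketch of that step, assuming the three tables have already been loaded into ./data/elections.db and noting that importing utils.prompts also pulls the ReAct prompt from the LangChain hub (the question string is only an illustration):

    from utils.sql_runtime import SQLRuntime
    from utils.prompts import sql_query_prompt

    runtime = SQLRuntime(dbname="./data/elections.db")
    schemas = runtime.get_schemas()  # {table_name: (column, column, ...), ...}

    # render the prompt string that sql_generator sends to the LLM
    prompt_text = sql_query_prompt.format(
        db_schema=schemas,
        input="Who won Nandurbar in 2019?"  # illustrative question
    )
    print(prompt_text)
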
utils/query_generator.py ADDED
@@ -0,0 +1,112 @@
+ from .sql_runtime import SQLRuntime
+ from pydantic import BaseModel, Field
+ from .load_llm import load_llm
+ from .prompts import sql_query_prompt, sql_query_summary_prompt, sql_query_visualization_prompt
+ from langchain_core.runnables import chain
+ from typing import Optional
+ from dotenv import load_dotenv
+
+ class Generated_query(BaseModel):
+     """
+     The SQL query to execute, make sure to use semicolon at the end of the query, do not execute harmful queries
+     """
+     queries: list[str] = Field(description="List of SQL queries to execute, use title case for strings, make sure to use semicolon at the end of each query, do not execute harmful queries")
+
+ class QuerySummary(BaseModel):
+     """
+     The summary of the SQL query results
+     """
+     summary: str = Field(description="The analysis of the SQL query results")
+     errors: list[str] = Field(description="The errors in the execution of the queries")
+     queries: list[str] = Field(description="The SQL queries executed and their results")
+
+ @chain
+ def sql_generator(input: dict) -> dict:
+
+     query, db_path = input["query"], input["db_path"]
+
+     sql_runtime = SQLRuntime(dbname=db_path)
+
+     query_generator_llm = load_llm().with_structured_output(Generated_query)
+
+     # getting the schemas
+     schemas = sql_runtime.get_schemas()
+
+     # chain to generate the queries
+     chain = sql_query_prompt | query_generator_llm
+
+     # executing the chain
+     gen_queries = chain.invoke({
+         "db_schema": schemas,
+         "input": query
+     })
+
+     # executing the queries
+     res = sql_runtime.execute_batch(gen_queries.queries)
+
+     # print(res)
+
+     return {
+         "input": query,
+         "results": res
+     }
+
+ @chain
+ def sql_formatter(input):
+     """
+     Formats the output of the SQL queries
+     """
+     output = []
+     for item in input["results"]:
+         if item["code"] == 0:
+             output.append(f"Query: {item['msg']['input']}, Result: {item['data']}")
+         else:
+             output.append(f"Query: {item['msg']['input']}, Error: {item['msg']['traceback']}")
+
+     # print(output)
+
+     return {
+         "query": input["input"],
+         "results": output
+     }
+
+ @chain
+ def analyze_results(input) -> QuerySummary:
+     """
+     Analyzes the results of the SQL queries executed on the election database
+     """
+     chain = sql_query_summary_prompt | load_llm().with_structured_output(QuerySummary)
+
+     # chain2 = sql_query_visualization_prompt | load_llm().with_structured_output(QuerySummary)
+
+     return chain.invoke({
+         "query": input["query"],
+         "results": input["results"]
+     })
+
+ if __name__ == '__main__':
+     load_dotenv()
+     # executing the queries
+     # results = sql_generator.invoke("Find the name of the candidate who got the maximum votes in Maharashtra elections 2019")
+
+     # for result in results:
+     #     print(f"Query: {result['msg']['input']}")
+     #     if result["code"] != 0:
+     #         print(f"Error executing query: {result['msg']['reason']}")
+     #         print(f"Traceback: {result['msg']['traceback']}")
+     #     else:
+     #         print(result["data"])
+     #     print("\n")
+
+     # formatting the output
+     res = sql_generator | sql_formatter | analyze_results
+
+     formatted_output = res.invoke(
+         {
+             "query": "What are the different party symbols in Maharashtra elections 2019, create a list of all the symbols",
+             "db_path": "./data/elections.db"
+         }
+     )
+     print(formatted_output.summary)
+     print(formatted_output.errors)
+     print(formatted_output.queries)
utils/react.py ADDED
@@ -0,0 +1,18 @@
+ from langchain import hub
+ from langchain.agents import AgentExecutor, create_react_agent
+
+ def run_agent_executor(agent_executor: AgentExecutor, input_data: dict):
+     for chunk in agent_executor.stream(input_data):
+         if "actions" in chunk:
+             for action in chunk["actions"]:
+                 print(f"Calling Tool: `{action.tool}` with input `{action.tool_input}`")
+         # Observation
+         elif "steps" in chunk:
+             for step in chunk["steps"]:
+                 print(f"Tool Result: `{step.observation}`")
+         # Final result
+         elif "output" in chunk:
+             print(f'Final Output: {chunk["output"]}')
+         else:
+             raise ValueError()
+         print("---")
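Note: run_agent_executor is only referenced from a commented-out line in utils/tools.py. A minimal sketch of how it could be wired up, mirroring the __main__ block of utils/tools.py, assuming the LLM API key is available in .env and that the working directory is chosen so the relative database path '../data/elections.db' hardcoded in the tools resolves:

    from dotenv import load_dotenv
    from langchain.agents import AgentExecutor, create_react_agent

    from utils.load_llm import load_llm
    from utils.prompts import react_prompt
    from utils.react import run_agent_executor
    from utils.tools import create_sql_agent_tools

    load_dotenv()
    tools = create_sql_agent_tools()
    agent = create_react_agent(load_llm(), tools, react_prompt)
    executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

    # streams intermediate tool calls, observations and the final answer
    run_agent_executor(executor, {"input": "Who won Nandurbar in the 2019 elections?"})
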
utils/sql_runtime.py ADDED
@@ -0,0 +1,136 @@
+ """
+ Runtime that accepts a sql statement and runs it on the SQLite database.
+ Returns the results of sql execution.
+ """
+ import traceback
+ import sqlite3
+
+ # MODIFY THE PATH BELOW FOR YOUR SYSTEM
+ my_db = r"../data/elections.db"
+
+ class SQLRuntime(object):
+     def __init__(self, dbname=None):
+         if dbname is None:
+             dbname = my_db
+         conn = sqlite3.connect(dbname)  # creating a connection
+         self.cursor = conn.cursor()  # we need the cursor to execute statement
+         return
+
+     def list_tables(self):
+         result = self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
+         table_names = sorted(list(zip(*result))[0])
+         return table_names
+
+     def get_schema_for_table(self, table_name):
+         result = self.cursor.execute("PRAGMA table_info('%s')" % table_name).fetchall()
+         column_names = list(zip(*result))[1]
+         return column_names
+
+     def get_schemas(self):
+         schemas = {}
+         table_names = self.list_tables()
+         for name in table_names:
+             fields = self.get_schema_for_table(name)  # fields of the table name
+             schemas[name] = fields
+         return schemas
+
+     def execute(self, statement):
+         code = 0
+         msg = {
+             "text": "SUCCESS",
+             "reason": None,
+             "traceback": None,
+         }
+         data = None
+
+         try:
+             self.cursor.execute(statement)
+         except sqlite3.OperationalError:
+             code = -1
+             msg = {
+                 "text": "ERROR: SQL execution error",
+                 "reason": "possibly due to incorrect table/fields names",
+                 "traceback": traceback.format_exc(),
+             }
+
+         if code == 0:
+             data = self.cursor.fetchall()
+
+         msg["input"] = statement
+
+         result = {
+             "code": code,
+             "msg": msg,
+             "data": data
+         }
+
+         return result
+
+     def execute_batch(self, queries):
+         results = []
+         for query in queries:
+             result = self.execute(query)
+             results.append(result)
+         return results
+
+     def post_process(self, data):
+         """
+         post process the data so that we can identify any harmful code and remove them.
+         Also, llm output may need an output parser.
+         :param data:
+         :return:
+         """
+         # IMPLEMENT YOUR CODE HERE FOR POST-PROCESSING and VALIDATION
+         return data
+
+
+ def sql_runtime(statement):
+     """
+     Instantiates a sql runtime and executes the given sql statement
+     :param statement: sql statement
+     """
+     SQL = SQLRuntime()
+     data = SQL.execute(statement)
+     return data
+
+
+ if __name__ == '__main__':
+     # stmt = """
+     # SELECT * FROM elections_2019;
+     # """
+     # stmt = input("Enter stmt: ")
+     sql = SQLRuntime()
+
+     tables = sql.list_tables()
+     print(tables)
+
+     schemas = {}
+     for table in tables:
+         schemas[table] = sql.get_schema_for_table(table)
+         print(f"Table: {table}, Schema: {schemas[table]}\n")
+
+     # data1 = sql.execute(stmt)
+
+     # dat = data1["data"]
+     # if dat is not None and len(dat) > 0:
+     #     for record in dat:
+     #         print(record)
+     #     print("-" * 100)
+
+     # sample question: find out the votes polled by NOTA for each instance of Akkalkuwa in the parliamentary elections 2019.
+     stmt = """
+     SELECT party_name, SUM(nota_votes)
+     FROM elections_2019
+     WHERE constituency='Akkalkuwa'
+     GROUP BY party_name;
+     """
+
+     data1 = sql.execute(stmt)
+
+     # print(data1)
+
+     dat = data1["data"]
+     if dat is not None and len(dat) > 0:
+         for record in dat:
+             print(record)
+         print("-" * 100)
utils/tools.py ADDED
@@ -0,0 +1,207 @@
+ from typing import List, Dict, Any, Optional, Type
+ from langchain_core.tools import BaseTool
+ from pydantic import BaseModel, Field
+ import pandas as pd
+ from .sql_runtime import SQLRuntime
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import ChatPromptTemplate
+ from .load_llm import load_llm
+ from langchain_core.messages import SystemMessage
+ from langchain_core.prompts import HumanMessagePromptTemplate
+ from langchain.agents import AgentExecutor, create_react_agent
+ from dotenv import load_dotenv
+ from .react import run_agent_executor
+ from .prompts import react_prompt
+
+ # defining the input schema
+ class QueryInput(BaseModel):
+     query: str = Field(..., description="The SQL query to execute, make sure to use semicolon at the end of the query, do not execute harmful queries")
+
+ class TableNameInput(BaseModel):
+     table_name: str = Field(..., description="The name of the table to analyze")
+
+ class ColumnSearchInput(BaseModel):
+     table_name: str = Field(..., description="The name of the table to search")
+     column_name: str = Field(..., description="The name of the column to search")
+     limit: int = Field(default=10, description="Maximum number of distinct values to return")
+
+ class SQLQueryTool(BaseTool):
+     name: str = "sql_query"
+     description: str = """
+     Execute a SQL query and return the results.
+     Use this when you need to run a specific SQL query on the elections database.
+     The query should be a valid SQL statement and should end with a semicolon.
+     There should be no harmful queries executed.
+     There are three tables in the database: elections_2019, elections_2024, maha_2019
+     """
+     args_schema: Type[BaseModel] = QueryInput
+
+     # def __init__(self, db_path: Optional[str] = None):
+     #     super().__init__()
+     #     self.
+
+     def _run(self, query: str) -> str:
+         sql_runtime = SQLRuntime('../data/elections.db')
+         try:
+             result = sql_runtime.execute(query)
+             if result["code"] != 0:
+                 return f"Error executing query: {result['msg']['reason']}"
+
+             # Convert to DataFrame for nice string representation
+             df = pd.DataFrame(result["data"])
+             if not df.empty:
+                 return df.to_string()
+             return "Query returned no results"
+
+         except Exception as e:
+             return f"Error: {str(e)}"
+
+ class TableInfoTool(BaseTool):
+     name: str = "get_table_info"
+     description: str = """
+     Get information about a specific table including its schema and basic statistics.
+     Use this when you need to understand the structure of a table or get basic statistics about it.
+     """
+     args_schema: Type[BaseModel] = TableNameInput
+
+     # def __init__(self, db_path: Optional[str] = None):
+     #     super().__init__()
+
+
+     def _run(self, table_name: str) -> str:
+         sql_runtime = SQLRuntime('../data/elections.db')
+         try:
+             # Get schema
+             schema = sql_runtime.get_schema_for_table(table_name)
+
+             # Get row count
+             count_query = f"SELECT COUNT(*) FROM {table_name}"
+             count_result = sql_runtime.execute(count_query)
+             row_count = count_result["data"][0][0] if count_result["code"] == 0 else "Error"
+
+             # Get sample data
+             sample_query = f"SELECT * FROM {table_name} LIMIT 3"
+             sample_result = sql_runtime.execute(sample_query)
+
+             info = f"""
+             Table: {table_name}
+             Columns: {', '.join(schema)}
+             Row Count: {row_count}
+             Sample Data:
+             {pd.DataFrame(sample_result['data'], columns=schema).to_string() if sample_result['code'] == 0 else 'Error getting sample data'}
+             """
+             return info
+         except Exception as e:
+             return f"Error getting table info: {str(e)}"
+
+ class ColumnValuesTool(BaseTool):
+     name: str = "find_column_values"
+     description: str = """
+     Find distinct values in a specific column of a table.
+     Use this when you need to know what unique values exist in a particular column.
+     """
+     args_schema: Type[BaseModel] = ColumnSearchInput
+
+     # def __init__(self, db_path: Optional[str] = None):
+     #     super().__init__()
+     #     self.sql_runtime = SQLRuntime(db_path)
+
+     def _run(self, table_name: str, column_name: str, limit: int = 10) -> str:
+         sql_runtime = SQLRuntime('../data/elections.db')
+         try:
+             query = f"""
+             SELECT DISTINCT {column_name}
+             FROM {table_name}
+             LIMIT {limit}
+             """
+             result = sql_runtime.execute(query)
+             if result["code"] != 0:
+                 return f"Error finding values: {result['msg']['reason']}"
+
+             values = [row[0] for row in result["data"]]
+             return f"Distinct values in {column_name}: {', '.join(map(str, values))}"
+         except Exception as e:
+             return f"Error: {str(e)}"
+
+ class ListTablesTool(BaseTool):
+     name: str = "list_tables"
+     description: str = """
+     List all available tables in the database.
+     Use this when you need to know what tables are available to query.
+     """
+
+     # def __init__(self, db_path: Optional[str] = None):
+     #     super().__init__()
+     #     self.sql_runtime = SQLRuntime(db_path)
+
+     def _run(self, *args, **kwargs) -> str:
+         sql_runtime = SQLRuntime('../data/elections.db')
+         try:
+             tables = sql_runtime.list_tables()
+             return f"Available tables: {', '.join(tables)}"
+         except Exception as e:
+             return f"Error listing tables: {str(e)}"
+
+ def create_sql_agent_tools(db_path: Optional[str] = '../data/elections.db') -> List[BaseTool]:
+     """
+     Create a list of all SQL tools for use with a Langchain agent.
+     """
+     return [
+         SQLQueryTool(),
+         TableInfoTool(),
+         # ColumnValuesTool(),
+         ListTablesTool()
+     ]
+
+ if __name__ == "__main__":
+     load_dotenv()
+     tools = create_sql_agent_tools()
+     for tool in tools:
+         print(f"Tool: {tool.name}")
+         print(f"Description: {tool.description}")
+         # print(f"Args Schema: {tool.args_schema.schema()}")
+
+
+     # prompt = ChatPromptTemplate.from_messages(
+     #     [
+     #         SystemMessage(
+     #             content="""
+     #             You are a sql agent who has access to a database with three tables: elections_2019, elections_2024, maha_2019.
+     #             You can use the following tools:
+     #             - sql_query: Execute a SQL query and return the results.
+     #             - get_table_info: Get information about a specific table including its schema and basic statistics.
+     #             - find_column_values: Find distinct values in a specific column of a table.
+     #             - list_tables: List all available tables in the database.
+
+     #             Answer the questions using the tools provided. Do not execute harmful queries.
+     #             """
+     #         ),
+     #         HumanMessagePromptTemplate.from_template("{text}"),
+     #     ]
+     # )
+
+
+     output_parser = StrOutputParser()
+
+     # Create the llm
+     llm = load_llm()
+
+     # llm.bind_tools(tools)
+
+     # res = llm.invoke("who won elections in maharashtra in Nandurbar in elections 2019? use the given tools")
+
+     # chain = prompt | llm | output_parser
+
+     # Create the ReAct agent
+     agent = create_react_agent(llm, tools, react_prompt)
+     # Create an agent executor by passing in the agent and tools
+     agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
+
+     print("Agent created successfully")
+
+     # Run the agent
+     # agent_executor.invoke({"input": "Who won the elections in 2019 for the state maharashtra in constituency Akkalkuwa?"})
+
+     res = agent_executor.invoke({"input": "who won elections in maharashtra in Nandurbar in elections 2019?"})
+
+     # run_agent_executor(agent_executor, {"input": "who won elections in maharashtra in Nandurbar in elections 2019?"})