"""InsightAI: a Gradio chat UI that turns natural-language questions into Pandas code.

Pipeline, as implemented below:

1. Load the dataset CSV and a CSV of historical (request, code) pairs.
2. Index the historical requests in FAISS so a similar past request can be found.
3. Build a chat prompt for a local text-generation model.
4. Extract the code from the model's reply, execute it, and show the code plus
   any Matplotlib figure it produced in the Gradio interface.
"""
import contextlib
import io
import logging
import unittest
from unittest.mock import patch

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.memory import ConversationBufferMemory
from langchain.schema.runnable import RunnableLambda
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

logger = logging.getLogger(__name__)

# Where the most recently generated snippet is written for inspection.
GENERATED_CODE_PATH = '/content/generated_code.py'

df = pd.read_csv('Global_Superstore2.csv', encoding='ISO-8859-1')
schema_info = "\n".join([f"- `{col}` ({dtype})" for col, dtype in df.dtypes.items()])

history_df = pd.read_csv('sample_requests_and_code_300plus.csv')
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# Store each request's row position as metadata; generate_prompt() relies on
# metadata['index'] to map a retrieved request back to its code in history_df.
faiss_index = FAISS.from_texts(
    history_df['request'].tolist(),
    embeddings,
    metadatas=[{"index": row} for row in range(len(history_df))],
)
retriever = faiss_index.as_retriever()

# Load the model
model_name = "neuralmagic/Llama-2-7b-chat-quantized.w4a16"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create a text-generation pipeline
small_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    device_map="auto",
    max_new_tokens=250,
    temperature=0.2,
    top_p=0.9,
    do_sample=True,
    repetition_penalty=1.1,
    pad_token_id=tokenizer.eos_token_id
)

llm = HuggingFacePipeline(pipeline=small_pipeline)
memory = ConversationBufferMemory()
retrieval_qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff")


def generate_prompt(user_query, schema_info):
    """Build the chat-formatted prompt sent to the code-generation model.

    Args:
        user_query: The question typed by the user.
        schema_info: Text description of the dataset columns and dtypes.

    Returns:
        The prompt string produced by the tokenizer's chat template, with the
        generation prompt appended.
    """
    retrieved_docs = retrieval_qa.run(user_query)

    # Look up the single most similar historical request and, through the row
    # index stored in its metadata, the code that answered it.
    similar_code = ""
    nearest = faiss_index.similarity_search(user_query, k=1)
    if nearest:
        idx = nearest[0].metadata.get('index')
        if idx is not None:
            candidate = history_df.iloc[idx]['code']
            # Skip non-string cells (e.g. missing values) rather than
            # injecting their repr into the prompt.
            if isinstance(candidate, str):
                similar_code = candidate

    system_lines = [
        "You are an expert data analyst. Your response MUST:",
        "- Return ONLY valid Python Pandas code (no text, no introductions, "
        "no explanations, no extra comments).",
        "- ⚠️ Start IMMEDIATELY with the Python code block.",
        "- ⚡ Use proper parentheses when using logical operators (&, |) in Pandas conditions.",
        "- Always include necessary import statements.",
        "- ⚡ Do NOT add ANY extra lines, comments, or explanations.",
    ]
    if similar_code:
        system_lines.append(f"- Reference similar code: {similar_code}")

    user_content = "\n".join([
        "Dataset Schema:",
        str(schema_info),
        "",
        "Related historical context:",
        str(retrieved_docs),
        "",
        "Query:",
        str(user_query),
    ])

    messages = [
        {"role": "system", "content": "\n".join(system_lines)},
        {"role": "user", "content": user_content},
    ]
    prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    return prompt


def execute_generated_code(code):
    """Run generated code and return the Matplotlib figure it drew, if any.

    SECURITY NOTE: this exec()s model-generated code inside this process with
    access to the module's globals. Only run it in a trusted or sandboxed
    environment.

    Args:
        code: Python source to execute.

    Returns:
        A PIL image of the current figure when the code left at least one
        figure open; otherwise None (also when the code raised).
    """
    # One namespace for both globals and locals: with two separate dicts,
    # exec() scopes the code like a class body, so functions defined by the
    # snippet cannot see the snippet's own imports and variables. A shallow
    # copy keeps top-level assignments from leaking into this module.
    namespace = dict(globals())
    captured = io.StringIO()
    image = None
    failure = None

    plt.close('all')
    with contextlib.redirect_stdout(captured), contextlib.redirect_stderr(captured):
        try:
            exec(code, namespace)
            if plt.get_fignums():
                buf = io.BytesIO()
                plt.savefig(buf, format='png')
                buf.seek(0)
                image = Image.open(buf)
        except Exception as exc:  # best effort: a bad snippet must not crash the UI
            failure = exc

    # Log after the redirection ends so the message is not swallowed by it.
    if failure is not None:
        logger.warning(
            "Generated code failed; captured output: %s",
            captured.getvalue(),
            exc_info=failure,
        )
        return None
    return image


def _extract_code(response):
    """Return the contents of the first fenced code block in *response*.

    Prefers a ```python fence, falls back to a bare ``` fence, and returns the
    stripped text unchanged when no fence is present.
    """
    text = response.strip()
    if "```python" in text:
        return text.split("```python")[1].split("```", 1)[0].strip()
    if "```" in text:
        return text.split("```", 1)[1].split("```", 1)[0].strip()
    return text


def process_query(user_query):
    """Generate Pandas code for *user_query*.

    Args:
        user_query: The question typed by the user.

    Returns:
        The code extracted from the model's reply, as a string.
    """
    prompt = generate_prompt(user_query, schema_info)
    response = llm.invoke(prompt)
    return _extract_code(response)


def gradio_chat_interface(history, query):
    """Gradio event handler: answer *query* and stream updates to the UI.

    Yields twice: first a placeholder message, then the generated code and
    the rendered figure.

    Args:
        history: Chat history as a list of (user, bot) tuples.
        query: The text submitted by the user.

    Yields:
        Tuples of (history, image or None, "") matching the outputs
        [chatbot, plot_output, query_input]; the empty string clears the box.
    """
    if history is None:
        history = []
    history.append((query, "⏳ **Processing...**"))
    yield history, None, ""

    generated_code = process_query(query)

    # Saving the snippet is a convenience; failing to write it (for example
    # when the target directory does not exist) must not abort the reply.
    try:
        with open(GENERATED_CODE_PATH, 'w', encoding='utf-8') as f:
            f.write(generated_code)
    except OSError:
        logger.warning("Could not write %s", GENERATED_CODE_PATH, exc_info=True)

    image = execute_generated_code(generated_code)
    history[-1] = (query, f"```python\n{generated_code}\n```")
    yield history, image, ""


with gr.Blocks() as demo:
    gr.Markdown("""
# **Interactive Pandas Chat with InsightAI**
💬 **Talk to your data, get instant answers!**

🔍 Explore your dataset! 💻 Instantly view generated Pandas code.
📊 Get accurate responses with RAG-enhanced retrieval. 📈 Live visualizations update on the right.
""")

    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="Chat with RAG & Historical Context Expert")
            query_input = gr.Textbox(placeholder="Type your query and press Enter...", label="Your Query")
        with gr.Column(scale=2):
            plot_output = gr.Image(label="📊 Visualization", height=500)

    query_input.submit(
        fn=gradio_chat_interface,
        inputs=[chatbot, query_input],
        outputs=[chatbot, plot_output, query_input]
    )

demo.launch()