sandeep-huggingface committed
Commit af2cce2 · verified · 1 Parent(s): 4745ef7
Files changed (1)
  1. app.py +454 -0
app.py ADDED
@@ -0,0 +1,454 @@
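"""RAG CSV Chatbot.

Gradio app: upload CSV files, index them into a vector store (FAISS, Chroma,
or Qdrant), and chat with the data through a conversational retrieval chain
backed by a Hugging Face inference endpoint.
"""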
import gradio as gr
import os
import pandas as pd
from typing import List
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
from langchain_community.vectorstores import FAISS, Chroma, Qdrant
from langchain.chains import ConversationalRetrievalChain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpoint
from langchain.memory import ConversationBufferMemory
from langchain.schema import Document

list_llm = ["meta-llama/Meta-Llama-3-8B-Instruct", "mistralai/Mistral-7B-Instruct-v0.2"]
list_llm_simple = [os.path.basename(llm) for llm in list_llm]
api_token = os.getenv("HF_TOKEN")

CHUNK_SIZES = {
    "small": {"recursive": 512, "fixed": 512, "token": 256},
    "medium": {"recursive": 1024, "fixed": 1024, "token": 512}
}

def get_text_splitter(strategy: str, chunk_size: int = 1024, chunk_overlap: int = 64):
    """Get text splitter based on strategy"""
    splitters = {
        "recursive": RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        ),
        "fixed": CharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        ),
        "token": TokenTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
    }
    return splitters.get(strategy)

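# Illustrative usage (not called directly by the app; load_doc below wires the
# presets in): build a recursive splitter at the "small" preset and split docs.
#   splitter = get_text_splitter("recursive", chunk_size=512, chunk_overlap=64)
#   chunks = splitter.split_documents(docs)
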
def csv_to_documents(file_path: str) -> List[Document]:
    """Convert CSV file to LangChain documents with enhanced metadata"""
    try:
        # Read CSV file
        df = pd.read_csv(file_path)

        # Get basic info about the CSV
        filename = os.path.basename(file_path)
        total_rows = len(df)
        columns = list(df.columns)

        documents = []

        # Create documents from each row
        for idx, row in df.iterrows():
            # Create a readable text representation of the row
            row_text_parts = []

            # Add column headers and values
            for col in df.columns:
                value = str(row[col]) if pd.notna(row[col]) else "N/A"
                row_text_parts.append(f"{col}: {value}")

            # Combine all column-value pairs
            content = "\n".join(row_text_parts)

            # Create document with rich metadata
            doc = Document(
                page_content=content,
                metadata={
                    "source": filename,
                    "row": idx + 1,  # 1-based row numbering
                    "total_rows": total_rows,
                    "columns": ", ".join(columns),
                    "file_path": file_path
                }
            )
            documents.append(doc)

        return documents

    except Exception as e:
        print(f"Error processing CSV file {file_path}: {str(e)}")
        return []

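# For a CSV with columns "name,score", each row becomes a Document roughly like:
#   page_content = "name: Alice\nscore: 91"
#   metadata = {"source": "data.csv", "row": 1, "total_rows": 2, ...}
# ("name", "score", "Alice", "data.csv" are made-up example values.)
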
def load_doc(list_file_path: List[str], splitting_strategy: str, chunk_size: str):
    """Load and process CSV documents"""
    chunk_size_value = CHUNK_SIZES[chunk_size][splitting_strategy]

    # Load all CSV files and convert to documents
    all_documents = []
    for file_path in list_file_path:
        documents = csv_to_documents(file_path)
        all_documents.extend(documents)

    if not all_documents:
        return []

    # Apply text splitting
    text_splitter = get_text_splitter(splitting_strategy, chunk_size_value)
    doc_splits = text_splitter.split_documents(all_documents)

    return doc_splits

def create_db(splits, db_choice: str = "faiss"):
    """Create vector database from document splits"""
    embeddings = HuggingFaceEmbeddings()
    db_creators = {
        "faiss": lambda: FAISS.from_documents(splits, embeddings),
        "chroma": lambda: Chroma.from_documents(splits, embeddings),
        "qdrant": lambda: Qdrant.from_documents(
            splits,
            embeddings,
            location=":memory:",
            collection_name="csv_docs"
        )
    }
    return db_creators[db_choice]()

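# Sketch of direct use (the app calls this via initialize_database): an
# in-memory FAISS index over pre-split docs, queried outside any chain.
#   db = create_db(doc_splits, "faiss")
#   hits = db.similarity_search("revenue by region", k=3)
# ("revenue by region" is a placeholder query.)
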
def initialize_database(list_file_obj, splitting_strategy, chunk_size, db_choice, progress=gr.Progress()):
    """Initialize vector database with error handling"""
    try:
        if not list_file_obj:
            return None, "No files uploaded. Please upload CSV documents first."

        list_file_path = [x.name for x in list_file_obj if x is not None]
        if not list_file_path:
            return None, "No valid files found. Please upload CSV documents."

        # Validate that all files are CSV
        non_csv_files = [path for path in list_file_path if not path.lower().endswith('.csv')]
        if non_csv_files:
            return None, f"Non-CSV files detected: {', '.join([os.path.basename(f) for f in non_csv_files])}. Please upload only CSV files."

        progress(0.2, desc="Loading CSV files...")
        doc_splits = load_doc(list_file_path, splitting_strategy, chunk_size)

        if not doc_splits:
            return None, "No content extracted from CSV documents. Please check if the files contain data."

        progress(0.6, desc="Creating vector database...")
        vector_db = create_db(doc_splits, db_choice)

        progress(1.0, desc="Database created successfully!")

        num_files = len(list_file_path)
        num_chunks = len(doc_splits)
        file_names = [os.path.basename(f) for f in list_file_path]

        success_msg = (f"Database created successfully!\n"
                       f"📁 Files processed: {num_files} ({', '.join(file_names)})\n"
                       f"📊 Document chunks: {num_chunks}\n"
                       f"🔧 Strategy: {splitting_strategy} splitting\n"
                       f"💾 Database: {db_choice}")

        return vector_db, success_msg

    except Exception as e:
        return None, f"Error creating database: {str(e)}"

def initialize_llmchain(llm_choice, temperature, max_tokens, top_k, vector_db, progress=gr.Progress()):
    """Initialize LLM chain with error handling"""
    try:
        if vector_db is None:
            return None, "Please create vector database first."

        progress(0.3, desc="Initializing LLM...")
        llm_model = list_llm[llm_choice]

        llm = HuggingFaceEndpoint(
            repo_id=llm_model,
            huggingfacehub_api_token=api_token,
            temperature=temperature,
            max_new_tokens=max_tokens,
            top_k=top_k
        )

        progress(0.7, desc="Setting up memory and retriever...")
        memory = ConversationBufferMemory(
            memory_key="chat_history",
            output_key='answer',
            return_messages=True
        )

        # The Top K slider also sets retrieval depth, matching its UI description
        retriever = vector_db.as_retriever(search_kwargs={"k": int(top_k)})
        qa_chain = ConversationalRetrievalChain.from_llm(
            llm,
            retriever=retriever,
            memory=memory,
            return_source_documents=True
        )

        progress(1.0, desc="LLM initialized successfully!")

        success_msg = (f"LLM initialized successfully!\n"
                       f"🤖 Model: {os.path.basename(llm_model)}\n"
                       f"🌡️ Temperature: {temperature}\n"
                       f"📝 Max tokens: {max_tokens}\n"
                       f"🔍 Top K: {top_k}")

        return qa_chain, success_msg

    except Exception as e:
        return None, f"Error initializing LLM: {str(e)}"

def conversation(qa_chain, message, history):
    """Conversation function with CSV-specific source formatting"""
    try:
        response = qa_chain.invoke({
            "question": message,
            "chat_history": [(hist[0], hist[1]) for hist in history]
        })

        response_answer = response["answer"]
        if "Helpful Answer:" in response_answer:
            response_answer = response_answer.split("Helpful Answer:")[-1].strip()

        # Get source documents
        sources = response["source_documents"][:3]
        source_contents = []
        source_info = []

        for source in sources:
            # Format source content for CSV data
            content = source.page_content.strip()
            metadata = source.metadata

            # Create readable source info for CSV
            source_file = metadata.get("source", "Unknown")
            row_num = metadata.get("row", 0)

            source_contents.append(content)
            source_info.append(f"File: {source_file} | Row: {row_num}")

        # Pad with empty values if needed
        while len(source_contents) < 3:
            source_contents.append("")
            source_info.append("No source")

        return (
            qa_chain,
            gr.update(value=""),
            history + [(message, response_answer)],
            source_contents[0],
            source_info[0],
            source_contents[1],
            source_info[1],
            source_contents[2],
            source_info[2]
        )

    except Exception as e:
        error_msg = f"Error in conversation: {str(e)}"
        return (
            qa_chain,
            gr.update(value=""),
            history + [(message, error_msg)],
            "", "Error", "", "Error", "", "Error"
        )

def demo():
    with gr.Blocks(theme=gr.themes.Default(primary_hue="green", secondary_hue="blue", neutral_hue="slate")) as demo:
        vector_db = gr.State()
        qa_chain = gr.State()

        gr.HTML("<center><h1>📊 RAG CSV Chatbot</h1></center>")
        gr.HTML("<center><p>Upload CSV files and chat with your data using advanced RAG techniques</p></center>")

        with gr.Row():
            with gr.Column(scale=86):
                gr.Markdown("### 📝 Step 1 - Configure and Initialize RAG Pipeline")

                document = gr.Files(
                    height=300,
                    file_count="multiple",
                    file_types=[".csv"],
                    interactive=True,
                    label="Upload CSV documents",
                    elem_id="file_upload"
                )

                with gr.Row():
                    splitting_strategy = gr.Radio(
                        ["recursive", "fixed", "token"],
                        label="Text Splitting Strategy",
                        value="recursive",
                        info="How to split CSV data into chunks"
                    )
                    db_choice = gr.Radio(
                        ["faiss", "chroma", "qdrant"],
                        label="Vector Database",
                        value="faiss",
                        info="Vector storage backend"
                    )
                    chunk_size = gr.Radio(
                        ["small", "medium"],
                        label="Chunk Size",
                        value="medium",
                        info="Size of text chunks for processing"
                    )

                with gr.Row():
                    db_btn = gr.Button("🔄 Create Vector Database", variant="primary")

                db_progress = gr.Textbox(
                    value="❌ Not initialized - Please upload CSV files and create database",
                    show_label=False,
                    interactive=False,
                    lines=4
                )

                gr.Markdown("### 🤖 Step 2 - Configure LLM")

                llm_choice = gr.Radio(
                    list_llm_simple,
                    label="Available LLMs",
                    value=list_llm_simple[0],
                    type="index",
                    info="Choose the language model for responses"
                )

                with gr.Accordion("🔧 LLM Parameters", open=False):
                    temperature = gr.Slider(
                        minimum=0.01,
                        maximum=1.0,
                        value=0.5,
                        step=0.1,
                        label="Temperature",
                        info="Controls randomness in responses"
                    )
                    max_tokens = gr.Slider(
                        minimum=128,
                        maximum=4096,
                        value=2048,
                        step=128,
                        label="Max Tokens",
                        info="Maximum length of generated responses"
                    )
                    top_k = gr.Slider(
                        minimum=1,
                        maximum=10,
                        value=3,
                        step=1,
                        label="Top K",
                        info="Number of top documents to retrieve"
                    )

                with gr.Row():
                    init_llm_btn = gr.Button("🚀 Initialize LLM", variant="primary", interactive=False)

                llm_progress = gr.Textbox(
                    value="❌ Not initialized - Please create database first",
                    show_label=False,
                    interactive=False,
                    lines=4
                )

            with gr.Column(scale=200):
                gr.Markdown("### 💬 Step 3 - Chat with Your CSV Data")

                chatbot = gr.Chatbot(
                    height=505,
                    show_label=False,
                    elem_id="chatbot",
                    placeholder="Your conversation will appear here after initializing the system..."
                )

                with gr.Accordion("📋 Source References", open=False):
                    gr.Markdown("*Top 3 most relevant sources from your CSV data:*")
                    with gr.Row():
                        with gr.Column():
                            source1 = gr.Textbox(label="📄 Source 1", lines=3, interactive=False)
                            info1 = gr.Textbox(label="ℹ️ Source 1 Info", interactive=False)
                    with gr.Row():
                        with gr.Column():
                            source2 = gr.Textbox(label="📄 Source 2", lines=3, interactive=False)
                            info2 = gr.Textbox(label="ℹ️ Source 2 Info", interactive=False)
                    with gr.Row():
                        with gr.Column():
                            source3 = gr.Textbox(label="📄 Source 3", lines=3, interactive=False)
                            info3 = gr.Textbox(label="ℹ️ Source 3 Info", interactive=False)

                with gr.Row():
                    msg = gr.Textbox(
                        placeholder="Ask questions about your CSV data... (e.g., 'What are the main trends?', 'Summarize the key findings', 'What patterns do you see?')",
                        show_label=False,
                        scale=4,
                        interactive=False
                    )
                    submit_btn = gr.Button("📤 Send", scale=1, interactive=False)

                with gr.Row():
                    clear_btn = gr.ClearButton(
                        [msg, chatbot, source1, info1, source2, info2, source3, info3],
                        value="🗑️ Clear Chat",
                        scale=1
                    )

                gr.Markdown("### 💡 Tips for Better Results")
                gr.Markdown("""
                - **Ask specific questions** about your data (e.g., "What are the highest values in column X?")
                - **Request summaries** (e.g., "Summarize the key insights from this dataset")
                - **Compare data** (e.g., "Compare categories A and B")
                - **Ask for trends** (e.g., "What patterns do you see over time?")
                """)

        # Event handlers
        db_btn.click(
            initialize_database,
            inputs=[document, splitting_strategy, chunk_size, db_choice],
            outputs=[vector_db, db_progress]
        ).then(
            lambda x: gr.update(interactive=True) if x is not None else gr.update(interactive=False),
            inputs=[vector_db],
            outputs=[init_llm_btn]
        )

        init_llm_btn.click(
            initialize_llmchain,
            inputs=[llm_choice, temperature, max_tokens, top_k, vector_db],
            outputs=[qa_chain, llm_progress]
        ).then(
            lambda x: [gr.update(interactive=True), gr.update(interactive=True)] if x is not None else [gr.update(interactive=False), gr.update(interactive=False)],
            inputs=[qa_chain],
            outputs=[msg, submit_btn]
        )

        msg.submit(
            conversation,
            inputs=[qa_chain, msg, chatbot],
            outputs=[qa_chain, msg, chatbot, source1, info1, source2, info2, source3, info3]
        )

        submit_btn.click(
            conversation,
            inputs=[qa_chain, msg, chatbot],
            outputs=[qa_chain, msg, chatbot, source1, info1, source2, info2, source3, info3]
        )

        clear_btn.click(
            lambda: [[], "", "", "", "", "", ""],
            outputs=[chatbot, source1, info1, source2, info2, source3, info3]
        )

    demo.queue().launch(debug=True)

if __name__ == "__main__":
    demo()
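
# To run locally, the HF_TOKEN environment variable (read above via
# os.getenv) must hold a Hugging Face API token with access to the two
# hosted models; then: python app.py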