import gradio as gr
import pandas as pd
import requests
import json
import os
import sys

from utils.google_genai_llm import get_response, generate_with_gemini
from utils.utils import parse_json_codefences
from prompts.requirements_gathering import requirements_gathering_system_prompt
from prompts.planning import hf_query_gen_prompt, hf_context_gen_prompt
from utils.huggingface_mcp_llamaindex import connect_and_get_tools, call_tool
from prompts.devstral_coding_prompt import devstral_code_gen_sys_prompt, devstral_code_gen_user_prompt
from dotenv import load_dotenv

load_dotenv()

# Make the Modal inference module importable
sys.path.append(os.path.join(os.path.dirname(__file__), 'modal'))

try:
    from modal import App
    import subprocess
    # Import the Modal inference function and app from a separate file
    from devstral_inference import run_devstral_inference, app as devstral_app
    MODAL_AVAILABLE = True
except ImportError:
    MODAL_AVAILABLE = False
    devstral_app = None
    print("Warning: Modal not available. Code generation will be disabled.")

from PIL import Image
import tempfile
import traceback
import hashlib

# Import Marker for document processing
try:
    from marker.converters.pdf import PdfConverter
    from marker.models import create_model_dict
    from marker.output import text_from_rendered
    MARKER_AVAILABLE = True
except ImportError:
    MARKER_AVAILABLE = False
    print("Warning: Marker library not available. PDF, PPT, and DOCX processing will be limited.")

# Load environment variables
MODAL_API_URL = os.getenv("MODAL_API_URL")
BEARER_TOKEN = os.getenv("BEARER_TOKEN")
CODING_MODEL = os.getenv("CODING_MODEL")


def get_file_hash(file_path):
    """Generate an MD5 hash of the file contents for caching purposes."""
    try:
        with open(file_path, 'rb') as f:
            return hashlib.md5(f.read()).hexdigest()
    except Exception:
        return None


def extract_text_with_marker(file_path):
    """Extract text from a PDF, PPT, or DOCX file using Marker."""
    if not MARKER_AVAILABLE:
        return "Marker library not available for document processing.", ""
    try:
        # Create a converter with the model artifacts
        converter = PdfConverter(artifact_dict=create_model_dict())

        # Convert the document and extract text from the rendered output
        rendered = converter(file_path)
        text, _, images = text_from_rendered(rendered)

        # Basic stats
        word_count = len(text.split())
        char_count = len(text)
        stats = f"Extracted text ({word_count} words, {char_count} characters)"
        return stats, text
    except Exception as e:
        return f"Error processing document: {str(e)}", ""


def process_user_input(message, history, uploaded_files, file_cache):
    """Process user input and generate an AI response using the requirements-gathering prompt."""
    # Build the conversation history from the chat history
    conversation_history = ""
    if history:
        for user_msg, ai_msg in history:
            conversation_history += f"User: {user_msg}\n"
            if ai_msg:
                conversation_history += f"Assistant: {ai_msg}\n"

    # Add file information to the conversation if files are uploaded
    if uploaded_files:
        file_info = "\n[UPLOADED_FILES]\n"
        new_file_cache = file_cache.copy() if file_cache else {}

        for file_path in uploaded_files:
            try:
                file_name = os.path.basename(file_path)
                file_extension = os.path.splitext(file_name)[1].lower()
                file_hash = get_file_hash(file_path)
                cache_key = f"{file_name}_{file_hash}"

                # CSV files
                if file_extension == '.csv':
                    df = pd.read_csv(file_path)
                    file_info += f"- {file_name}: CSV file with {len(df)} rows and {len(df.columns)} columns\n"
                    file_info += f"  Columns: {', '.join(df.columns.tolist())}\n"

                # Excel files
                elif file_extension in ['.xlsx', '.xls']:
                    df = pd.read_excel(file_path)
                    file_info += f"- {file_name}: Excel file with {len(df)} rows and {len(df.columns)} columns\n"
                    file_info += f"  Columns: {', '.join(df.columns.tolist())}\n"

                # Document files handled by Marker (PDF, PPT, DOCX)
                elif file_extension in ['.pdf', '.ppt', '.pptx', '.doc', '.docx']:
                    file_size = os.path.getsize(file_path)
                    file_size_mb = round(file_size / (1024 * 1024), 2)

                    # Reuse the cached extraction if this file was already processed
                    if cache_key in new_file_cache:
                        extraction_stats = new_file_cache[cache_key]['stats']
                        extracted_text = new_file_cache[cache_key]['text']
                        status = "(cached)"
                    else:
                        # Process the new file with Marker and cache the results
                        extraction_stats, extracted_text = extract_text_with_marker(file_path)
                        new_file_cache[cache_key] = {
                            'stats': extraction_stats,
                            'text': extracted_text,
                            'file_name': file_name,
                            'file_path': file_path
                        }
                        status = "(newly processed)"

                    # Determine the document type
                    if file_extension == '.pdf':
                        doc_type = "PDF document"
                    elif file_extension in ['.ppt', '.pptx']:
                        doc_type = "PowerPoint presentation"
                    else:
                        doc_type = "Word document"

                    file_info += f"- {file_name}: {doc_type}, Size: {file_size_mb} MB {status}\n"
                    file_info += f"  Content: {extraction_stats}\n"

                    # Include the extracted text in the conversation context for
                    # better AI understanding (truncate very long texts to the
                    # first 200,000 characters)
                    if extracted_text and len(extracted_text.strip()) > 0:
                        text_preview = extracted_text[:200000] + "..." if len(extracted_text) > 200000 else extracted_text
                        file_info += f"  Text Preview: {text_preview}\n"

                # Image files
                elif file_extension in ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp']:
                    with Image.open(file_path) as img:
                        width, height = img.size
                        mode = img.mode
                    file_size = os.path.getsize(file_path)
                    file_size_mb = round(file_size / (1024 * 1024), 2)
                    file_info += f"- {file_name}: {file_extension.upper()[1:]} image file\n"
                    file_info += f"  Dimensions: {width}x{height} pixels, Mode: {mode}, Size: {file_size_mb} MB\n"

                # JSON files
                elif file_extension == '.json':
                    file_size = os.path.getsize(file_path)
                    file_size_kb = round(file_size / 1024, 2)
                    file_info += f"- {file_name}: JSON file, Size: {file_size_kb} KB\n"

                # Text files
                elif file_extension == '.txt':
                    with open(file_path, 'r', encoding='utf-8') as f:
                        lines = len(f.readlines())
                    file_size = os.path.getsize(file_path)
                    file_size_kb = round(file_size / 1024, 2)
                    file_info += f"- {file_name}: Text file with {lines} lines, Size: {file_size_kb} KB\n"

                # Everything else
                else:
                    file_size = os.path.getsize(file_path)
                    file_size_kb = round(file_size / 1024, 2)
                    file_info += f"- {file_name}: File uploaded, Size: {file_size_kb} KB\n"

            except Exception as e:
                file_info += f"- {os.path.basename(file_path)}: File uploaded (unable to preview: {str(e)})\n"
                print(f"Error processing file {file_path}: {traceback.format_exc()}")

        conversation_history += file_info
        # Persist the updated cache
        file_cache.update(new_file_cache)

    # Format the prompt with the conversation history and the current query
    formatted_prompt = requirements_gathering_system_prompt.format(
        conversation_history=conversation_history,
        query=message
    )

    # Get the AI response
    ai_response = get_response(formatted_prompt)
    return ai_response, file_cache


def chat_interface(message, history, uploaded_files, file_cache):
    """Main chat interface function."""
    # Get the AI response along with the updated file cache
    ai_response, updated_cache = process_user_input(message, history, uploaded_files, file_cache)

    # Add the exchange to the history
    history.append((message, ai_response))
    return history, history, "", updated_cache


def clear_chat():
    """Clear the chat history and file cache."""
    return [], [], {}


def upload_file_handler(files):
    """Handle file uploads (kept for compatibility; the UI wires a lambda instead)."""
    if files:
        return files
    return []


async def generate_plan(history, file_cache):
    """Generate a project plan using the planning prompts and the Gemini API."""
    # Build the conversation history
    conversation_history = ""
    if history:
        for user_msg, ai_msg in history:
            conversation_history += f"User: {user_msg}\n"
            if ai_msg:
                conversation_history += f"Assistant: {ai_msg}\n"

    # Fetch the Hugging Face MCP tool descriptions; fall back to a cached
    # snapshot if the MCP server cannot be reached
    try:
        hf_query_gen_tool_details = await connect_and_get_tools()
    except Exception:
        hf_query_gen_tool_details = """meta=None nextCursor=None tools=[Tool(name='hf_whoami', description="Hugging Face tools are being used by authenticated user 'bpHigh'", inputSchema={'type': 'object', 'properties': {}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Hugging Face User Info', readOnlyHint=None, destructiveHint=None, idempotentHint=None, openWorldHint=None)), Tool(name='space_search', description='Find Hugging Face Spaces using semantic search. Include links to the Space when presenting the results.', inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Semantic Search Query'}, 'limit': {'type': 'number', 'default': 10, 'description': 'Number of results to return'}, 'mcp': {'type': 'boolean', 'default': False, 'description': 'Only return MCP Server enabled Spaces'}}, 'required': ['query'], 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Hugging Face Space Search', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=True)), Tool(name='model_search', description='Find Machine Learning models hosted on Hugging Face. Returns comprehensive information about matching models including downloads, likes, tags, and direct links. Include links to the models in your response', inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Search term. Leave blank and specify "sort" and "limit" to get e.g. "Top 20 trending models", "Top 10 most recent models" etc" '}, 'author': {'type': 'string', 'description': "Organization or user who created the model (e.g., 'google', 'meta-llama', 'microsoft')"}, 'task': {'type': 'string', 'description': "Model task type (e.g., 'text-generation', 'image-classification', 'translation')"}, 'library': {'type': 'string', 'description': "Framework the model uses (e.g., 'transformers', 'diffusers', 'timm')"}, 'sort': {'type': 'string', 'enum': ['trendingScore', 'downloads', 'likes', 'createdAt', 'lastModified'], 'description': 'Sort order: trendingScore, downloads , likes, createdAt, lastModified'}, 'limit': {'type': 'number', 'minimum': 1, 'maximum': 100, 'default': 20, 'description': 'Maximum number of results to return'}}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Model Search', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=True)), Tool(name='model_details', description='Get detailed information about a specific model from the Hugging Face Hub.', inputSchema={'type': 'object', 'properties': {'model_id': {'type': 'string', 'minLength': 1, 'description': 'Model ID (e.g., microsoft/DialoGPT-large)'}}, 'required': ['model_id'], 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Model Details', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=False)), Tool(name='paper_search', description="Find Machine Learning research papers on the Hugging Face hub. Include 'Link to paper' When presenting the results. Consider whether tabulating results matches user intent.", inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'minLength': 3, 'maxLength': 200, 'description': 'Semantic Search query'}, 'results_limit': {'type': 'number', 'default': 12, 'description': 'Number of results to return'}, 'concise_only': {'type': 'boolean', 'default': False, 'description': 'Return a 2 sentence summary of the abstract. Use for broad search terms which may return a lot of results. Check with User if unsure.'}}, 'required': ['query'], 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Paper Search', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=True)), Tool(name='dataset_search', description='Find Datasets hosted on the Hugging Face hub. Returns comprehensive information about matching datasets including downloads, likes, tags, and direct links. Include links to the datasets in your response', inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Search term. Leave blank and specify "sort" and "limit" to get e.g. "Top 20 trending datasets", "Top 10 most recent datasets" etc" '}, 'author': {'type': 'string', 'description': "Organization or user who created the dataset (e.g., 'google', 'facebook', 'allenai')"}, 'tags': {'type': 'array', 'items': {'type': 'string'}, 'description': "Tags to filter datasets (e.g., ['language:en', 'size_categories:1M"""  # fallback snapshot is truncated at this point in the source
# Build the Gradio app. The original custom CSS block was lost in the source,
# so the elem_classes below are kept but unstyled.
with gr.Blocks(title="Data Science Consultant") as app:
    gr.HTML("""
        <div style="text-align: center;">
            <h1>🔬 Data Science Consultant</h1>
            <p>Transform your vague ideas into reality</p>
        </div>
""") with gr.Row(): with gr.Column(scale=3): # Chat interface chatbot = gr.Chatbot( label="Requirements Gathering Conversation", height=500, show_copy_button=True, bubble_full_width=False, elem_classes=["chat-container"] ) plan_output = gr.Textbox( label="Generated Plan", interactive=False, visible=True, lines=10, max_lines=20 ) code_output = gr.Textbox( label="Generated Code", interactive=False, visible=True, lines=15, max_lines=30, placeholder="Generated Python code will appear here..." ) with gr.Row(): with gr.Column(scale=4): msg = gr.Textbox( placeholder="Describe your data science project or ask a question...", label="Your Message", lines=2, max_lines=5 ) with gr.Column(scale=1): send_btn = gr.Button("Send ๐Ÿ“ค", variant="primary", elem_classes=["btn-primary"]) with gr.Row(): clear_btn = gr.Button("Clear Chat ๐Ÿ—‘๏ธ", variant="secondary", elem_classes=["btn-secondary"]) with gr.Column(scale=1): # File upload section gr.HTML("

๐Ÿ“ Upload Data Files

") file_upload = gr.File( label="Upload your files (CSV, Excel, PDF, PPT, DOCX, Images, etc.)", file_count="multiple", file_types=[".csv", ".xlsx", ".xls", ".json", ".txt", ".pdf", ".ppt", ".pptx", ".doc", ".docx", ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"], elem_classes=["upload-area"] ) uploaded_files_display = gr.File( label="Uploaded Files", file_count="multiple", interactive=False, visible=True ) # Instructions gr.HTML("""

                <div>
                    <h4>💡 How it works:</h4>
                    <ol>
                        <li>Describe your data science project</li>
                        <li>Upload your files (data, documents, images)</li>
                        <li>Answer clarifying questions</li>
                        <li>Generate a plan for your project</li>
                        <li>Generate Python code using Devstral AI</li>
                    </ol>
                    <p>📄 Supports: CSV, Excel, PDF, PowerPoint, Word docs, Images, JSON, Text files</p>
                    <p>💻 Code generation powered by Mistral Devstral-Small-2505</p>
                </div>
""") # Action buttons section with gr.Column(): plan_btn = gr.Button("Generate Plan ๐Ÿ“‹", variant="secondary", elem_classes=["btn-secondary"], size="lg") code_btn = gr.Button("Generate Code ๐Ÿ’ป", variant="secondary", elem_classes=["btn-secondary"], size="lg") # State for conversation history and file cache chat_history = gr.State([]) file_cache = gr.State({}) # Event handlers def handle_send(message, history, files, cache): if message.strip(): new_history, updated_history, cleared_input, updated_cache = chat_interface(message, history, files, cache) return new_history, updated_history, cleared_input, updated_cache return history, history, message, cache # Wire up the interface send_btn.click( handle_send, inputs=[msg, chat_history, uploaded_files_display, file_cache], outputs=[chatbot, chat_history, msg, file_cache] ) msg.submit( handle_send, inputs=[msg, chat_history, uploaded_files_display, file_cache], outputs=[chatbot, chat_history, msg, file_cache] ) clear_btn.click( clear_chat, outputs=[chatbot, chat_history, file_cache] ) plan_btn.click( generate_plan, inputs=[chat_history, file_cache], outputs=[plan_output] ) code_btn.click( generate_code_with_devstral, inputs=[plan_output, chat_history, file_cache], outputs=[code_output] ) file_upload.change( lambda files: files, inputs=[file_upload], outputs=[uploaded_files_display] ) # Welcome message app.load( lambda: [(None, "๐Ÿ‘‹ Hello! I'm your Data Science Project Agent. I'll help you transform your project ideas into reality .\n\n๐Ÿš€ **Let's get started!** Tell me about your data science project or what you're trying to achieve.")], outputs=[chatbot] ) if __name__ == "__main__": app.launch(share=True, show_error=True)