triAGI-Coder

Runtime error

App Files Community

acecalisto3 committed on Dec 1, 2024

Commit

40667c5

verified ·

1 Parent(s): 9ff74fe

Update app.py

Browse files

Files changed (1) hide show

app.py +196 -107

app.py CHANGED Viewed

@@ -1,138 +1,227 @@
-import gradio as gr
-from transformers import pipeline, AutoModelForCausalLM
 import os
 import json
-import time
 import logging
 from threading import Lock
-CONFIG_FILE = "config.json"
-MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-CACHE_DIR = "model_cache"  # Directory for storing model cache
-# Create cache directory if it doesn't exist
-os.makedirs(CACHE_DIR, exist_ok=True)
-# Setup logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-messages = [
-    {"role": "user", "content": "Who are you?"},
-]
 class EnhancedChatbot:
     def __init__(self):
         self.model = None
-        self.config = self.load_config()
         self.model_lock = Lock()
-        self.load_model()
     def load_config(self):
-        if os.path.exists(CONFIG_FILE):
-            with open(CONFIG_FILE, 'r') as f:
-                return json.load(f)
-        return {
-            "model_name": MODEL_NAME,
-            "max_tokens": 512,
             "temperature": 0.7,
             "top_p": 0.95,
-            "system_message": "You are a friendly and helpful AI assistant.",
-            "gpu_layers": 0
         }
-    def save_config(self):
-        with open(CONFIG_FILE, 'w') as f:
-            json.dump(self.config, f, indent=2)
     def load_model(self):
         try:
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.config["model_name"],
-                model_type="llama",
-                gpu_layers=self.config["gpu_layers"],
-                cache_dir=CACHE_DIR
             )
-            logging.info(f"Model loaded successfully: {self.config['model_name']}")
         except Exception as e:
             logging.error(f"Error loading model: {str(e)}")
             raise
-    def generate_response(self, message, history, system_message, max_tokens, temperature, top_p):
-        prompt = f"{system_message}\n\n"
-        for user_msg, assistant_msg in history:
-            prompt += f"Human: {user_msg}\nAssistant: {assistant_msg}\n"
-        prompt += f"Human: {message}\nAssistant: "
-        start_time = time.time()
-        with self.model_lock:
-            generated_text = self.model(
-                prompt,
-                max_new_tokens=max_tokens,
-                temperature=temperature,
-                top_p=top_p,
-            )
-        end_time = time.time()
-        response_time = end_time - start_time
-        logging.info(f"Response generated in {response_time:.2f} seconds")
-        return generated_text.strip()
-chatbot = EnhancedChatbot()
-def respond(message, history, system_message, max_tokens, temperature, top_p):
-    try:
-        response = chatbot.generate_response(message, history, system_message, max_tokens, temperature, top_p)
-        yield response
-    except Exception as e:
-        logging.error(f"Error generating response: {str(e)}")
-        yield "I apologize, but I encountered an error while processing your request. Please try again."
-def update_model_config(model_name, gpu_layers):
-    chatbot.config["model_name"] = model_name
-    chatbot.config["gpu_layers"] = gpu_layers
-    chatbot.save_config()
-    chatbot.load_model()
-    return f"Model updated to {model_name} with {gpu_layers} GPU layers."
-def update_system_message(system_message):
-    chatbot.config["system_message"] = system_message
-    chatbot .save_config()
-    return f"System message updated: {system_message}"
-with gr.Blocks() as demo:
-    gr.Markdown("# Enhanced AI Chatbot")
-    with gr.Tab("Chat"):
-        chatbot_interface= gr.ChatInterface(
-            respond,
-            additional_inputs=[
-                gr.Textbox(value=chatbot.config["system_message"], label="System message"),
-                gr.Slider(minimum=1, maximum=2048, value=chatbot.config["max_tokens"], step=1, label="Max new tokens"),
-                gr.Slider(minimum=0.1, maximum=4.0, value=chatbot.config["temperature"], step=0.1, label="Temperature"),
-                gr.Slider(
-                    minimum=0.1,
-                    maximum=1.0,
-                    value=chatbot.config["top_p"],
-                    step=0.05,
-                    label="Top-p (nucleus sampling)",
-                ),
-            ],
-        )
-    with gr.Tab("Settings"):
-        with gr.Group():
-            gr.Markdown("### Model Settings")
-            model_name_input = gr.Textbox(value=chatbot.config["model_name"], label="Model name")
-            gpu_layers_input = gr.Slider(minimum=0, maximum=8, value=chatbot.config["gpu_layers"], step=1, label="GPU layers")
-            update_model_button = gr.Button("Update model")
-            update_model_button.click(update_model_config, inputs=[model_name_input, gpu_layers_input], outputs="text")
-        with gr.Group():
-            gr.Markdown("### System Message Settings")
-            system_message_input = gr.Textbox(value=chatbot.config["system_message"], label="System message")
-            update_system_message_button = gr.Button("Update system message")
-            update_system_message_button.click(update_system_message, inputs=[system_message_input], outputs="text")
 if __name__ == "__main__":
-    demo.launch()

+import streamlit as st
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import os
 import json
 import logging
 from threading import Lock
+import torch
+# Constants with optimized values for Mixtral
+DEFAULT_MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+MAX_INPUT_TOKENS = 24576  # 24K tokens for input (leaving room for output)
+MAX_NEW_TOKENS = 8192    # 8K tokens for generation
+DEFAULT_CONTEXT_LENGTH = 16384  # 16K default context
+CONFIG_FILE = "chatbot_config.json"
+CACHE_DIR = "model_cache"
 class EnhancedChatbot:
     def __init__(self):
         self.model = None
+        self.tokenizer = None
         self.model_lock = Lock()
+        # Ensure cache directory exists
+        os.makedirs(CACHE_DIR, exist_ok=True)
+        # Initialize configuration with higher limits
+        self.config = self.load_config()
+        # Initialize model and tokenizer
+        try:
+            self.load_model()
+        except Exception as e:
+            st.error(f"Error loading model: {str(e)}")
+            logging.error(f"Error loading model: {str(e)}")
     def load_config(self):
+        """Load or create configuration file with optimized settings"""
+        default_config = {
+            "model_name": DEFAULT_MODEL_NAME,
+            "max_new_tokens": MAX_NEW_TOKENS,
+            "context_length": DEFAULT_CONTEXT_LENGTH,
             "temperature": 0.7,
             "top_p": 0.95,
+            "top_k": 50,
+            "repetition_penalty": 1.1,
+            "system_message": "You are a helpful AI assistant with high context understanding.",
+            "gpu_layers": "auto"
         }
+        try:
+            if os.path.exists(CONFIG_FILE):
+                with open(CONFIG_FILE, 'r') as f:
+                    config = json.load(f)
+                    # Update with any missing keys from default_config
+                    for key, value in default_config.items():
+                        if key not in config:
+                            config[key] = value
+            else:
+                config = default_config
+                self.save_config(config)
+            return config
+        except Exception as e:
+            logging.error(f"Error loading config: {str(e)}")
+            return default_config
     def load_model(self):
+        """Load the model and tokenizer with optimized settings"""
         try:
+            # Clear CUDA cache if using GPU
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            # Load tokenizer first
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.config["model_name"],
+                cache_dir=CACHE_DIR,
+                model_max_length=self.config["context_length"],
+                padding_side="left"
+            )
+            # Load model with optimized settings
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.config["model_name"],
+                torch_dtype=torch.bfloat16,  # Use bfloat16 for better performance
+                low_cpu_mem_usage=True,
+                cache_dir=CACHE_DIR,
+                device_map="auto",
+                max_memory={0: "24GiB"},  # Adjust based on your GPU
+                trust_remote_code=True
             )
+            logging.info(f"Model {self.config['model_name']} loaded successfully")
         except Exception as e:
             logging.error(f"Error loading model: {str(e)}")
             raise
+    def generate_response(self, message, history):
+        """Generate response with high token limit"""
+        try:
+            with self.model_lock:
+                # Prepare conversation history
+                full_prompt = self.prepare_prompt(message, history)
+                # Tokenize with proper handling of long sequences
+                inputs = self.tokenizer(full_prompt,
+                                      return_tensors="pt",
+                                      truncation=True,
+                                      max_length=MAX_INPUT_TOKENS)
+                # Move to GPU if available
+                inputs = inputs.to(self.model.device)
+                # Generate with optimized parameters
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=self.config["max_new_tokens"],
+                    temperature=self.config["temperature"],
+                    top_p=self.config["top_p"],
+                    top_k=self.config["top_k"],
+                    repetition_penalty=self.config["repetition_penalty"],
+                    do_sample=True,
+                    pad_token_id=self.tokenizer.eos_token_id
+                )
+                # Decode response
+                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+                return response.strip()
+        except Exception as e:
+            logging.error(f"Error generating response: {str(e)}")
+            return "I apologize, but I encountered an error. Please try again."
+    def prepare_prompt(self, message, history):
+        """Prepare prompt with history management"""
+        system_msg = self.config["system_message"]
+        prompt = f"{system_msg}\n\n"
+        # Add history with token counting
+        total_tokens = 0
+        for msg in history:
+            tokens = len(self.tokenizer.encode(msg["content"]))
+            if total_tokens + tokens < MAX_INPUT_TOKENS:
+                prompt += f"{msg['role']}: {msg['content']}\n"
+                total_tokens += tokens
+            else:
+                break
+        prompt += f"user: {message}\nassistant:"
+        return prompt
+# Streamlit UI with advanced settings
+def main():
+    st.title("Enhanced AI Chatbot (High Context)")
+    try:
+        chatbot = EnhancedChatbot()
+        # Advanced settings in sidebar
+        with st.sidebar:
+            st.subheader("Model Settings")
+            # Context length slider
+            new_context = st.slider(
+                "Context Length (tokens)",
+                min_value=1024,
+                max_value=32768,
+                value=chatbot.config["context_length"],
+                step=1024
+            )
+            # Generation settings
+            new_max_tokens = st.slider(
+                "Max New Tokens",
+                min_value=1024,
+                max_value=MAX_NEW_TOKENS,
+                value=chatbot.config["max_new_tokens"],
+                step=1024
+            )
+            temperature = st.slider(
+                "Temperature",
+                min_value=0.1,
+                max_value=2.0,
+                value=chatbot.config["temperature"]
+            )
+            # Update settings button
+            if st.button("Update Settings"):
+                chatbot.config.update({
+                    "context_length": new_context,
+                    "max_new_tokens": new_max_tokens,
+                    "temperature": temperature
+                })
+                chatbot.save_config(chatbot.config)
+                st.experimental_rerun()
+        # Chat interface
+        if "messages" not in st.session_state:
+            st.session_state.messages = []
+        # Display chat messages
+        for message in st.session_state.messages:
+            with st.chat_message(message["role"]):
+                st.markdown(message["content"])
+        # Chat input
+        if prompt := st.chat_input("What would you like to know?"):
+            st.session_state.messages.append({"role": "user", "content": prompt})
+            with st.chat_message("user"):
+                st.markdown(prompt)
+            with st.chat_message("assistant"):
+                with st.spinner("Generating response..."):
+                    response = chatbot.generate_response(prompt, st.session_state.messages)
+                    st.markdown(response)
+                    st.session_state.messages.append({"role": "assistant", "content": response})
+    except Exception as e:
+        st.error(f"Application Error: {str(e)}")
+        logging.error(f"Application Error: {str(e)}")
 if __name__ == "__main__":
+    main()