Update app.py

app.py CHANGED
@@ -17,12 +17,11 @@ except Exception as e:
 MODEL_FILE = "model_links.txt"
 
 def load_model_links():
     """Load model links from file"""
     if not os.path.exists(MODEL_FILE):
         # Create default file with some example models
         with open(MODEL_FILE, "w") as f:
             f.write("meta-llama/Llama-2-7b-chat-hf\n")
-            # f.write("tiiuae/falcon-7b-instruct\n")
 
     with open(MODEL_FILE, "r") as f:
         return [line.strip() for line in f.readlines() if line.strip()]
@@ -32,7 +31,6 @@ class ModelManager:
         self.current_model = None
         self.current_tokenizer = None
         self.current_model_name = None
-        #self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
     def load_model(self, model_name):
@@ -48,34 +46,17 @@ class ModelManager:
                 model_name,
                 load_in_4bit=False,
                 torch_dtype=torch.bfloat16,
-                device_map="
-            )
+                device_map={"": self.device}  # Changed this line
+            ).to(self.device)  # Added explicit device movement
             self.current_model_name = model_name
-            return f"Successfully loaded model: {model_name}"
+            return f"Successfully loaded model: {model_name} on {self.device}"
         except Exception as e:
             return f"Error loading model: {str(e)}"
 
-
-
-
-
-default_system_message = """You are a helpful AI assistant. You must ALWAYS return your response in valid JSON format.
-Each response should be formatted as follows:
-
-{
-    "response": {
-        "main_answer": "Your primary response here",
-        "additional_details": "Any additional information or context",
-        "confidence": 0.0 to 1.0,
-        "tags": ["relevant", "tags", "here"]
-    },
-    "metadata": {
-        "response_type": "type of response",
-        "source": "basis of response if applicable"
-    }
-}
-
-Ensure EVERY response strictly follows this JSON structure."""
+    def generate(self, prompt):
+        """Helper method for generation"""
+        inputs = self.current_tokenizer(prompt, return_tensors="pt").to(self.device)
+        return inputs
 
 @spaces.GPU
 def generate_response(model_name, system_instruction, user_input):
@@ -93,20 +74,17 @@ Remember to ALWAYS format your response as valid JSON.
 ### Input:
 {user_input}
 ### Response:
 {{"""
 
     try:
-        #
-        inputs = model_manager.
-
-        inputs = {k: v.to(model_manager.device) for k, v in inputs.items()}
-
-        # Generation configuration optimized for JSON output
+        # Get tokenized inputs using helper method
+        inputs = model_manager.generate(prompt)
+
         meta_config = {
             "do_sample": False,
             "temperature": 0.0,
             "max_new_tokens": 512,
-            "repetition_penalty": 1.
+            "repetition_penalty": 1.1,
             "use_cache": True,
             "pad_token_id": model_manager.current_tokenizer.eos_token_id,
             "eos_token_id": model_manager.current_tokenizer.eos_token_id
@@ -116,20 +94,20 @@ Remember to ALWAYS format your response as valid JSON.
         # Generate response
         with torch.no_grad():
             outputs = model_manager.current_model.generate(
-                input_ids=inputs['input_ids'],
-                attention_mask=inputs['attention_mask'],
+                **inputs,
                 generation_config=generation_config
-            )
-        decoded_output = model_manager.current_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+            ).to(model_manager.device)  # Ensure outputs are on correct device
+
+        decoded_output = model_manager.current_tokenizer.batch_decode(
+            outputs.to(model_manager.device),
+            skip_special_tokens=True
+        )[0]
+
         assistant_response = decoded_output.split("### Response:")[-1].strip()
 
-        # Clean up and validate JSON
         try:
-            # Find the last complete JSON object
             last_brace = assistant_response.rindex('}')
             assistant_response = assistant_response[:last_brace + 1]
-
-            # Parse and re-format JSON
             json_response = json.loads(assistant_response)
             return json.dumps(json_response, indent=2)
         except (json.JSONDecodeError, ValueError):
@@ -141,9 +119,13 @@ Remember to ALWAYS format your response as valid JSON.
     except Exception as e:
         return json.dumps({
             "error": f"Error generating response: {str(e)}",
-            "details": "An unexpected error occurred during generation"
+            "details": "An unexpected error occurred during generation",
+            "device_info": f"Model device: {model_manager.device}, Input device: {inputs.input_ids.device if inputs else 'unknown'}"
         }, indent=2)
 
+
+
+
 # Gradio interface setup
 with gr.Blocks() as demo:
     gr.Markdown("# Chat Interface with Model Selection (JSON Output)")
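What the commit amounts to is device consistency: pin the model to a single device at load time, move the tokenized inputs to that same device, and generate under torch.no_grad(). A minimal, self-contained sketch of that pattern, assuming transformers plus accelerate (needed for the device_map argument); "gpt2" is only a lightweight stand-in for the models listed in model_links.txt:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_name = "gpt2"  # stand-in; the Space reads real names from model_links.txt

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map={"": device},  # pin every module to the one device, as the commit does
)

# BatchEncoding.to() moves input_ids and attention_mask together.
prompt = "### Instruction:\nReply in JSON.\n### Response:\n{"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Greedy decoding, mirroring meta_config; temperature is omitted since greedy decoding ignores it.
generation_config = GenerationConfig(
    do_sample=False,
    max_new_tokens=64,
    repetition_penalty=1.1,
    use_cache=True,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

with torch.no_grad():
    outputs = model.generate(**inputs, generation_config=generation_config)

print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

With the model and inputs pinned this way, the extra .to(model_manager.device) calls on the generate output in the commit are belt-and-braces: generated token ids already come back on the model's device.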
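The JSON post-processing is unchanged in substance on both sides of the diff: cut everything after the last closing brace, then let json.loads decide whether the result is valid. That step can be exercised on its own; in this sketch, extract_json is a hypothetical helper, and the fallback error object is an assumption, since the hunk ends before the body of the inner except branch:

import json

def extract_json(assistant_response: str) -> str:
    """Trim trailing text after the last '}', then validate and pretty-print."""
    try:
        last_brace = assistant_response.rindex('}')  # raises ValueError if no brace at all
        candidate = assistant_response[:last_brace + 1]
        return json.dumps(json.loads(candidate), indent=2)  # json.loads does the real validation
    except (json.JSONDecodeError, ValueError):
        return json.dumps({"error": "Model did not return valid JSON"}, indent=2)  # assumed fallback

print(extract_json('{"response": {"main_answer": "hi"}} extra tokens'))

Note the trimming only guarantees the slice ends at a brace; a response that opens more braces than it closes still fails cleanly through json.JSONDecodeError.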