YALCINKAYA committed
Commit 9143358 · verified · 1 Parent(s): cbe8e48

Update app.py

Files changed (1)
  1. app.py +19 -7
app.py CHANGED
@@ -28,8 +28,19 @@ def get_model_and_tokenizer(model_id):
         tokenizer.pad_token = tokenizer.eos_token
 
         print(f"Loading model for model_id: {model_id} on {device}")
-        model = AutoModelForCausalLM.from_pretrained(model_id).to(device) # Move model to GPU
-        model.config.use_cache = False
+
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
+        )
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id, quantization_config=bnb_config, device_map="auto"
+        )
+
+        model.config.use_cache=False
+        model.config.pretraining_tp=1
+
+
     except Exception as e:
         print(f"Error loading model: {e}")
         raise e # Raise the error to be caught in the POST request
@@ -40,9 +51,8 @@ def generate_response(user_input, model_id):
     # Ensure model and tokenizer are loaded
     get_model_and_tokenizer(model_id)
 
-    prompt = user_input
-    inputs = tokenizer([prompt], return_tensors="pt").to(device) # Move inputs to GPU
-
+    prompt = user_input
+
     generation_config = GenerationConfig(
         penalty_alpha=0.6,
         do_sample=True,
@@ -55,8 +65,10 @@ def generate_response(user_input, model_id):
         stop_sequences=["User:", "Assistant:", "\n"],
     )
 
-    outputs = model.generate(**inputs, generation_config=generation_config)
-    response = tokenizer.decode(outputs[:, inputs['input_ids'].shape[-1]:][0], skip_special_tokens=True)
+
+    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
+    response = (tokenizer.decode(outputs[0], skip_special_tokens=True))
+
     cleaned_response = response.replace("User:", "").replace("Assistant:", "").strip()
     return cleaned_response.strip().split("\n")[0] # Keep only the first line of response
 
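
For reference, a minimal, self-contained sketch of the 4-bit NF4 loading path introduced in the first hunk. The model id below is a placeholder (not from the commit), torch.float16 is used in place of the string "float16", and the bitsandbytes and accelerate packages are assumed to be installed:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "your-org/your-model"  # placeholder, not from the commit

# Quantize weights to 4-bit NormalFloat, run compute in fp16, and double-quantize
# the quantization constants, mirroring the config added in this commit.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # equivalent to the string "float16"
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# device_map="auto" lets accelerate place the quantized layers on the available
# device(s), replacing the earlier explicit .to(device) call.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Disabling the KV cache and setting pretraining_tp=1, as the commit does, is a common setup when a model is being prepared for fine-tuning rather than pure inference.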
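
Note that the last hunk tokenizes the prompt and decodes outputs, but no model.generate(...) call appears among the updated lines, so outputs has to come from elsewhere (or the removed call has to be restored). Below is a minimal sketch of how the tokenize, generate, and decode steps fit together, assuming the model and tokenizer loaded above; max_new_tokens and pad_token_id are illustrative additions not taken from the commit, and the prompt-skipping slice follows the pre-change decode:

from transformers import GenerationConfig

# Sampling setup mirroring the visible parts of the commit's GenerationConfig;
# max_new_tokens and pad_token_id are assumptions added for a runnable example.
generation_config = GenerationConfig(
    penalty_alpha=0.6,
    do_sample=True,
    max_new_tokens=64,
    pad_token_id=tokenizer.eos_token_id,
)

prompt = "User: Hello, how are you?\nAssistant:"  # placeholder user input

# Tokenize onto the model's device instead of hard-coding 'cuda'.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# The generate() call is not part of the updated hunk; it is shown here only so
# that `outputs` exists for the decode step.
outputs = model.generate(**inputs, generation_config=generation_config)

# Decode only the newly generated tokens, skipping the echoed prompt.
response = tokenizer.decode(
    outputs[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True
)

cleaned_response = response.replace("User:", "").replace("Assistant:", "").strip()
print(cleaned_response.split("\n")[0])  # keep only the first line of the response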