Update app.py

app.py CHANGED
@@ -2,6 +2,20 @@ import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import gradio as gr
+import os
+
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
+torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
+torch.backends.cudnn.allow_tf32 = False
+torch.backends.cudnn.deterministic = False
+torch.backends.cudnn.benchmark = False
+#torch.backends.cuda.preferred_blas_library="cublas"
+# torch.backends.cuda.preferred_linalg_library="cusolver"
+
+torch.set_float32_matmul_precision("highest")
+os.putenv("HF_HUB_ENABLE_HF_TRANSFER","1")
+os.environ["SAFETENSORS_FAST_GPU"] = "1"
 
 model_name = "Qwen/Qwen2.5-Coder-14B-Instruct"
 
@@ -11,7 +25,7 @@ model = AutoModelForCausalLM.from_pretrained(
     torch_dtype="auto",
     # device_map="auto",
     trust_remote_code=True # Add this line for Qwen models
-).to('cuda')
+).to('cuda', torch.bfloat16)
 
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) # Add this line for Qwen models
 
@@ -32,9 +46,8 @@ def generate_code(prompt):
         **model_inputs,
         max_new_tokens = 1024,
         min_new_tokens = 256,
-
+        low_memory = False,
         do_sample = True,
-        #token_healing = True,
         #guidance_scale = 3.8,
     )
     generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
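For context on the new torch.backends block: these flags disable TF32 and the reduced-precision bf16/fp16 reduction paths and turn off cuDNN autotuning, trading some throughput for reproducible float32 math. Note that torch.set_float32_matmul_precision("highest") drives the same matmul setting as allow_tf32, so the commit effectively flips that switch twice. A minimal sketch of the coupling, assuming PyTorch 2.x with CUDA:

```python
import torch

# "highest" = float32 matmuls really run in float32 (TF32 off); this is
# the same switch the diff above also sets via allow_tf32 = False.
torch.set_float32_matmul_precision("highest")
print(torch.backends.cuda.matmul.allow_tf32)   # -> False
print(torch.get_float32_matmul_precision())    # -> "highest"

# "high" would re-enable TF32 for float32 matmuls on Ampere-class GPUs.
torch.set_float32_matmul_precision("high")
print(torch.backends.cuda.matmul.allow_tf32)   # -> True
```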
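One caveat on the environment lines: os.putenv() changes the process environment at the C level but does not update os.environ, and huggingface_hub reads HF_HUB_ENABLE_HF_TRANSFER out of os.environ when it is first imported, so the putenv() form can silently be a no-op. A sketch of the safer pattern (assignment to os.environ calls putenv() internally; the hf_transfer package must also be installed for that flag to do anything):

```python
import os

# Set the flags *before* importing huggingface_hub/transformers, because
# huggingface_hub snapshots its env-driven constants at import time.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # os.environ assignment also calls putenv()
os.environ["SAFETENSORS_FAST_GPU"] = "1"

from transformers import AutoModelForCausalLM, AutoTokenizer  # import after the flags
```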
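On the dtype change: torch_dtype="auto" first materializes the checkpoint in its stored dtype, and the new .to('cuda', torch.bfloat16) then casts every parameter again. Since the Qwen2.5 checkpoints ship in bfloat16, loading directly in that dtype should be equivalent and skips the second cast; a sketch using the standard transformers API:

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-Coder-14B-Instruct",
    torch_dtype=torch.bfloat16,  # load weights as bf16 in one step
    trust_remote_code=True,
).to("cuda")
```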