Cylanoid committed
Commit 6f2b1d7 · verified · 1 Parent(s): 0e44b56

Update app.py

Files changed (1)
app.py +14 -4
app.py CHANGED
@@ -1,5 +1,5 @@
-# updated_app.py
-# Enhanced Gradio app for Llama 4 Maverick healthcare fraud detection (text-only)
+# app.py
+# Enhanced Gradio app for Llama 4 Maverick healthcare fraud detection (text-only with CPU offloading)
 
 import gradio as gr
 from transformers import AutoTokenizer, Llama4ForConditionalGeneration
@@ -45,12 +45,22 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 if tokenizer.pad_token is None:
     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
 
-# Load model with 8-bit quantization to fit in 80 GB VRAM
+# Custom device map to offload some layers to CPU
+device_map = {
+    "model.embed_tokens": 0,
+    "model.layers.0-15": 0,  # Keep first 16 layers on GPU
+    "model.layers.16-31": "cpu",  # Offload remaining layers to CPU
+    "model.norm": 0,
+    "lm_head": 0
+}
+
+# Load model with 8-bit quantization and CPU offloading
 model = Llama4ForConditionalGeneration.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.bfloat16,
-    device_map="auto",
+    device_map=device_map,
     quantization_config={"load_in_8bit": True},
+    llm_int8_enable_fp32_cpu_offload=True,
     attn_implementation="flex_attention"
 )
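
Two details in the new loading code are worth flagging. Accelerate-style device maps take one entry per submodule, so range keys like "model.layers.0-15" match no actual module, and llm_int8_enable_fp32_cpu_offload is a BitsAndBytesConfig field, not a from_pretrained() keyword. The sketch below shows an equivalent setup under those constraints; the MODEL_ID string, the 32-layer count, and the module paths are assumptions carried over from the diff and should be verified against model.named_modules() for the real checkpoint.

import torch
from transformers import BitsAndBytesConfig, Llama4ForConditionalGeneration

MODEL_ID = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"  # assumed id; app.py defines its own
NUM_LAYERS = 32  # assumed layer count, matching the 0-31 split in the diff

# One entry per module: accelerate does not expand range keys such as
# "model.layers.0-15", so place each layer explicitly.
device_map = {"model.embed_tokens": 0, "model.norm": 0, "lm_head": 0}
for i in range(NUM_LAYERS):
    device_map[f"model.layers.{i}"] = 0 if i < 16 else "cpu"

# llm_int8_enable_fp32_cpu_offload belongs on BitsAndBytesConfig rather than
# being passed to from_pretrained() directly.
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

model = Llama4ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    quantization_config=quant_config,
    attn_implementation="flex_attention",
)

With this split, the first 16 layers stay int8 on GPU 0 while the offloaded layers run in fp32 on the CPU, trading generation speed for a smaller VRAM footprint.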