Gabriele Morello committed
Commit a001687 · 1 Parent(s): 83f1462

Add inference and req

Files changed (1): app.py (+29 -4)
app.py CHANGED
@@ -2,13 +2,38 @@ import gradio as gr
 import spaces
 import torch

+from peft import PeftConfig, PeftModel
+from transformers import LlamaForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+config = PeftConfig.from_pretrained("GGmorello/FLAMES")
+model = LlamaForCausalLM.from_pretrained(
+    config.base_model_name_or_path,
+    quantization_config=BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    ),
+)
+model = PeftModel.from_pretrained(model, "GGmorello/FLAMES")
+
+
+MAX_SEQ_LEN = 4096
+tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
+model.config.pad_token = tokenizer.pad_token = tokenizer.unk_token
+
+
+
 zero = torch.Tensor([0]).cuda()
 print(zero.device)  # <-- 'cpu' 🤔

 @spaces.GPU
-def greet(n):
-    print(zero.device)  # <-- 'cuda:0' 🤗
-    return f"Hello {zero + n} Tensor"
+def predict(text):
+    input_ids = tokenizer(text, return_tensors='pt')["input_ids"]
+    generated_ids = model.generate(input_ids, max_new_tokens=256)
+    filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens=True)[0]
+    return filling
+

-demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
+demo = gr.Interface(fn=predict, inputs=gr.Text(), outputs=gr.Text())
 demo.launch()
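
One caveat worth noting, not part of this commit: on ZeroGPU Spaces the model weights are only moved to CUDA inside the @spaces.GPU-decorated call, while the tokenizer output stays on CPU. A minimal sketch of a predict variant that moves the inputs onto the model's device before generating (the .to(model.device) call is an assumption, not in the committed code):

# Sketch only; .to(model.device) is added here, not part of the commit.
@spaces.GPU
def predict(text):
    inputs = tokenizer(text, return_tensors="pt")
    # follow the model onto whichever device it was placed on
    input_ids = inputs["input_ids"].to(model.device)
    generated_ids = model.generate(input_ids, max_new_tokens=256)
    # decode only the newly generated tokens, skipping the prompt
    return tokenizer.batch_decode(
        generated_ids[:, input_ids.shape[1]:], skip_special_tokens=True
    )[0]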
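
For reference, once the Space is running, the new predict endpoint can be queried programmatically with the gradio_client package. A minimal sketch, assuming the Space id matches the adapter repo id "GGmorello/FLAMES" (an assumption; the actual Space id is not shown in this commit):

from gradio_client import Client

# connect to the running Space (hypothetical Space id)
client = Client("GGmorello/FLAMES")
result = client.predict(
    "def fibonacci(n):",   # example prompt for the code model
    api_name="/predict",   # default endpoint name for a single gr.Interface
)
print(result)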