pyakhurel committed
Commit 9276eae · 1 Parent(s): fcf1e60

Update app.py

Files changed (1):
  1. app.py  +38 -3
app.py CHANGED
@@ -1,10 +1,39 @@
 from huggingface_hub import InferenceClient
 import gradio as gr
+import torch
+from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import transformers
 
-client = InferenceClient(
-    "pyakhurel/mistral-7b-mj-finetuned"
+adapters_name = "pyakhurel/mistral-7b-mj-finetuned"
+model_name = "bn22/Mistral-7B-Instruct-v0.1-sharded"
+device = "cuda"
+
+bnb_config = transformers.BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    load_in_4bit=True,
+    torch_dtype=torch.bfloat16,
+    quantization_config=bnb_config,
+    device_map='auto'
 )
 
+model = PeftModel.from_pretrained(model, adapters_name)
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer.bos_token_id = 1
+
+stop_token_ids = [0]
+
+
+
+
 
 def format_prompt(message, history):
     prompt = "<s>"
@@ -33,7 +62,13 @@ def generate(
 
     formatted_prompt = format_prompt(prompt, history)
 
-    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+
+    encoded = tokenizer(formatted_prompt, return_tensors="pt", add_special_tokens=False)
+    model_input = encoded
+    model.to(device)
+    generated_ids = model.generate(**model_input, max_new_tokens=1048, do_sample=True)
+    stream = tokenizer.batch_decode(generated_ids)
+
    output = ""
 
    for response in stream:
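
Note on the new loading block (a reviewer sketch, not part of the commit): the BitsAndBytesConfig already requests NF4 4-bit loading, so also passing load_in_4bit=True to from_pretrained is redundant, and recent transformers releases reject the duplicate flag when quantization_config is given. A minimal equivalent, assuming the same model_name and adapters_name as above:

import torch
import transformers
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,                      # quantize weights to 4-bit on load
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_quant_type="nf4",              # NormalFloat4, the QLoRA default
    bnb_4bit_compute_dtype=torch.bfloat16,  # run matmuls in bf16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # carries the 4-bit settings; no separate load_in_4bit kwarg
    device_map="auto",               # dispatch layers to the available GPU(s)
)
model = PeftModel.from_pretrained(model, adapters_name)  # attach the fine-tuned PEFT adapter
tokenizer = AutoTokenizer.from_pretrained(model_name)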
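
One device-placement detail in the generation path (a sketch, assuming the Space runs on a single CUDA GPU): with device_map='auto' the quantized model is already dispatched to the GPU, and newer transformers releases refuse .to() on bitsandbytes-quantized models, while the tokenizer output starts on the CPU. Moving the encoded inputs to the model's device instead avoids a device mismatch in model.generate:

encoded = tokenizer(formatted_prompt, return_tensors="pt", add_special_tokens=False)
model_input = encoded.to(model.device)   # move input_ids / attention_mask to the GPU
generated_ids = model.generate(**model_input, max_new_tokens=1048, do_sample=True)
stream = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)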
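
The decoded result is still named stream, but tokenizer.batch_decode returns finished strings in one shot, so the downstream "for response in stream:" loop now iterates over complete decoded sequences rather than incremental tokens as it did with InferenceClient. If token-by-token streaming into the Gradio UI is still wanted, transformers' TextIteratorStreamer is one option; a sketch, assuming generate() remains a Python generator that yields partial output:

from threading import Thread
from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(**model_input, max_new_tokens=1048, do_sample=True, streamer=streamer)
Thread(target=model.generate, kwargs=generation_kwargs).start()  # run generation in the background

output = ""
for new_text in streamer:   # yields decoded text as tokens arrive
    output += new_text
    yield output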