samlam111 committed
Commit a2e195e · Parent(s): eae1279

Fixed model not working


- Not an ideal solution, streaming doesn't work

Files changed (1)
  1. app.py +28 -10
app.py CHANGED
@@ -1,5 +1,9 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
+from unsloth import FastLanguageModel
+from unsloth.chat_templates import get_chat_template
+from transformers import TextStreamer
+
 
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
@@ -7,8 +11,20 @@ For more information on `huggingface_hub` Inference API support, please check th
 
 model_name_or_path = "samlama111/lora_model"
 
-client = InferenceClient(model_name_or_path)
+# client = InferenceClient(model_name_or_path)
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = model_name_or_path,
+    max_seq_length = 8192,
+    load_in_4bit = True,
+    # token = "hf_...", # No need since our model is public
+)
 
+tokenizer = get_chat_template(
+    tokenizer,
+    chat_template = "llama-3.1",
+    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
+)
+FastLanguageModel.for_inference(model) # Enable native 2x faster inference
 
 def respond(
     message,
@@ -29,17 +45,19 @@ def respond(
     messages.append({"role": "user", "content": message})
 
     response = ""
+
+    inputs = tokenizer.apply_chat_template(messages, tokenize = True, add_generation_prompt = True, return_tensors = "pt")
 
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
+    text_streamer = TextStreamer(tokenizer)
+    # TODO: Doesn't stream ATM
+    for message in model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 1024, use_cache = True):
+        # Decode the tensor to a string
+        decoded_message = tokenizer.decode(message, skip_special_tokens=True)
+
+        # Manually getting the response
+        response = decoded_message.split("assistant")[-1].strip() # Extract only the assistant's response
+        print(response)
 
-        response += token
         yield response
 
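The commit note and the `# TODO: Doesn't stream ATM` comment point at the same gap: `model.generate` only returns once generation has finished, so the loop above yields the completed text rather than partial tokens. Below is a minimal sketch of how token-by-token streaming could be restored, assuming the same `model` and `tokenizer` objects loaded in `app.py`. It uses `transformers.TextIteratorStreamer` (a standard alternative to `TextStreamer`) with generation running in a background thread; the `respond_streaming` name and its arguments are illustrative and not part of the committed code.

```python
# Sketch only: assumes the unsloth `model` and `tokenizer` already loaded in app.py.
from threading import Thread

from transformers import TextIteratorStreamer


def respond_streaming(messages, model, tokenizer, max_new_tokens=1024):
    # Build the prompt the same way the committed code does.
    inputs = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)  # move to the model's device in case it lives on GPU

    # skip_prompt=True drops the echoed prompt, so no manual split("assistant") is needed;
    # skip_special_tokens is forwarded to tokenizer.decode().
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks until completion, so run it in a background thread
    # and consume the streamer from the main (Gradio) thread.
    thread = Thread(
        target=model.generate,
        kwargs=dict(
            input_ids=inputs,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            use_cache=True,
        ),
    )
    thread.start()

    response = ""
    for new_text in streamer:
        response += new_text
        yield response  # Gradio re-renders the chat message with the partial text

    thread.join()
```

Whether this runs as-is inside the Space depends on the hardware and on unsloth's inference path accepting a custom streamer; it is one common pattern for streaming with `transformers`, not the only possible fix.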