EPark25 committed on
Commit
db2ba20
·
1 Parent(s): 61eea9e
app.py CHANGED
@@ -1,19 +1,11 @@
1
  import gradio as gr
2
- from peft import PeftModel
3
- from transformers import TextStreamer
4
-
5
- # Load model directly
6
- from transformers import AutoModelForCausalLM, AutoTokenizer
7
 
8
  """
9
  For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
10
- Info of how to use a model after training on hf https://huggingface.co/docs/trl/main/en/use_model
11
  """
12
 
13
- peft_model_id = "samlama111/lora_model"
14
- model = AutoModelForCausalLM.from_pretrained(peft_model_id)
15
-
16
- tokenizer = AutoTokenizer.from_pretrained("samlama111/lora_model")
17
 
18
 
19
  def respond(
@@ -36,24 +28,16 @@ def respond(
36
 
37
  response = ""
38
 
39
- inputs = tokenizer.apply_chat_template(
40
- messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
41
- )
42
-
43
- text_streamer = TextStreamer(tokenizer)
44
  # TODO: Doesn't stream ATM
45
- for message in model.generate(
46
- input_ids=inputs, streamer=text_streamer, max_new_tokens=1024, use_cache=True
 
 
 
 
47
  ):
48
- # Decode the tensor to a string
49
- decoded_message = tokenizer.decode(message, skip_special_tokens=True)
50
-
51
- # Manually getting the response
52
- response = decoded_message.split("assistant")[
53
- -1
54
- ].strip() # Extract only the assistant's response
55
- print(response)
56
-
57
  yield response
58
 
59
 
 
1
  import gradio as gr
2
+ from huggingface_hub import InferenceClient
 
 
 
 
3
 
4
  """
5
  For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 
6
  """
7
 
8
+ client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
 
 
 
9
 
10
 
11
  def respond(
 
28
 
29
  response = ""
30
 
 
 
 
 
 
31
  # TODO: Doesn't stream ATM
32
+ for message in client.chat_completion(
33
+ messages,
34
+ max_tokens=max_tokens,
35
+ stream=True,
36
+ temperature=temperature,
37
+ top_p=top_p,
38
  ):
39
+ token = message.choices[0].delta.content
40
+ response += token
 
 
 
 
 
 
 
41
  yield response
42
 
43
 
bitsandbytes-0.44.1.dev0-py3-none-manylinux_2_24_x86_64.whl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:18b28598f8ef7a07c1784b0a52480b8c64e15918ac752964ac0ca2085953b78c
3
- size 1811514
 
 
 
 
requirements.txt CHANGED
@@ -1,6 +1,3 @@
1
- # huggingface_hub==0.25.2
2
  huggingface_hub
3
- transformers>=4.45.1
4
- torch
5
- peft
6
- /tmp/bitsandbytes-0.44.1.dev0-py3-none-manylinux_2_24_x86_64.whl
 
 
1
  huggingface_hub
2
+ unsloth
3
+ gradio