Heit39 committed on
Commit
932c085
·
verified ·
1 Parent(s): 4d2144b

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +45 -19
  2. requirements.txt +5 -1
app.py CHANGED
@@ -2,40 +2,66 @@ import gradio as gr
2
  from huggingface_hub import InferenceClient
3
 
4
  """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
  """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  def respond(
11
- message,
12
  history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
  ):
 
18
  messages = [{"role": "system", "content": system_message}]
19
-
20
  for val in history:
21
  if val[0]:
22
  messages.append({"role": "user", "content": val[0]})
23
  if val[1]:
24
  messages.append({"role": "assistant", "content": val[1]})
25
-
26
  messages.append({"role": "user", "content": message})
27
-
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
  messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
 
 
 
 
 
 
 
 
 
 
38
 
 
 
 
39
  response += token
40
  yield response
41
 
 
2
  from huggingface_hub import InferenceClient
3
 
4
  """
5
+ Inference code copied from a Colab notebook.
6
  """
 
7
 
8
+ from transformers import AutoTokenizer , AutoModelForSeq2SeqLM , TextIteratorStreamer
9
+ from threading import Thread
10
+
11
+ # Load model and tokenizer globally to avoid reloading for every request
12
+ base_model = "Helsinki-NLP/europarl"
13
+ model_path = "Mat17892/t5small_enfr_opus"
14
+
15
+ # Load tokenizer
16
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, legacy=False)
17
+
18
+ # Load the base seq2seq model (e.g., T5)
19
+ base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model)
20
+
21
+ # Load LoRA adapter
22
+ from peft import PeftModel
23
+ model = PeftModel.from_pretrained(base_model, model_path)
24
 
25
  def respond(
26
+ message: str,
27
  history: list[tuple[str, str]],
28
+ system_message: str,
29
+ max_tokens: int,
30
+ temperature: float,
31
+ top_p: float,
32
  ):
33
+ # Build the chat message list: system message, prior history, then the new user message
34
  messages = [{"role": "system", "content": system_message}]
 
35
  for val in history:
36
  if val[0]:
37
  messages.append({"role": "user", "content": val[0]})
38
  if val[1]:
39
  messages.append({"role": "assistant", "content": val[1]})
 
40
  messages.append({"role": "user", "content": message})
41
+
42
+ # Tokenize the messages
43
+ inputs = tokenizer.apply_chat_template(
 
44
  messages,
45
+ tokenize = True,
46
+ add_generation_prompt = True, # Must add for generation
47
+ return_tensors = "pt",
48
+ )
49
+ # Run generation in a background thread so decoded text can be streamed as it is produced
50
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
51
+ generation_kwargs = {
52
+ "input_ids": inputs,
53
+ "max_new_tokens": max_tokens,
54
+ "temperature": temperature,
55
+ "top_p": top_p,
56
+ "do_sample": True,
57
+ "streamer": streamer,
58
+ }
59
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
60
+ thread.start()
61
 
62
+ # Yield responses as they are generated
63
+ response = ""
64
+ for token in streamer:
65
  response += token
66
  yield response
67
 
requirements.txt CHANGED
@@ -1 +1,5 @@
1
- huggingface_hub==0.25.2
 
 
 
 
 
1
+ huggingface_hub==0.25.2
2
+
3
+ transformers
4
+ accelerate
5
+ peft