Kvikontent committed on
Commit
13ea59e
·
verified ·
1 Parent(s): 05a9a16

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -46
app.py CHANGED
@@ -1,65 +1,136 @@
 
 
 
 
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
  import spaces
 
 
 
 
 
 
4
 
 
 
 
5
  """
6
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 
 
 
 
7
  """
8
client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")


@spaces.GPU(duration=180)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat reply for ``message`` from the module-level ``client``.

    Args:
        message: The new user message to answer.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs; an
            empty/falsy side of a pair is left out of the prompt.
        system_message: Content of the leading ``system`` message.
        max_tokens: Forwarded to ``client.chat_completion`` as ``max_tokens``.
        temperature: Forwarded to ``client.chat_completion``.
        top_p: Forwarded to ``client.chat_completion``.

    Yields:
        The reply text accumulated so far, once per streamed chunk.
    """
    messages = [{"role": "system", "content": system_message}]

    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # The loop variable is named `chunk` so that stream items do not overwrite
    # the `message` parameter.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        # A chunk may carry no text (content is None); concatenating None to a
        # str would raise TypeError, so treat it as an empty piece.
        response += token or ""
        yield response
 
 
42
 
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
# Extra controls for the chat UI, listed in the same order as the parameters
# `respond` takes after `message` and `history`.
_additional_inputs = [
    gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
    gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
    gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
]

# Chat front-end wired to `respond`.
demo = gr.ChatInterface(
    respond,
    additional_inputs=_additional_inputs,
    theme="soft",
)


if __name__ == "__main__":
    demo.launch()
 
1
+ import os
2
+ from threading import Thread
3
+ from typing import Iterator
4
+
5
  import gradio as gr
 
6
  import spaces
7
+ import torch
8
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
9
+
10
# Upper bound and initial value of the "Max new tokens" slider.
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
# Prompts longer than this many tokens are trimmed before generation; the
# value can be overridden through the MAX_INPUT_TOKEN_LENGTH environment variable.
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

# Markdown shown at the top of the page.
DESCRIPTION = """\
# Llama-2 13B Chat
We use Llama-2-13b-chat-hf to implement this ai chat
"""

# Attribution text; the link label now matches the model id in its URL.
LICENSE = """
<p/>
---
As a derivative work of [Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) by Meta
"""

if torch.cuda.is_available():
    # The model and tokenizer are only created when a CUDA device is present.
    model_id = "meta-llama/Llama-2-13b-chat-hf"
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_4bit=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.use_default_system_prompt = False
else:
    # No model is loaded on CPU, so tell the user the demo will not work.
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
34
+
35
+
36
@spaces.GPU
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream the model's reply to ``message``.

    The prompt is assembled from the optional system prompt, every
    ``(user, assistant)`` pair in ``chat_history`` and the new message, then
    tokenized with the tokenizer's chat template. Prompts longer than
    ``MAX_INPUT_TOKEN_LENGTH`` tokens keep only their last
    ``MAX_INPUT_TOKEN_LENGTH`` tokens and a Gradio warning is issued.

    Args:
        message: The new user message.
        chat_history: Earlier turns as ``(user, assistant)`` pairs.
        system_prompt: System message; skipped when empty.
        max_new_tokens: Passed to ``model.generate``.
        temperature: Passed to ``model.generate``.
        top_p: Passed to ``model.generate``.
        top_k: Passed to ``model.generate``.
        repetition_penalty: Passed to ``model.generate``.

    Yields:
        The reply text accumulated so far, once per piece produced by the
        streamer.
    """
    conversation = [{"role": "system", "content": system_prompt}] if system_prompt else []
    for user_turn, assistant_turn in chat_history:
        conversation.append({"role": "user", "content": user_turn})
        conversation.append({"role": "assistant", "content": assistant_turn})
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep the tail of the prompt, i.e. the most recent tokens.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = {
        "input_ids": input_ids,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "top_p": top_p,
        "top_k": top_k,
        "temperature": temperature,
        "num_beams": 1,
        "repetition_penalty": repetition_penalty,
    }
    # Generation runs in a separate thread so the streamer can be read here
    # while it is still producing text.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    reply_so_far = ""
    for piece in streamer:
        reply_so_far += piece
        yield reply_so_far
79
 
80
+
81
# Generation controls, listed in the same order as the parameters `generate`
# takes after `message` and `chat_history`.
_generation_controls = [
    gr.Textbox(label="System prompt", lines=6),
    gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
    gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
    gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
    gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
    gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
]

# Sample prompts; each one becomes a single-element example row.
_example_prompts = [
    "Hello there! How are you doing?",
    "Can you explain briefly to me what is the Python programming language?",
    "Explain the plot of Cinderella in a sentence.",
    "How many hours does it take a man to eat a Helicopter?",
]

chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=_generation_controls,
    stop_btn=None,
    examples=[[prompt] for prompt in _example_prompts],
)

# Page layout: description, duplicate-space button, then the chat UI.
with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(value="Duplicate Space for private use")
    chat_interface.render()


if __name__ == "__main__":
    demo.queue(max_size=20).launch()