import time

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
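
# Run one generation pass and return the decoded output together with the
# number of newly generated tokens.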
def run_LLM(model, tokenizer, streamer, prompt):
    token_ids = tokenizer.encode(prompt, return_tensors="pt")
    output_ids = model.generate(
        input_ids=token_ids.to(model.device),
        streamer=streamer,        # stream tokens to the console as they are generated
        max_new_tokens=3000000,   # effectively unbounded; generation stops at the EOS token
        do_sample=True,
        temperature=0.8,
    )

    # Count only the newly generated tokens; len(output_ids[0]) would also
    # include the prompt tokens and inflate the speed figures.
    n_tokens = output_ids.shape[1] - token_ids.shape[1]
    output_text = tokenizer.decode(output_ids[0])

    return (output_text, n_tokens)


def display_message():
    # The model is reloaded on every call; for a long-lived app, load it once
    # at module level instead.
    model = AutoModelForCausalLM.from_pretrained(
        "cyberagent/calm2-7b-chat",
        device_map="cuda",
        torch_dtype="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained("cyberagent/calm2-7b-chat")
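    # TextStreamer prints the generated text to the console in real time,
    # skipping the prompt echo and special tokens.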
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # calm2-7b-chat expects the "USER: ... ASSISTANT: " chat format, so the
    # USER: prefix is added here; the question asks for an outlook on Japan's economy.
    prompt = """USER: わが国の経済について今後の予想を教えてください。
ASSISTANT: """

    t_start = time.perf_counter()
    t_prev = t_start
    t_sum = 0.0
    total_tokens = 0

    log = ''

    for i in range(10):
        (result, n_tokens) = run_LLM(model, tokenizer, streamer, prompt)

        total_tokens = total_tokens + n_tokens
        t_curr = time.perf_counter()
        t_lap = t_curr - t_prev        # wall-clock time for this run
        t_prev = t_curr
        t_sum = t_sum + t_lap
        t_avg = t_sum / (i + 1.0)      # average time per run so far

        speed_now = n_tokens / t_lap       # tokens/s for this run
        speed_avg = total_tokens / t_sum   # tokens/s over all runs so far

        # Columns: run, lap time, avg time, current speed, avg speed,
        # tokens this run, total tokens, elapsed time.
        line = "%d %f %f %f %f %d %d %f" % (i+1, t_lap, t_avg, speed_now, speed_avg, n_tokens, total_tokens, t_sum)
        log = log + line + "\n"

    return log
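

# Expose the benchmark through a minimal Gradio app: no input widgets, a
# single text output holding the run log; launch() serves on
# http://localhost:7860 by default.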

if __name__ == '__main__':
    iface = gr.Interface(fn=display_message, inputs=None, outputs="text")
    iface.launch()