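"""FastAPI Space that loads two Qwen2 instruct models (0.5B and 1.5B),
runs a warm-up generation at startup, and serves them on /test and /text,
printing step-by-step latency along the way."""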
from fastapi import FastAPI
import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cpu"  # the device to load the model onto

time1 = time.time()
# Load the 0.5B instruct model and its tokenizer, timing each step.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype="auto",
    device_map="auto"
)
time2 = time.time()
print(time2 - time1)  # 0.5B model load time

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
time3 = time.time()
print(time3 - time1)  # cumulative 0.5B model + tokenizer load time
# Load the larger 1.5B instruct model with its own tokenizer.
model1 = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-1.5B-Instruct",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer1 = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")
time4 = time.time()
print(time4 - time3)  # 1.5B model + tokenizer load time
app = FastAPI()
time5 = time.time()
print(time5 - time4)  # FastAPI app creation time

@app.get("/")
async def read_root():
    return {"Hello": "World!"}
# One warm-up generation at import time so the first request is not cold,
# printing the latency of each stage of the pipeline.
start_time = time.time()
messages = [
    {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will respond politely and briefly."},
    {"role": "user", "content": "I'm Alok. Who are you?"},
    {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
    {"role": "user", "content": "How are you?"}
]
time1 = time.time()
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
time2 = time.time()
print(time2 - time1)  # chat-template formatting time
model_inputs = tokenizer([text], return_tensors="pt").to(device)
time3 = time.time()
print(time3 - time2)  # tokenization time
generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=64
)
time4 = time.time()
print(time4 - time3)  # generation time
# Drop the prompt tokens so only the newly generated tokens remain.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
time5 = time.time()
print(time5 - time4)  # prompt-stripping time
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
time6 = time.time()
print(time6 - time5)  # decode time
end_time = time.time()
time_taken = end_time - start_time
print(time_taken)  # total warm-up time
@app.get("/test")
async def read_droot():
starttime = time.time()
messages = [
{"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will response in polity and brief."},
{"role": "user", "content": "I'm Alok. Who are you?"},
{"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
{"role": "user", "content": "How are you?"}
]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
generated_ids = model.generate(
model_inputs.input_ids,
max_new_tokens=64
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
end_time = time.time()
time_taken = end_time - starttime
print(time_taken)
return {"Hello": "World!"}
@app.get("/text")
async def read_droot():
starttime = time.time()
messages = [
{"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will response in polity and brief."},
{"role": "user", "content": "I'm Alok. Who are you?"},
{"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
{"role": "user", "content": "How are you?"}
]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
model_inputs = tokenizer1([text], return_tensors="pt").to(device)
generated_ids = model1.generate(
model_inputs.input_ids,
max_new_tokens=64
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer1.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
end_time = time.time()
time_taken = end_time - starttime
print(time_taken)
return {"Hello": "World!"}
#return {response: time}
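
# ---------------------------------------------------------------------
# Usage sketch (an assumption, not part of the original Space): Hugging
# Face Spaces typically runs FastAPI apps under uvicorn on port 7860.
# Under that assumption, this block also lets the file be started
# locally with `python app.py`; the endpoints can then be exercised
# with, e.g., `curl http://localhost:7860/test` (0.5B model) or
# `curl http://localhost:7860/text` (1.5B model).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)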