from fastapi import FastAPI
import os
import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Read token for the gated Gemma checkpoint (e.g. supplied as a Space secret).
access_token = os.getenv("read_access")

device = "cpu"  # the device to load the model onto
# The Qwen2 checkpoints share one tokenizer, so a single instance covers
# both the 0.5B and 1.5B models.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
# 0.5B model backing /test (assumed: it matches the tokenizer loaded above).
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    device_map="auto"
)
model1 = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-1.5B-Instruct",
    device_map="auto"
)

tokenizer2 = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", token=access_token)
model2 = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",
    device_map="auto",
    token=access_token
)
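
# All three checkpoints are loaded eagerly at import time; on a CPU-only
# Space, expect a slow startup and several GB of resident RAM (roughly
# 0.5B + 1.5B + 2B parameters held at once).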
app = FastAPI()


@app.get("/")
async def read_root():
    return {"Hello": "World!"}
@app.get("/test")
async def read_droot():
starttime = time.time()
messages = [
{"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will response in polity and brief."},
{"role": "user", "content": "I'm Alok. Who are you?"},
{"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
{"role": "user", "content": "How are you?"}
]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
generated_ids = model.generate(
model_inputs.input_ids,
max_new_tokens=128
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
end_time = time.time()
time_taken = end_time - starttime
print(time_taken)
return {"Hello": "World!"}
@app.get("/text")
async def read_droot():
starttime = time.time()
messages = [
{"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will response in polity and brief."},
{"role": "user", "content": "I'm Alok. Who are you?"},
{"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
{"role": "user", "content": "How are you?"}
]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
generated_ids = model1.generate(
model_inputs.input_ids,
max_new_tokens=64
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
end_time = time.time()
time_taken = end_time - starttime
print(time_taken)
return {"Hello": "World!"}
#return {response: time}
@app.get("/tet")
async def read_droot():
starttime = time.time()
messages = [
{"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will response in polity and brief."},
{"role": "user", "content": "I'm Alok. Who are you?"},
{"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
{"role": "user", "content": "How are you?"}
]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
model_inputs = tokenizer2([text], return_tensors="pt").to(device)
generated_ids = model2.generate(
model_inputs.input_ids,
max_new_tokens=64
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer2.batch_decode(generated_ids, skip_special_tokens=True)[0]
respons = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
end_time = time.time()
time_taken = end_time - starttime
print(time_taken)
return {"Hello": respons}
#return {response: time} |
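
# A minimal sketch of how to serve and exercise the app (assumed; the
# Space's actual start command is not part of this file):
#   uvicorn app:app --host 0.0.0.0 --port 7860
#   curl http://localhost:7860/test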