import logging
import os
import torch
from huggingface_hub import login
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from vllm import LLM, SamplingParams

# Authenticate with the Hugging Face Hub so gated or private checkpoints can be downloaded.
login(token=os.getenv('HF_TOKEN'))

class Model(torch.nn.Module):
    """Wrapper that serves the listed checkpoints through vLLM, falling back to transformers for T5."""

    # Class-level counter of how many Model instances have been created.
    number_of_models = 0
    # Checkpoints this wrapper is intended to be used with.
    __model_list__ = [
        "Qwen/Qwen2-1.5B-Instruct",
        "lmsys/vicuna-7b-v1.5",
        "google-t5/t5-large",
        "mistralai/Mistral-7B-Instruct-v0.1",
        "meta-llama/Meta-Llama-3.1-8B-Instruct"
    ]

    def __init__(self, model_name="Qwen/Qwen2-1.5B-Instruct") -> None:
        super(Model, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.name = model_name
        # The T5 encoder-decoder model is handled with transformers; everything else goes through vLLM.
        self.use_vllm = model_name != "google-t5/t5-large"
        logging.info(f'Start loading model {self.name}')
        if self.use_vllm:
            # Load the model with vLLM.
            self.llm = LLM(
                model=model_name,
                dtype="half",
                tokenizer=model_name,
                trust_remote_code=True
            )
        else:
            # Load the plain transformers seq2seq model.
            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16,
                device_map="auto"
            )
            self.model.eval()
        logging.info(f'Loaded model {self.name}')
        self.update()

    @classmethod
    def update(cls):
        # Increment the shared instance counter.
        cls.number_of_models += 1
    def gen(self, content_list, temp=0.001, max_length=500, do_sample=True):
        """Generate one completion per prompt in content_list and return them as a list of strings."""
        if self.use_vllm:
            sampling_params = SamplingParams(
                temperature=temp,
                max_tokens=max_length,
                # top_p=0.95 if do_sample else 1.0,
                stop_token_ids=[self.tokenizer.eos_token_id]
            )
            outputs = self.llm.generate(content_list, sampling_params)
            return [output.outputs[0].text for output in outputs]
        else:
            input_ids = self.tokenizer(
                content_list, return_tensors="pt", padding=True, truncation=True
            ).input_ids.to(self.model.device)
            outputs = self.model.generate(
                input_ids,
                max_new_tokens=max_length,
                do_sample=do_sample,
                temperature=temp,
                eos_token_id=self.tokenizer.eos_token_id,
            )
            # Seq2seq generate() returns only decoder tokens, so the output does not need
            # to be sliced past the input length before decoding.
            return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
    def streaming(self, content_list, temp=0.001, max_length=500, do_sample=True):
        """Yield (prompt_index, token_text) pairs for a batch of prompts."""
        if self.use_vllm:
            sampling_params = SamplingParams(
                temperature=temp,
                max_tokens=max_length,
                top_p=0.95 if do_sample else 1.0,
                stop_token_ids=[self.tokenizer.eos_token_id]
            )
            # The offline vLLM LLM.generate API has no streaming mode and only returns
            # completed outputs, so generate first and then yield each completion's
            # tokens one at a time to keep the (index, token) streaming interface.
            outputs = self.llm.generate(content_list, sampling_params)
            for i, request_output in enumerate(outputs):
                for token_id in request_output.outputs[0].token_ids:
                    token_text = self.tokenizer.decode(token_id, skip_special_tokens=True)
                    yield i, token_text
        else:
            # Seq2seq (T5) path: grow decoder_input_ids one token at a time so tokens can be
            # yielded as they are produced. Simple but recomputes the encoder on every step.
            inputs = self.tokenizer(
                content_list, return_tensors="pt", padding=True, truncation=True
            ).to(self.model.device)
            batch_size = inputs.input_ids.shape[0]
            decoder_input_ids = torch.full(
                (batch_size, 1),
                self.model.config.decoder_start_token_id,
                dtype=torch.long,
                device=self.model.device,
            )
            finished = torch.zeros(batch_size, dtype=torch.bool, device=self.model.device)
            for _ in range(max_length):
                with torch.no_grad():
                    output = self.model.generate(
                        input_ids=inputs.input_ids,
                        attention_mask=inputs.attention_mask,
                        decoder_input_ids=decoder_input_ids,
                        do_sample=do_sample,
                        temperature=temp,
                        eos_token_id=self.tokenizer.eos_token_id,
                        max_new_tokens=1,
                    )
                # generate() returns the decoder prefix plus the newly sampled token,
                # so the last column holds the new token for every sequence.
                next_tokens = output[:, -1].unsqueeze(-1)
                for i in range(batch_size):
                    if not finished[i]:
                        yield i, self.tokenizer.decode(next_tokens[i, 0], skip_special_tokens=True)
                finished |= next_tokens.squeeze(-1) == self.tokenizer.eos_token_id
                decoder_input_ids = torch.cat([decoder_input_ids, next_tokens], dim=-1)
                if bool(finished.all()):
                    break