import os
import sys

# Make the local yuan_moe_hf_model module importable before importing from it.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))

import torch
from transformers import LlamaTokenizer

from yuan_moe_hf_model import YuanForCausalLM

# Run on GPU when available; fall back to CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Path to the int4-quantized Yuan2-M32 checkpoint.
quantized_model_dir = "/temp_data/LLM_test/MOE/Yuan2-M32-int4-hf"

# Yuan2-M32 uses the Llama tokenizer with '<eod>' as its end-of-document token;
# BOS/EOS tokens are not added automatically.
tokenizer = LlamaTokenizer.from_pretrained(quantized_model_dir, add_eos_token=False, add_bos_token=False, eos_token='<eod>')

# Load the quantized weights from safetensors in fp16 and move the model to the target device.
model = YuanForCausalLM.from_pretrained(quantized_model_dir, trust_remote_code=True, use_safetensors=True, torch_dtype=torch.float16).to(device)

# Tokenize the prompt, generate up to 256 new tokens (greedy decoding by default),
# and decode the result with special tokens stripped.
input_text = "北京是中国的"  # "Beijing is China's ..." (a completion prompt)
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
output_ids = model.generate(input_ids, max_new_tokens=256)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(output_text)
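
# Optional variant (a sketch, not part of the original script): sampling-based decoding
# using standard transformers generate() kwargs instead of the greedy default above.
sampled_ids = model.generate(input_ids, max_new_tokens=256, do_sample=True, temperature=0.7, top_p=0.9)
print(tokenizer.decode(sampled_ids[0], skip_special_tokens=True))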