"""Run qwen 7b. | |
transformers 4.31.0 | |
""" | |
import os
import time

import torch
from loguru import logger
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers.generation import GenerationConfig
os.environ["TZ"] = "Asia/Shanghai"
try:
    time.tzset()  # type: ignore # pylint: disable=no-member
except Exception:
    # time.tzset() is not available on Windows
    logger.warning("Windows detected, can't run time.tzset()")
device_map = "cuda:0" if torch.cuda.is_available() else "cpu"
# device_map = "cpu"  # uncomment to force CPU
model_name = "Qwen/Qwen-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Quantization configuration for NF4 (4-bit)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Quantization configuration for Int8 (8-bit). Note: this assignment
# overrides the NF4 config above, so the model below is loaded in 8-bit.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=quantization_config,
    # max_memory=max_memory,
    trust_remote_code=True,
).eval()
# Alternative: load without quantization in bf16 (runs, but needs more VRAM):
# model = AutoModelForCausalLM.from_pretrained(
#     "Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True, bf16=True
# ).eval()
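
# Optional sanity check (assumption: the loaded model exposes transformers'
# PreTrainedModel.get_memory_footprint) to confirm quantization shrank the model:
if hasattr(model, "get_memory_footprint"):
    logger.info(f"Model memory footprint: {model.get_memory_footprint() / 2**30:.2f} GiB")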
# Generation length, top_p, and other hyperparameters can be adjusted here.
model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
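# For example, via the standard GenerationConfig fields (values below are
# illustrative, not tuned):
# model.generation_config.max_new_tokens = 512
# model.generation_config.top_p = 0.9
# model.generation_config.temperature = 0.7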
# response, history = model.chat(tokenizer, "你好", history=None)
response, history = model.chat(tokenizer, "你好", history=[])
print(response)
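
# Multi-turn chat (a minimal sketch): chat() returns the updated history,
# which can be passed back in for a follow-up turn. The follow-up query
# here is illustrative.
response, history = model.chat(tokenizer, "Please introduce yourself.", history=history)
print(response)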