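# Inference demo for a GPTQ-quantized LLaMA model (Chinese-Vicuna).
# The script loads a quantized checkpoint with load_quant() and then either
# generates text directly for --text, or serves an instruction-following
# Gradio interface when --gradio is passed.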
import sys
import torch
import torch.nn as nn
import transformers
import gradio as gr
import argparse
import warnings
import os
import quant
from gptq import GPTQ
from datautils import get_loaders

assert (
    "LlamaTokenizer" in transformers._import_structure["models.llama"]
), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
    """Recursively collect all sub-modules of the given types, keyed by their dotted name."""
    if type(module) in layers:
        return {name: module}
    res = {}
    for name1, child in module.named_children():
        res.update(find_layers(child, layers=layers, name=name + '.' + name1 if name != '' else name1))
    return res
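# Example: on a LlamaForCausalLM, find_layers(model) maps dotted module names to Linear layers,
# e.g. {'model.layers.0.self_attn.q_proj': Linear(...), ..., 'lm_head': Linear(...)} (illustrative
# key names); load_quant() below drops 'lm_head' so the output head is not quantized.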
def load_quant(model, checkpoint, wbits, groupsize=-1, fused_mlp=True, eval=True, warmup_autotune=True):
    """Build an empty LLaMA model from its config and load GPTQ-quantized weights from `checkpoint`."""
    from transformers import LlamaConfig, LlamaForCausalLM
    config = LlamaConfig.from_pretrained(model)

    def noop(*args, **kwargs):
        pass

    # Skip the (slow) random weight initialization: the weights are overwritten
    # by the quantized checkpoint right after the model skeleton is built.
    torch.nn.init.kaiming_uniform_ = noop
    torch.nn.init.uniform_ = noop
    torch.nn.init.normal_ = noop

    torch.set_default_dtype(torch.half)
    transformers.modeling_utils._init_weights = False
    model = LlamaForCausalLM(config)
    torch.set_default_dtype(torch.float)
    if eval:
        model = model.eval()

    # Replace every Linear layer except the output head with a quantized linear layer.
    layers = find_layers(model)
    for name in ['lm_head']:
        if name in layers:
            del layers[name]
    quant.make_quant_linear(model, layers, wbits, groupsize)
    del layers

    print('Loading model ...')
    model.load_state_dict(torch.load(checkpoint), strict=False)
    quant.make_quant_attn(model)
    if eval and fused_mlp:
        quant.make_fused_mlp(model)
    if warmup_autotune:
        quant.autotune_warmup_linear(model, transpose=not (eval))
        if eval and fused_mlp:
            quant.autotune_warmup_fused(model)
    model.seqlen = 2048
    print('Done.')
    return model
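# Example call (hypothetical checkpoint name; a 4-bit, group-size-128 GPTQ file is assumed):
#   model = load_quant("decapoda-research/llama-7b-hf", "llama7b-4bit-128g.pt", 4, 128)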
def generate_prompt(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input}
### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:"""
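# For example, generate_prompt("Tell me about alpacas.") yields:
#   Below is an instruction that describes a task. Write a response that appropriately completes the request.
#   ### Instruction:
#   Tell me about alpacas.
#   ### Response: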
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default="decapoda-research/llama-7b-hf", help="LLaMA Hugging Face model to load")
    parser.add_argument("--quant_path", type=str, default="llama7b-8bit-128g.pt", help="path to the quantized model checkpoint")
    parser.add_argument(
        "--wbits",
        type=int,
        default=4,
        choices=[2, 3, 4, 8],
        help="bits to use for quantization; use 8 for evaluating the base model.")
    parser.add_argument('--text', type=str, default='the meaning of life is', help='input text')
    parser.add_argument('--min_length', type=int, default=10, help='The minimum length of the sequence to be generated.')
    parser.add_argument('--max_length', type=int, default=256, help='The maximum length of the sequence to be generated.')
    parser.add_argument('--top_p',
                        type=float,
                        default=0.95,
                        help='If set to a float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.')
    parser.add_argument('--temperature', type=float, default=0.1, help='The value used to modulate the next token probabilities.')
    parser.add_argument('--repetition_penalty', type=float, default=2.0, help='The parameter for repetition penalty; 1.0 means no penalty (range 0-10).')
    parser.add_argument('--groupsize', type=int, default=-1, help='Groupsize to use for quantization; default uses the full row.')
    parser.add_argument('--gradio', action='store_true', help='Whether to use Gradio to present results.')
    args = parser.parse_args()

    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    model = load_quant(args.model_path, args.quant_path, args.wbits, args.groupsize)
    model.to(device)
    tokenizer = LlamaTokenizer.from_pretrained(args.model_path)
    model.eval()
    if torch.__version__ >= "2" and sys.platform != "win32":
        model = torch.compile(model)
    if not args.gradio:
        # [Way 1]: generate directly from the command-line --text
        input_ids = tokenizer.encode(args.text, return_tensors="pt").to(device)
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids,
                min_new_tokens=args.min_length,
                max_new_tokens=args.max_length,
                top_p=args.top_p,
                temperature=args.temperature,
                repetition_penalty=args.repetition_penalty,
            )
        print("*" * 80)
        print("🦙:", tokenizer.decode([el.item() for el in generated_ids[0]], skip_special_tokens=True))
    else:
        # [Way 2]: generate through the Gradio interface
        def evaluate(
            input,
            temperature=0.1,
            top_p=0.75,
            top_k=40,
            num_beams=1,
            max_new_tokens=128,
            repetition_penalty=1.0,
            **kwargs,
        ):
            prompt = generate_prompt(input)
            inputs = tokenizer(prompt, return_tensors="pt")
            input_ids = inputs["input_ids"].to(device)
            generation_config = GenerationConfig(
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                num_beams=num_beams,
                **kwargs,
            )
            with torch.no_grad():
                generation_output = model.generate(
                    input_ids=input_ids,
                    generation_config=generation_config,
                    return_dict_in_generate=True,
                    output_scores=True,
                    max_new_tokens=max_new_tokens,
                    repetition_penalty=float(repetition_penalty),
                )
            s = generation_output.sequences[0]
            output = tokenizer.decode(s, skip_special_tokens=True)
            # The decoded text contains the full prompt; keep only the model's answer.
            return output.split("### Response:")[1].strip()
        gr.Interface(
            fn=evaluate,
            inputs=[
                gr.components.Textbox(
                    lines=2, label="Input", placeholder="Tell me about alpacas."
                ),
                gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
                gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
                gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
                gr.components.Slider(minimum=1, maximum=5, step=1, value=1, label="Beams"),
                gr.components.Slider(
                    minimum=1, maximum=2000, step=1, value=256, label="Max tokens"
                ),
                gr.components.Slider(
                    minimum=0.1, maximum=10.0, step=0.1, value=1.0, label="Repetition Penalty"
                ),
            ],
            outputs=[
                gr.components.Textbox(
                    lines=5,
                    label="Output",
                )
            ],
            title="Chinese-Vicuna 中文小羊驼",
            description="Chinese-Vicuna (中文小羊驼) is trained on a variety of high-quality open-source instruction datasets using the Alpaca-lora code. The model is based on the open-source llama7B, and the main contribution is the corresponding lora weights. Since training requires relatively few resources, we hope to contribute to the llama Chinese lora community.",
        ).launch(share=True)
if __name__ == '__main__':
    main()
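# Usage (the script file name below is an assumption; substitute the actual name):
#   python app.py --wbits 4 --groupsize 128 --text "Tell me about alpacas."
#   python app.py --wbits 4 --groupsize 128 --gradio   # launch the web demo instead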