import torch
import gradio as gr
import argparse
from transformers import AutoModelForCausalLM, AutoTokenizer
# from transformers import LlamaForCausalLM, LlamaTokenizer
from utils import load_hyperparam, load_model
from models.tokenize import Tokenizer
from models.llama import *
from generate import LmGeneration
from huggingface_hub import hf_hub_download
import os
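
# Make CUDA kernel launches synchronous so errors surface at the failing call (debugging aid).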
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

args = None
lm_generation = None
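

# Populate an argparse.Namespace with model/config paths and generation defaults.
# No CLI flags are declared, so parse_args() just returns an empty namespace to fill in.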
def init_args():
    global args
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    args = parser.parse_args()
    args.load_model_path = 'Linly-AI/ChatFlow-13B'
    # args.load_model_path = 'Linly-AI/ChatFlow-7B'
    # args.load_model_path = './model_file/chatllama_7b.bin'
    # args.config_path = './config/llama_7b.json'
    # args.load_model_path = './model_file/chatflow_13b.bin'
    args.config_path = './config/llama_13b_config.json'
    args.spm_model_path = './model_file/tokenizer.model'
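    # Loading and generation defaults: batch size, context length, int8 quantization, repetition penalty.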
    args.batch_size = 1
    args.seq_length = 1024
    args.world_size = 1
    args.use_int8 = True
    args.top_p = 0
    args.repetition_penalty_range = 1024
    args.repetition_penalty_slope = 0
    args.repetition_penalty = 1.15
    args = load_hyperparam(args)
    # args.tokenizer = Tokenizer(model_path=args.spm_model_path)
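    # use_fast=False keeps the slow, sentencepiece-backed tokenizer, whose sp_model supplies the vocab size below.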
    args.tokenizer = AutoTokenizer.from_pretrained("Linly-AI/Chinese-LLaMA-2-7B-hf", use_fast=False, trust_remote_code=True)
    args.vocab_size = args.tokenizer.sp_model.vocab_size()
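

# Load the model weights and build the generator. The earlier manual loading path
# (custom LLaMa class plus chatflow_13b.bin downloaded from the Hub) is kept commented
# out; the current code loads the Hugging Face checkpoint directly.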
def init_model():
    global lm_generation
    global model
    # torch.set_default_tensor_type(torch.HalfTensor)
    # model = LLaMa(args)
    # torch.set_default_tensor_type(torch.FloatTensor)
    # args.load_model_path = hf_hub_download(repo_id=args.load_model_path, filename='chatflow_13b.bin')
    # model = load_model(model, args.load_model_path)
    # model.eval()
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # model.to(device)
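    # Load the checkpoint in fp16 and let device_map="auto" place it on the available devices.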
    model = AutoModelForCausalLM.from_pretrained("Linly-AI/Chinese-LLaMA-2-7B-hf", device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
    print(model)
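    # Report peak allocated GPU memory in GiB.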
    print(torch.cuda.max_memory_allocated() / 1024 ** 3)
    lm_generation = LmGeneration(model, args.tokenizer)
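

# Gradio callback: generate a reply for a single prompt. The top_k and temperature
# slider values are currently unused because the LmGeneration sampling path is commented out.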
def chat(prompt, top_k, temperature):
    # args.top_k = int(top_k)
    # args.temperature = temperature
    # response = lm_generation.generate(args, [prompt])
    response = model.chat(args.tokenizer, [prompt])
    print('log:', response)
    return response
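

# Build and launch the Gradio UI: a prompt text box plus sliders for top_k and temperature.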
if __name__ == '__main__':
    init_args()
    init_model()
    demo = gr.Interface(
        fn=chat,
        inputs=["text", gr.Slider(1, 60, value=10, step=1), gr.Slider(0.1, 2.0, value=1.0, step=0.1)],
        outputs="text",
    )
    demo.launch()