File size: 1,723 Bytes
72268ee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
from model import ExLlama, ExLlamaCache, ExLlamaConfig
from tokenizer import ExLlamaTokenizer
from generator import ExLlamaGenerator
import os, glob
# Directory containing model, tokenizer, generator
model_directory = "/mnt/str/models/llama-13b-4bit-128g/"
# Locate files we need within that directory
tokenizer_path = os.path.join(model_directory, "tokenizer.model")
model_config_path = os.path.join(model_directory, "config.json")
st_pattern = os.path.join(model_directory, "*.safetensors")
model_path = glob.glob(st_pattern)
# Batched prompts
prompts = [
"Once upon a time,",
"I don't like to",
"A turbo encabulator is a",
"In the words of Mark Twain,"
]
# Create config, model, tokenizer and generator
config = ExLlamaConfig(model_config_path) # create config from config.json
config.model_path = model_path # supply path to model weights file
model = ExLlama(config) # create ExLlama instance and load the weights
tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file
cache = ExLlamaCache(model, batch_size = len(prompts)) # create cache for inference
generator = ExLlamaGenerator(model, tokenizer, cache) # create generator
# Configure generator
generator.disallow_tokens([tokenizer.eos_token_id])
generator.settings.token_repetition_penalty_max = 1.2
generator.settings.temperature = 0.95
generator.settings.top_p = 0.65
generator.settings.top_k = 100
generator.settings.typical = 0.5
# Generate, batched
for line in prompts:
print(line)
output = generator.generate_simple(prompts, max_new_tokens = 200)
for line in output:
print("---")
print(line)
|