# My-Chat/modules/llamacpp_model.py
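"""llama.cpp model wrapper built on llama-cpp-python.

Loads models through the CPU or CUDA build of llama-cpp-python and exposes
tokenization, optional GBNF grammars, and streamed text generation.
(Descriptive docstring added editorially; it summarizes only what this module does below.)
"""
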
import re
from functools import partial

import numpy as np
import torch

from modules import RoPE, shared
from modules.callbacks import Iteratorize
from modules.logging_colors import logger
from modules.text_generation import get_max_prompt_length

# The CPU and CUDA builds of llama-cpp-python are separate packages;
# import whichever ones are installed and fall back to None otherwise.
try:
    import llama_cpp
except Exception:
    llama_cpp = None

try:
    import llama_cpp_cuda
except Exception:
    llama_cpp_cuda = None


def llama_cpp_lib():
    """Return the llama_cpp module to use: the CPU build when --cpu is set
    (and available) or when no CUDA build exists, otherwise the CUDA build."""
    if (shared.args.cpu and llama_cpp is not None) or llama_cpp_cuda is None:
        return llama_cpp
    else:
        return llama_cpp_cuda


def ban_eos_logits_processor(eos_token, input_ids, logits):
    # Prevent the EOS token from ever being sampled.
    logits[eos_token] = -float('inf')
    return logits


def custom_token_ban_logits_processor(token_ids, input_ids, logits):
    # Mask out every user-banned token id.
    for token_id in token_ids:
        logits[token_id] = -float('inf')

    return logits


class LlamaCppModel:
    def __init__(self):
        self.initialized = False
        self.grammar_string = ''
        self.grammar = None

    def __del__(self):
        self.model.__del__()

    @classmethod
    def from_pretrained(cls, path):
        Llama = llama_cpp_lib().Llama
        LlamaCache = llama_cpp_lib().LlamaCache

        result = cls()

        # Parse the optional cache-capacity setting, e.g. "8GiB", "512MiB", or a plain byte count.
        cache_capacity = 0
        if shared.args.cache_capacity is not None:
            if 'GiB' in shared.args.cache_capacity:
                cache_capacity = int(re.sub('[a-zA-Z]', '', shared.args.cache_capacity)) * 1024 * 1024 * 1024
            elif 'MiB' in shared.args.cache_capacity:
                cache_capacity = int(re.sub('[a-zA-Z]', '', shared.args.cache_capacity)) * 1024 * 1024
            else:
                cache_capacity = int(shared.args.cache_capacity)

        logger.info(f"Cache capacity is {cache_capacity} bytes")

        if shared.args.tensor_split is None or shared.args.tensor_split.strip() == '':
            tensor_split_list = None
        else:
            tensor_split_list = [float(x) for x in shared.args.tensor_split.strip().split(",")]

        params = {
            'model_path': str(path),
            'n_ctx': shared.args.n_ctx,
            'seed': int(shared.args.llama_cpp_seed),
            'n_threads': shared.args.threads or None,
            'n_threads_batch': shared.args.threads_batch or None,
            'n_batch': shared.args.n_batch,
            'use_mmap': not shared.args.no_mmap,
            'use_mlock': shared.args.mlock,
            'mul_mat_q': not shared.args.no_mul_mat_q,
            'numa': shared.args.numa,
            'n_gpu_layers': shared.args.n_gpu_layers,
            'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
            'tensor_split': tensor_split_list,
            'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
        }

        result.model = Llama(**params)
        if cache_capacity > 0:
            result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))

        # This is ugly, but the model and the tokenizer are the same object in this library.
        return result, result

    def encode(self, string):
        if type(string) is str:
            string = string.encode()

        return self.model.tokenize(string)

    def decode(self, ids, **kwargs):
        return self.model.detokenize(ids).decode('utf-8')

    def get_logits(self, tokens):
        self.model.eval(tokens)
        logits = self.model._scores
        logits = np.expand_dims(logits, 0)  # batch dim is expected
        return torch.tensor(logits, dtype=torch.float32)

    def load_grammar(self, string):
        # Recompile the GBNF grammar only when the string actually changes.
        if string != self.grammar_string:
            self.grammar_string = string
            if string.strip() != '':
                self.grammar = llama_cpp_lib().LlamaGrammar.from_string(string)
            else:
                self.grammar = None

    def generate(self, prompt, state, callback=None):
        LogitsProcessorList = llama_cpp_lib().LogitsProcessorList

        prompt = prompt if type(prompt) is str else prompt.decode()

        # Handle truncation: keep only the most recent tokens that fit in the prompt budget.
        prompt = self.encode(prompt)
        prompt = prompt[-get_max_prompt_length(state):]
        prompt = self.decode(prompt)

        self.load_grammar(state['grammar_string'])
        logit_processors = LogitsProcessorList()
        if state['ban_eos_token']:
            logit_processors.append(partial(ban_eos_logits_processor, self.model.token_eos()))

        if state['custom_token_bans']:
            to_ban = [int(x) for x in state['custom_token_bans'].split(',')]
            if len(to_ban) > 0:
                logit_processors.append(partial(custom_token_ban_logits_processor, to_ban))

        completion_chunks = self.model.create_completion(
            prompt=prompt,
            max_tokens=state['max_new_tokens'],
            temperature=state['temperature'],
            top_p=state['top_p'],
            top_k=state['top_k'],
            repeat_penalty=state['repetition_penalty'],
            presence_penalty=state['presence_penalty'],
            frequency_penalty=state['frequency_penalty'],
            tfs_z=state['tfs'],
            mirostat_mode=int(state['mirostat_mode']),
            mirostat_tau=state['mirostat_tau'],
            mirostat_eta=state['mirostat_eta'],
            stream=True,
            logits_processor=logit_processors,
            grammar=self.grammar
        )

        output = ""
        for completion_chunk in completion_chunks:
            if shared.stop_everything:
                break

            text = completion_chunk['choices'][0]['text']
            output += text
            if callback:
                callback(text)

        return output

    def generate_with_streaming(self, *args, **kwargs):
        # Accumulate and yield the growing reply so the caller can update incrementally.
        with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
            reply = ''
            for token in generator:
                reply += token
                yield reply
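

# Usage sketch (illustrative, not part of the original module). It assumes the
# application's shared.args have already been populated by its argument parser
# and that `state` is the usual dict of generation settings this module reads
# (max_new_tokens, temperature, grammar_string, ban_eos_token, ...). The model
# path below is a placeholder.
#
#   model, tokenizer = LlamaCppModel.from_pretrained("models/example.Q4_K_M.gguf")
#   for partial_reply in model.generate_with_streaming("Hello,", state):
#       print(partial_reply)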