Spaces:

Aston-xMAD
/

1bit_llama3_instruct_xmad_chatbot

Runtime error

App Files Files Community

1bit_llama3_instruct_xmad_chatbot / src /transformers /benchmark /benchmark.py

Aston-xMAD

init commit

b37c16f verified 5 months ago

raw

history blame contribute delete

10.8 kB

	# coding=utf-8
	# Copyright 2018 The HuggingFace Inc. team.
	# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""
	Benchmarking the library on inference and training in PyTorch.
	"""


	import timeit
	from typing import Callable, Optional, Tuple

	from ..configuration_utils import PretrainedConfig
	from ..models.auto.modeling_auto import MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING
	from ..utils import is_py3nvml_available, is_torch_available, logging
	from .benchmark_utils import (
	    Benchmark,
	    Memory,
	    MemorySummary,
	    measure_peak_memory_cpu,
	    start_memory_tracing,
	    stop_memory_tracing,
	)


	if is_torch_available():
	    import torch

	    from .benchmark_args import PyTorchBenchmarkArguments


	if is_py3nvml_available():
	    import py3nvml.py3nvml as nvml

	logger = logging.get_logger(__name__)


	class PyTorchBenchmark(Benchmark):
	    """
	    `Benchmark` subclass that measures PyTorch models.

	    Each `_inference_*` / `_train_*` method builds a zero-argument callable that runs a single forward pass (or a
	    single forward + backward pass) on random token ids, and hands it to `_measure_speed` or `_measure_memory`.
	    """

	    args: PyTorchBenchmarkArguments
	    configs: PretrainedConfig
	    framework: str = "PyTorch"

	    @property
	    def framework_version(self):
	        """Return `torch.__version__`."""
	        return torch.__version__

	    def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
	        """Time the inference callable for `model_name`; see `_measure_speed` for the returned value."""
	        _inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
	        return self._measure_speed(_inference)

	    def _inference_memory(
	        self, model_name: str, batch_size: int, sequence_length: int
	    ) -> Tuple[Memory, Optional[MemorySummary]]:
	        """Measure memory of the inference callable for `model_name`; see `_measure_memory`."""
	        _inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
	        return self._measure_memory(_inference)

	    def _train_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
	        """Time the training callable for `model_name`; see `_measure_speed` for the returned value."""
	        _train = self._prepare_train_func(model_name, batch_size, sequence_length)
	        return self._measure_speed(_train)

	    def _train_memory(
	        self, model_name: str, batch_size: int, sequence_length: int
	    ) -> Tuple[Memory, Optional[MemorySummary]]:
	        """Measure memory of the training callable for `model_name`; see `_measure_memory`."""
	        _train = self._prepare_train_func(model_name, batch_size, sequence_length)
	        return self._measure_memory(_train)

	    def _build_model_from_config(self, config, fallback_mapping):
	        """
	        Instantiate the model to benchmark from `config`.

	        The class named by `config.architectures[0]` is used when the config lists one and
	        `args.only_pretrain_model` is not set; otherwise the class stored under `config.__class__` in
	        `fallback_mapping` is used.

	        Raises:
	            ImportError: if the architecture named by the config cannot be resolved from `transformers`.
	        """
	        architectures = getattr(config, "architectures", None)
	        has_model_class_in_config = isinstance(architectures, list) and len(architectures) > 0

	        if self.args.only_pretrain_model or not has_model_class_in_config:
	            return fallback_mapping[config.__class__](config)

	        model_class = architectures[0]
	        try:
	            transformers_module = __import__("transformers", fromlist=[model_class])
	            model_cls = getattr(transformers_module, model_class)
	        except (ImportError, AttributeError) as err:
	            # A missing attribute surfaces as AttributeError from getattr, not ImportError, so both are caught to
	            # make sure the hint below is actually shown for an unknown architecture name.
	            raise ImportError(
	                f"{model_class} does not exist. If you just want to test the pretrained model, you might want to"
	                " set `--only_pretrain_model` or `args.only_pretrain_model=True`."
	            ) from err

	        # Constructed outside the try block so that errors raised by the model's own constructor are not
	        # relabelled as "does not exist".
	        return model_cls(config)

	    def _random_input_ids(self, config, batch_size: int, sequence_length: int):
	        """Return a `(batch_size, sequence_length)` tensor of random `torch.long` ids on `args.device`."""
	        # encoder-decoder has vocab size saved differently
	        vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
	        return torch.randint(vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device)

	    def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
	        """
	        Build the callable that runs one forward pass under `torch.no_grad()`.

	        The model is put in eval mode, moved to `args.device`, cast to half precision when `args.fp16` is set and
	        traced with `torch.jit.trace` when `args.torchscript` is set.

	        Raises:
	            ValueError: if `args.fp16` is set while `args.is_gpu` is false.
	        """
	        config = self.config_dict[model_name]

	        if self.args.torchscript:
	            # Note: this sets the flag on the config object stored in `self.config_dict`.
	            config.torchscript = True

	        model = self._build_model_from_config(config, MODEL_MAPPING)
	        model.eval()
	        model.to(self.args.device)

	        input_ids = self._random_input_ids(config, batch_size, sequence_length)

	        if self.args.fp16:
	            logger.info("Running inference in Mixed Precision...")
	            if not self.args.is_gpu:
	                raise ValueError("Mixed precision is possible only for GPU.")
	            # amp seems to have memory leaks so that memory usage
	            # is measured using .half() for now https://github.com/NVIDIA/apex/issues/439
	            model.half()

	        if self.args.torchscript:
	            with torch.no_grad():
	                inference_model = torch.jit.trace(model, input_ids)
	        else:
	            inference_model = model

	        def encoder_decoder_forward():
	            with torch.no_grad():
	                outputs = inference_model(input_ids, decoder_input_ids=input_ids)
	            return outputs

	        def encoder_forward():
	            with torch.no_grad():
	                outputs = inference_model(input_ids)
	            return outputs

	        return encoder_decoder_forward if config.is_encoder_decoder else encoder_forward

	    def _prepare_train_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
	        """
	        Build the callable that runs one forward pass with `labels=input_ids` followed by `loss.backward()`.

	        The model is put in train mode, moved to `args.device` and cast to half precision when `args.fp16` is set.

	        Raises:
	            NotImplementedError: if `args.torchscript` is set.
	            ValueError: if `args.fp16` is set while `args.is_gpu` is false.
	        """
	        config = self.config_dict[model_name]

	        if self.args.torchscript:
	            # Fail before paying for model construction.
	            raise NotImplementedError("Training for torchscript is currently not implemented")

	        model = self._build_model_from_config(config, MODEL_WITH_LM_HEAD_MAPPING)
	        model.train()
	        model.to(self.args.device)

	        input_ids = self._random_input_ids(config, batch_size, sequence_length)

	        if self.args.fp16:
	            logger.info("Running training in Mixed Precision...")
	            if not self.args.is_gpu:
	                raise ValueError("Mixed precision is possible only for GPU.")

	            # amp seems to have memory leaks so that memory usage
	            # is measured using .half() for now https://github.com/NVIDIA/apex/issues/439
	            model.half()

	        def compute_loss_and_backprop_encoder():
	            loss = model(input_ids, labels=input_ids)[0]
	            loss.backward()
	            return loss

	        def compute_loss_and_backprop_encoder_decoder():
	            loss = model(input_ids, decoder_input_ids=input_ids, labels=input_ids)[0]
	            loss.backward()
	            return loss

	        if config.is_encoder_decoder:
	            return compute_loss_and_backprop_encoder_decoder
	        return compute_loss_and_backprop_encoder

	    def _measure_speed(self, func) -> float:
	        """
	        Time `func` with `timeit.repeat` and return the best average time per call.

	        `func` is called 10 times per timing run, for `args.repeat` runs; the fastest run divided by 10 is
	        returned. When `args.is_tpu` or `args.torchscript` is set, `func` is first called 5 extra times as warm-up.

	        Returns:
	            The time per call as a float, or the string `"N/A"` when a `RuntimeError` is raised while measuring
	            (the error is reported through `self.print_fn`).
	        """
	        calls_per_run = 10
	        warmup_calls = 5

	        try:
	            if self.args.is_tpu or self.args.torchscript:
	                # run additional 5 times to stabilize compilation for tpu and torchscript
	                logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
	                timeit.repeat(func, repeat=1, number=warmup_calls)

	            # as written in https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat, min should be taken
	            # rather than the average
	            runtimes = timeit.repeat(func, repeat=self.args.repeat, number=calls_per_run)

	            if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
	                import torch_xla.debug.metrics as met

	                self.print_fn(met.metrics_report())

	            return min(runtimes) / calls_per_run
	        except RuntimeError as e:
	            self.print_fn(f"Doesn't fit on GPU. {e}")
	            return "N/A"

	    def _gpu_memory_in_use(self, func: Callable[[], None]):
	        """
	        Call `func`, then read the `used` memory NVML reports for device `args.device_idx`.

	        Returns the string `"N/A"` without calling `func` when py3nvml is not available.
	        """
	        if not is_py3nvml_available():
	            logger.warning(
	                "py3nvml not installed, we won't log GPU memory usage. "
	                "Install py3nvml (pip install py3nvml) to log information about GPU."
	            )
	            return "N/A"

	        logger.info(
	            "Measuring total GPU usage on GPU device. Make sure to not have additional processes running"
	            " on the same GPU."
	        )
	        # init nvml
	        nvml.nvmlInit()
	        try:
	            func()
	            handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
	            meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
	            return Memory(meminfo.used)
	        finally:
	            # shutdown nvml, also when `func` or one of the queries above raises
	            nvml.nvmlShutdown()

	    def _measure_memory(self, func: Callable[[], None]) -> Tuple[Memory, Optional[MemorySummary]]:
	        """
	        Measure the memory used while running `func`.

	        On GPU the value comes from NVML (see `_gpu_memory_in_use`); otherwise from `measure_peak_memory_cpu`.
	        When `args.trace_memory_line_by_line` is set, tracing is started before the measurement and its result is
	        returned as the second element.

	        Returns:
	            `(memory, summary)`. `memory` is a `Memory`, or whatever non-int value `measure_peak_memory_cpu`
	            returned, or `"N/A"` when py3nvml is missing. `summary` is `None` unless line-by-line tracing is
	            enabled. `("N/A", None)` is returned when a `RuntimeError` is raised while measuring; this includes
	            the TPU case, because `NotImplementedError` is a subclass of `RuntimeError`.
	        """
	        try:
	            if self.args.is_tpu:
	                # Rejected before tracing starts so that no tracer is left running. The handler below catches
	                # this exception (NotImplementedError subclasses RuntimeError) and reports it via `print_fn`.
	                raise NotImplementedError(
	                    "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with"
	                    " `--no-memory` or `args.memory=False`"
	                )

	            trace_line_by_line = self.args.trace_memory_line_by_line
	            if trace_line_by_line:
	                trace = start_memory_tracing("transformers")

	            if self.args.is_gpu:
	                memory = self._gpu_memory_in_use(func)
	            else:
	                # cpu
	                memory_bytes = measure_peak_memory_cpu(func)
	                memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes

	            summary = stop_memory_tracing(trace) if trace_line_by_line else None

	            return memory, summary
	        except RuntimeError as e:
	            # NOTE(review): if tracing was started above it is not stopped on this path; confirm whether
	            # `stop_memory_tracing` should be called here as well.
	            self.print_fn(f"Doesn't fit on GPU. {e}")
	            return "N/A", None