import threading
import torch
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, cast

from ..modules import Module
from torch.cuda._utils import _get_device_index
from torch.cuda.amp import autocast
from torch._utils import ExceptionWrapper

__all__ = ['get_a_var', 'parallel_apply']
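

# get_a_var() depth-first searches a tensor / list / tuple / dict argument structure
# and returns the first tensor it finds; parallel_apply() uses it below to infer a
# target device when none is supplied for a replica.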
def get_a_var(obj: Union[torch.Tensor, List[Any], Tuple[Any, ...], Dict[Any, Any]]) -> Optional[torch.Tensor]:
    if isinstance(obj, torch.Tensor):
        return obj

    if isinstance(obj, (list, tuple)):
        for result in map(get_a_var, obj):
            if isinstance(result, torch.Tensor):
                return result
    if isinstance(obj, dict):
        for result in map(get_a_var, obj.items()):
            if isinstance(result, torch.Tensor):
                return result
    return None


def parallel_apply(
    modules: Sequence[Module],
    inputs: Sequence[Any],
    kwargs_tup: Optional[Sequence[Dict[str, Any]]] = None,
    devices: Optional[Sequence[Optional[Union[int, torch.device]]]] = None,
) -> List[Any]:
r"""Apply each `module` in :attr:`modules` in parallel on each of :attr:`devices`. |
|
|
|
Args: |
|
modules (Module): modules to be parallelized |
|
inputs (tensor): inputs to the modules |
|
devices (list of int or torch.device): CUDA devices |
|
|
|
:attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and |
|
:attr:`devices` (if given) should all have same length. Moreover, each |
|
element of :attr:`inputs` can either be a single object as the only argument |
|
to a module, or a collection of positional arguments. |
|
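
    Example (a minimal illustrative sketch, not taken from this module: it
    assumes two CUDA devices and uses arbitrary linear layers and shapes)::

        >>> # xdoctest: +SKIP("requires multiple CUDA devices")
        >>> modules = [torch.nn.Linear(8, 4).cuda(0), torch.nn.Linear(8, 4).cuda(1)]
        >>> inputs = [torch.randn(2, 8, device='cuda:0'),
        ...           torch.randn(2, 8, device='cuda:1')]
        >>> outputs = parallel_apply(modules, inputs, devices=[0, 1])
        >>> len(outputs)
        2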
""" |
|
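    # Validate that there is one input per module, then broadcast defaults: an empty
    # kwargs dict for every module and, if no devices were given, a None placeholder
    # per module to be resolved later.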
    assert len(modules) == len(inputs), f'The number of modules {len(modules)} is not equal to the number of inputs {len(inputs)}'
    if kwargs_tup is not None:
        assert len(modules) == len(kwargs_tup)
    else:
        kwargs_tup = (cast(Dict[str, Any], {}),) * len(modules)
    if devices is not None:
        assert len(modules) == len(devices)
    else:
        devices = [None] * len(modules)
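    # Map each entry of ``devices`` to a device index and record the CUDA stream that
    # is currently active on it; the caller's grad and autocast modes are captured here
    # so every worker thread can re-apply them. Results are collected by replica index
    # under a shared lock.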
    devices = [_get_device_index(x, True) for x in devices]
    streams = [torch.cuda.current_stream(x) for x in devices]
    lock = threading.Lock()
    results = {}
    grad_enabled, autocast_enabled = torch.is_grad_enabled(), torch.is_autocast_enabled()
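    # Each worker runs one replica: it re-applies the captured grad/autocast modes,
    # resolves the target device from the input tensors if necessary, and stores the
    # module's output (or a wrapped exception) in results[i].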
    def _worker(
        i: int,
        module: Module,
        input: Any,
        kwargs: Dict[str, Any],
        device: Optional[Union[int, torch.device]] = None,
        stream: Optional[torch.cuda.Stream] = None,
    ) -> None:
        torch.set_grad_enabled(grad_enabled)
        if device is None:
            t = get_a_var(input)
            if t is None:
                with lock:
                    results[i] = ExceptionWrapper(
                        where=f"in replica {i}, no device was provided and no tensor input was found; "
                        "device cannot be resolved")
                return
            device = t.get_device()
        if stream is None:
            stream = torch.cuda.current_stream(device)
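        # Run the module on its assigned device and stream, mirroring the caller's
        # autocast setting, and publish the output under the lock so the main thread
        # can collect it by replica index.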
        try:
            with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
                # Wrap a bare argument in a one-element tuple; this also avoids
                # accidentally unpacking `input` itself when it is a single Tensor.
                if not isinstance(input, (list, tuple)):
                    input = (input,)
                output = module(*input, **kwargs)
            with lock:
                results[i] = output
        except Exception:
            with lock:
                results[i] = ExceptionWrapper(
                    where=f"in replica {i} on device {device}")
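    # With more than one replica, run every worker in its own thread so per-device work
    # can overlap; with a single replica there is nothing to overlap, so call the worker
    # inline.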
    if len(modules) > 1:
        threads = [threading.Thread(target=_worker,
                                    args=(i, module, input, kwargs, device, stream))
                   for i, (module, input, kwargs, device, stream) in
                   enumerate(zip(modules, inputs, kwargs_tup, devices, streams))]

        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
    else:
        _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0], streams[0])
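    # Collect outputs in input order; if any replica failed, re-raise its exception
    # (with the original traceback) in the calling thread.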
    outputs = []
    for i in range(len(inputs)):
        output = results[i]
        if isinstance(output, ExceptionWrapper):
            output.reraise()
        outputs.append(output)
    return outputs