|
"""A block manager that manages token blocks.""" |
|
|
|
import enum |
|
from typing import Dict, List, Optional, Set, Tuple |
|
|
|
from vllm.block import PhysicalTokenBlock |
|
from .sequence import Sequence, SequenceGroup, SequenceStatus |
|
from vllm.utils import Device |
|
|
|
|
|
# A block table is the ordered list of physical blocks backing one sequence;
# position i in the list corresponds to the sequence's i-th logical block.
BlockTable = List[PhysicalTokenBlock]
|
|
|
|
|
class BlockAllocator:
    """Pool of free physical token blocks for one device.

    All ``num_blocks`` blocks are created up front and placed on a free list.
    ``allocate`` hands out a block with a reference count of one; ``free``
    drops one reference and puts the block back on the free list once no
    references remain.
    """

    def __init__(
        self,
        device: Device,
        block_size: int,
        num_blocks: int,
    ) -> None:
        """Create the pool and pre-build every block.

        Args:
            device: Device the blocks belong to; passed to each block.
            block_size: Block size passed to each ``PhysicalTokenBlock``.
            num_blocks: Number of blocks to create, numbered from 0.
        """
        self.device = device
        self.block_size = block_size
        self.num_blocks = num_blocks

        # Every block starts out free; block numbers run 0..num_blocks - 1.
        self.free_blocks: BlockTable = [
            PhysicalTokenBlock(
                device=device, block_number=number, block_size=block_size
            )
            for number in range(num_blocks)
        ]

    def allocate(self) -> PhysicalTokenBlock:
        """Take a block off the free list and give it one reference.

        Returns:
            The most recently freed (or last created) block.

        Raises:
            ValueError: If the free list is empty.
        """
        pool = self.free_blocks
        if pool:
            taken = pool.pop()
            taken.ref_count = 1
            return taken
        raise ValueError("Out of memory! No free blocks are available.")

    def free(self, block: PhysicalTokenBlock) -> None:
        """Drop one reference to ``block``; recycle it when none remain.

        Raises:
            ValueError: If the block's reference count is already zero.
        """
        refs = block.ref_count
        if refs == 0:
            raise ValueError(f"Double free! {block} is already freed.")
        refs -= 1
        block.ref_count = refs
        if refs == 0:
            # Last reference released: the block is available again.
            self.free_blocks.append(block)

    def get_num_free_blocks(self) -> int:
        """Return how many blocks are currently on the free list."""
        return len(self.free_blocks)
|
|
|
|
|
class AllocStatus(enum.Enum):
    """Outcome of ``BlockSpaceManager.can_allocate``.

    * ``OK``: the sequence group can be allocated right now.
    * ``LATER``: the sequence group cannot be allocated at the moment, but
      the allocator's total capacity is large enough for it, so allocation
      may succeed once blocks are freed.
    * ``NEVER``: the sequence group is too large to ever be allocated on
      the GPU.
    """

    # Enough free GPU blocks are available now.
    OK = enum.auto()
    # Not enough free blocks now; total capacity would suffice.
    LATER = enum.auto()
    # Exceeds what the GPU allocator could ever provide.
    NEVER = enum.auto()
|
|
|
|
|
class BlockSpaceManager:
    """Manages the mapping between logical and physical token blocks.

    Each sequence id maps to a block table: the ordered list of physical
    blocks backing that sequence's logical blocks. A physical block can be
    shared by several sequences, and when a sliding window is configured the
    same block can appear several times inside a single table.

    Invariant maintained by this class: a block's ``ref_count`` is the number
    of block tables that contain it, regardless of how many times it occurs
    inside each of those tables.
    """

    def __init__(
        self,
        block_size: int,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
        watermark: float = 0.01,
        sliding_window: Optional[int] = None,
    ) -> None:
        """Set up the GPU and CPU allocators and an empty table registry.

        Args:
            block_size: Block size passed to both allocators.
            num_gpu_blocks: Total number of GPU blocks.
            num_cpu_blocks: Total number of CPU blocks.
            watermark: Fraction of the GPU blocks kept in reserve when
                admitting or swapping in sequence groups. Must be >= 0.
            sliding_window: Optional window length; must be a multiple of
                ``block_size``. Stored internally as a number of blocks.
        """
        self.block_size = block_size
        self.num_total_gpu_blocks = num_gpu_blocks
        self.num_total_cpu_blocks = num_cpu_blocks

        # Sliding window expressed in blocks; None means "no window".
        self.block_sliding_window = None
        if sliding_window is not None:
            assert sliding_window % block_size == 0, (sliding_window, block_size)
            self.block_sliding_window = sliding_window // block_size

        self.watermark = watermark
        assert watermark >= 0.0

        # Number of GPU blocks that can_allocate / can_swap_in leave untouched.
        self.watermark_blocks = int(watermark * num_gpu_blocks)
        self.gpu_allocator = BlockAllocator(Device.GPU, block_size, num_gpu_blocks)
        self.cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks)

        # Maps a sequence id to that sequence's block table.
        self.block_tables: Dict[int, BlockTable] = {}

    def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
        """Report whether the group's WAITING sequence fits on the GPU.

        Returns:
            ``AllocStatus.NEVER`` if the request could not fit even with every
            GPU block free, ``AllocStatus.OK`` if it fits now while keeping
            the watermark reserve, otherwise ``AllocStatus.LATER``.
        """
        # Only the first WAITING sequence is inspected.
        # NOTE(review): presumably all sequences in the group share the same
        # prompt at this point - confirm against the caller.
        seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
        num_required_blocks = len(seq.logical_token_blocks)
        if self.block_sliding_window is not None:
            # With a sliding window at most that many distinct blocks are used.
            num_required_blocks = min(num_required_blocks, self.block_sliding_window)
        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()

        # Keep watermark_blocks in reserve in both checks below.
        if self.num_total_gpu_blocks - num_required_blocks < self.watermark_blocks:
            return AllocStatus.NEVER
        if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
            return AllocStatus.OK
        else:
            return AllocStatus.LATER

    def allocate(self, seq_group: SequenceGroup) -> None:
        """Allocate GPU blocks for the group and register its block tables.

        One table is built from the first WAITING sequence and a copy of it
        is registered for every WAITING sequence in the group.
        """
        # NOTE(review): only the first WAITING sequence determines the table;
        # presumably all sequences share the same prompt - confirm.
        seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]

        # Allocate new physical token blocks that will store the prompt tokens.
        block_table: BlockTable = []
        for logical_idx in range(len(seq.logical_token_blocks)):
            if (
                self.block_sliding_window is not None
                and logical_idx >= self.block_sliding_window
            ):
                # Past the window: reuse an earlier block in round-robin order.
                block = block_table[logical_idx % self.block_sliding_window]
            else:
                block = self.gpu_allocator.allocate()

            # The block is shared by every sequence in the group. This is an
            # assignment, so reused blocks are not counted twice.
            block.ref_count = seq_group.num_seqs()
            block_table.append(block)

        # Each sequence gets its own list object over the same blocks.
        for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
            self.block_tables[seq.seq_id] = block_table.copy()

    def can_append_slot(self, seq_group: SequenceGroup) -> bool:
        """Return True if there is a free GPU block per RUNNING sequence.

        This is a simple heuristic: one free block for each running sequence
        is treated as sufficient to append a slot to all of them.
        """
        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
        num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING)
        return num_seqs <= num_free_gpu_blocks

    def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]:
        """Allocate a physical slot for a new token.

        Returns:
            ``None`` when the slot is available without replacing a shared
            block. Otherwise a ``(old_block_number, new_block_number)`` pair:
            the sequence's last block was shared and has been replaced by a
            freshly allocated one (presumably so the caller can copy the old
            block's contents into the new block).
        """
        logical_blocks = seq.logical_token_blocks
        block_table = self.block_tables[seq.seq_id]

        if len(block_table) < len(logical_blocks):
            if (
                self.block_sliding_window
                and len(block_table) >= self.block_sliding_window
            ):
                # Past the window: reuse an earlier block of this table, then
                # fall through to the shared-block check below.
                block_table.append(
                    block_table[len(block_table) % self.block_sliding_window]
                )
            else:
                # The sequence has a new logical block.
                # Allocate a new physical block; it cannot be shared yet.
                block = self.gpu_allocator.allocate()
                block_table.append(block)
                return None

        # We want to append the token to the last physical block.
        last_block = block_table[-1]
        assert last_block.device == Device.GPU
        if last_block.ref_count == 1:
            # Not shared with other sequences: append in place.
            return None
        else:
            # The last block is shared with other sequences.
            # Copy on write: give this sequence a private block and drop its
            # reference to the shared one.
            new_block = self.gpu_allocator.allocate()
            block_table[-1] = new_block
            self.gpu_allocator.free(last_block)
            return last_block.block_number, new_block.block_number

    def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
        """Give ``child_seq`` a block table sharing the parent's blocks.

        The child receives a copy of the parent's table and every distinct
        block in it gains one reference.
        """
        # NOTE: fork does not allocate a new physical block.
        src_block_table = self.block_tables[parent_seq.seq_id]
        self.block_tables[child_seq.seq_id] = src_block_table.copy()
        # With a sliding window the same block can occur several times in a
        # table. _free_block_table releases each distinct block once per
        # table, so each distinct block must gain exactly one reference here;
        # counting every occurrence would keep the block from ever returning
        # to the free list.
        for block in set(src_block_table):
            block.ref_count += 1

    def _get_physical_blocks(
        self, seq_group: SequenceGroup
    ) -> List[PhysicalTokenBlock]:
        """Return the distinct blocks used by the group's unfinished sequences."""
        # NOTE: every unfinished sequence is assumed to have an entry in
        # block_tables; a missing entry raises KeyError.
        blocks: Set[PhysicalTokenBlock] = set()
        for seq in seq_group.get_seqs():
            if seq.is_finished():
                continue
            blocks.update(self.block_tables[seq.seq_id])
        return list(blocks)

    def can_swap_in(self, seq_group: SequenceGroup) -> bool:
        """Return True if the group fits on the GPU with the reserve intact."""
        blocks = self._get_physical_blocks(seq_group)
        num_swapped_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED)
        num_free_blocks = self.gpu_allocator.get_num_free_blocks()
        # Conservatively budget one extra block per swapped sequence on top of
        # the blocks being moved, mirroring the one-block-per-sequence
        # heuristic in can_append_slot().
        num_required_blocks = len(blocks) + num_swapped_seqs
        return num_free_blocks - num_required_blocks >= self.watermark_blocks

    def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]:
        """Move the group's SWAPPED sequences from CPU blocks to GPU blocks.

        Returns:
            Mapping from CPU block number to the GPU block number that
            replaces it.
        """
        # CPU block -> GPU block.
        mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
        for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
            new_block_table: BlockTable = []
            block_table = self.block_tables[seq.seq_id]

            # NOTE(review): a block that occurs more than once in a single
            # table (sliding-window reuse) is counted and freed once per
            # occurrence here, which can trip the allocator's double-free
            # check - confirm whether swapping is expected to work together
            # with a sliding window.
            for cpu_block in block_table:
                if cpu_block in mapping:
                    # Already moved: share the GPU block.
                    gpu_block = mapping[cpu_block]
                    gpu_block.ref_count += 1
                else:
                    gpu_block = self.gpu_allocator.allocate()
                    mapping[cpu_block] = gpu_block
                new_block_table.append(gpu_block)
                # Free the CPU block swapped in to GPU.
                self.cpu_allocator.free(cpu_block)
            self.block_tables[seq.seq_id] = new_block_table

        block_number_mapping = {
            cpu_block.block_number: gpu_block.block_number
            for cpu_block, gpu_block in mapping.items()
        }
        return block_number_mapping

    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
        """Return True if the CPU has a free block per distinct group block."""
        blocks = self._get_physical_blocks(seq_group)
        return len(blocks) <= self.cpu_allocator.get_num_free_blocks()

    def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]:
        """Move the group's RUNNING sequences from GPU blocks to CPU blocks.

        Returns:
            Mapping from GPU block number to the CPU block number that
            replaces it.
        """
        # GPU block -> CPU block.
        mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
            new_block_table: BlockTable = []
            block_table = self.block_tables[seq.seq_id]

            # NOTE(review): a block that occurs more than once in a single
            # table (sliding-window reuse) is counted and freed once per
            # occurrence here, which can trip the allocator's double-free
            # check - confirm whether swapping is expected to work together
            # with a sliding window.
            for gpu_block in block_table:
                if gpu_block in mapping:
                    # Already moved: share the CPU block.
                    cpu_block = mapping[gpu_block]
                    cpu_block.ref_count += 1
                else:
                    cpu_block = self.cpu_allocator.allocate()
                    mapping[gpu_block] = cpu_block
                new_block_table.append(cpu_block)
                # Free the GPU block swapped out to CPU.
                self.gpu_allocator.free(gpu_block)
            self.block_tables[seq.seq_id] = new_block_table

        block_number_mapping = {
            gpu_block.block_number: cpu_block.block_number
            for gpu_block, cpu_block in mapping.items()
        }
        return block_number_mapping

    def _free_block_table(self, block_table: BlockTable) -> None:
        """Release one reference per distinct block of ``block_table``."""
        # Deduplicate: a table can hold the same block several times.
        for block in set(block_table):
            if block.device == Device.GPU:
                self.gpu_allocator.free(block)
            else:
                self.cpu_allocator.free(block)

    def free(self, seq: Sequence) -> None:
        """Release the blocks of ``seq`` and forget its block table."""
        if seq.seq_id not in self.block_tables:
            # No table registered: already freed, or never allocated.
            return
        block_table = self.block_tables[seq.seq_id]
        self._free_block_table(block_table)
        del self.block_tables[seq.seq_id]

    def reset(self) -> None:
        """Release the blocks of every registered table and clear them all."""
        for block_table in self.block_tables.values():
            self._free_block_table(block_table)
        self.block_tables.clear()

    def get_block_table(self, seq: Sequence) -> List[int]:
        """Return the block numbers of the table for ``seq``, in table order."""
        block_table = self.block_tables[seq.seq_id]
        return [block.block_number for block in block_table]

    def get_num_free_gpu_blocks(self) -> int:
        """Return the number of free GPU blocks."""
        return self.gpu_allocator.get_num_free_blocks()

    def get_num_free_cpu_blocks(self) -> int:
        """Return the number of free CPU blocks."""
        return self.cpu_allocator.get_num_free_blocks()
|
|