# NOTE(review): reconstructed as a @dataclass — BlockPool builds blocks via
# KVCacheBlock(idx), which requires the generated __init__; the decorator
# appears to have been lost when this file's whitespace was mangled.
@dataclass
class KVCacheBlock:
    """A single KV-cache block tracked by :class:`BlockPool`.

    Also serves as a node of the doubly linked free list maintained by
    ``FreeKVCacheBlockQueue`` (via ``prev_free_block``/``next_free_block``).
    """

    # Block ID, ranging from 0 to num_gpu_blocks - 1.
    block_id: int
    # Reference count.
    ref_cnt: int = 0
    # The hash of the block composed of (block hash, tuple of token IDs).
    # It is only available when the block is full.
    _block_hash: Optional["BlockHashType"] = None

    # Used to construct a doubly linked list for free blocks.
    # These two attributes should only be manipulated by
    # FreeKVCacheBlockQueue.
    prev_free_block: Optional["KVCacheBlock"] = None
    next_free_block: Optional["KVCacheBlock"] = None
class FreeKVCacheBlockQueue:
    """This class organizes a list of KVCacheBlock objects to a doubly linked
    list of free blocks.

    We implement this class instead of using Python builtin deque to support
    removing a block in the middle of the queue in O(1) time. To close the
    performance gap to the builtin deque which is implemented in C++, this
    class does not allocate any Python objects when manipulating the linked
    list. Instead, this class manipulates the prev_free_block and
    next_free_block attributes of the given blocks.

    The queue is ordered by block ID in the beginning. When a block is
    allocated and then freed, it will be appended back with the eviction
    order:
    1. The least recent used block is at the front (LRU).
    2. If two blocks have the same last accessed time (allocated by the
       same sequence), the one with more hash tokens (the tail of a block
       chain) is at the front.
    Note that we maintain this order by reversing the block order when free
    blocks of a request. This operation is outside of this class.

    Args:
        blocks: A list of KVCacheBlock objects.
    """

    # NOTE(review): the "def __init__" line and the num_free_blocks
    # initialization were missing from the garbled source; reconstructed
    # here — num_free_blocks is read by the wiring loop below and the
    # docstring documents the `blocks` argument.
    def __init__(self, blocks: list["KVCacheBlock"]) -> None:
        # Number of blocks currently in the free list.
        self.num_free_blocks: int = len(blocks)

        # Initialize the doubly linked list of free blocks: head is the
        # first block, tail is the last, and each interior block points at
        # both of its neighbors.
        self.free_list_head: Optional["KVCacheBlock"] = blocks[0]
        self.free_list_tail: Optional["KVCacheBlock"] = blocks[-1]
        for i in range(self.num_free_blocks):
            if i > 0:
                blocks[i].prev_free_block = blocks[i - 1]
            if i < self.num_free_blocks - 1:
                blocks[i].next_free_block = blocks[i + 1]
class BlockPool:
    """BlockPool that manages KVCacheBlocks.

    It provides methods to allocate, free and cache the kv cache blocks.
    The free_block_queue stores the free blocks in eviction order to enable
    allocation, free, and cache eviction. The cached_block_hash_to_block
    maps between block hash and cached block to support finding cached
    blocks by their block hash.
    """

    def __init__(
        self,
        num_gpu_blocks: int,
        enable_caching: bool,
        enable_kv_cache_events: bool = False,
    ):
        """Build the pool and place every block on the free queue.

        Args:
            num_gpu_blocks: Total number of KV-cache blocks; must be a
                positive int.
            enable_caching: Whether prefix caching is enabled.
            enable_kv_cache_events: Whether KV-cache event reporting is
                enabled.
        """
        assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0
        self.num_gpu_blocks = num_gpu_blocks
        self.enable_caching = enable_caching
        # Fix(review): the parameter was accepted but never stored in the
        # visible body; keep it so other methods can consult the flag.
        self.enable_kv_cache_events = enable_kv_cache_events

        # All kv-cache blocks.
        self.blocks: list["KVCacheBlock"] = [
            KVCacheBlock(idx) for idx in range(num_gpu_blocks)
        ]
        # Free block queue that constructs and manipulates a doubly linked
        # list of free blocks (including eviction candidates when caching is
        # enabled).
        self.free_block_queue = FreeKVCacheBlockQueue(self.blocks)

        # {block_hash: {block ID: block}}. A cached block is
        # a full block with a block hash that can be used for prefix caching.
        # The cached block may be used by running requests or in the
        # free_block_queue that could potentially be evicted.
        # NOTE: We currently don't de-duplicate the blocks in the cache,
        # meaning that if a block becomes full and is cached, we don't check
        # if there is already an identical block in the cache. This is
        # because we want to make sure the allocated block IDs won't change
        # so that block tables are append-only.
        self.cached_block_hash_to_block: dict["BlockHashType", dict[
            int, "KVCacheBlock"]] = defaultdict(dict)

        # To represent a placeholder block with block_id=0.
        # The ref_cnt of null_block is not maintained, needs special care to
        # avoid freeing it.
        self.null_block = self.free_block_queue.popleft()