mistralrs_core/dummy_paged_attention/mod.rs

/// The higher-level manager of the allocated blocks. Operations performed by the block engine do
/// not directly change memory.
mod block_engine;
mod block_engine_sequence;
/// The lower-level manager of the cache. It handles swapping and copying the blocks and
/// actually allocates the KV cache for the CPU and GPU. It is used by the LLMEngine to execute
/// operations issued by the scheduler.
mod cache_engine;
mod config;
mod layers;
mod scheduler;

/// Sentinel slot id used to pad slot mappings.
pub const _PAD_SLOT_ID: i64 = -1;

pub use block_engine::{BlockEngine, BlockTables, LogicalTokenBlock, PhysicalTokenBlock};
pub use block_engine_sequence::BlockEngineSequence;
pub use cache_engine::{CacheConfig, CacheEngine};
use candle_core::{DType, Device};
pub use config::{ModelConfigLike, ModelConfigMetadata};
pub use layers::PagedAttention;
pub use scheduler::{
    PagedAttentionScheduler, PagedAttentionSchedulerConfig, PagedAttentionSchedulerOutput,
};

use crate::MemoryUsage;
use tracing::info;

pub const DEFAULT_PAGED_ATTENTION_BLOCK_SIZE: usize = 32;

/// Configuration for the PagedAttention KV cache. All memory amounts are in MB;
/// if no block size is given, the default of 32 is used.
#[derive(Clone, Copy)]
pub struct PagedAttentionConfig {
    pub(crate) block_size: Option<usize>,
    pub(crate) mem_cpu: usize,
    pub(crate) mem_gpu: MemoryGpuConfig,
}

impl PagedAttentionConfig {
    pub fn new(
        block_size: Option<usize>,
        mem_cpu: usize,
        mem_gpu: MemoryGpuConfig,
    ) -> anyhow::Result<Self> {
        Ok(Self {
            block_size,
            mem_cpu,
            mem_gpu,
        })
    }
}

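// Illustrative construction sketch (hypothetical values): reserve a fixed
// 4096 MB of GPU KV cache and 512 MB of CPU swap space with the default
// block size.
//
//     let cfg = PagedAttentionConfig::new(None, 512, MemoryGpuConfig::MbAmount(4096))?;
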
/// Which attention backend to use.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AttentionImplementation {
    Eager,
    PagedAttention,
}

#[derive(Clone, Copy)]
#[cfg_attr(feature = "pyo3_macros", pyo3::pyclass)]
pub enum MemoryGpuConfig {
    MbAmount(usize),
    Utilization(f32),
    ContextSize(usize),
}

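// The three ways to bound the GPU KV cache (illustrative, hypothetical values):
//
//     MemoryGpuConfig::MbAmount(4096);    // a fixed 4096 MB budget
//     MemoryGpuConfig::Utilization(0.9);  // let GPU usage grow to 90% of total memory
//     MemoryGpuConfig::ContextSize(8192); // whatever a KV cache for 8192 tokens requires
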
// See `pagedattention.cu` CALL_V1_LAUNCHER_BLOCK_SIZE
const SUPPORTED_BLOCK_SIZE: &[usize] = &[8, 16, 32];

const SIZE_IN_MB: usize = 1024 * 1024;

// Number of KV cache blocks that fit in a byte budget (`$mb_size` is passed in
// bytes at the call sites below). The trailing factor of 2 accounts for storing
// both keys and values.
macro_rules! mb_to_blocks {
    ($mb_size:expr, $dtype_size:expr, $block_size:expr, $config:expr) => {
        $mb_size
            / $dtype_size
            / $block_size
            / $config.num_kv_heads()
            / ($config.k_head_dim().max($config.v_head_dim()))
            / $config.num_layers()
            / 2
    };
}

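// Worked example (illustrative; assumes a Llama-8B-like model: 32 layers, 8 KV
// heads, head dim 128, F16 so dtype_size = 2, and block_size = 32). One block
// then costs 2 B * 32 tokens * 8 heads * 128 dim * 32 layers * 2 (K and V)
// = 4 MiB, so a 4096 MB budget yields 1024 blocks.
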
// KV cache footprint, in bytes, of `$context_len` tokens. Despite the name it
// returns bytes, not blocks, and `$block_size` is accepted but unused.
macro_rules! ctxt_to_blocks {
    ($context_len:expr, $dtype_size:expr, $block_size:expr, $config:expr) => {
        $context_len
            * $dtype_size
            * $config.num_kv_heads()
            * ($config.k_head_dim().max($config.v_head_dim()))
            * $config.num_layers()
            * 2
    };
}

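// Worked example (same illustrative config as above): a context of 8192 tokens
// costs 8192 * 2 B * 8 heads * 128 dim * 32 layers * 2 (K and V) = 1024 MiB.
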
/// Compute the PagedAttention cache configuration. GPU memory may be specified as
/// an amount in MB, a utilization fraction in [0, 1], or a context size in tokens;
/// if no block size is specified, the default of 32 is used.
#[allow(clippy::too_many_arguments)]
pub fn calculate_cache_config(
    mem_gpu: MemoryGpuConfig,
    mem_cpu: usize,
    block_size: Option<usize>,
    dtype: DType,
    config: &dyn ModelConfigLike,
    device: &Device,
    layer_devices: &[Option<Device>],
    silent: bool,
) -> anyhow::Result<CacheConfig> {
    let block_size = block_size.unwrap_or(DEFAULT_PAGED_ATTENTION_BLOCK_SIZE);
    if !SUPPORTED_BLOCK_SIZE.contains(&block_size) {
        anyhow::bail!("Block size must be one of {SUPPORTED_BLOCK_SIZE:?}, got {block_size}");
    }
    let dtype_size = dtype.size_in_bytes();

    // Resolve the budget per device and keep the most constrained one, so that
    // every layer's share of the KV cache fits on its device.
    let mut min_mem_gpu = usize::MAX;
    for dev in layer_devices {
        let device = dev.as_ref().unwrap_or(device);

        #[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
        let mem_gpu = match mem_gpu {
            MemoryGpuConfig::MbAmount(v) => v,
            MemoryGpuConfig::Utilization(f) => {
                // Amount we may still allocate before total usage reaches the
                // requested fraction `f`.
                let free = MemoryUsage.get_memory_available(device)? as f32 / SIZE_IN_MB as f32;
                let total = MemoryUsage.get_total_memory(device)? as f32 / SIZE_IN_MB as f32;
                let used = total - free;
                (total * f - used) as usize
            }
            MemoryGpuConfig::ContextSize(toks) => {
                ctxt_to_blocks!(toks, dtype_size, block_size, config) / SIZE_IN_MB
            }
        };
        min_mem_gpu = min_mem_gpu.min(mem_gpu);
    }

    // // Cap at kv cache for max seq len
    // let mem_for_toks =
    //     ctxt_to_blocks!(config.max_seq_len(), dtype_size, block_size, config) / SIZE_IN_MB;
    // let mem_gpu = min_mem_gpu.min(mem_for_toks);
    let mem_gpu = min_mem_gpu;

    let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config);
    let num_cpu_blocks = mb_to_blocks!(mem_cpu * SIZE_IN_MB, dtype_size, block_size, config);
    if num_gpu_blocks == 0 {
        anyhow::bail!("Num GPU blocks is 0. This means there is not enough memory. Either reduce the memory amount/utilization/context size or disable PagedAttention.");
    }

    if !silent {
        info!("Allocating {mem_gpu} MB for PagedAttention KV cache per GPU");
        info!("Using PagedAttention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size);
    }
    Ok(CacheConfig {
        block_size,
        num_gpu_blocks,
        num_cpu_blocks,
    })
}
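
// Illustrative call sketch (`model_cfg`, `device`, and `layer_devices` are
// hypothetical names assumed to exist at the call site):
//
//     let cache_cfg = calculate_cache_config(
//         MemoryGpuConfig::Utilization(0.9), // fill the GPU to 90% usage
//         512,                               // 512 MB of CPU swap space
//         None,                              // default block size (32)
//         DType::F16,
//         &model_cfg,                        // &dyn ModelConfigLike
//         &device,
//         &layer_devices,                    // per-layer device overrides
//         false,                             // log the chosen configuration
//     )?;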