mistralrs_core/dummy_paged_attention/mod.rs

mod block_engine;
mod block_engine_sequence;
mod cache_engine;
mod config;
mod layers;
mod scheduler;
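/// Padding value for slot mappings; kernels are assumed to skip slots marked
/// with -1 (the vLLM convention this module mirrors).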
pub const _PAD_SLOT_ID: i64 = -1;

pub use block_engine::{BlockEngine, BlockTables, LogicalTokenBlock, PhysicalTokenBlock};
pub use block_engine_sequence::BlockEngineSequence;
pub use cache_engine::{CacheConfig, CacheEngine, PagedCacheType};
use candle_core::{DType, Device};
pub use config::{ModelConfigLike, ModelConfigMetadata};
pub use layers::PagedAttention;
pub use scheduler::{
    PagedAttentionScheduler, PagedAttentionSchedulerConfig, PagedAttentionSchedulerOutput,
};

use crate::MemoryUsage;
use tracing::info;

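/// Default number of tokens stored in each KV-cache block.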
pub const DEFAULT_PAGED_ATTENTION_BLOCK_SIZE: usize = 32;

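/// Configuration for the PagedAttention KV cache: the tokens-per-block size
/// (defaults to [`DEFAULT_PAGED_ATTENTION_BLOCK_SIZE`] when `None`), the CPU
/// swap space in MB, the per-GPU memory budget, and the cache element type.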
#[derive(Clone, Copy)]
pub struct PagedAttentionConfig {
    pub(crate) block_size: Option<usize>,
    pub(crate) mem_cpu: usize,
    pub(crate) mem_gpu: MemoryGpuConfig,
    pub(crate) cache_type: PagedCacheType,
}

impl PagedAttentionConfig {
    pub fn new(
        block_size: Option<usize>,
        mem_cpu: usize,
        mem_gpu: MemoryGpuConfig,
        cache_type: PagedCacheType,
    ) -> anyhow::Result<Self> {
        Ok(Self {
            block_size,
            mem_cpu,
            mem_gpu,
            cache_type,
        })
    }
}

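/// Which attention path a model runs with: plain (eager) attention, or
/// attention backed by the paged KV cache.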
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AttentionImplementation {
    Eager,
    PagedAttention,
}

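/// How the per-GPU KV-cache budget is expressed: an absolute size in MB, a
/// fraction of total device memory, or a context length in tokens that the
/// cache must be able to hold.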
#[derive(Clone, Copy)]
#[cfg_attr(feature = "pyo3_macros", pyo3::pyclass)]
pub enum MemoryGpuConfig {
    MbAmount(usize),
    Utilization(f32),
    ContextSize(usize),
}

const SUPPORTED_BLOCK_SIZE: &[usize] = &[8, 16, 32];

const SIZE_IN_MB: usize = 1024 * 1024;

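// One KV block holds `block_size` tokens for every layer, so it occupies
// block_size * num_kv_heads * max(k_head_dim, v_head_dim) * num_layers
// * 2 (K and V) * dtype_size bytes; `mb_to_blocks!` divides a byte budget by
// that quantity. Illustrative numbers: 8 KV heads, head dim 128, 32 layers,
// a 2-byte dtype, and block size 32 give 4 MiB per block, so a 1024 MB budget
// yields 256 blocks, i.e. 8192 tokens of context.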
macro_rules! mb_to_blocks {
    ($mb_size:expr, $dtype_size:expr, $block_size:expr, $config:expr) => {
        $mb_size
            / $dtype_size
            / $block_size
            / $config.num_kv_heads()
            / ($config.k_head_dim().max($config.v_head_dim()))
            / $config.num_layers()
            / 2
    };
}

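// Despite its name, `ctxt_to_blocks!` returns the number of *bytes* needed to
// cache `$context_len` tokens (`$block_size` is accepted but unused); the
// caller divides by `SIZE_IN_MB` to get MB.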
macro_rules! ctxt_to_blocks {
    ($context_len:expr, $dtype_size:expr, $block_size:expr, $config:expr) => {
        $context_len
            * $dtype_size
            * $config.num_kv_heads()
            * ($config.k_head_dim().max($config.v_head_dim()))
            * $config.num_layers()
            * 2
    };
}

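/// Resolve a [`CacheConfig`] (block size plus GPU and CPU block counts) from
/// the requested memory budget. The budget is evaluated on each layer device
/// and the minimum is taken, so heterogeneous devices all receive a cache
/// that fits on the smallest one. Errors if the block size is unsupported or
/// if the budget rounds down to zero GPU blocks.
///
/// Illustrative call (not a doctest; `cfg`, `cache_type`, `device`, and
/// `layer_devices` are assumed to be in scope, and `layer_devices` must be
/// non-empty):
///
/// ```text
/// let cache_config = calculate_cache_config(
///     MemoryGpuConfig::Utilization(0.9), // grow total usage to 90% of device memory
///     512,                               // 512 MB of CPU swap space
///     None,                              // default block size (32)
///     DType::BF16,
///     cache_type,
///     cfg,
///     &device,
///     &layer_devices,
///     false,
/// )?;
/// ```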
#[allow(clippy::too_many_arguments)]
pub fn calculate_cache_config(
    mem_gpu: MemoryGpuConfig,
    mem_cpu: usize,
    block_size: Option<usize>,
    dtype: DType,
    cache_type: PagedCacheType,
    config: &dyn ModelConfigLike,
    device: &Device,
    layer_devices: &[Option<Device>],
    silent: bool,
) -> anyhow::Result<CacheConfig> {
    let block_size = block_size.unwrap_or(DEFAULT_PAGED_ATTENTION_BLOCK_SIZE);
    if !SUPPORTED_BLOCK_SIZE.contains(&block_size) {
        anyhow::bail!("Block size must be in {SUPPORTED_BLOCK_SIZE:?}, got {block_size}");
    }
    let dtype = cache_type.to_dtype(dtype);
    let dtype_size = dtype.size_in_bytes();

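    // Resolve the budget once per layer device and keep the minimum, so the
    // resulting block count fits on every device. `Utilization(f)` budgets the
    // difference between `f * total` memory and what is currently in use;
    // `ContextSize` converts a requested token count into the MB needed to
    // cache it.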
    let mut min_mem_gpu = usize::MAX;
    for dev in layer_devices {
        let device = dev.as_ref().unwrap_or(device);

        #[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
        let mem_gpu = match mem_gpu {
            MemoryGpuConfig::MbAmount(v) => v,
            MemoryGpuConfig::Utilization(f) => {
                let free = MemoryUsage.get_memory_available(device)? as f32 / SIZE_IN_MB as f32;
                let total = MemoryUsage.get_total_memory(device)? as f32 / SIZE_IN_MB as f32;
                let used = total - free;
                (total * f - used) as usize
            }
            MemoryGpuConfig::ContextSize(toks) => {
                ctxt_to_blocks!(toks, dtype_size, block_size, config) / SIZE_IN_MB
            }
        };
        min_mem_gpu = min_mem_gpu.min(mem_gpu);
    }

    let mem_gpu = min_mem_gpu;

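    // Convert the MB budgets into whole blocks; anything smaller than one
    // block is unusable, hence the explicit zero-block check below.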
    let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config);
    let num_cpu_blocks = mb_to_blocks!(mem_cpu * SIZE_IN_MB, dtype_size, block_size, config);
    if num_gpu_blocks == 0 {
        anyhow::bail!("Num GPU blocks is 0. This means there is not enough memory. Either reduce the memory amount/utilization/context size or disable PagedAttention.");
    }

    if !silent {
        info!("Allocating {mem_gpu} MB for PagedAttention KV cache per GPU");
        info!("Using PagedAttention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks * block_size);
    }
    Ok(CacheConfig {
        block_size,
        num_gpu_blocks,
        num_cpu_blocks,
        cache_type,
    })
}