mod block_engine;
4mod block_engine_sequence;
5mod cache_engine;
9mod config;
10mod layers;
11mod prefix_cacher;
13mod scheduler;
14pub const _PAD_SLOT_ID: i64 = -1;
15
16pub use block_engine::{BlockEngine, BlockTables, LogicalTokenBlock, PhysicalTokenBlock};
17pub use block_engine_sequence::BlockEngineSequence;
18pub use cache_engine::{CacheConfig, CacheEngine, PagedCacheType};
19use candle_core::{DType, Device};
20pub use config::{ModelConfigLike, ModelConfigMetadata};
21pub use layers::PagedAttention;
22pub use scheduler::{
23 PagedAttentionScheduler, PagedAttentionSchedulerConfig, PagedAttentionSchedulerOutput,
24};
25
26use crate::MemoryUsage;
27use tracing::{info, warn};
28
29pub const DEFAULT_PAGED_ATTENTION_BLOCK_SIZE: usize = 32;
30
31#[derive(Clone, Copy)]
/// User-facing configuration for the PagedAttention KV cache.
pub struct PagedAttentionConfig {
    // KV-cache block size in tokens; `None` selects
    // `DEFAULT_PAGED_ATTENTION_BLOCK_SIZE` in `calculate_cache_config`.
    pub(crate) block_size: Option<usize>,
    // How much GPU memory to dedicate to the KV cache (MB / utilization /
    // context size — see `MemoryGpuConfig`).
    pub(crate) mem_gpu: MemoryGpuConfig,
    // Storage type of the cached K/V data.
    pub(crate) cache_type: PagedCacheType,
}
38
39impl PagedAttentionConfig {
40 pub fn new(
41 block_size: Option<usize>,
42 mem_gpu: MemoryGpuConfig,
43 cache_type: PagedCacheType,
44 ) -> anyhow::Result<Self> {
45 Ok(Self {
46 block_size,
47 mem_gpu,
48 cache_type,
49 })
50 }
51}
52
/// Which attention backend a model should use.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AttentionImplementation {
    /// Plain attention without paged KV-cache block management.
    Eager,
    /// Block-based PagedAttention KV-cache management.
    PagedAttention,
}
58
/// How the GPU memory budget for the PagedAttention KV cache is specified.
#[derive(Clone, Copy)]
#[cfg_attr(feature = "pyo3_macros", pyo3::pyclass)]
pub enum MemoryGpuConfig {
    /// Absolute budget in MB.
    MbAmount(usize),
    /// Fraction of total GPU memory to target (already-used memory is
    /// subtracted from the target in `calculate_cache_config`).
    Utilization(f32),
    /// Size the cache to hold this many tokens of context.
    ContextSize(usize),
}
66
/// Block sizes accepted by `calculate_cache_config`.
const SUPPORTED_BLOCK_SIZE: &[usize] = &[8, 16, 32];

/// Bytes per MB (mebibyte).
const SIZE_IN_MB: usize = 1024 * 1024;
71
// Convert a byte budget (`$mb_size` is passed already multiplied out to
// bytes by the caller) into a whole number of KV-cache blocks by dividing by
// the per-block footprint: block_size tokens × kv-heads × max(k,v) head dim
// × layers × 2 (K and V), each element `$dtype_size` bytes.
// The chained integer divisions round down at each step.
macro_rules! mb_to_blocks {
    ($mb_size:expr, $dtype_size:expr, $block_size:expr, $config:expr) => {
        $mb_size
            / $dtype_size
            / $block_size
            / $config.num_kv_heads()
            / ($config.k_head_dim().max($config.v_head_dim()))
            / $config.num_layers()
            / 2
    };
}
83
// KV-cache footprint in *bytes* for `$context_len` tokens:
// tokens × dtype bytes × kv-heads × max(k,v) head dim × layers × 2 (K and V).
//
// NOTE(review): despite the name this expands to a byte count, not a block
// count (the caller divides the result by SIZE_IN_MB), and `$block_size` is
// accepted but unused — apparently kept for signature symmetry with
// `mb_to_blocks!`.
macro_rules! ctxt_to_blocks {
    ($context_len:expr, $dtype_size:expr, $block_size:expr, $config:expr) => {
        $context_len
            * $dtype_size
            * $config.num_kv_heads()
            * ($config.k_head_dim().max($config.v_head_dim()))
            * $config.num_layers()
            * 2
    };
}
94
95#[allow(clippy::too_many_arguments)]
97pub fn calculate_cache_config(
98 mem_gpu: MemoryGpuConfig,
99 block_size: Option<usize>,
100 dtype: DType,
101 cache_type: PagedCacheType,
102 config: &dyn ModelConfigLike,
103 device: &Device,
104 layer_devices: &[Option<Device>],
105 silent: bool,
106) -> anyhow::Result<CacheConfig> {
107 let block_size = block_size.unwrap_or(DEFAULT_PAGED_ATTENTION_BLOCK_SIZE);
108 if !SUPPORTED_BLOCK_SIZE.contains(&block_size) {
109 anyhow::bail!("Block size must be in {SUPPORTED_BLOCK_SIZE:?}, got {block_size}");
110 }
111 let dtype = cache_type.to_dtype(dtype);
112 let dtype_size = dtype.size_in_bytes();
113
114 let mut min_mem_gpu = usize::MAX;
115 for dev in layer_devices {
116 let device = dev.as_ref().unwrap_or(device);
117
118 #[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
119 let mem_gpu = match mem_gpu {
120 MemoryGpuConfig::MbAmount(v) => v,
121 MemoryGpuConfig::Utilization(f) => {
122 let free = MemoryUsage.get_memory_available(device)? as f32 / SIZE_IN_MB as f32;
123 let total = MemoryUsage.get_total_memory(device)? as f32 / SIZE_IN_MB as f32;
124 let used = total - free;
125 (total * f - used) as usize
126 }
127 MemoryGpuConfig::ContextSize(toks) => {
128 ctxt_to_blocks!(toks, dtype_size, block_size, config) / SIZE_IN_MB
129 }
130 };
131 min_mem_gpu = min_mem_gpu.min(mem_gpu);
132 }
133
134 let mem_gpu = if matches!(device, Device::Metal(_)) {
142 let metal_cap_mb = MemoryUsage.get_total_memory(device)? / SIZE_IN_MB;
143
144 info!("Metal GPU wired limit is {metal_cap_mb} MB.");
145
146 if min_mem_gpu > metal_cap_mb {
147 if !silent {
148 warn!(
149 "Capping Metal GPU memory allocation from {} MB to {} MB (limited by iogpu.wired_limit_mb). \
150To raise this cap run: `sudo sysctl -w iogpu.wired_limit_mb=<desired_mb>`.",
151 min_mem_gpu,
152 metal_cap_mb
153 );
154 }
155 metal_cap_mb
156 } else {
157 min_mem_gpu
158 }
159 } else {
160 min_mem_gpu
161 };
162
163 let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config);
164 if num_gpu_blocks == 0 {
165 anyhow::bail!("Num GPU blocks is 0. This means there is not enough memory. Either reduce the memory amount/utilization/context size or disable PagedAttention.");
166 }
167
168 if !silent {
169 info!("Allocating {mem_gpu} MB for PagedAttention KV cache per GPU");
170 info!("PagedAttention KV cache type is {dtype:?}");
171 info!("Using PagedAttention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size);
172 }
173 Ok(CacheConfig {
174 block_size,
175 num_gpu_blocks,
176 cache_type,
177 })
178}