mistralrs_core/dummy_paged_attention/mod.rs

/// The higher-level manager of the allocated blocks. Operations performed by the block engine do
/// not directly change memory.
mod block_engine;
mod block_engine_sequence;
/// This is the lower-level manager of the cache. It manages swapping and copying the blocks and
/// actually allocates the KV cache for the CPU and GPU. It is used by the LLMEngine to execute
/// operations issued by the scheduler.
mod cache_engine;
mod config;
mod layers;
mod scheduler;
pub const _PAD_SLOT_ID: i64 = -1;

pub use block_engine::{BlockEngine, BlockTables, LogicalTokenBlock};
pub use block_engine_sequence::BlockEngineSequence;
pub use cache_engine::{CacheConfig, CacheEngine};
use candle_core::{DType, Device};
pub use config::{ModelConfigLike, ModelConfigMetadata};
pub use layers::PagedAttention;
pub use scheduler::{
    PagedAttentionScheduler, PagedAttentionSchedulerConfig, PagedAttentionSchedulerOutput,
};

pub const DEFAULT_PAGED_ATTENTION_BLOCK_SIZE: usize = 32;

/// All memory amounts are in MB. The default block size is 32.
#[derive(Clone, Copy)]
pub struct PagedAttentionConfig {
    pub(crate) block_size: Option<usize>,
    pub(crate) mem_cpu: usize,
    pub(crate) mem_gpu: MemoryGpuConfig,
}

impl PagedAttentionConfig {
    pub fn new(
        _block_size: Option<usize>,
        _mem_cpu: usize,
        _mem_gpu: MemoryGpuConfig,
    ) -> anyhow::Result<Self> {
        anyhow::bail!("PagedAttention is only supported on CUDA; compile with the `cuda` feature.")
    }
}
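
// Illustrative sketch: one way a caller might construct this config. The
// argument values below are arbitrary assumptions, not taken from the source.
// In this dummy (non-CUDA) build, `new` always bails, so construction is
// expected to fail:
//
//     let cfg = PagedAttentionConfig::new(
//         Some(DEFAULT_PAGED_ATTENTION_BLOCK_SIZE), // block size in tokens
//         512,                                      // CPU KV-cache memory, in MB
//         MemoryGpuConfig::Utilization(0.9),        // GPU memory as a fraction in [0, 1]
//     );
//     assert!(cfg.is_err());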

#[derive(Debug, Clone, Copy)]
pub enum AttentionImplementation {
    Eager,
    PagedAttention,
}

#[derive(Clone, Copy)]
#[cfg_attr(feature = "pyo3_macros", pyo3::pyclass)]
pub enum MemoryGpuConfig {
    MbAmount(usize),
    Utilization(f32),
    ContextSize(usize),
}
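
// Illustrative sketch of the three ways GPU memory can be specified. The
// interpretation of each variant is inferred from the variant names and the
// surrounding doc comments; the concrete values are arbitrary.
//
//     let by_size = MemoryGpuConfig::MbAmount(4096);       // a fixed amount, in MB
//     let by_fraction = MemoryGpuConfig::Utilization(0.9); // a fraction in [0, 1]
//     let by_context = MemoryGpuConfig::ContextSize(8192); // size the cache for this many tokens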

/// Memory values are in MB, or a fraction in [0, 1]. Specify a block size, or the default of 32 is used.
#[allow(clippy::too_many_arguments)]
pub fn calculate_cache_config(
    _mem_gpu: MemoryGpuConfig,
    _mem_cpu: usize,
    _block_size: Option<usize>,
    _dtype: DType,
    _config: &dyn ModelConfigLike,
    _device: &Device,
    _layer_devices: &[Option<Device>],
    _silent: bool,
) -> anyhow::Result<CacheConfig> {
    anyhow::bail!("Cannot calculate cache config when not using PagedAttention.")
}
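
// Illustrative sketch of the call shape, matching the signature above. The
// values and the `model_cfg` / `device` bindings are assumed placeholders; in
// this dummy (non-CUDA) build the function always returns an error.
//
//     let cache_cfg = calculate_cache_config(
//         MemoryGpuConfig::Utilization(0.9), // GPU memory budget, as a fraction
//         512,                               // CPU memory, in MB
//         None,                              // fall back to the default block size of 32
//         DType::F16,                        // KV-cache dtype
//         &model_cfg,                        // something implementing ModelConfigLike
//         &device,                           // the main device
//         &[],                               // per-layer devices (empty here)
//         false,                             // silent = false
//     );
//     assert!(cache_cfg.is_err());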