mistralrs_core/dummy_paged_attention/mod.rs

mod block_engine;
mod block_engine_sequence;
mod cache_engine;
mod config;
mod layers;
mod scheduler;

/// Sentinel slot id used to mark padding entries in the KV-cache slot mapping.
pub const _PAD_SLOT_ID: i64 = -1;

pub use block_engine::{BlockEngine, BlockTables, LogicalTokenBlock};
pub use block_engine_sequence::BlockEngineSequence;
pub use cache_engine::{CacheConfig, CacheEngine};
use candle_core::{DType, Device};
pub use config::{ModelConfigLike, ModelConfigMetadata};
pub use layers::PagedAttention;
pub use scheduler::{
    PagedAttentionScheduler, PagedAttentionSchedulerConfig, PagedAttentionSchedulerOutput,
};

/// Default number of tokens per KV-cache block.
pub const DEFAULT_PAGED_ATTENTION_BLOCK_SIZE: usize = 32;

/// Configuration for the PagedAttention KV cache.
///
/// This is the dummy (non-CUDA) stand-in: construction always fails and
/// directs the user to enable the `cuda` feature.
#[derive(Clone, Copy)]
pub struct PagedAttentionConfig {
    pub(crate) block_size: Option<usize>,
    pub(crate) mem_cpu: usize,
    pub(crate) mem_gpu: MemoryGpuConfig,
}

impl PagedAttentionConfig {
    pub fn new(
        _block_size: Option<usize>,
        _mem_cpu: usize,
        _mem_gpu: MemoryGpuConfig,
    ) -> anyhow::Result<Self> {
        anyhow::bail!("PagedAttention is only supported for CUDA; compile with feature `cuda`.")
    }
}
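
// A sketch of the intended call shape, assuming a build where PagedAttention
// is actually available (in this dummy build `new` always bails). The
// argument values are illustrative only, and the CPU memory unit (MB) is an
// assumption based on `MemoryGpuConfig::MbAmount`:
//
//     let cfg = PagedAttentionConfig::new(
//         Some(DEFAULT_PAGED_ATTENTION_BLOCK_SIZE), // tokens per KV-cache block
//         512,                                      // CPU cache memory (MB, assumed)
//         MemoryGpuConfig::Utilization(0.9),        // use 90% of GPU memory
//     )?;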

/// Which attention backend to use.
#[derive(Debug, Clone, Copy)]
pub enum AttentionImplementation {
    /// Standard (unpaged) attention.
    Eager,
    /// Block-based PagedAttention KV-cache management.
    PagedAttention,
}
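
// Hypothetical caller-side dispatch on this flag; the builder functions named
// here do not exist in this module and are placeholders:
//
//     let attn = match implementation {
//         AttentionImplementation::Eager => build_eager_attention(),
//         AttentionImplementation::PagedAttention => build_paged_attention(),
//     };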

/// How much GPU memory to dedicate to the KV cache.
#[derive(Clone, Copy)]
#[cfg_attr(feature = "pyo3_macros", pyo3::pyclass)]
pub enum MemoryGpuConfig {
    /// A fixed amount, in MB.
    MbAmount(usize),
    /// A fraction of the available GPU memory.
    Utilization(f32),
    /// Enough memory to hold a context of this many tokens.
    ContextSize(usize),
}
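
// Illustrative policy choices (example values only):
//
//     MemoryGpuConfig::MbAmount(4096)    // dedicate a fixed 4096 MB
//     MemoryGpuConfig::Utilization(0.9)  // use up to 90% of GPU memory
//     MemoryGpuConfig::ContextSize(8192) // size the cache for 8192 tokens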

/// Calculate a [`CacheConfig`] from the requested memory settings.
///
/// Dummy build: always returns an error, since PagedAttention is unavailable.
#[allow(clippy::too_many_arguments)]
pub fn calculate_cache_config(
    _mem_gpu: MemoryGpuConfig,
    _mem_cpu: usize,
    _block_size: Option<usize>,
    _dtype: DType,
    _config: &dyn ModelConfigLike,
    _device: &Device,
    _layer_devices: &[Option<Device>],
    _silent: bool,
) -> anyhow::Result<CacheConfig> {
    anyhow::bail!("Cannot calculate cache config when not using PagedAttention.")
}
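
// A minimal in-file test sketching the dummy build's behavior. It uses only
// items defined above; the concrete argument values are illustrative.
#[cfg(test)]
mod dummy_behavior_tests {
    use super::*;

    #[test]
    fn config_constructor_bails_without_cuda() {
        // In this dummy module, `new` unconditionally returns an error that
        // directs the user to the `cuda` feature.
        let res = PagedAttentionConfig::new(None, 512, MemoryGpuConfig::Utilization(0.9));
        assert!(res.is_err());
    }
}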