// mistralrs_core/dummy_paged_attention/mod.rs
mod block_engine;
mod block_engine_sequence;
mod cache_engine;
mod config;
mod layers;
mod scheduler;
/// Sentinel value used to pad slot-mapping entries that do not map to a real
/// cache slot. NOTE(review): semantics inferred from the name — confirm
/// against the non-dummy paged-attention implementation.
pub const _PAD_SLOT_ID: i64 = -1;
pub use block_engine::{BlockEngine, BlockTables, LogicalTokenBlock};
pub use block_engine_sequence::BlockEngineSequence;
pub use cache_engine::{CacheConfig, CacheEngine};
use candle_core::{DType, Device};
pub use config::{ModelConfigLike, ModelConfigMetadata};
pub use layers::PagedAttention;
pub use scheduler::{
PagedAttentionScheduler, PagedAttentionSchedulerConfig, PagedAttentionSchedulerOutput,
};
/// Block size used when `PagedAttentionConfig::block_size` is `None`.
pub const DEFAULT_PAGED_ATTENTION_BLOCK_SIZE: usize = 32;
/// User-facing configuration for the PagedAttention KV cache.
///
/// In this dummy (non-CUDA) build a value can never actually be obtained:
/// [`PagedAttentionConfig::new`] always returns an error.
#[derive(Clone, Copy)]
pub struct PagedAttentionConfig {
    // Tokens per cache block; `None` presumably falls back to
    // `DEFAULT_PAGED_ATTENTION_BLOCK_SIZE` — confirm in the real module.
    pub(crate) block_size: Option<usize>,
    // Host-memory budget. NOTE(review): units look like MB, mirroring
    // `MemoryGpuConfig::MbAmount` — verify against the CUDA implementation.
    pub(crate) mem_cpu: usize,
    // GPU-memory budget specification.
    pub(crate) mem_gpu: MemoryGpuConfig,
}
impl PagedAttentionConfig {
    /// Stub constructor for builds without PagedAttention support.
    ///
    /// This dummy module exists so the rest of the crate can compile against
    /// the PagedAttention API on platforms where it is not available.
    ///
    /// # Errors
    /// Always fails, directing the user to the `cuda` feature.
    pub fn new(
        _block_size: Option<usize>,
        _mem_cpu: usize,
        _mem_gpu: MemoryGpuConfig,
    ) -> anyhow::Result<Self> {
        Err(anyhow::anyhow!(
            "PagedAttention is only supported for CUDA, compile with feature `cuda`."
        ))
    }
}
/// Selects how attention is computed by a model.
#[derive(Debug, Clone, Copy)]
pub enum AttentionImplementation {
    /// Plain (non-paged) attention.
    Eager,
    /// Block-based PagedAttention with a managed KV cache.
    PagedAttention,
}
#[derive(Clone, Copy)]
#[cfg_attr(feature = "pyo3_macros", pyo3::pyclass)]
pub enum MemoryGpuConfig {
MbAmount(usize),
Utilization(f32),
ContextSize(usize),
}
/// Stub for KV-cache sizing when PagedAttention is compiled out.
///
/// Mirrors the signature of the real `calculate_cache_config` so callers
/// compile unchanged regardless of feature flags; see the CUDA
/// implementation for parameter semantics.
///
/// # Errors
/// Always fails with an explanatory message.
#[allow(clippy::too_many_arguments)]
pub fn calculate_cache_config(
    _mem_gpu: MemoryGpuConfig,
    _mem_cpu: usize,
    _block_size: Option<usize>,
    _dtype: DType,
    _config: &dyn ModelConfigLike,
    _device: &Device,
    _layer_devices: &[Option<Device>],
    _silent: bool,
) -> anyhow::Result<CacheConfig> {
    Err(anyhow::anyhow!(
        "Cannot calculate cache config when not using PagedAttention."
    ))
}