mistralrs/
speculative.rs

1use std::sync::Arc;
2
3use mistralrs_core::{
4    initialize_logging, AutoDeviceMapParams, DefaultSchedulerMethod, DeviceMapSetting,
5    MistralRsBuilder, NormalLoaderBuilder, NormalSpecificConfig, Pipeline, SchedulerConfig,
6    SpeculativeConfig, SpeculativePipeline,
7};
8use tokio::sync::Mutex;
9
10use crate::{best_device, Model, TextModelBuilder};
11
12pub struct TextSpeculativeBuilder {
13    target: TextModelBuilder,
14    draft: TextModelBuilder,
15    speculative_config: SpeculativeConfig,
16}
17
18impl TextSpeculativeBuilder {
19    /// Create a builder for a speculative decoding pipeline.
20    ///
21    /// - PagedAttention settings are ignored as our impl of speculative decoding does not support this yet.
22    /// - Prefix caching settings are ignored as our impl of speculative decoding does not support this yet.
23    ///
24    /// Otherwise, scheduling parameters such as `max_num_seqs` are sourced from the target model.
25    pub fn new(
26        target: TextModelBuilder,
27        draft: TextModelBuilder,
28        speculative_config: SpeculativeConfig,
29    ) -> anyhow::Result<Self> {
30        if target.no_kv_cache || draft.no_kv_cache {
31            anyhow::bail!("Both target and draft must have KV cache enabled.");
32        }
33
34        Ok(Self {
35            target,
36            draft,
37            speculative_config,
38        })
39    }
40
41    fn build_pipeline(builder: TextModelBuilder) -> anyhow::Result<Arc<Mutex<dyn Pipeline>>> {
42        let config = NormalSpecificConfig {
43            use_flash_attn: builder.use_flash_attn,
44            prompt_chunksize: builder.prompt_chunksize,
45            topology: builder.topology,
46            organization: builder.organization,
47            write_uqff: builder.write_uqff,
48            from_uqff: builder.from_uqff,
49            imatrix: builder.imatrix,
50            calibration_file: builder.calibration_file,
51            hf_cache_path: builder.hf_cache_path,
52        };
53
54        if builder.with_logging {
55            initialize_logging();
56        }
57
58        let loader = NormalLoaderBuilder::new(
59            config,
60            builder.chat_template,
61            builder.tokenizer_json,
62            Some(builder.model_id),
63            builder.no_kv_cache,
64            builder.jinja_explicit,
65        )
66        .build(builder.loader_type)?;
67
68        // Load, into a Pipeline
69        let pipeline = loader.load_model_from_hf(
70            builder.hf_revision,
71            builder.token_source,
72            &builder.dtype,
73            &best_device(builder.force_cpu)?,
74            !builder.with_logging,
75            builder
76                .device_mapping
77                .unwrap_or(DeviceMapSetting::Auto(AutoDeviceMapParams::default_text())),
78            builder.isq,
79            builder.paged_attn_cfg,
80        )?;
81        Ok(pipeline)
82    }
83
84    pub async fn build(self) -> anyhow::Result<Model> {
85        let target = Self::build_pipeline(self.target.clone())?;
86        let draft = Self::build_pipeline(self.draft.clone())?;
87
88        let scheduler_method = SchedulerConfig::DefaultScheduler {
89            method: DefaultSchedulerMethod::Fixed(self.target.max_num_seqs.try_into()?),
90        };
91
92        let pipeline = Arc::new(Mutex::new(SpeculativePipeline::new(
93            target,
94            draft,
95            self.speculative_config,
96        )?));
97
98        let runner = MistralRsBuilder::new(
99            pipeline,
100            scheduler_method,
101            self.target.throughput_logging,
102            self.target.search_bert_model,
103        );
104
105        Ok(Model::new(runner.build()))
106    }
107}