mistralrs/
speculative.rs

1use std::sync::Arc;
2
3use mistralrs_core::{
4    initialize_logging, AutoDeviceMapParams, DefaultSchedulerMethod, DeviceMapSetting,
5    MistralRsBuilder, NormalLoaderBuilder, NormalSpecificConfig, Pipeline, SchedulerConfig,
6    SpeculativeConfig, SpeculativePipeline,
7};
8use tokio::sync::Mutex;
9
10use crate::{best_device, Model, TextModelBuilder};
11
12pub struct TextSpeculativeBuilder {
13    target: TextModelBuilder,
14    draft: TextModelBuilder,
15    speculative_config: SpeculativeConfig,
16}
17
18impl TextSpeculativeBuilder {
19    /// Create a builder for a speculative decoding pipeline.
20    ///
21    /// - PagedAttention settings are ignored as our impl of speculative decoding does not support this yet.
22    /// - Prefix caching settings are ignored as our impl of speculative decoding does not support this yet.
23    ///
24    /// Otherwise, scheduling parameters such as `max_num_seqs` are sourced from the target model.
25    pub fn new(
26        target: TextModelBuilder,
27        draft: TextModelBuilder,
28        speculative_config: SpeculativeConfig,
29    ) -> anyhow::Result<Self> {
30        if target.no_kv_cache || draft.no_kv_cache {
31            anyhow::bail!("Both target and draft must have KV cache enabled.");
32        }
33
34        Ok(Self {
35            target,
36            draft,
37            speculative_config,
38        })
39    }
40
41    fn build_pipeline(builder: TextModelBuilder) -> anyhow::Result<Arc<Mutex<dyn Pipeline>>> {
42        let config = NormalSpecificConfig {
43            prompt_chunksize: builder.prompt_chunksize,
44            topology: builder.topology,
45            organization: builder.organization,
46            write_uqff: builder.write_uqff,
47            from_uqff: builder.from_uqff,
48            imatrix: builder.imatrix,
49            calibration_file: builder.calibration_file,
50            hf_cache_path: builder.hf_cache_path,
51        };
52
53        if builder.with_logging {
54            initialize_logging();
55        }
56
57        let loader = NormalLoaderBuilder::new(
58            config,
59            builder.chat_template,
60            builder.tokenizer_json,
61            Some(builder.model_id),
62            builder.no_kv_cache,
63            builder.jinja_explicit,
64        )
65        .build(builder.loader_type)?;
66
67        // Load, into a Pipeline
68        let pipeline = loader.load_model_from_hf(
69            builder.hf_revision,
70            builder.token_source,
71            &builder.dtype,
72            &best_device(builder.force_cpu)?,
73            !builder.with_logging,
74            builder
75                .device_mapping
76                .unwrap_or(DeviceMapSetting::Auto(AutoDeviceMapParams::default_text())),
77            builder.isq,
78            builder.paged_attn_cfg,
79        )?;
80        Ok(pipeline)
81    }
82
83    pub async fn build(self) -> anyhow::Result<Model> {
84        let target = Self::build_pipeline(self.target.clone())?;
85        let draft = Self::build_pipeline(self.draft.clone())?;
86
87        let scheduler_method = SchedulerConfig::DefaultScheduler {
88            method: DefaultSchedulerMethod::Fixed(self.target.max_num_seqs.try_into()?),
89        };
90
91        let pipeline = Arc::new(Mutex::new(SpeculativePipeline::new(
92            target,
93            draft,
94            self.speculative_config,
95        )?));
96
97        let mut runner = MistralRsBuilder::new(
98            pipeline,
99            scheduler_method,
100            self.target.throughput_logging,
101            self.target.search_bert_model,
102        );
103        if let Some(cb) = self.target.search_callback.clone() {
104            runner = runner.with_search_callback(cb);
105        }
106        for (name, cb) in &self.target.tool_callbacks {
107            runner = runner.with_tool_callback(name.clone(), cb.clone());
108        }
109
110        Ok(Model::new(runner.build().await))
111    }
112}