mistralrs/
speculative.rs

1use std::sync::Arc;
2
3use mistralrs_core::{
4    initialize_logging, AutoDeviceMapParams, DefaultSchedulerMethod, DeviceMapSetting,
5    MistralRsBuilder, NormalLoaderBuilder, NormalSpecificConfig, Pipeline, SchedulerConfig,
6    SpeculativeConfig, SpeculativePipeline,
7};
8use tokio::sync::Mutex;
9
10use crate::{best_device, Model, TextModelBuilder};
11
12pub struct TextSpeculativeBuilder {
13    target: TextModelBuilder,
14    draft: TextModelBuilder,
15    speculative_config: SpeculativeConfig,
16}
17
18impl TextSpeculativeBuilder {
19    /// Create a builder for a speculative decoding pipeline.
20    ///
21    /// - PagedAttention settings are ignored as our impl of speculative decoding does not support this yet.
22    /// - Prefix caching settings are ignored as our impl of speculative decoding does not support this yet.
23    ///
24    /// Otherwise, scheduling parameters such as `max_num_seqs` are sourced from the target model.
25    pub fn new(
26        target: TextModelBuilder,
27        draft: TextModelBuilder,
28        speculative_config: SpeculativeConfig,
29    ) -> anyhow::Result<Self> {
30        if target.no_kv_cache || draft.no_kv_cache {
31            anyhow::bail!("Both target and draft must have KV cache enabled.");
32        }
33
34        Ok(Self {
35            target,
36            draft,
37            speculative_config,
38        })
39    }
40
41    fn build_pipeline(builder: TextModelBuilder) -> anyhow::Result<Arc<Mutex<dyn Pipeline>>> {
42        let config = NormalSpecificConfig {
43            topology: builder.topology,
44            organization: builder.organization,
45            write_uqff: builder.write_uqff,
46            from_uqff: builder.from_uqff,
47            imatrix: builder.imatrix,
48            calibration_file: builder.calibration_file,
49            hf_cache_path: builder.hf_cache_path,
50            matformer_config_path: None,
51            matformer_slice_name: None,
52        };
53
54        if builder.with_logging {
55            initialize_logging();
56        }
57
58        let loader = NormalLoaderBuilder::new(
59            config,
60            builder.chat_template,
61            builder.tokenizer_json,
62            Some(builder.model_id),
63            builder.no_kv_cache,
64            builder.jinja_explicit,
65        )
66        .build(builder.loader_type)?;
67
68        // Load, into a Pipeline
69        let pipeline = loader.load_model_from_hf(
70            builder.hf_revision,
71            builder.token_source,
72            &builder.dtype,
73            &best_device(builder.force_cpu)?,
74            !builder.with_logging,
75            builder
76                .device_mapping
77                .unwrap_or(DeviceMapSetting::Auto(AutoDeviceMapParams::default_text())),
78            builder.isq,
79            builder.paged_attn_cfg,
80        )?;
81        Ok(pipeline)
82    }
83
84    pub async fn build(self) -> anyhow::Result<Model> {
85        let target = Self::build_pipeline(self.target.clone())?;
86        let draft = Self::build_pipeline(self.draft.clone())?;
87
88        let scheduler_method = SchedulerConfig::DefaultScheduler {
89            method: DefaultSchedulerMethod::Fixed(self.target.max_num_seqs.try_into()?),
90        };
91
92        let pipeline = Arc::new(Mutex::new(SpeculativePipeline::new(
93            target,
94            draft,
95            self.speculative_config,
96        )?));
97
98        let mut runner = MistralRsBuilder::new(
99            pipeline,
100            scheduler_method,
101            self.target.throughput_logging,
102            self.target.search_bert_model,
103        );
104        if let Some(cb) = self.target.search_callback.clone() {
105            runner = runner.with_search_callback(cb);
106        }
107        for (name, cb) in &self.target.tool_callbacks {
108            runner = runner.with_tool_callback(name.clone(), cb.clone());
109        }
110
111        Ok(Model::new(runner.build().await))
112    }
113}
mistralrs/speculative.rs

mistralrs/
speculative.rs