1use std::sync::Arc;
2
3use mistralrs_core::{
4 initialize_logging, AutoDeviceMapParams, DefaultSchedulerMethod, DeviceMapSetting,
5 MistralRsBuilder, NormalLoaderBuilder, NormalSpecificConfig, Pipeline, SchedulerConfig,
6 SpeculativeConfig, SpeculativePipeline,
7};
8use tokio::sync::Mutex;
9
10use crate::{best_device, Model, TextModelBuilder};
11
12pub struct TextSpeculativeBuilder {
13 target: TextModelBuilder,
14 draft: TextModelBuilder,
15 speculative_config: SpeculativeConfig,
16}
17
18impl TextSpeculativeBuilder {
19 pub fn new(
26 target: TextModelBuilder,
27 draft: TextModelBuilder,
28 speculative_config: SpeculativeConfig,
29 ) -> anyhow::Result<Self> {
30 if target.no_kv_cache || draft.no_kv_cache {
31 anyhow::bail!("Both target and draft must have KV cache enabled.");
32 }
33
34 Ok(Self {
35 target,
36 draft,
37 speculative_config,
38 })
39 }
40
41 fn build_pipeline(builder: TextModelBuilder) -> anyhow::Result<Arc<Mutex<dyn Pipeline>>> {
42 let config = NormalSpecificConfig {
43 prompt_chunksize: builder.prompt_chunksize,
44 topology: builder.topology,
45 organization: builder.organization,
46 write_uqff: builder.write_uqff,
47 from_uqff: builder.from_uqff,
48 imatrix: builder.imatrix,
49 calibration_file: builder.calibration_file,
50 hf_cache_path: builder.hf_cache_path,
51 };
52
53 if builder.with_logging {
54 initialize_logging();
55 }
56
57 let loader = NormalLoaderBuilder::new(
58 config,
59 builder.chat_template,
60 builder.tokenizer_json,
61 Some(builder.model_id),
62 builder.no_kv_cache,
63 builder.jinja_explicit,
64 )
65 .build(builder.loader_type)?;
66
67 let pipeline = loader.load_model_from_hf(
69 builder.hf_revision,
70 builder.token_source,
71 &builder.dtype,
72 &best_device(builder.force_cpu)?,
73 !builder.with_logging,
74 builder
75 .device_mapping
76 .unwrap_or(DeviceMapSetting::Auto(AutoDeviceMapParams::default_text())),
77 builder.isq,
78 builder.paged_attn_cfg,
79 )?;
80 Ok(pipeline)
81 }
82
83 pub async fn build(self) -> anyhow::Result<Model> {
84 let target = Self::build_pipeline(self.target.clone())?;
85 let draft = Self::build_pipeline(self.draft.clone())?;
86
87 let scheduler_method = SchedulerConfig::DefaultScheduler {
88 method: DefaultSchedulerMethod::Fixed(self.target.max_num_seqs.try_into()?),
89 };
90
91 let pipeline = Arc::new(Mutex::new(SpeculativePipeline::new(
92 target,
93 draft,
94 self.speculative_config,
95 )?));
96
97 let mut runner = MistralRsBuilder::new(
98 pipeline,
99 scheduler_method,
100 self.target.throughput_logging,
101 self.target.search_bert_model,
102 );
103 if let Some(cb) = self.target.search_callback.clone() {
104 runner = runner.with_search_callback(cb);
105 }
106 for (name, cb) in &self.target.tool_callbacks {
107 runner = runner.with_tool_callback(name.clone(), cb.clone());
108 }
109
110 Ok(Model::new(runner.build().await))
111 }
112}