1use std::sync::Arc;
2
3use mistralrs_core::{
4 initialize_logging, AutoDeviceMapParams, DefaultSchedulerMethod, DeviceMapSetting,
5 MistralRsBuilder, NormalLoaderBuilder, NormalSpecificConfig, Pipeline, SchedulerConfig,
6 SpeculativeConfig, SpeculativePipeline,
7};
8use tokio::sync::Mutex;
9
10use crate::{best_device, Model, TextModelBuilder};
11
12pub struct TextSpeculativeBuilder {
13 target: TextModelBuilder,
14 draft: TextModelBuilder,
15 speculative_config: SpeculativeConfig,
16}
17
18impl TextSpeculativeBuilder {
19 pub fn new(
26 target: TextModelBuilder,
27 draft: TextModelBuilder,
28 speculative_config: SpeculativeConfig,
29 ) -> anyhow::Result<Self> {
30 if target.no_kv_cache || draft.no_kv_cache {
31 anyhow::bail!("Both target and draft must have KV cache enabled.");
32 }
33
34 Ok(Self {
35 target,
36 draft,
37 speculative_config,
38 })
39 }
40
41 fn build_pipeline(builder: TextModelBuilder) -> anyhow::Result<Arc<Mutex<dyn Pipeline>>> {
42 let config = NormalSpecificConfig {
43 topology: builder.topology,
44 organization: builder.organization,
45 write_uqff: builder.write_uqff,
46 from_uqff: builder.from_uqff,
47 imatrix: builder.imatrix,
48 calibration_file: builder.calibration_file,
49 hf_cache_path: builder.hf_cache_path,
50 matformer_config_path: None,
51 matformer_slice_name: None,
52 };
53
54 if builder.with_logging {
55 initialize_logging();
56 }
57
58 let loader = NormalLoaderBuilder::new(
59 config,
60 builder.chat_template,
61 builder.tokenizer_json,
62 Some(builder.model_id),
63 builder.no_kv_cache,
64 builder.jinja_explicit,
65 )
66 .build(builder.loader_type)?;
67
68 let pipeline = loader.load_model_from_hf(
70 builder.hf_revision,
71 builder.token_source,
72 &builder.dtype,
73 &best_device(builder.force_cpu)?,
74 !builder.with_logging,
75 builder
76 .device_mapping
77 .unwrap_or(DeviceMapSetting::Auto(AutoDeviceMapParams::default_text())),
78 builder.isq,
79 builder.paged_attn_cfg,
80 )?;
81 Ok(pipeline)
82 }
83
84 pub async fn build(self) -> anyhow::Result<Model> {
85 let target = Self::build_pipeline(self.target.clone())?;
86 let draft = Self::build_pipeline(self.draft.clone())?;
87
88 let scheduler_method = SchedulerConfig::DefaultScheduler {
89 method: DefaultSchedulerMethod::Fixed(self.target.max_num_seqs.try_into()?),
90 };
91
92 let pipeline = Arc::new(Mutex::new(SpeculativePipeline::new(
93 target,
94 draft,
95 self.speculative_config,
96 )?));
97
98 let mut runner = MistralRsBuilder::new(
99 pipeline,
100 scheduler_method,
101 self.target.throughput_logging,
102 self.target.search_bert_model,
103 );
104 if let Some(cb) = self.target.search_callback.clone() {
105 runner = runner.with_search_callback(cb);
106 }
107 for (name, cb) in &self.target.tool_callbacks {
108 runner = runner.with_tool_callback(name.clone(), cb.clone());
109 }
110
111 Ok(Model::new(runner.build().await))
112 }
113}