1use std::sync::Arc;
2
3use mistralrs_core::{
4 initialize_logging, AutoDeviceMapParams, DefaultSchedulerMethod, DeviceMapSetting,
5 MistralRsBuilder, NormalLoaderBuilder, NormalSpecificConfig, Pipeline, SchedulerConfig,
6 SpeculativeConfig, SpeculativePipeline,
7};
8use tokio::sync::Mutex;
9
10use crate::{best_device, Model, TextModelBuilder};
11
12pub struct TextSpeculativeBuilder {
13 target: TextModelBuilder,
14 draft: TextModelBuilder,
15 speculative_config: SpeculativeConfig,
16}
17
18impl TextSpeculativeBuilder {
19 pub fn new(
26 target: TextModelBuilder,
27 draft: TextModelBuilder,
28 speculative_config: SpeculativeConfig,
29 ) -> anyhow::Result<Self> {
30 if target.no_kv_cache || draft.no_kv_cache {
31 anyhow::bail!("Both target and draft must have KV cache enabled.");
32 }
33
34 Ok(Self {
35 target,
36 draft,
37 speculative_config,
38 })
39 }
40
41 fn build_pipeline(builder: TextModelBuilder) -> anyhow::Result<Arc<Mutex<dyn Pipeline>>> {
42 let config = NormalSpecificConfig {
43 use_flash_attn: builder.use_flash_attn,
44 prompt_chunksize: builder.prompt_chunksize,
45 topology: builder.topology,
46 organization: builder.organization,
47 write_uqff: builder.write_uqff,
48 from_uqff: builder.from_uqff,
49 imatrix: builder.imatrix,
50 calibration_file: builder.calibration_file,
51 hf_cache_path: builder.hf_cache_path,
52 };
53
54 if builder.with_logging {
55 initialize_logging();
56 }
57
58 let loader = NormalLoaderBuilder::new(
59 config,
60 builder.chat_template,
61 builder.tokenizer_json,
62 Some(builder.model_id),
63 builder.no_kv_cache,
64 builder.jinja_explicit,
65 )
66 .build(builder.loader_type)?;
67
68 let pipeline = loader.load_model_from_hf(
70 builder.hf_revision,
71 builder.token_source,
72 &builder.dtype,
73 &best_device(builder.force_cpu)?,
74 !builder.with_logging,
75 builder
76 .device_mapping
77 .unwrap_or(DeviceMapSetting::Auto(AutoDeviceMapParams::default_text())),
78 builder.isq,
79 builder.paged_attn_cfg,
80 )?;
81 Ok(pipeline)
82 }
83
84 pub async fn build(self) -> anyhow::Result<Model> {
85 let target = Self::build_pipeline(self.target.clone())?;
86 let draft = Self::build_pipeline(self.draft.clone())?;
87
88 let scheduler_method = SchedulerConfig::DefaultScheduler {
89 method: DefaultSchedulerMethod::Fixed(self.target.max_num_seqs.try_into()?),
90 };
91
92 let pipeline = Arc::new(Mutex::new(SpeculativePipeline::new(
93 target,
94 draft,
95 self.speculative_config,
96 )?));
97
98 let runner = MistralRsBuilder::new(
99 pipeline,
100 scheduler_method,
101 self.target.throughput_logging,
102 self.target.search_bert_model,
103 );
104
105 Ok(Model::new(runner.build()))
106 }
107}