mistralrs_core/
model_selected.rs

1use std::path::PathBuf;
2
3use clap::Subcommand;
4
5use crate::{
6    pipeline::{AutoDeviceMapParams, IsqOrganization, NormalLoaderType, VisionLoaderType},
7    DiffusionLoaderType, ModelDType, SpeechLoaderType,
8};
9
10// Default value functions for serde deserialization
11fn default_model_dtype() -> ModelDType {
12    ModelDType::Auto
13}
14
/// Serde default for `max_seq_len` fields (mirrors the clap
/// `default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN`).
fn default_max_seq_len() -> usize {
    AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN
}
18
/// Serde default for `max_batch_size` fields (mirrors the clap
/// `default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE`).
fn default_max_batch_size() -> usize {
    AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE
}
22
23fn parse_arch(x: &str) -> Result<NormalLoaderType, String> {
24    x.parse()
25}
26
27fn parse_vision_arch(x: &str) -> Result<VisionLoaderType, String> {
28    x.parse()
29}
30
31fn parse_diffusion_arch(x: &str) -> Result<DiffusionLoaderType, String> {
32    x.parse()
33}
34
35fn parse_speech_arch(x: &str) -> Result<SpeechLoaderType, String> {
36    x.parse()
37}
38
39fn parse_model_dtype(x: &str) -> Result<ModelDType, String> {
40    x.parse()
41}
42
43#[derive(Debug, Clone, Subcommand, serde::Deserialize)]
44pub enum ModelSelected {
45    /// Select the model from a toml file
46    Toml {
47        /// .toml file containing the selector configuration.
48        #[arg(short, long)]
49        file: String,
50    },
51
52    /// Select a model for running via auto loader
53    Run {
54        /// Model ID to load from. May be a HF hub repo or a local path.
55        #[arg(short, long)]
56        model_id: String,
57
58        /// Path to local tokenizer.json file. If specified, it is used over any remote file.
59        #[arg(short, long)]
60        tokenizer_json: Option<String>,
61
62        /// Model data type. Defaults to `auto`.
63        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
64        dtype: ModelDType,
65
66        /// Path to a topology YAML file.
67        #[arg(long)]
68        topology: Option<String>,
69
70        /// ISQ organization: `default` or `moqe`.
71        #[arg(short, long)]
72        organization: Option<IsqOrganization>,
73
74        /// UQFF path to write to.
75        #[arg(short, long)]
76        write_uqff: Option<PathBuf>,
77
78        /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
79        #[arg(short, long)]
80        from_uqff: Option<String>,
81
82        /// .imatrix file to enhance GGUF quantizations with.
83        #[arg(short, long)]
84        imatrix: Option<PathBuf>,
85
86        /// Generate and utilize an imatrix to enhance GGUF quantizations.
87        #[arg(short, long)]
88        calibration_file: Option<PathBuf>,
89
90        /// Automatically resize and pad images to this maximum edge length. Aspect ratio is preserved.
91        /// Only supported on specific vision models.
92        #[arg(short = 'e', long)]
93        max_edge: Option<u32>,
94
95        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
96        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
97        max_seq_len: usize,
98
99        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
100        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
101        max_batch_size: usize,
102
103        /// Maximum prompt number of images to expect for this model. This affects automatic device mapping but is not a hard limit.
104        /// Only supported on specific vision models.
105        #[arg(long)]
106        max_num_images: Option<usize>,
107
108        /// Maximum expected image size will have this edge length on both edges.
109        /// This affects automatic device mapping but is not a hard limit.
110        /// Only supported on specific vision models.
111        #[arg(long)]
112        max_image_length: Option<usize>,
113
114        /// Cache path for Hugging Face models downloaded locally.
115        #[arg(long)]
116        hf_cache_path: Option<PathBuf>,
117
118        /// Path to local Matryoshka Transformer configuration CSV file
119        #[arg(long)]
120        matformer_config_path: Option<PathBuf>,
121
122        /// Name of the Matryoshka Transformer slice to use
123        #[arg(long)]
124        matformer_slice_name: Option<String>,
125    },
126
127    /// Select a plain model, without quantization or adapters
128    Plain {
129        /// Model ID to load from. This may be a HF hub repo or a local path.
130        #[arg(short, long)]
131        model_id: String,
132
133        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
134        #[arg(short, long)]
135        #[serde(default)]
136        tokenizer_json: Option<String>,
137
138        /// The architecture of the model.
139        #[arg(short, long, value_parser = parse_arch)]
140        #[serde(default)]
141        arch: Option<NormalLoaderType>,
142
143        /// Model data type. Defaults to `auto`.
144        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
145        #[serde(default = "default_model_dtype")]
146        dtype: ModelDType,
147
148        /// Path to a topology YAML file.
149        #[arg(long)]
150        #[serde(default)]
151        topology: Option<String>,
152
153        #[allow(rustdoc::bare_urls)]
154        /// ISQ organization: `default` or `moqe` (Mixture of Quantized Experts: https://arxiv.org/abs/2310.02410).
155        #[arg(short, long)]
156        #[serde(default)]
157        organization: Option<IsqOrganization>,
158
159        /// UQFF path to write to.
160        #[arg(short, long)]
161        #[serde(default)]
162        write_uqff: Option<PathBuf>,
163
164        /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;)
165        #[arg(short, long)]
166        #[serde(default)]
167        from_uqff: Option<String>,
168
169        /// .imatrix file to enhance GGUF quantizations with.
170        /// Incompatible with `--calibration-file/-c`
171        #[arg(short, long)]
172        #[serde(default)]
173        imatrix: Option<PathBuf>,
174
175        /// Generate and utilize an imatrix to enhance GGUF quantizations.
176        /// Incompatible with `--imatrix/-i`
177        #[arg(short, long)]
178        #[serde(default)]
179        calibration_file: Option<PathBuf>,
180
181        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
182        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
183        #[serde(default = "default_max_seq_len")]
184        max_seq_len: usize,
185
186        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
187        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
188        #[serde(default = "default_max_batch_size")]
189        max_batch_size: usize,
190
191        /// Cache path for Hugging Face models downloaded locally
192        #[arg(long)]
193        #[serde(default)]
194        hf_cache_path: Option<PathBuf>,
195
196        /// Path to local Matryoshka Transformer configuration CSV file
197        #[arg(long)]
198        #[serde(default)]
199        matformer_config_path: Option<PathBuf>,
200
201        /// Name of the Matryoshka Transformer slice to use
202        #[arg(long)]
203        #[serde(default)]
204        matformer_slice_name: Option<String>,
205    },
206
207    /// Select an X-LoRA architecture
208    XLora {
209        /// Force a base model ID to load from instead of using the ordering file. This may be a HF hub repo or a local path.
210        #[arg(short, long)]
211        model_id: Option<String>,
212
213        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
214        #[arg(short, long)]
215        tokenizer_json: Option<String>,
216
217        /// Model ID to load X-LoRA from. This may be a HF hub repo or a local path.
218        #[arg(short, long)]
219        xlora_model_id: String,
220
221        /// Ordering JSON file
222        #[arg(short, long)]
223        order: String,
224
225        /// Index of completion tokens to generate scalings up until. If this is 1, then there will be one completion token generated before it is cached.
226        /// This makes the maximum running sequences 1.
227        #[arg(long)]
228        tgt_non_granular_index: Option<usize>,
229
230        /// The architecture of the model.
231        #[arg(short, long, value_parser = parse_arch)]
232        arch: Option<NormalLoaderType>,
233
234        /// Model data type. Defaults to `auto`.
235        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
236        dtype: ModelDType,
237
238        /// Path to a topology YAML file.
239        #[arg(long)]
240        topology: Option<String>,
241
242        /// UQFF path to write to.
243        #[arg(short, long)]
244        write_uqff: Option<PathBuf>,
245
246        /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
247        #[arg(short, long)]
248        from_uqff: Option<String>,
249
250        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
251        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
252        max_seq_len: usize,
253
254        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
255        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
256        max_batch_size: usize,
257
258        /// Cache path for Hugging Face models downloaded locally
259        #[arg(long)]
260        hf_cache_path: Option<PathBuf>,
261    },
262
263    /// Select a LoRA architecture
264    Lora {
265        /// Force a base model ID to load from instead of using the ordering file. This may be a HF hub repo or a local path.
266        #[arg(short, long)]
267        model_id: Option<String>,
268
269        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
270        #[arg(short, long)]
271        tokenizer_json: Option<String>,
272
273        /// Model ID to load LoRA from. This may be a HF hub repo or a local path.
274        #[arg(short, long)]
275        adapter_model_id: String,
276
277        /// The architecture of the model.
278        #[arg(long, value_parser = parse_arch)]
279        arch: Option<NormalLoaderType>,
280
281        /// Model data type. Defaults to `auto`.
282        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
283        dtype: ModelDType,
284
285        /// Path to a topology YAML file.
286        #[arg(long)]
287        topology: Option<String>,
288
289        /// UQFF path to write to.
290        #[arg(short, long)]
291        write_uqff: Option<PathBuf>,
292
293        /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
294        #[arg(short, long)]
295        from_uqff: Option<String>,
296
297        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
298        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
299        max_seq_len: usize,
300
301        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
302        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
303        max_batch_size: usize,
304
305        /// Cache path for Hugging Face models downloaded locally
306        #[arg(long)]
307        hf_cache_path: Option<PathBuf>,
308    },
309
310    /// Select a GGUF model.
311    GGUF {
312        /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
313        /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
314        /// removing all remote accesses.
315        #[arg(short, long)]
316        tok_model_id: Option<String>,
317
318        /// Quantized model ID to find the `quantized_filename`.
319        /// This may be a HF hub repo or a local path.
320        #[arg(short = 'm', long)]
321        quantized_model_id: String,
322
323        /// Quantized filename(s).
324        /// May be a single filename, or use a delimiter of " " (a single space) for multiple files.
325        #[arg(short = 'f', long)]
326        quantized_filename: String,
327
328        /// Model data type. Defaults to `auto`.
329        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
330        dtype: ModelDType,
331
332        /// Path to a topology YAML file.
333        #[arg(long)]
334        topology: Option<String>,
335
336        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
337        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
338        max_seq_len: usize,
339
340        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
341        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
342        max_batch_size: usize,
343    },
344
345    /// Select a GGUF model with X-LoRA.
346    XLoraGGUF {
347        /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
348        /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
349        /// removing all remote accesses.
350        #[arg(short, long)]
351        tok_model_id: Option<String>,
352
353        /// Quantized model ID to find the `quantized_filename`.
354        /// This may be a HF hub repo or a local path.
355        #[arg(short = 'm', long)]
356        quantized_model_id: String,
357
358        /// Quantized filename(s).
359        /// May be a single filename, or use a delimiter of " " (a single space) for multiple files.
360        #[arg(short = 'f', long)]
361        quantized_filename: String,
362
363        /// Model ID to load X-LoRA from. This may be a HF hub repo or a local path.
364        #[arg(short, long)]
365        xlora_model_id: String,
366
367        /// Ordering JSON file
368        #[arg(short, long)]
369        order: String,
370
371        /// Index of completion tokens to generate scalings up until. If this is 1, then there will be one completion token generated before it is cached.
372        /// This makes the maximum running sequences 1.
373        #[arg(long)]
374        tgt_non_granular_index: Option<usize>,
375
376        /// Model data type. Defaults to `auto`.
377        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
378        dtype: ModelDType,
379
380        /// Path to a topology YAML file.
381        #[arg(long)]
382        topology: Option<String>,
383
384        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
385        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
386        max_seq_len: usize,
387
388        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
389        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
390        max_batch_size: usize,
391    },
392
393    /// Select a GGUF model with LoRA.
394    LoraGGUF {
395        /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
396        /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
397        /// removing all remote accesses.
398        #[arg(short, long)]
399        tok_model_id: Option<String>,
400
401        /// Quantized model ID to find the `quantized_filename`.
402        /// This may be a HF hub repo or a local path.
403        #[arg(short = 'm', long)]
404        quantized_model_id: String,
405
406        /// Quantized filename(s).
407        /// May be a single filename, or use a delimiter of " " (a single space) for multiple files.
408        #[arg(short = 'f', long)]
409        quantized_filename: String,
410
411        /// Model ID to load LoRA from. This may be a HF hub repo or a local path.
412        #[arg(short, long)]
413        adapters_model_id: String,
414
415        /// Ordering JSON file
416        #[arg(short, long)]
417        order: String,
418
419        /// Model data type. Defaults to `auto`.
420        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
421        dtype: ModelDType,
422
423        /// Path to a topology YAML file.
424        #[arg(long)]
425        topology: Option<String>,
426
427        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
428        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
429        max_seq_len: usize,
430
431        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
432        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
433        max_batch_size: usize,
434    },
435
436    /// Select a GGML model.
437    GGML {
438        /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
439        #[arg(short, long)]
440        tok_model_id: String,
441
442        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
443        #[arg(long)]
444        tokenizer_json: Option<String>,
445
446        /// Quantized model ID to find the `quantized_filename`.
447        /// This may be a HF hub repo or a local path.
448        #[arg(short = 'm', long)]
449        quantized_model_id: String,
450
451        /// Quantized filename.
452        #[arg(short = 'f', long)]
453        quantized_filename: String,
454
455        /// GQA value
456        #[arg(short, long, default_value_t = 1)]
457        gqa: usize,
458
459        /// Model data type. Defaults to `auto`.
460        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
461        dtype: ModelDType,
462
463        /// Path to a topology YAML file.
464        #[arg(long)]
465        topology: Option<String>,
466
467        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
468        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
469        max_seq_len: usize,
470
471        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
472        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
473        max_batch_size: usize,
474    },
475
476    /// Select a GGML model with X-LoRA.
477    XLoraGGML {
478        /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
479        #[arg(short, long)]
480        tok_model_id: Option<String>,
481
482        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
483        #[arg(long)]
484        tokenizer_json: Option<String>,
485
486        /// Quantized model ID to find the `quantized_filename`.
487        /// This may be a HF hub repo or a local path.
488        #[arg(short = 'm', long)]
489        quantized_model_id: String,
490
491        /// Quantized filename.
492        #[arg(short = 'f', long)]
493        quantized_filename: String,
494
495        /// Model ID to load X-LoRA from. This may be a HF hub repo or a local path.
496        #[arg(short, long)]
497        xlora_model_id: String,
498
499        /// Ordering JSON file
500        #[arg(short, long)]
501        order: String,
502
503        /// Index of completion tokens to generate scalings up until. If this is 1, then there will be one completion token generated before it is cached.
504        /// This makes the maximum running sequences 1.
505        #[arg(long)]
506        tgt_non_granular_index: Option<usize>,
507
508        /// GQA value
509        #[arg(short, long, default_value_t = 1)]
510        gqa: usize,
511
512        /// Model data type. Defaults to `auto`.
513        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
514        dtype: ModelDType,
515
516        /// Path to a topology YAML file.
517        #[arg(long)]
518        topology: Option<String>,
519
520        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
521        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
522        max_seq_len: usize,
523
524        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
525        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
526        max_batch_size: usize,
527    },
528
529    /// Select a GGML model with LoRA.
530    LoraGGML {
531        /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
532        #[arg(short, long)]
533        tok_model_id: Option<String>,
534
535        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
536        #[arg(long)]
537        tokenizer_json: Option<String>,
538
539        /// Quantized model ID to find the `quantized_filename`.
540        /// This may be a HF hub repo or a local path.
541        #[arg(short = 'm', long)]
542        quantized_model_id: String,
543
544        /// Quantized filename.
545        #[arg(short = 'f', long)]
546        quantized_filename: String,
547
548        /// Model ID to load LoRA from. This may be a HF hub repo or a local path.
549        #[arg(short, long)]
550        adapters_model_id: String,
551
552        /// Ordering JSON file
553        #[arg(short, long)]
554        order: String,
555
556        /// GQA value
557        #[arg(short, long, default_value_t = 1)]
558        gqa: usize,
559
560        /// Model data type. Defaults to `auto`.
561        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
562        dtype: ModelDType,
563
564        /// Path to a topology YAML file.
565        #[arg(long)]
566        topology: Option<String>,
567
568        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
569        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
570        max_seq_len: usize,
571
572        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
573        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
574        max_batch_size: usize,
575    },
576
577    /// Select a vision plain model, without quantization or adapters
578    VisionPlain {
579        /// Model ID to load from. This may be a HF hub repo or a local path.
580        #[arg(short, long)]
581        model_id: String,
582
583        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
584        #[arg(short, long)]
585        tokenizer_json: Option<String>,
586
587        /// The architecture of the model.
588        #[arg(short, long, value_parser = parse_vision_arch)]
589        arch: Option<VisionLoaderType>,
590
591        /// Model data type. Defaults to `auto`.
592        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
593        dtype: ModelDType,
594
595        /// Path to a topology YAML file.
596        #[arg(long)]
597        topology: Option<String>,
598
599        /// UQFF path to write to.
600        #[arg(short, long)]
601        write_uqff: Option<PathBuf>,
602
603        /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
604        #[arg(short, long)]
605        from_uqff: Option<String>,
606
607        /// Automatically resize and pad images to this maximum edge length. Aspect ratio is preserved.
608        /// This is only supported on the Qwen2-VL and Idefics models. Others handle this internally.
609        #[arg(short = 'e', long)]
610        max_edge: Option<u32>,
611
612        /// Generate and utilize an imatrix to enhance GGUF quantizations.
613        #[arg(short, long)]
614        calibration_file: Option<PathBuf>,
615
616        /// .cimatrix file to enhance GGUF quantizations with. This must be a .cimatrix file.
617        #[arg(short, long)]
618        imatrix: Option<PathBuf>,
619
620        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
621        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
622        max_seq_len: usize,
623
624        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
625        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
626        max_batch_size: usize,
627
628        /// Maximum prompt number of images to expect for this model. This affects automatic device mapping but is not a hard limit.
629        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_NUM_IMAGES)]
630        max_num_images: usize,
631
632        /// Maximum expected image size will have this edge length on both edges.
633        /// This affects automatic device mapping but is not a hard limit.
634        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_IMAGE_LENGTH)]
635        max_image_length: usize,
636
637        /// Cache path for Hugging Face models downloaded locally
638        #[arg(long)]
639        hf_cache_path: Option<PathBuf>,
640
641        /// Path to local Matryoshka Transformer configuration CSV file
642        #[arg(long)]
643        matformer_config_path: Option<PathBuf>,
644
645        /// Name of the Matryoshka Transformer slice to use
646        #[arg(long)]
647        matformer_slice_name: Option<String>,
648    },
649
650    /// Select a diffusion model, without quantization or adapters
651    #[command(name = "diffusion")]
652    DiffusionPlain {
653        /// Model ID to load from. This may be a HF hub repo or a local path.
654        #[arg(short, long)]
655        model_id: String,
656
657        /// The architecture of the model.
658        #[arg(short, long, value_parser = parse_diffusion_arch)]
659        arch: DiffusionLoaderType,
660
661        /// Model data type. Defaults to `auto`.
662        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
663        dtype: ModelDType,
664    },
665
666    Speech {
667        /// Model ID to load from. This may be a HF hub repo or a local path.
668        #[arg(short, long)]
669        model_id: String,
670
671        /// DAC Model ID to load from. If not provided, this is automatically downloaded from the default path for the model.
672        /// This may be a HF hub repo or a local path.
673        #[arg(short, long)]
674        dac_model_id: Option<String>,
675
676        /// The architecture of the model.
677        #[arg(short, long, value_parser = parse_speech_arch)]
678        arch: SpeechLoaderType,
679
680        /// Model data type. Defaults to `auto`.
681        #[arg(long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
682        dtype: ModelDType,
683    },
684
685    /// Select multi-model mode with configuration file
686    #[command(name = "multi-model")]
687    MultiModel {
688        /// Multi-model configuration file path (JSON format)
689        #[arg(short, long)]
690        config: String,
691
692        /// Default model ID to use when no model is specified in requests
693        #[arg(short, long)]
694        default_model_id: Option<String>,
695    },
696}