// mistralrs_core/model_selected.rs

1use std::path::PathBuf;
2
3use clap::Subcommand;
4
5use crate::{
6    pipeline::{
7        AutoDeviceMapParams, EmbeddingLoaderType, IsqOrganization, NormalLoaderType,
8        VisionLoaderType,
9    },
10    DiffusionLoaderType, ModelDType, SpeechLoaderType,
11};
12
13// Default value functions for serde deserialization
14fn default_model_dtype() -> ModelDType {
15    ModelDType::Auto
16}
17
18fn default_max_seq_len() -> usize {
19    AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN
20}
21
22fn default_max_batch_size() -> usize {
23    AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE
24}
25
26fn parse_arch(x: &str) -> Result<NormalLoaderType, String> {
27    x.parse()
28}
29
30fn parse_vision_arch(x: &str) -> Result<VisionLoaderType, String> {
31    x.parse()
32}
33
34fn parse_embedding_arch(x: &str) -> Result<EmbeddingLoaderType, String> {
35    x.parse()
36}
37
38fn parse_diffusion_arch(x: &str) -> Result<DiffusionLoaderType, String> {
39    x.parse()
40}
41
42fn parse_speech_arch(x: &str) -> Result<SpeechLoaderType, String> {
43    x.parse()
44}
45
46fn parse_model_dtype(x: &str) -> Result<ModelDType, String> {
47    x.parse()
48}
49
// Model-selection subcommands. Each variant describes one way of loading a model
// (plain HF weights, GGUF/GGML quantized files, LoRA/X-LoRA adapters, vision,
// diffusion, speech, embedding, TOML/multi-model config files).
//
// The enum derives both clap's `Subcommand` (CLI parsing; the `///` doc comments
// below double as the CLI help text, so they must not be edited casually) and
// `serde::Deserialize` (presumably for file-based selection such as the `Toml`
// variant — TODO confirm against the deserialization call sites).
//
// NOTE(review): only `Plain` and `Embedding` carry `#[serde(default)]` /
// `#[serde(default = "...")]` attributes on their optional fields; the other
// variants (e.g. `Run`, `XLora`, `Lora`) do not, so deserializing them requires
// every field to be present. Verify whether that asymmetry is intentional.
#[derive(Debug, Clone, Subcommand, serde::Deserialize)]
pub enum ModelSelected {
    /// Select the model from a toml file
    Toml {
        /// .toml file containing the selector configuration.
        #[arg(short, long)]
        file: String,
    },

    // Auto loader: the pipeline detects the model kind from the repo contents,
    // so this variant unions the knobs of the text and vision paths.
    /// Select a model for running via auto loader
    Run {
        /// Model ID to load from. May be a HF hub repo or a local path.
        #[arg(short, long)]
        model_id: String,

        /// Path to local tokenizer.json file. If specified, it is used over any remote file.
        #[arg(short, long)]
        tokenizer_json: Option<String>,

        /// Model data type. Defaults to `auto`.
        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
        dtype: ModelDType,

        /// Path to a topology YAML file.
        #[arg(long)]
        topology: Option<String>,

        /// ISQ organization: `default` or `moqe`.
        #[arg(short, long)]
        organization: Option<IsqOrganization>,

        /// UQFF path to write to.
        #[arg(short, long)]
        write_uqff: Option<PathBuf>,

        /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
        #[arg(short, long)]
        from_uqff: Option<String>,

        /// .imatrix file to enhance GGUF quantizations with.
        #[arg(short, long)]
        imatrix: Option<PathBuf>,

        /// Generate and utilize an imatrix to enhance GGUF quantizations.
        #[arg(short, long)]
        calibration_file: Option<PathBuf>,

        // `-e` chosen explicitly: `-m` (model_id) already owns the short flag
        // that `max_edge` would get from its first letter.
        /// Automatically resize and pad images to this maximum edge length. Aspect ratio is preserved.
        /// Only supported on specific vision models.
        #[arg(short = 'e', long)]
        max_edge: Option<u32>,

        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
        max_seq_len: usize,

        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
        max_batch_size: usize,

        // Optional here (unlike `VisionPlain`, where these have defaults),
        // since the auto loader cannot know up front whether the model is vision-capable.
        /// Maximum prompt number of images to expect for this model. This affects automatic device mapping but is not a hard limit.
        /// Only supported on specific vision models.
        #[arg(long)]
        max_num_images: Option<usize>,

        /// Maximum expected image size will have this edge length on both edges.
        /// This affects automatic device mapping but is not a hard limit.
        /// Only supported on specific vision models.
        #[arg(long)]
        max_image_length: Option<usize>,

        /// Cache path for Hugging Face models downloaded locally.
        #[arg(long)]
        hf_cache_path: Option<PathBuf>,

        /// Path to local Matryoshka Transformer configuration CSV file
        #[arg(long)]
        matformer_config_path: Option<PathBuf>,

        /// Name of the Matryoshka Transformer slice to use
        #[arg(long)]
        matformer_slice_name: Option<String>,
    },

    // Plain text model. This variant is fully annotated for serde, so it can be
    // deserialized from a config file with any optional field omitted.
    /// Select a plain model, without quantization or adapters
    Plain {
        /// Model ID to load from. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        model_id: String,

        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
        #[arg(short, long)]
        #[serde(default)]
        tokenizer_json: Option<String>,

        /// The architecture of the model.
        #[arg(short, long, value_parser = parse_arch)]
        #[serde(default)]
        arch: Option<NormalLoaderType>,

        /// Model data type. Defaults to `auto`.
        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
        #[serde(default = "default_model_dtype")]
        dtype: ModelDType,

        /// Path to a topology YAML file.
        #[arg(long)]
        #[serde(default)]
        topology: Option<String>,

        // `allow(rustdoc::bare_urls)`: the help text below embeds a raw arXiv URL.
        #[allow(rustdoc::bare_urls)]
        /// ISQ organization: `default` or `moqe` (Mixture of Quantized Experts: https://arxiv.org/abs/2310.02410).
        #[arg(short, long)]
        #[serde(default)]
        organization: Option<IsqOrganization>,

        /// UQFF path to write to.
        #[arg(short, long)]
        #[serde(default)]
        write_uqff: Option<PathBuf>,

        /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;)
        #[arg(short, long)]
        #[serde(default)]
        from_uqff: Option<String>,

        /// .imatrix file to enhance GGUF quantizations with.
        /// Incompatible with `--calibration-file/-c`
        #[arg(short, long)]
        #[serde(default)]
        imatrix: Option<PathBuf>,

        /// Generate and utilize an imatrix to enhance GGUF quantizations.
        /// Incompatible with `--imatrix/-i`
        #[arg(short, long)]
        #[serde(default)]
        calibration_file: Option<PathBuf>,

        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
        #[serde(default = "default_max_seq_len")]
        max_seq_len: usize,

        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
        #[serde(default = "default_max_batch_size")]
        max_batch_size: usize,

        /// Cache path for Hugging Face models downloaded locally
        #[arg(long)]
        #[serde(default)]
        hf_cache_path: Option<PathBuf>,

        /// Path to local Matryoshka Transformer configuration CSV file
        #[arg(long)]
        #[serde(default)]
        matformer_config_path: Option<PathBuf>,

        /// Name of the Matryoshka Transformer slice to use
        #[arg(long)]
        #[serde(default)]
        matformer_slice_name: Option<String>,
    },

    // X-LoRA: base model plus X-LoRA scalings; base model ID can come from the
    // ordering file, hence `model_id` is optional while `xlora_model_id` is required.
    /// Select an X-LoRA architecture
    XLora {
        /// Force a base model ID to load from instead of using the ordering file. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        model_id: Option<String>,

        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
        #[arg(short, long)]
        tokenizer_json: Option<String>,

        /// Model ID to load X-LoRA from. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        xlora_model_id: String,

        /// Ordering JSON file
        #[arg(short, long)]
        order: String,

        /// Index of completion tokens to generate scalings up until. If this is 1, then there will be one completion token generated before it is cached.
        /// This makes the maximum running sequences 1.
        #[arg(long)]
        tgt_non_granular_index: Option<usize>,

        /// The architecture of the model.
        #[arg(short, long, value_parser = parse_arch)]
        arch: Option<NormalLoaderType>,

        /// Model data type. Defaults to `auto`.
        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
        dtype: ModelDType,

        /// Path to a topology YAML file.
        #[arg(long)]
        topology: Option<String>,

        /// UQFF path to write to.
        #[arg(short, long)]
        write_uqff: Option<PathBuf>,

        /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
        #[arg(short, long)]
        from_uqff: Option<String>,

        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
        max_seq_len: usize,

        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
        max_batch_size: usize,

        /// Cache path for Hugging Face models downloaded locally
        #[arg(long)]
        hf_cache_path: Option<PathBuf>,
    },

    // LoRA: like `XLora` but without scalings/ordering-index options.
    // `arch` has no short flag here (unlike `XLora`): `-a` belongs to `adapter_model_id`.
    /// Select a LoRA architecture
    Lora {
        /// Force a base model ID to load from instead of using the ordering file. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        model_id: Option<String>,

        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
        #[arg(short, long)]
        tokenizer_json: Option<String>,

        /// Model ID to load LoRA from. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        adapter_model_id: String,

        /// The architecture of the model.
        #[arg(long, value_parser = parse_arch)]
        arch: Option<NormalLoaderType>,

        /// Model data type. Defaults to `auto`.
        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
        dtype: ModelDType,

        /// Path to a topology YAML file.
        #[arg(long)]
        topology: Option<String>,

        /// UQFF path to write to.
        #[arg(short, long)]
        write_uqff: Option<PathBuf>,

        /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
        #[arg(short, long)]
        from_uqff: Option<String>,

        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
        max_seq_len: usize,

        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
        max_batch_size: usize,

        /// Cache path for Hugging Face models downloaded locally
        #[arg(long)]
        hf_cache_path: Option<PathBuf>,
    },

    // GGUF family: tokenizer repo and quantized-weights repo are specified
    // separately. `-m`/`-f` shorts are assigned explicitly because `-q` would
    // be ambiguous between `quantized_model_id` and `quantized_filename`.
    /// Select a GGUF model.
    GGUF {
        /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
        /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
        /// removing all remote accesses.
        #[arg(short, long)]
        tok_model_id: Option<String>,

        /// Quantized model ID to find the `quantized_filename`.
        /// This may be a HF hub repo or a local path.
        #[arg(short = 'm', long)]
        quantized_model_id: String,

        /// Quantized filename(s).
        /// May be a single filename, or use a delimiter of " " (a single space) for multiple files.
        #[arg(short = 'f', long)]
        quantized_filename: String,

        /// Model data type. Defaults to `auto`.
        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
        dtype: ModelDType,

        /// Path to a topology YAML file.
        #[arg(long)]
        topology: Option<String>,

        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
        max_seq_len: usize,

        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
        max_batch_size: usize,
    },

    /// Select a GGUF model with X-LoRA.
    XLoraGGUF {
        /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
        /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
        /// removing all remote accesses.
        #[arg(short, long)]
        tok_model_id: Option<String>,

        /// Quantized model ID to find the `quantized_filename`.
        /// This may be a HF hub repo or a local path.
        #[arg(short = 'm', long)]
        quantized_model_id: String,

        /// Quantized filename(s).
        /// May be a single filename, or use a delimiter of " " (a single space) for multiple files.
        #[arg(short = 'f', long)]
        quantized_filename: String,

        /// Model ID to load X-LoRA from. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        xlora_model_id: String,

        /// Ordering JSON file
        #[arg(short, long)]
        order: String,

        /// Index of completion tokens to generate scalings up until. If this is 1, then there will be one completion token generated before it is cached.
        /// This makes the maximum running sequences 1.
        #[arg(long)]
        tgt_non_granular_index: Option<usize>,

        /// Model data type. Defaults to `auto`.
        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
        dtype: ModelDType,

        /// Path to a topology YAML file.
        #[arg(long)]
        topology: Option<String>,

        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
        max_seq_len: usize,

        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
        max_batch_size: usize,
    },

    // NOTE(review): this variant uses `adapters_model_id` (plural) while `Lora`
    // uses `adapter_model_id` (singular) — looks like historical drift; renaming
    // would break existing CLIs/configs, so it is only flagged here.
    /// Select a GGUF model with LoRA.
    LoraGGUF {
        /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
        /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
        /// removing all remote accesses.
        #[arg(short, long)]
        tok_model_id: Option<String>,

        /// Quantized model ID to find the `quantized_filename`.
        /// This may be a HF hub repo or a local path.
        #[arg(short = 'm', long)]
        quantized_model_id: String,

        /// Quantized filename(s).
        /// May be a single filename, or use a delimiter of " " (a single space) for multiple files.
        #[arg(short = 'f', long)]
        quantized_filename: String,

        /// Model ID to load LoRA from. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        adapters_model_id: String,

        /// Ordering JSON file
        #[arg(short, long)]
        order: String,

        /// Model data type. Defaults to `auto`.
        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
        dtype: ModelDType,

        /// Path to a topology YAML file.
        #[arg(long)]
        topology: Option<String>,

        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
        max_seq_len: usize,

        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
        max_batch_size: usize,
    },

    // GGML family: single quantized file (no multi-file delimiter) plus a `gqa`
    // (grouped-query attention) value the legacy GGML loader needs.
    /// Select a GGML model.
    GGML {
        /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        tok_model_id: String,

        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
        #[arg(long)]
        tokenizer_json: Option<String>,

        /// Quantized model ID to find the `quantized_filename`.
        /// This may be a HF hub repo or a local path.
        #[arg(short = 'm', long)]
        quantized_model_id: String,

        /// Quantized filename.
        #[arg(short = 'f', long)]
        quantized_filename: String,

        /// GQA value
        #[arg(short, long, default_value_t = 1)]
        gqa: usize,

        /// Model data type. Defaults to `auto`.
        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
        dtype: ModelDType,

        /// Path to a topology YAML file.
        #[arg(long)]
        topology: Option<String>,

        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
        max_seq_len: usize,

        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
        max_batch_size: usize,
    },

    /// Select a GGML model with X-LoRA.
    XLoraGGML {
        /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        tok_model_id: Option<String>,

        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
        #[arg(long)]
        tokenizer_json: Option<String>,

        /// Quantized model ID to find the `quantized_filename`.
        /// This may be a HF hub repo or a local path.
        #[arg(short = 'm', long)]
        quantized_model_id: String,

        /// Quantized filename.
        #[arg(short = 'f', long)]
        quantized_filename: String,

        /// Model ID to load X-LoRA from. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        xlora_model_id: String,

        /// Ordering JSON file
        #[arg(short, long)]
        order: String,

        /// Index of completion tokens to generate scalings up until. If this is 1, then there will be one completion token generated before it is cached.
        /// This makes the maximum running sequences 1.
        #[arg(long)]
        tgt_non_granular_index: Option<usize>,

        /// GQA value
        #[arg(short, long, default_value_t = 1)]
        gqa: usize,

        /// Model data type. Defaults to `auto`.
        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
        dtype: ModelDType,

        /// Path to a topology YAML file.
        #[arg(long)]
        topology: Option<String>,

        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
        max_seq_len: usize,

        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
        max_batch_size: usize,
    },

    /// Select a GGML model with LoRA.
    LoraGGML {
        /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        tok_model_id: Option<String>,

        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
        #[arg(long)]
        tokenizer_json: Option<String>,

        /// Quantized model ID to find the `quantized_filename`.
        /// This may be a HF hub repo or a local path.
        #[arg(short = 'm', long)]
        quantized_model_id: String,

        /// Quantized filename.
        #[arg(short = 'f', long)]
        quantized_filename: String,

        /// Model ID to load LoRA from. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        adapters_model_id: String,

        /// Ordering JSON file
        #[arg(short, long)]
        order: String,

        /// GQA value
        #[arg(short, long, default_value_t = 1)]
        gqa: usize,

        /// Model data type. Defaults to `auto`.
        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
        dtype: ModelDType,

        /// Path to a topology YAML file.
        #[arg(long)]
        topology: Option<String>,

        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
        max_seq_len: usize,

        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
        max_batch_size: usize,
    },

    // Vision model: unlike `Run`, the image-related device-mapping knobs here
    // are non-optional and carry defaults.
    /// Select a vision plain model, without quantization or adapters
    VisionPlain {
        /// Model ID to load from. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        model_id: String,

        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
        #[arg(short, long)]
        tokenizer_json: Option<String>,

        /// The architecture of the model.
        #[arg(short, long, value_parser = parse_vision_arch)]
        arch: Option<VisionLoaderType>,

        /// Model data type. Defaults to `auto`.
        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
        dtype: ModelDType,

        /// Path to a topology YAML file.
        #[arg(long)]
        topology: Option<String>,

        /// UQFF path to write to.
        #[arg(short, long)]
        write_uqff: Option<PathBuf>,

        /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
        #[arg(short, long)]
        from_uqff: Option<String>,

        /// Automatically resize and pad images to this maximum edge length. Aspect ratio is preserved.
        /// This is only supported on the Qwen2-VL and Idefics models. Others handle this internally.
        #[arg(short = 'e', long)]
        max_edge: Option<u32>,

        /// Generate and utilize an imatrix to enhance GGUF quantizations.
        #[arg(short, long)]
        calibration_file: Option<PathBuf>,

        // NOTE(review): the help text says ".cimatrix" where other variants say
        // ".imatrix" — confirm whether vision models genuinely use a distinct
        // .cimatrix format or this is a typo in the help string.
        /// .cimatrix file to enhance GGUF quantizations with. This must be a .cimatrix file.
        #[arg(short, long)]
        imatrix: Option<PathBuf>,

        /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
        max_seq_len: usize,

        /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
        max_batch_size: usize,

        /// Maximum prompt number of images to expect for this model. This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_NUM_IMAGES)]
        max_num_images: usize,

        /// Maximum expected image size will have this edge length on both edges.
        /// This affects automatic device mapping but is not a hard limit.
        #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_IMAGE_LENGTH)]
        max_image_length: usize,

        /// Cache path for Hugging Face models downloaded locally
        #[arg(long)]
        hf_cache_path: Option<PathBuf>,

        /// Path to local Matryoshka Transformer configuration CSV file
        #[arg(long)]
        matformer_config_path: Option<PathBuf>,

        /// Name of the Matryoshka Transformer slice to use
        #[arg(long)]
        matformer_slice_name: Option<String>,
    },

    // Exposed on the CLI as `diffusion` rather than the derived `diffusion-plain`.
    /// Select a diffusion model, without quantization or adapters
    #[command(name = "diffusion")]
    DiffusionPlain {
        /// Model ID to load from. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        model_id: String,

        // Required here (not `Option`): diffusion architecture cannot be inferred.
        /// The architecture of the model.
        #[arg(short, long, value_parser = parse_diffusion_arch)]
        arch: DiffusionLoaderType,

        /// Model data type. Defaults to `auto`.
        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
        dtype: ModelDType,
    },

    // Speech model. No `///` doc on the variant itself, so clap derives its help
    // from the variant name. `dtype` has no short flag: `-d` is taken by `dac_model_id`.
    Speech {
        /// Model ID to load from. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        model_id: String,

        /// DAC Model ID to load from. If not provided, this is automatically downloaded from the default path for the model.
        /// This may be a HF hub repo or a local path.
        #[arg(short, long)]
        dac_model_id: Option<String>,

        /// The architecture of the model.
        #[arg(short, long, value_parser = parse_speech_arch)]
        arch: SpeechLoaderType,

        /// Model data type. Defaults to `auto`.
        #[arg(long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
        dtype: ModelDType,
    },

    /// Select multi-model mode with configuration file
    #[command(name = "multi-model")]
    MultiModel {
        /// Multi-model configuration file path (JSON format)
        #[arg(short, long)]
        config: String,

        /// Default model ID to use when no model is specified in requests
        #[arg(short, long)]
        default_model_id: Option<String>,
    },

    // Embedding model: like `Plain`, fully serde-annotated for config-file use.
    /// Select an embedding model, without quantization or adapters
    Embedding {
        /// Model ID to load from. This may be a HF hub repo or a local path.
        #[arg(short, long)]
        model_id: String,

        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
        #[arg(short, long)]
        #[serde(default)]
        tokenizer_json: Option<String>,

        /// The architecture of the model.
        #[arg(short, long, value_parser = parse_embedding_arch)]
        #[serde(default)]
        arch: Option<EmbeddingLoaderType>,

        /// Model data type. Defaults to `auto`.
        #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
        #[serde(default = "default_model_dtype")]
        dtype: ModelDType,

        /// Path to a topology YAML file.
        #[arg(long)]
        #[serde(default)]
        topology: Option<String>,

        /// UQFF path to write to.
        #[arg(short, long)]
        #[serde(default)]
        write_uqff: Option<PathBuf>,

        /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;)
        #[arg(short, long)]
        #[serde(default)]
        from_uqff: Option<String>,

        /// Cache path for Hugging Face models downloaded locally
        #[arg(long)]
        #[serde(default)]
        hf_cache_path: Option<PathBuf>,
    },
}