// mistralrs_core/model_selected.rs
1use std::path::PathBuf;
2
3use clap::Subcommand;
4
5use crate::{
6 pipeline::{
7 AutoDeviceMapParams, EmbeddingLoaderType, IsqOrganization, NormalLoaderType,
8 VisionLoaderType,
9 },
10 DiffusionLoaderType, ModelDType, SpeechLoaderType,
11};
12
// Default value functions for serde deserialization.
// These mirror the clap `default_value_t` expressions on the corresponding
// fields so CLI parsing and config-file deserialization agree on defaults.

/// Serde fallback for `dtype` fields; mirrors clap's `default_value_t = ModelDType::Auto`.
fn default_model_dtype() -> ModelDType {
    ModelDType::Auto
}
17
/// Serde fallback for `max_seq_len` fields; mirrors clap's
/// `default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN`.
fn default_max_seq_len() -> usize {
    AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN
}
21
/// Serde fallback for `max_batch_size` fields; mirrors clap's
/// `default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE`.
fn default_max_batch_size() -> usize {
    AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE
}
25
26fn parse_arch(x: &str) -> Result<NormalLoaderType, String> {
27 x.parse()
28}
29
30fn parse_vision_arch(x: &str) -> Result<VisionLoaderType, String> {
31 x.parse()
32}
33
34fn parse_embedding_arch(x: &str) -> Result<EmbeddingLoaderType, String> {
35 x.parse()
36}
37
38fn parse_diffusion_arch(x: &str) -> Result<DiffusionLoaderType, String> {
39 x.parse()
40}
41
42fn parse_speech_arch(x: &str) -> Result<SpeechLoaderType, String> {
43 x.parse()
44}
45
46fn parse_model_dtype(x: &str) -> Result<ModelDType, String> {
47 x.parse()
48}
49
50#[derive(Debug, Clone, Subcommand, serde::Deserialize)]
51pub enum ModelSelected {
52 /// Select the model from a toml file
53 Toml {
54 /// .toml file containing the selector configuration.
55 #[arg(short, long)]
56 file: String,
57 },
58
59 /// Select a model for running via auto loader
60 Run {
61 /// Model ID to load from. May be a HF hub repo or a local path.
62 #[arg(short, long)]
63 model_id: String,
64
65 /// Path to local tokenizer.json file. If specified, it is used over any remote file.
66 #[arg(short, long)]
67 tokenizer_json: Option<String>,
68
69 /// Model data type. Defaults to `auto`.
70 #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
71 dtype: ModelDType,
72
73 /// Path to a topology YAML file.
74 #[arg(long)]
75 topology: Option<String>,
76
77 /// ISQ organization: `default` or `moqe`.
78 #[arg(short, long)]
79 organization: Option<IsqOrganization>,
80
81 /// UQFF path to write to.
82 #[arg(short, long)]
83 write_uqff: Option<PathBuf>,
84
85 /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
86 #[arg(short, long)]
87 from_uqff: Option<String>,
88
89 /// .imatrix file to enhance GGUF quantizations with.
90 #[arg(short, long)]
91 imatrix: Option<PathBuf>,
92
93 /// Generate and utilize an imatrix to enhance GGUF quantizations.
94 #[arg(short, long)]
95 calibration_file: Option<PathBuf>,
96
97 /// Automatically resize and pad images to this maximum edge length. Aspect ratio is preserved.
98 /// Only supported on specific vision models.
99 #[arg(short = 'e', long)]
100 max_edge: Option<u32>,
101
102 /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
103 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
104 max_seq_len: usize,
105
106 /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
107 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
108 max_batch_size: usize,
109
110 /// Maximum prompt number of images to expect for this model. This affects automatic device mapping but is not a hard limit.
111 /// Only supported on specific vision models.
112 #[arg(long)]
113 max_num_images: Option<usize>,
114
115 /// Maximum expected image size will have this edge length on both edges.
116 /// This affects automatic device mapping but is not a hard limit.
117 /// Only supported on specific vision models.
118 #[arg(long)]
119 max_image_length: Option<usize>,
120
121 /// Cache path for Hugging Face models downloaded locally.
122 #[arg(long)]
123 hf_cache_path: Option<PathBuf>,
124
125 /// Path to local Matryoshka Transformer configuration CSV file
126 #[arg(long)]
127 matformer_config_path: Option<PathBuf>,
128
129 /// Name of the Matryoshka Transformer slice to use
130 #[arg(long)]
131 matformer_slice_name: Option<String>,
132 },
133
134 /// Select a plain model, without quantization or adapters
135 Plain {
136 /// Model ID to load from. This may be a HF hub repo or a local path.
137 #[arg(short, long)]
138 model_id: String,
139
140 /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
141 #[arg(short, long)]
142 #[serde(default)]
143 tokenizer_json: Option<String>,
144
145 /// The architecture of the model.
146 #[arg(short, long, value_parser = parse_arch)]
147 #[serde(default)]
148 arch: Option<NormalLoaderType>,
149
150 /// Model data type. Defaults to `auto`.
151 #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
152 #[serde(default = "default_model_dtype")]
153 dtype: ModelDType,
154
155 /// Path to a topology YAML file.
156 #[arg(long)]
157 #[serde(default)]
158 topology: Option<String>,
159
160 #[allow(rustdoc::bare_urls)]
161 /// ISQ organization: `default` or `moqe` (Mixture of Quantized Experts: https://arxiv.org/abs/2310.02410).
162 #[arg(short, long)]
163 #[serde(default)]
164 organization: Option<IsqOrganization>,
165
166 /// UQFF path to write to.
167 #[arg(short, long)]
168 #[serde(default)]
169 write_uqff: Option<PathBuf>,
170
171 /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;)
172 #[arg(short, long)]
173 #[serde(default)]
174 from_uqff: Option<String>,
175
176 /// .imatrix file to enhance GGUF quantizations with.
177 /// Incompatible with `--calibration-file/-c`
178 #[arg(short, long)]
179 #[serde(default)]
180 imatrix: Option<PathBuf>,
181
182 /// Generate and utilize an imatrix to enhance GGUF quantizations.
183 /// Incompatible with `--imatrix/-i`
184 #[arg(short, long)]
185 #[serde(default)]
186 calibration_file: Option<PathBuf>,
187
188 /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
189 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
190 #[serde(default = "default_max_seq_len")]
191 max_seq_len: usize,
192
193 /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
194 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
195 #[serde(default = "default_max_batch_size")]
196 max_batch_size: usize,
197
198 /// Cache path for Hugging Face models downloaded locally
199 #[arg(long)]
200 #[serde(default)]
201 hf_cache_path: Option<PathBuf>,
202
203 /// Path to local Matryoshka Transformer configuration CSV file
204 #[arg(long)]
205 #[serde(default)]
206 matformer_config_path: Option<PathBuf>,
207
208 /// Name of the Matryoshka Transformer slice to use
209 #[arg(long)]
210 #[serde(default)]
211 matformer_slice_name: Option<String>,
212 },
213
214 /// Select an X-LoRA architecture
215 XLora {
216 /// Force a base model ID to load from instead of using the ordering file. This may be a HF hub repo or a local path.
217 #[arg(short, long)]
218 model_id: Option<String>,
219
220 /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
221 #[arg(short, long)]
222 tokenizer_json: Option<String>,
223
224 /// Model ID to load X-LoRA from. This may be a HF hub repo or a local path.
225 #[arg(short, long)]
226 xlora_model_id: String,
227
228 /// Ordering JSON file
229 #[arg(short, long)]
230 order: String,
231
232 /// Index of completion tokens to generate scalings up until. If this is 1, then there will be one completion token generated before it is cached.
233 /// This makes the maximum running sequences 1.
234 #[arg(long)]
235 tgt_non_granular_index: Option<usize>,
236
237 /// The architecture of the model.
238 #[arg(short, long, value_parser = parse_arch)]
239 arch: Option<NormalLoaderType>,
240
241 /// Model data type. Defaults to `auto`.
242 #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
243 dtype: ModelDType,
244
245 /// Path to a topology YAML file.
246 #[arg(long)]
247 topology: Option<String>,
248
249 /// UQFF path to write to.
250 #[arg(short, long)]
251 write_uqff: Option<PathBuf>,
252
253 /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
254 #[arg(short, long)]
255 from_uqff: Option<String>,
256
257 /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
258 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
259 max_seq_len: usize,
260
261 /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
262 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
263 max_batch_size: usize,
264
265 /// Cache path for Hugging Face models downloaded locally
266 #[arg(long)]
267 hf_cache_path: Option<PathBuf>,
268 },
269
270 /// Select a LoRA architecture
271 Lora {
272 /// Force a base model ID to load from instead of using the ordering file. This may be a HF hub repo or a local path.
273 #[arg(short, long)]
274 model_id: Option<String>,
275
276 /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
277 #[arg(short, long)]
278 tokenizer_json: Option<String>,
279
280 /// Model ID to load LoRA from. This may be a HF hub repo or a local path.
281 #[arg(short, long)]
282 adapter_model_id: String,
283
284 /// The architecture of the model.
285 #[arg(long, value_parser = parse_arch)]
286 arch: Option<NormalLoaderType>,
287
288 /// Model data type. Defaults to `auto`.
289 #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
290 dtype: ModelDType,
291
292 /// Path to a topology YAML file.
293 #[arg(long)]
294 topology: Option<String>,
295
296 /// UQFF path to write to.
297 #[arg(short, long)]
298 write_uqff: Option<PathBuf>,
299
300 /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
301 #[arg(short, long)]
302 from_uqff: Option<String>,
303
304 /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
305 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
306 max_seq_len: usize,
307
308 /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
309 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
310 max_batch_size: usize,
311
312 /// Cache path for Hugging Face models downloaded locally
313 #[arg(long)]
314 hf_cache_path: Option<PathBuf>,
315 },
316
317 /// Select a GGUF model.
318 GGUF {
319 /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
320 /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
321 /// removing all remote accesses.
322 #[arg(short, long)]
323 tok_model_id: Option<String>,
324
325 /// Quantized model ID to find the `quantized_filename`.
326 /// This may be a HF hub repo or a local path.
327 #[arg(short = 'm', long)]
328 quantized_model_id: String,
329
330 /// Quantized filename(s).
331 /// May be a single filename, or use a delimiter of " " (a single space) for multiple files.
332 #[arg(short = 'f', long)]
333 quantized_filename: String,
334
335 /// Model data type. Defaults to `auto`.
336 #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
337 dtype: ModelDType,
338
339 /// Path to a topology YAML file.
340 #[arg(long)]
341 topology: Option<String>,
342
343 /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
344 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
345 max_seq_len: usize,
346
347 /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
348 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
349 max_batch_size: usize,
350 },
351
352 /// Select a GGUF model with X-LoRA.
353 XLoraGGUF {
354 /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
355 /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
356 /// removing all remote accesses.
357 #[arg(short, long)]
358 tok_model_id: Option<String>,
359
360 /// Quantized model ID to find the `quantized_filename`.
361 /// This may be a HF hub repo or a local path.
362 #[arg(short = 'm', long)]
363 quantized_model_id: String,
364
365 /// Quantized filename(s).
366 /// May be a single filename, or use a delimiter of " " (a single space) for multiple files.
367 #[arg(short = 'f', long)]
368 quantized_filename: String,
369
370 /// Model ID to load X-LoRA from. This may be a HF hub repo or a local path.
371 #[arg(short, long)]
372 xlora_model_id: String,
373
374 /// Ordering JSON file
375 #[arg(short, long)]
376 order: String,
377
378 /// Index of completion tokens to generate scalings up until. If this is 1, then there will be one completion token generated before it is cached.
379 /// This makes the maximum running sequences 1.
380 #[arg(long)]
381 tgt_non_granular_index: Option<usize>,
382
383 /// Model data type. Defaults to `auto`.
384 #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
385 dtype: ModelDType,
386
387 /// Path to a topology YAML file.
388 #[arg(long)]
389 topology: Option<String>,
390
391 /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
392 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
393 max_seq_len: usize,
394
395 /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
396 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
397 max_batch_size: usize,
398 },
399
400 /// Select a GGUF model with LoRA.
401 LoraGGUF {
402 /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
403 /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
404 /// removing all remote accesses.
405 #[arg(short, long)]
406 tok_model_id: Option<String>,
407
408 /// Quantized model ID to find the `quantized_filename`.
409 /// This may be a HF hub repo or a local path.
410 #[arg(short = 'm', long)]
411 quantized_model_id: String,
412
413 /// Quantized filename(s).
414 /// May be a single filename, or use a delimiter of " " (a single space) for multiple files.
415 #[arg(short = 'f', long)]
416 quantized_filename: String,
417
418 /// Model ID to load LoRA from. This may be a HF hub repo or a local path.
419 #[arg(short, long)]
420 adapters_model_id: String,
421
422 /// Ordering JSON file
423 #[arg(short, long)]
424 order: String,
425
426 /// Model data type. Defaults to `auto`.
427 #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
428 dtype: ModelDType,
429
430 /// Path to a topology YAML file.
431 #[arg(long)]
432 topology: Option<String>,
433
434 /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
435 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
436 max_seq_len: usize,
437
438 /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
439 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
440 max_batch_size: usize,
441 },
442
443 /// Select a GGML model.
444 GGML {
445 /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
446 #[arg(short, long)]
447 tok_model_id: String,
448
449 /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
450 #[arg(long)]
451 tokenizer_json: Option<String>,
452
453 /// Quantized model ID to find the `quantized_filename`.
454 /// This may be a HF hub repo or a local path.
455 #[arg(short = 'm', long)]
456 quantized_model_id: String,
457
458 /// Quantized filename.
459 #[arg(short = 'f', long)]
460 quantized_filename: String,
461
462 /// GQA value
463 #[arg(short, long, default_value_t = 1)]
464 gqa: usize,
465
466 /// Model data type. Defaults to `auto`.
467 #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
468 dtype: ModelDType,
469
470 /// Path to a topology YAML file.
471 #[arg(long)]
472 topology: Option<String>,
473
474 /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
475 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
476 max_seq_len: usize,
477
478 /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
479 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
480 max_batch_size: usize,
481 },
482
483 /// Select a GGML model with X-LoRA.
484 XLoraGGML {
485 /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
486 #[arg(short, long)]
487 tok_model_id: Option<String>,
488
489 /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
490 #[arg(long)]
491 tokenizer_json: Option<String>,
492
493 /// Quantized model ID to find the `quantized_filename`.
494 /// This may be a HF hub repo or a local path.
495 #[arg(short = 'm', long)]
496 quantized_model_id: String,
497
498 /// Quantized filename.
499 #[arg(short = 'f', long)]
500 quantized_filename: String,
501
502 /// Model ID to load X-LoRA from. This may be a HF hub repo or a local path.
503 #[arg(short, long)]
504 xlora_model_id: String,
505
506 /// Ordering JSON file
507 #[arg(short, long)]
508 order: String,
509
510 /// Index of completion tokens to generate scalings up until. If this is 1, then there will be one completion token generated before it is cached.
511 /// This makes the maximum running sequences 1.
512 #[arg(long)]
513 tgt_non_granular_index: Option<usize>,
514
515 /// GQA value
516 #[arg(short, long, default_value_t = 1)]
517 gqa: usize,
518
519 /// Model data type. Defaults to `auto`.
520 #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
521 dtype: ModelDType,
522
523 /// Path to a topology YAML file.
524 #[arg(long)]
525 topology: Option<String>,
526
527 /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
528 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
529 max_seq_len: usize,
530
531 /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
532 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
533 max_batch_size: usize,
534 },
535
536 /// Select a GGML model with LoRA.
537 LoraGGML {
538 /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
539 #[arg(short, long)]
540 tok_model_id: Option<String>,
541
542 /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
543 #[arg(long)]
544 tokenizer_json: Option<String>,
545
546 /// Quantized model ID to find the `quantized_filename`.
547 /// This may be a HF hub repo or a local path.
548 #[arg(short = 'm', long)]
549 quantized_model_id: String,
550
551 /// Quantized filename.
552 #[arg(short = 'f', long)]
553 quantized_filename: String,
554
555 /// Model ID to load LoRA from. This may be a HF hub repo or a local path.
556 #[arg(short, long)]
557 adapters_model_id: String,
558
559 /// Ordering JSON file
560 #[arg(short, long)]
561 order: String,
562
563 /// GQA value
564 #[arg(short, long, default_value_t = 1)]
565 gqa: usize,
566
567 /// Model data type. Defaults to `auto`.
568 #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
569 dtype: ModelDType,
570
571 /// Path to a topology YAML file.
572 #[arg(long)]
573 topology: Option<String>,
574
575 /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
576 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
577 max_seq_len: usize,
578
579 /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
580 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
581 max_batch_size: usize,
582 },
583
584 /// Select a vision plain model, without quantization or adapters
585 VisionPlain {
586 /// Model ID to load from. This may be a HF hub repo or a local path.
587 #[arg(short, long)]
588 model_id: String,
589
590 /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
591 #[arg(short, long)]
592 tokenizer_json: Option<String>,
593
594 /// The architecture of the model.
595 #[arg(short, long, value_parser = parse_vision_arch)]
596 arch: Option<VisionLoaderType>,
597
598 /// Model data type. Defaults to `auto`.
599 #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
600 dtype: ModelDType,
601
602 /// Path to a topology YAML file.
603 #[arg(long)]
604 topology: Option<String>,
605
606 /// UQFF path to write to.
607 #[arg(short, long)]
608 write_uqff: Option<PathBuf>,
609
610 /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
611 #[arg(short, long)]
612 from_uqff: Option<String>,
613
614 /// Automatically resize and pad images to this maximum edge length. Aspect ratio is preserved.
615 /// This is only supported on the Qwen2-VL and Idefics models. Others handle this internally.
616 #[arg(short = 'e', long)]
617 max_edge: Option<u32>,
618
619 /// Generate and utilize an imatrix to enhance GGUF quantizations.
620 #[arg(short, long)]
621 calibration_file: Option<PathBuf>,
622
623 /// .cimatrix file to enhance GGUF quantizations with. This must be a .cimatrix file.
624 #[arg(short, long)]
625 imatrix: Option<PathBuf>,
626
627 /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
628 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
629 max_seq_len: usize,
630
631 /// Maximum prompt batch size to expect for this model. This affects automatic device mapping but is not a hard limit.
632 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_BATCH_SIZE)]
633 max_batch_size: usize,
634
635 /// Maximum prompt number of images to expect for this model. This affects automatic device mapping but is not a hard limit.
636 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_NUM_IMAGES)]
637 max_num_images: usize,
638
639 /// Maximum expected image size will have this edge length on both edges.
640 /// This affects automatic device mapping but is not a hard limit.
641 #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_IMAGE_LENGTH)]
642 max_image_length: usize,
643
644 /// Cache path for Hugging Face models downloaded locally
645 #[arg(long)]
646 hf_cache_path: Option<PathBuf>,
647
648 /// Path to local Matryoshka Transformer configuration CSV file
649 #[arg(long)]
650 matformer_config_path: Option<PathBuf>,
651
652 /// Name of the Matryoshka Transformer slice to use
653 #[arg(long)]
654 matformer_slice_name: Option<String>,
655 },
656
657 /// Select a diffusion model, without quantization or adapters
658 #[command(name = "diffusion")]
659 DiffusionPlain {
660 /// Model ID to load from. This may be a HF hub repo or a local path.
661 #[arg(short, long)]
662 model_id: String,
663
664 /// The architecture of the model.
665 #[arg(short, long, value_parser = parse_diffusion_arch)]
666 arch: DiffusionLoaderType,
667
668 /// Model data type. Defaults to `auto`.
669 #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
670 dtype: ModelDType,
671 },
672
673 Speech {
674 /// Model ID to load from. This may be a HF hub repo or a local path.
675 #[arg(short, long)]
676 model_id: String,
677
678 /// DAC Model ID to load from. If not provided, this is automatically downloaded from the default path for the model.
679 /// This may be a HF hub repo or a local path.
680 #[arg(short, long)]
681 dac_model_id: Option<String>,
682
683 /// The architecture of the model.
684 #[arg(short, long, value_parser = parse_speech_arch)]
685 arch: SpeechLoaderType,
686
687 /// Model data type. Defaults to `auto`.
688 #[arg(long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
689 dtype: ModelDType,
690 },
691
692 /// Select multi-model mode with configuration file
693 #[command(name = "multi-model")]
694 MultiModel {
695 /// Multi-model configuration file path (JSON format)
696 #[arg(short, long)]
697 config: String,
698
699 /// Default model ID to use when no model is specified in requests
700 #[arg(short, long)]
701 default_model_id: Option<String>,
702 },
703
704 /// Select an embedding model, without quantization or adapters
705 Embedding {
706 /// Model ID to load from. This may be a HF hub repo or a local path.
707 #[arg(short, long)]
708 model_id: String,
709
710 /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
711 #[arg(short, long)]
712 #[serde(default)]
713 tokenizer_json: Option<String>,
714
715 /// The architecture of the model.
716 #[arg(short, long, value_parser = parse_embedding_arch)]
717 #[serde(default)]
718 arch: Option<EmbeddingLoaderType>,
719
720 /// Model data type. Defaults to `auto`.
721 #[arg(short, long, default_value_t = ModelDType::Auto, value_parser = parse_model_dtype)]
722 #[serde(default = "default_model_dtype")]
723 dtype: ModelDType,
724
725 /// Path to a topology YAML file.
726 #[arg(long)]
727 #[serde(default)]
728 topology: Option<String>,
729
730 /// UQFF path to write to.
731 #[arg(short, long)]
732 #[serde(default)]
733 write_uqff: Option<PathBuf>,
734
735 /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;)
736 #[arg(short, long)]
737 #[serde(default)]
738 from_uqff: Option<String>,
739
740 /// Cache path for Hugging Face models downloaded locally
741 #[arg(long)]
742 #[serde(default)]
743 hf_cache_path: Option<PathBuf>,
744 },
745}