1#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
2
3use std::{any::Any, num::NonZeroUsize, sync::Arc};
4
5use candle_core::{Device, IndexOp, Result, Tensor};
6use image::{imageops::FilterType, DynamicImage, GenericImageView};
7use mistralrs_vision::{ApplyTransforms, Normalize, ToTensor, Transforms};
8use regex::Regex;
9use tokenizers::Tokenizer;
10use tracing::warn;
11
12use crate::{
13 device_map::DeviceMapper,
14 pipeline::{
15 text_models_inputs_processor::{
16 self, get_completion_input, get_prompt_input, PagedAttentionMeta,
17 },
18 InputProcessorOutput, InputsProcessor, InputsProcessorType, MessagesAction, Processor,
19 },
20 sequence::Sequence,
21 vision_models::ModelInputs,
22};
23
24use crate::vision_models::{
25 image_processor::{ImagePreProcessor, PreprocessedImages},
26 preprocessor_config::PreProcessorConfig,
27 processor_config::ProcessorConfig,
28};
29
30use super::MiniCpmOSpecificArgs;
31
// Fallback values used when the corresponding `PreProcessorConfig` field is `None`.
// Maximum number of slices an image may be split into.
const DEFAULT_MAX_SLICE_NUMS: usize = 9;
// Target side length (px) controlling the per-slice resolution budget.
const DEFAULT_SCALE_RESOLUTION: usize = 448;
// Side length (px) of one vision patch.
const DEFAULT_PATCH_SIZE: usize = 14;
// Number of placeholder (`<unk>`) tokens emitted per image/slice.
const DEFAULT_IMAGE_FEATURE_SIZE: usize = 64;
// Special tokens delimiting an image's placeholder run.
const DEFAULT_IM_START_TOKEN: &str = "<image>";
const DEFAULT_IM_END_TOKEN: &str = "</image>";
// Special tokens delimiting an image-index annotation (only when `use_image_id`).
const DEFAULT_IM_ID_START: &str = "<image_id>";
const DEFAULT_IM_ID_END: &str = "</image_id>";
// Special tokens delimiting a slice's placeholder run.
const DEFAULT_SLICE_START_TOKEN: &str = "<slice>";
const DEFAULT_SLICE_END_TOKEN: &str = "</slice>";
// Filler token repeated `image_feature_size` times inside each placeholder run.
const DEFAULT_UNK_TOKEN: &str = "<unk>";
// By default, image-index annotations are not emitted.
const DEFAULT_USE_IMAGE_ID: bool = false;
// By default, large images are sliced into a grid of sub-images.
const DEFAULT_SLICE_MODE: bool = true;
45
/// Inputs processor for MiniCPM-O: preprocesses images (slicing, normalization,
/// patch reshaping) and expands image tags in prompts into placeholder tokens.
pub struct MiniCpmOImageProcessor {
    // Preprocessor settings (slice mode, special-token strings, patch size, ...).
    config: PreProcessorConfig,
}
49
/// Top-level MiniCPM-O processor; its only job is to hand out
/// `MiniCpmOImageProcessor` instances carrying the preprocessor config.
pub struct MiniCpmOProcessor {
    // Cloned into each `MiniCpmOImageProcessor` created by `inputs_processor`.
    preprocessor_config: PreProcessorConfig,
}
53
54impl MiniCpmOProcessor {
55 pub fn new(
56 _config: ProcessorConfig,
57 preprocessor_config: PreProcessorConfig,
58 _max_edge: Option<u32>,
59 ) -> Self {
60 Self {
61 preprocessor_config,
62 }
63 }
64}
65
66impl Processor for MiniCpmOProcessor {
67 fn inputs_processor(&self) -> Arc<dyn InputsProcessor> {
68 Arc::new(MiniCpmOImageProcessor {
69 config: self.preprocessor_config.clone(),
70 })
71 }
72
73 fn get_special_tokens(&self) -> &[&'static str] {
74 &[
75 DEFAULT_IM_START_TOKEN,
76 DEFAULT_IM_END_TOKEN,
77 DEFAULT_SLICE_START_TOKEN,
78 DEFAULT_SLICE_END_TOKEN,
79 DEFAULT_UNK_TOKEN,
80 ]
81 }
82
83 fn template_action(&self) -> MessagesAction {
84 MessagesAction::FlattenOnlyText
85 }
86}
87
impl InputsProcessor for MiniCpmOImageProcessor {
    fn get_type(&self) -> InputsProcessorType {
        InputsProcessorType::Vision
    }

    /// Build `ModelInputs` for a batch of sequences.
    ///
    /// When every sequence carries images, this:
    /// 1. preprocesses each sequence's images into per-slice pixel values and
    ///    target sizes (via `ImagePreProcessor::preprocess`),
    /// 2. decodes the prompt, replaces each `(<image>./</image>)` tag with the
    ///    MiniCPM-O placeholder string (image/slice special tokens), and
    ///    re-tokenizes the result,
    /// 3. records the token-index bounds of every image/slice placeholder run
    ///    so the model can splice vision embeddings into the text stream.
    ///
    /// Returns a single-element iterator yielding the assembled inputs.
    fn process_inputs(
        &self,
        tokenizer: Option<Arc<Tokenizer>>,
        input_seqs: &mut [&mut Sequence],
        is_prompt: bool,
        is_xlora: bool,
        device: &Device,
        no_kv_cache: bool,
        last_n_context_len: Option<(usize, usize)>,
        return_raw_logits: bool,
        other_config: Option<Arc<dyn Any>>,
        mut paged_attn_metadata: Option<PagedAttentionMeta>,
        prompt_chunksize: Option<NonZeroUsize>,
        mapper: Option<&dyn DeviceMapper>,
    ) -> Box<dyn Iterator<Item = anyhow::Result<InputProcessorOutput>>> {
        // X-LoRA and cache-less operation are unsupported for this vision model.
        if is_xlora {
            return Box::new(std::iter::once(Err(anyhow::Error::msg(
                "Cannot make inputs for X-LoRA vision model.",
            ))));
        }
        if no_kv_cache {
            return Box::new(std::iter::once(Err(anyhow::Error::msg(
                "Vision model must have kv cache.",
            ))));
        }
        // Prompt chunking is ignored with a warning rather than an error.
        if prompt_chunksize.is_some() {
            warn!("`prompt_chunksize` is set. MiniCpm-O does not support prompt batching.");
        }
        let Some(tokenizer) = tokenizer else {
            return Box::new(std::iter::once(Err(anyhow::Error::msg(
                "MiniCpmOImageProcessor requires a specified tokenizer.",
            ))));
        };

        let config = other_config.expect("Need a PreProcessorConfig config.");
        let config: &PreProcessorConfig = config.downcast_ref().expect("Downcast failed.");

        // NOTE(review): `all` means one image-less sequence disables image
        // handling for the entire batch — presumably the scheduler batches
        // image-bearing sequences together; confirm.
        let has_images = input_seqs.iter().all(|seq| seq.has_images());

        let (pixel_values_all, image_bound, tgt_sizes) = if has_images {
            // Literal tag inserted by the chat template for each image, and the
            // regex forms used to locate image/audio tags in the decoded prompt.
            const IMAGE_TAG: &str = "(<image>./</image>)";
            const IMAGE_PATTERN: &str = r"\(<image>./</image>\)";
            const AUDIO_PATTERN: &str = r"\(<audio>./</audio>\)";

            let image_pattern = Regex::new(IMAGE_PATTERN).unwrap();
            let _audio_pattern = Regex::new(AUDIO_PATTERN).unwrap();
            let split_pattern = Regex::new(&format!(r"({IMAGE_PATTERN}|{AUDIO_PATTERN})")).unwrap();

            // Per-sequence accumulators, one entry per sequence in the batch.
            let mut pixel_values_accum = Vec::new();
            let mut tgt_sizes_accum = Vec::new();
            let mut image_bounds_accum = Vec::new();

            for seq in input_seqs.iter_mut() {
                // Only the list-form pixel values, per-slice target sizes, and
                // original image sizes are produced by this preprocessor.
                let PreprocessedImages {
                    pixel_values: _,
                    pixel_attention_mask: _,
                    image_sizes: _,
                    num_img_tokens: _,
                    aspect_ratio_ids: _,
                    aspect_ratio_mask: _,
                    num_tiles: _,
                    image_grid_thw: _,
                    video_grid_thw: _,
                    rows: _,
                    cols: _,
                    pixel_values_list,
                    tgt_sizes,
                    image_sizes_all,
                    num_crops: _,
                } = self
                    .preprocess(
                        seq.take_images()
                            .expect("Need to have images by this point."),
                        vec![],
                        config,
                        device,
                        // Batch size / max image count are unused by this
                        // preprocessor.
                        (usize::MAX, usize::MAX),
                    )
                    .expect("Preprocessing failed");
                let pixel_values_list = pixel_values_list.unwrap();
                let tgt_sizes = tgt_sizes.unwrap();
                let image_sizes_all = image_sizes_all.unwrap();

                // Recover the prompt text so image tags can be rewritten.
                let text = tokenizer
                    .decode(seq.get_toks(), false)
                    .expect("Detokenization failed!");

                // Split the text on image/audio tags, keeping the tags as their
                // own chunks (the `true`/`false` flag is discarded afterwards).
                let mut text_chunks = {
                    let mut results = Vec::new();
                    let mut last_end = 0;

                    for m in split_pattern.find_iter(&text) {
                        if m.start() > last_end {
                            results.push((false, &text[last_end..m.start()]));
                        }
                        results.push((true, m.as_str()));
                        last_end = m.end();
                    }
                    if last_end < text.len() {
                        results.push((false, &text[last_end..]));
                    }

                    results
                        .into_iter()
                        .map(|(_, x)| x.to_string())
                        .collect::<Vec<_>>()
                };

                let image_tags = image_pattern.find_iter(&text).collect::<Vec<_>>();

                // One preprocessed image per tag is required.
                if !image_tags.is_empty() {
                    assert_eq!(image_tags.len(), image_sizes_all.len());
                }

                // Replace each image tag with its full placeholder string
                // (image tokens plus optional slice grid).
                let mut image_id = 0;
                for chunk in &mut text_chunks {
                    if chunk == IMAGE_TAG {
                        *chunk =
                            self.get_slice_image_placeholder(image_sizes_all[image_id], image_id);
                        image_id += 1;
                    }
                }

                let final_text = text_chunks.join("");

                let input_ids = tokenizer
                    .encode_fast(final_text.clone(), false)
                    .unwrap()
                    .get_ids()
                    .to_vec();

                // Swap the sequence's tokens for the expanded prompt exactly
                // once (subsequent decode steps reuse the expanded prompt).
                if !seq.multimodal.has_changed_prompt {
                    seq.set_initial_prompt(final_text.clone());

                    seq.set_toks_and_reallocate(input_ids.clone(), paged_attn_metadata.as_mut());
                    seq.multimodal.has_changed_prompt = true;
                }

                // Compute the [start, end) token-index bounds of every image and
                // slice placeholder run, as an (n, 2) tensor.
                let image_bounds = {
                    // NOTE(review): `get_ids()[0]` assumes each special token
                    // encodes to exactly one id — holds if they are registered
                    // as special tokens; confirm.
                    let im_start_id = tokenizer
                        .encode_fast(
                            self.config
                                .im_start_token
                                .clone()
                                .unwrap_or(DEFAULT_IM_START_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];
                    let im_end_id = tokenizer
                        .encode_fast(
                            self.config
                                .im_end_token
                                .clone()
                                .unwrap_or(DEFAULT_IM_END_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];
                    let slice_start_id = tokenizer
                        .encode_fast(
                            self.config
                                .slice_start_token
                                .clone()
                                .unwrap_or(DEFAULT_SLICE_START_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];
                    let slice_end_id = tokenizer
                        .encode_fast(
                            self.config
                                .slice_end_token
                                .clone()
                                .unwrap_or(DEFAULT_SLICE_END_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];

                    // Bounds start just after the start token...
                    let image_start_idx = input_ids
                        .iter()
                        .enumerate()
                        .filter_map(|(i, &id)| {
                            if id == im_start_id || id == slice_start_id {
                                Some(i as u32 + 1)
                            } else {
                                None
                            }
                        })
                        .collect::<Vec<_>>();

                    // ...and end at (exclusive) the end token.
                    let image_end_idx = input_ids
                        .iter()
                        .enumerate()
                        .filter_map(|(i, &id)| {
                            if id == im_end_id || id == slice_end_id {
                                Some(i as u32)
                            } else {
                                None
                            }
                        })
                        .collect::<Vec<_>>();

                    // NOTE(review): `.max` mirrors the upstream implementation,
                    // but if the start/end counts ever differ the slice below
                    // panics on the shorter vec; `.min` would be the defensive
                    // choice — confirm counts always match here.
                    let valid_image_nums = image_start_idx.len().max(image_end_idx.len());

                    let image_start_idx = Tensor::from_slice(
                        &image_start_idx[..valid_image_nums],
                        (valid_image_nums, 1),
                        device,
                    )
                    .unwrap();
                    let image_end_idx = Tensor::from_slice(
                        &image_end_idx[..valid_image_nums],
                        (valid_image_nums, 1),
                        device,
                    )
                    .unwrap();

                    Tensor::cat(&[image_start_idx, image_end_idx], 1).unwrap()
                };

                pixel_values_accum.push(pixel_values_list);
                tgt_sizes_accum.push(tgt_sizes);
                image_bounds_accum.push(image_bounds);
            }

            (
                Some(pixel_values_accum),
                Some(image_bounds_accum),
                Some(tgt_sizes_accum),
            )
        } else {
            (None, None, None)
        };

        // Standard text-model input assembly (positions, context lens, paged
        // attention / flash metadata) over the possibly-expanded token ids.
        let text_models_inputs_processor::InnerInputProcessorOutput {
            inputs:
                text_models_inputs_processor::InputMetadata {
                    input,
                    positions,
                    context_lens,
                    position_ids,
                    paged_attn_meta,
                    flash_meta,
                },
            seq_indices,
        } = if is_prompt {
            get_prompt_input(
                input_seqs
                    .iter()
                    .map(|seq| seq.get_toks())
                    .collect::<Vec<_>>(),
                input_seqs,
                device,
                last_n_context_len,
                return_raw_logits,
                paged_attn_metadata.as_mut(),
                None,
                mapper,
            )
            .nth(0)
            .unwrap()
            .unwrap()
        } else {
            get_completion_input(
                input_seqs
                    .iter()
                    .map(|seq| seq.get_toks())
                    .collect::<Vec<_>>(),
                input_seqs,
                device,
                no_kv_cache,
                last_n_context_len,
                return_raw_logits,
                paged_attn_metadata.as_mut(),
                None,
                mapper,
            )
            .nth(0)
            .unwrap()
            .unwrap()
        };

        // Vision data rides in the model-specific args; `pixel_values` on the
        // generic `ModelInputs` stays `None` for this model.
        let args = MiniCpmOSpecificArgs {
            pixel_values_all,
            tgt_sizes,
            image_bound,
        };

        let inputs: Box<dyn Any> = Box::new(ModelInputs {
            input_ids: input,
            seqlen_offsets: positions,
            context_lens,
            position_ids,
            pixel_values: None,
            model_specific_args: Box::new(args),
            paged_attn_meta,
            flash_meta,
        });
        Box::new(std::iter::once(Ok(InputProcessorOutput {
            inputs,
            seq_indices,
        })))
    }
}
402
403impl MiniCpmOImageProcessor {
404 fn get_sliced_grid(
405 &self,
406 (w, h): (usize, usize),
407 max_slice_nums: usize,
408 scale_resolution: usize,
409 never_split: bool,
410 ) -> Option<(usize, usize)> {
411 let log_ratio = ((w / h) as f32).ln();
412 let ratio = (w * h) as f32 / (scale_resolution * scale_resolution) as f32;
413 let multiple = ratio.ceil().min(max_slice_nums as f32);
414 if multiple <= 1. || never_split {
415 return None;
416 }
417
418 let mut candidate_split_grid_nums = Vec::new();
419 for i in [multiple - 1., multiple, multiple + 1.] {
420 if i == 1. || i > max_slice_nums as f32 {
421 continue;
422 }
423 candidate_split_grid_nums.push(i);
424 }
425
426 let mut candidate_grids = Vec::new();
427 for split_grid_nums in candidate_split_grid_nums {
428 let mut m = 1.;
429 while m <= split_grid_nums {
430 if split_grid_nums % m == 0. {
431 candidate_grids.push((m as usize, split_grid_nums as usize / m as usize));
432 }
433 m += 1.;
434 }
435 }
436
437 let mut best_grid = (1, 1);
438 let mut min_error = f32::INFINITY;
439 for grid in candidate_grids {
440 let error = (log_ratio - (grid.0 as f32 / grid.1 as f32).ln()).abs();
441 if error < min_error {
442 best_grid = grid;
443 min_error = error;
444 }
445 }
446
447 Some(best_grid)
448 }
449
450 fn ensure_divide(&self, length: usize, patch_size: usize) -> usize {
451 ((length as f32 / patch_size as f32).round() * patch_size as f32).max(patch_size as f32)
452 as usize
453 }
454
455 fn find_best_resize(
456 &self,
457 (mut w, mut h): (usize, usize),
458 scale_resolution: usize,
459 patch_size: usize,
460 allow_upscale: bool,
461 ) -> (usize, usize) {
462 if w * h > scale_resolution * scale_resolution || allow_upscale {
463 let r = w as f32 / h as f32;
464 h = (scale_resolution as f32 / r.sqrt()) as usize;
465 w = (scale_resolution as f32 * r) as usize;
466 }
467 let best_w = self.ensure_divide(w, patch_size);
468 let best_h = self.ensure_divide(h, patch_size);
469 (best_w, best_h)
470 }
471
472 fn get_refine_size(
473 &self,
474 (w, h): (usize, usize),
475 (grid_x, grid_y): (usize, usize),
476 scale_resolution: usize,
477 patch_size: usize,
478 allow_upscale: bool,
479 ) -> (usize, usize) {
480 let refine_w = self.ensure_divide(w, grid_x);
481 let refine_h = self.ensure_divide(h, grid_y);
482
483 let grid_w = refine_h / grid_x;
484 let grid_h = refine_w / grid_y;
485
486 let best_grid_size = self.find_best_resize(
487 (grid_w, grid_h),
488 scale_resolution,
489 patch_size,
490 allow_upscale,
491 );
492
493 (best_grid_size.0 * grid_x, best_grid_size.1 * grid_y)
494 }
495
496 fn split_to_patches(
497 &self,
498 image: &DynamicImage,
499 grid: (usize, usize),
500 ) -> Vec<Vec<DynamicImage>> {
501 let mut patches = Vec::new();
502 let (w, h) = image.dimensions();
503 let (w, h) = (w as usize, h as usize);
504 let grid_x = w / grid.0;
505 let grid_y = h / grid.1;
506 for i in (0..h).step_by(grid_y) {
507 let mut images = Vec::new();
508 for j in (0..w).step_by(grid_x) {
509 images.push(image.crop_imm(j as u32, i as u32, grid_x as u32, grid_y as u32));
510 }
511 patches.push(images);
512 }
513 patches
514 }
515
516 fn get_sliced_images(
517 &self,
518 image: &DynamicImage,
519 max_slice_nums: usize,
520 scale_resolution: usize,
521 patch_size: usize,
522 ) -> Vec<DynamicImage> {
523 if !self.config.slice_mode.unwrap_or(DEFAULT_SLICE_MODE) {
524 return vec![image.clone()];
525 }
526
527 let dims = image.dimensions();
528 let (w, h) = (dims.0 as usize, dims.1 as usize);
529
530 let best_grid = self.get_sliced_grid((w, h), max_slice_nums, scale_resolution, false);
531
532 let (source_images, patches) = if let Some(best_grid) = best_grid {
533 let best_resize = self.find_best_resize((w, h), scale_resolution, patch_size, false);
535 let source_image = image.resize_exact(
536 best_resize.0 as u32,
537 best_resize.1 as u32,
538 FilterType::CatmullRom,
539 );
540 let refine_size =
541 self.get_refine_size((w, h), best_grid, scale_resolution, patch_size, true);
542 let refine_image = image.resize_exact(
543 refine_size.0 as u32,
544 refine_size.1 as u32,
545 FilterType::CatmullRom,
546 );
547 let patches = self
548 .split_to_patches(&refine_image, best_grid)
549 .into_iter()
550 .flatten()
551 .collect::<Vec<_>>();
552
553 (source_image, patches)
554 } else {
555 let best_size = self.find_best_resize((w, h), scale_resolution, patch_size, true);
557 let source_images = image.resize_exact(
558 best_size.0 as u32,
559 best_size.1 as u32,
560 FilterType::CatmullRom,
561 );
562
563 (source_images, vec![])
564 };
565
566 [vec![source_images], patches].concat()
567 }
568
569 fn reshape_by_patch(&self, image: &Tensor, patch_size: usize) -> Result<Tensor> {
572 let (_c, h, w) = image.dims3()?;
574 let (kh, kw) = (patch_size, patch_size);
576 let (sh, sw) = (patch_size, patch_size);
578
579 let out_h = (h - kh) / sh + 1;
580 let out_w = (w - kw) / sw + 1;
581
582 let mut patches = Vec::new();
583 for i in 0..out_h {
584 for j in 0..out_w {
585 let patch = image.i((.., i * sh..i * sh + kh, j * sw..j * sw + kw))?;
587 patches.push(patch.flatten_all()?);
589 }
590 }
591 let mut patches = Tensor::stack(&patches, 1)?;
593
594 patches = patches.reshape((image.dim(0)?, patch_size, patch_size, ()))?;
595 patches
596 .permute((0, 1, 3, 2))?
597 .reshape((image.dim(0)?, patch_size, ()))
598 }
599
600 fn get_image_id_placeholder(&self, image_idx: usize) -> String {
601 format!(
602 "{}{image_idx}{}",
603 self.config
604 .im_id_start
605 .clone()
606 .unwrap_or(DEFAULT_IM_ID_START.to_string()),
607 self.config
608 .im_id_end
609 .clone()
610 .unwrap_or(DEFAULT_IM_ID_END.to_string())
611 )
612 }
613
614 fn get_grid_placeholder(&self, grid: Option<(usize, usize)>) -> String {
615 if let Some(grid) = grid {
616 let slice_image_placeholder = format!(
617 "{}{}{}",
618 self.config
619 .slice_start_token
620 .clone()
621 .unwrap_or(DEFAULT_SLICE_START_TOKEN.to_string()),
622 self.config
623 .unk_token
624 .clone()
625 .unwrap_or(DEFAULT_UNK_TOKEN.to_string())
626 .repeat(
627 self.config
628 .image_feature_size
629 .unwrap_or(DEFAULT_IMAGE_FEATURE_SIZE)
630 ),
631 self.config
632 .slice_end_token
633 .clone()
634 .unwrap_or(DEFAULT_SLICE_END_TOKEN.to_string())
635 );
636
637 let (cols, rows) = grid;
638 let mut slices = Vec::new();
639 for _ in 0..rows {
640 let mut lines = Vec::new();
641 for _ in 0..cols {
642 lines.push(slice_image_placeholder.clone());
643 }
644 slices.push(lines.join(""));
645 }
646
647 slices.join("\n")
648 } else {
649 "".to_string()
650 }
651 }
652
653 fn get_slice_image_placeholder(&self, image_size: (u32, u32), image_idx: usize) -> String {
654 let max_slice_nums = self.config.max_slice_nums.unwrap_or(DEFAULT_MAX_SLICE_NUMS);
655 let use_image_id = self.config.use_image_id.unwrap_or(DEFAULT_USE_IMAGE_ID);
656 let slice_mode = self.config.slice_mode.unwrap_or(DEFAULT_SLICE_MODE);
657
658 let grid = self.get_sliced_grid(
659 (image_size.0 as usize, image_size.1 as usize),
660 max_slice_nums,
661 DEFAULT_SCALE_RESOLUTION,
662 false,
663 );
664
665 let image_placeholder = format!(
666 "{}{}{}",
667 self.config
668 .im_start_token
669 .clone()
670 .unwrap_or(DEFAULT_IM_START_TOKEN.to_string()),
671 self.config
672 .unk_token
673 .clone()
674 .unwrap_or(DEFAULT_UNK_TOKEN.to_string())
675 .repeat(
676 self.config
677 .image_feature_size
678 .unwrap_or(DEFAULT_IMAGE_FEATURE_SIZE)
679 ),
680 self.config
681 .im_end_token
682 .clone()
683 .unwrap_or(DEFAULT_IM_END_TOKEN.to_string())
684 );
685
686 let final_placeholder = if use_image_id {
687 format!(
688 "{}{image_placeholder}",
689 self.get_image_id_placeholder(image_idx)
690 )
691 } else {
692 image_placeholder
693 };
694
695 if slice_mode {
696 format!("{final_placeholder}{}", self.get_grid_placeholder(grid))
697 } else {
698 final_placeholder
699 }
700 }
701}
702
impl ImagePreProcessor for MiniCpmOImageProcessor {
    #[allow(clippy::excessive_precision)]
    const DEFAULT_MEAN: [f64; 3] = [0.5, 0.5, 0.5];
    #[allow(clippy::excessive_precision)]
    const DEFAULT_STD: [f64; 3] = [0.5, 0.5, 0.5];

    /// Slice, normalize, and patch-reshape a batch of images.
    ///
    /// Each input image is expanded via `get_sliced_images` into a source
    /// image plus optional refined slices; every slice is normalized and
    /// reshaped with `reshape_by_patch`. For each slice, `tgt_sizes` records
    /// `(h / patch_size, w / patch_size)` as a `(1, 2)` row, concatenated to
    /// an `(n_slices, 2)` tensor at the end.
    ///
    /// The returned `pixel_values` field is a dummy scalar; consumers must use
    /// `pixel_values_list` (one tensor per slice), `tgt_sizes`, and
    /// `image_sizes_all` (the original `(w, h)` of each input image).
    fn preprocess(
        &self,
        images: Vec<DynamicImage>,
        _videos: Vec<Vec<DynamicImage>>,
        config: &PreProcessorConfig,
        device: &Device,
        (_bs, _max_num_images): (usize, usize),
    ) -> Result<PreprocessedImages> {
        let mut pixel_values = Vec::new();
        let mut tgt_sizes = Vec::new();
        // Original dimensions of every input image, needed later to rebuild
        // the prompt placeholders.
        let image_sizes = images
            .iter()
            .map(|img| img.dimensions())
            .collect::<Vec<_>>();
        for image in images {
            let max_slice_nums = config.max_slice_nums.unwrap_or(DEFAULT_MAX_SLICE_NUMS);
            let scale_resolution = config.scale_resolution.unwrap_or(DEFAULT_SCALE_RESOLUTION);
            let patch_size = config.patch_size.unwrap_or(DEFAULT_PATCH_SIZE);

            let image_patches =
                self.get_sliced_images(&image, max_slice_nums, scale_resolution, patch_size);

            for slice_image in image_patches {
                let (w, h) = slice_image.dimensions();
                // Normalize with mean/std 0.5 — maps to [-1, 1] assuming
                // `ToTensor` scales pixels to [0, 1] (confirm in
                // mistralrs_vision).
                let to_tensor_rescale = Transforms {
                    input: &ToTensor,
                    inner_transforms: &[&Normalize {
                        mean: config.image_mean.unwrap_or(Self::DEFAULT_MEAN).to_vec(),
                        std: config.image_std.unwrap_or(Self::DEFAULT_STD).to_vec(),
                    }],
                };
                let mut image = slice_image.apply(to_tensor_rescale, device)?;
                image = self.reshape_by_patch(&image, patch_size)?;
                pixel_values.push(image);
                // Height first, then width: number of patches per dimension.
                tgt_sizes.push(Tensor::from_vec(
                    vec![h / patch_size as u32, w / patch_size as u32],
                    (1, 2),
                    &Device::Cpu,
                )?);
            }
        }

        // NOTE(review): `Tensor::cat` over an empty `tgt_sizes` (zero input
        // images) would error; callers appear to guarantee at least one image.
        let tgt_sizes = Tensor::cat(&tgt_sizes, 0)?.to_device(device)?;
        Ok(PreprocessedImages {
            // Placeholder scalar — real data lives in `pixel_values_list`.
            pixel_values: Tensor::new(0u32, &Device::Cpu)?,
            pixel_attention_mask: None,
            image_sizes: None,
            num_img_tokens: None,
            aspect_ratio_ids: None,
            aspect_ratio_mask: None,
            num_tiles: None,
            image_grid_thw: None,
            video_grid_thw: None,
            rows: None,
            cols: None,
            pixel_values_list: Some(pixel_values),
            tgt_sizes: Some(tgt_sizes),
            image_sizes_all: Some(image_sizes),
            num_crops: None,
        })
    }
}