#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]

use std::{any::Any, num::NonZeroUsize, sync::Arc};

use candle_core::{Device, IndexOp, Result, Tensor};
use image::{imageops::FilterType, DynamicImage, GenericImageView};
use mistralrs_vision::{ApplyTransforms, Normalize, ToTensor, Transforms};
use regex::Regex;
use tokenizers::Tokenizer;
use tracing::warn;

use crate::{
    device_map::DeviceMapper,
    pipeline::{
        text_models_inputs_processor::{
            self, get_completion_input, get_prompt_input, PagedAttentionMeta,
        },
        InputProcessorOutput, InputsProcessor, InputsProcessorType, MessagesAction, Processor,
    },
    sequence::Sequence,
    vision_models::ModelInputs,
};

use crate::vision_models::{
    image_processor::{ImagePreProcessor, PreprocessedImages},
    preprocessor_config::PreProcessorConfig,
    processor_config::ProcessorConfig,
};

use super::MiniCpmOSpecificArgs;

const DEFAULT_MAX_SLICE_NUMS: usize = 9;
const DEFAULT_SCALE_RESOLUTION: usize = 448;
const DEFAULT_PATCH_SIZE: usize = 14;
const DEFAULT_IMAGE_FEATURE_SIZE: usize = 64;
const DEFAULT_IM_START_TOKEN: &str = "<image>";
const DEFAULT_IM_END_TOKEN: &str = "</image>";
const DEFAULT_IM_ID_START: &str = "<image_id>";
const DEFAULT_IM_ID_END: &str = "</image_id>";
const DEFAULT_SLICE_START_TOKEN: &str = "<slice>";
const DEFAULT_SLICE_END_TOKEN: &str = "</slice>";
const DEFAULT_UNK_TOKEN: &str = "<unk>";
const DEFAULT_USE_IMAGE_ID: bool = false;
const DEFAULT_SLICE_MODE: bool = true;

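// Illustrative sketch (not captured output): with the default tokens above, one
// sliced image with grid (2, 1) expands in the prompt to
//
//     <image><unk>...<unk></image><slice><unk>...<unk></slice><slice><unk>...<unk></slice>
//
// where each block holds `image_feature_size` (64 by default) `<unk>` tokens, and
// rows of slices, when there is more than one, are separated by '\n'. See
// `get_slice_image_placeholder` and `get_grid_placeholder` below.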
pub struct MiniCpmOImageProcessor {
    config: PreProcessorConfig,
}

pub struct MiniCpmOProcessor {
    preprocessor_config: PreProcessorConfig,
}

impl MiniCpmOProcessor {
    pub fn new(
        _config: ProcessorConfig,
        preprocessor_config: PreProcessorConfig,
        _max_edge: Option<u32>,
    ) -> Self {
        Self {
            preprocessor_config,
        }
    }
}

impl Processor for MiniCpmOProcessor {
    fn inputs_processor(&self) -> Arc<dyn InputsProcessor> {
        Arc::new(MiniCpmOImageProcessor {
            config: self.preprocessor_config.clone(),
        })
    }

    fn get_special_tokens(&self) -> &[&'static str] {
        &[
            DEFAULT_IM_START_TOKEN,
            DEFAULT_IM_END_TOKEN,
            DEFAULT_SLICE_START_TOKEN,
            DEFAULT_SLICE_END_TOKEN,
            DEFAULT_UNK_TOKEN,
        ]
    }

    fn template_action(&self) -> MessagesAction {
        MessagesAction::FlattenOnlyText
    }
}

impl InputsProcessor for MiniCpmOImageProcessor {
    fn get_type(&self) -> InputsProcessorType {
        InputsProcessorType::Vision
    }
    fn process_inputs(
        &self,
        tokenizer: Option<Arc<Tokenizer>>,
        input_seqs: &mut [&mut Sequence],
        is_prompt: bool,
        is_xlora: bool,
        device: &Device,
        no_kv_cache: bool,
        last_n_context_len: Option<(usize, usize)>,
        return_raw_logits: bool,
        other_config: Option<Arc<dyn Any>>,
        mut paged_attn_metadata: Option<PagedAttentionMeta<'_>>,
        prompt_chunksize: Option<NonZeroUsize>,
        mapper: Option<&dyn DeviceMapper>,
    ) -> Box<dyn Iterator<Item = anyhow::Result<InputProcessorOutput>>> {
        if is_xlora {
            return Box::new(std::iter::once(Err(anyhow::Error::msg(
                "Cannot make inputs for X-LoRA vision model.",
            ))));
        }
        if no_kv_cache {
            return Box::new(std::iter::once(Err(anyhow::Error::msg(
                "Vision model must have kv cache.",
            ))));
        }
        if prompt_chunksize.is_some() {
            warn!("`prompt_chunksize` is set. MiniCPM-O does not support prompt chunking.");
        }
        let Some(tokenizer) = tokenizer else {
            return Box::new(std::iter::once(Err(anyhow::Error::msg(
                "MiniCpmOImageProcessor requires a specified tokenizer.",
            ))));
        };

        let config = other_config.expect("Need a PreProcessorConfig config.");
        let config: &PreProcessorConfig = config.downcast_ref().expect("Downcast failed.");

        let has_images = input_seqs.iter().all(|seq| seq.has_images());

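        // For each sequence: preprocess its images, expand every image tag in the
        // decoded prompt into placeholder token runs, re-tokenize, and record the
        // token ranges the vision embeddings will later be scattered into.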
        let (pixel_values_all, image_bound, tgt_sizes) = if has_images {
            const IMAGE_TAG: &str = "(<image>./</image>)";
            const IMAGE_PATTERN: &str = r"\(<image>./</image>\)";
            const AUDIO_PATTERN: &str = r"\(<audio>./</audio>\)";

            let image_pattern = Regex::new(IMAGE_PATTERN).unwrap();
            let _audio_pattern = Regex::new(AUDIO_PATTERN).unwrap();
            let split_pattern = Regex::new(&format!(r"({IMAGE_PATTERN}|{AUDIO_PATTERN})")).unwrap();

            let mut pixel_values_accum = Vec::new();
            let mut tgt_sizes_accum = Vec::new();
            let mut image_bounds_accum = Vec::new();

            for seq in input_seqs.iter_mut() {
                let PreprocessedImages {
                    pixel_values: _,
                    pixel_attention_mask: _,
                    image_sizes: _,
                    num_img_tokens: _,
                    aspect_ratio_ids: _,
                    aspect_ratio_mask: _,
                    num_tiles: _,
                    image_grid_thw: _,
                    video_grid_thw: _,
                    rows: _,
                    cols: _,
                    pixel_values_list,
                    tgt_sizes,
                    image_sizes_all,
                    num_crops: _,
                } = self
                    .preprocess(
                        seq.take_images()
                            .expect("Need to have images by this point."),
                        vec![],
                        config,
                        device,
                        (usize::MAX, usize::MAX),
                    )
                    .expect("Preprocessing failed");
                let pixel_values_list = pixel_values_list.unwrap();
                let tgt_sizes = tgt_sizes.unwrap();
                let image_sizes_all = image_sizes_all.unwrap();

                let text = tokenizer
                    .decode(seq.get_toks(), false)
                    .expect("Detokenization failed!");

                // Split the decoded prompt into chunks, keeping each image/audio
                // tag as a standalone chunk so it can be replaced below.
                let mut text_chunks = {
                    let mut results = Vec::new();
                    let mut last_end = 0;

                    for m in split_pattern.find_iter(&text) {
                        if m.start() > last_end {
                            results.push((false, &text[last_end..m.start()]));
                        }
                        results.push((true, m.as_str()));
                        last_end = m.end();
                    }
                    if last_end < text.len() {
                        results.push((false, &text[last_end..]));
                    }

                    results
                        .into_iter()
                        .map(|(_, x)| x.to_string())
                        .collect::<Vec<_>>()
                };

                let image_tags = image_pattern.find_iter(&text).collect::<Vec<_>>();

                if !image_tags.is_empty() {
                    assert_eq!(image_tags.len(), image_sizes_all.len());
                }

                // Replace each image tag with its expanded placeholder sequence.
                let mut image_id = 0;
                for chunk in &mut text_chunks {
                    if chunk == IMAGE_TAG {
                        *chunk =
                            self.get_slice_image_placeholder(image_sizes_all[image_id], image_id);
                        image_id += 1;
                    }
                }

                let final_text = text_chunks.join("");
                seq.set_initial_prompt(final_text.clone());

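                // Each row of `image_bounds` is a [start, end) pair of token
                // indices bracketing one image or slice placeholder run in
                // `input_ids`.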
                let image_bounds = {
                    let im_start_id = tokenizer
                        .encode_fast(
                            self.config
                                .im_start_token
                                .clone()
                                .unwrap_or(DEFAULT_IM_START_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];
                    let im_end_id = tokenizer
                        .encode_fast(
                            self.config
                                .im_end_token
                                .clone()
                                .unwrap_or(DEFAULT_IM_END_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];
                    let slice_start_id = tokenizer
                        .encode_fast(
                            self.config
                                .slice_start_token
                                .clone()
                                .unwrap_or(DEFAULT_SLICE_START_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];
                    let slice_end_id = tokenizer
                        .encode_fast(
                            self.config
                                .slice_end_token
                                .clone()
                                .unwrap_or(DEFAULT_SLICE_END_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];

                    let input_ids = tokenizer
                        .encode_fast(final_text, false)
                        .unwrap()
                        .get_ids()
                        .to_vec();

                    seq.set_toks_and_reallocate(input_ids.clone(), paged_attn_metadata.as_mut());

                    // Indices just past each image/slice start token.
                    let image_start_idx = input_ids
                        .iter()
                        .enumerate()
                        .filter_map(|(i, &id)| {
                            if id == im_start_id || id == slice_start_id {
                                Some(i as u32 + 1)
                            } else {
                                None
                            }
                        })
                        .collect::<Vec<_>>();

                    // Indices of each image/slice end token.
                    let image_end_idx = input_ids
                        .iter()
                        .enumerate()
                        .filter_map(|(i, &id)| {
                            if id == im_end_id || id == slice_end_id {
                                Some(i as u32)
                            } else {
                                None
                            }
                        })
                        .collect::<Vec<_>>();

                    // Pair starts with ends; use the smaller count so the slices
                    // below cannot index out of bounds if the counts ever differ.
                    let valid_image_nums = image_start_idx.len().min(image_end_idx.len());

                    let image_start_idx = Tensor::from_slice(
                        &image_start_idx[..valid_image_nums],
                        (valid_image_nums, 1),
                        device,
                    )
                    .unwrap();
                    let image_end_idx = Tensor::from_slice(
                        &image_end_idx[..valid_image_nums],
                        (valid_image_nums, 1),
                        device,
                    )
                    .unwrap();

                    Tensor::cat(&[image_start_idx, image_end_idx], 1).unwrap()
                };

                pixel_values_accum.push(pixel_values_list);
                tgt_sizes_accum.push(tgt_sizes);
                image_bounds_accum.push(image_bounds);
            }

            (
                Some(pixel_values_accum),
                Some(image_bounds_accum),
                Some(tgt_sizes_accum),
            )
        } else {
            (None, None, None)
        };

        let text_models_inputs_processor::InnerInputProcessorOutput {
            inputs:
                text_models_inputs_processor::InputMetadata {
                    input,
                    positions,
                    context_lens,
                    position_ids,
                    paged_attn_meta,
                    flash_meta,
                },
            seq_indices,
        } = if is_prompt {
            get_prompt_input(
                input_seqs
                    .iter()
                    .map(|seq| seq.get_toks().to_vec())
                    .collect::<Vec<_>>(),
                input_seqs,
                device,
                last_n_context_len,
                return_raw_logits,
                paged_attn_metadata.as_mut(),
                None,
                mapper,
            )
            .next()
            .unwrap()
            .unwrap()
        } else {
            get_completion_input(
                input_seqs
                    .iter()
                    .map(|seq| seq.get_toks().to_vec())
                    .collect::<Vec<_>>(),
                input_seqs,
                device,
                no_kv_cache,
                last_n_context_len,
                return_raw_logits,
                paged_attn_metadata.as_mut(),
                None,
                mapper,
            )
            .next()
            .unwrap()
            .unwrap()
        };

        let args = MiniCpmOSpecificArgs {
            pixel_values_all,
            tgt_sizes,
            image_bound,
        };

        let inputs: Box<dyn Any> = Box::new(ModelInputs {
            input_ids: input,
            seqlen_offsets: positions,
            context_lens,
            position_ids,
            pixel_values: None,
            model_specific_args: Box::new(args),
            paged_attn_meta,
            flash_meta,
        });
        Box::new(std::iter::once(Ok(InputProcessorOutput {
            inputs,
            seq_indices,
        })))
    }
}

impl MiniCpmOImageProcessor {
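    /// Choose the slicing grid `(cols, rows)` for an image, or `None` if it should
    /// not be sliced. Worked example under the defaults (scale_resolution = 448,
    /// max_slice_nums = 9): a 1024x768 image has ratio = 1024 * 768 / 448^2 ≈ 3.92,
    /// so multiple = 4 and the candidate grids are (1,3), (3,1), (1,4), (2,2),
    /// (4,1), (1,5), (5,1). The grid minimizing |ln(w/h) - ln(cols/rows)| is (2,2).
    /// (Hand-computed from the logic below, not captured from a run.)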
    fn get_sliced_grid(
        &self,
        (w, h): (usize, usize),
        max_slice_nums: usize,
        scale_resolution: usize,
        never_split: bool,
    ) -> Option<(usize, usize)> {
        // Use floating-point division; integer `w / h` would collapse most
        // aspect ratios to 0 or 1 before the log is taken.
        let log_ratio = (w as f32 / h as f32).ln();
        let ratio = (w * h) as f32 / (scale_resolution * scale_resolution) as f32;
        let multiple = ratio.ceil().min(max_slice_nums as f32);
        if multiple <= 1. || never_split {
            return None;
        }

        let mut candidate_split_grid_nums = Vec::new();
        for i in [multiple - 1., multiple, multiple + 1.] {
            if i == 1. || i > max_slice_nums as f32 {
                continue;
            }
            candidate_split_grid_nums.push(i);
        }

        // Enumerate all (m, n) factorizations of each candidate slice count.
        let mut candidate_grids = Vec::new();
        for split_grid_nums in candidate_split_grid_nums {
            let mut m = 1.;
            while m <= split_grid_nums {
                if split_grid_nums % m == 0. {
                    candidate_grids.push((m as usize, split_grid_nums as usize / m as usize));
                }
                m += 1.;
            }
        }

        // Pick the grid whose aspect ratio is closest (in log space) to the image's.
        let mut best_grid = (1, 1);
        let mut min_error = f32::INFINITY;
        for grid in candidate_grids {
            let error = (log_ratio - (grid.0 as f32 / grid.1 as f32).ln()).abs();
            if error < min_error {
                best_grid = grid;
                min_error = error;
            }
        }

        Some(best_grid)
    }

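    /// Round `length` to the nearest multiple of `patch_size`, flooring at one
    /// patch: ensure_divide(100, 14) = round(100 / 14) * 14 = 7 * 14 = 98.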
    fn ensure_divide(&self, length: usize, patch_size: usize) -> usize {
        ((length as f32 / patch_size as f32).round() * patch_size as f32).max(patch_size as f32)
            as usize
    }

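    /// Rescale an oversized image (or any image, when `allow_upscale` is set) so
    /// its area is roughly scale_resolution^2 while keeping the aspect ratio, then
    /// snap both sides to the patch grid. E.g. 896x896 with scale_resolution = 448
    /// and patch_size = 14: r = 1, so h = w = 448, already a multiple of 14, and
    /// the result is (448, 448). (Hand-worked from the code below.)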
    fn find_best_resize(
        &self,
        (mut w, mut h): (usize, usize),
        scale_resolution: usize,
        patch_size: usize,
        allow_upscale: bool,
    ) -> (usize, usize) {
        if w * h > scale_resolution * scale_resolution || allow_upscale {
            let r = w as f32 / h as f32;
            h = (scale_resolution as f32 / r.sqrt()) as usize;
            // Derive the width from the new height so the aspect ratio is kept
            // and the area stays near scale_resolution^2.
            w = (h as f32 * r) as usize;
        }
        let best_w = self.ensure_divide(w, patch_size);
        let best_h = self.ensure_divide(h, patch_size);
        (best_w, best_h)
    }

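    /// Size of the refined image that will be sliced: find the best per-cell size
    /// via `find_best_resize`, then scale back up by the grid. E.g. (896, 896)
    /// with grid (2, 2): each 448x448 cell is already optimal, so the result is
    /// (896, 896).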
    fn get_refine_size(
        &self,
        (w, h): (usize, usize),
        (grid_x, grid_y): (usize, usize),
        scale_resolution: usize,
        patch_size: usize,
        allow_upscale: bool,
    ) -> (usize, usize) {
        let refine_w = self.ensure_divide(w, grid_x);
        let refine_h = self.ensure_divide(h, grid_y);

        // Per-cell size: the width is split across grid_x columns and the height
        // across grid_y rows.
        let grid_w = refine_w / grid_x;
        let grid_h = refine_h / grid_y;

        let best_grid_size = self.find_best_resize(
            (grid_w, grid_h),
            scale_resolution,
            patch_size,
            allow_upscale,
        );

        (best_grid_size.0 * grid_x, best_grid_size.1 * grid_y)
    }

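    /// Crop the refined image into its grid cells, row-major: the outer `Vec`
    /// walks rows top to bottom, the inner `Vec` columns left to right.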
    fn split_to_patches(
        &self,
        image: &DynamicImage,
        grid: (usize, usize),
    ) -> Vec<Vec<DynamicImage>> {
        let mut patches = Vec::new();
        let (w, h) = image.dimensions();
        let (w, h) = (w as usize, h as usize);
        let grid_x = w / grid.0;
        let grid_y = h / grid.1;
        for i in (0..h).step_by(grid_y) {
            let mut images = Vec::new();
            for j in (0..w).step_by(grid_x) {
                images.push(image.crop_imm(j as u32, i as u32, grid_x as u32, grid_y as u32));
            }
            patches.push(images);
        }
        patches
    }

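    /// Build the list of images fed to the vision encoder: the resized source
    /// image first, then its refined slices (none when slicing is disabled or no
    /// grid is chosen).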
    fn get_sliced_images(
        &self,
        image: &DynamicImage,
        max_slice_nums: usize,
        scale_resolution: usize,
        patch_size: usize,
    ) -> Vec<DynamicImage> {
        if !self.config.slice_mode.unwrap_or(DEFAULT_SLICE_MODE) {
            return vec![image.clone()];
        }

        let dims = image.dimensions();
        let (w, h) = (dims.0 as usize, dims.1 as usize);

        let best_grid = self.get_sliced_grid((w, h), max_slice_nums, scale_resolution, false);

        let (source_images, patches) = if let Some(best_grid) = best_grid {
            let best_resize = self.find_best_resize((w, h), scale_resolution, patch_size, false);
            let source_image = image.resize_exact(
                best_resize.0 as u32,
                best_resize.1 as u32,
                FilterType::CatmullRom,
            );
            let refine_size =
                self.get_refine_size((w, h), best_grid, scale_resolution, patch_size, true);
            let refine_image = image.resize_exact(
                refine_size.0 as u32,
                refine_size.1 as u32,
                FilterType::CatmullRom,
            );
            let patches = self
                .split_to_patches(&refine_image, best_grid)
                .into_iter()
                .flatten()
                .collect::<Vec<_>>();

            (source_image, patches)
        } else {
            let best_size = self.find_best_resize((w, h), scale_resolution, patch_size, true);
            let source_images = image.resize_exact(
                best_size.0 as u32,
                best_size.1 as u32,
                FilterType::CatmullRom,
            );

            (source_images, vec![])
        };

        [vec![source_images], patches].concat()
    }

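    /// Emulation of `torch.nn.functional.unfold` plus reshape: convert a
    /// (C, H, W) image into (C, patch_size, H * W / patch_size), a strip of
    /// flattened non-overlapping patches. E.g. (3, 448, 448) with patch_size = 14
    /// yields 32 * 32 = 1024 patches and an output of shape (3, 14, 14336).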
    fn reshape_by_patch(&self, image: &Tensor, patch_size: usize) -> Result<Tensor> {
        let (_c, h, w) = image.dims3()?;
        // Kernel size and stride both equal the patch size, so patches do not overlap.
        let (kh, kw) = (patch_size, patch_size);
        let (sh, sw) = (patch_size, patch_size);

        let out_h = (h - kh) / sh + 1;
        let out_w = (w - kw) / sw + 1;

        // Gather every patch as a flattened column, like torch's unfold.
        let mut patches = Vec::new();
        for i in 0..out_h {
            for j in 0..out_w {
                let patch = image.i((.., i * sh..i * sh + kh, j * sw..j * sw + kw))?;
                patches.push(patch.flatten_all()?);
            }
        }
        // (C * kh * kw, L) where L = out_h * out_w.
        let mut patches = Tensor::stack(&patches, 1)?;

        // (C, ps, ps, L) -> (C, ps, L, ps) -> (C, ps, L * ps)
        patches = patches.reshape((image.dim(0)?, patch_size, patch_size, ()))?;
        patches
            .permute((0, 1, 3, 2))?
            .reshape((image.dim(0)?, patch_size, ()))
    }

    fn get_image_id_placeholder(&self, image_idx: usize) -> String {
        format!(
            "{}{image_idx}{}",
            self.config
                .im_id_start
                .clone()
                .unwrap_or(DEFAULT_IM_ID_START.to_string()),
            self.config
                .im_id_end
                .clone()
                .unwrap_or(DEFAULT_IM_ID_END.to_string())
        )
    }

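    /// Placeholder text for the slice grid: one `<slice><unk>...</slice>` group
    /// per cell, columns concatenated within a row and rows joined by newlines;
    /// `None` yields an empty string. Grid (2, 2) gives two lines of two groups.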
    fn get_grid_placeholder(&self, grid: Option<(usize, usize)>) -> String {
        if let Some(grid) = grid {
            let slice_image_placeholder = format!(
                "{}{}{}",
                self.config
                    .slice_start_token
                    .clone()
                    .unwrap_or(DEFAULT_SLICE_START_TOKEN.to_string()),
                self.config
                    .unk_token
                    .clone()
                    .unwrap_or(DEFAULT_UNK_TOKEN.to_string())
                    .repeat(
                        self.config
                            .image_feature_size
                            .unwrap_or(DEFAULT_IMAGE_FEATURE_SIZE)
                    ),
                self.config
                    .slice_end_token
                    .clone()
                    .unwrap_or(DEFAULT_SLICE_END_TOKEN.to_string())
            );

            let (cols, rows) = grid;
            let mut slices = Vec::new();
            for _ in 0..rows {
                let mut lines = Vec::new();
                for _ in 0..cols {
                    lines.push(slice_image_placeholder.clone());
                }
                slices.push(lines.join(""));
            }

            slices.join("\n")
        } else {
            "".to_string()
        }
    }

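    /// Full placeholder for one image: the `<image><unk>...</image>` block,
    /// optionally prefixed by an image-id tag, followed by the slice-grid
    /// placeholders. With the default 64-token feature size and a (2, 2) grid this
    /// expands to 5 * 64 = 320 `<unk>` tokens: one block for the source image plus
    /// one per slice.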
    fn get_slice_image_placeholder(&self, image_size: (u32, u32), image_idx: usize) -> String {
        let max_slice_nums = self.config.max_slice_nums.unwrap_or(DEFAULT_MAX_SLICE_NUMS);
        let use_image_id = self.config.use_image_id.unwrap_or(DEFAULT_USE_IMAGE_ID);
        let slice_mode = self.config.slice_mode.unwrap_or(DEFAULT_SLICE_MODE);

        let grid = self.get_sliced_grid(
            (image_size.0 as usize, image_size.1 as usize),
            max_slice_nums,
            DEFAULT_SCALE_RESOLUTION,
            false,
        );

        let image_placeholder = format!(
            "{}{}{}",
            self.config
                .im_start_token
                .clone()
                .unwrap_or(DEFAULT_IM_START_TOKEN.to_string()),
            self.config
                .unk_token
                .clone()
                .unwrap_or(DEFAULT_UNK_TOKEN.to_string())
                .repeat(
                    self.config
                        .image_feature_size
                        .unwrap_or(DEFAULT_IMAGE_FEATURE_SIZE)
                ),
            self.config
                .im_end_token
                .clone()
                .unwrap_or(DEFAULT_IM_END_TOKEN.to_string())
        );

        let final_placeholder = if use_image_id {
            format!(
                "{}{image_placeholder}",
                self.get_image_id_placeholder(image_idx)
            )
        } else {
            image_placeholder
        };

        if slice_mode {
            format!("{final_placeholder}{}", self.get_grid_placeholder(grid))
        } else {
            final_placeholder
        }
    }
}

impl ImagePreProcessor for MiniCpmOImageProcessor {
    const DEFAULT_MEAN: [f64; 3] = [0.5, 0.5, 0.5];
    const DEFAULT_STD: [f64; 3] = [0.5, 0.5, 0.5];

    fn preprocess(
        &self,
        images: Vec<DynamicImage>,
        _videos: Vec<Vec<DynamicImage>>,
        config: &PreProcessorConfig,
        device: &Device,
        (_bs, _max_num_images): (usize, usize),
    ) -> Result<PreprocessedImages> {
        let mut pixel_values = Vec::new();
        let mut tgt_sizes = Vec::new();
        let image_sizes = images
            .iter()
            .map(|img| img.dimensions())
            .collect::<Vec<_>>();
        for image in images {
            let max_slice_nums = config.max_slice_nums.unwrap_or(DEFAULT_MAX_SLICE_NUMS);
            let scale_resolution = config.scale_resolution.unwrap_or(DEFAULT_SCALE_RESOLUTION);
            let patch_size = config.patch_size.unwrap_or(DEFAULT_PATCH_SIZE);

            let image_patches =
                self.get_sliced_images(&image, max_slice_nums, scale_resolution, patch_size);

            for slice_image in image_patches {
                let (w, h) = slice_image.dimensions();
                let to_tensor_rescale = Transforms {
                    input: &ToTensor,
                    inner_transforms: &[&Normalize {
                        mean: config.image_mean.unwrap_or(Self::DEFAULT_MEAN).to_vec(),
                        std: config.image_std.unwrap_or(Self::DEFAULT_STD).to_vec(),
                    }],
                };
                let mut image = slice_image.apply(to_tensor_rescale, device)?;
                image = self.reshape_by_patch(&image, patch_size)?;
                pixel_values.push(image);
                // Target size in patches: (h / patch_size, w / patch_size).
                tgt_sizes.push(Tensor::from_vec(
                    vec![h / patch_size as u32, w / patch_size as u32],
                    (1, 2),
                    &Device::Cpu,
                )?);
            }
        }

        let tgt_sizes = Tensor::cat(&tgt_sizes, 0)?.to_device(device)?;
        Ok(PreprocessedImages {
            // Dummy value; this model consumes `pixel_values_list` instead.
            pixel_values: Tensor::new(0u32, &Device::Cpu)?,
            pixel_attention_mask: None,
            image_sizes: None,
            num_img_tokens: None,
            aspect_ratio_ids: None,
            aspect_ratio_mask: None,
            num_tiles: None,
            image_grid_thw: None,
            video_grid_thw: None,
            rows: None,
            cols: None,
            pixel_values_list: Some(pixel_values),
            tgt_sizes: Some(tgt_sizes),
            image_sizes_all: Some(image_sizes),
            num_crops: None,
        })
    }
}
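
// A minimal test sketch for the pure geometry helpers above. It assumes
// `PreProcessorConfig` implements `Default` (all-`None` fields); if it does not,
// construct the config explicitly. The expected values are hand-computed from
// the helper logic, not captured from a reference run.
#[cfg(test)]
mod tests {
    use super::*;

    fn processor() -> MiniCpmOImageProcessor {
        MiniCpmOImageProcessor {
            // Assumption: an all-default config leaves every option unset.
            config: PreProcessorConfig::default(),
        }
    }

    #[test]
    fn ensure_divide_rounds_to_patch_multiples() {
        let p = processor();
        assert_eq!(p.ensure_divide(100, 14), 98); // round(100 / 14) = 7 -> 98
        assert_eq!(p.ensure_divide(5, 14), 14); // floored at one patch
    }

    #[test]
    fn find_best_resize_is_stable_at_scale_resolution() {
        let p = processor();
        // 896x896 -> r = 1, h = w = 448, already a multiple of 14.
        assert_eq!(p.find_best_resize((896, 896), 448, 14, false), (448, 448));
    }

    #[test]
    fn sliced_grid_prefers_square_grid_for_4_3_image() {
        let p = processor();
        // 1024x768: multiple = ceil(3.92) = 4; best grid by log-aspect error is (2, 2).
        assert_eq!(p.get_sliced_grid((1024, 768), 9, 448, false), Some((2, 2)));
    }
}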