// mistralrs_core/vision_models/minicpmo/inputs_processor.rs

1#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
2
3use std::{any::Any, num::NonZeroUsize, sync::Arc};
4
5use candle_core::{Device, IndexOp, Result, Tensor};
6use image::{imageops::FilterType, DynamicImage, GenericImageView};
7use mistralrs_vision::{ApplyTransforms, Normalize, ToTensor, Transforms};
8use regex::Regex;
9use tokenizers::Tokenizer;
10use tracing::warn;
11
12use crate::{
13    device_map::DeviceMapper,
14    pipeline::{
15        text_models_inputs_processor::{
16            self, get_completion_input, get_prompt_input, PagedAttentionMeta,
17        },
18        InputProcessorOutput, InputsProcessor, InputsProcessorType, MessagesAction, Processor,
19    },
20    sequence::Sequence,
21    vision_models::ModelInputs,
22};
23
24use crate::vision_models::{
25    image_processor::{ImagePreProcessor, PreprocessedImages},
26    preprocessor_config::PreProcessorConfig,
27    processor_config::ProcessorConfig,
28};
29
30use super::MiniCpmOSpecificArgs;
31
32const DEFAULT_MAX_SLICE_NUMS: usize = 9;
33const DEFAULT_SCALE_RESOLUTION: usize = 448;
34const DEFAULT_PATCH_SIZE: usize = 14;
35const DEFAULT_IMAGE_FEATURE_SIZE: usize = 64;
36const DEFAULT_IM_START_TOKEN: &str = "<image>";
37const DEFAULT_IM_END_TOKEN: &str = "</image>";
38const DEFAULT_IM_ID_START: &str = "<image_id>";
39const DEFAULT_IM_ID_END: &str = "</image_id>";
40const DEFAULT_SLICE_START_TOKEN: &str = "<slice>";
41const DEFAULT_SLICE_END_TOKEN: &str = "</slice>";
42const DEFAULT_UNK_TOKEN: &str = "<unk>";
43const DEFAULT_USE_IMAGE_ID: bool = false;
44const DEFAULT_SLICE_MODE: bool = true;
45
/// Per-request vision inputs processor for MiniCPM-O.
///
/// Implements `InputsProcessor` (prompt/image-token assembly) and
/// `ImagePreProcessor` (image slicing and normalization) further below.
pub struct MiniCpmOImageProcessor {
    // HF-style preprocessing options (slice mode, patch size, special tokens, ...).
    config: PreProcessorConfig,
}
49
/// Top-level MiniCPM-O processor; hands out `MiniCpmOImageProcessor`
/// instances that share its preprocessor configuration.
pub struct MiniCpmOProcessor {
    // Cloned into each inputs processor created by `inputs_processor()`.
    preprocessor_config: PreProcessorConfig,
}
53
54impl MiniCpmOProcessor {
55    pub fn new(
56        _config: ProcessorConfig,
57        preprocessor_config: PreProcessorConfig,
58        _max_edge: Option<u32>,
59    ) -> Self {
60        Self {
61            preprocessor_config,
62        }
63    }
64}
65
66impl Processor for MiniCpmOProcessor {
67    fn inputs_processor(&self) -> Arc<dyn InputsProcessor> {
68        Arc::new(MiniCpmOImageProcessor {
69            config: self.preprocessor_config.clone(),
70        })
71    }
72
73    fn get_special_tokens(&self) -> &[&'static str] {
74        &[
75            DEFAULT_IM_START_TOKEN,
76            DEFAULT_IM_END_TOKEN,
77            DEFAULT_SLICE_START_TOKEN,
78            DEFAULT_SLICE_END_TOKEN,
79            DEFAULT_UNK_TOKEN,
80        ]
81    }
82
83    fn template_action(&self) -> MessagesAction {
84        MessagesAction::FlattenOnlyText
85    }
86}
87
impl InputsProcessor for MiniCpmOImageProcessor {
    /// This is a vision (multimodal) inputs processor.
    fn get_type(&self) -> InputsProcessorType {
        InputsProcessorType::Vision
    }
    /// Build model inputs for a batch of sequences.
    ///
    /// When sequences carry images, each sequence's prompt is re-written:
    /// every `(<image>./</image>)` tag is expanded into the full
    /// image/slice placeholder text, the sequence's tokens are replaced with
    /// the re-encoded prompt, and the per-image pixel values, target sizes,
    /// and image-token bounds are collected into `MiniCpmOSpecificArgs`.
    /// Returns a single-element iterator (no batching across chunks).
    fn process_inputs(
        &self,
        tokenizer: Option<Arc<Tokenizer>>,
        input_seqs: &mut [&mut Sequence],
        is_prompt: bool,
        is_xlora: bool,
        device: &Device,
        no_kv_cache: bool,
        last_n_context_len: Option<(usize, usize)>,
        return_raw_logits: bool,
        other_config: Option<Arc<dyn Any>>,
        mut paged_attn_metadata: Option<PagedAttentionMeta<'_>>,
        prompt_chunksize: Option<NonZeroUsize>,
        mapper: Option<&dyn DeviceMapper>,
    ) -> Box<dyn Iterator<Item = anyhow::Result<InputProcessorOutput>>> {
        // Unsupported configurations are reported as errors, not panics.
        if is_xlora {
            return Box::new(std::iter::once(Err(anyhow::Error::msg(
                "Cannot make inputs for X-LoRA vision model.",
            ))));
        }
        if no_kv_cache {
            return Box::new(std::iter::once(Err(anyhow::Error::msg(
                "Vision model must have kv cache.",
            ))));
        }
        // TODO(EricLBuehler): support this? Would require some handling of image tokens.
        if prompt_chunksize.is_some() {
            warn!("`prompt_chunksize` is set. MiniCpm-O does not support prompt batching.");
        }
        let Some(tokenizer) = tokenizer else {
            return Box::new(std::iter::once(Err(anyhow::Error::msg(
                "MiniCpmOImageProcessor requires a specified tokenizer.",
            ))));
        };

        let config = other_config.expect("Need a PreProcessorConfig config.");
        let config: &PreProcessorConfig = config.downcast_ref().expect("Downcast failed.");

        // NOTE(review): `all` means image handling only runs when EVERY
        // sequence in the batch has images; a mixed batch skips it entirely.
        let has_images = input_seqs.iter().all(|seq| seq.has_images());

        let (pixel_values_all, image_bound, tgt_sizes) = if has_images {
            // Literal tag text and the escaped regex forms of the image/audio tags.
            const IMAGE_TAG: &str = "(<image>./</image>)";
            const IMAGE_PATTERN: &str = r"\(<image>./</image>\)";
            const AUDIO_PATTERN: &str = r"\(<audio>./</audio>\)";

            let image_pattern = Regex::new(IMAGE_PATTERN).unwrap();
            let _audio_pattern = Regex::new(AUDIO_PATTERN).unwrap();
            let split_pattern = Regex::new(&format!(r"({IMAGE_PATTERN}|{AUDIO_PATTERN})")).unwrap();

            // Per-sequence accumulators; pushed once per sequence below.
            let mut pixel_values_accum = Vec::new();
            let mut tgt_sizes_accum = Vec::new();
            let mut image_bounds_accum = Vec::new();

            for seq in input_seqs.iter_mut() {
                // Run image preprocessing (slicing + normalization); this
                // implementation only populates the list-style fields.
                let PreprocessedImages {
                    pixel_values: _,
                    pixel_attention_mask: _,
                    image_sizes: _,
                    num_img_tokens: _,
                    aspect_ratio_ids: _,
                    aspect_ratio_mask: _,
                    num_tiles: _,
                    image_grid_thw: _,
                    video_grid_thw: _,
                    rows: _,
                    cols: _,
                    pixel_values_list,
                    tgt_sizes,
                    image_sizes_all,
                    num_crops: _,
                } = self
                    .preprocess(
                        seq.take_images()
                            .expect("Need to have images by this point."),
                        vec![],
                        config,
                        device,
                        (usize::MAX, usize::MAX), // Don't use it here...
                    )
                    .expect("Preprocessing failed");
                let pixel_values_list = pixel_values_list.unwrap();
                let tgt_sizes = tgt_sizes.unwrap();
                let image_sizes_all = image_sizes_all.unwrap();

                // Round-trip: decode current tokens back to text so the image
                // tags can be located and expanded.
                let text = tokenizer
                    .decode(seq.get_toks(), false)
                    .expect("Detokenization failed!");

                // Split the text into alternating unmatched/matched chunks
                // around the image/audio tags (regex split with captures).
                let mut text_chunks = {
                    let mut results = Vec::new();
                    let mut last_end = 0;

                    for m in split_pattern.find_iter(&text) {
                        // Anything between last_end and m.start() is unmatched
                        if m.start() > last_end {
                            results.push((false, &text[last_end..m.start()]));
                        }
                        results.push((true, m.as_str()));
                        last_end = m.end();
                    }
                    // Handle the trailing unmatched part (if any)
                    if last_end < text.len() {
                        results.push((false, &text[last_end..]));
                    }

                    results
                        .into_iter()
                        .map(|(_, x)| x.to_string())
                        .collect::<Vec<_>>()
                };

                let image_tags = image_pattern.find_iter(&text).collect::<Vec<_>>();

                // One tag per preprocessed image expected.
                if !image_tags.is_empty() {
                    assert_eq!(image_tags.len(), image_sizes_all.len());
                }

                // Replace each image tag chunk with its full placeholder
                // (image + optional slice grid placeholders).
                let mut image_id = 0;
                for chunk in &mut text_chunks {
                    if chunk == IMAGE_TAG {
                        *chunk =
                            self.get_slice_image_placeholder(image_sizes_all[image_id], image_id);
                        image_id += 1;
                    }
                }

                let final_text = text_chunks.join("");
                seq.set_initial_prompt(final_text.clone());

                // Compute [start, end) token index pairs for every image and
                // slice region in the re-encoded prompt.
                let image_bounds = {
                    // Single-token ids of the image/slice delimiters;
                    // config overrides fall back to the defaults above.
                    let im_start_id = tokenizer
                        .encode_fast(
                            self.config
                                .im_start_token
                                .clone()
                                .unwrap_or(DEFAULT_IM_START_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];
                    let im_end_id = tokenizer
                        .encode_fast(
                            self.config
                                .im_end_token
                                .clone()
                                .unwrap_or(DEFAULT_IM_END_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];
                    let slice_start_id = tokenizer
                        .encode_fast(
                            self.config
                                .slice_start_token
                                .clone()
                                .unwrap_or(DEFAULT_SLICE_START_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];
                    let slice_end_id = tokenizer
                        .encode_fast(
                            self.config
                                .slice_end_token
                                .clone()
                                .unwrap_or(DEFAULT_SLICE_END_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];

                    let input_ids = tokenizer
                        .encode_fast(final_text, false)
                        .unwrap()
                        .get_ids()
                        .to_vec();

                    // Replace the sequence's tokens with the expanded prompt
                    // (also regrows paged-attention blocks if needed).
                    seq.set_toks_and_reallocate(input_ids.clone(), paged_attn_metadata.as_mut());

                    // Bounds are exclusive of the delimiters: start is the
                    // index AFTER <image>/<slice>, end is the index OF the
                    // closing token.
                    let image_start_idx = input_ids
                        .iter()
                        .enumerate()
                        .filter_map(|(i, &id)| {
                            if id == im_start_id || id == slice_start_id {
                                Some(i as u32 + 1)
                            } else {
                                None
                            }
                        })
                        .collect::<Vec<_>>();

                    let image_end_idx = input_ids
                        .iter()
                        .enumerate()
                        .filter_map(|(i, &id)| {
                            if id == im_end_id || id == slice_end_id {
                                Some(i as u32)
                            } else {
                                None
                            }
                        })
                        .collect::<Vec<_>>();

                    // NOTE(review): `.max(..)` followed by slicing both vecs to
                    // `valid_image_nums` panics if the start/end counts ever
                    // differ (unbalanced delimiters); `.min(..)` would be the
                    // safe choice — confirm against the upstream reference.
                    let valid_image_nums = image_start_idx.len().max(image_end_idx.len());

                    let image_start_idx = Tensor::from_slice(
                        &image_start_idx[..valid_image_nums],
                        (valid_image_nums, 1),
                        device,
                    )
                    .unwrap();
                    let image_end_idx = Tensor::from_slice(
                        &image_end_idx[..valid_image_nums],
                        (valid_image_nums, 1),
                        device,
                    )
                    .unwrap();

                    // Shape (valid_image_nums, 2): [start, end) per region.
                    Tensor::cat(&[image_start_idx, image_end_idx], 1).unwrap()
                };

                pixel_values_accum.push(pixel_values_list);
                tgt_sizes_accum.push(tgt_sizes);
                image_bounds_accum.push(image_bounds);
            }

            (
                Some(pixel_values_accum),
                Some(image_bounds_accum),
                Some(tgt_sizes_accum),
            )
        } else {
            (None, None, None)
        };

        // Standard text-side input metadata (token tensors, positions, paged
        // attention / flash metadata) — must run AFTER the token rewrite above.
        let text_models_inputs_processor::InnerInputProcessorOutput {
            inputs:
                text_models_inputs_processor::InputMetadata {
                    input,
                    positions,
                    context_lens,
                    position_ids,
                    paged_attn_meta,
                    flash_meta,
                },
            seq_indices,
        } = if is_prompt {
            get_prompt_input(
                input_seqs
                    .iter()
                    .map(|seq| seq.get_toks().to_vec())
                    .collect::<Vec<_>>(),
                input_seqs,
                device,
                last_n_context_len,
                return_raw_logits,
                paged_attn_metadata.as_mut(),
                None, // TODO: evaluate if it is possible to batch this
                mapper,
            )
            .nth(0)
            .unwrap()
            .unwrap()
        } else {
            get_completion_input(
                input_seqs
                    .iter()
                    .map(|seq| seq.get_toks().to_vec())
                    .collect::<Vec<_>>(),
                input_seqs,
                device,
                no_kv_cache,
                last_n_context_len,
                return_raw_logits,
                paged_attn_metadata.as_mut(),
                None, // TODO: evaluate if it is possible to batch this
                mapper,
            )
            .nth(0)
            .unwrap()
            .unwrap()
        };

        let args = MiniCpmOSpecificArgs {
            pixel_values_all,
            tgt_sizes,
            image_bound,
        };

        // Dummy pixel values - real ones are in model specific args
        let inputs: Box<dyn Any> = Box::new(ModelInputs {
            input_ids: input,
            seqlen_offsets: positions,
            context_lens,
            position_ids,
            pixel_values: None,
            model_specific_args: Box::new(args),
            paged_attn_meta,
            flash_meta,
        });
        Box::new(std::iter::once(Ok(InputProcessorOutput {
            inputs,
            seq_indices,
        })))
    }
}
398
399impl MiniCpmOImageProcessor {
400    fn get_sliced_grid(
401        &self,
402        (w, h): (usize, usize),
403        max_slice_nums: usize,
404        scale_resolution: usize,
405        never_split: bool,
406    ) -> Option<(usize, usize)> {
407        let log_ratio = ((w / h) as f32).ln();
408        let ratio = (w * h) as f32 / (scale_resolution * scale_resolution) as f32;
409        let multiple = ratio.ceil().min(max_slice_nums as f32);
410        if multiple <= 1. || never_split {
411            return None;
412        }
413
414        let mut candidate_split_grid_nums = Vec::new();
415        for i in [multiple - 1., multiple, multiple + 1.] {
416            if i == 1. || i > max_slice_nums as f32 {
417                continue;
418            }
419            candidate_split_grid_nums.push(i);
420        }
421
422        let mut candidate_grids = Vec::new();
423        for split_grid_nums in candidate_split_grid_nums {
424            let mut m = 1.;
425            while m <= split_grid_nums {
426                if split_grid_nums % m == 0. {
427                    candidate_grids.push((m as usize, split_grid_nums as usize / m as usize));
428                }
429                m += 1.;
430            }
431        }
432
433        let mut best_grid = (1, 1);
434        let mut min_error = f32::INFINITY;
435        for grid in candidate_grids {
436            let error = (log_ratio - (grid.0 as f32 / grid.1 as f32).ln()).abs();
437            if error < min_error {
438                best_grid = grid;
439                min_error = error;
440            }
441        }
442
443        Some(best_grid)
444    }
445
446    fn ensure_divide(&self, length: usize, patch_size: usize) -> usize {
447        ((length as f32 / patch_size as f32).round() * patch_size as f32).max(patch_size as f32)
448            as usize
449    }
450
451    fn find_best_resize(
452        &self,
453        (mut w, mut h): (usize, usize),
454        scale_resolution: usize,
455        patch_size: usize,
456        allow_upscale: bool,
457    ) -> (usize, usize) {
458        if w * h > scale_resolution * scale_resolution || allow_upscale {
459            let r = w as f32 / h as f32;
460            h = (scale_resolution as f32 / r.sqrt()) as usize;
461            w = (scale_resolution as f32 * r) as usize;
462        }
463        let best_w = self.ensure_divide(w, patch_size);
464        let best_h = self.ensure_divide(h, patch_size);
465        (best_w, best_h)
466    }
467
468    fn get_refine_size(
469        &self,
470        (w, h): (usize, usize),
471        (grid_x, grid_y): (usize, usize),
472        scale_resolution: usize,
473        patch_size: usize,
474        allow_upscale: bool,
475    ) -> (usize, usize) {
476        let refine_w = self.ensure_divide(w, grid_x);
477        let refine_h = self.ensure_divide(h, grid_y);
478
479        let grid_w = refine_h / grid_x;
480        let grid_h = refine_w / grid_y;
481
482        let best_grid_size = self.find_best_resize(
483            (grid_w, grid_h),
484            scale_resolution,
485            patch_size,
486            allow_upscale,
487        );
488
489        (best_grid_size.0 * grid_x, best_grid_size.1 * grid_y)
490    }
491
492    fn split_to_patches(
493        &self,
494        image: &DynamicImage,
495        grid: (usize, usize),
496    ) -> Vec<Vec<DynamicImage>> {
497        let mut patches = Vec::new();
498        let (w, h) = image.dimensions();
499        let (w, h) = (w as usize, h as usize);
500        let grid_x = w / grid.0;
501        let grid_y = h / grid.1;
502        for i in (0..h).step_by(grid_y) {
503            let mut images = Vec::new();
504            for j in (0..w).step_by(grid_x) {
505                images.push(image.crop_imm(j as u32, i as u32, grid_x as u32, grid_y as u32));
506            }
507            patches.push(images);
508        }
509        patches
510    }
511
512    fn get_sliced_images(
513        &self,
514        image: &DynamicImage,
515        max_slice_nums: usize,
516        scale_resolution: usize,
517        patch_size: usize,
518    ) -> Vec<DynamicImage> {
519        if !self.config.slice_mode.unwrap_or(DEFAULT_SLICE_MODE) {
520            return vec![image.clone()];
521        }
522
523        let dims = image.dimensions();
524        let (w, h) = (dims.0 as usize, dims.1 as usize);
525
526        let best_grid = self.get_sliced_grid((w, h), max_slice_nums, scale_resolution, false);
527
528        let (source_images, patches) = if let Some(best_grid) = best_grid {
529            // Source image, down-sampling and ensure divided by patch_size
530            let best_resize = self.find_best_resize((w, h), scale_resolution, patch_size, false);
531            let source_image = image.resize_exact(
532                best_resize.0 as u32,
533                best_resize.1 as u32,
534                FilterType::CatmullRom,
535            );
536            let refine_size =
537                self.get_refine_size((w, h), best_grid, scale_resolution, patch_size, true);
538            let refine_image = image.resize_exact(
539                refine_size.0 as u32,
540                refine_size.1 as u32,
541                FilterType::CatmullRom,
542            );
543            let patches = self
544                .split_to_patches(&refine_image, best_grid)
545                .into_iter()
546                .flatten()
547                .collect::<Vec<_>>();
548
549            (source_image, patches)
550        } else {
551            // Don't need to slice, upsample
552            let best_size = self.find_best_resize((w, h), scale_resolution, patch_size, true);
553            let source_images = image.resize_exact(
554                best_size.0 as u32,
555                best_size.1 as u32,
556                FilterType::CatmullRom,
557            );
558
559            (source_images, vec![])
560        };
561
562        [vec![source_images], patches].concat()
563    }
564
565    /// image: (3, h, w)
566    /// output: (3, patch_size, h*w/patch_size)
567    fn reshape_by_patch(&self, image: &Tensor, patch_size: usize) -> Result<Tensor> {
568        // Equivalent of torch.nn.functional.unfold with kernel_size and stride both 2-tuples
569        let (_c, h, w) = image.dims3()?;
570        // Kernel size
571        let (kh, kw) = (patch_size, patch_size);
572        // Stride
573        let (sh, sw) = (patch_size, patch_size);
574
575        let out_h = (h - kh) / sh + 1;
576        let out_w = (w - kw) / sw + 1;
577
578        let mut patches = Vec::new();
579        for i in 0..out_h {
580            for j in 0..out_w {
581                // [c, kh, kw]
582                let patch = image.i((.., i * sh..i * sh + kh, j * sw..j * sw + kw))?;
583                // [c*kh*kw]
584                patches.push(patch.flatten_all()?);
585            }
586        }
587        // [C*kH*kW, out_h * out_w]
588        let mut patches = Tensor::stack(&patches, 1)?;
589
590        patches = patches.reshape((image.dim(0)?, patch_size, patch_size, ()))?;
591        patches
592            .permute((0, 1, 3, 2))?
593            .reshape((image.dim(0)?, patch_size, ()))
594    }
595
596    fn get_image_id_placeholder(&self, image_idx: usize) -> String {
597        format!(
598            "{}{image_idx}{}",
599            self.config
600                .im_id_start
601                .clone()
602                .unwrap_or(DEFAULT_IM_ID_START.to_string()),
603            self.config
604                .im_id_end
605                .clone()
606                .unwrap_or(DEFAULT_IM_ID_END.to_string())
607        )
608    }
609
610    fn get_grid_placeholder(&self, grid: Option<(usize, usize)>) -> String {
611        if let Some(grid) = grid {
612            let slice_image_placeholder = format!(
613                "{}{}{}",
614                self.config
615                    .slice_start_token
616                    .clone()
617                    .unwrap_or(DEFAULT_SLICE_START_TOKEN.to_string()),
618                self.config
619                    .unk_token
620                    .clone()
621                    .unwrap_or(DEFAULT_UNK_TOKEN.to_string())
622                    .repeat(
623                        self.config
624                            .image_feature_size
625                            .unwrap_or(DEFAULT_IMAGE_FEATURE_SIZE)
626                    ),
627                self.config
628                    .slice_end_token
629                    .clone()
630                    .unwrap_or(DEFAULT_SLICE_END_TOKEN.to_string())
631            );
632
633            let (cols, rows) = grid;
634            let mut slices = Vec::new();
635            for _ in 0..rows {
636                let mut lines = Vec::new();
637                for _ in 0..cols {
638                    lines.push(slice_image_placeholder.clone());
639                }
640                slices.push(lines.join(""));
641            }
642
643            slices.join("\n")
644        } else {
645            "".to_string()
646        }
647    }
648
649    fn get_slice_image_placeholder(&self, image_size: (u32, u32), image_idx: usize) -> String {
650        let max_slice_nums = self.config.max_slice_nums.unwrap_or(DEFAULT_MAX_SLICE_NUMS);
651        let use_image_id = self.config.use_image_id.unwrap_or(DEFAULT_USE_IMAGE_ID);
652        let slice_mode = self.config.slice_mode.unwrap_or(DEFAULT_SLICE_MODE);
653
654        let grid = self.get_sliced_grid(
655            (image_size.0 as usize, image_size.1 as usize),
656            max_slice_nums,
657            DEFAULT_SCALE_RESOLUTION,
658            false,
659        );
660
661        let image_placeholder = format!(
662            "{}{}{}",
663            self.config
664                .im_start_token
665                .clone()
666                .unwrap_or(DEFAULT_IM_START_TOKEN.to_string()),
667            self.config
668                .unk_token
669                .clone()
670                .unwrap_or(DEFAULT_UNK_TOKEN.to_string())
671                .repeat(
672                    self.config
673                        .image_feature_size
674                        .unwrap_or(DEFAULT_IMAGE_FEATURE_SIZE)
675                ),
676            self.config
677                .im_end_token
678                .clone()
679                .unwrap_or(DEFAULT_IM_END_TOKEN.to_string())
680        );
681
682        let final_placeholder = if use_image_id {
683            format!(
684                "{}{image_placeholder}",
685                self.get_image_id_placeholder(image_idx)
686            )
687        } else {
688            image_placeholder
689        };
690
691        if slice_mode {
692            format!("{final_placeholder}{}", self.get_grid_placeholder(grid))
693        } else {
694            final_placeholder
695        }
696    }
697}
698
impl ImagePreProcessor for MiniCpmOImageProcessor {
    // Normalize to [-1, 1]: mean/std of 0.5 per channel.
    #[allow(clippy::excessive_precision)]
    const DEFAULT_MEAN: [f64; 3] = [0.5, 0.5, 0.5];
    #[allow(clippy::excessive_precision)]
    const DEFAULT_STD: [f64; 3] = [0.5, 0.5, 0.5];

    /// Slice, normalize, and patch-reshape a batch of images.
    ///
    /// Returns per-slice pixel tensors in `pixel_values_list`, the
    /// corresponding (h, w)-in-patches sizes in `tgt_sizes`, and the original
    /// image dimensions in `image_sizes_all`. The scalar `pixel_values` field
    /// is a dummy; callers use the list fields.
    fn preprocess(
        &self,
        images: Vec<DynamicImage>,
        _videos: Vec<Vec<DynamicImage>>,
        config: &PreProcessorConfig,
        device: &Device,
        (_bs, _max_num_images): (usize, usize),
    ) -> Result<PreprocessedImages> {
        let mut pixel_values = Vec::new();
        let mut tgt_sizes = Vec::new();
        // Original (w, h) of each input image, before any resizing.
        let image_sizes = images
            .iter()
            .map(|img| img.dimensions())
            .collect::<Vec<_>>();
        for image in images {
            let max_slice_nums = config.max_slice_nums.unwrap_or(DEFAULT_MAX_SLICE_NUMS);
            let scale_resolution = config.scale_resolution.unwrap_or(DEFAULT_SCALE_RESOLUTION);
            let patch_size = config.patch_size.unwrap_or(DEFAULT_PATCH_SIZE);

            // Overview image plus (optionally) refined slices.
            let image_patches =
                self.get_sliced_images(&image, max_slice_nums, scale_resolution, patch_size);

            for slice_image in image_patches {
                let (w, h) = slice_image.dimensions();
                let to_tensor_rescale = Transforms {
                    input: &ToTensor,
                    inner_transforms: &[&Normalize {
                        mean: config.image_mean.unwrap_or(Self::DEFAULT_MEAN).to_vec(),
                        std: config.image_std.unwrap_or(Self::DEFAULT_STD).to_vec(),
                    }],
                };
                let mut image = slice_image.apply(to_tensor_rescale, device)?;
                // (3, h, w) -> (3, patch_size, h*w/patch_size).
                image = self.reshape_by_patch(&image, patch_size)?;
                pixel_values.push(image);
                // Target size in patch units, (h, w) order.
                tgt_sizes.push(Tensor::from_vec(
                    vec![h / patch_size as u32, w / patch_size as u32],
                    (1, 2),
                    &Device::Cpu,
                )?);
            }
        }

        // NOTE(review): `Tensor::cat` on an empty list errors; callers are
        // expected to pass at least one image.
        let tgt_sizes = Tensor::cat(&tgt_sizes, 0)?.to_device(device)?;
        // Dummy pixel values
        Ok(PreprocessedImages {
            pixel_values: Tensor::new(0u32, &Device::Cpu)?,
            pixel_attention_mask: None,
            image_sizes: None,
            num_img_tokens: None,
            aspect_ratio_ids: None,
            aspect_ratio_mask: None,
            num_tiles: None,
            image_grid_thw: None,
            video_grid_thw: None,
            rows: None,
            cols: None,
            pixel_values_list: Some(pixel_values),
            tgt_sizes: Some(tgt_sizes),
            image_sizes_all: Some(image_sizes),
            num_crops: None,
        })
    }
}
767}