#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]

use std::{any::Any, sync::Arc};

use candle_core::{Device, IndexOp, Result, Tensor};
use image::{imageops::FilterType, DynamicImage, GenericImageView};
use mistralrs_vision::{ApplyTransforms, Normalize, ToTensor, Transforms};
use regex::Regex;
use tokenizers::Tokenizer;

use crate::{
    device_map::DeviceMapper,
    pipeline::{
        text_models_inputs_processor::{
            self, get_completion_input, get_prompt_input, PagedAttentionMeta,
        },
        InputProcessorOutput, InputsProcessor, InputsProcessorType, MessagesAction, Processor,
    },
    sequence::Sequence,
    vision_models::ModelInputs,
};

use crate::vision_models::{
    image_processor::{ImagePreProcessor, PreprocessedImages},
    preprocessor_config::PreProcessorConfig,
    processor_config::ProcessorConfig,
};

use super::MiniCpmOSpecificArgs;

const DEFAULT_MAX_SLICE_NUMS: usize = 9;
const DEFAULT_SCALE_RESOLUTION: usize = 448;
const DEFAULT_PATCH_SIZE: usize = 14;
const DEFAULT_IMAGE_FEATURE_SIZE: usize = 64;
const DEFAULT_IM_START_TOKEN: &str = "<image>";
const DEFAULT_IM_END_TOKEN: &str = "</image>";
const DEFAULT_IM_ID_START: &str = "<image_id>";
const DEFAULT_IM_ID_END: &str = "</image_id>";
const DEFAULT_SLICE_START_TOKEN: &str = "<slice>";
const DEFAULT_SLICE_END_TOKEN: &str = "</slice>";
const DEFAULT_UNK_TOKEN: &str = "<unk>";
const DEFAULT_USE_IMAGE_ID: bool = false;
const DEFAULT_SLICE_MODE: bool = true;

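/// Image (pre)processor for MiniCPM-O. Expands image tags in the prompt into
/// placeholder tokens and turns each image into normalized, patch-reshaped
/// slice tensors.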
pub struct MiniCpmOImageProcessor {
    config: PreProcessorConfig,
}

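/// Top-level MiniCPM-O processor. A minimal usage sketch (crate-internal API,
/// not compiled as a doctest):
///
/// ```ignore
/// let processor = MiniCpmOProcessor::new(processor_config, preprocessor_config, None);
/// let inputs_processor = processor.inputs_processor();
/// ```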
pub struct MiniCpmOProcessor {
    preprocessor_config: PreProcessorConfig,
}

impl MiniCpmOProcessor {
    pub fn new(
        _config: ProcessorConfig,
        preprocessor_config: PreProcessorConfig,
        _max_edge: Option<u32>,
    ) -> Self {
        Self {
            preprocessor_config,
        }
    }
}

impl Processor for MiniCpmOProcessor {
    fn inputs_processor(&self) -> Arc<dyn InputsProcessor> {
        Arc::new(MiniCpmOImageProcessor {
            config: self.preprocessor_config.clone(),
        })
    }

    fn get_special_tokens(&self) -> &[&'static str] {
        &[
            DEFAULT_IM_START_TOKEN,
            DEFAULT_IM_END_TOKEN,
            DEFAULT_SLICE_START_TOKEN,
            DEFAULT_SLICE_END_TOKEN,
            DEFAULT_UNK_TOKEN,
        ]
    }

    fn template_action(&self) -> MessagesAction {
        MessagesAction::FlattenOnlyText
    }
}

impl InputsProcessor for MiniCpmOImageProcessor {
    fn get_type(&self) -> InputsProcessorType {
        InputsProcessorType::Vision
    }
    fn process_inputs(
        &self,
        tokenizer: Option<Arc<Tokenizer>>,
        input_seqs: &mut [&mut Sequence],
        is_prompt: bool,
        is_xlora: bool,
        device: &Device,
        no_kv_cache: bool,
        last_n_context_len: Option<(usize, usize)>,
        return_raw_logits: bool,
        other_config: Option<Arc<dyn Any>>,
        mut paged_attn_metadata: Option<PagedAttentionMeta>,
        mapper: Option<&dyn DeviceMapper>,
    ) -> anyhow::Result<InputProcessorOutput> {
        if is_xlora {
            return Err(anyhow::Error::msg(
                "Cannot make inputs for X-LoRA vision model.",
            ));
        }
        if no_kv_cache {
            return Err(anyhow::Error::msg("Vision model must have kv cache."));
        }
        let Some(tokenizer) = tokenizer else {
            return Err(anyhow::Error::msg(
                "MiniCpmOImageProcessor requires a specified tokenizer.",
            ));
        };

        let config = other_config.expect("Need a PreProcessorConfig config.");
        let config: &PreProcessorConfig = config.downcast_ref().expect("Downcast failed.");

        let has_images = input_seqs.iter().all(|seq| seq.has_images());

        let (pixel_values_all, image_bound, tgt_sizes) = if has_images {
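            // Literal multimodal tags emitted by the chat template. The audio
            // regex is bound but unused here (hence the leading underscore);
            // the split pattern recognizes both tag kinds.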
            const IMAGE_TAG: &str = "(<image>./</image>)";
            const IMAGE_PATTERN: &str = r"\(<image>./</image>\)";
            const AUDIO_PATTERN: &str = r"\(<audio>./</audio>\)";

            let image_pattern = Regex::new(IMAGE_PATTERN).unwrap();
            let _audio_pattern = Regex::new(AUDIO_PATTERN).unwrap();
            let split_pattern = Regex::new(&format!(r"({IMAGE_PATTERN}|{AUDIO_PATTERN})")).unwrap();

            let mut pixel_values_accum = Vec::new();
            let mut tgt_sizes_accum = Vec::new();
            let mut image_bounds_accum = Vec::new();

            for seq in input_seqs.iter_mut() {
                let PreprocessedImages {
                    pixel_values: _,
                    pixel_attention_mask: _,
                    image_sizes: _,
                    num_img_tokens: _,
                    aspect_ratio_ids: _,
                    aspect_ratio_mask: _,
                    num_tiles: _,
                    image_grid_thw: _,
                    video_grid_thw: _,
                    rows: _,
                    cols: _,
                    pixel_values_list,
                    tgt_sizes,
                    image_sizes_all,
                    num_crops: _,
                } = self
                    .preprocess(
                        seq.take_images()
                            .expect("Need to have images by this point."),
                        vec![],
                        config,
                        device,
                        (usize::MAX, usize::MAX),
                    )
                    .expect("Preprocessing failed");
                let pixel_values_list = pixel_values_list.unwrap();
                let tgt_sizes = tgt_sizes.unwrap();
                let image_sizes_all = image_sizes_all.unwrap();

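                // Decode the current tokens back to text so each
                // `(<image>./</image>)` tag can be replaced in place with the
                // per-image placeholder string built below.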
                let text = tokenizer
                    .decode(seq.get_toks(), false)
                    .expect("Detokenization failed!");

                let mut text_chunks = {
                    let mut results = Vec::new();
                    let mut last_end = 0;

                    for m in split_pattern.find_iter(&text) {
                        if m.start() > last_end {
                            results.push((false, &text[last_end..m.start()]));
                        }
                        results.push((true, m.as_str()));
                        last_end = m.end();
                    }
                    if last_end < text.len() {
                        results.push((false, &text[last_end..]));
                    }

                    results
                        .into_iter()
                        .map(|(_, x)| x.to_string())
                        .collect::<Vec<_>>()
                };

                let image_tags = image_pattern.find_iter(&text).collect::<Vec<_>>();

                if !image_tags.is_empty() {
                    assert_eq!(image_tags.len(), image_sizes_all.len());
                }

                let mut image_id = 0;
                for chunk in &mut text_chunks {
                    if chunk == IMAGE_TAG {
                        *chunk =
                            self.get_slice_image_placeholder(image_sizes_all[image_id], image_id);
                        image_id += 1;
                    }
                }

                let final_text = text_chunks.join("");

                let input_ids = tokenizer
                    .encode_fast(final_text.clone(), false)
                    .unwrap()
                    .get_ids()
                    .to_vec();

                if !seq.multimodal.has_changed_prompt {
                    seq.set_initial_prompt(final_text.clone());

                    seq.set_toks_and_reallocate(input_ids.clone(), paged_attn_metadata.as_mut());
                    seq.multimodal.has_changed_prompt = true;
                }

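                // Image bounds are (start, end) token-index pairs delimiting
                // each image/slice placeholder region in `input_ids`; the model
                // uses them to locate where vision embeddings belong.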
                let image_bounds = {
                    let im_start_id = tokenizer
                        .encode_fast(
                            self.config
                                .im_start_token
                                .clone()
                                .unwrap_or(DEFAULT_IM_START_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];
                    let im_end_id = tokenizer
                        .encode_fast(
                            self.config
                                .im_end_token
                                .clone()
                                .unwrap_or(DEFAULT_IM_END_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];
                    let slice_start_id = tokenizer
                        .encode_fast(
                            self.config
                                .slice_start_token
                                .clone()
                                .unwrap_or(DEFAULT_SLICE_START_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];
                    let slice_end_id = tokenizer
                        .encode_fast(
                            self.config
                                .slice_end_token
                                .clone()
                                .unwrap_or(DEFAULT_SLICE_END_TOKEN.to_string()),
                            false,
                        )
                        .unwrap()
                        .get_ids()[0];

                    let image_start_idx = input_ids
                        .iter()
                        .enumerate()
                        .filter_map(|(i, &id)| {
                            if id == im_start_id || id == slice_start_id {
                                Some(i as u32 + 1)
                            } else {
                                None
                            }
                        })
                        .collect::<Vec<_>>();

                    let image_end_idx = input_ids
                        .iter()
                        .enumerate()
                        .filter_map(|(i, &id)| {
                            if id == im_end_id || id == slice_end_id {
                                Some(i as u32)
                            } else {
                                None
                            }
                        })
                        .collect::<Vec<_>>();

                    // Start and end markers should come in pairs; truncate to the
                    // shorter list so a malformed prompt cannot index out of bounds.
                    let valid_image_nums = image_start_idx.len().min(image_end_idx.len());

                    let image_start_idx = Tensor::from_slice(
                        &image_start_idx[..valid_image_nums],
                        (valid_image_nums, 1),
                        device,
                    )
                    .unwrap();
                    let image_end_idx = Tensor::from_slice(
                        &image_end_idx[..valid_image_nums],
                        (valid_image_nums, 1),
                        device,
                    )
                    .unwrap();

                    Tensor::cat(&[image_start_idx, image_end_idx], 1).unwrap()
                };

                pixel_values_accum.push(pixel_values_list);
                tgt_sizes_accum.push(tgt_sizes);
                image_bounds_accum.push(image_bounds);
            }

            (
                Some(pixel_values_accum),
                Some(image_bounds_accum),
                Some(tgt_sizes_accum),
            )
        } else {
            (None, None, None)
        };

        let text_models_inputs_processor::InnerInputProcessorOutput {
            inputs:
                text_models_inputs_processor::InputMetadata {
                    input,
                    positions,
                    context_lens,
                    position_ids,
                    paged_attn_meta,
                    flash_meta,
                },
            seq_indices,
        } = if is_prompt {
            get_prompt_input(
                input_seqs
                    .iter()
                    .map(|seq| seq.get_toks())
                    .collect::<Vec<_>>(),
                input_seqs,
                device,
                last_n_context_len,
                return_raw_logits,
                paged_attn_metadata.as_mut(),
                mapper,
            )
            .unwrap()
        } else {
            get_completion_input(
                input_seqs
                    .iter()
                    .map(|seq| seq.get_toks())
                    .collect::<Vec<_>>(),
                input_seqs,
                device,
                no_kv_cache,
                last_n_context_len,
                return_raw_logits,
                paged_attn_metadata.as_mut(),
                mapper,
            )
            .unwrap()
        };

        let args = MiniCpmOSpecificArgs {
            pixel_values_all,
            tgt_sizes,
            image_bound,
        };

        let inputs: Box<dyn Any> = Box::new(ModelInputs {
            input_ids: input,
            seqlen_offsets: positions,
            context_lens,
            position_ids,
            pixel_values: None,
            model_specific_args: Box::new(args),
            paged_attn_meta,
            flash_meta,
        });
        Ok(InputProcessorOutput {
            inputs,
            seq_indices,
        })
    }
}

impl MiniCpmOImageProcessor {
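    /// Choose an `(x, y)` slicing grid for a `w`×`h` image: candidate grids come
    /// from factorizations of the slice count, and the winner minimizes the
    /// distance to the image's aspect ratio in log space. Returns `None` if the
    /// image is small enough to stay unsliced or `never_split` is set.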
    fn get_sliced_grid(
        &self,
        (w, h): (usize, usize),
        max_slice_nums: usize,
        scale_resolution: usize,
        never_split: bool,
    ) -> Option<(usize, usize)> {
        // Cast before dividing; integer division would collapse the aspect
        // ratio to 0 or 1 for most images.
        let log_ratio = (w as f32 / h as f32).ln();
        let ratio = (w * h) as f32 / (scale_resolution * scale_resolution) as f32;
        let multiple = ratio.ceil().min(max_slice_nums as f32);
        if multiple <= 1. || never_split {
            return None;
        }

        let mut candidate_split_grid_nums = Vec::new();
        for i in [multiple - 1., multiple, multiple + 1.] {
            if i == 1. || i > max_slice_nums as f32 {
                continue;
            }
            candidate_split_grid_nums.push(i);
        }

        let mut candidate_grids = Vec::new();
        for split_grid_nums in candidate_split_grid_nums {
            let mut m = 1.;
            while m <= split_grid_nums {
                if split_grid_nums % m == 0. {
                    candidate_grids.push((m as usize, split_grid_nums as usize / m as usize));
                }
                m += 1.;
            }
        }

        let mut best_grid = (1, 1);
        let mut min_error = f32::INFINITY;
        for grid in candidate_grids {
            let error = (log_ratio - (grid.0 as f32 / grid.1 as f32).ln()).abs();
            if error < min_error {
                best_grid = grid;
                min_error = error;
            }
        }

        Some(best_grid)
    }

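    /// Round `length` to the nearest multiple of `patch_size`, with a floor of
    /// one patch.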
    fn ensure_divide(&self, length: usize, patch_size: usize) -> usize {
        ((length as f32 / patch_size as f32).round() * patch_size as f32).max(patch_size as f32)
            as usize
    }

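    /// Resize `w`×`h` so its area is roughly `scale_resolution`², preserving
    /// aspect ratio, then snap both sides to multiples of `patch_size`.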
    fn find_best_resize(
        &self,
        (mut w, mut h): (usize, usize),
        scale_resolution: usize,
        patch_size: usize,
        allow_upscale: bool,
    ) -> (usize, usize) {
        if w * h > scale_resolution * scale_resolution || allow_upscale {
            let r = w as f32 / h as f32;
            h = (scale_resolution as f32 / r.sqrt()) as usize;
            // Derive the width from the new height (w = h * r) so the aspect
            // ratio is preserved.
            w = (h as f32 * r) as usize;
        }
        let best_w = self.ensure_divide(w, patch_size);
        let best_h = self.ensure_divide(h, patch_size);
        (best_w, best_h)
    }

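    /// Size for the refined (high-resolution) image: pick the best per-cell
    /// size via `find_best_resize`, then tile it back over the
    /// `grid_x`×`grid_y` grid.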
    fn get_refine_size(
        &self,
        (w, h): (usize, usize),
        (grid_x, grid_y): (usize, usize),
        scale_resolution: usize,
        patch_size: usize,
        allow_upscale: bool,
    ) -> (usize, usize) {
        let refine_w = self.ensure_divide(w, grid_x);
        let refine_h = self.ensure_divide(h, grid_y);

        // Per-cell dimensions: width divides across the grid_x columns, height
        // across the grid_y rows.
        let grid_w = refine_w / grid_x;
        let grid_h = refine_h / grid_y;

        let best_grid_size = self.find_best_resize(
            (grid_w, grid_h),
            scale_resolution,
            patch_size,
            allow_upscale,
        );

        (best_grid_size.0 * grid_x, best_grid_size.1 * grid_y)
    }

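    /// Crop the image into a `grid.0`×`grid.1` matrix of equally sized
    /// sub-images, row by row.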
    fn split_to_patches(
        &self,
        image: &DynamicImage,
        grid: (usize, usize),
    ) -> Vec<Vec<DynamicImage>> {
        let mut patches = Vec::new();
        let (w, h) = image.dimensions();
        let (w, h) = (w as usize, h as usize);
        let grid_x = w / grid.0;
        let grid_y = h / grid.1;
        for i in (0..h).step_by(grid_y) {
            let mut images = Vec::new();
            for j in (0..w).step_by(grid_x) {
                images.push(image.crop_imm(j as u32, i as u32, grid_x as u32, grid_y as u32));
            }
            patches.push(images);
        }
        patches
    }

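    /// Produce the model's view of one image: the resized overview image first,
    /// followed by the refined slices when slice mode is enabled and a grid was
    /// chosen.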
    fn get_sliced_images(
        &self,
        image: &DynamicImage,
        max_slice_nums: usize,
        scale_resolution: usize,
        patch_size: usize,
    ) -> Vec<DynamicImage> {
        if !self.config.slice_mode.unwrap_or(DEFAULT_SLICE_MODE) {
            return vec![image.clone()];
        }

        let dims = image.dimensions();
        let (w, h) = (dims.0 as usize, dims.1 as usize);

        let best_grid = self.get_sliced_grid((w, h), max_slice_nums, scale_resolution, false);

        let (source_images, patches) = if let Some(best_grid) = best_grid {
            let best_resize = self.find_best_resize((w, h), scale_resolution, patch_size, false);
            let source_image = image.resize_exact(
                best_resize.0 as u32,
                best_resize.1 as u32,
                FilterType::CatmullRom,
            );
            let refine_size =
                self.get_refine_size((w, h), best_grid, scale_resolution, patch_size, true);
            let refine_image = image.resize_exact(
                refine_size.0 as u32,
                refine_size.1 as u32,
                FilterType::CatmullRom,
            );
            let patches = self
                .split_to_patches(&refine_image, best_grid)
                .into_iter()
                .flatten()
                .collect::<Vec<_>>();

            (source_image, patches)
        } else {
            let best_size = self.find_best_resize((w, h), scale_resolution, patch_size, true);
            let source_images = image.resize_exact(
                best_size.0 as u32,
                best_size.1 as u32,
                FilterType::CatmullRom,
            );

            (source_images, vec![])
        };

        [vec![source_images], patches].concat()
    }

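    /// Rearrange a `(c, h, w)` image tensor into `(c, patch_size, h * w / patch_size)`
    /// by extracting non-overlapping `patch_size`×`patch_size` windows (a strided
    /// unfold) and concatenating them along the width axis.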
    fn reshape_by_patch(&self, image: &Tensor, patch_size: usize) -> Result<Tensor> {
        let (_c, h, w) = image.dims3()?;
        // Kernel size
        let (kh, kw) = (patch_size, patch_size);
        // Stride
        let (sh, sw) = (patch_size, patch_size);

        let out_h = (h - kh) / sh + 1;
        let out_w = (w - kw) / sw + 1;

        // Collect each window, flattened to (c * kh * kw,)
        let mut patches = Vec::new();
        for i in 0..out_h {
            for j in 0..out_w {
                let patch = image.i((.., i * sh..i * sh + kh, j * sw..j * sw + kw))?;
                patches.push(patch.flatten_all()?);
            }
        }
        // (c * kh * kw, n_patches)
        let mut patches = Tensor::stack(&patches, 1)?;

        patches = patches.reshape((image.dim(0)?, patch_size, patch_size, ()))?;
        patches
            .permute((0, 1, 3, 2))?
            .reshape((image.dim(0)?, patch_size, ()))
    }

    fn get_image_id_placeholder(&self, image_idx: usize) -> String {
        format!(
            "{}{image_idx}{}",
            self.config
                .im_id_start
                .clone()
                .unwrap_or(DEFAULT_IM_ID_START.to_string()),
            self.config
                .im_id_end
                .clone()
                .unwrap_or(DEFAULT_IM_ID_END.to_string())
        )
    }

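    /// Textual placeholder for a sliced grid: each cell expands to
    /// `<slice><unk>...</slice>`, cells in a row are concatenated, and rows are
    /// joined with `\n` (e.g. a 2×2 grid yields two lines of two cells each).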
    fn get_grid_placeholder(&self, grid: Option<(usize, usize)>) -> String {
        if let Some(grid) = grid {
            let slice_image_placeholder = format!(
                "{}{}{}",
                self.config
                    .slice_start_token
                    .clone()
                    .unwrap_or(DEFAULT_SLICE_START_TOKEN.to_string()),
                self.config
                    .unk_token
                    .clone()
                    .unwrap_or(DEFAULT_UNK_TOKEN.to_string())
                    .repeat(
                        self.config
                            .image_feature_size
                            .unwrap_or(DEFAULT_IMAGE_FEATURE_SIZE)
                    ),
                self.config
                    .slice_end_token
                    .clone()
                    .unwrap_or(DEFAULT_SLICE_END_TOKEN.to_string())
            );

            let (cols, rows) = grid;
            let mut slices = Vec::new();
            for _ in 0..rows {
                let mut lines = Vec::new();
                for _ in 0..cols {
                    lines.push(slice_image_placeholder.clone());
                }
                slices.push(lines.join(""));
            }

            slices.join("\n")
        } else {
            "".to_string()
        }
    }

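    /// Full placeholder for one image: an optional `<image_id>{i}</image_id>`
    /// prefix, the `<image><unk>...</image>` overview placeholder, and, in slice
    /// mode, the grid placeholder for the refined slices.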
    fn get_slice_image_placeholder(&self, image_size: (u32, u32), image_idx: usize) -> String {
        let max_slice_nums = self.config.max_slice_nums.unwrap_or(DEFAULT_MAX_SLICE_NUMS);
        let use_image_id = self.config.use_image_id.unwrap_or(DEFAULT_USE_IMAGE_ID);
        let slice_mode = self.config.slice_mode.unwrap_or(DEFAULT_SLICE_MODE);

        let grid = self.get_sliced_grid(
            (image_size.0 as usize, image_size.1 as usize),
            max_slice_nums,
            DEFAULT_SCALE_RESOLUTION,
            false,
        );

        let image_placeholder = format!(
            "{}{}{}",
            self.config
                .im_start_token
                .clone()
                .unwrap_or(DEFAULT_IM_START_TOKEN.to_string()),
            self.config
                .unk_token
                .clone()
                .unwrap_or(DEFAULT_UNK_TOKEN.to_string())
                .repeat(
                    self.config
                        .image_feature_size
                        .unwrap_or(DEFAULT_IMAGE_FEATURE_SIZE)
                ),
            self.config
                .im_end_token
                .clone()
                .unwrap_or(DEFAULT_IM_END_TOKEN.to_string())
        );

        let final_placeholder = if use_image_id {
            format!(
                "{}{image_placeholder}",
                self.get_image_id_placeholder(image_idx)
            )
        } else {
            image_placeholder
        };

        if slice_mode {
            format!("{final_placeholder}{}", self.get_grid_placeholder(grid))
        } else {
            final_placeholder
        }
    }
}

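// Default normalization is mean/std 0.5 per channel; assuming `ToTensor`
// scales pixels to [0, 1], this maps them to [-1, 1].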
impl ImagePreProcessor for MiniCpmOImageProcessor {
    #[allow(clippy::excessive_precision)]
    const DEFAULT_MEAN: [f64; 3] = [0.5, 0.5, 0.5];
    #[allow(clippy::excessive_precision)]
    const DEFAULT_STD: [f64; 3] = [0.5, 0.5, 0.5];

    fn preprocess(
        &self,
        images: Vec<DynamicImage>,
        _videos: Vec<Vec<DynamicImage>>,
        config: &PreProcessorConfig,
        device: &Device,
        (_bs, _max_num_images): (usize, usize),
    ) -> Result<PreprocessedImages> {
        let mut pixel_values = Vec::new();
        let mut tgt_sizes = Vec::new();
        let image_sizes = images
            .iter()
            .map(|img| img.dimensions())
            .collect::<Vec<_>>();
        for image in images {
            let max_slice_nums = config.max_slice_nums.unwrap_or(DEFAULT_MAX_SLICE_NUMS);
            let scale_resolution = config.scale_resolution.unwrap_or(DEFAULT_SCALE_RESOLUTION);
            let patch_size = config.patch_size.unwrap_or(DEFAULT_PATCH_SIZE);

            let image_patches =
                self.get_sliced_images(&image, max_slice_nums, scale_resolution, patch_size);

            for slice_image in image_patches {
                let (w, h) = slice_image.dimensions();
                let to_tensor_rescale = Transforms {
                    input: &ToTensor,
                    inner_transforms: &[&Normalize {
                        mean: config.image_mean.unwrap_or(Self::DEFAULT_MEAN).to_vec(),
                        std: config.image_std.unwrap_or(Self::DEFAULT_STD).to_vec(),
                    }],
                };
                let mut image = slice_image.apply(to_tensor_rescale, device)?;
                image = self.reshape_by_patch(&image, patch_size)?;
                pixel_values.push(image);
                // Target size of each slice, in patches: (h / patch_size, w / patch_size).
                tgt_sizes.push(Tensor::from_vec(
                    vec![h / patch_size as u32, w / patch_size as u32],
                    (1, 2),
                    &Device::Cpu,
                )?);
            }
        }

        let tgt_sizes = Tensor::cat(&tgt_sizes, 0)?.to_device(device)?;
        Ok(PreprocessedImages {
            // `pixel_values` is unused for this model; the per-slice tensors
            // are returned via `pixel_values_list`.
            pixel_values: Tensor::new(0u32, &Device::Cpu)?,
            pixel_attention_mask: None,
            image_sizes: None,
            num_img_tokens: None,
            aspect_ratio_ids: None,
            aspect_ratio_mask: None,
            num_tiles: None,
            image_grid_thw: None,
            video_grid_thw: None,
            rows: None,
            cols: None,
            pixel_values_list: Some(pixel_values),
            tgt_sizes: Some(tgt_sizes),
            image_sizes_all: Some(image_sizes),
            num_crops: None,
        })
    }
}