// mistralrs_core/vision_models/image_processor.rs

#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]

use candle_core::{Device, Result, Tensor};
use image::DynamicImage;

use crate::pipeline::InputsProcessor;

use super::preprocessor_config::PreProcessorConfig;

10#[allow(dead_code)]
11pub(crate) struct PreprocessedImages {
12    /// Without batch size, safe to unsqueeze & concat in dim0
13    /// For QwenVL2: may be vision pixel values, depending on if image_thw or video_thw are specified
14    pub(crate) pixel_values: Tensor,
15    /// Without batch size, safe to unsqueeze & concat in dim0
16    pub(crate) pixel_attention_mask: Option<Tensor>,
17    /// (w, h)
18    pub(crate) image_sizes: Option<(usize, usize)>,
19    pub(crate) num_img_tokens: Option<Vec<usize>>,
20    /// Without batch size, safe to unsqueeze & concat in dim0
21    pub(crate) aspect_ratio_ids: Option<Tensor>,
22    /// Without batch size, safe to unsqueeze & concat in dim0
23    pub(crate) aspect_ratio_mask: Option<Tensor>,
24    /// Without batch size
25    pub(crate) num_tiles: Option<Vec<usize>>,
26    /// Without batch size, safe to unsqueeze & concat in dim0
27    pub(crate) image_grid_thw: Option<Tensor>,
28    /// Without batch size, safe to unsqueeze & concat in dim0
29    pub(crate) video_grid_thw: Option<Tensor>,
30    /// Without batch size
31    pub(crate) rows: Option<Vec<usize>>,
32    /// Without batch size
33    pub(crate) cols: Option<Vec<usize>>,
34    /// Without batch size. Only images.
35    pub(crate) pixel_values_list: Option<Vec<Tensor>>,
36    /// Without batch size, safe to unsqueeze & concat in dim0
37    pub(crate) tgt_sizes: Option<Tensor>,
38    /// Without batch size. Per image. (h, w)
39    pub(crate) image_sizes_all: Option<Vec<(u32, u32)>>,
40    /// Without batch size
41    pub(crate) num_crops: Option<Vec<usize>>,
42}
43
44/// ImagePreProcessor: process images for the model (similar to `InputsProcessor`, typically called by it)
45pub trait ImagePreProcessor: InputsProcessor {
46    const DEFAULT_MEAN: [f64; 3];
47    const DEFAULT_STD: [f64; 3];
48
49    /// Preprocess the images for a specific batch.
50    /// `(bs, max_num_images)`, max_num_images is the max images per batches.
51    /// Pixel values are in [0, 255]
52    #[allow(clippy::too_many_arguments)]
53    fn preprocess(
54        &self,
55        images: Vec<DynamicImage>,
56        videos: Vec<Vec<DynamicImage>>,
57        config: &PreProcessorConfig,
58        device: &Device,
59        batch_info: (usize, usize),
60    ) -> Result<PreprocessedImages>;
61}