mistralrs_core/vision_models/image_processor.rs

#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]

use candle_core::{Device, Result, Tensor};
use image::DynamicImage;

use crate::pipeline::InputsProcessor;

use super::preprocessor_config::PreProcessorConfig;

#[allow(dead_code)]
pub(crate) struct PreprocessedImages {
    /// Without batch size, safe to unsqueeze & concat in dim0
    /// For Qwen2-VL: these may be image or video pixel values, depending on whether `image_grid_thw` or `video_grid_thw` is specified
    pub(crate) pixel_values: Tensor,
    /// Without batch size, safe to unsqueeze & concat in dim0
    pub(crate) pixel_attention_mask: Option<Tensor>,
    pub(crate) image_sizes: Option<(usize, usize)>,
    pub(crate) num_img_tokens: Option<Vec<usize>>,
    /// Without batch size, safe to unsqueeze & concat in dim0
    pub(crate) aspect_ratio_ids: Option<Tensor>,
    /// Without batch size, safe to unsqueeze & concat in dim0
    pub(crate) aspect_ratio_mask: Option<Tensor>,
    /// Without batch size
    pub(crate) num_tiles: Option<Vec<usize>>,
    /// Without batch size, safe to unsqueeze & concat in dim0
    pub(crate) image_grid_thw: Option<Tensor>,
    /// Without batch size, safe to unsqueeze & concat in dim0
    pub(crate) video_grid_thw: Option<Tensor>,
    /// Without batch size
    pub(crate) rows: Option<Vec<usize>>,
    /// Without batch size
    pub(crate) cols: Option<Vec<usize>>,
}
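
// A minimal, illustrative sketch (an assumption, not part of the crate's API) of how the
// per-image outputs above can be batched. As the field docs note, each `pixel_values`
// tensor lacks a batch dimension, so it is safe to unsqueeze in dim 0 and concatenate
// along dim 0.
#[allow(dead_code)]
fn batch_pixel_values(per_image: &[PreprocessedImages]) -> Result<Tensor> {
    // Add a leading batch dimension of 1 to each per-image tensor.
    let unsqueezed = per_image
        .iter()
        .map(|p| p.pixel_values.unsqueeze(0))
        .collect::<Result<Vec<_>>>()?;
    // Stack them into a single `(bs, ...)` batch tensor.
    Tensor::cat(&unsqueezed, 0)
}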

/// ImagePreProcessor: processes images for the model (similar to `InputsProcessor`, and typically called by it)
pub trait ImagePreProcessor: InputsProcessor {
    const DEFAULT_MEAN: [f64; 3];
    const DEFAULT_STD: [f64; 3];

    /// Preprocess the images for a specific batch.
    /// `batch_info` is `(bs, max_num_images)`, where `max_num_images` is the maximum number of images per batch.
    /// Pixel values are in `[0, 255]`.
    #[allow(clippy::too_many_arguments)]
    fn preprocess(
        &self,
        images: Vec<DynamicImage>,
        videos: Vec<Vec<DynamicImage>>,
        config: &PreProcessorConfig,
        device: &Device,
        batch_info: (usize, usize),
    ) -> Result<PreprocessedImages>;
}
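
// A minimal sketch (not part of this crate) of the resize -> rescale -> normalize steps
// that a concrete `preprocess` implementation typically performs on each image before
// batching. The 224x224 target size and CatmullRom filter are illustrative assumptions;
// a real implementation would take sizes, means, and stds from `PreProcessorConfig`,
// with `DEFAULT_MEAN` / `DEFAULT_STD` as fallbacks.
#[allow(dead_code)]
fn to_normalized_chw(
    image: &DynamicImage,
    mean: [f64; 3],
    std: [f64; 3],
    device: &Device,
) -> Result<Tensor> {
    use candle_core::DType;

    // Resize to a fixed square resolution (assumed 224x224 here for illustration).
    let resized = image.resize_exact(224, 224, image::imageops::FilterType::CatmullRom);
    let rgb = resized.to_rgb8();
    let (w, h) = rgb.dimensions();

    // HWC u8 buffer -> f32 tensor with values in [0, 255].
    let data = rgb.into_raw();
    let pixels = Tensor::from_vec(data, (h as usize, w as usize, 3), device)?
        .to_dtype(DType::F32)?;

    // Rescale to [0, 1], then normalize per channel with (x - mean) / std.
    let pixels = pixels.affine(1.0 / 255.0, 0.0)?;
    let mean = Tensor::new(&[mean[0] as f32, mean[1] as f32, mean[2] as f32], device)?;
    let std = Tensor::new(&[std[0] as f32, std[1] as f32, std[2] as f32], device)?;
    let pixels = pixels.broadcast_sub(&mean)?.broadcast_div(&std)?;

    // HWC -> CHW, the layout expected by vision towers.
    pixels.permute((2, 0, 1))
}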