// mistralrs_core/vision_models/image_processor.rs
#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
use candle_core::{Device, Result, Tensor};
use image::DynamicImage;
use crate::pipeline::InputsProcessor;
use super::preprocessor_config::PreProcessorConfig;
/// Output of [`ImagePreProcessor::preprocess`]: the pixel tensor plus optional
/// per-model metadata.
///
/// Fields described as "without batch size" carry no leading batch dimension; the
/// tensor-valued ones can be unsqueezed and concatenated along dim 0 to form a batch.
/// Every field except `pixel_values` is optional.
// NOTE(review): `dead_code` is presumably allowed because not every field is read by
// every model — confirm, and narrow the allow if possible.
#[allow(dead_code)]
pub(crate) struct PreprocessedImages {
/// Pixel values, without batch size; safe to unsqueeze & concat in dim0.
/// For QwenVL2: may be vision pixel values, depending on whether image_thw or
/// video_thw are specified (presumably the `image_grid_thw` / `video_grid_thw`
/// fields below — TODO confirm).
pub(crate) pixel_values: Tensor,
/// Attention mask over pixels, without batch size; safe to unsqueeze & concat in dim0.
pub(crate) pixel_attention_mask: Option<Tensor>,
/// A single pair of image dimensions.
/// NOTE(review): the ordering of the pair (height/width vs. width/height) is not
/// established here — confirm against the producers of this struct.
pub(crate) image_sizes: Option<(usize, usize)>,
/// Image token counts.
/// NOTE(review): presumably one entry per image — confirm.
pub(crate) num_img_tokens: Option<Vec<usize>>,
/// Aspect ratio ids, without batch size; safe to unsqueeze & concat in dim0.
pub(crate) aspect_ratio_ids: Option<Tensor>,
/// Aspect ratio mask, without batch size; safe to unsqueeze & concat in dim0.
pub(crate) aspect_ratio_mask: Option<Tensor>,
/// Tile counts, without batch size.
/// NOTE(review): presumably one entry per image — confirm.
pub(crate) num_tiles: Option<Vec<usize>>,
/// Image grid `thw`, without batch size; safe to unsqueeze & concat in dim0.
pub(crate) image_grid_thw: Option<Tensor>,
/// Video grid `thw`, without batch size; safe to unsqueeze & concat in dim0.
pub(crate) video_grid_thw: Option<Tensor>,
/// Row counts, without batch size.
/// NOTE(review): presumably rows of a per-image tile/patch grid — confirm.
pub(crate) rows: Option<Vec<usize>>,
/// Column counts, without batch size.
/// NOTE(review): presumably columns of a per-image tile/patch grid — confirm.
pub(crate) cols: Option<Vec<usize>>,
}
/// ImagePreProcessor: process images for the model.
///
/// Similar in role to `InputsProcessor` (which is a supertrait of this trait) and
/// typically called by it.
pub trait ImagePreProcessor: InputsProcessor {
/// Default mean, three values.
/// NOTE(review): presumably one value per colour channel, used for normalization
/// when the config does not supply a mean — confirm against implementors.
const DEFAULT_MEAN: [f64; 3];
/// Default standard deviation, three values.
/// NOTE(review): presumably one value per colour channel, used for normalization
/// when the config does not supply a std — confirm against implementors.
const DEFAULT_STD: [f64; 3];
/// Preprocess the images for a specific batch.
///
/// Pixel values are in [0, 255].
/// NOTE(review): the original comment does not say whether this range describes the
/// incoming images or the returned tensor — confirm.
///
/// # Arguments
/// * `images` - the still images to preprocess.
/// * `videos` - a list of image sequences (presumably the frames of each video —
///   TODO confirm).
/// * `config` - the preprocessor configuration.
/// * `device` - a candle device (presumably where the output tensors are placed —
///   TODO confirm).
/// * `batch_info` - `(bs, max_num_images)`: the batch size and the maximum number of
///   images per batch.
///
/// # Errors
/// Returns a `candle_core` error if preprocessing fails.
// NOTE(review): this signature has five parameters plus `&self`; check whether the
// `too_many_arguments` allow is still needed under the project's clippy settings.
#[allow(clippy::too_many_arguments)]
fn preprocess(
&self,
images: Vec<DynamicImage>,
videos: Vec<Vec<DynamicImage>>,
config: &PreProcessorConfig,
device: &Device,
batch_info: (usize, usize),
) -> Result<PreprocessedImages>;
}