#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]

use std::{
    any::Any,
    collections::HashMap,
    sync::{Arc, RwLock},
};

use candle_core::{Context, DType, Device, Result, Tensor};
use image::{imageops::FilterType, DynamicImage};
use itertools::Itertools;
use mistralrs_vision::{
    ApplyTensorTransforms, ApplyTransforms, Normalize, Rescale, TensorTransforms, ToTensorNoNorm,
    Transforms,
};
use tokenizers::Tokenizer;

use crate::{
    device_map::DeviceMapper,
    pipeline::{
        text_models_inputs_processor::{
            self, get_completion_input, get_prompt_input, PagedAttentionMeta,
        },
        InputProcessorOutput, InputsProcessor, InputsProcessorType, MessagesAction, Processor,
    },
    sequence::Sequence,
    vision_models::{
        image_processor::{ImagePreProcessor, PreprocessedImages},
        preprocessor_config::{PreProcessorConfig, ToFilter},
        ModelInputs,
    },
};

use super::MLlamaSpecificArgs;

const IMAGE_TOKEN: &str = "<|image|>";

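/// Image inputs processor for MLlama.
///
/// `max_image_tiles` is cached by `preprocess` and read again in `process_inputs` when the
/// dense cross-attention mask is built.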
struct MLlamaImageProcessor {
    max_image_tiles: RwLock<Option<usize>>,
}

pub struct MLlamaProcessor;

impl MLlamaProcessor {
    pub fn new() -> Self {
        Self
    }
}

impl Processor for MLlamaProcessor {
    fn inputs_processor(&self) -> Arc<dyn InputsProcessor> {
        Arc::new(MLlamaImageProcessor {
            max_image_tiles: RwLock::new(None),
        })
    }

    fn get_special_tokens(&self) -> &[&'static str] {
        &[IMAGE_TOKEN, "<|python_tag|>"]
    }

    fn template_action(&self) -> MessagesAction {
        MessagesAction::FlattenOnlyText
    }
}

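/// For each `<|image|>` token in `input_ids`, return the `(start, end)` span of tokens that
/// should cross-attend to that image. A single image yields `(position, -1)`, meaning "attend
/// until the end of the sequence"; otherwise each span runs to the next image token, and a span
/// covering only a single token (consecutive image tokens) is extended to the following span's
/// end.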
fn get_cross_attention_token_mask(input_ids: Vec<u32>, image_token_id: u32) -> Vec<(i64, i64)> {
    let image_token_locations = input_ids
        .iter()
        .positions(|token| *token == image_token_id)
        .collect::<Vec<_>>();

    if image_token_locations.is_empty() {
        return vec![];
    }

    if image_token_locations.len() == 1 {
        return vec![(image_token_locations[0] as i64, -1)];
    }

    let mut vision_masks = image_token_locations[..image_token_locations.len() - 1]
        .iter()
        .zip(&image_token_locations[1..])
        .map(|(a, b)| (*a as i64, *b as i64))
        .collect::<Vec<_>>();

    vision_masks.push((
        *image_token_locations.last().unwrap() as i64,
        input_ids.len() as i64,
    ));

    let mut last_mask_end = vision_masks.last().unwrap().1;
    for vision_mask in vision_masks.iter_mut().rev() {
        if vision_mask.0 == vision_mask.1 - 1 {
            vision_mask.1 = last_mask_end;
        }
        last_mask_end = vision_mask.1;
    }

    vision_masks
}

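/// Expand the sparse per-sequence `(start, end)` spans into a dense
/// `(bs, length, max_num_images, max_num_tiles)` I64 mask of ones, truncating spans at `length`
/// and treating an end of `-1` as "until the end of the sequence". The mask is assembled on the
/// CPU and moved to `dev` at the end.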
fn convert_sparse_cross_attention_mask_to_dense(
    cross_attn_token_mask: Vec<Vec<(i64, i64)>>,
    num_tiles: Vec<Vec<usize>>,
    max_num_tiles: usize,
    length: usize,
    dev: &Device,
) -> candle_core::Result<Tensor> {
    let bs = cross_attn_token_mask.len();
    let max_num_images = cross_attn_token_mask.iter().map(|x| x.len()).max().unwrap();

    let mut cross_attention_mask = Tensor::zeros(
        (bs, length, max_num_images, max_num_tiles),
        DType::I64,
        &Device::Cpu,
    )?;

    for (sample_idx, (sample_masks, sample_num_tiles)) in
        cross_attn_token_mask.into_iter().zip(num_tiles).enumerate()
    {
        for (mask_idx, ((start, end), mask_num_tiles)) in
            sample_masks.into_iter().zip(sample_num_tiles).enumerate()
        {
            let mut end = end.min(length as i64);
            if end == -1 {
                end = length as i64;
            }
            cross_attention_mask = cross_attention_mask.slice_assign(
                &[
                    sample_idx..sample_idx + 1,
                    start as usize..end as usize,
                    mask_idx..mask_idx + 1,
                    0..mask_num_tiles,
                ],
                &Tensor::ones(
                    (1, end as usize - start as usize, 1, mask_num_tiles),
                    DType::I64,
                    &Device::Cpu,
                )?,
            )?;
        }
    }

    cross_attention_mask.to_device(dev)
}

impl InputsProcessor for MLlamaImageProcessor {
    fn get_type(&self) -> InputsProcessorType {
        InputsProcessorType::Vision
    }

    fn process_inputs(
        &self,
        tokenizer: Option<Arc<Tokenizer>>,
        input_seqs: &mut [&mut Sequence],
        is_prompt: bool,
        is_xlora: bool,
        device: &Device,
        no_kv_cache: bool,
        last_n_context_len: Option<(usize, usize)>,
        return_raw_logits: bool,
        other_config: Option<Arc<dyn Any>>,
        mut paged_attn_metadata: Option<PagedAttentionMeta>,
        mapper: Option<&dyn DeviceMapper>,
    ) -> anyhow::Result<InputProcessorOutput> {
        if is_xlora {
            return Err(anyhow::Error::msg(
                "Cannot make inputs for X-LoRA vision model.",
            ));
        }
        if no_kv_cache {
            return Err(anyhow::Error::msg("Vision model must have kv cache."));
        }
        let Some(tokenizer) = tokenizer else {
            return Err(anyhow::Error::msg(
                "MLlamaInputProcessor requires a specified tokenizer.",
            ));
        };

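        // First pass: only the flattened input token ids are needed here; they are used below to
        // derive the per-sequence cross-attention token mask. The full inputs (positions, paged
        // attention metadata, ...) are rebuilt after image preprocessing.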
        let text_models_inputs_processor::InnerInputProcessorOutput {
            inputs:
                text_models_inputs_processor::InputMetadata {
                    input,
                    positions: _,
                    context_lens: _,
                    position_ids: _,
                    paged_attn_meta: _,
                    flash_meta: _,
                },
            seq_indices: _,
        } = if is_prompt {
            get_prompt_input(
                input_seqs
                    .iter()
                    .map(|seq| seq.get_toks())
                    .collect::<Vec<_>>(),
                input_seqs,
                device,
                last_n_context_len,
                return_raw_logits,
                paged_attn_metadata.as_mut(),
                mapper,
            )
            .unwrap()
        } else {
            get_completion_input(
                input_seqs
                    .iter()
                    .map(|seq| seq.get_toks())
                    .collect::<Vec<_>>(),
                input_seqs,
                device,
                no_kv_cache,
                last_n_context_len,
                return_raw_logits,
                paged_attn_metadata.as_mut(),
                mapper,
            )
            .unwrap()
        };
        let config = other_config.expect("Need a PreProcessorConfig config.");
        let config: &PreProcessorConfig = config.downcast_ref().expect("Downcast failed.");

        let has_images = input_seqs.iter().all(|seq| seq.has_images());

        let (pixel_values, aspect_ratio_ids, aspect_ratio_mask, cross_attn_mask) = if has_images {
            let mut pixel_values_accum = Vec::new();
            let mut aspect_ratio_ids_accum = Vec::new();
            let mut aspect_ratio_mask_accum = Vec::new();
            let mut num_tiles_accum = Vec::new();

            let bs = input_seqs.len();
            let detokenized = tokenizer
                .decode_batch(
                    &input_seqs
                        .iter()
                        .map(|seq| seq.get_toks())
                        .collect::<Vec<_>>(),
                    false,
                )
                .expect("Detokenization failed!");
            let n_images_in_text = detokenized
                .iter()
                .map(|text| text.matches(IMAGE_TOKEN).count())
                .collect::<Vec<_>>();
            let n_images_in_images = input_seqs
                .iter()
                .map(|seq| seq.images().map(|imgs| imgs.len()).unwrap_or(0))
                .collect::<Vec<_>>();

            if n_images_in_text != n_images_in_images {
                return Err(anyhow::Error::msg(format!(
                    "The number of `{IMAGE_TOKEN}` tokens per prompt {n_images_in_text:?} should match the number of images supplied {n_images_in_images:?}. Perhaps you forgot an `{IMAGE_TOKEN}` tag?"
                )));
            }

            let max_num_images = *n_images_in_images
                .iter()
                .max()
                .expect("No max images per batch!");

            for seq in input_seqs.iter_mut() {
                let PreprocessedImages {
                    pixel_values,
                    pixel_attention_mask: _,
                    image_sizes: _,
                    num_img_tokens: _,
                    aspect_ratio_ids,
                    aspect_ratio_mask,
                    num_tiles,
                    image_grid_thw: _,
                    video_grid_thw: _,
                    rows: _,
                    cols: _,
                    pixel_values_list: _,
                    tgt_sizes: _,
                    image_sizes_all: _,
                    num_crops: _,
                } = self
                    .preprocess(
                        seq.take_images()
                            .expect("Need to have images by this point."),
                        vec![],
                        config,
                        device,
                        (bs, max_num_images),
                    )
                    .expect("Preprocessing failed");
                pixel_values_accum.push(pixel_values.unsqueeze(0).unwrap());
                aspect_ratio_ids_accum.push(aspect_ratio_ids.unwrap().unsqueeze(0).unwrap());
                aspect_ratio_mask_accum.push(aspect_ratio_mask.unwrap().unsqueeze(0).unwrap());
                num_tiles_accum.push(num_tiles.unwrap());

                seq.multimodal.has_changed_prompt = true;
            }

            let image_token_id = tokenizer
                .encode_fast(IMAGE_TOKEN, false)
                .unwrap()
                .get_ids()
                .to_vec();
            let image_token_id = if image_token_id.len() == 1 {
                image_token_id[0]
            } else {
                panic!("{IMAGE_TOKEN} encoding should be one token, got {image_token_id:?}");
            };
            let chunks = input.chunk(input.dim(0).unwrap(), 0).unwrap();
            let cross_attention_token_mask = chunks
                .iter()
                .map(|token_ids| {
                    get_cross_attention_token_mask(
                        token_ids.squeeze(0).unwrap().to_vec1::<u32>().unwrap(),
                        image_token_id,
                    )
                })
                .collect::<Vec<_>>();

            let cross_attn_mask = convert_sparse_cross_attention_mask_to_dense(
                cross_attention_token_mask,
                num_tiles_accum,
                self.max_image_tiles
                    .read()
                    .unwrap()
                    .expect("`max_image_tiles` must be set!"),
                chunks
                    .iter()
                    .map(|input_ids| *input_ids.dims().last().unwrap())
                    .max()
                    .unwrap(),
                chunks[0].device(),
            );

            let cross_attn_mask = match cross_attn_mask {
                Ok(v) => v,
                Err(e) => return Err(anyhow::Error::msg(e.to_string())),
            };

            (
                Some(Tensor::cat(&pixel_values_accum, 0).unwrap()),
                Some(Tensor::cat(&aspect_ratio_ids_accum, 0).unwrap()),
                Some(Tensor::cat(&aspect_ratio_mask_accum, 0).unwrap()),
                Some(cross_attn_mask),
            )
        } else {
            (None, None, None, None)
        };

        let text_models_inputs_processor::InnerInputProcessorOutput {
            inputs:
                text_models_inputs_processor::InputMetadata {
                    input,
                    positions,
                    context_lens,
                    position_ids,
                    paged_attn_meta,
                    flash_meta,
                },
            seq_indices,
        } = if is_prompt {
            get_prompt_input(
                input_seqs
                    .iter()
                    .map(|seq| seq.get_toks())
                    .collect::<Vec<_>>(),
                input_seqs,
                device,
                last_n_context_len,
                return_raw_logits,
                paged_attn_metadata.as_mut(),
                mapper,
            )
            .unwrap()
        } else {
            get_completion_input(
                input_seqs
                    .iter()
                    .map(|seq| seq.get_toks())
                    .collect::<Vec<_>>(),
                input_seqs,
                device,
                no_kv_cache,
                last_n_context_len,
                return_raw_logits,
                paged_attn_metadata.as_mut(),
                mapper,
            )
            .unwrap()
        };

        let inputs: Box<dyn Any> = Box::new(ModelInputs {
            input_ids: input,
            seqlen_offsets: positions,
            context_lens,
            position_ids,
            pixel_values,
            model_specific_args: Box::new(MLlamaSpecificArgs {
                aspect_ratio_ids,
                aspect_ratio_mask,
                cross_attn_mask,
            }),
            paged_attn_meta,
            flash_meta,
        });
        Ok(InputProcessorOutput {
            inputs,
            seq_indices,
        })
    }
}

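/// Index of the smallest element of `iter`, or `None` if the iterator is empty.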
fn argmin<T, I>(iter: I) -> Option<usize>
where
    T: PartialOrd,
    I: Iterator<Item = T>,
{
    iter.enumerate()
        .fold(None, |min, (idx, item)| match min {
            None => Some((idx, item)),
            Some((min_idx, min_item)) => {
                if item < min_item {
                    Some((idx, item))
                } else {
                    Some((min_idx, min_item))
                }
            }
        })
        .map(|(min_idx, _)| min_idx)
}

impl MLlamaImageProcessor {
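    /// All `(width, height)` tile arrangements whose product does not exceed `max_image_tiles`.
    /// For example, `max_image_tiles = 4` yields
    /// `[(1, 1), (1, 2), (1, 3), (1, 4), (2, 1), (2, 2), (3, 1), (4, 1)]`.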
    fn get_all_supported_aspect_ratios(max_image_tiles: usize) -> Vec<(usize, usize)> {
        (1..max_image_tiles + 1)
            .flat_map(|width| {
                (1..max_image_tiles + 1).filter_map(move |height| {
                    if width * height <= max_image_tiles {
                        Some((width, height))
                    } else {
                        None
                    }
                })
            })
            .collect::<Vec<_>>()
    }

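    /// Choose the tiled canvas (in pixels) that best fits the image: prefer the smallest scale
    /// that still upscales the image onto a candidate canvas; if no canvas requires upscaling,
    /// take the largest downscale. Ties between canvases at the selected scale are broken by the
    /// smallest canvas area. Returns `(canvas_height, canvas_width)`.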
    fn get_optimal_tiled_canvas(
        image_height: u32,
        image_width: u32,
        max_image_tiles: usize,
        tile_size: usize,
    ) -> Result<(usize, usize)> {
        let possible_tile_arrangements = Self::get_all_supported_aspect_ratios(max_image_tiles);
        let possible_canvas_sizes: (Vec<_>, Vec<_>) = possible_tile_arrangements
            .into_iter()
            .map(|(h, w)| (h * tile_size, w * tile_size))
            .unzip();
        let (target_heights, target_widths) = possible_canvas_sizes;

        let scale_h = target_heights
            .iter()
            .map(|h| *h as f32 / image_height as f32)
            .collect::<Vec<_>>();
        let scale_w = target_widths
            .iter()
            .map(|w| *w as f32 / image_width as f32)
            .collect::<Vec<_>>();

        let scales = scale_h
            .into_iter()
            .zip(scale_w)
            .map(|(scale_h, scale_w)| if scale_w > scale_h { scale_h } else { scale_w })
            .collect::<Vec<_>>();

        let upscaling_options = scales
            .iter()
            .copied()
            .filter(|scale| *scale >= 1.)
            .collect::<Vec<_>>();
        let selected_scale = if !upscaling_options.is_empty() {
            upscaling_options
                .into_iter()
                .min_by(|x, y| x.partial_cmp(y).expect("No ordering!"))
                .context("No min, upscale")?
        } else {
            let downscaling_options = scales
                .iter()
                .copied()
                .filter(|scale| *scale < 1.)
                .collect::<Vec<_>>();
            downscaling_options
                .into_iter()
                .max_by(|x, y| x.partial_cmp(y).expect("No ordering!"))
                .context("No max, downscale")?
        };

        let chosen_canvas_h = target_heights
            .iter()
            .copied()
            .enumerate()
            .filter_map(|(i, h)| {
                if scales[i] == selected_scale {
                    Some(h)
                } else {
                    None
                }
            })
            .collect::<Vec<_>>();
        let chosen_canvas_w = target_widths
            .iter()
            .copied()
            .enumerate()
            .filter_map(|(i, w)| {
                if scales[i] == selected_scale {
                    Some(w)
                } else {
                    None
                }
            })
            .collect::<Vec<_>>();

        assert_eq!(chosen_canvas_h.len(), chosen_canvas_w.len());
        if chosen_canvas_h.len() > 1 {
            let optimal_idx = argmin(
                chosen_canvas_h
                    .iter()
                    .zip(&chosen_canvas_w)
                    .map(|(h, w)| *h * *w),
            )
            .context("No argmin")?;
            Ok((chosen_canvas_h[optimal_idx], chosen_canvas_w[optimal_idx]))
        } else {
            Ok((chosen_canvas_h[0], chosen_canvas_w[0]))
        }
    }

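    /// Compute the `(new_height, new_width)` the image should be resized to so that it fits
    /// within the `canvas_height` x `canvas_width` canvas while keeping its aspect ratio; each
    /// per-side target is first clamped to at least `tile_size` and at most the canvas size.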
    fn get_image_size_fit_to_canvas(
        image_height: u32,
        image_width: u32,
        canvas_height: usize,
        canvas_width: usize,
        tile_size: usize,
    ) -> (usize, usize) {
        let target_width = (image_width as usize).clamp(tile_size, canvas_width);
        let target_height = (image_height as usize).clamp(tile_size, canvas_height);

        let scale_h = (target_height as f32) / (image_height as f32);
        let scale_w = (target_width as f32) / (image_width as f32);

        if scale_w < scale_h {
            (
                target_height.min((image_height as f32 * scale_w).floor() as usize),
                target_width,
            )
        } else {
            (
                target_height,
                target_width.min((image_width as f32 * scale_h).floor() as usize),
            )
        }
    }

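    /// Resize the image to fit the optimal tiled canvas for `max_image_tiles`, returning the
    /// resized image together with the chosen `(num_tiles_height, num_tiles_width)` arrangement.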
    fn resize(
        &self,
        image: DynamicImage,
        size: &HashMap<String, u32>,
        max_image_tiles: usize,
        filter: FilterType,
    ) -> Result<(DynamicImage, (usize, usize))> {
        let image_height = image.height();
        let image_width = image.width();
        let tile_size = size["height"] as usize;

        let (canvas_height, canvas_width) =
            Self::get_optimal_tiled_canvas(image_height, image_width, max_image_tiles, tile_size)?;
        let num_tiles_height = canvas_height / tile_size;
        let num_tiles_width = canvas_width / tile_size;

        let (new_height, new_width) = Self::get_image_size_fit_to_canvas(
            image_height,
            image_width,
            canvas_height,
            canvas_width,
            tile_size,
        );

        Ok((
            image.resize_exact(new_width as u32, new_height as u32, filter),
            (num_tiles_height, num_tiles_width),
        ))
    }

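    /// Pad the image tensor so its height is `num_tiles_h * size["height"]` and its width is
    /// `num_tiles_w * size["width"]`.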
    fn pad(
        &self,
        image: &Tensor,
        size: &HashMap<String, u32>,
        aspect_ratio: (usize, usize),
    ) -> Result<Tensor> {
        let (num_tiles_h, num_tiles_w) = aspect_ratio;
        let padded_height = num_tiles_h * size["height"] as usize;
        let padded_width = num_tiles_w * size["width"] as usize;

        mistralrs_vision::pad(image, padded_height, padded_width)
    }

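    /// Split a `(channels, height, width)` image into a `(num_tiles, channels, tile_height,
    /// tile_width)` tensor, with tiles ordered row-major over the tile grid.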
    fn split_to_tiles(
        &self,
        image: &Tensor,
        num_tiles_height: usize,
        num_tiles_width: usize,
    ) -> Result<Tensor> {
        let (ch, h, w) = image.dims3()?;
        let tile_height = h / num_tiles_height;
        let tile_width = w / num_tiles_width;

        let mut image = image.reshape((
            ch,
            num_tiles_height,
            tile_height,
            num_tiles_width,
            tile_width,
        ))?;

        image = image.permute((1, 3, 0, 2, 4))?;

        image
            .reshape((
                num_tiles_width * num_tiles_height,
                ch,
                tile_height,
                tile_width,
            ))?
            .contiguous()
    }

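    /// Stack the per-image tile tensors into a single zero-padded `(max_num_images,
    /// max_image_tiles, channels, tile_height, tile_width)` tensor, also returning the number of
    /// tiles actually occupied by each image.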
    fn pack_images(
        &self,
        images: Vec<Tensor>,
        max_image_tiles: usize,
        (_bs, max_num_images): (usize, usize),
    ) -> Result<(Tensor, Vec<usize>)> {
        let (_, ch, tile_h, tile_w) = images[0].dims4()?;

        let mut stacked_images = Tensor::zeros(
            (max_num_images, max_image_tiles, ch, tile_h, tile_w),
            images[0].dtype(),
            images[0].device(),
        )?;
        let mut num_sample_tiles = Vec::new();
        for (i, image) in images.into_iter().enumerate() {
            let num_tiles = image.dim(0)?;
            stacked_images = stacked_images.slice_assign(
                &[i..i + 1, 0..num_tiles, 0..ch, 0..tile_h, 0..tile_w],
                &image.unsqueeze(0)?,
            )?;
            num_sample_tiles.push(num_tiles)
        }
        Ok((stacked_images, num_sample_tiles))
    }

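    /// Map each `(num_tiles_h, num_tiles_w)` aspect ratio to its 1-based index in the supported
    /// list; unused image slots keep id 0 (padding).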
    fn convert_aspect_ratios_to_ids(
        &self,
        aspect_ratios: Vec<(usize, usize)>,
        max_image_tiles: usize,
        (_bs, max_num_images): (usize, usize),
        device: &Device,
    ) -> Result<Tensor> {
        let supported_aspect_ratios = Self::get_all_supported_aspect_ratios(max_image_tiles);

        let mut aspect_ratios_ids = vec![0i64; max_num_images];
        for (i, (num_tiles_h, num_tiles_w)) in aspect_ratios.iter().enumerate() {
            aspect_ratios_ids[i] = (supported_aspect_ratios
                .iter()
                .position(|(h, w)| *h == *num_tiles_h && *w == *num_tiles_w)
                .context("Could not find aspect ratio")?
                + 1) as i64;
        }

        Tensor::new(aspect_ratios_ids, device)
    }

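    /// Build a `(max_num_images, max_image_tiles)` mask with ones over the tiles each image
    /// occupies. The first tile is marked for every image slot, including padding slots.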
    fn build_aspect_ratio_mask(
        &self,
        aspect_ratios: Vec<(usize, usize)>,
        max_image_tiles: usize,
        (_bs, max_num_images): (usize, usize),
        device: &Device,
    ) -> Result<Tensor> {
        let mut aspect_ratio_mask =
            Tensor::zeros((max_num_images, max_image_tiles), DType::I64, device)?;

        aspect_ratio_mask = aspect_ratio_mask.slice_assign(
            &[0..max_num_images, 0..1],
            &Tensor::ones((max_num_images, 1), DType::I64, device)?,
        )?;

        for (i, (num_tiles_h, num_tiles_w)) in aspect_ratios.iter().enumerate() {
            aspect_ratio_mask = aspect_ratio_mask.slice_assign(
                &[i..i + 1, 0..*num_tiles_h * *num_tiles_w],
                &Tensor::ones((1, *num_tiles_h * *num_tiles_w), DType::I64, device)?,
            )?;
        }

        Ok(aspect_ratio_mask)
    }
}

impl ImagePreProcessor for MLlamaImageProcessor {
    const DEFAULT_MEAN: [f64; 3] = [0.5, 0.5, 0.5];
    const DEFAULT_STD: [f64; 3] = [0.5, 0.5, 0.5];

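    /// Preprocess the images for MLlama: convert to RGB, resize onto the optimal tiled canvas,
    /// pad, rescale/normalize, split into tiles, and pack the results together with the aspect
    /// ratio ids and masks. `max_image_tiles` is cached on `self` for later mask construction.
    /// Videos are not supported.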
    fn preprocess(
        &self,
        images: Vec<DynamicImage>,
        videos: Vec<Vec<DynamicImage>>,
        config: &PreProcessorConfig,
        device: &Device,
        (bs, max_num_images): (usize, usize),
    ) -> Result<PreprocessedImages> {
        assert!(videos.is_empty());

        let mut sample_images = Vec::new();
        let mut sample_aspect_ratios = Vec::new();
        let max_image_tiles = config
            .max_image_tiles
            .context("`do_resize=false` is not supported, need `max_image_tiles`!")?;
        *self.max_image_tiles.write().unwrap() = Some(max_image_tiles);

        for mut image in images {
            if config.do_convert_rgb.unwrap_or(true) {
                image = DynamicImage::ImageRgb8(image.to_rgb8());
            }

            let size = config
                .size
                .as_ref()
                .context("`do_resize=false` is not supported, need `size`!")?;

            let (image, aspect_ratio) =
                self.resize(image, size, max_image_tiles, config.resampling.to_filter()?)?;

            let to_tensor_rescale = Transforms {
                input: &ToTensorNoNorm,
                inner_transforms: &[],
            };
            let mut image = image.apply(to_tensor_rescale, device)?;

            image = self.pad(&image, size, aspect_ratio)?;

            let transforms = TensorTransforms {
                inner_transforms: &[
                    &config
                        .do_rescale
                        .is_some_and(|x| x)
                        .then_some(())
                        .map(|_| Rescale {
                            factor: config.rescale_factor,
                        }),
                    &config
                        .do_normalize
                        .is_some_and(|x| x)
                        .then_some(())
                        .map(|_| Normalize {
                            mean: config.image_mean.unwrap_or(Self::DEFAULT_MEAN).to_vec(),
                            std: config.image_std.unwrap_or(Self::DEFAULT_STD).to_vec(),
                        }),
                ],
            };
            image = <Tensor as ApplyTensorTransforms>::apply(&image, transforms, device)?;

            let (num_tiles_height, num_tiles_width) = aspect_ratio;
            image = self.split_to_tiles(&image, num_tiles_height, num_tiles_width)?;

            sample_images.push(image);
            sample_aspect_ratios.push((num_tiles_height, num_tiles_width));
        }

        let (images, num_tiles) =
            self.pack_images(sample_images, max_image_tiles, (bs, max_num_images))?;

        let aspect_ratio_ids = self.convert_aspect_ratios_to_ids(
            sample_aspect_ratios.clone(),
            max_image_tiles,
            (bs, max_num_images),
            device,
        )?;
        let aspect_ratio_mask = self.build_aspect_ratio_mask(
            sample_aspect_ratios,
            max_image_tiles,
            (bs, max_num_images),
            device,
        )?;

        Ok(PreprocessedImages {
            pixel_values: images,
            pixel_attention_mask: None,
            image_sizes: None,
            num_img_tokens: None,
            aspect_ratio_ids: Some(aspect_ratio_ids),
            aspect_ratio_mask: Some(aspect_ratio_mask),
            num_tiles: Some(num_tiles),
            image_grid_thw: None,
            video_grid_thw: None,
            rows: None,
            cols: None,
            pixel_values_list: None,
            tgt_sizes: None,
            image_sizes_all: None,
            num_crops: None,
        })
    }
}