// mistralrs_core/vision_models/qwen2_5_vl/config.rs

// https://github.com/huggingface/transformers/blob/f2c388e3f946862f657acc1e21b272ec946fc66c/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py

use mistralrs_quant::QuantizedConfig;

use crate::layers::Activation;
use crate::serde_default_fn;

9serde_default_fn!(Activation, default_vision_hidden_act, Activation::QuickGelu);
10serde_default_fn!(usize, default_in_channels, 3);
11serde_default_fn!(usize, default_depth, 32);
12serde_default_fn!(usize, default_hidden_size, 3584);
13serde_default_fn!(usize, default_out_hidden_size, 3584);
14serde_default_fn!(usize, default_intermediate_size, 3420);
15serde_default_fn!(usize, default_num_heads, 16);
16serde_default_fn!(usize, default_patch_size, 14);
17serde_default_fn!(usize, default_spatial_merge_size, 2);
18serde_default_fn!(usize, default_temporal_patch_size, 2);
19serde_default_fn!(usize, default_window_size, 112);
20serde_default_fn!(
21    Vec<usize>,
22    default_fullatt_block_indexes,
23    vec![7, 15, 23, 31]
24);
25
26#[derive(Debug, Clone, serde::Deserialize)]
27pub struct VisionConfig {
28    #[serde(default = "default_depth")]
29    pub depth: usize,
30    #[serde(default = "default_hidden_size")]
31    pub hidden_size: usize,
32    #[serde(default = "default_out_hidden_size")]
33    pub out_hidden_size: usize,
34    #[serde(default = "default_vision_hidden_act")]
35    pub hidden_act: Activation,
36    #[serde(default = "default_intermediate_size")]
37    pub intermediate_size: usize,
38    #[serde(default = "default_num_heads")]
39    pub num_heads: usize,
40    #[serde(default = "default_in_channels")]
41    pub in_chans: usize,
42    #[serde(default = "default_patch_size")]
43    pub patch_size: usize,
44    #[serde(default = "default_spatial_merge_size")]
45    pub spatial_merge_size: usize,
46    #[serde(default = "default_temporal_patch_size")]
47    pub temporal_patch_size: usize,
48    #[serde(default = "default_window_size")]
49    pub window_size: usize,
50    #[serde(default = "default_fullatt_block_indexes")]
51    pub fullatt_block_indexes: Vec<usize>,
52}
53
54#[derive(Debug, Clone, serde::Deserialize)]
55pub struct MRopeScaling {
56    pub mrope_section: Vec<usize>,
57}
58
59#[derive(Debug, Clone, serde::Deserialize)]
60pub struct Config {
61    pub vocab_size: usize,
62    pub hidden_size: usize,
63    pub intermediate_size: usize,
64    pub num_hidden_layers: usize,
65    pub num_attention_heads: usize,
66    pub num_key_value_heads: usize,
67    pub hidden_act: Activation,
68    pub max_position_embeddings: usize,
69    pub rms_norm_eps: f64,
70    pub tie_word_embeddings: bool,
71    pub rope_theta: f64,
72    pub use_sliding_window: bool,
73    pub sliding_window: Option<usize>,
74    pub vision_config: VisionConfig,
75    pub rope_scaling: MRopeScaling,
76    pub quantization_config: Option<QuantizedConfig>,
77    pub image_token_id: u32,
78    pub video_token_id: u32,
79    // pub vision_start_token_id: usize,
80    // pub max_window_layers: usize,
81}