// mistralrs/gguf_xlora_model.rs

use mistralrs_core::*;

use crate::{best_device, GgufModelBuilder, Model};

/// Wrapper around [`GgufModelBuilder`] that adds X-LoRA adapter support to a GGUF base model.
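///
/// A minimal usage sketch. The repo IDs, file name, and `ordering.json` path
/// below are placeholders, and the X-LoRA [`Ordering`] is assumed to
/// deserialize from a JSON ordering file via serde, as in the mistral.rs
/// X-LoRA examples:
///
/// ```no_run
/// use mistralrs::{GgufModelBuilder, GgufXLoraModelBuilder};
///
/// # async fn run() -> anyhow::Result<()> {
/// let model = GgufXLoraModelBuilder::from_gguf_model_builder(
///     // Placeholder GGUF repo and quantized weight file.
///     GgufModelBuilder::new("some-org/base-model-GGUF", vec!["model-q4k.gguf"]),
///     // Placeholder X-LoRA adapter repo.
///     "some-org/xlora-adapter",
///     serde_json::from_str(&std::fs::read_to_string("ordering.json")?)?,
/// )
/// .build()
/// .await?;
/// # Ok(())
/// # }
/// ```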
pub struct GgufXLoraModelBuilder {
    gguf_model: GgufModelBuilder,
    xlora_model_id: String,
    ordering: Ordering,
    tgt_non_granular_index: Option<usize>,
}

impl GgufXLoraModelBuilder {
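    /// Create an X-LoRA builder from a base [`GgufModelBuilder`], the model ID
    /// of the X-LoRA adapter, and the adapter [`Ordering`].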
    pub fn from_gguf_model_builder(
        gguf_model: GgufModelBuilder,
        xlora_model_id: impl ToString,
        ordering: Ordering,
    ) -> Self {
        Self {
            gguf_model,
            xlora_model_id: xlora_model_id.to_string(),
            ordering,
            tgt_non_granular_index: None,
        }
    }

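    /// Set the X-LoRA target non-granular index: scalings are computed per
    /// token only up to this completion-token index, after which the last
    /// computed scalings are reused for the remaining tokens.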
    pub fn tgt_non_granular_index(mut self, tgt_non_granular_idx: usize) -> Self {
        self.tgt_non_granular_index = Some(tgt_non_granular_idx);
        self
    }

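    /// Load the model and construct a ready-to-use [`Model`]: builds the GGUF
    /// loader with X-LoRA attached, loads the weights into a pipeline, picks a
    /// scheduler, and wraps everything in a runner.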
    pub async fn build(self) -> anyhow::Result<Model> {
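        // Carry GGUF-specific settings over from the base builder.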
        let config = GGUFSpecificConfig {
            prompt_batchsize: self.gguf_model.prompt_batchsize,
            topology: self.gguf_model.topology,
        };

        if self.gguf_model.with_logging {
            initialize_logging();
        }

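        // Build a GGUF loader and attach the X-LoRA adapter on top of the base model.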
        let loader = GGUFLoaderBuilder::new(
            self.gguf_model.chat_template,
            self.gguf_model.tok_model_id,
            self.gguf_model.model_id,
            self.gguf_model.files,
            config,
        )
        .with_xlora(
            self.xlora_model_id,
            self.ordering,
            self.gguf_model.no_kv_cache,
            self.tgt_non_granular_index,
        )
        .build();

        // Load the model (from the Hub or local files) into a Pipeline.
        let pipeline = loader.load_model_from_hf(
            self.gguf_model.hf_revision,
            self.gguf_model.token_source,
            &ModelDType::Auto,
            &best_device(self.gguf_model.force_cpu)?,
            !self.gguf_model.with_logging,
            DeviceMapMetadata::dummy(),
            None,
            self.gguf_model.paged_attn_cfg,
        )?;

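        // Pick the scheduler: paged-attention scheduling when a cache config was
        // created at load time, otherwise the default fixed-capacity scheduler.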
        let scheduler_method = match self.gguf_model.paged_attn_cfg {
            Some(_) => {
                let config = pipeline
                    .lock()
                    .await
                    .get_metadata()
                    .cache_config
                    .as_ref()
                    .unwrap()
                    .clone();

                SchedulerConfig::PagedAttentionMeta {
                    max_num_seqs: self.gguf_model.max_num_seqs,
                    config,
                }
            }
            None => SchedulerConfig::DefaultScheduler {
                method: DefaultSchedulerMethod::Fixed(self.gguf_model.max_num_seqs.try_into()?),
            },
        };

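        // Assemble the runner, propagating the KV-cache and prefix-cache settings.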
        let mut runner = MistralRsBuilder::new(pipeline, scheduler_method)
            .with_no_kv_cache(self.gguf_model.no_kv_cache)
            .with_gemm_full_precision_f16(true)
            .with_no_prefix_cache(self.gguf_model.prefix_cache_n.is_none());

        if let Some(n) = self.gguf_model.prefix_cache_n {
            runner = runner.with_prefix_cache_n(n);
        }

        Ok(Model::new(runner.build()))
    }
}