// mistralrs/gguf_lora_model.rs
use mistralrs_core::*;
use crate::{best_device, GgufModelBuilder, Model};
/// Builder that layers a LoRA adapter on top of an existing [`GgufModelBuilder`].
///
/// Construct via [`GgufLoraModelBuilder::from_gguf_model_builder`], then call
/// `build()` to load the model and obtain a ready-to-use [`Model`].
pub struct GgufLoraModelBuilder {
    // Base GGUF model configuration this LoRA build derives from.
    gguf_model: GgufModelBuilder,
    // Model id (or path) identifying the LoRA adapter weights.
    lora_model_id: String,
    // Adapter ordering metadata (type comes from `mistralrs_core`).
    ordering: Ordering,
}
impl GgufLoraModelBuilder {
    /// Wrap an existing [`GgufModelBuilder`], attaching a LoRA adapter.
    ///
    /// * `gguf_model` — base GGUF model configuration to build from.
    /// * `lora_model_id` — model id (or path) of the LoRA adapter weights.
    /// * `ordering` — adapter ordering metadata (from `mistralrs_core`).
    pub fn from_gguf_model_builder(
        gguf_model: GgufModelBuilder,
        lora_model_id: impl ToString,
        ordering: Ordering,
    ) -> Self {
        Self {
            gguf_model,
            lora_model_id: lora_model_id.to_string(),
            ordering,
        }
    }

    /// Load the GGUF model with the LoRA adapter applied and return a [`Model`].
    ///
    /// Selects a device, loads the weights from Hugging Face (or local files),
    /// configures the scheduler (PagedAttention if a config was provided,
    /// otherwise the fixed default scheduler), and assembles the runner.
    ///
    /// # Errors
    /// Returns an error if device selection, model loading, or converting
    /// `max_num_seqs` for the default scheduler fails.
    pub async fn build(self) -> anyhow::Result<Model> {
        let config = GGUFSpecificConfig {
            prompt_batchsize: self.gguf_model.prompt_batchsize,
            topology: self.gguf_model.topology,
        };

        if self.gguf_model.with_logging {
            initialize_logging();
        }

        let loader = GGUFLoaderBuilder::new(
            self.gguf_model.chat_template,
            self.gguf_model.tok_model_id,
            self.gguf_model.model_id,
            self.gguf_model.files,
            config,
        )
        .with_lora(self.lora_model_id, self.ordering)
        .build();

        let pipeline = loader.load_model_from_hf(
            self.gguf_model.hf_revision,
            self.gguf_model.token_source,
            &ModelDType::Auto,
            &best_device(self.gguf_model.force_cpu)?,
            !self.gguf_model.with_logging, // silent loading when logging is off
            DeviceMapMetadata::dummy(),
            None,
            self.gguf_model.paged_attn_cfg,
        )?;

        let scheduler_method = match self.gguf_model.paged_attn_cfg {
            Some(_) => {
                // Invariant: the loader populates `cache_config` whenever a
                // PagedAttention config was passed above, so this cannot be
                // `None` here — state the invariant instead of a bare unwrap.
                let config = pipeline
                    .lock()
                    .await
                    .get_metadata()
                    .cache_config
                    .as_ref()
                    .expect("cache_config must be set when paged_attn_cfg is provided")
                    .clone();
                SchedulerConfig::PagedAttentionMeta {
                    max_num_seqs: self.gguf_model.max_num_seqs,
                    config,
                }
            }
            None => SchedulerConfig::DefaultScheduler {
                method: DefaultSchedulerMethod::Fixed(self.gguf_model.max_num_seqs.try_into()?),
            },
        };

        let mut runner = MistralRsBuilder::new(pipeline, scheduler_method)
            .with_no_kv_cache(self.gguf_model.no_kv_cache)
            .with_gemm_full_precision_f16(true)
            // Prefix caching is enabled only when a cache size was configured.
            .with_no_prefix_cache(self.gguf_model.prefix_cache_n.is_none());
        if let Some(n) = self.gguf_model.prefix_cache_n {
            runner = runner.with_prefix_cache_n(n)
        }

        Ok(Model::new(runner.build()))
    }
}