Online calibration: serve an ISQ model and collect activation statistics from real traffic.

Online calibration: serve an ISQ model, collect activation statistics from real traffic, then requantize from the source weights and hot-swap the layers, all without a restart.

Run with: cargo run --release --example online_calibration -p mistralrs

//! Online calibration: serve an ISQ model, collect activation statistics from real traffic,
//! then requantize from the source weights and hot-swap the layers, all without a restart.
//!
//! Run with: `cargo run --release --example online_calibration -p mistralrs`

use anyhow::Result;
use mistralrs::{IsqBits, ModelBuilder, TextMessageRole, TextMessages};

#[tokio::main]
async fn main() -> Result<()> {
    let model = ModelBuilder::new("google/gemma-4-E4B-it")
        .with_auto_isq(IsqBits::Four)
        .with_logging()
        .build()
        .await?;

    let messages = TextMessages::new().add_message(
        TextMessageRole::User,
        "Explain how a hash map works, briefly.",
    );

    // Collect activation statistics while serving normally (~15% decode overhead while on).
    model.begin_calibration().await?;
    for _ in 0..8 {
        model.send_chat_request(messages.clone()).await?;
    }

    let status = model.calibration_status().await?;
    println!(
        "Collecting on {}/{} layers, {} token rows seen",
        status.layers_tracking, status.layers, status.total_rows
    );

    // Requantize from the source weights with the traffic-derived importance matrix and
    // hot-swap each layer. The optional path also saves the imatrix for reuse via --imatrix.
    model
        .apply_calibration(Some("traffic.cimatrix".into()))
        .await?;

    let response = model.send_chat_request(messages).await?;
    println!("{}", response.choices[0].message.content.as_ref().unwrap());

    Ok(())
}

Source: mistralrs/examples/quantization/online_calibration/main.rs