Skip to content

online_calibration

Runnable Python SDK example: `online_calibration`.

from mistralrs import Runner, Which, ChatCompletionRequest
# Build the runner for a plain model, with in-situ quantization set to Q4K.
runner = Runner(
    which=Which.Plain(
        model_id="google/gemma-4-E4B-it",
    ),
    in_situ_quant="Q4K",
)

# A single fixed prompt, reused both as calibration traffic and for the final
# post-calibration request.
request = ChatCompletionRequest(
    model="default",
    messages=[{"role": "user", "content": "Explain how a hash map works, briefly."}],
    max_tokens=64,
)

# Collect activation statistics while serving normally (~15% decode overhead while on).
runner.begin_calibration()
for _ in range(8):
    runner.send_chat_completion_request(request)

# Report how much was collected.
# NOTE(review): this is assumed to run once, after the loop, rather than on
# every iteration — confirm against the upstream example.
status = runner.calibration_status()
print(
    f"Collecting on {status.layers_tracking}/{status.layers} layers, "
    f"{status.total_rows} token rows seen"
)

# Requantize from the source weights with the traffic-derived importance matrix and
# hot-swap each layer. The optional path also saves the imatrix for reuse.
runner.apply_calibration(save_cimatrix="traffic.cimatrix")

# Send the same prompt once more, now against the recalibrated layers, and
# print the reply text.
res = runner.send_chat_completion_request(request)
print(res.choices[0].message.content)

Source: examples/python/online_calibration.py