mistralrs

# Re-export the public API of the compiled extension module.
from .mistralrs import *

__doc__ = mistralrs.__doc__
if hasattr(mistralrs, "__all__"):
    __all__ = mistralrs.__all__
class Runner:

An object wrapping the underlying Rust system to handle requests and process conversations.

def send_chat_completion_request(self, /, request, model_id=None):

Send an OpenAI API compatible request, returning the result.
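
For example, a minimal sketch (the model ID and sampling values are illustrative, and Which.Plain may also need an explicit arch= on some versions):

from mistralrs import Runner, Which, ChatCompletionRequest

# Illustrative model ID; any supported plain text model works.
runner = Runner(which=Which.Plain(model_id="microsoft/Phi-3.5-mini-instruct"))

res = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="default",  # name echoed back in the response
        messages=[{"role": "user", "content": "Tell me a story about the Rust type system."}],
        max_tokens=256,
        temperature=0.1,
    )
)
print(res.choices[0].message.content)
print(res.usage)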

def send_embedding_request(self, /, request, model_id=None):

Send an embeddings request, returning the embedding vectors in the same order they were provided, with shape [batch size, embedding dim].
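
A hedged sketch, assuming an embedding model loaded via Which.Embedding and that EmbeddingRequest takes the input texts directly (check its constructor for the exact field name):

from mistralrs import Runner, Which, EmbeddingRequest

# Assumption: the model ID and the `input` field name are illustrative.
runner = Runner(which=Which.Embedding(model_id="intfloat/e5-small-v2"))

res = runner.send_embedding_request(
    EmbeddingRequest(input=["first sentence", "second sentence"])
)
first, second = res  # one vector per input, in the order provided
print(len(first))    # embedding dim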

def send_completion_request(self, /, request, model_id=None):

Send an OpenAI API compatible completion request, returning the result.
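
The request mirrors the OpenAI completions API; a brief sketch, reusing the runner constructed in the chat completion example above:

from mistralrs import CompletionRequest

res = runner.send_completion_request(
    CompletionRequest(
        model="default",
        prompt="The Rust borrow checker exists because",
        max_tokens=64,
    )
)
print(res.choices[0].text)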

def generate_image(self, /, prompt, response_format, height=720, width=1280, model_id=None):

Generate an image.
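
A sketch, assuming a diffusion model loaded via Which.DiffusionPlain (the model ID and architecture value are illustrative):

from mistralrs import (
    Runner,
    Which,
    DiffusionArchitecture,
    ImageGenerationResponseFormat,
)

runner = Runner(
    which=Which.DiffusionPlain(
        model_id="black-forest-labs/FLUX.1-schnell",
        arch=DiffusionArchitecture.Flux,
    ),
)

res = runner.generate_image(
    "A majestic mountain range at sunset, 4k, high quality.",
    ImageGenerationResponseFormat.Url,
    height=720,
    width=1280,
)
print(res.data[0].url)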

def generate_audio(self, /, prompt, model_id=None):

Generate audio.
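
A sketch of consuming the SpeechGenerationResponse documented below. It assumes `runner` holds a speech model (Which.Speech) and that pcm is float samples in [-1, 1]; both are assumptions to verify:

import struct
import wave

res = runner.generate_audio("Hello from mistral.rs!")

# Assumption: res.pcm holds float samples in [-1, 1]; scale to 16-bit for WAV.
with wave.open("out.wav", "wb") as f:
    f.setnchannels(res.channels)
    f.setsampwidth(2)
    f.setframerate(res.rate)
    f.writeframes(b"".join(
        struct.pack("<h", max(-32768, min(32767, int(s * 32767)))) for s in res.pcm
    ))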

def send_re_isq(self, /, dtype, model_id=None):

Send a request to re-apply ISQ (in-situ quantization) to the model at the given dtype. If the model was loaded as GGUF or GGML, nothing will happen.
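
For example, with `runner` as above (the dtype string follows the ISQ type names, e.g. "Q4K" or "Q8_0"; treat the exact spelling as an assumption):

# Requantize the currently loaded weights in place.
runner.send_re_isq("Q4K")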

def tokenize_text(self, /, text, add_special_tokens, enable_thinking, model_id=None):

Tokenize some text, returning raw tokens.

def detokenize_text(self, /, tokens, skip_special_tokens, model_id=None):

Detokenize some tokens, returning text.
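
A round-trip sketch using the two methods together (with `runner` as constructed above):

tokens = runner.tokenize_text(
    "Hello, world!", add_special_tokens=True, enable_thinking=False
)
text = runner.detokenize_text(tokens, skip_special_tokens=True)
print(tokens)  # raw token IDs (tokenizer dependent)
print(text)    # "Hello, world!"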

def list_models(self, /):

List all available model IDs in multi-model mode.

def max_sequence_length(self, /, model_id=None):

Return the maximum supported sequence length for the requested model, if available.

def get_default_model_id(self, /):

Get the default model ID in multi-model mode.

def set_default_model_id(self, /, model_id):

Set the default model ID in multi-model mode.

def remove_model(self, /, model_id):

Remove a model by ID in multi-model mode.

def send_chat_completion_request_to_model(self, /, request, model_id):

Send an OpenAI API compatible request to a specific model, returning the result.

def send_completion_request_to_model(self, /, request, model_id):

Send an OpenAI API compatible completion request to a specific model, returning the result.
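
Together these allow routing in multi-model mode; a sketch with placeholder model IDs, where `request` is a ChatCompletionRequest built as above:

print(runner.list_models())           # e.g. ["model-a", "model-b"]
print(runner.get_default_model_id())  # "model-a"

# Route one request explicitly to a specific model...
res = runner.send_chat_completion_request_to_model(request, "model-b")

# ...or change the default so subsequent send_* calls use it.
runner.set_default_model_id("model-b")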

def unload_model(self, /, model_id):

Unload a model from memory while preserving its configuration for later reload. The model can be reloaded automatically when a request is sent to it, or manually using reload_model().

def reload_model(self, /, model_id):

Manually reload a previously unloaded model.

def list_unloaded_models(self, /):

List all unloaded model IDs.

def is_model_loaded(self, /, model_id):

Check if a model is currently loaded (as opposed to unloaded).

def get_model_status(self, /, model_id):

Get the status of a model: "loaded", "unloaded", "reloading", or None if not found.

def list_models_with_status(self, /):

List all models with their status (loaded, unloaded, reloading).
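
A sketch of the unload/reload lifecycle these methods expose (model IDs are placeholders):

runner.unload_model("model-b")               # frees memory, keeps the config
print(runner.list_unloaded_models())         # ["model-b"]
print(runner.get_model_status("model-b"))    # "unloaded"

# Either reload explicitly...
runner.reload_model("model-b")
# ...or simply send a request to it and it reloads automatically.
print(runner.is_model_loaded("model-b"))     # True
print(runner.list_models_with_status())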

class Which:
Plain = <class 'builtins.Which_Plain'>
Embedding = <class 'builtins.Which_Embedding'>
XLora = <class 'builtins.Which_XLora'>
Lora = <class 'builtins.Which_Lora'>
GGUF = <class 'builtins.Which_GGUF'>
XLoraGGUF = <class 'builtins.Which_XLoraGGUF'>
LoraGGUF = <class 'builtins.Which_LoraGGUF'>
GGML = <class 'builtins.Which_GGML'>
XLoraGGML = <class 'builtins.Which_XLoraGGML'>
LoraGGML = <class 'builtins.Which_LoraGGML'>
VisionPlain = <class 'builtins.Which_VisionPlain'>
DiffusionPlain = <class 'builtins.Which_DiffusionPlain'>
Speech = <class 'builtins.Which_Speech'>
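
Two illustrative constructions (repo names, filenames, and architecture values are assumptions; check the per-variant signatures):

from mistralrs import Which, VisionArchitecture

# A quantized GGUF checkpoint, with the tokenizer taken from the base repo.
gguf = Which.GGUF(
    tok_model_id="mistralai/Mistral-7B-Instruct-v0.1",
    quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
)

# A vision model.
vision = Which.VisionPlain(
    model_id="Qwen/Qwen2-VL-2B-Instruct",
    arch=VisionArchitecture.Qwen2VL,
)
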
class ChatCompletionRequest:

An OpenAI API compatible chat completion request.

class CompletionRequest:

An OpenAI API compatible completion request.

class EmbeddingRequest:
class Architecture:
GLM4MoeLite = Architecture.GLM4MoeLite
GraniteMoeHybrid = Architecture.GraniteMoeHybrid
class EmbeddingArchitecture:
class VisionArchitecture:
class DiffusionArchitecture:
class AnyMoeConfig:
class AnyMoeExpertType:
FineTuned = <class 'builtins.AnyMoeExpertType_FineTuned'>
LoraAdapter = <class 'builtins.AnyMoeExpertType_LoraAdapter'>
class ToolChoice:
NoTools = ToolChoice.NoTools
class SpeechGenerationResponse:
pcm
rate
channels
class SpeechLoaderType:
class ResponseMessage:

Chat completion response message.

content
role
tool_calls
reasoning_content

Reasoning/analysis content from Harmony format (separate from final content). This contains chain-of-thought reasoning that is not intended for end users.

class Delta:

Delta in content for streaming response.

tool_calls
content
role
reasoning_content

Reasoning/analysis content delta from Harmony format. This contains incremental chain-of-thought reasoning.
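
Deltas arrive when streaming is requested; a sketch, assuming stream=True makes send_chat_completion_request return an iterator of ChatCompletionChunkResponse (with `runner` as above):

stream = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="default",
        messages=[{"role": "user", "content": "Count to five."}],
        stream=True,
    )
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content is not None:
        print(delta.content, end="", flush=True)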

class ResponseLogprob:

A logprob with the top logprobs for this token.

token
logprob
top_logprobs
bytes
class Logprobs:

Logprobs per token.

content
class Choice:

Chat completion choice.

finish_reason
logprobs
message
index
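
To get these populated, request logprobs in the OpenAI style (the request field names here are assumptions that mirror the OpenAI API) and walk choice -> logprobs -> content:

res = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="default",
        messages=[{"role": "user", "content": "Hi!"}],
        logprobs=True,
        top_logprobs=3,
    )
)
for lp in res.choices[0].logprobs.content:  # ResponseLogprob entries
    print(lp.token, lp.logprob, [t.token for t in lp.top_logprobs])
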
class ChunkChoice:

Chat completion streaming chunk choice.

index
delta
finish_reason
logprobs
class Usage:

OpenAI compatible (superset) usage during a request.

total_completion_time_sec
completion_tokens
avg_tok_per_sec
avg_compl_tok_per_sec
prompt_tokens
total_time_sec
total_prompt_time_sec
total_tokens
avg_prompt_tok_per_sec
class ChatCompletionResponse:

An OpenAI compatible chat completion response.

id
choices
created
model
system_fingerprint
object
usage
class ChatCompletionChunkResponse:

Chat completion streaming response chunk.

created
system_fingerprint
choices
object
usage
model
id
class CompletionChoice:

Completion response choice.

finish_reason
index
logprobs
text
class CompletionResponse:

An OpenAI compatible completion response.

usage
id
model
created
system_fingerprint
choices
object
class TopLogprob:

Top-n logprobs element

bytes
token
logprob
class ModelDType:

DType for the model.

If the model is quantized, this is ignored, so it is reasonable to use the default (Auto).

Note: When using Auto, the fallback pattern is: BF16 -> F16 -> F32
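
A sketch, assuming the Runner constructor exposes a dtype keyword (check your version's signature):

from mistralrs import Runner, Which, ModelDType

# Assumption: Auto picks BF16 where supported, then F16, then F32.
runner = Runner(
    which=Which.Plain(model_id="microsoft/Phi-3.5-mini-instruct"),
    dtype=ModelDType.Auto,
)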

class ImageGenerationResponseFormat:

Image generation response format

class McpServerSourcePy:

MCP server source configuration for different transport types

Http = <class 'builtins.McpServerSourcePy_Http'>
Process = <class 'builtins.McpServerSourcePy_Process'>
WebSocket = <class 'builtins.McpServerSourcePy_WebSocket'>
class McpServerConfigPy:

Configuration for an individual MCP server

source
id
name
enabled
tool_prefix
resources
bearer_token
class McpClientConfigPy:

Configuration for MCP client integration

max_concurrent_calls
auto_register_tools
servers
tool_timeout_secs
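
A sketch tying the three classes together (the URL, token, and exact constructor signatures are assumptions based on the attributes listed above):

from mistralrs import (
    McpClientConfigPy,
    McpServerConfigPy,
    McpServerSourcePy,
)

config = McpClientConfigPy(
    servers=[
        McpServerConfigPy(
            id="hf",
            name="Hugging Face MCP",
            source=McpServerSourcePy.Http(url="https://hf.co/mcp"),
            enabled=True,
            bearer_token="hf_...",
        )
    ],
    auto_register_tools=True,
    tool_timeout_secs=30,
    max_concurrent_calls=4,
)
# Assumption: passed to Runner via an MCP client config keyword at construction.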