mistralrs

# Re-export the public API of the compiled extension module.
from .mistralrs import *

__doc__ = mistralrs.__doc__
if hasattr(mistralrs, "__all__"):
    __all__ = mistralrs.__all__
class Runner:

An object wrapping the underlying Rust system to handle requests and process conversations.

def send_chat_completion_request(self, /, request, model_id=None):

Send an OpenAI API compatible request, returning the result.
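
For example, a minimal sketch (the model ID and sampling values are illustrative, and Which.Plain may also need an explicit arch= on some versions):

from mistralrs import Runner, Which, ChatCompletionRequest

# Illustrative model ID; any supported plain text model works.
runner = Runner(which=Which.Plain(model_id="microsoft/Phi-3.5-mini-instruct"))

res = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="default",  # name echoed back in the response
        messages=[{"role": "user", "content": "Tell me a story about the Rust type system."}],
        max_tokens=256,
        temperature=0.1,
    )
)
print(res.choices[0].message.content)
print(res.usage)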

def send_embedding_request(self, /, request, model_id=None):

Send an embeddings request, returning the embedding vectors in the same order they were provided, with shape [batch size, embedding dim].
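
A hedged sketch, assuming an embedding model loaded via Which.Embedding and that EmbeddingRequest takes the input texts directly (check its constructor for the exact field name):

from mistralrs import Runner, Which, EmbeddingRequest

# Assumption: the model ID and the `input` field name are illustrative.
runner = Runner(which=Which.Embedding(model_id="intfloat/e5-small-v2"))

res = runner.send_embedding_request(
    EmbeddingRequest(input=["first sentence", "second sentence"])
)
first, second = res  # one vector per input, in the order provided
print(len(first))    # embedding dim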

def send_completion_request(self, /, request, model_id=None):

Send an OpenAI API compatible completion request, returning the result.
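
The request mirrors the OpenAI completions API; a brief sketch, reusing the runner constructed in the chat completion example above:

from mistralrs import CompletionRequest

res = runner.send_completion_request(
    CompletionRequest(
        model="default",
        prompt="The Rust borrow checker exists because",
        max_tokens=64,
    )
)
print(res.choices[0].text)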

def generate_image(self, /, prompt, response_format, height=720, width=1280, model_id=None):

Generate an image.
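
A sketch, assuming a diffusion model loaded via Which.DiffusionPlain (the model ID and architecture value are illustrative):

from mistralrs import (
    Runner,
    Which,
    DiffusionArchitecture,
    ImageGenerationResponseFormat,
)

runner = Runner(
    which=Which.DiffusionPlain(
        model_id="black-forest-labs/FLUX.1-schnell",
        arch=DiffusionArchitecture.Flux,
    ),
)

res = runner.generate_image(
    "A majestic mountain range at sunset, 4k, high quality.",
    ImageGenerationResponseFormat.Url,
    height=720,
    width=1280,
)
print(res.data[0].url)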

def generate_audio(self, /, prompt, model_id=None):

Generate audio.
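
A sketch of consuming the SpeechGenerationResponse documented below. It assumes `runner` holds a speech model (Which.Speech) and that pcm is float samples in [-1, 1]; both are assumptions to verify:

import struct
import wave

res = runner.generate_audio("Hello from mistral.rs!")

# Assumption: res.pcm holds float samples in [-1, 1]; scale to 16-bit for WAV.
with wave.open("out.wav", "wb") as f:
    f.setnchannels(res.channels)
    f.setsampwidth(2)
    f.setframerate(res.rate)
    f.writeframes(b"".join(
        struct.pack("<h", max(-32768, min(32767, int(s * 32767)))) for s in res.pcm
    ))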

def send_re_isq(self, /, dtype, model_id=None):

Send a request to re-apply ISQ (in-situ quantization) to the model at the given dtype. If the model was loaded as GGUF or GGML, nothing will happen.
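
For example, with `runner` as above (the dtype string follows the ISQ type names, e.g. "Q4K" or "Q8_0"; treat the exact spelling as an assumption):

# Requantize the currently loaded weights in place.
runner.send_re_isq("Q4K")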

def tokenize_text(self, /, text, add_special_tokens, enable_thinking, model_id=None):

Tokenize some text, returning raw tokens.

def detokenize_text(self, /, tokens, skip_special_tokens, model_id=None):

Detokenize some tokens, returning text.
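
A round-trip sketch using the two methods together (with `runner` as constructed above):

tokens = runner.tokenize_text(
    "Hello, world!", add_special_tokens=True, enable_thinking=False
)
text = runner.detokenize_text(tokens, skip_special_tokens=True)
print(tokens)  # raw token IDs (tokenizer dependent)
print(text)    # "Hello, world!"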

def list_models(self, /):

List all available model IDs in multi-model mode.

def max_sequence_length(self, /, model_id=None):

Return the maximum supported sequence length for the requested model, if available.

def get_default_model_id(self, /):

Get the default model ID in multi-model mode.

def set_default_model_id(self, /, model_id):

Set the default model ID in multi-model mode.

def remove_model(self, /, model_id):

Remove a model by ID in multi-model mode.

def send_chat_completion_request_to_model(self, /, request, model_id):

Send an OpenAI API compatible request to a specific model, returning the result.

def send_completion_request_to_model(self, /, request, model_id):

Send an OpenAI API compatible completion request to a specific model, returning the result.
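
Together these allow routing in multi-model mode; a sketch with placeholder model IDs, where `request` is a ChatCompletionRequest built as above:

print(runner.list_models())           # e.g. ["model-a", "model-b"]
print(runner.get_default_model_id())  # "model-a"

# Route one request explicitly to a specific model...
res = runner.send_chat_completion_request_to_model(request, "model-b")

# ...or change the default so subsequent send_* calls use it.
runner.set_default_model_id("model-b")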

def unload_model(self, /, model_id):

Unload a model from memory while preserving its configuration for later reload. The model can be reloaded automatically when a request is sent to it, or manually using reload_model().

def reload_model(self, /, model_id):

Manually reload a previously unloaded model.

def list_unloaded_models(self, /):

List all unloaded model IDs.

def is_model_loaded(self, /, model_id):

Check if a model is currently loaded (as opposed to unloaded).

def get_model_status(self, /, model_id):

Get the status of a model: "loaded", "unloaded", "reloading", or None if not found.

def list_models_with_status(self, /):

List all models with their status (loaded, unloaded, reloading).
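
A sketch of the unload/reload lifecycle these methods expose (model IDs are placeholders):

runner.unload_model("model-b")               # frees memory, keeps the config
print(runner.list_unloaded_models())         # ["model-b"]
print(runner.get_model_status("model-b"))    # "unloaded"

# Either reload explicitly...
runner.reload_model("model-b")
# ...or simply send a request to it and it reloads automatically.
print(runner.is_model_loaded("model-b"))     # True
print(runner.list_models_with_status())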

class Which:
Plain = <class 'builtins.Which_Plain'>
Embedding = <class 'builtins.Which_Embedding'>
XLora = <class 'builtins.Which_XLora'>
Lora = <class 'builtins.Which_Lora'>
GGUF = <class 'builtins.Which_GGUF'>
XLoraGGUF = <class 'builtins.Which_XLoraGGUF'>
LoraGGUF = <class 'builtins.Which_LoraGGUF'>
GGML = <class 'builtins.Which_GGML'>
XLoraGGML = <class 'builtins.Which_XLoraGGML'>
LoraGGML = <class 'builtins.Which_LoraGGML'>
VisionPlain = <class 'builtins.Which_VisionPlain'>
DiffusionPlain = <class 'builtins.Which_DiffusionPlain'>
Speech = <class 'builtins.Which_Speech'>
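
Two illustrative constructions (repo names, filenames, and architecture values are assumptions; check the per-variant signatures):

from mistralrs import Which, VisionArchitecture

# A quantized GGUF checkpoint, with the tokenizer taken from the base repo.
gguf = Which.GGUF(
    tok_model_id="mistralai/Mistral-7B-Instruct-v0.1",
    quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
)

# A vision model.
vision = Which.VisionPlain(
    model_id="Qwen/Qwen2-VL-2B-Instruct",
    arch=VisionArchitecture.Qwen2VL,
)
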
class ChatCompletionRequest:

An OpenAI API compatible chat completion request.

class CompletionRequest:

An OpenAI API compatible completion request.

class EmbeddingRequest:
class Architecture:
GLM4MoeLite = Architecture.GLM4MoeLite
GraniteMoeHybrid = Architecture.GraniteMoeHybrid
class EmbeddingArchitecture:
class VisionArchitecture:
class DiffusionArchitecture:
class AnyMoeConfig:
class AnyMoeExpertType:
FineTuned = <class 'builtins.AnyMoeExpertType_FineTuned'>
LoraAdapter = <class 'builtins.AnyMoeExpertType_LoraAdapter'>
class ToolChoice:
NoTools = ToolChoice.NoTools
class SpeechGenerationResponse:
pcm
rate
channels
class SpeechLoaderType:
class ResponseMessage:

Chat completion response message.

content
role
tool_calls
reasoning_content

Reasoning/analysis content from Harmony format (separate from final content). This contains chain-of-thought reasoning that is not intended for end users.

class Delta:

Delta in content for streaming response.

tool_calls
content
role
reasoning_content

Reasoning/analysis content delta from Harmony format. This contains incremental chain-of-thought reasoning.
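
Deltas arrive when streaming is requested; a sketch, assuming stream=True makes send_chat_completion_request return an iterator of ChatCompletionChunkResponse (with `runner` as above):

stream = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="default",
        messages=[{"role": "user", "content": "Count to five."}],
        stream=True,
    )
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content is not None:
        print(delta.content, end="", flush=True)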

class ResponseLogprob:

A logprob with the top logprobs for this token.

token
logprob
top_logprobs
bytes
class Logprobs:

Logprobs per token.

content
class Choice:

Chat completion choice.

finish_reason
logprobs
message
index
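
To get these populated, request logprobs in the OpenAI style (the request field names here are assumptions that mirror the OpenAI API) and walk choice -> logprobs -> content:

res = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="default",
        messages=[{"role": "user", "content": "Hi!"}],
        logprobs=True,
        top_logprobs=3,
    )
)
for lp in res.choices[0].logprobs.content:  # ResponseLogprob entries
    print(lp.token, lp.logprob, [t.token for t in lp.top_logprobs])
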
class ChunkChoice:

Chat completion streaming chunk choice.

index
delta
finish_reason
logprobs
class Usage:

OpenAI compatible (superset) usage during a request.

total_completion_time_sec
completion_tokens
avg_tok_per_sec
avg_compl_tok_per_sec
prompt_tokens
total_time_sec
total_prompt_time_sec
total_tokens
avg_prompt_tok_per_sec
class ChatCompletionResponse:

An OpenAI compatible chat completion response.

id
choices
created
model
system_fingerprint
object
usage
class ChatCompletionChunkResponse:

Chat completion streaming response chunk.

created
system_fingerprint
choices
object
usage
model
id
class CompletionChoice:

Completion response choice.

finish_reason
index
logprobs
text
class CompletionResponse:

An OpenAI compatible completion response.

usage
id
model
created
system_fingerprint
choices
object
class TopLogprob:

Top-n logprobs element

bytes
token
logprob
class ModelDType:

DType for the model.

If the model is quantized, this is ignored, so it is reasonable to use the default (Auto).

Note: When using Auto, the fallback pattern is: BF16 -> F16 -> F32
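
A sketch, assuming the Runner constructor exposes a dtype keyword (check your version's signature):

from mistralrs import Runner, Which, ModelDType

# Assumption: Auto picks BF16 where supported, then F16, then F32.
runner = Runner(
    which=Which.Plain(model_id="microsoft/Phi-3.5-mini-instruct"),
    dtype=ModelDType.Auto,
)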

class ImageGenerationResponseFormat:

Image generation response format

class McpServerSourcePy:

MCP server source configuration for different transport types

Http = <class 'builtins.McpServerSourcePy_Http'>
Process = <class 'builtins.McpServerSourcePy_Process'>
WebSocket = <class 'builtins.McpServerSourcePy_WebSocket'>
class McpServerConfigPy:

Configuration for an individual MCP server

source
id
name
enabled
tool_prefix
resources
bearer_token
class McpClientConfigPy:

Configuration for MCP client integration

max_concurrent_calls
auto_register_tools
servers
tool_timeout_secs
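
A sketch tying the three classes together (the URL, token, and exact constructor signatures are assumptions based on the attributes listed above):

from mistralrs import (
    McpClientConfigPy,
    McpServerConfigPy,
    McpServerSourcePy,
)

config = McpClientConfigPy(
    servers=[
        McpServerConfigPy(
            id="hf",
            name="Hugging Face MCP",
            source=McpServerSourcePy.Http(url="https://hf.co/mcp"),
            enabled=True,
            bearer_token="hf_...",
        )
    ],
    auto_register_tools=True,
    tool_timeout_secs=30,
    max_concurrent_calls=4,
)
# Assumption: passed to Runner via an MCP client config keyword at construction.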