Example demonstrating multi-model usage with mistral.rs server HTTP API

Example demonstrating multi-model usage with mistral.rs server HTTP API.
This example shows how to:
List available models from the server
Send requests to specific models
Compare responses from different models
Handle model selection in the API
Prerequisites:
Start the server in multi-model mode: mistralrs serve -p 1234 —multi-model-config example-multi-model-config.json
"""
Example demonstrating multi-model usage with mistral.rs server HTTP API.

This example shows how to:
1. List available models from the server
2. Send requests to specific models
3. Compare responses from different models
4. Handle model selection in the API

Prerequisites:
- Start the server in multi-model mode:
  mistralrs serve -p 1234 --multi-model-config example-multi-model-config.json
"""

import requests
import json
from typing import List, Dict, Any, Optional

# Server configuration
SERVER_URL = "http://localhost:1234"


def list_models() -> List[Dict[str, Any]]:
    """List all available models from the server."""
    response = requests.get(f"{SERVER_URL}/v1/models")
    response.raise_for_status()
    models_data = response.json()
    return models_data.get("data", [])


def chat_with_model(
    messages: List[Dict[str, str]],
    model_id: Optional[str] = None,
    temperature: float = 0.7,
    max_tokens: int = 1000,
) -> Dict[str, Any]:
    """Send a chat completion request to a specific model."""
    payload = {
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    # Add model_id if specified
    if model_id:
        payload["model"] = model_id

    response = requests.post(
        f"{SERVER_URL}/v1/chat/completions",
        json=payload,
        headers={"Content-Type": "application/json"},
    )
    response.raise_for_status()
    return response.json()


def stream_chat_with_model(
    messages: List[Dict[str, str]],
    model_id: Optional[str] = None,
    temperature: float = 0.7,
):
    """Stream chat completion from a specific model."""
    payload = {
        "messages": messages,
        "temperature": temperature,
        "stream": True,
    }

    if model_id:
        payload["model"] = model_id

    response = requests.post(
        f"{SERVER_URL}/v1/chat/completions",
        json=payload,
        headers={"Content-Type": "application/json"},
        stream=True,
    )
    response.raise_for_status()

    for line in response.iter_lines():
        if line:
            line_str = line.decode("utf-8")
            if line_str.startswith("data: "):
                data_str = line_str[6:]  # Remove "data: " prefix
                if data_str == "[DONE]":
                    break
                try:
                    chunk = json.loads(data_str)
                    if chunk["choices"][0]["delta"].get("content"):
                        yield chunk["choices"][0]["delta"]["content"]
                except json.JSONDecodeError:
                    continue


def compare_models_example():
    """Compare responses from different models to the same prompt."""
    print("=== Comparing Multiple Models ===\n")

    # List available models
    models = list_models()
    print(f"Available models: {len(models)}")
    for model in models:
        print(f"  - {model['id']}")
    print()

    if len(models) < 2:
        print(
            "Need at least 2 models loaded for comparison. Please start the server with multiple models."
        )
        return

    # Test prompt
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "What are the main benefits of using renewable energy?",
        },
    ]

    # Get responses from different models
    print("Getting responses from different models...\n")

    for model in models[:2]:  # Compare first two models
        model_id = model["id"]
        print(f"Model: {model_id}")
        print("-" * 50)

        response = chat_with_model(messages, model_id=model_id, temperature=0.7)
        content = response["choices"][0]["message"]["content"]
        print(content)
        print("\n")


def model_specific_tasks_example():
    """Demonstrate using different models for different types of tasks."""
    print("=== Model-Specific Tasks ===\n")

    models = list_models()
    if not models:
        print("No models available. Please start the server with models loaded.")
        return

    # Example 1: Creative writing with one model
    creative_messages = [
        {"role": "system", "content": "You are a creative writer."},
        {"role": "user", "content": "Write a haiku about artificial intelligence."},
    ]

    model_id = models[0]["id"]
    print(f"Creative task with {model_id}:")
    response = chat_with_model(creative_messages, model_id=model_id, temperature=0.9)
    print(response["choices"][0]["message"]["content"])
    print("\n")

    # Example 2: Technical explanation with another model (or same if only one)
    technical_messages = [
        {"role": "system", "content": "You are a technical expert."},
        {
            "role": "user",
            "content": "Explain how a transformer neural network works in simple terms.",
        },
    ]

    # Use second model if available, otherwise use first
    model_id = models[1]["id"] if len(models) > 1 else models[0]["id"]
    print(f"Technical task with {model_id}:")
    response = chat_with_model(technical_messages, model_id=model_id, temperature=0.3)
    print(response["choices"][0]["message"]["content"])
    print("\n")


def streaming_example():
    """Demonstrate streaming with specific models."""
    print("=== Streaming Example ===\n")

    models = list_models()
    if not models:
        print("No models available.")
        return

    messages = [
        {
            "role": "user",
            "content": "Tell me a short story about a robot learning to paint.",
        }
    ]

    model_id = models[0]["id"]
    print(f"Streaming from {model_id}:\n")

    for chunk in stream_chat_with_model(messages, model_id=model_id):
        print(chunk, end="", flush=True)
    print("\n")


def model_fallback_example():
    """Demonstrate fallback behavior when model is not specified."""
    print("=== Default Model Fallback ===\n")

    models = list_models()
    if not models:
        print("No models available.")
        return

    messages = [{"role": "user", "content": "What is 2 + 2?"}]

    # Request without specifying model (uses default)
    print("Request without model_id (uses default):")
    response = chat_with_model(messages)
    print(f"Response: {response['choices'][0]['message']['content']}")
    print(f"Model used: {response.get('model', 'Not specified')}\n")

    # Request with specific model
    if models:
        model_id = models[0]["id"]
        print(f"Request with model_id='{model_id}':")
        response = chat_with_model(messages, model_id=model_id)
        print(f"Response: {response['choices'][0]['message']['content']}")
        print(f"Model used: {response.get('model', 'Not specified')}")


def interactive_model_selection():
    """Interactive example allowing user to choose models."""
    print("=== Interactive Model Selection ===\n")

    models = list_models()
    if not models:
        print("No models available.")
        return

    print("Available models:")
    for i, model in enumerate(models):
        print(f"{i + 1}. {model['id']}")

    while True:
        print("\nEnter a message (or 'quit' to exit):")
        user_input = input("> ")

        if user_input.lower() == "quit":
            break

        print("\nSelect a model (enter number):")
        try:
            model_idx = int(input("> ")) - 1
            if 0 <= model_idx < len(models):
                model_id = models[model_idx]["id"]

                messages = [{"role": "user", "content": user_input}]
                response = chat_with_model(messages, model_id=model_id)

                print(f"\n{model_id}: {response['choices'][0]['message']['content']}")
            else:
                print("Invalid model selection.")
        except (ValueError, KeyError):
            print("Invalid input.")


if __name__ == "__main__":
    try:
        # Test server connection
        response = requests.get(f"{SERVER_URL}/v1/models", timeout=2)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print("Error: Could not connect to server.")
        print("Please start the server with:")
        print(
            "  mistralrs serve -p 1234 --multi-model-config example-multi-model-config.json"
        )
        exit(1)

    # Run examples
    compare_models_example()
    print("\n" + "=" * 60 + "\n")

    model_specific_tasks_example()
    print("\n" + "=" * 60 + "\n")

    streaming_example()
    print("\n" + "=" * 60 + "\n")

    model_fallback_example()
    print("\n" + "=" * 60 + "\n")

    # Uncomment to run interactive mode
    # interactive_model_selection()
Source: examples/server/multi_model_chat.py