Skip to content

Multimodal streaming with combined image and audio inputs

Multimodal streaming with combined image and audio inputs.

Run with: `cargo run --release --example multimodal -p mistralrs`

//! Multimodal streaming with combined image and audio inputs.
//!
//! Run with: `cargo run --release --example multimodal -p mistralrs`
use std::io::Write;
use anyhow::Result;
use mistralrs::{
AudioInput, ChatCompletionChunkResponse, ChunkChoice, Delta, MultimodalMessages,
MultimodalModelBuilder, Response, TextMessageRole,
};
#[tokio::main]
async fn main() -> Result<()> {
let model = MultimodalModelBuilder::new("microsoft/Phi-4-multimodal-instruct")
.with_logging()
.build()
.await?;
let audio_bytes =
reqwest::get("https://upload.wikimedia.org/wikipedia/commons/4/42/Bird_singing.ogg")
.await?
.bytes()
.await?
.to_vec();
let audio = AudioInput::from_bytes(&audio_bytes)?;
let image_bytes =
reqwest::get("https://www.allaboutbirds.org/guide/assets/og/528129121-1200px.jpg")
.await?
.bytes()
.await?
.to_vec();
let image = image::load_from_memory(&image_bytes)?;
let messages = MultimodalMessages::new().add_multimodal_message(
TextMessageRole::User,
"Describe in detail what is happening.",
vec![image],
vec![audio],
vec![],
);
let mut stream = model.stream_chat_request(messages).await?;
while let Some(chunk) = stream.next().await {
if let Response::Chunk(ChatCompletionChunkResponse { choices, .. }) = chunk {
if let Some(ChunkChoice {
delta:
Delta {
content: Some(content),
..
},
..
}) = choices.first()
{
print!("{content}");
std::io::stdout().flush()?;
};
} else {
// Handle errors
}
}
Ok(())
}

Source: mistralrs/examples/models/multimodal/main.rs