Skip to content

Text-to-speech synthesis using a speech model

Text-to-speech synthesis using a speech model.

Run with: `cargo run --release --example speech -p mistralrs`

//! Text-to-speech synthesis using a speech model.
//!
//! Run with: `cargo run --release --example speech -p mistralrs`
use std::time::Instant;

use anyhow::{Context, Result};
use mistralrs::{speech_utils, SpeechLoaderType, SpeechModelBuilder};
#[tokio::main]
async fn main() -> Result<()> {
let model = SpeechModelBuilder::new("nari-labs/Dia-1.6B", SpeechLoaderType::Dia)
.with_logging()
.build()
.await?;
let start = Instant::now();
// let text_to_speak = "[S1] Dia is an open weights text to dialogue model. [S2] You get full control over scripts and voices. [S1] Wow. Amazing. (laughs) [S2] Try it now on Git hub or Hugging Face.";
let text_to_speak = "[S1] mistral r s is a local LLM inference engine. [S2] You can run text and vision models, and also image generation and speech generation. [S1] There is agentic web search, tool calling, and a convenient Python API. [S2] Check it out on github.";
let (pcm, rate, channels) = model.generate_speech(text_to_speak).await?;
let finished = Instant::now();
let mut output = std::fs::File::create("out.wav").unwrap();
speech_utils::write_pcm_as_wav(&mut output, &pcm, rate as u32, channels as u16).unwrap();
println!(
"Done! Took {} s. Audio saved at `out.wav`.",
finished.duration_since(start).as_secs_f32(),
);
Ok(())
}

Source: mistralrs/examples/models/speech/main.rs