Azure

Azure Speech to Text of MP3 files

Curia Damiano

Nov 30, 2021 • 2 min read

For a customer, I needed to create a POC that runs the Azure Speech service's Speech-to-Text against the Mozilla Common Voice dataset, which is a collection of MP3 recordings of people speaking in different languages.

Currently Speech-to-Text can't work with MP3 files; the suggested solution is to install GStreamer and invoke it as described in the official documentation.
Instead, I wanted a solution that doesn't require installing any additional software, so I chose NAudio, a popular NuGet package for .NET.

Here is the first version of my code.
The MP3 file is converted to WAV and saved to a temporary file, which is deleted at the end of the processing.

using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using NAudio.Wave;

/// <summary>
/// Transcribes every MP3 file in the current directory with Azure Speech-to-Text.
/// Each MP3 is first converted with NAudio to a temporary WAV file, which is
/// deleted once the file has been processed.
/// </summary>
class Program
{
	// The subscription key is a secret and must not live in source control;
	// it is read from this environment variable at startup.
	private const string SpeechKeyVariable = "SPEECH_KEY";

	// Azure region of the Speech resource.
	private const string SpeechRegion = "westeurope";

	/// <summary>
	/// Entry point: converts and recognizes each "*.mp3" file found in the current directory.
	/// </summary>
	/// <param name="args">Command-line arguments (unused).</param>
	/// <exception cref="InvalidOperationException">The subscription key environment variable is not set.</exception>
	static async Task Main(string[] args)
	{
		Console.WriteLine("Application started.");
		var speechConfig = SpeechConfig.FromSubscription(GetSubscriptionKey(), SpeechRegion);

		foreach (var mp3File in Directory.GetFiles(".", "*.mp3"))
		{
			string tempFileName = Path.GetTempFileName();
			try
			{
				Console.WriteLine($"Converting {mp3File} to {tempFileName} WAV.");
				ConvertMp3ToWav(mp3File, tempFileName);

				Console.WriteLine($"Recognizing the WAV file.");
				using var audioConfig = AudioConfig.FromWavFileInput(tempFileName);
				using var recognizer = new SpeechRecognizer(speechConfig, audioConfig);
				var result = await recognizer.RecognizeOnceAsync();

				// Only a RecognizedSpeech result carries meaningful text; report anything
				// else (no match, cancellation) instead of printing an empty transcript.
				if (result.Reason == ResultReason.RecognizedSpeech)
				{
					Console.WriteLine($"RECOGNIZED: Text={result.Text}");
				}
				else
				{
					Console.WriteLine($"NOT RECOGNIZED: Reason={result.Reason}");
				}
			}
			finally
			{
				// The recognizer and audio config declared in the try block are already
				// disposed here, so the temporary file is no longer in use.
				File.Delete(tempFileName);
			}
		}

		Console.WriteLine("Application finished.");
	}

	/// <summary>Reads the Speech subscription key from the environment.</summary>
	/// <returns>The non-empty subscription key.</returns>
	/// <exception cref="InvalidOperationException">The environment variable is missing or blank.</exception>
	private static string GetSubscriptionKey()
	{
		string? key = Environment.GetEnvironmentVariable(SpeechKeyVariable);
		if (string.IsNullOrWhiteSpace(key))
		{
			throw new InvalidOperationException(
				$"Set the {SpeechKeyVariable} environment variable to your Azure Speech subscription key.");
		}

		return key;
	}

	/// <summary>Decodes an MP3 file to PCM and writes it to a WAV file.</summary>
	/// <param name="mp3File">Path of the MP3 file to read.</param>
	/// <param name="wavFile">Path of the WAV file to create or overwrite.</param>
	private static void ConvertMp3ToWav(string mp3File, string wavFile)
	{
		// Scoped to this method so the MP3 file is closed before recognition starts.
		using Mp3FileReader mp3FileReader = new Mp3FileReader(mp3File);
		using WaveStream pcm = WaveFormatConversionStream.CreatePcmStream(mp3FileReader);
		WaveFileWriter.CreateWaveFile(wavFile, pcm);
	}
}

This is another version, where the conversion is done through in-memory streams.

using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using NAudio.Wave;

// From: https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/how-to-use-audio-input-streams
/// <summary>
/// Pull-stream callback that feeds the Speech SDK with PCM audio decoded
/// on the fly from an MP3 file by NAudio.
/// </summary>
class Mp3ToWavAudioStream : PullAudioInputStreamCallback, IDisposable
{
	private readonly Mp3FileReader mp3FileReader;
	private readonly WaveStream pcm;
	private bool disposed;

	/// <summary>Opens the MP3 file and prepares the PCM stream that <see cref="Read"/> serves.</summary>
	/// <param name="mp3File">Path of the MP3 file to decode.</param>
	public Mp3ToWavAudioStream(string mp3File)
	{
		this.mp3FileReader = new Mp3FileReader(mp3File);
		try
		{
			this.pcm = WaveFormatConversionStream.CreatePcmStream(mp3FileReader);
		}
		catch
		{
			// Do not leak the open file if the conversion stream cannot be created.
			this.mp3FileReader.Dispose();
			throw;
		}
	}

	/// <summary>
	/// Format of the PCM data returned by <see cref="Read"/>. Callers can use it to
	/// describe the stream to the recognizer instead of hard-coding a sample rate.
	/// </summary>
	public WaveFormat WaveFormat => pcm.WaveFormat;

	/// <summary>Copies up to <paramref name="size"/> bytes of PCM audio into <paramref name="buffer"/>.</summary>
	/// <param name="buffer">Destination buffer supplied by the caller.</param>
	/// <param name="size">Maximum number of bytes to copy.</param>
	/// <returns>The number of bytes the PCM stream reported as read.</returns>
	public override int Read(byte[] buffer, uint size)
	{
		return pcm.Read(buffer, 0, (int)size);
	}

	/// <summary>Releases the PCM stream and the underlying MP3 reader.</summary>
	/// <param name="disposing">True when called from <c>Dispose()</c>; false when called from a finalizer.</param>
	protected override void Dispose(bool disposing)
	{
		if (disposing && !disposed)
		{
			disposed = true;

			// Release the wrapper before the source it reads from.
			DisposeQuietly(pcm);

			// NOTE(review): CreatePcmStream appears to hand back the source stream itself
			// when it is already PCM; skip the second Dispose in that case.
			if (!ReferenceEquals(pcm, mp3FileReader))
			{
				DisposeQuietly(mp3FileReader);
			}
		}
		base.Dispose(disposing);
	}

	/// <summary>Disposes a resource without letting a failure escape from Dispose.</summary>
	/// <param name="resource">The resource to release.</param>
	private static void DisposeQuietly(IDisposable resource)
	{
		try
		{
			resource.Dispose();
		}
		catch (Exception ex)
		{
			// Dispose must not throw, but a failure should not vanish silently either.
			Console.Error.WriteLine($"Failed to dispose {resource.GetType().Name}: {ex.Message}");
		}
	}
}

/// <summary>
/// Transcribes every MP3 file in the current directory with Azure Speech-to-Text,
/// streaming the decoded audio to the recognizer without a temporary WAV file.
/// </summary>
class Program
{
	// The subscription key is a secret and must not live in source control;
	// it is read from this environment variable at startup.
	private const string SpeechKeyVariable = "SPEECH_KEY";

	// Azure region of the Speech resource.
	private const string SpeechRegion = "westeurope";

	/// <summary>
	/// Entry point: decodes and recognizes each "*.mp3" file found in the current directory.
	/// </summary>
	/// <param name="args">Command-line arguments (unused).</param>
	/// <exception cref="InvalidOperationException">The subscription key environment variable is not set.</exception>
	static async Task Main(string[] args)
	{
		Console.WriteLine("Application started.");
		var speechConfig = SpeechConfig.FromSubscription(GetSubscriptionKey(), SpeechRegion);

		foreach (var mp3File in Directory.GetFiles(".", "*.mp3"))
		{
			Console.WriteLine($"Converting {mp3File} to WAV.");

			// Describe the stream with the format the decoder actually produces. The previous
			// hard-coded 49152 Hz came from treating 768 kbps as 768*1024 bps; 768000 bps at
			// 16 bits per sample is 48000 Hz, and other files may use a different rate anyway.
			using AudioStreamFormat audioFormat = GetDecodedFormat(mp3File);
			using Mp3ToWavAudioStream audioStream = new Mp3ToWavAudioStream(mp3File);

			Console.WriteLine($"Recognizing the WAV file.");
			using var audioConfig = AudioConfig.FromStreamInput(audioStream, audioFormat);
			using var recognizer = new SpeechRecognizer(speechConfig, audioConfig);
			var result = await recognizer.RecognizeOnceAsync();

			// Only a RecognizedSpeech result carries meaningful text; report anything
			// else (no match, cancellation) instead of printing an empty transcript.
			if (result.Reason == ResultReason.RecognizedSpeech)
			{
				Console.WriteLine($"RECOGNIZED: Text={result.Text}");
			}
			else
			{
				Console.WriteLine($"NOT RECOGNIZED: Reason={result.Reason}");
			}
		}

		Console.WriteLine("Application finished.");
	}

	/// <summary>Reads the Speech subscription key from the environment.</summary>
	/// <returns>The non-empty subscription key.</returns>
	/// <exception cref="InvalidOperationException">The environment variable is missing or blank.</exception>
	private static string GetSubscriptionKey()
	{
		string? key = Environment.GetEnvironmentVariable(SpeechKeyVariable);
		if (string.IsNullOrWhiteSpace(key))
		{
			throw new InvalidOperationException(
				$"Set the {SpeechKeyVariable} environment variable to your Azure Speech subscription key.");
		}

		return key;
	}

	/// <summary>
	/// Builds the Speech SDK stream format from the sample rate, bit depth and channel
	/// count that NAudio reports for the decoded MP3.
	/// </summary>
	/// <param name="mp3File">Path of the MP3 file to inspect.</param>
	/// <returns>A PCM format description; the caller is responsible for disposing it.</returns>
	private static AudioStreamFormat GetDecodedFormat(string mp3File)
	{
		// NOTE(review): assumes Mp3FileReader.WaveFormat is the decoded PCM format,
		// i.e. the same data Mp3ToWavAudioStream serves - confirm against NAudio.
		using Mp3FileReader probe = new Mp3FileReader(mp3File);
		WaveFormat decoded = probe.WaveFormat;

		return AudioStreamFormat.GetWaveFormatPCM(
			(uint)decoded.SampleRate,
			(byte)decoded.BitsPerSample,
			(byte)decoded.Channels);
	}
}

Note that the second version, with in-memory streams, is a bit slower than the version with the temporary file.
Also, the second version hardcodes the audio format (sample rate, bit depth and channels) of the generated PCM stream, while the first version lets the recognizer detect it from the WAV file header.

Sign up for more like this.