I'm trying to trim down the data I'm getting from the Azure speech-to-text model I'm using. Line 21 is where the output format is specified and I've changed it to "simple" but I still get a detailed output. The code I'm using is:
using System;
using System.Threading.Tasks;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
namespace NEST
{
class Program
{
    /// <summary>
    /// Transcribes a WAV file with continuous speech recognition and prints the raw
    /// JSON payload of every recognized phrase to the console.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        var key = "";
        var region = "";
        // Verbatim string literal: the prefix must be '@' ('#' does not compile).
        var audioFilePath = @"C:/Users/MichaelSchwartz/source/repos/AI-102-Process-Speech-master/transcribe_speech_to_text/media/narration.wav";
        var speechConfig = SpeechConfig.FromSubscription(key, region);
        // Generates timestamps
        // NOTE(review): the output captured for this program is detailed JSON even though
        // OutputFormat.Simple is set below; presumably requesting word-level timestamps
        // forces the detailed format. Confirm against the Speech SDK documentation.
        speechConfig.RequestWordLevelTimestamps();
        speechConfig.OutputFormat = OutputFormat.Simple;
        // Completed by the Canceled / SessionStopped handlers to signal the end of the audio.
        var stopRecognition = new TaskCompletionSource<int>(TaskCreationOptions.RunContinuationsAsynchronously);
        // Calls the audio file
        // Both objects wrap native resources, so dispose them deterministically.
        using (var audioConfig = AudioConfig.FromWavFileInput(audioFilePath))
        using (var recognizer = new SpeechRecognizer(speechConfig, audioConfig))
        {
            //Display Recognized
            recognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizedSpeech)
                {
                    Console.WriteLine($"RECOGNIZED :{e.Result.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult)}");
                }
                else if (e.Result.Reason == ResultReason.NoMatch)
                {
                    Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                }
            };
            recognizer.Canceled += (s, e) =>
            {
                Console.WriteLine($"CANCELED: Reason={e.Reason}");
                if (e.Reason == CancellationReason.Error)
                {
                    Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}");
                    Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}");
                    Console.WriteLine($"CANCELED: Did you update the subscription info?");
                }
                stopRecognition.TrySetResult(0);
            };
            recognizer.SessionStopped += (s, e) =>
            {
                Console.WriteLine("\n Session stopped event.");
                stopRecognition.TrySetResult(0);
            };
            recognizer.StartContinuousRecognitionAsync().GetAwaiter().GetResult();
            // Waits for completion. Use Task.WaitAny to keep the task rooted.
            Task.WaitAny(new[] { stopRecognition.Task });
            do
            {
                Console.WriteLine("Press Enter to stop");
            } while (Console.ReadKey().Key != ConsoleKey.Enter);
            // Stops recognition. Block until the stop has actually completed; the original
            // discarded the task, so Main could return while the stop was still in flight.
            recognizer.StopContinuousRecognitionAsync().GetAwaiter().GetResult();
        }
    }
}
}
What units of time are being returned? Offset: 173800000? The model runs for a few seconds, not hours. What is meant by "offset" and "duration"?
Is there a way to timestamp at the message level and not word level? Or at least a way to focus on a subset of word level data that indicates when each utterance begins? I'm transcribing longer utterances and it's hours of audio.
Output is:
RECOGNIZED :{"DisplayText":"The speech Translation API transcribes audio streams into text. Your application can display this text to the user or act upon it as command input. You can use this API either with an SDK client library, or a rest a rest API.","Duration":163400000,"Id":"02d2042cadec4ae9bf324c91949620e0","NBest":[{"Confidence":0.85213876,"Display":"The speech Translation API transcribes audio streams into text. Your application can display this text to the user or act upon it as command input. You can use this API either with an SDK client library, or a rest a rest API.","ITN":"the speech translation API transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this API either with an SDK client library or a rest a rest API","Lexical":"the speech translation API transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this API either with an SDK client library or a rest a rest API","MaskedITN":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this api either with an sdk client library or a rest a rest 
api","Words":[{"Duration":1700000,"Offset":21800000,"Word":"the"},{"Duration":4300000,"Offset":23600000,"Word":"speech"},{"Duration":7300000,"Offset":28000000,"Word":"translation"},{"Duration":5600000,"Offset":35400000,"Word":"API"},{"Duration":6600000,"Offset":41300000,"Word":"transcribes"},{"Duration":3400000,"Offset":48000000,"Word":"audio"},{"Duration":4000000,"Offset":51500000,"Word":"streams"},{"Duration":1900000,"Offset":55600000,"Word":"into"},{"Duration":5900000,"Offset":57600000,"Word":"text"},{"Duration":2300000,"Offset":66200000,"Word":"your"},{"Duration":7100000,"Offset":68600000,"Word":"application"},{"Duration":1500000,"Offset":75800000,"Word":"can"},{"Duration":3700000,"Offset":77400000,"Word":"display"},{"Duration":2100000,"Offset":81200000,"Word":"this"},{"Duration":3200000,"Offset":83400000,"Word":"text"},{"Duration":800000,"Offset":86700000,"Word":"to"},{"Duration":1100000,"Offset":87600000,"Word":"the"},{"Duration":4900000,"Offset":88800000,"Word":"user"},{"Duration":2700000,"Offset":94000000,"Word":"or"},{"Duration":1700000,"Offset":96800000,"Word":"act"},{"Duration":2300000,"Offset":98600000,"Word":"upon"},{"Duration":900000,"Offset":101000000,"Word":"it"},{"Duration":1300000,"Offset":102000000,"Word":"as"},{"Duration":3700000,"Offset":103400000,"Word":"command"},{"Duration":5800000,"Offset":107200000,"Word":"input"},{"Duration":2000000,"Offset":116900000,"Word":"you"},{"Duration":1700000,"Offset":119000000,"Word":"can"},{"Duration":2300000,"Offset":120800000,"Word":"use"},{"Duration":2100000,"Offset":123200000,"Word":"this"},{"Duration":6300000,"Offset":125400000,"Word":"API"},{"Duration":2500000,"Offset":131800000,"Word":"either"},{"Duration":1500000,"Offset":134400000,"Word":"with"},{"Duration":1100000,"Offset":136000000,"Word":"an"},{"Duration":6300000,"Offset":137200000,"Word":"SDK"},{"Duration":4100000,"Offset":143600000,"Word":"client"},{"Duration":7900000,"Offset":147800000,"Word":"library"},{"Duration":6200000,"Offset":158100000,"Word
":"or"},{"Duration":2000000,"Offset":164900000,"Word":"a"},{"Duration":4700000,"Offset":167000000,"Word":"rest"},{"Duration":1700000,"Offset":172000000,"Word":"a"},{"Duration":4300000,"Offset":173800000,"Word":"rest"},{"Duration":7000000,"Offset":178200000,"Word":"API"}]},{"Confidence":0.85089046,"Display":"the speech translation API transcribes audio streams into text your application can display this text to the user or act upon it is command input you can use this API either with an SDK client library or a rest a rest API","ITN":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest api","Lexical":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest api","MaskedITN":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest 
api","Words":[{"Duration":1700000,"Offset":21800000,"Word":"the"},{"Duration":4300000,"Offset":23600000,"Word":"speech"},{"Duration":7300000,"Offset":28000000,"Word":"translation"},{"Duration":5600000,"Offset":35400000,"Word":"API"},{"Duration":6600000,"Offset":41300000,"Word":"transcribes"},{"Duration":3400000,"Offset":48000000,"Word":"audio"},{"Duration":4000000,"Offset":51500000,"Word":"streams"},{"Duration":1900000,"Offset":55600000,"Word":"into"},{"Duration":5900000,"Offset":57600000,"Word":"text"},{"Duration":2300000,"Offset":66200000,"Word":"your"},{"Duration":7100000,"Offset":68600000,"Word":"application"},{"Duration":1500000,"Offset":75800000,"Word":"can"},{"Duration":3700000,"Offset":77400000,"Word":"display"},{"Duration":2100000,"Offset":81200000,"Word":"this"},{"Duration":3200000,"Offset":83400000,"Word":"text"},{"Duration":800000,"Offset":86700000,"Word":"to"},{"Duration":1100000,"Offset":87600000,"Word":"the"},{"Duration":4900000,"Offset":88800000,"Word":"user"},{"Duration":2700000,"Offset":94000000,"Word":"or"},{"Duration":1700000,"Offset":96800000,"Word":"act"},{"Duration":2300000,"Offset":98600000,"Word":"upon"},{"Duration":900000,"Offset":101000000,"Word":"it"},{"Duration":1300000,"Offset":102000000,"Word":"is"},{"Duration":3700000,"Offset":103400000,"Word":"command"},{"Duration":5800000,"Offset":107200000,"Word":"input"},{"Duration":2000000,"Offset":116900000,"Word":"you"},{"Duration":1700000,"Offset":119000000,"Word":"can"},{"Duration":2300000,"Offset":120800000,"Word":"use"},{"Duration":2100000,"Offset":123200000,"Word":"this"},{"Duration":6300000,"Offset":125400000,"Word":"API"},{"Duration":2500000,"Offset":131800000,"Word":"either"},{"Duration":1500000,"Offset":134400000,"Word":"with"},{"Duration":1100000,"Offset":136000000,"Word":"an"},{"Duration":6300000,"Offset":137200000,"Word":"SDK"},{"Duration":4100000,"Offset":143600000,"Word":"client"},{"Duration":7900000,"Offset":147800000,"Word":"library"},{"Duration":6200000,"Offset":158100000,"Word
":"or"},{"Duration":2000000,"Offset":164900000,"Word":"a"},{"Duration":4700000,"Offset":167000000,"Word":"rest"},{"Duration":1700000,"Offset":172000000,"Word":"a"},{"Duration":4300000,"Offset":173800000,"Word":"rest"},{"Duration":7000000,"Offset":178200000,"Word":"API"}]},{"Confidence":0.8548482,"Display":"the speech translation API transcribes audio streams and a text your application can display this text to the user or act upon it as command input you can use this API either with an SDK client library or a rest a rest API","ITN":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it as command input you can use this api either with an sdk client library or a rest a rest api","Lexical":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it as command input you can use this api either with an sdk client library or a rest a rest api","MaskedITN":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it as command input you can use this api either with an sdk client library or a rest a rest 
api","Words":[{"Duration":1700000,"Offset":21800000,"Word":"the"},{"Duration":4300000,"Offset":23600000,"Word":"speech"},{"Duration":7300000,"Offset":28000000,"Word":"translation"},{"Duration":5600000,"Offset":35400000,"Word":"API"},{"Duration":6600000,"Offset":41300000,"Word":"transcribes"},{"Duration":3400000,"Offset":48000000,"Word":"audio"},{"Duration":3900000,"Offset":51500000,"Word":"streams"},{"Duration":1200000,"Offset":55500000,"Word":"and"},{"Duration":700000,"Offset":56800000,"Word":"a"},{"Duration":5900000,"Offset":57600000,"Word":"text"},{"Duration":2300000,"Offset":66200000,"Word":"your"},{"Duration":7100000,"Offset":68600000,"Word":"application"},{"Duration":1500000,"Offset":75800000,"Word":"can"},{"Duration":3700000,"Offset":77400000,"Word":"display"},{"Duration":2100000,"Offset":81200000,"Word":"this"},{"Duration":3200000,"Offset":83400000,"Word":"text"},{"Duration":800000,"Offset":86700000,"Word":"to"},{"Duration":1100000,"Offset":87600000,"Word":"the"},{"Duration":4900000,"Offset":88800000,"Word":"user"},{"Duration":2700000,"Offset":94000000,"Word":"or"},{"Duration":1700000,"Offset":96800000,"Word":"act"},{"Duration":2300000,"Offset":98600000,"Word":"upon"},{"Duration":900000,"Offset":101000000,"Word":"it"},{"Duration":1300000,"Offset":102000000,"Word":"as"},{"Duration":3700000,"Offset":103400000,"Word":"command"},{"Duration":5800000,"Offset":107200000,"Word":"input"},{"Duration":2000000,"Offset":116900000,"Word":"you"},{"Duration":1700000,"Offset":119000000,"Word":"can"},{"Duration":2300000,"Offset":120800000,"Word":"use"},{"Duration":2100000,"Offset":123200000,"Word":"this"},{"Duration":6300000,"Offset":125400000,"Word":"API"},{"Duration":2500000,"Offset":131800000,"Word":"either"},{"Duration":1500000,"Offset":134400000,"Word":"with"},{"Duration":1100000,"Offset":136000000,"Word":"an"},{"Duration":6300000,"Offset":137200000,"Word":"SDK"},{"Duration":4100000,"Offset":143600000,"Word":"client"},{"Duration":7900000,"Offset":147800000,"Word":"librar
y"},{"Duration":6200000,"Offset":158100000,"Word":"or"},{"Duration":2000000,"Offset":164900000,"Word":"a"},{"Duration":4700000,"Offset":167000000,"Word":"rest"},{"Duration":1700000,"Offset":172000000,"Word":"a"},{"Duration":4300000,"Offset":173800000,"Word":"rest"},{"Duration":7000000,"Offset":178200000,"Word":"API"}]},{"Confidence":0.8535998,"Display":"the speech translation API transcribes audio streams and a text your application can display this text to the user or act upon it is command input you can use this API either with an SDK client library or a rest a rest API","ITN":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest api","Lexical":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest api","MaskedITN":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest 
api","Words":[{"Duration":1700000,"Offset":21800000,"Word":"the"},{"Duration":4300000,"Offset":23600000,"Word":"speech"},{"Duration":7300000,"Offset":28000000,"Word":"translation"},{"Duration":5600000,"Offset":35400000,"Word":"API"},{"Duration":6600000,"Offset":41300000,"Word":"transcribes"},{"Duration":3400000,"Offset":48000000,"Word":"audio"},{"Duration":3900000,"Offset":51500000,"Word":"streams"},{"Duration":1200000,"Offset":55500000,"Word":"and"},{"Duration":700000,"Offset":56800000,"Word":"a"},{"Duration":5900000,"Offset":57600000,"Word":"text"},{"Duration":2300000,"Offset":66200000,"Word":"your"},{"Duration":7100000,"Offset":68600000,"Word":"application"},{"Duration":1500000,"Offset":75800000,"Word":"can"},{"Duration":3700000,"Offset":77400000,"Word":"display"},{"Duration":2100000,"Offset":81200000,"Word":"this"},{"Duration":3200000,"Offset":83400000,"Word":"text"},{"Duration":800000,"Offset":86700000,"Word":"to"},{"Duration":1100000,"Offset":87600000,"Word":"the"},{"Duration":4900000,"Offset":88800000,"Word":"user"},{"Duration":2700000,"Offset":94000000,"Word":"or"},{"Duration":1700000,"Offset":96800000,"Word":"act"},{"Duration":2300000,"Offset":98600000,"Word":"upon"},{"Duration":900000,"Offset":101000000,"Word":"it"},{"Duration":1300000,"Offset":102000000,"Word":"is"},{"Duration":3700000,"Offset":103400000,"Word":"command"},{"Duration":5800000,"Offset":107200000,"Word":"input"},{"Duration":2000000,"Offset":116900000,"Word":"you"},{"Duration":1700000,"Offset":119000000,"Word":"can"},{"Duration":2300000,"Offset":120800000,"Word":"use"},{"Duration":2100000,"Offset":123200000,"Word":"this"},{"Duration":6300000,"Offset":125400000,"Word":"API"},{"Duration":2500000,"Offset":131800000,"Word":"either"},{"Duration":1500000,"Offset":134400000,"Word":"with"},{"Duration":1100000,"Offset":136000000,"Word":"an"},{"Duration":6300000,"Offset":137200000,"Word":"SDK"},{"Duration":4100000,"Offset":143600000,"Word":"client"},{"Duration":7900000,"Offset":147800000,"Word":"librar
y"},{"Duration":6200000,"Offset":158100000,"Word":"or"},{"Duration":2000000,"Offset":164900000,"Word":"a"},{"Duration":4700000,"Offset":167000000,"Word":"rest"},{"Duration":1700000,"Offset":172000000,"Word":"a"},{"Duration":4300000,"Offset":173800000,"Word":"rest"},{"Duration":7000000,"Offset":178200000,"Word":"API"}]},{"Confidence":0.8474758,"Display":"the speech translation API transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this API either within SDK client library or a rest a rest API","ITN":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this api either within sdk client library or a rest a rest api","Lexical":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this api either within sdk client library or a rest a rest api","MaskedITN":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this api either within sdk client library or a rest a rest 
api","Words":[{"Duration":1700000,"Offset":21800000,"Word":"the"},{"Duration":4300000,"Offset":23600000,"Word":"speech"},{"Duration":7300000,"Offset":28000000,"Word":"translation"},{"Duration":5600000,"Offset":35400000,"Word":"API"},{"Duration":6600000,"Offset":41300000,"Word":"transcribes"},{"Duration":3400000,"Offset":48000000,"Word":"audio"},{"Duration":4000000,"Offset":51500000,"Word":"streams"},{"Duration":1900000,"Offset":55600000,"Word":"into"},{"Duration":5900000,"Offset":57600000,"Word":"text"},{"Duration":2300000,"Offset":66200000,"Word":"your"},{"Duration":7100000,"Offset":68600000,"Word":"application"},{"Duration":1500000,"Offset":75800000,"Word":"can"},{"Duration":3700000,"Offset":77400000,"Word":"display"},{"Duration":2100000,"Offset":81200000,"Word":"this"},{"Duration":3200000,"Offset":83400000,"Word":"text"},{"Duration":800000,"Offset":86700000,"Word":"to"},{"Duration":1100000,"Offset":87600000,"Word":"the"},{"Duration":4900000,"Offset":88800000,"Word":"user"},{"Duration":2700000,"Offset":94000000,"Word":"or"},{"Duration":1700000,"Offset":96800000,"Word":"act"},{"Duration":2300000,"Offset":98600000,"Word":"upon"},{"Duration":900000,"Offset":101000000,"Word":"it"},{"Duration":1300000,"Offset":102000000,"Word":"as"},{"Duration":3700000,"Offset":103400000,"Word":"command"},{"Duration":5800000,"Offset":107200000,"Word":"input"},{"Duration":2000000,"Offset":116900000,"Word":"you"},{"Duration":1700000,"Offset":119000000,"Word":"can"},{"Duration":2300000,"Offset":120800000,"Word":"use"},{"Duration":2100000,"Offset":123200000,"Word":"this"},{"Duration":6300000,"Offset":125400000,"Word":"API"},{"Duration":2500000,"Offset":131800000,"Word":"either"},{"Duration":2700000,"Offset":134400000,"Word":"within"},{"Duration":6300000,"Offset":137200000,"Word":"SDK"},{"Duration":4100000,"Offset":143600000,"Word":"client"},{"Duration":7900000,"Offset":147800000,"Word":"library"},{"Duration":6200000,"Offset":158100000,"Word":"or"},{"Duration":2000000,"Offset":164900000,"Wo
rd":"a"},{"Duration":4700000,"Offset":167000000,"Word":"rest"},{"Duration":1700000,"Offset":172000000,"Word":"a"},{"Duration":4300000,"Offset":173800000,"Word":"rest"},{"Duration":7000000,"Offset":178200000,"Word":"API"}]}],"Offset":21800000,"RecognitionStatus":"Success"}
CANCELED: Reason=EndOfStream
Press Enter to stop
Session stopped event.
Also, why does the output repeat the results 5 times? I'm looking to trim this down as much as possible for data analysis. Is there a way to change the unit of measurement to something more user friendly, such as seconds?
For Q1:
Why does the output repeat the results 5 times?
You can find the answer in the Speech-to-Text (STT) FAQ, under the question:
I get several results for each phrase with the detailed output format.
Which one should I use?
This is by design: with the detailed output format, the NBest array of the JSON response contains several candidate results, each with its own Confidence score. By default, the system uses the first one as the display result. You can choose whichever result you need, for instance the one with the highest Confidence score.
For Q2:
Is there a way to change the unit of measurement to something more user-friendly, such as seconds?
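Azure does not provide a built-in way to change the unit of the result. Offset and Duration are reported in ticks of 100 nanoseconds, so dividing a value by 10,000,000 gives seconds. I wrote a simple demo based on your code: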
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using System.Linq;
namespace STTwithTime
{
class Program
{
    /// <summary>
    /// Transcribes a WAV file with continuous speech recognition and, for every
    /// recognized phrase, prints only the highest-confidence alternative with its
    /// phrase duration and per-word offsets/durations converted to seconds.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        var key = "";
        var region = "";
        // Verbatim string literal: the prefix must be '@' ('#' does not compile).
        var audioFilePath = @"";
        var speechConfig = SpeechConfig.FromSubscription(key, region);
        // Generates timestamps
        speechConfig.RequestWordLevelTimestamps();
        speechConfig.OutputFormat = OutputFormat.Detailed;
        // Completed by the Canceled / SessionStopped handlers to signal the end of the audio.
        var stopRecognition = new TaskCompletionSource<int>(TaskCreationOptions.RunContinuationsAsynchronously);
        // Both objects wrap native resources, so dispose them deterministically.
        using (var audioConfig = AudioConfig.FromWavFileInput(audioFilePath))
        using (var recognizer = new SpeechRecognizer(speechConfig, audioConfig))
        {
            //Display Recognized
            recognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizedSpeech)
                {
                    PrintBestAlternative(e.Result.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult));
                }
                else if (e.Result.Reason == ResultReason.NoMatch)
                {
                    Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                }
            };
            recognizer.Canceled += (s, e) =>
            {
                Console.WriteLine($"CANCELED: Reason={e.Reason}");
                if (e.Reason == CancellationReason.Error)
                {
                    Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}");
                    Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}");
                    Console.WriteLine($"CANCELED: Did you update the subscription info?");
                }
                stopRecognition.TrySetResult(0);
            };
            recognizer.SessionStopped += (s, e) =>
            {
                Console.WriteLine("\n Session stopped event.");
                stopRecognition.TrySetResult(0);
            };
            recognizer.StartContinuousRecognitionAsync().GetAwaiter().GetResult();
            // Waits for completion. Use Task.WaitAny to keep the task rooted.
            Task.WaitAny(new[] { stopRecognition.Task });
            // Stop explicitly before the recognizer is disposed.
            recognizer.StopContinuousRecognitionAsync().GetAwaiter().GetResult();
        }
    }

    /// <summary>
    /// Parses one detailed-format JSON result and prints its highest-confidence
    /// alternative, with all times shown in seconds.
    /// </summary>
    /// <param name="json">JSON text of a single recognition result.</param>
    private static void PrintBestAlternative(string json)
    {
        var result = JsonConvert.DeserializeObject<Result>(json);
        // Single stable sort instead of Max + Find: ties keep their original order, so the
        // first of several equally confident alternatives still wins, and a missing or
        // empty NBest list yields null instead of throwing.
        var best = result?.NBest?.OrderByDescending(item => item.Confidence).FirstOrDefault();
        if (best == null)
        {
            Console.WriteLine("NOMATCH: Result contained no NBest alternatives.");
            return;
        }

        Console.WriteLine("================================");
        Console.WriteLine("Confidence:" + best.Confidence);
        Console.WriteLine("RECOGNIZED :" + best.Display);
        Console.WriteLine("Duration: " + TicksToSeconds(result.Duration));
        Console.WriteLine("Words:");
        foreach (var word in best.Words ?? Enumerable.Empty<Word>())
        {
            Console.WriteLine(word.word + "=> offset:" + TicksToSeconds(word.Offset) + " duration:" + TicksToSeconds(word.Duration));
        }
    }

    /// <summary>Converts a tick count (the code treats 10,000,000 ticks as one second) to seconds.</summary>
    /// <param name="ticks">Offset or duration in ticks.</param>
    /// <returns>The same span expressed in seconds.</returns>
    private static double TicksToSeconds(long ticks)
    {
        // TimeSpan.TicksPerSecond is 10,000,000, the divisor the original hard-coded.
        return ticks / (double)TimeSpan.TicksPerSecond;
    }
}
/// <summary>
/// One recognized word with its timing, as found in the "Words" array of an
/// NBest entry in the detailed JSON result.
/// </summary>
public class Word
{
    /// <summary>
    /// Length of the word in ticks (10,000,000 ticks per second). Declared as long:
    /// an int overflows once a value passes about 214 seconds' worth of ticks.
    /// </summary>
    public long Duration { get; set; }

    /// <summary>
    /// Start position of the word in ticks (10,000,000 ticks per second). Declared as
    /// long: with an int, any word beyond roughly 214 seconds of audio fails to deserialize.
    /// </summary>
    public long Offset { get; set; }

    /// <summary>
    /// The recognized word. The name stays lower-case because a member may not share
    /// the name of its enclosing type; Json.NET matches the JSON key "Word"
    /// case-insensitively.
    /// </summary>
    public string word { get; set; }
}

/// <summary>One recognition alternative from the "NBest" array of the detailed JSON result.</summary>
public class NBest
{
    /// <summary>Confidence score reported for this alternative.</summary>
    public double Confidence { get; set; }

    /// <summary>Value of the "Display" field.</summary>
    public string Display { get; set; }

    /// <summary>Value of the "ITN" field.</summary>
    public string ITN { get; set; }

    /// <summary>Value of the "Lexical" field.</summary>
    public string Lexical { get; set; }

    /// <summary>Value of the "MaskedITN" field.</summary>
    public string MaskedITN { get; set; }

    /// <summary>Word-level timing entries; empty (never null) when the JSON omits "Words".</summary>
    public List<Word> Words { get; set; } = new List<Word>();
}

/// <summary>Root object of the JSON returned for one recognized phrase.</summary>
public class Result
{
    /// <summary>Value of the "DisplayText" field.</summary>
    public string DisplayText { get; set; }

    /// <summary>Length of the phrase in ticks (10,000,000 ticks per second).</summary>
    public long Duration { get; set; }

    /// <summary>Value of the "Id" field.</summary>
    public string Id { get; set; }

    /// <summary>Recognition alternatives; empty (never null) when the JSON omits "NBest".</summary>
    public List<NBest> NBest { get; set; } = new List<NBest>();

    /// <summary>
    /// Start position of the phrase in ticks (10,000,000 ticks per second). Declared as
    /// long so phrases later than roughly 214 seconds into the audio do not overflow.
    /// </summary>
    public long Offset { get; set; }

    /// <summary>Value of the "RecognitionStatus" field, e.g. "Success".</summary>
    public string RecognitionStatus { get; set; }
}
}
Result:
If you specify a long .wav file, the result will be split into multiple parts.