Значения SpeakProgressEventArgs у SpeechSynthesizer неточны?
При использовании класса System.Speech.Synthesis.SpeechSynthesizer в .NET 3.5 свойство AudioPosition объекта SpeakProgressEventArgs оказывается неточным.
Следующий код производит следующий вывод:
Код:
using System;
using System.Speech.Synthesis;
using System.Threading;
namespace SpeechTest
{
    /// <summary>
    /// Console repro: synthesizes a fixed sentence into "Test.wav" and prints the
    /// data of every SpeakProgress event raised while the text is being spoken.
    /// </summary>
    class Program
    {
        // Set by the SpeakCompleted handler; Main blocks on it because SpeakAsync returns immediately.
        static ManualResetEvent speechDoneEvent = new ManualResetEvent(false);

        /// <summary>
        /// Wires up the progress/completion handlers, starts asynchronous synthesis
        /// to a wave file and waits until the synthesizer reports completion.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // SpeechSynthesizer implements IDisposable: dispose it once speaking has
            // completed instead of leaving it (and its output) to process teardown.
            using (SpeechSynthesizer synthesizer = new SpeechSynthesizer())
            {
                synthesizer.SpeakProgress += new EventHandler<SpeakProgressEventArgs>(synthesizer_SpeakProgress);
                synthesizer.SpeakCompleted += new EventHandler<SpeakCompletedEventArgs>(synthesizer_SpeakCompleted);
                synthesizer.SetOutputToWaveFile("Test.wav");
                synthesizer.SpeakAsync("This holiday season, support the music you love by shopping at Made in Washington, online and at one of five local stores. Made in Washington chocolates, bountiful gift baskets and ornaments are the perfect holiday gifts for family, friends and co-workers.");

                // Wait inside the using block so the synthesizer is not disposed mid-speech.
                speechDoneEvent.WaitOne();
            }
        }

        /// <summary>
        /// Releases the main thread once the asynchronous speak operation has finished.
        /// </summary>
        static void synthesizer_SpeakCompleted(object sender, SpeakCompletedEventArgs e)
        {
            speechDoneEvent.Set();
        }

        /// <summary>
        /// Prints the audio position, character position, character count and text
        /// carried by each progress event.
        /// </summary>
        static void synthesizer_SpeakProgress(object sender, SpeakProgressEventArgs e)
        {
            Console.WriteLine("SpeakProgress: AudioPosition=" + e.AudioPosition + ",\tCharacterPosition=" + e.CharacterPosition + ",\tCharacterCount=" + e.CharacterCount + ",\tText=" + e.Text);
        }
    }
}
Вывод:
SpeakProgress: AudioPosition=00:00:00.0043750, CharacterPosition=0, CharacterCount=4, Text=This
SpeakProgress: AudioPosition=00:00:00.2925625, CharacterPosition=5, CharacterCount=7, Text=holiday
SpeakProgress: AudioPosition=00:00:00.9086250, CharacterPosition=13, CharacterCount=6, Text=season
SpeakProgress: AudioPosition=00:00:01.9421250, CharacterPosition=21, CharacterCount=7, Text=support
SpeakProgress: AudioPosition=00:00:02.5621250, CharacterPosition=29, CharacterCount=3, Text=the
SpeakProgress: AudioPosition=00:00:02.6760625, CharacterPosition=33, CharacterCount=5, Text=music
SpeakProgress: AudioPosition=00:00:03.2648125, CharacterPosition=39, CharacterCount=3, Text=you
SpeakProgress: AudioPosition=00:00:03.5199375, CharacterPosition=43, CharacterCount=4, Text=love
SpeakProgress: AudioPosition=00:00:03.8435625, CharacterPosition=48, CharacterCount=2, Text=by
SpeakProgress: AudioPosition=00:00:04.0701875, CharacterPosition=51, CharacterCount=8, Text=shopping
SpeakProgress: AudioPosition=00:00:04.6840625, CharacterPosition=60, CharacterCount=2, Text=at
SpeakProgress: AudioPosition=00:00:04.8036250, CharacterPosition=63, CharacterCount=4, Text=Made
SpeakProgress: AudioPosition=00:00:05.0698125, CharacterPosition=68, CharacterCount=2, Text=in
SpeakProgress: AudioPosition=00:00:05.2521250, CharacterPosition=71, CharacterCount=10, Text=Washington
SpeakProgress: AudioPosition=00:00:06.2961875, CharacterPosition=83, CharacterCount=6, Text=online
SpeakProgress: AudioPosition=00:00:07.0540625, CharacterPosition=90, CharacterCount=3, Text=and
SpeakProgress: AudioPosition=00:00:07.3331250, CharacterPosition=94, CharacterCount=2, Text=at
SpeakProgress: AudioPosition=00:00:07.6818750, CharacterPosition=97, CharacterCount=3, Text=one
SpeakProgress: AudioPosition=00:00:08.0598750, CharacterPosition=101, CharacterCount=2, Text=of
SpeakProgress: AudioPosition=00:00:08.2163750, CharacterPosition=104, CharacterCount=4, Text=five
SpeakProgress: AudioPosition=00:00:08.5971875, CharacterPosition=109, CharacterCount=5, Text=local
SpeakProgress: AudioPosition=00:00:09.0243750, CharacterPosition=115, CharacterCount=6, Text=stores
SpeakProgress: AudioPosition=00:00:10.5325625, CharacterPosition=123, CharacterCount=4, Text=Made
SpeakProgress: AudioPosition=00:00:10.7700625, CharacterPosition=128, CharacterCount=2, Text=in
SpeakProgress: AudioPosition=00:00:10.9377500, CharacterPosition=131, CharacterCount=10, Text=Washington
SpeakProgress: AudioPosition=00:00:11.6708125, CharacterPosition=142, CharacterCount=10, Text=chocolates
SpeakProgress: AudioPosition=00:00:12.9798750, CharacterPosition=154, CharacterCount=9, Text=bountiful
SpeakProgress: AudioPosition=00:00:13.6303125, CharacterPosition=164, CharacterCount=4, Text=gift
SpeakProgress: AudioPosition=00:00:14.0959375, CharacterPosition=169, CharacterCount=7, Text=baskets
SpeakProgress: AudioPosition=00:00:14.7848125, CharacterPosition=177, CharacterCount=3, Text=and
SpeakProgress: AudioPosition=00:00:15.0507500, CharacterPosition=181, CharacterCount=9, Text=ornaments
SpeakProgress: AudioPosition=00:00:15.7195000, CharacterPosition=191, CharacterCount=3, Text=are
SpeakProgress: AudioPosition=00:00:15.9872500, CharacterPosition=195, CharacterCount=3, Text=the
SpeakProgress: AudioPosition=00:00:16.1488750, CharacterPosition=199, CharacterCount=7, Text=perfect
SpeakProgress: AudioPosition=00:00:16.7275000, CharacterPosition=207, CharacterCount=7, Text=holiday
SpeakProgress: AudioPosition=00:00:17.3336875, CharacterPosition=215, CharacterCount=5, Text=gifts
SpeakProgress: AudioPosition=00:00:17.9813125, CharacterPosition=221, CharacterCount=3, Text=for
SpeakProgress: AudioPosition=00:00:18.2216875, CharacterPosition=225, CharacterCount=6, Text=family
SpeakProgress: AudioPosition=00:00:19.0973750, CharacterPosition=233, CharacterCount=7, Text=friends
SpeakProgress: AudioPosition=00:00:19.7726250, CharacterPosition=241, CharacterCount=3, Text=and
SpeakProgress: AudioPosition=00:00:19.9655625, CharacterPosition=245, CharacterCount=10, Text=co-workers
SpeakProgress: AudioPosition=00:00:20.2518750, CharacterPosition=245, CharacterCount=10, Text=co-workers
Однако продолжительность создаваемого WAV-файла составляет 15,69 секунды. То же самое происходит, если направить вывод в поток или в null.
В документации по этому свойству сказано, что это свойство "Объект TimeSpan, представляющий временную позицию события в потоке аудиовывода".
Должно ли это быть точное время, указывающее время начала или окончания произнесения слова в выходном файле, или я неправильно его интерпретирую?
1 ответ
Значение AudioPosition зависит от выбранного голоса синтезатора речи. Для некоторых голосов Microsoft, таких как Anna, Zira, David и Hazel, насколько мне известно, поддерживаемый аудиоформат — PCM 16000 Гц. Поэтому следующее решение может исправить значение AudioPosition:
// Request 16 kHz, 16-bit, mono PCM output explicitly.
// Constructor arguments: encoding, samples per second, bits per sample, channel count,
// average bytes per second (16000 * 2 = 32000), block align (2 bytes per sample frame),
// format-specific data (none).
// EncodingFormat is fully qualified so the snippet compiles without an extra
// "using System.Speech.AudioFormat;" directive.
var format =
    new System.Speech.AudioFormat.SpeechAudioFormatInfo(
        System.Speech.AudioFormat.EncodingFormat.Pcm,
        16000, 16, 1, 32000, 2, null);
synthesizer.SetOutputToWaveFile("Test.wav", format);
Обратите внимание: частота дискретизации по умолчанию для SetOutputToWaveFile равна 22050, а отношение правильного времени (15,69) ко времени, указанному в AudioPosition (20,25), составляет около 0,77. Если умножить это отношение на 22050, получится около 16000, что и является правильной частотой дискретизации.