Утечка памяти в UWP SpeechSynthesizer/MediaPlayer
У нас есть UWP-приложение, которое использует голоса Microsoft, чтобы озвучивать текст и выделять его по мере произнесения. Я заметил, что потребление памяти приложением растёт с каждым произнесённым фрагментом текста, и в конечном итоге память заканчивается. Неважно, какой голос используется и какой текст произносится.
Чтобы выделять текст, я подписываюсь на события в TimedMetadataTracks объекта MediaPlaybackItem. Когда воспроизведение текста заканчивается, я отписываюсь от каждого события и вызываю Dispose у MediaPlaybackItem.Source. Профилировщик памяти Visual Studio не показывает утечек в управляемой памяти, поэтому я подозреваю, что что-то не освобождается в неуправляемой памяти.
Дополнение: я отметил это комментарием в коде, но упомяну и здесь — если не подписываться на события TimedMetadataTrack, утечка исчезает. Я также могу воспроизвести это с помощью примера приложения Windows (синтез текста с границами).
Я упускаю что-то, что нужно освободить, или это ошибка в SpeechSynthesizer/MediaPlayer?
using System;
using System.Diagnostics;
using Windows.Media.Core;
using Windows.Media.Playback;
using Windows.Media.SpeechSynthesis;
namespace WindowsTts
{
/// <summary>
/// Speaks text with a UWP <see cref="SpeechSynthesizer"/> voice through a <see cref="MediaPlayer"/>
/// and reports start, word-boundary and end events to the caller's <see cref="SpeechDelegate"/>.
/// </summary>
public class UwpNativeVoice : IDisposable
{
    // Guards _activeSpeech and _activeStream, which are touched from Speak/Halt and from player events.
    private readonly object _activeSpeechLock;
    private SpeechSynthesizer _synthesizer;
    private MediaPlayer _mediaPlayer;
    private SpeechCallback _activeSpeech;

    // Synthesized audio backing the current playback item. This class created it, so this class
    // disposes it once the item that plays it has been torn down.
    private SpeechSynthesisStream _activeStream;

    /// <summary>Creates a voice that synthesizes with the given platform voice.</summary>
    /// <param name="platformInfo">The installed voice to assign to the synthesizer.</param>
    public UwpNativeVoice(VoiceInformation platformInfo)
    {
        _activeSpeechLock = new object();
        _synthesizer = new SpeechSynthesizer();
        // Word-boundary metadata is what produces the "SpeechWord" timed metadata track used below.
        _synthesizer.Options.IncludeWordBoundaryMetadata = true;
        _synthesizer.Voice = platformInfo;
        _mediaPlayer = new MediaPlayer
        {
            RealTimePlayback = true,
            AutoPlay = false,
            Volume = 1.0f
        };
        _mediaPlayer.MediaOpened += OnMediaPlayerMediaOpened;
        _mediaPlayer.MediaEnded += OnMediaPlayerMediaEnded;
    }

    /// <summary>
    /// Releases the player, the synthesizer and any playback item / stream still attached.
    /// Safe to call more than once. No End event is raised for speech that is still active.
    /// </summary>
    public void Dispose()
    {
        MediaPlayer player;
        SpeechSynthesizer synthesizer;
        SpeechSynthesisStream stream;
        lock (_activeSpeechLock)
        {
            // Detach the fields first so a repeated Dispose (or a late Halt) finds nothing to do.
            player = _mediaPlayer;
            _mediaPlayer = null;
            synthesizer = _synthesizer;
            _synthesizer = null;
            stream = _activeStream;
            _activeStream = null;
            _activeSpeech = null;
        }

        if (player != null)
        {
            player.MediaOpened -= OnMediaPlayerMediaOpened;
            player.MediaEnded -= OnMediaPlayerMediaEnded;
            // Same teardown as Halt/MediaEnded: unsubscribe the cue handlers, then dispose the source.
            DestroyMediaPlaybackItem(player.Source as MediaPlaybackItem);
            player.Source = null;
            player.Dispose();
        }

        stream?.Dispose();
        synthesizer?.Dispose();
    }

    /// <summary>
    /// Synthesizes <paramref name="text"/> and starts playing it, halting any speech already active.
    /// </summary>
    /// <param name="text">Text to speak. Null or empty text only raises Start and End.</param>
    /// <param name="speechDelegate">Receives progress events for this utterance; may be null.</param>
    /// <remarks>
    /// The method is fire-and-forget (<c>async void</c>), so no exception can reach the caller.
    /// Any failure is logged with <see cref="Debug"/> and reported as a single End event.
    /// </remarks>
    public async void Speak(string text, SpeechDelegate speechDelegate)
    {
        if (string.IsNullOrEmpty(text))
        {
            // no-op; just fire events and bail
            speechDelegate?.Invoke(text, ReadTextEvent.Start);
            speechDelegate?.Invoke(text, ReadTextEvent.End);
            return;
        }

        if (_activeSpeech != null)
        {
            // something currently speaking; halt it, fire events and then start anew
            Halt();
        }

        var synthesizer = _synthesizer;
        if (synthesizer == null || _mediaPlayer == null)
        {
            // Already disposed: nothing can be played, so just close out the request.
            Debug.WriteLine("UwpNativeVoice.Speak called after Dispose.");
            speechDelegate?.Invoke(text, ReadTextEvent.End);
            return;
        }

        SpeechSynthesisStream synthStream = null;
        MediaSource source = null;
        MediaPlaybackItem playbackItem = null;
        SpeechCallback callback = null;
        MediaPlayer attachedPlayer = null;
        try
        {
            // The await sits inside the try so a synthesis failure is handled below
            // instead of escaping from an async void method.
            synthStream = await synthesizer.SynthesizeTextToStreamAsync(text);
            lock (_activeSpeechLock)
            {
                var player = _mediaPlayer;
                if (player == null)
                {
                    throw new ObjectDisposedException(nameof(UwpNativeVoice));
                }

                callback = new SpeechCallback(text, speechDelegate);
                _activeSpeech = callback;
                _activeStream = synthStream;

                source = MediaSource.CreateFromStream(synthStream, synthStream.ContentType);
                playbackItem = new MediaPlaybackItem(source);
                // NOTE: per the original author's observation, leaving this subscription out
                // makes the reported memory growth disappear.
                ConfigPlaybackEvents(playbackItem);
                player.Source = playbackItem;
                attachedPlayer = player;
                player.Play();
            }
        }
        catch (Exception e)
        {
            Debug.WriteLine(e);
            lock (_activeSpeechLock)
            {
                // Only clear state this call installed; another Speak may have replaced it meanwhile.
                if (callback != null && ReferenceEquals(_activeSpeech, callback))
                {
                    _activeSpeech = null;
                }

                if (synthStream != null && ReferenceEquals(_activeStream, synthStream))
                {
                    _activeStream = null;
                }
            }

            if (attachedPlayer != null)
            {
                attachedPlayer.Source = null;
            }

            if (playbackItem != null)
            {
                DestroyMediaPlaybackItem(playbackItem);
            }
            else
            {
                source?.Dispose();
            }

            synthStream?.Dispose();
            speechDelegate?.Invoke(text, ReadTextEvent.End);
        }
    }

    /// <summary>Stops the active utterance, if any, and raises its End event.</summary>
    /// <returns>Always <c>true</c>.</returns>
    public bool Halt()
    {
        lock (_activeSpeechLock)
        {
            if (_activeSpeech == null)
            {
                return true;
            }
        }

        var player = _mediaPlayer;
        if (player != null)
        {
            player.Pause();
            DestroyMediaPlaybackItem(player.Source as MediaPlaybackItem);
            player.Source = null;
        }

        SpeechCallback callback;
        SpeechSynthesisStream stream;
        lock (_activeSpeechLock)
        {
            callback = _activeSpeech;
            _activeSpeech = null;
            stream = _activeStream;
            _activeStream = null;
        }

        stream?.Dispose();
        // Invoked outside the lock so the delegate cannot block other callers of this class.
        callback?.Invoke(ReadTextEvent.End);
        return true;
    }

    /// <summary>Raises Start once the player has opened the media.</summary>
    private void OnMediaPlayerMediaOpened(MediaPlayer sender, object args)
    {
        FireReadTextEvent(ReadTextEvent.Start);
    }

    /// <summary>Turns a speech cue into a Word event carrying the cue's offset and length in the input text.</summary>
    private void OnTimedMetadataTrackEntered(TimedMetadataTrack track, MediaCueEventArgs args)
    {
        if (track.TimedMetadataKind == TimedMetadataKind.Speech && args.Cue is SpeechCue speechCue)
        {
            var startIdx = speechCue.StartPositionInInput ?? 0;
            var endIdx = speechCue.EndPositionInInput ?? -1;
            // The end position is treated as inclusive, hence the +1. Clamp so that a missing
            // end position cannot produce a negative length.
            var length = Math.Max(0, (endIdx - startIdx) + 1);
            FireReadTextEvent(ReadTextEvent.WordEvent(startIdx, length));
        }
    }

    /// <summary>Raises End for the finished utterance and tears down its playback item and stream.</summary>
    private void OnMediaPlayerMediaEnded(MediaPlayer sender, object args)
    {
        SpeechCallback callback;
        SpeechSynthesisStream stream;
        lock (_activeSpeechLock)
        {
            callback = _activeSpeech;
            _activeSpeech = null;
            stream = _activeStream;
            _activeStream = null;
        }

        callback?.Invoke(ReadTextEvent.End);
        DestroyMediaPlaybackItem(sender.Source as MediaPlaybackItem);
        sender.Source = null;
        stream?.Dispose();
    }

    /// <summary>Sends <paramref name="evt"/> to the currently active callback, if there is one.</summary>
    private void FireReadTextEvent(ReadTextEvent evt)
    {
        SpeechCallback callback;
        lock (_activeSpeechLock)
        {
            callback = _activeSpeech;
        }

        callback?.Invoke(evt);
    }

    /// <summary>Registers the cue handler on every word-boundary track the item already exposes.</summary>
    private void ConfigPlaybackEvents(MediaPlaybackItem playbackItem)
    {
        // see: https://docs.microsoft.com/en-us/uwp/api/windows.media.core.timedmetadatatrack
        // iterate through existing tracks, registering callbacks for them
        for (int i = 0; i < playbackItem.TimedMetadataTracks.Count; i++)
        {
            RegisterAction(playbackItem, i);
        }
    }

    /// <summary>
    /// Subscribes to <c>CueEntered</c> on the track at <paramref name="idx"/> when its id or label
    /// is "SpeechWord", and switches that track to application-presented mode.
    /// </summary>
    private void RegisterAction(MediaPlaybackItem item, int idx)
    {
        const string speechWordIdentifier = "SpeechWord";
        TimedMetadataTrack track = item.TimedMetadataTracks[idx];
        if (track.Id.Equals(speechWordIdentifier, StringComparison.Ordinal) || track.Label.Equals(speechWordIdentifier, StringComparison.Ordinal))
        {
            track.CueEntered += OnTimedMetadataTrackEntered;
            item.TimedMetadataTracks.SetPresentationMode((uint)idx, TimedMetadataTrackPresentationMode.ApplicationPresented);
        }
    }

    /// <summary>Unsubscribes the cue handler from every track of the item and disposes its source.</summary>
    private void DestroyMediaPlaybackItem(MediaPlaybackItem item)
    {
        if (item == null)
        {
            return;
        }

        foreach (var track in item.TimedMetadataTracks)
        {
            track.CueEntered -= OnTimedMetadataTrackEntered;
        }

        item.Source?.Dispose();
    }
}
}
namespace WindowsTts
{
/// <summary>Identifies what caused a ReadTextEvent to be broadcast.</summary>
public enum ReadTextTrigger
{
    /// <summary>First member; numeric value 0.</summary>
    Start = 0,

    /// <summary>Second member; numeric value 1.</summary>
    Bookmark = 1,

    /// <summary>Third member; numeric value 2.</summary>
    Word = 2,

    /// <summary>Fourth member; numeric value 3.</summary>
    End = 3
}
/// <summary>
/// Carries the relevant information from the tts world to the api user: the trigger that fired
/// plus the bookmark name or text range it refers to. Members that do not apply to a trigger
/// hold null (bookmark name) or -1 (offset and length).
/// </summary>
public class ReadTextEvent
{
    // Instances are only produced by the static members below.
    private ReadTextEvent()
    {
    }

    /// <summary>What caused this event.</summary>
    public ReadTextTrigger Trigger { get; set; }

    /// <summary>Bookmark name for Bookmark events; null otherwise.</summary>
    public string BookmarkName { get; set; }

    /// <summary>Offset into the spoken text for Word events; -1 otherwise.</summary>
    public int TextOffset { get; set; }

    /// <summary>Length of the text range for Word events; -1 otherwise.</summary>
    public int TextLength { get; set; }

    /// <summary>Shared instance describing the start of speech.</summary>
    public static ReadTextEvent Start { get; } = Create(ReadTextTrigger.Start, null, -1, -1);

    /// <summary>Shared instance describing the end of speech.</summary>
    public static ReadTextEvent End { get; } = Create(ReadTextTrigger.End, null, -1, -1);

    /// <summary>Returns a new event with every property copied from <paramref name="src"/>.</summary>
    public static ReadTextEvent Factory(ReadTextEvent src)
        => Create(src.Trigger, src.BookmarkName, src.TextOffset, src.TextLength);

    /// <summary>Returns a new Bookmark event named <paramref name="bookmark"/>.</summary>
    public static ReadTextEvent BookmarkEvent(string bookmark)
        => Create(ReadTextTrigger.Bookmark, bookmark, -1, -1);

    /// <summary>Returns a new Word event for the given text range.</summary>
    public static ReadTextEvent WordEvent(int textOffset, int textLength)
        => Create(ReadTextTrigger.Word, null, textOffset, textLength);

    // Single place where instances are populated; every public factory funnels through here.
    private static ReadTextEvent Create(ReadTextTrigger trigger, string bookmarkName, int textOffset, int textLength)
    {
        var created = new ReadTextEvent();
        created.Trigger = trigger;
        created.BookmarkName = bookmarkName;
        created.TextOffset = textOffset;
        created.TextLength = textLength;
        return created;
    }
}
/// <summary>
/// A SpeechDelegate is passed to a voice's Speak() method (for example UwpNativeVoice.Speak), so that the caller may receive progress info as the text is being spoken.
/// </summary>
/// <param name="speechText">The text the notification refers to; SpeechCallback passes the text it was constructed with.</param>
/// <param name="readTextEvent">The progress event being reported.</param>
public delegate void SpeechDelegate(string speechText, ReadTextEvent readTextEvent);
/// <summary>
/// Pairs a piece of text with the SpeechDelegate that wants to hear about it.
/// One instance may be created per string queued for speaking and then invoked
/// repeatedly while that string is spoken, each time with a different ReadTextEvent.
/// </summary>
public class SpeechCallback
{
    private readonly SpeechDelegate _speechDelegate;

    /// <summary>Captures the text and the delegate to notify.</summary>
    /// <param name="text">Text handed to the delegate on every invocation.</param>
    /// <param name="speechDelegate">Delegate to notify; when null, Invoke does nothing.</param>
    public SpeechCallback(string text, SpeechDelegate speechDelegate)
    {
        _speechDelegate = speechDelegate;
        Text = text;
    }

    /// <summary>The text this callback was created for.</summary>
    public string Text { get; }

    /// <summary>Forwards <paramref name="readTextEvent"/> together with <see cref="Text"/> to the delegate, if one was supplied.</summary>
    public void Invoke(ReadTextEvent readTextEvent)
    {
        var handler = _speechDelegate;
        if (handler == null)
        {
            return;
        }

        handler(Text, readTextEvent);
    }
}
}
// NOTE(review): this namespace block repeats, declaration for declaration, the WindowsTts
// namespace block that directly precedes it (ReadTextTrigger, ReadTextEvent, SpeechDelegate,
// SpeechCallback). Compiling both copies into one assembly would declare each type twice;
// keep only one copy when splitting this listing into real source files.
namespace WindowsTts
{
/// <summary>Defines a trigger that caused the broadcasting of a ReadTextEvent.</summary>
public enum ReadTextTrigger
{
Start,
Bookmark,
Word,
End,
}
/// <summary>A ReadTextEvent encompasses the relevant information from the tts world and is passed to the api user as part of a ReadTextInfo's EventAction data. </summary>
public class ReadTextEvent
{
// Shared instance for the start of speech; bookmark/offset/length are not applicable (null / -1).
public static ReadTextEvent Start { get; } = new ReadTextEvent()
{
Trigger = ReadTextTrigger.Start,
BookmarkName = null,
TextOffset = -1,
TextLength = -1,
};
// Shared instance for the end of speech; bookmark/offset/length are not applicable (null / -1).
public static ReadTextEvent End { get; } = new ReadTextEvent()
{
Trigger = ReadTextTrigger.End,
BookmarkName = null,
TextOffset = -1,
TextLength = -1,
};
public ReadTextTrigger Trigger { get; set; }
public string BookmarkName { get; set; }
public int TextOffset { get; set; }
public int TextLength { get; set; }
/// <summary>Utility methods to pre-initialize some fields of this object.</summary>
public static ReadTextEvent Factory(ReadTextEvent src)
{
// Copies every property of src into a new instance.
return new ReadTextEvent()
{
Trigger = src.Trigger,
BookmarkName = src.BookmarkName,
TextOffset = src.TextOffset,
TextLength = src.TextLength,
};
}
/// <summary>Creates a Bookmark event carrying the given bookmark name; offset and length are -1.</summary>
public static ReadTextEvent BookmarkEvent(string bookmark)
{
return new ReadTextEvent()
{
Trigger = ReadTextTrigger.Bookmark,
BookmarkName = bookmark,
TextOffset = -1,
TextLength = -1,
};
}
/// <summary>Creates a Word event carrying the given text offset and length; the bookmark name is null.</summary>
public static ReadTextEvent WordEvent(int textOffset, int textLength)
{
return new ReadTextEvent()
{
Trigger = ReadTextTrigger.Word,
BookmarkName = null,
TextOffset = textOffset,
TextLength = textLength,
};
}
// Private so that instances are only created through the static members above.
private ReadTextEvent()
{
}
}
/// <summary>
/// A SpeechDelegate is passed to the ITtsVoice.Speak() method, so that the caller may receive progress info as the text is being spoken.
/// </summary>
/// <param name="speechText">The text the notification refers to; SpeechCallback passes the text it was constructed with.</param>
/// <param name="readTextEvent">The progress event being reported.</param>
public delegate void SpeechDelegate(string speechText, ReadTextEvent readTextEvent);
/// <summary>
/// This class encapsulates everything necessary to invoke a SpeechDelegate.
/// A SpeechCallback instance may be created each time a new string is enqueued for speaking,
/// and then invoked multiple times throughout the process, with an updated ReadTextEvent.
/// </summary>
public class SpeechCallback
{
private readonly SpeechDelegate _speechDelegate;
public SpeechCallback(string text, SpeechDelegate speechDelegate)
{
Text = text;
_speechDelegate = speechDelegate;
}
public string Text { get; }
// Does nothing when no delegate was supplied.
public void Invoke(ReadTextEvent readTextEvent) => _speechDelegate?.Invoke(Text, readTextEvent);
}
}