Skip to content

Commit

Permalink
com.rest.elevenlabs 3.2.1 (#54)
Browse files Browse the repository at this point in the history
- make TextToSpeechRequest public
- fix text encoding bug
  • Loading branch information
StephenHodgson authored Dec 15, 2023
1 parent 09eca3f commit b5b5282
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 62 deletions.
31 changes: 30 additions & 1 deletion Runtime/Models/Model.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Licensed under the MIT License. See LICENSE in the project root for license information.

using System;
using Newtonsoft.Json;
using System.Collections.Generic;
using UnityEngine.Scripting;
Expand Down Expand Up @@ -79,12 +80,40 @@ public Model(

/// <summary>
/// Deprecated name for the <c>eleven_monolingual_v1</c> model; forwards to <see cref="EnglishV1"/>.
/// </summary>
[Preserve]
[JsonIgnore]
public static Model MonoLingualV1 { get; } = new Model("eleven_monolingual_v1");
[Obsolete("Use EnglishV1")]
public static Model MonoLingualV1 => EnglishV1;

/// <summary>
/// The standard English language model (<c>eleven_monolingual_v1</c>),
/// used to generate speech in a variety of voices, styles and moods.
/// </summary>
[Preserve]
[JsonIgnore]
public static Model EnglishV1 { get; } = new Model("eleven_monolingual_v1");

/// <summary>
/// English speech-to-speech model (<c>eleven_english_sts_v2</c>), suitable for scenarios where
/// you need maximum control over the content and prosody of your generations.
/// </summary>
/// <remarks>
/// NOTE(review): the id names a speech-to-speech ("sts") model; confirm it is accepted by the
/// text-to-speech endpoints before using it there.
/// </remarks>
[Preserve]
[JsonIgnore]
public static Model EnglishV2 { get; } = new Model("eleven_english_sts_v2");

/// <summary>
/// Cutting-edge turbo model (<c>eleven_turbo_v2</c>), ideally suited for tasks that demand extremely low latency.
/// </summary>
[Preserve]
[JsonIgnore]
public static Model EnglishTurboV2 { get; } = new Model("eleven_turbo_v2");

/// <summary>
/// Multilingual model (<c>eleven_multilingual_v1</c>): generates lifelike speech in multiple languages
/// to create content that resonates with a broader audience.
/// </summary>
[Preserve]
[JsonIgnore]
public static Model MultiLingualV1 { get; } = new Model("eleven_multilingual_v1");

/// <summary>
/// State-of-the-art multilingual speech synthesis model (<c>eleven_multilingual_v2</c>),
/// able to generate lifelike speech in 29 languages.
/// </summary>
[Preserve]
[JsonIgnore]
public static Model MultiLingualV2 { get; } = new Model("eleven_multilingual_v2");
Expand Down
94 changes: 46 additions & 48 deletions Runtime/TextToSpeech/TextToSpeechEndpoint.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using UnityEngine;
Expand Down Expand Up @@ -63,38 +64,43 @@ public TextToSpeechEndpoint(ElevenLabsClient client) : base(client) { }
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns><see cref="VoiceClip"/>.</returns>
public async Task<VoiceClip> TextToSpeechAsync(string text, Voice voice, VoiceSettings voiceSettings = null, Model model = null, OutputFormat outputFormat = OutputFormat.MP3_44100_128, int? optimizeStreamingLatency = null, CancellationToken cancellationToken = default)
{
    // Guard before voice.Settings is dereferenced below; without it a null voice surfaces as a
    // NullReferenceException rather than an ArgumentNullException naming the offending argument.
    if (voice == null)
    {
        throw new ArgumentNullException(nameof(voice));
    }

    // Settings precedence: explicit argument, then the voice's own settings,
    // then the defaults fetched from the voices endpoint.
    var resolvedSettings = voiceSettings ?? voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken);

    // The request constructor performs the remaining validation (text, voice id).
    var request = new TextToSpeechRequest(voice, text, Encoding.UTF8, resolvedSettings, outputFormat, optimizeStreamingLatency, model);
    return await TextToSpeechAsync(request, cancellationToken);
}
/// <summary>
/// Converts text into speech using a voice of your choice and returns audio.
/// </summary>
/// <param name="request"><see cref="TextToSpeechRequest"/>.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns><see cref="VoiceClip"/>.</returns>
public async Task<VoiceClip> TextToSpeechAsync(TextToSpeechRequest request, CancellationToken cancellationToken = default)
{
var payload = JsonConvert.SerializeObject(request, ElevenLabsClient.JsonSerializationOptions);
var parameters = new Dictionary<string, string>
{
{ OutputFormatParameter, outputFormat.ToString().ToLower() }
{ OutputFormatParameter, request.OutputFormat.ToString().ToLower() }
};

if (optimizeStreamingLatency.HasValue)
if (request.OptimizeStreamingLatency.HasValue)
{
parameters.Add(OptimizeStreamingLatencyParameter, optimizeStreamingLatency.Value.ToString());
parameters.Add(OptimizeStreamingLatencyParameter, request.OptimizeStreamingLatency.Value.ToString());
}

var response = await Rest.PostAsync(GetUrl($"/{voice.Id}", parameters), payload, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
var response = await Rest.PostAsync(GetUrl($"/{request.Voice.Id}", parameters), payload, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
response.Validate(EnableDebug);

if (!response.Headers.TryGetValue(HistoryItemId, out var clipId))
{
throw new ArgumentException("Failed to parse clip id!");
}

var audioType = outputFormat.GetAudioType();
var audioType = request.OutputFormat.GetAudioType();
var extension = audioType switch
{
AudioType.MPEG => "mp3",
AudioType.OGGVORBIS => "ogg",
_ => throw new ArgumentOutOfRangeException($"Unsupported {nameof(AudioType)}: {audioType}")
};
var downloadDirectory = await GetCacheDirectoryAsync(voice);
var downloadDirectory = await GetCacheDirectoryAsync(request.Voice);
var cachedPath = $"{downloadDirectory}/{clipId}.{extension}";

if (!File.Exists(cachedPath))
Expand All @@ -106,13 +112,13 @@ public async Task<VoiceClip> TextToSpeechAsync(string text, Voice voice, VoiceSe
break;
case AudioType.OGGVORBIS:
var pcmData = PCMEncoder.Decode(response.Data, PCMFormatSize.SixteenBit);
var frequency = outputFormat switch
var frequency = request.OutputFormat switch
{
OutputFormat.PCM_16000 => 16000,
OutputFormat.PCM_22050 => 22050,
OutputFormat.PCM_24000 => 24000,
OutputFormat.PCM_44100 => 44100,
_ => throw new ArgumentOutOfRangeException(nameof(outputFormat), outputFormat, null)
_ => throw new ArgumentOutOfRangeException(nameof(request.OutputFormat), request.OutputFormat, null)
};
var oggBytes = await OggEncoder.ConvertToBytesAsync(pcmData, frequency, 1, cancellationToken: cancellationToken).ConfigureAwait(false);
await File.WriteAllBytesAsync(cachedPath, oggBytes, cancellationToken).ConfigureAwait(false);
Expand All @@ -124,7 +130,7 @@ public async Task<VoiceClip> TextToSpeechAsync(string text, Voice voice, VoiceSe

await Awaiters.UnityMainThread;
var audioClip = await Rest.DownloadAudioClipAsync($"file://{cachedPath}", audioType, debug: EnableDebug, cancellationToken: cancellationToken);
return new VoiceClip(clipId, text, voice, audioClip, cachedPath);
return new VoiceClip(clipId, request.Text, request.Voice, audioClip, cachedPath);
}

/// <summary>
Expand Down Expand Up @@ -167,36 +173,47 @@ public async Task<VoiceClip> TextToSpeechAsync(string text, Voice voice, VoiceSe
/// </param>
/// <returns>Downloaded clip path, and the loaded audio clip.</returns>
public async Task<VoiceClip> StreamTextToSpeechAsync(string text, Voice voice, Action<AudioClip> partialClipCallback, VoiceSettings voiceSettings = null, Model model = null, OutputFormat outputFormat = OutputFormat.PCM_24000, int? optimizeStreamingLatency = null, CancellationToken cancellationToken = default)
{
    // Guard before voice.Settings is dereferenced below; without it a null voice surfaces as a
    // NullReferenceException rather than an ArgumentNullException naming the offending argument.
    if (voice == null)
    {
        throw new ArgumentNullException(nameof(voice));
    }

    // Settings precedence: explicit argument, then the voice's own settings,
    // then the defaults fetched from the voices endpoint.
    var resolvedSettings = voiceSettings ?? voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken);

    // The request constructor performs the remaining validation (text, voice id).
    var request = new TextToSpeechRequest(voice, text, Encoding.UTF8, resolvedSettings, outputFormat, optimizeStreamingLatency, model);
    return await StreamTextToSpeechAsync(request, partialClipCallback, cancellationToken);
}
/// <summary>
/// Converts text into speech using a voice of your choice and returns audio as an audio stream.
/// </summary>
/// <param name="request"><see cref="TextToSpeechRequest"/>.</param>
/// <param name="partialClipCallback">
/// Optional, Callback to enable streaming audio as it comes in.<br/>
/// Returns partial <see cref="VoiceClip"/>.
/// </param>
/// <param name="cancellationToken">
/// Optional, <see cref="CancellationToken"/>.
/// </param>
/// <returns>Downloaded clip path, and the loaded audio clip.</returns>
public async Task<VoiceClip> StreamTextToSpeechAsync(TextToSpeechRequest request, Action<AudioClip> partialClipCallback, CancellationToken cancellationToken = default)
{
var frequency = request.OutputFormat switch
{
OutputFormat.MP3_44100_64 => throw new InvalidOperationException($"{nameof(outputFormat)} must be a PCM format for streaming!"),
OutputFormat.MP3_44100_96 => throw new InvalidOperationException($"{nameof(outputFormat)} must be a PCM format for streaming!"),
OutputFormat.MP3_44100_128 => throw new InvalidOperationException($"{nameof(outputFormat)} must be a PCM format for streaming!"),
OutputFormat.MP3_44100_192 => throw new InvalidOperationException($"{nameof(outputFormat)} must be a PCM format for streaming!"),
OutputFormat.MP3_44100_64 => throw new InvalidOperationException($"{nameof(request.OutputFormat)} must be a PCM format for streaming!"),
OutputFormat.MP3_44100_96 => throw new InvalidOperationException($"{nameof(request.OutputFormat)} must be a PCM format for streaming!"),
OutputFormat.MP3_44100_128 => throw new InvalidOperationException($"{nameof(request.OutputFormat)} must be a PCM format for streaming!"),
OutputFormat.MP3_44100_192 => throw new InvalidOperationException($"{nameof(request.OutputFormat)} must be a PCM format for streaming!"),
OutputFormat.PCM_16000 => 16000,
OutputFormat.PCM_22050 => 22050,
OutputFormat.PCM_24000 => 24000,
OutputFormat.PCM_44100 => 44100,
_ => throw new ArgumentOutOfRangeException(nameof(outputFormat), outputFormat, null)
_ => throw new ArgumentOutOfRangeException(nameof(request.OutputFormat), request.OutputFormat, null)
};
var defaultVoiceSettings = voiceSettings ?? voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken);
var request = new TextToSpeechRequest(text, model, defaultVoiceSettings);
var payload = JsonConvert.SerializeObject(request, ElevenLabsClient.JsonSerializationOptions);
var parameters = new Dictionary<string, string>
{
{ OutputFormatParameter, outputFormat.ToString().ToLower() }
{ OutputFormatParameter, request.OutputFormat.ToString().ToLower() }
};

if (optimizeStreamingLatency.HasValue)
if (request.OptimizeStreamingLatency.HasValue)
{
parameters.Add(OptimizeStreamingLatencyParameter, optimizeStreamingLatency.Value.ToString());
parameters.Add(OptimizeStreamingLatencyParameter, request.OptimizeStreamingLatency.Value.ToString());
}

var part = 0;
var response = await Rest.PostAsync(GetUrl($"/{voice.Id}/stream", parameters), payload, StreamCallback, eventChunkSize: 8192, new RestParameters(client.DefaultRequestHeaders), cancellationToken).ConfigureAwait(true);
var response = await Rest.PostAsync(GetUrl($"/{request.Voice.Id}/stream", parameters), payload, StreamCallback, eventChunkSize: 8192, new RestParameters(client.DefaultRequestHeaders), cancellationToken).ConfigureAwait(true);
response.Validate(EnableDebug);

if (!response.Headers.TryGetValue(HistoryItemId, out var clipId))
Expand All @@ -207,12 +224,12 @@ public async Task<VoiceClip> StreamTextToSpeechAsync(string text, Voice voice, A
var pcmData = PCMEncoder.Decode(response.Data, PCMFormatSize.SixteenBit);
var fullClip = AudioClip.Create(clipId, pcmData.Length, 1, frequency, false);
var oggBytes = await OggEncoder.ConvertToBytesAsync(pcmData, frequency, 1, cancellationToken: cancellationToken).ConfigureAwait(false);
var downloadDirectory = await GetCacheDirectoryAsync(voice);
var downloadDirectory = await GetCacheDirectoryAsync(request.Voice);
var cachedPath = $"{downloadDirectory}/{clipId}.ogg";
await File.WriteAllBytesAsync(cachedPath, oggBytes, cancellationToken: cancellationToken).ConfigureAwait(false);
await Awaiters.UnityMainThread;
await Rest.DownloadAudioClipAsync($"file://{cachedPath}", AudioType.OGGVORBIS, debug: EnableDebug, cancellationToken: cancellationToken);
return new VoiceClip(clipId, text, voice, fullClip, cachedPath);
return new VoiceClip(clipId, request.Text, request.Voice, fullClip, cachedPath);

void StreamCallback(Response partialResponse)
{
Expand Down Expand Up @@ -242,25 +259,6 @@ void StreamCallback(Response partialResponse)
}
}

/// <summary>
/// Rejects unusable text-to-speech inputs before any request is built.
/// </summary>
/// <param name="text">Text to synthesize; must be non-blank and at most 5000 characters.</param>
/// <param name="voice">Voice to synthesize with; must be non-null and carry a non-blank id.</param>
/// <exception cref="ArgumentNullException">The text is blank, or the voice or its id is missing.</exception>
/// <exception cref="ArgumentOutOfRangeException">The text is longer than 5000 characters.</exception>
private static void ValidateInputs(string text, Voice voice)
{
    var textIsMissing = string.IsNullOrWhiteSpace(text);

    if (textIsMissing)
    {
        throw new ArgumentNullException(nameof(text));
    }

    var textIsTooLong = text.Length > 5000;

    if (textIsTooLong)
    {
        throw new ArgumentOutOfRangeException(nameof(text), $"{nameof(text)} cannot exceed 5000 characters");
    }

    // The null test short-circuits, so voice.Id is only read for a non-null voice.
    var voiceIsMissing = voice == null || string.IsNullOrWhiteSpace(voice.Id);

    if (voiceIsMissing)
    {
        throw new ArgumentNullException(nameof(voice));
    }
}

private static async Task<string> GetCacheDirectoryAsync(Voice voice)
{
await Rest.ValidateCacheDirectoryAsync();
Expand Down
43 changes: 33 additions & 10 deletions Runtime/TextToSpeech/TextToSpeechRequest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,38 @@
namespace ElevenLabs.TextToSpeech
{
[Preserve]
internal sealed class TextToSpeechRequest
public sealed class TextToSpeechRequest
{
/// <summary>
/// Creates a text to speech request.
/// </summary>
/// <param name="voice">The voice to use; must be non-null and have a non-blank id.</param>
/// <param name="text">The text to synthesize; must be non-blank and at most 5000 characters.</param>
/// <param name="encoding">
/// Optional, the encoding <paramref name="text"/> should be passed through before being sent.
/// When null or UTF-8 the text is used unchanged.
/// </param>
/// <param name="voiceSettings">Optional, falls back to the settings carried by <paramref name="voice"/>.</param>
/// <param name="outputFormat">Optional, the requested audio output format.</param>
/// <param name="optimizeStreamingLatency">Optional, streaming latency optimization value.</param>
/// <param name="model">Optional, defaults to the multilingual v2 model.</param>
/// <exception cref="ArgumentNullException">
/// The text is blank, the voice or its id is missing, or no voice settings could be resolved.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">The text is longer than 5000 characters.</exception>
[Preserve]
public TextToSpeechRequest(Voice voice, string text, Encoding encoding = null, VoiceSettings voiceSettings = null, OutputFormat outputFormat = OutputFormat.MP3_44100_128, int? optimizeStreamingLatency = null, Model model = null)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        throw new ArgumentNullException(nameof(text));
    }

    if (text.Length > 5000)
    {
        throw new ArgumentOutOfRangeException(nameof(text), $"{nameof(text)} cannot exceed 5000 characters");
    }

    if (voice == null ||
        string.IsNullOrWhiteSpace(voice.Id))
    {
        throw new ArgumentNullException(nameof(voice));
    }

    if (encoding != null && !encoding.Equals(Encoding.UTF8))
    {
        // Bytes produced by a non-UTF-8 encoding must be transcoded to UTF-8 before they are
        // decoded as UTF-8; decoding them directly corrupts every non-ASCII character.
        var utf8Bytes = Encoding.Convert(encoding, Encoding.UTF8, encoding.GetBytes(text));
        text = Encoding.UTF8.GetString(utf8Bytes);
    }

    Text = text;
    Model = model ?? Models.Model.MultiLingualV2;
    Voice = voice;
    // Explicit settings win; otherwise use the voice's own settings, and fail if neither exists.
    VoiceSettings = voiceSettings ?? voice.Settings ?? throw new ArgumentNullException(nameof(voiceSettings));
    OutputFormat = outputFormat;
    OptimizeStreamingLatency = optimizeStreamingLatency;
}

[Preserve]
Expand All @@ -41,8 +52,20 @@ public TextToSpeechRequest(
[JsonProperty("model_id")]
public string Model { get; }

/// <summary>
/// The voice this request targets. Excluded from the serialized JSON payload.
/// </summary>
[Preserve]
[JsonIgnore]
public Voice Voice { get; }

/// <summary>
/// The voice settings applied to this request, serialized as <c>voice_settings</c>.
/// </summary>
[Preserve]
[JsonProperty("voice_settings")]
public VoiceSettings VoiceSettings { get; internal set; }

/// <summary>
/// The requested audio output format. Excluded from the serialized JSON payload.
/// </summary>
[Preserve]
[JsonIgnore]
public OutputFormat OutputFormat { get; }

/// <summary>
/// Optional streaming latency optimization value; null when not specified.
/// Excluded from the serialized JSON payload.
/// </summary>
[Preserve]
[JsonIgnore]
public int? OptimizeStreamingLatency { get; }
}
}
4 changes: 2 additions & 2 deletions Samples~/TextToSpeech/TextToSpeechDemo.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Licensed under the MIT License. See LICENSE in the project root for license information.

using ElevenLabs.Models;
using ElevenLabs.Voices;
using System;
using System.Collections.Generic;
Expand Down Expand Up @@ -54,8 +55,7 @@ private async void Start()
var voiceClip = await api.TextToSpeechEndpoint.StreamTextToSpeechAsync(message, voice, partialClip =>
{
streamClipQueue.Enqueue(partialClip);
}, cancellationToken: lifetimeCancellationTokenSource.Token);

}, model: Model.EnglishTurboV2, cancellationToken: lifetimeCancellationTokenSource.Token);
audioSource.clip = voiceClip.AudioClip;
Debug.Log($"Full clip: {voiceClip.Id}");
}
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"displayName": "ElevenLabs",
"description": "A non-official Eleven Labs voice synthesis RESTful client.",
"keywords": [],
"version": "3.2.0",
"version": "3.2.1",
"unity": "2021.3",
"documentationUrl": "https://github.com/RageAgainstThePixel/com.rest.elevenlabs#documentation",
"changelogUrl": "https://github.com/RageAgainstThePixel/com.rest.elevenlabs/releases",
Expand Down

0 comments on commit b5b5282

Please sign in to comment.