summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorjacqueline <me@jacqueline.id.au>2024-09-11 12:57:04 +1000
committerjacqueline <me@jacqueline.id.au>2024-09-11 12:57:04 +1000
commit542ebc65317ac4744a4b96c3131dace5bda10314 (patch)
tree05593126ec6ac9f340fbd76afecb560a4da27ddd /src
parentd0b739c66ef11a6c16f99cad6957a1782236fd8c (diff)
downloadtangara-fw-542ebc65317ac4744a4b96c3131dace5bda10314.tar.gz
Play TTS files in response to TTS prompts, but it's legible now
- input files are upsampled and padded to stereo before playback
- any in-progress playback is cancelled before playing a new file
Diffstat (limited to 'src')
-rw-r--r--src/tangara/audio/processor.cpp33
-rw-r--r--src/tangara/audio/processor.hpp56
-rw-r--r--src/tangara/tts/player.cpp125
-rw-r--r--src/tangara/tts/player.hpp7
-rw-r--r--src/tangara/tts/provider.cpp2
5 files changed, 162 insertions, 61 deletions
diff --git a/src/tangara/audio/processor.cpp b/src/tangara/audio/processor.cpp
index aa2604b5..2fa7f78e 100644
--- a/src/tangara/audio/processor.cpp
+++ b/src/tangara/audio/processor.cpp
@@ -347,34 +347,39 @@ auto SampleProcessor::discardCommand(Args& command) -> void {
// End of stream commands can just be dropped without further action.
}
-SampleProcessor::Buffer::Buffer()
- : buffer_(reinterpret_cast<sample::Sample*>(
- heap_caps_calloc(kSampleBufferLength,
- sizeof(sample::Sample),
- MALLOC_CAP_DMA)),
- kSampleBufferLength),
+Buffer::Buffer(std::span<sample::Sample> storage)
+ : storage_(nullptr), buffer_(storage), samples_in_buffer_() {}
+
+Buffer::Buffer()
+ : storage_(reinterpret_cast<sample::Sample*>(
+ heap_caps_calloc(kSampleBufferLength,
+ sizeof(sample::Sample),
+ MALLOC_CAP_DMA))),
+ buffer_(storage_, kSampleBufferLength),
samples_in_buffer_() {}
-SampleProcessor::Buffer::~Buffer() {
- heap_caps_free(buffer_.data());
+Buffer::~Buffer() {
+ if (storage_) {
+ heap_caps_free(storage_);
+ }
}
-auto SampleProcessor::Buffer::writeAcquire() -> std::span<sample::Sample> {
+auto Buffer::writeAcquire() -> std::span<sample::Sample> {
return buffer_.subspan(samples_in_buffer_.size());
}
-auto SampleProcessor::Buffer::writeCommit(size_t samples) -> void {
+auto Buffer::writeCommit(size_t samples) -> void {
if (samples == 0) {
return;
}
samples_in_buffer_ = buffer_.first(samples + samples_in_buffer_.size());
}
-auto SampleProcessor::Buffer::readAcquire() -> std::span<sample::Sample> {
+auto Buffer::readAcquire() -> std::span<sample::Sample> {
return samples_in_buffer_;
}
-auto SampleProcessor::Buffer::readCommit(size_t samples) -> void {
+auto Buffer::readCommit(size_t samples) -> void {
if (samples == 0) {
return;
}
@@ -389,11 +394,11 @@ auto SampleProcessor::Buffer::readCommit(size_t samples) -> void {
}
}
-auto SampleProcessor::Buffer::isEmpty() -> bool {
+auto Buffer::isEmpty() -> bool {
return samples_in_buffer_.empty();
}
-auto SampleProcessor::Buffer::clear() -> void {
+auto Buffer::clear() -> void {
samples_in_buffer_ = {};
}
diff --git a/src/tangara/audio/processor.hpp b/src/tangara/audio/processor.hpp
index 45e05291..52bace95 100644
--- a/src/tangara/audio/processor.hpp
+++ b/src/tangara/audio/processor.hpp
@@ -22,6 +22,35 @@
namespace audio {
+/* Utility for managing buffering samples between digital filters. */
+class Buffer {
+ public:
+ Buffer(std::span<sample::Sample> storage);
+ Buffer();
+ ~Buffer();
+
+ /* Returns a span of the unused space within the buffer. */
+ auto writeAcquire() -> std::span<sample::Sample>;
+ /* Signals how many samples were just added to the writeAcquire span. */
+ auto writeCommit(size_t) -> void;
+
+ /* Returns a span of the samples stored within the buffer. */
+ auto readAcquire() -> std::span<sample::Sample>;
+ /* Signals how many samples from the readAcquire span were consumed. */
+ auto readCommit(size_t) -> void;
+
+ auto isEmpty() -> bool;
+ auto clear() -> void;
+
+ Buffer(const Buffer&) = delete;
+ Buffer& operator=(const Buffer&) = delete;
+
+ private:
+ sample::Sample* storage_;
+ std::span<sample::Sample> buffer_;
+ std::span<sample::Sample> samples_in_buffer_;
+};
+
/*
* Handle to a persistent task that converts samples between formats (sample
* rate, channels, bits per sample), in order to put samples in the preferred
@@ -87,33 +116,6 @@ class SampleProcessor {
StreamBufferHandle_t source_;
drivers::PcmBuffer& sink_;
- /* Internal utility for managing buffering samples between our filters. */
- class Buffer {
- public:
- Buffer();
- ~Buffer();
-
- /* Returns a span of the unused space within the buffer. */
- auto writeAcquire() -> std::span<sample::Sample>;
- /* Signals how many samples were just added to the writeAcquire span. */
- auto writeCommit(size_t) -> void;
-
- /* Returns a span of the samples stored within the buffer. */
- auto readAcquire() -> std::span<sample::Sample>;
- /* Signals how many samples from the readAcquire span were consumed. */
- auto readCommit(size_t) -> void;
-
- auto isEmpty() -> bool;
- auto clear() -> void;
-
- Buffer(const Buffer&) = delete;
- Buffer& operator=(const Buffer&) = delete;
-
- private:
- std::span<sample::Sample> buffer_;
- std::span<sample::Sample> samples_in_buffer_;
- };
-
Buffer input_buffer_;
Buffer resampled_buffer_;
Buffer output_buffer_;
diff --git a/src/tangara/tts/player.cpp b/src/tangara/tts/player.cpp
index 3fcd88bc..b5b99b5d 100644
--- a/src/tangara/tts/player.cpp
+++ b/src/tangara/tts/player.cpp
@@ -6,8 +6,12 @@
#include "tts/player.hpp"
+#include "audio/processor.hpp"
+#include "audio/resample.hpp"
#include "codec.hpp"
#include "esp_log.h"
+#include "freertos/projdefs.h"
+#include "portmacro.h"
#include "sample.hpp"
#include "types.hpp"
@@ -18,57 +22,140 @@ namespace tts {
Player::Player(tasks::WorkerPool& worker,
drivers::PcmBuffer& output,
audio::FatfsStreamFactory& factory)
- : bg_(worker), stream_factory_(factory), output_(output) {}
+ : bg_(worker), stream_factory_(factory), output_(output), play_count_(0) {}
auto Player::playFile(const std::string& path) -> void {
ESP_LOGI(kTag, "playing '%s'", path.c_str());
- bg_.Dispatch<void>([=]() {
+ int this_play = ++play_count_;
+
+ bg_.Dispatch<void>([=, this]() {
auto stream = stream_factory_.create(path);
if (!stream) {
ESP_LOGE(kTag, "creating stream failed");
return;
}
+
+ // FIXME: Rather than hardcoding WAV support only, we should work out a
+ // proper subset of 'low memory' decoders that can all be used for TTS
+ // playback.
if (stream->type() != codecs::StreamType::kWav) {
ESP_LOGE(kTag, "stream was unsupported type");
return;
}
+
auto decoder = codecs::CreateCodecForType(stream->type());
if (!decoder) {
ESP_LOGE(kTag, "creating decoder failed");
return;
}
+
std::unique_ptr<codecs::ICodec> codec{*decoder};
auto open_res = codec->OpenStream(stream, 0);
if (open_res.has_error()) {
ESP_LOGE(kTag, "opening stream failed");
return;
}
- // if (open_res->sample_rate_hz != 48000 || open_res->num_channels != 2) {
- // ESP_LOGE(kTag, "stream format is wrong (was %u channels @ %lu hz)",
- // open_res->num_channels, open_res->sample_rate_hz);
- // return;
- // }
- sample::Sample decode_buf[4096];
- for (;;) {
- auto decode_res = codec->DecodeTo(decode_buf);
+
+ decodeToSink(*open_res, std::move(codec), this_play);
+ });
+}
+
+auto Player::decodeToSink(const codecs::ICodec::OutputFormat& format,
+ std::unique_ptr<codecs::ICodec> codec,
+ int play_count) -> void {
+ // Set up buffers to hold samples between the intermediary parts of
+ // processing. We can just use the stack for these, since this method is
+ // called only from background workers, which have enormous stacks.
+ sample::Sample decode_storage[4096];
+ audio::Buffer decode_buf(decode_storage);
+
+ sample::Sample resample_storage[4096];
+ audio::Buffer resample_buf(resample_storage);
+
+ sample::Sample stereo_storage[4096];
+ audio::Buffer stereo_buf(stereo_storage);
+
+ // Work out what processing the codec's output needs.
+ std::unique_ptr<audio::Resampler> resampler;
+ if (format.sample_rate_hz != 48000) {
+ resampler = std::make_unique<audio::Resampler>(format.sample_rate_hz, 48000,
+ format.num_channels);
+ }
+ bool double_samples = format.num_channels == 1;
+
+ // FIXME: This decode-and-process loop is substantially the same as the audio
+ // processor's filter loop. Ideally we should refactor both of these loops to
+ // reuse code, however I'm holding off on doing this until we've implemented
+ // more advanced audio processing features in the audio processor (EQ, tempo
+ // shifting, etc.) as it's not clear to me yet how much the two codepaths will
+ // be diverging later anyway.
+ while (codec || !decode_buf.isEmpty() || !resample_buf.isEmpty() ||
+ !stereo_buf.isEmpty()) {
+ if (play_count != play_count_) {
+ // FIXME: This is a little unsafe and could maybe take out the first few
+ // samples of the next file.
+ output_.clear();
+ break;
+ }
+ if (codec) {
+ auto decode_res = codec->DecodeTo(decode_buf.writeAcquire());
if (decode_res.has_error()) {
ESP_LOGE(kTag, "decoding error");
- return;
+ break;
}
+ decode_buf.writeCommit(decode_res->samples_written);
if (decode_res->is_stream_finished) {
- break;
+ codec.reset();
}
+ }
+
+ if (!decode_buf.isEmpty()) {
+ auto resample_input = decode_buf.readAcquire();
+ auto resample_output = resample_buf.writeAcquire();
- std::span<sample::Sample> decode_span{decode_buf,
- decode_res->samples_written};
- while (!decode_span.empty()) {
- size_t sent = output_.send(decode_span);
- decode_span = decode_span.subspan(sent);
+ size_t read, wrote;
+ if (resampler) {
+ std::tie(read, wrote) =
+ resampler->Process(resample_input, resample_output, false);
+ } else {
+ read = wrote = std::min(resample_input.size(), resample_output.size());
+ std::copy_n(resample_input.begin(), read, resample_output.begin());
}
+
+ decode_buf.readCommit(read);
+ resample_buf.writeCommit(wrote);
}
- ESP_LOGI(kTag, "finished playing okay");
- });
+ if (!resample_buf.isEmpty()) {
+ auto channels_input = resample_buf.readAcquire();
+ auto channels_output = stereo_buf.writeAcquire();
+ size_t read, wrote;
+ if (double_samples) {
+ wrote = channels_output.size();
+ read = wrote / 2;
+ if (read > channels_input.size()) {
+ read = channels_input.size();
+ wrote = read * 2;
+ }
+ for (size_t i = 0; i < read; i++) {
+ channels_output[i * 2] = channels_input[i];
+ channels_output[(i * 2) + 1] = channels_input[i];
+ }
+ } else {
+ read = wrote = std::min(channels_input.size(), channels_output.size());
+ std::copy_n(channels_input.begin(), read, channels_output.begin());
+ }
+ resample_buf.readCommit(read);
+ stereo_buf.writeCommit(wrote);
+ }
+
+ // The mixin PcmBuffer should almost always be draining, so we can force
+ // samples into it more aggressively than with the main music PcmBuffer.
+ while (!stereo_buf.isEmpty()) {
+ size_t sent = output_.send(stereo_buf.readAcquire());
+ stereo_buf.readCommit(sent);
+ }
+ }
}
} // namespace tts
diff --git a/src/tangara/tts/player.hpp b/src/tangara/tts/player.hpp
index a132b9cd..0a3ba723 100644
--- a/src/tangara/tts/player.hpp
+++ b/src/tangara/tts/player.hpp
@@ -9,6 +9,7 @@
#include <string>
#include "audio/fatfs_stream_factory.hpp"
+#include "codec.hpp"
#include "drivers/pcm_buffer.hpp"
#include "tasks.hpp"
@@ -33,6 +34,12 @@ class Player {
tasks::WorkerPool& bg_;
audio::FatfsStreamFactory& stream_factory_;
drivers::PcmBuffer& output_;
+
+ std::atomic<int> play_count_;
+
+ auto decodeToSink(const codecs::ICodec::OutputFormat&,
+ std::unique_ptr<codecs::ICodec>,
+ int play_count) -> void;
};
} // namespace tts
diff --git a/src/tangara/tts/provider.cpp b/src/tangara/tts/provider.cpp
index b7c1e55d..2b1dd4e6 100644
--- a/src/tangara/tts/provider.cpp
+++ b/src/tangara/tts/provider.cpp
@@ -28,7 +28,7 @@ static const char* kTtsPath = "/.tangara-tts/";
static auto textToFile(const std::string& text) -> std::optional<std::string> {
uint64_t hash = komihash(text.data(), text.size(), 0);
std::stringstream stream;
- stream << kTtsPath << std::hex << hash << ".wav";
+ stream << kTtsPath << std::hex << hash;
return stream.str();
}