Play TTS files in response to TTS prompts, but it's legible now

- input files are upsampled and padded to stereo before playback
- any in-progress playback is cancelled before playing a new file
author: jacqueline <me@jacqueline.id.au> 2024-09-11 12:57:04 +1000
committer: jacqueline <me@jacqueline.id.au> 2024-09-11 12:57:04 +1000
commit: 542ebc65317ac4744a4b96c3131dace5bda10314 (patch)
tree: 05593126ec6ac9f340fbd76afecb560a4da27ddd /src
parent: d0b739c66ef11a6c16f99cad6957a1782236fd8c (diff)
download: tangara-fw-542ebc65317ac4744a4b96c3131dace5bda10314.tar.gz
5 files changed, 162 insertions, 61 deletions
diff --git a/src/tangara/audio/processor.cpp b/src/tangara/audio/processor.cpp
index aa2604b5..2fa7f78e 100644
--- a/src/tangara/audio/processor.cpp
+++ b/src/tangara/audio/processor.cpp
@@ -347,34 +347,39 @@ auto SampleProcessor::discardCommand(Args& command) -> void {
   // End of stream commands can just be dropped without further action.
 }
 
-SampleProcessor::Buffer::Buffer()
-    : buffer_(reinterpret_cast<sample::Sample*>(
-                  heap_caps_calloc(kSampleBufferLength,
-                                   sizeof(sample::Sample),
-                                   MALLOC_CAP_DMA)),
-              kSampleBufferLength),
+Buffer::Buffer(std::span<sample::Sample> storage)
+    : storage_(nullptr), buffer_(storage), samples_in_buffer_() {}
+
+Buffer::Buffer()
+    : storage_(reinterpret_cast<sample::Sample*>(
+          heap_caps_calloc(kSampleBufferLength,
+                           sizeof(sample::Sample),
+                           MALLOC_CAP_DMA))),
+      buffer_(storage_, kSampleBufferLength),
       samples_in_buffer_() {}
 
-SampleProcessor::Buffer::~Buffer() {
-  heap_caps_free(buffer_.data());
+Buffer::~Buffer() {
+  if (storage_) {
+    heap_caps_free(storage_);
+  }
 }
 
-auto SampleProcessor::Buffer::writeAcquire() -> std::span<sample::Sample> {
+auto Buffer::writeAcquire() -> std::span<sample::Sample> {
   return buffer_.subspan(samples_in_buffer_.size());
 }
 
-auto SampleProcessor::Buffer::writeCommit(size_t samples) -> void {
+auto Buffer::writeCommit(size_t samples) -> void {
   if (samples == 0) {
     return;
   }
   samples_in_buffer_ = buffer_.first(samples + samples_in_buffer_.size());
 }
 
-auto SampleProcessor::Buffer::readAcquire() -> std::span<sample::Sample> {
+auto Buffer::readAcquire() -> std::span<sample::Sample> {
   return samples_in_buffer_;
 }
 
-auto SampleProcessor::Buffer::readCommit(size_t samples) -> void {
+auto Buffer::readCommit(size_t samples) -> void {
   if (samples == 0) {
     return;
   }
@@ -389,11 +394,11 @@ auto SampleProcessor::Buffer::readCommit(size_t samples) -> void {
   }
 }
 
-auto SampleProcessor::Buffer::isEmpty() -> bool {
+auto Buffer::isEmpty() -> bool {
   return samples_in_buffer_.empty();
 }
 
-auto SampleProcessor::Buffer::clear() -> void {
+auto Buffer::clear() -> void {
   samples_in_buffer_ = {};
 }
 
diff --git a/src/tangara/audio/processor.hpp b/src/tangara/audio/processor.hpp
index 45e05291..52bace95 100644
--- a/src/tangara/audio/processor.hpp
+++ b/src/tangara/audio/processor.hpp
@@ -22,6 +22,35 @@
 
 namespace audio {
 
+/* Utility for managing buffering samples between digital filters. */
+class Buffer {
+ public:
+  Buffer(std::span<sample::Sample> storage);
+  Buffer();
+  ~Buffer();
+
+  /* Returns a span of the unused space within the buffer. */
+  auto writeAcquire() -> std::span<sample::Sample>;
+  /* Signals how many samples were just added to the writeAcquire span. */
+  auto writeCommit(size_t) -> void;
+
+  /* Returns a span of the samples stored within the buffer. */
+  auto readAcquire() -> std::span<sample::Sample>;
+  /* Signals how many samples from the readAcquire span were consumed. */
+  auto readCommit(size_t) -> void;
+
+  auto isEmpty() -> bool;
+  auto clear() -> void;
+
+  Buffer(const Buffer&) = delete;
+  Buffer& operator=(const Buffer&) = delete;
+
+ private:
+  sample::Sample* storage_;
+  std::span<sample::Sample> buffer_;
+  std::span<sample::Sample> samples_in_buffer_;
+};
+
 /*
  * Handle to a persistent task that converts samples between formats (sample
  * rate, channels, bits per sample), in order to put samples in the preferred
@@ -87,33 +116,6 @@ class SampleProcessor {
   StreamBufferHandle_t source_;
   drivers::PcmBuffer& sink_;
 
-  /* Internal utility for managing buffering samples between our filters. */
-  class Buffer {
-   public:
-    Buffer();
-    ~Buffer();
-
-    /* Returns a span of the unused space within the buffer. */
-    auto writeAcquire() -> std::span<sample::Sample>;
-    /* Signals how many samples were just added to the writeAcquire span. */
-    auto writeCommit(size_t) -> void;
-
-    /* Returns a span of the samples stored within the buffer. */
-    auto readAcquire() -> std::span<sample::Sample>;
-    /* Signals how many samples from the readAcquire span were consumed. */
-    auto readCommit(size_t) -> void;
-
-    auto isEmpty() -> bool;
-    auto clear() -> void;
-
-    Buffer(const Buffer&) = delete;
-    Buffer& operator=(const Buffer&) = delete;
-
-   private:
-    std::span<sample::Sample> buffer_;
-    std::span<sample::Sample> samples_in_buffer_;
-  };
-
   Buffer input_buffer_;
   Buffer resampled_buffer_;
   Buffer output_buffer_;
diff --git a/src/tangara/tts/player.cpp b/src/tangara/tts/player.cpp
index 3fcd88bc..b5b99b5d 100644
--- a/src/tangara/tts/player.cpp
+++ b/src/tangara/tts/player.cpp
@@ -6,8 +6,12 @@
 
 #include "tts/player.hpp"
 
+#include "audio/processor.hpp"
+#include "audio/resample.hpp"
 #include "codec.hpp"
 #include "esp_log.h"
+#include "freertos/projdefs.h"
+#include "portmacro.h"
 #include "sample.hpp"
 #include "types.hpp"
 
@@ -18,57 +22,140 @@ namespace tts {
 Player::Player(tasks::WorkerPool& worker,
                drivers::PcmBuffer& output,
                audio::FatfsStreamFactory& factory)
-    : bg_(worker), stream_factory_(factory), output_(output) {}
+    : bg_(worker), stream_factory_(factory), output_(output), play_count_(0) {}
 
 auto Player::playFile(const std::string& path) -> void {
   ESP_LOGI(kTag, "playing '%s'", path.c_str());
-  bg_.Dispatch<void>([=]() {
+  int this_play = ++play_count_;
+
+  bg_.Dispatch<void>([=, this]() {
     auto stream = stream_factory_.create(path);
     if (!stream) {
       ESP_LOGE(kTag, "creating stream failed");
       return;
     }
+
+    // FIXME: Rather than hardcoding WAV support only, we should work out a
+    // proper subset of 'low memory' decoders that can all be used for TTS
+    // playback.
     if (stream->type() != codecs::StreamType::kWav) {
       ESP_LOGE(kTag, "stream was unsupported type");
       return;
     }
+
     auto decoder = codecs::CreateCodecForType(stream->type());
     if (!decoder) {
       ESP_LOGE(kTag, "creating decoder failed");
       return;
     }
+
     std::unique_ptr<codecs::ICodec> codec{*decoder};
     auto open_res = codec->OpenStream(stream, 0);
     if (open_res.has_error()) {
       ESP_LOGE(kTag, "opening stream failed");
       return;
     }
-    // if (open_res->sample_rate_hz != 48000 || open_res->num_channels != 2) {
-    // ESP_LOGE(kTag, "stream format is wrong (was %u channels @ %lu hz)",
-    // open_res->num_channels, open_res->sample_rate_hz);
-    // return;
-    // }
-    sample::Sample decode_buf[4096];
-    for (;;) {
-      auto decode_res = codec->DecodeTo(decode_buf);
+
+    decodeToSink(*open_res, std::move(codec), this_play);
+  });
+}
+
+auto Player::decodeToSink(const codecs::ICodec::OutputFormat& format,
+                          std::unique_ptr<codecs::ICodec> codec,
+                          int play_count) -> void {
+  // Set up buffers to hold samples between the intermediary parts of
+  // processing. We can just use the stack for these, since this method is
+  // called only from background workers, which have enormous stacks.
+  sample::Sample decode_storage[4096];
+  audio::Buffer decode_buf(decode_storage);
+
+  sample::Sample resample_storage[4096];
+  audio::Buffer resample_buf(resample_storage);
+
+  sample::Sample stereo_storage[4096];
+  audio::Buffer stereo_buf(stereo_storage);
+
+  // Work out what processing the codec's output needs.
+  std::unique_ptr<audio::Resampler> resampler;
+  if (format.sample_rate_hz != 48000) {
+    resampler = std::make_unique<audio::Resampler>(format.sample_rate_hz, 48000,
+                                                   format.num_channels);
+  }
+  bool double_samples = format.num_channels == 1;
+
+  // FIXME: This decode-and-process loop is substantially the same as the audio
+  // processor's filter loop. Ideally we should refactor both of these loops to
+  // reuse code, however I'm holding off on doing this until we've implemented
+  // more advanced audio processing features in the audio processor (EQ, tempo
+  // shifting, etc.) as it's not clear to me yet how much the two codepaths will
+  // be diverging later anyway.
+  while (codec || !decode_buf.isEmpty() || !resample_buf.isEmpty() ||
+         !stereo_buf.isEmpty()) {
+    if (play_count != play_count_) {
+      // FIXME: This is a little unsafe and could maybe take out the first few
+      // samples of the next file.
+      output_.clear();
+      break;
+    }
+    if (codec) {
+      auto decode_res = codec->DecodeTo(decode_buf.writeAcquire());
       if (decode_res.has_error()) {
         ESP_LOGE(kTag, "decoding error");
-        return;
+        break;
       }
+      decode_buf.writeCommit(decode_res->samples_written);
       if (decode_res->is_stream_finished) {
-        break;
+        codec.reset();
       }
+    }
+
+    if (!decode_buf.isEmpty()) {
+      auto resample_input = decode_buf.readAcquire();
+      auto resample_output = resample_buf.writeAcquire();
 
-      std::span<sample::Sample> decode_span{decode_buf,
-                                            decode_res->samples_written};
-      while (!decode_span.empty()) {
-        size_t sent = output_.send(decode_span);
-        decode_span = decode_span.subspan(sent);
+      size_t read, wrote;
+      if (resampler) {
+        std::tie(read, wrote) =
+            resampler->Process(resample_input, resample_output, false);
+      } else {
+        read = wrote = std::min(resample_input.size(), resample_output.size());
+        std::copy_n(resample_input.begin(), read, resample_output.begin());
       }
+
+      decode_buf.readCommit(read);
+      resample_buf.writeCommit(wrote);
     }
 
-    ESP_LOGI(kTag, "finished playing okay");
-  });
+    if (!resample_buf.isEmpty()) {
+      auto channels_input = resample_buf.readAcquire();
+      auto channels_output = stereo_buf.writeAcquire();
+      size_t read, wrote;
+      if (double_samples) {
+        wrote = channels_output.size();
+        read = wrote / 2;
+        if (read > channels_input.size()) {
+          read = channels_input.size();
+          wrote = read * 2;
+        }
+        for (size_t i = 0; i < read; i++) {
+          channels_output[i * 2] = channels_input[i];
+          channels_output[(i * 2) + 1] = channels_input[i];
+        }
+      } else {
+        read = wrote = std::min(channels_input.size(), channels_output.size());
+        std::copy_n(channels_input.begin(), read, channels_output.begin());
+      }
+      resample_buf.readCommit(read);
+      stereo_buf.writeCommit(wrote);
+    }
+
+    // The mixin PcmBuffer should almost always be draining, so we can force
+    // samples into it more aggressively than with the main music PcmBuffer.
+    while (!stereo_buf.isEmpty()) {
+      size_t sent = output_.send(stereo_buf.readAcquire());
+      stereo_buf.readCommit(sent);
+    }
+  }
 }
 
 }  // namespace tts
diff --git a/src/tangara/tts/player.hpp b/src/tangara/tts/player.hpp
index a132b9cd..0a3ba723 100644
--- a/src/tangara/tts/player.hpp
+++ b/src/tangara/tts/player.hpp
@@ -9,6 +9,7 @@
 #include <string>
 
 #include "audio/fatfs_stream_factory.hpp"
+#include "codec.hpp"
 #include "drivers/pcm_buffer.hpp"
 #include "tasks.hpp"
 
@@ -33,6 +34,12 @@ class Player {
   tasks::WorkerPool& bg_;
   audio::FatfsStreamFactory& stream_factory_;
   drivers::PcmBuffer& output_;
+
+  std::atomic<int> play_count_;
+
+  auto decodeToSink(const codecs::ICodec::OutputFormat&,
+                    std::unique_ptr<codecs::ICodec>,
+                    int play_count) -> void;
 };
 
 }  // namespace tts
diff --git a/src/tangara/tts/provider.cpp b/src/tangara/tts/provider.cpp
index b7c1e55d..2b1dd4e6 100644
--- a/src/tangara/tts/provider.cpp
+++ b/src/tangara/tts/provider.cpp
@@ -28,7 +28,7 @@ static const char* kTtsPath = "/.tangara-tts/";
 static auto textToFile(const std::string& text) -> std::optional<std::string> {
   uint64_t hash = komihash(text.data(), text.size(), 0);
   std::stringstream stream;
-  stream << kTtsPath << std::hex << hash << ".wav";
+  stream << kTtsPath << std::hex << hash;
   return stream.str();
 }
author	jacqueline <me@jacqueline.id.au>	2024-09-11 12:57:04 +1000
committer	jacqueline <me@jacqueline.id.au>	2024-09-11 12:57:04 +1000
commit	542ebc65317ac4744a4b96c3131dace5bda10314 (patch)
tree	05593126ec6ac9f340fbd76afecb560a4da27ddd /src
parent	d0b739c66ef11a6c16f99cad6957a1782236fd8c (diff)
download	tangara-fw-542ebc65317ac4744a4b96c3131dace5bda10314.tar.gz