21 files changed, 488 insertions, 105 deletions
diff --git a/src/codecs/wav.cpp b/src/codecs/wav.cpp
index f5b9d789..746f44ca 100644
--- a/src/codecs/wav.cpp
+++ b/src/codecs/wav.cpp
@@ -137,8 +137,6 @@ auto WavDecoder::OpenStream(std::shared_ptr<IStream> input, uint32_t offset)
   // uint32_t file_size = bytes_to_u32(buffer_span.subspan(4, 4)) + 8;
 
   std::string fmt_header = bytes_to_str(buffer_span.subspan(12, 4));
-  ESP_LOGI(kTag, "fmt header found? %s",
-           (fmt_header.starts_with("fmt")) ? "yes" : "no");
   if (!fmt_header.starts_with("fmt")) {
     ESP_LOGW(kTag, "Could not find format chunk");
     return cpp::fail(Error::kMalformedData);
diff --git a/src/drivers/bluetooth.cpp b/src/drivers/bluetooth.cpp
index 8ec30395..3da5dd0c 100644
--- a/src/drivers/bluetooth.cpp
+++ b/src/drivers/bluetooth.cpp
@@ -38,7 +38,7 @@ namespace drivers {
 
 [[maybe_unused]] static constexpr char kTag[] = "bluetooth";
 
-DRAM_ATTR static PcmBuffer* sStream = nullptr;
+DRAM_ATTR static OutputBuffers* sStreams = nullptr;
 DRAM_ATTR static std::atomic<float> sVolumeFactor = 1.f;
 
 static tasks::WorkerPool* sBgWorker;
@@ -97,13 +97,16 @@ IRAM_ATTR auto a2dp_data_cb(uint8_t* buf, int32_t buf_size) -> int32_t {
   if (buf == nullptr || buf_size <= 0) {
     return 0;
   }
-  PcmBuffer* stream = sStream;
-  if (stream == nullptr) {
+  OutputBuffers* streams = sStreams;
+  if (streams == nullptr) {
     return 0;
   }
 
   int16_t* samples = reinterpret_cast<int16_t*>(buf);
-  stream->receive({samples, static_cast<size_t>(buf_size / 2)}, false);
+  streams->first.receive({samples, static_cast<size_t>(buf_size / 2)}, false,
+                         false);
+  streams->second.receive({samples, static_cast<size_t>(buf_size / 2)}, true,
+                          false);
 
   // Apply software volume scaling.
   float factor = sVolumeFactor.load();
@@ -141,14 +144,14 @@ auto Bluetooth::enabled() -> bool {
   return !bluetooth::BluetoothState::is_in_state<bluetooth::Disabled>();
 }
 
-auto Bluetooth::source(PcmBuffer* src) -> void {
-  if (src == sStream) {
+auto Bluetooth::sources(OutputBuffers* src) -> void {
+  auto lock = bluetooth::BluetoothState::lock();
+  if (src == sStreams) {
     return;
   }
-  auto lock = bluetooth::BluetoothState::lock();
-  sStream = src;
+  sStreams = src;
   tinyfsm::FsmList<bluetooth::BluetoothState>::dispatch(
-      bluetooth::events::SourceChanged{});
+      bluetooth::events::SourcesChanged{});
 }
 
 auto Bluetooth::softVolume(float f) -> void {
@@ -771,8 +774,8 @@ void Connected::react(const events::PairedDeviceChanged& ev) {
   }
 }
 
-void Connected::react(const events::SourceChanged& ev) {
-  if (sStream != nullptr) {
+void Connected::react(const events::SourcesChanged& ev) {
+  if (sStreams != nullptr) {
     ESP_LOGI(kTag, "checking source is ready");
     esp_a2d_media_ctrl(ESP_A2D_MEDIA_CTRL_CHECK_SRC_RDY);
   } else {
diff --git a/src/drivers/i2s_dac.cpp b/src/drivers/i2s_dac.cpp
index 9c9bb793..46bf8e80 100644
--- a/src/drivers/i2s_dac.cpp
+++ b/src/drivers/i2s_dac.cpp
@@ -52,10 +52,12 @@ extern "C" IRAM_ATTR auto callback(i2s_chan_handle_t handle,
   assert(event->size % 4 == 0);
 
   uint8_t* buf = reinterpret_cast<uint8_t*>(event->dma_buf);
-  auto* src = reinterpret_cast<PcmBuffer*>(user_ctx);
+  auto* src = reinterpret_cast<OutputBuffers*>(user_ctx);
 
-  BaseType_t ret =
-      src->receive({reinterpret_cast<int16_t*>(buf), event->size / 2}, true);
+  BaseType_t ret1 = src->first.receive(
+      {reinterpret_cast<int16_t*>(buf), event->size / 2}, false, true);
+  BaseType_t ret2 = src->second.receive(
+      {reinterpret_cast<int16_t*>(buf), event->size / 2}, true, true);
 
   // The ESP32's I2S peripheral has a different endianness to its processors.
   // ESP-IDF handles this difference for stereo channels, but not for mono
@@ -70,10 +72,10 @@ extern "C" IRAM_ATTR auto callback(i2s_chan_handle_t handle,
     }
   }
 
-  return ret;
+  return ret1 || ret2;
 }
 
-auto I2SDac::create(IGpios& expander, PcmBuffer& buf)
+auto I2SDac::create(IGpios& expander, OutputBuffers& bufs)
     -> std::optional<I2SDac*> {
   i2s_chan_handle_t i2s_handle;
   i2s_chan_config_t channel_config{
@@ -90,7 +92,7 @@ auto I2SDac::create(IGpios& expander, PcmBuffer& buf)
   // First, instantiate the instance so it can do all of its power on
   // configuration.
   std::unique_ptr<I2SDac> dac =
-      std::make_unique<I2SDac>(expander, buf, i2s_handle);
+      std::make_unique<I2SDac>(expander, bufs, i2s_handle);
 
   // Whilst we wait for the initial boot, we can work on installing the I2S
   // driver.
@@ -122,14 +124,14 @@ auto I2SDac::create(IGpios& expander, PcmBuffer& buf)
       .on_sent = callback,
       .on_send_q_ovf = NULL,
   };
-  i2s_channel_register_event_callback(i2s_handle, &callbacks, &buf);
+  i2s_channel_register_event_callback(i2s_handle, &callbacks, &bufs);
 
   return dac.release();
 }
 
-I2SDac::I2SDac(IGpios& gpio, PcmBuffer& buf, i2s_chan_handle_t i2s_handle)
+I2SDac::I2SDac(IGpios& gpio, OutputBuffers& bufs, i2s_chan_handle_t i2s_handle)
     : gpio_(gpio),
-      buffer_(buf),
+      buffers_(bufs),
       i2s_handle_(i2s_handle),
       i2s_active_(false),
       clock_config_(I2S_STD_CLK_DEFAULT_CONFIG(48000)),
diff --git a/src/drivers/include/drivers/bluetooth.hpp b/src/drivers/include/drivers/bluetooth.hpp
index 449812d6..99c71e52 100644
--- a/src/drivers/include/drivers/bluetooth.hpp
+++ b/src/drivers/include/drivers/bluetooth.hpp
@@ -45,7 +45,7 @@ class Bluetooth {
   auto enable(bool en) -> void;
   auto enabled() -> bool;
 
-  auto source(PcmBuffer*) -> void;
+  auto sources(OutputBuffers*) -> void;
   auto softVolume(float) -> void;
 
   enum class ConnectionState {
@@ -98,7 +98,7 @@ struct Disable : public tinyfsm::Event {};
 
 struct ConnectTimedOut : public tinyfsm::Event {};
 struct PairedDeviceChanged : public tinyfsm::Event {};
-struct SourceChanged : public tinyfsm::Event {};
+struct SourcesChanged : public tinyfsm::Event {};
 struct DeviceDiscovered : public tinyfsm::Event {
   const Device& device;
 };
@@ -172,7 +172,7 @@ class BluetoothState : public tinyfsm::Fsm<BluetoothState> {
   virtual void react(const events::Disable& ev) = 0;
   virtual void react(const events::ConnectTimedOut& ev){};
   virtual void react(const events::PairedDeviceChanged& ev){};
-  virtual void react(const events::SourceChanged& ev){};
+  virtual void react(const events::SourcesChanged& ev){};
 
   virtual void react(const events::DeviceDiscovered&);
 
@@ -243,7 +243,7 @@ class Connected : public BluetoothState {
   void exit() override;
 
   void react(const events::PairedDeviceChanged& ev) override;
-  void react(const events::SourceChanged& ev) override;
+  void react(const events::SourcesChanged& ev) override;
 
   void react(const events::Disable& ev) override;
   void react(events::internal::Gap ev) override;
diff --git a/src/drivers/include/drivers/i2s_dac.hpp b/src/drivers/include/drivers/i2s_dac.hpp
index cf9258c0..891acb56 100644
--- a/src/drivers/include/drivers/i2s_dac.hpp
+++ b/src/drivers/include/drivers/i2s_dac.hpp
@@ -40,9 +40,10 @@ constexpr size_t kI2SBufferLengthFrames = 1024;
  */
 class I2SDac {
  public:
-  static auto create(IGpios& expander, PcmBuffer&) -> std::optional<I2SDac*>;
+  static auto create(IGpios& expander, OutputBuffers&)
+      -> std::optional<I2SDac*>;
 
-  I2SDac(IGpios& gpio, PcmBuffer&, i2s_chan_handle_t i2s_handle);
+  I2SDac(IGpios& gpio, OutputBuffers&, i2s_chan_handle_t i2s_handle);
   ~I2SDac();
 
   auto SetPaused(bool) -> void;
@@ -77,7 +78,7 @@ class I2SDac {
   auto set_channel(bool) -> void;
 
   IGpios& gpio_;
-  PcmBuffer& buffer_;
+  OutputBuffers& buffers_;
   i2s_chan_handle_t i2s_handle_;
 
   bool i2s_active_;
diff --git a/src/drivers/include/drivers/pcm_buffer.hpp b/src/drivers/include/drivers/pcm_buffer.hpp
index 8f53317e..6b38be94 100644
--- a/src/drivers/include/drivers/pcm_buffer.hpp
+++ b/src/drivers/include/drivers/pcm_buffer.hpp
@@ -39,11 +39,17 @@ class PcmBuffer {
    * Fills the given span with samples. If enough samples are available in
    * the buffer, then the span will be filled with samples from the buffer. Any
    * shortfall is made up by padding the given span with zeroes.
+   *
+   * If `mix` is true then the retrieved samples are instead summed into the
+   * samples already in the destination span, saturating at the int16_t limits
+   * (so loud inputs may clip). When mixing, any shortfall leaves the remainder
+   * of the destination untouched rather than zero-padded.
    */
-  auto receive(std::span<int16_t>, bool isr) -> BaseType_t;
+  auto receive(std::span<int16_t>, bool mix, bool isr) -> BaseType_t;
 
   auto clear() -> void;
   auto isEmpty() -> bool;
+  auto suspend(bool) -> void;
 
   /*
    * How many samples have been added to this buffer since it was created. This
@@ -62,7 +68,7 @@ class PcmBuffer {
   PcmBuffer& operator=(const PcmBuffer&) = delete;
 
  private:
-  auto readSingle(std::span<int16_t>, bool isr)
+  auto readSingle(std::span<int16_t>, bool mix, bool isr)
       -> std::pair<size_t, BaseType_t>;
 
   StaticRingbuffer_t meta_;
@@ -70,7 +76,21 @@ class PcmBuffer {
 
   std::atomic<uint32_t> sent_;
   std::atomic<uint32_t> received_;
+  std::atomic<bool> suspended_;
+
   RingbufHandle_t ringbuf_;
 };
 
+/*
+ * Convenience type for a pair of PcmBuffers. Each audio output handles mixing
+ * streams together to ensure that low-latency sounds in one channel (e.g. a
+ * system notification bleep) aren't delayed by a large audio buffer in the
+ * other channel (e.g. a long-running track).
+ *
+ * By convention, the first buffer of this pair is used for tracks, whilst the
+ * second is reserved for 'system sounds'; usually TTS, but potentially other
+ * informative noises.
+ */
+using OutputBuffers = std::pair<PcmBuffer, PcmBuffer>;
+
 }  // namespace drivers
diff --git a/src/drivers/pcm_buffer.cpp b/src/drivers/pcm_buffer.cpp
index 071f5cea..bc58d4b9 100644
--- a/src/drivers/pcm_buffer.cpp
+++ b/src/drivers/pcm_buffer.cpp
@@ -25,7 +25,8 @@ namespace drivers {
 
 [[maybe_unused]] static const char kTag[] = "pcmbuf";
 
-PcmBuffer::PcmBuffer(size_t size_in_samples) : sent_(0), received_(0) {
+PcmBuffer::PcmBuffer(size_t size_in_samples)
+    : sent_(0), received_(0), suspended_(false) {
   size_t size_in_bytes = size_in_samples * sizeof(int16_t);
   ESP_LOGI(kTag, "allocating pcm buffer of size %u (%uKiB)", size_in_samples,
            size_in_bytes / 1024);
@@ -49,18 +50,26 @@ auto PcmBuffer::send(std::span<const int16_t> data) -> size_t {
   return data.size();
 }
 
-IRAM_ATTR auto PcmBuffer::receive(std::span<int16_t> dest, bool isr)
+IRAM_ATTR auto PcmBuffer::receive(std::span<int16_t> dest, bool mix, bool isr)
     -> BaseType_t {
+  if (suspended_) {
+    if (!mix) {
+      std::fill_n(dest.begin(), dest.size(), 0);
+    }
+    return false;
+  }
+
   size_t first_read = 0, second_read = 0;
   BaseType_t ret1 = false, ret2 = false;
-  std::tie(first_read, ret1) = readSingle(dest, isr);
+  std::tie(first_read, ret1) = readSingle(dest, mix, isr);
 
   if (first_read < dest.size()) {
-    std::tie(second_read, ret2) = readSingle(dest.subspan(first_read), isr);
+    std::tie(second_read, ret2) =
+        readSingle(dest.subspan(first_read), mix, isr);
   }
 
   size_t total_read = first_read + second_read;
-  if (total_read < dest.size()) {
+  if (total_read < dest.size() && !mix) {
     std::fill_n(dest.begin() + total_read, dest.size() - total_read, 0);
   }
 
@@ -85,6 +94,10 @@ auto PcmBuffer::isEmpty() -> bool {
          xRingbufferGetCurFreeSize(ringbuf_);
 }
 
+auto PcmBuffer::suspend(bool s) -> void {
+  suspended_ = s;
+}
+
 auto PcmBuffer::totalSent() -> uint32_t {
   return sent_;
 }
@@ -93,7 +106,9 @@ auto PcmBuffer::totalReceived() -> uint32_t {
   return received_;
 }
 
-IRAM_ATTR auto PcmBuffer::readSingle(std::span<int16_t> dest, bool isr)
+IRAM_ATTR auto PcmBuffer::readSingle(std::span<int16_t> dest,
+                                     bool mix,
+                                     bool isr)
     -> std::pair<size_t, BaseType_t> {
   BaseType_t ret;
   size_t read_bytes = 0;
@@ -111,7 +126,18 @@ IRAM_ATTR auto PcmBuffer::readSingle(std::span<int16_t> dest, bool isr)
     return {read_samples, ret};
   }
 
-  std::memcpy(dest.data(), data, read_bytes);
+  if (mix) {
+    for (size_t i = 0; i < read_samples; i++) {
+      // Sum the two samples in a 32 bit field so that the addition is always
+      // safe.
+      int32_t sum = static_cast<int32_t>(dest[i]) +
+                    static_cast<int32_t>(reinterpret_cast<int16_t*>(data)[i]);
+      // Clip back into the range of a single sample.
+      dest[i] = std::clamp<int32_t>(sum, INT16_MIN, INT16_MAX);
+    }
+  } else {
+    std::memcpy(dest.data(), data, read_bytes);
+  }
 
   if (isr) {
     vRingbufferReturnItem(ringbuf_, data);
diff --git a/src/tangara/audio/audio_events.hpp b/src/tangara/audio/audio_events.hpp
index 91bcf48b..56d150b2 100644
--- a/src/tangara/audio/audio_events.hpp
+++ b/src/tangara/audio/audio_events.hpp
@@ -144,8 +144,11 @@ struct OutputModeChanged : tinyfsm::Event {
   std::optional<drivers::NvsStorage::Output> set_to;
 };
 
-namespace internal {
+struct TtsPlaybackChanged : tinyfsm::Event {
+  bool is_playing;
+};
 
+namespace internal {
 struct DecodingStarted : tinyfsm::Event {
   std::shared_ptr<TrackInfo> track;
 };
diff --git a/src/tangara/audio/audio_fsm.cpp b/src/tangara/audio/audio_fsm.cpp
index 5a91c6f9..1daf568e 100644
--- a/src/tangara/audio/audio_fsm.cpp
+++ b/src/tangara/audio/audio_fsm.cpp
@@ -44,6 +44,7 @@
 #include "sample.hpp"
 #include "system_fsm/service_locator.hpp"
 #include "system_fsm/system_events.hpp"
+#include "tts/player.hpp"
 
 namespace audio {
 
@@ -60,15 +61,22 @@ std::shared_ptr<IAudioOutput> AudioState::sOutput;
 std::shared_ptr<I2SAudioOutput> AudioState::sI2SOutput;
 std::shared_ptr<BluetoothAudioOutput> AudioState::sBtOutput;
 
-// Two seconds of samples for two channels, at a representative sample rate.
-constexpr size_t kDrainLatencySamples = 48000 * 2 * 2;
+// For tracks, keep about two seconds' worth of samples at 2ch 48kHz. This
+// is more headroom than we need for smooth playback, but it doesn't hurt to
+// keep some PSRAM in our pockets for a rainy day.
+constexpr size_t kTrackDrainLatencySamples = 48000 * 2 * 2;
 
-std::unique_ptr<drivers::PcmBuffer> AudioState::sDrainBuffer;
+// For system sounds, we intentionally choose codecs that are very fast to
+// decode. This lets us get away with a much smaller drain buffer.
+constexpr size_t kSystemDrainLatencySamples = 48000;
+
+std::unique_ptr<drivers::OutputBuffers> AudioState::sDrainBuffers;
 std::optional<IAudioOutput::Format> AudioState::sDrainFormat;
 
 StreamCues AudioState::sStreamCues;
 
 bool AudioState::sIsPaused = true;
+bool AudioState::sIsTtsPlaying = false;
 
 auto AudioState::emitPlaybackUpdate(bool paused) -> void {
   std::optional<uint32_t> position;
@@ -184,6 +192,11 @@ void AudioState::react(const TogglePlayPause& ev) {
   }
 }
 
+void AudioState::react(const TtsPlaybackChanged& ev) {
+  sIsTtsPlaying = ev.is_playing;
+  updateOutputMode();
+}
+
 void AudioState::react(const internal::DecodingFinished& ev) {
   // If we just finished playing whatever's at the front of the queue, then we
   // need to advanve and start playing the next one ASAP in order to continue
@@ -219,7 +232,7 @@ void AudioState::react(const internal::StreamStarted& ev) {
   }
 
   sStreamCues.addCue(ev.track, ev.cue_at_sample);
-  sStreamCues.update(sDrainBuffer->totalReceived());
+  sStreamCues.update(sDrainBuffers->first.totalReceived());
 
   if (!sIsPaused && !is_in_state<states::Playback>()) {
     transit<states::Playback>();
@@ -362,8 +375,8 @@ void AudioState::react(const OutputModeChanged& ev) {
       sOutput = sI2SOutput;
       break;
   }
-  sOutput->mode(IAudioOutput::Modes::kOnPaused);
   sSampleProcessor->SetOutput(sOutput);
+  updateOutputMode();
 
   // Bluetooth volume isn't 'changed' until we've connected to a device.
   if (new_mode == drivers::NvsStorage::Output::kHeadphones) {
@@ -374,6 +387,14 @@ void AudioState::react(const OutputModeChanged& ev) {
   }
 }
 
+auto AudioState::updateOutputMode() -> void {
+  if (is_in_state<states::Playback>() || sIsTtsPlaying) {
+    sOutput->mode(IAudioOutput::Modes::kOnPlaying);
+  } else {
+    sOutput->mode(IAudioOutput::Modes::kOnPaused);
+  }
+}
+
 auto AudioState::commitVolume() -> void {
   auto mode = sServices->nvs().OutputMode();
   auto vol = sOutput->GetVolume();
@@ -393,13 +414,20 @@ namespace states {
 void Uninitialised::react(const system_fsm::BootComplete& ev) {
   sServices = ev.services;
 
-  sDrainBuffer = std::make_unique<drivers::PcmBuffer>(kDrainLatencySamples);
+  sDrainBuffers = std::make_unique<drivers::OutputBuffers>(
+      kTrackDrainLatencySamples, kSystemDrainLatencySamples);
+  sDrainBuffers->first.suspend(true);
 
   sStreamFactory.reset(
       new FatfsStreamFactory(sServices->database(), sServices->tag_parser()));
-  sI2SOutput.reset(new I2SAudioOutput(sServices->gpios(), *sDrainBuffer));
+  sI2SOutput.reset(new I2SAudioOutput(sServices->gpios(), *sDrainBuffers));
   sBtOutput.reset(new BluetoothAudioOutput(
-      sServices->bluetooth(), *sDrainBuffer, sServices->bg_worker()));
+      sServices->bluetooth(), *sDrainBuffers, sServices->bg_worker()));
+
+  auto& tts_provider = sServices->tts();
+  auto tts_player = std::make_unique<tts::Player>(
+      sServices->bg_worker(), sDrainBuffers->second, *sStreamFactory);
+  tts_provider.player(std::move(tts_player));
 
   auto& nvs = sServices->nvs();
   sI2SOutput->SetMaxVolume(nvs.AmpMaxVolume());
@@ -430,7 +458,7 @@ void Uninitialised::react(const system_fsm::BootComplete& ev) {
       .left_bias = nvs.AmpLeftBias(),
   });
 
-  sSampleProcessor.reset(new SampleProcessor(*sDrainBuffer));
+  sSampleProcessor.reset(new SampleProcessor(sDrainBuffers->first));
   sSampleProcessor->SetOutput(sOutput);
 
   sDecoder.reset(Decoder::Start(sSampleProcessor));
@@ -441,6 +469,10 @@ void Uninitialised::react(const system_fsm::BootComplete& ev) {
 static const char kQueueKey[] = "audio:queue";
 static const char kCurrentFileKey[] = "audio:current";
 
+auto Standby::entry() -> void {
+  updateOutputMode();
+}
+
 void Standby::react(const system_fsm::KeyLockChanged& ev) {
   if (!ev.locking) {
     return;
@@ -526,7 +558,8 @@ static void heartbeat(TimerHandle_t) {
 
 void Playback::entry() {
   ESP_LOGI(kTag, "audio output resumed");
-  sOutput->mode(IAudioOutput::Modes::kOnPlaying);
+  sDrainBuffers->first.suspend(false);
+  updateOutputMode();
   emitPlaybackUpdate(false);
 
   if (!sHeartbeatTimer) {
@@ -539,7 +572,7 @@ void Playback::entry() {
 void Playback::exit() {
   ESP_LOGI(kTag, "audio output paused");
   xTimerStop(sHeartbeatTimer, portMAX_DELAY);
-  sOutput->mode(IAudioOutput::Modes::kOnPaused);
+  sDrainBuffers->first.suspend(true);
   emitPlaybackUpdate(true);
 }
 
@@ -550,7 +583,7 @@ void Playback::react(const system_fsm::SdStateChanged& ev) {
 }
 
 void Playback::react(const internal::StreamHeartbeat& ev) {
-  sStreamCues.update(sDrainBuffer->totalReceived());
+  sStreamCues.update(sDrainBuffers->first.totalReceived());
 
   if (sStreamCues.hasStream()) {
     emitPlaybackUpdate(false);
diff --git a/src/tangara/audio/audio_fsm.hpp b/src/tangara/audio/audio_fsm.hpp
index 0644375f..bc3feb55 100644
--- a/src/tangara/audio/audio_fsm.hpp
+++ b/src/tangara/audio/audio_fsm.hpp
@@ -48,6 +48,7 @@ class AudioState : public tinyfsm::Fsm<AudioState> {
   void react(const PlaySineWave&);
   void react(const SetTrack&);
   void react(const TogglePlayPause&);
+  void react(const TtsPlaybackChanged&);
 
   void react(const internal::DecodingFinished&);
   void react(const internal::StreamStarted&);
@@ -70,6 +71,7 @@ class AudioState : public tinyfsm::Fsm<AudioState> {
   virtual void react(const system_fsm::HasPhonesChanged&);
 
  protected:
+  auto updateOutputMode() -> void;
   auto emitPlaybackUpdate(bool paused) -> void;
   auto commitVolume() -> void;
 
@@ -82,12 +84,13 @@ class AudioState : public tinyfsm::Fsm<AudioState> {
   static std::shared_ptr<BluetoothAudioOutput> sBtOutput;
   static std::shared_ptr<IAudioOutput> sOutput;
 
-  static std::unique_ptr<drivers::PcmBuffer> sDrainBuffer;
+  static std::unique_ptr<drivers::OutputBuffers> sDrainBuffers;
 
   static StreamCues sStreamCues;
   static std::optional<IAudioOutput::Format> sDrainFormat;
 
   static bool sIsPaused;
+  static bool sIsTtsPlaying;
 };
 
 namespace states {
@@ -102,6 +105,7 @@ class Uninitialised : public AudioState {
 
 class Standby : public AudioState {
  public:
+  void entry() override;
   void react(const system_fsm::KeyLockChanged&) override;
   void react(const system_fsm::SdStateChanged&) override;
 
diff --git a/src/tangara/audio/bt_audio_output.cpp b/src/tangara/audio/bt_audio_output.cpp
index 336fc758..c6c64fd1 100644
--- a/src/tangara/audio/bt_audio_output.cpp
+++ b/src/tangara/audio/bt_audio_output.cpp
@@ -36,11 +36,11 @@ static constexpr uint16_t kVolumeRange = 60;
 using ConnectionState = drivers::Bluetooth::ConnectionState;
 
 BluetoothAudioOutput::BluetoothAudioOutput(drivers::Bluetooth& bt,
-                                           drivers::PcmBuffer& buffer,
+                                           drivers::OutputBuffers& bufs,
                                            tasks::WorkerPool& p)
     : IAudioOutput(),
       bluetooth_(bt),
-      buffer_(buffer),
+      buffers_(bufs),
       bg_worker_(p),
       volume_() {}
 
@@ -48,9 +48,9 @@ BluetoothAudioOutput::~BluetoothAudioOutput() {}
 
 auto BluetoothAudioOutput::changeMode(Modes mode) -> void {
   if (mode == Modes::kOnPlaying) {
-    bluetooth_.source(&buffer_);
+    bluetooth_.sources(&buffers_);
   } else {
-    bluetooth_.source(nullptr);
+    bluetooth_.sources(nullptr);
   }
 }
 
diff --git a/src/tangara/audio/bt_audio_output.hpp b/src/tangara/audio/bt_audio_output.hpp
index f22f330a..53d2c1a4 100644
--- a/src/tangara/audio/bt_audio_output.hpp
+++ b/src/tangara/audio/bt_audio_output.hpp
@@ -25,7 +25,7 @@ namespace audio {
 class BluetoothAudioOutput : public IAudioOutput {
  public:
   BluetoothAudioOutput(drivers::Bluetooth& bt,
-                       drivers::PcmBuffer& buf,
+                       drivers::OutputBuffers& bufs,
                        tasks::WorkerPool&);
   ~BluetoothAudioOutput();
 
@@ -54,7 +54,7 @@ class BluetoothAudioOutput : public IAudioOutput {
 
  private:
   drivers::Bluetooth& bluetooth_;
-  drivers::PcmBuffer& buffer_;
+  drivers::OutputBuffers& buffers_;
   tasks::WorkerPool& bg_worker_;
 
   uint16_t volume_;
diff --git a/src/tangara/audio/fatfs_stream_factory.cpp b/src/tangara/audio/fatfs_stream_factory.cpp
index 735ec134..94f22ae9 100644
--- a/src/tangara/audio/fatfs_stream_factory.cpp
+++ b/src/tangara/audio/fatfs_stream_factory.cpp
@@ -50,7 +50,6 @@ auto FatfsStreamFactory::create(std::string path, uint32_t offset)
     -> std::shared_ptr<TaggedStream> {
   auto tags = tag_parser_.ReadAndParseTags(path);
   if (!tags) {
-    ESP_LOGE(kTag, "failed to read tags");
     return {};
   }
 
diff --git a/src/tangara/audio/i2s_audio_output.cpp b/src/tangara/audio/i2s_audio_output.cpp
index 8222b8c9..55c8bdb8 100644
--- a/src/tangara/audio/i2s_audio_output.cpp
+++ b/src/tangara/audio/i2s_audio_output.cpp
@@ -42,10 +42,10 @@ static constexpr uint16_t kLineLevelVolume = 0x13d;
 static constexpr uint16_t kDefaultVolume = 0x100;
 
 I2SAudioOutput::I2SAudioOutput(drivers::IGpios& expander,
-                               drivers::PcmBuffer& buffer)
+                               drivers::OutputBuffers& buffers)
     : IAudioOutput(),
       expander_(expander),
-      buffer_(buffer),
+      buffers_(buffers),
       dac_(),
       current_mode_(Modes::kOff),
       current_config_(),
@@ -72,7 +72,7 @@ auto I2SAudioOutput::changeMode(Modes mode) -> void {
   if (was_off) {
     // Ensure an I2SDac instance actually exists.
     if (!dac_) {
-      auto instance = drivers::I2SDac::create(expander_, buffer_);
+      auto instance = drivers::I2SDac::create(expander_, buffers_);
       if (!instance) {
         return;
       }
diff --git a/src/tangara/audio/i2s_audio_output.hpp b/src/tangara/audio/i2s_audio_output.hpp
index 35d888b9..2b768ddd 100644
--- a/src/tangara/audio/i2s_audio_output.hpp
+++ b/src/tangara/audio/i2s_audio_output.hpp
@@ -21,7 +21,7 @@ namespace audio {
 
 class I2SAudioOutput : public IAudioOutput {
  public:
-  I2SAudioOutput(drivers::IGpios&, drivers::PcmBuffer&);
+  I2SAudioOutput(drivers::IGpios&, drivers::OutputBuffers&);
 
   auto SetMaxVolume(uint16_t) -> void;
   auto SetVolumeDb(uint16_t) -> void;
@@ -51,7 +51,7 @@ class I2SAudioOutput : public IAudioOutput {
 
  private:
   drivers::IGpios& expander_;
-  drivers::PcmBuffer& buffer_;
+  drivers::OutputBuffers& buffers_;
 
   std::unique_ptr<drivers::I2SDac> dac_;
 
diff --git a/src/tangara/audio/processor.cpp b/src/tangara/audio/processor.cpp
index aa2604b5..2fa7f78e 100644
--- a/src/tangara/audio/processor.cpp
+++ b/src/tangara/audio/processor.cpp
@@ -347,34 +347,39 @@ auto SampleProcessor::discardCommand(Args& command) -> void {
   // End of stream commands can just be dropped without further action.
 }
 
-SampleProcessor::Buffer::Buffer()
-    : buffer_(reinterpret_cast<sample::Sample*>(
-                  heap_caps_calloc(kSampleBufferLength,
-                                   sizeof(sample::Sample),
-                                   MALLOC_CAP_DMA)),
-              kSampleBufferLength),
+Buffer::Buffer(std::span<sample::Sample> storage)
+    : storage_(nullptr), buffer_(storage), samples_in_buffer_() {}
+
+Buffer::Buffer()
+    : storage_(reinterpret_cast<sample::Sample*>(
+          heap_caps_calloc(kSampleBufferLength,
+                           sizeof(sample::Sample),
+                           MALLOC_CAP_DMA))),
+      buffer_(storage_, kSampleBufferLength),
       samples_in_buffer_() {}
 
-SampleProcessor::Buffer::~Buffer() {
-  heap_caps_free(buffer_.data());
+Buffer::~Buffer() {
+  if (storage_) {
+    heap_caps_free(storage_);
+  }
 }
 
-auto SampleProcessor::Buffer::writeAcquire() -> std::span<sample::Sample> {
+auto Buffer::writeAcquire() -> std::span<sample::Sample> {
   return buffer_.subspan(samples_in_buffer_.size());
 }
 
-auto SampleProcessor::Buffer::writeCommit(size_t samples) -> void {
+auto Buffer::writeCommit(size_t samples) -> void {
   if (samples == 0) {
     return;
   }
   samples_in_buffer_ = buffer_.first(samples + samples_in_buffer_.size());
 }
 
-auto SampleProcessor::Buffer::readAcquire() -> std::span<sample::Sample> {
+auto Buffer::readAcquire() -> std::span<sample::Sample> {
   return samples_in_buffer_;
 }
 
-auto SampleProcessor::Buffer::readCommit(size_t samples) -> void {
+auto Buffer::readCommit(size_t samples) -> void {
   if (samples == 0) {
     return;
   }
@@ -389,11 +394,11 @@ auto SampleProcessor::Buffer::readCommit(size_t samples) -> void {
   }
 }
 
-auto SampleProcessor::Buffer::isEmpty() -> bool {
+auto Buffer::isEmpty() -> bool {
   return samples_in_buffer_.empty();
 }
 
-auto SampleProcessor::Buffer::clear() -> void {
+auto Buffer::clear() -> void {
   samples_in_buffer_ = {};
 }
 
diff --git a/src/tangara/audio/processor.hpp b/src/tangara/audio/processor.hpp
index 45e05291..52bace95 100644
--- a/src/tangara/audio/processor.hpp
+++ b/src/tangara/audio/processor.hpp
@@ -22,6 +22,35 @@
 
 namespace audio {
 
+/* Utility for managing buffering samples between digital filters. */
+class Buffer {
+ public:
+  Buffer(std::span<sample::Sample> storage);
+  Buffer();
+  ~Buffer();
+
+  /* Returns a span of the unused space within the buffer. */
+  auto writeAcquire() -> std::span<sample::Sample>;
+  /* Signals how many samples were just added to the writeAcquire span. */
+  auto writeCommit(size_t) -> void;
+
+  /* Returns a span of the samples stored within the buffer. */
+  auto readAcquire() -> std::span<sample::Sample>;
+  /* Signals how many samples from the readAcquire span were consumed. */
+  auto readCommit(size_t) -> void;
+
+  auto isEmpty() -> bool;
+  auto clear() -> void;
+
+  Buffer(const Buffer&) = delete;
+  Buffer& operator=(const Buffer&) = delete;
+
+ private:
+  sample::Sample* storage_;
+  std::span<sample::Sample> buffer_;
+  std::span<sample::Sample> samples_in_buffer_;
+};
+
 /*
  * Handle to a persistent task that converts samples between formats (sample
  * rate, channels, bits per sample), in order to put samples in the preferred
@@ -87,33 +116,6 @@ class SampleProcessor {
   StreamBufferHandle_t source_;
   drivers::PcmBuffer& sink_;
 
-  /* Internal utility for managing buffering samples between our filters. */
-  class Buffer {
-   public:
-    Buffer();
-    ~Buffer();
-
-    /* Returns a span of the unused space within the buffer. */
-    auto writeAcquire() -> std::span<sample::Sample>;
-    /* Signals how many samples were just added to the writeAcquire span. */
-    auto writeCommit(size_t) -> void;
-
-    /* Returns a span of the samples stored within the buffer. */
-    auto readAcquire() -> std::span<sample::Sample>;
-    /* Signals how many samples from the readAcquire span were consumed. */
-    auto readCommit(size_t) -> void;
-
-    auto isEmpty() -> bool;
-    auto clear() -> void;
-
-    Buffer(const Buffer&) = delete;
-    Buffer& operator=(const Buffer&) = delete;
-
-   private:
-    std::span<sample::Sample> buffer_;
-    std::span<sample::Sample> samples_in_buffer_;
-  };
-
   Buffer input_buffer_;
   Buffer resampled_buffer_;
   Buffer output_buffer_;
diff --git a/src/tangara/tts/player.cpp b/src/tangara/tts/player.cpp
new file mode 100644
index 00000000..46e8c48a
--- /dev/null
+++ b/src/tangara/tts/player.cpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright 2024 jacqueline <me@jacqueline.id.au>
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#include "tts/player.hpp"
+#include <mutex>
+
+#include "audio/audio_events.hpp"
+#include "audio/processor.hpp"
+#include "audio/resample.hpp"
+#include "codec.hpp"
+#include "esp_log.h"
+#include "events/event_queue.hpp"
+#include "freertos/projdefs.h"
+#include "portmacro.h"
+#include "sample.hpp"
+#include "types.hpp"
+
+namespace tts {
+
+[[maybe_unused]] static constexpr char kTag[] = "ttsplay";
+
+/* Builds an idle player: nothing is playing and nothing is cancelled. */
+Player::Player(tasks::WorkerPool& worker, drivers::PcmBuffer& output,
+               audio::FatfsStreamFactory& factory)
+    : bg_{worker},
+      stream_factory_{factory},
+      output_{output},
+      stream_playing_{false},
+      stream_cancelled_{false} {}
+
+/* Plays `file` on the worker pool, cancelling any stream still playing. */
+auto Player::playFile(const std::string& text, const std::string& file)
+    -> void {
+  bg_.Dispatch<void>([this, text, file]() {
+    {
+      std::lock_guard<std::mutex> guard{new_stream_mutex_};
+      if (stream_playing_.load()) {
+        stream_cancelled_.store(true);
+        stream_playing_.wait(true);
+      }
+      stream_cancelled_.store(false);
+      stream_playing_.store(true);
+    }
+    openAndDecode(text, file);
+    // Skip the 'stopped' event if a newer stream cancelled this one.
+    if (!stream_cancelled_.load()) {
+      events::Audio().Dispatch(audio::TtsPlaybackChanged{.is_playing = false});
+    }
+    stream_playing_.store(false);
+    stream_playing_.notify_all();
+  });
+}
+
+/* Opens `path` and, if it is a decodable WAV stream, decodes it to the sink.
+ * `text` is only used to make the missing-file warning more useful. */
+auto Player::openAndDecode(const std::string& text, const std::string& path)
+    -> void {
+  auto source = stream_factory_.create(path);
+  if (!source) {
+    ESP_LOGW(kTag, "missing '%s' for '%s'", path.c_str(), text.c_str());
+    return;
+  }
+  // FIXME: Rather than hardcoding WAV support only, we should work out a
+  // proper subset of 'low memory' decoders that can all be used for TTS
+  // playback.
+  if (source->type() != codecs::StreamType::kWav) {
+    ESP_LOGE(kTag, "'%s' has unsupported encoding", path.c_str());
+    return;
+  }
+
+  auto new_codec = codecs::CreateCodecForType(source->type());
+  if (!new_codec) {
+    ESP_LOGE(kTag, "creating decoder failed");
+    return;
+  }
+  // Own the codec from here on, so that every exit path below frees it.
+  std::unique_ptr<codecs::ICodec> codec(*new_codec);
+  auto format = codec->OpenStream(source, 0);
+  if (format.has_error()) {
+    ESP_LOGE(kTag, "opening stream failed");
+    return;
+  }
+  decodeToSink(*format, std::move(codec));
+}
+
+/* Decodes `codec` until it finishes, fails or is cancelled, resampling to
+ * 48kHz (if needed) and duplicating mono samples into stereo on the way to
+ * the output buffer. `format` describes the samples the codec produces. */
+auto Player::decodeToSink(const codecs::ICodec::OutputFormat& format,
+                          std::unique_ptr<codecs::ICodec> codec) -> void {
+  // Set up buffers to hold samples between the intermediary parts of
+  // processing. We can just use the stack for these, since this method is
+  // called only from background workers, which have enormous stacks.
+  sample::Sample decode_storage[4096];
+  audio::Buffer decode_buf(decode_storage);
+  sample::Sample resample_storage[4096];
+  audio::Buffer resample_buf(resample_storage);
+  sample::Sample stereo_storage[4096];
+  audio::Buffer stereo_buf(stereo_storage);
+
+  // Work out what processing the codec's output needs.
+  std::unique_ptr<audio::Resampler> resampler;
+  if (format.sample_rate_hz != 48000) {
+    resampler = std::make_unique<audio::Resampler>(format.sample_rate_hz, 48000,
+                                                   format.num_channels);
+  }
+  bool double_samples = format.num_channels == 1;
+
+  // Announce that TTS playback is starting.
+  events::Audio().Dispatch(audio::TtsPlaybackChanged{.is_playing = true});
+
+  // FIXME: This decode-and-process loop is substantially the same as the audio
+  // processor's filter loop. Ideally we should refactor both of these loops to
+  // reuse code, however I'm holding off on doing this until we've implemented
+  // more advanced audio processing features in the audio processor (EQ, tempo
+  // shifting, etc.) as it's not clear to me yet how much the two codepaths will
+  // be diverging later anyway.
+  while ((codec || !decode_buf.isEmpty() || !resample_buf.isEmpty() ||
+          !stereo_buf.isEmpty()) &&
+         !stream_cancelled_) {
+    if (codec) {
+      auto decode_res = codec->DecodeTo(decode_buf.writeAcquire());
+      if (decode_res.has_error()) {
+        ESP_LOGE(kTag, "decoding error");
+        break;
+      }
+      decode_buf.writeCommit(decode_res->samples_written);
+      if (decode_res->is_stream_finished) {
+        codec.reset();
+      }
+    }
+
+    if (!decode_buf.isEmpty()) {
+      auto resample_input = decode_buf.readAcquire();
+      auto resample_output = resample_buf.writeAcquire();
+      size_t read = 0, wrote = 0;
+      if (resampler) {
+        std::tie(read, wrote) =
+            resampler->Process(resample_input, resample_output, false);
+      } else {
+        read = wrote = std::min(resample_input.size(), resample_output.size());
+        std::copy_n(resample_input.begin(), read, resample_output.begin());
+      }
+      decode_buf.readCommit(read);
+      resample_buf.writeCommit(wrote);
+    }
+
+    if (!resample_buf.isEmpty()) {
+      auto channels_input = resample_buf.readAcquire();
+      auto channels_output = stereo_buf.writeAcquire();
+      size_t read = 0, wrote = 0;
+      if (double_samples) {
+        // Each mono sample becomes one stereo frame. Derive both counts from
+        // the whole frames that fit, so an odd-sized output span can never
+        // make us commit a sample that was not written.
+        read = std::min(channels_input.size(), channels_output.size() / 2);
+        wrote = read * 2;
+        for (size_t i = 0; i < read; i++) {
+          channels_output[i * 2] = channels_input[i];
+          channels_output[(i * 2) + 1] = channels_input[i];
+        }
+      } else {
+        read = wrote = std::min(channels_input.size(), channels_output.size());
+        std::copy_n(channels_input.begin(), read, channels_output.begin());
+      }
+      resample_buf.readCommit(read);
+      stereo_buf.writeCommit(wrote);
+    }
+
+    // The mixin PcmBuffer should almost always be draining, so we can force
+    // samples into it more aggressively than with the main music PcmBuffer.
+    // Still stop on cancellation, so a sink that isn't draining can't trap us.
+    while (!stereo_buf.isEmpty() && !stream_cancelled_) {
+      size_t sent = output_.send(stereo_buf.readAcquire());
+      stereo_buf.readCommit(sent);
+    }
+  }
+
+  // Wait for queued samples to play out; a cancelled stream drops them instead.
+  while (!output_.isEmpty()) {
+    if (stream_cancelled_) {
+      output_.clear();
+    } else {
+      vTaskDelay(pdMS_TO_TICKS(100));
+    }
+  }
+}
+
+}  // namespace tts
diff --git a/src/tangara/tts/player.hpp b/src/tangara/tts/player.hpp
new file mode 100644
index 00000000..d28da474
--- /dev/null
+++ b/src/tangara/tts/player.hpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2024 jacqueline <me@jacqueline.id.au>
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#pragma once
+
+#include <string>
+
+#include "audio/fatfs_stream_factory.hpp"
+#include "codec.hpp"
+#include "drivers/pcm_buffer.hpp"
+#include "tasks.hpp"
+
+namespace tts {
+
+/*
+ * A TTS Player is the output stage of the TTS pipeline. It receives a stream
+ * of filenames that should be played, and handles decoding these files and
+ * sending them to the output buffer.
+ */
+class Player {
+ public:
+  Player(tasks::WorkerPool& worker, drivers::PcmBuffer& output,
+         audio::FatfsStreamFactory& factory);
+  // Not copyable or movable.
+  Player(const Player&) = delete;
+  Player& operator=(const Player&) = delete;
+
+  /* Asynchronously plays `path`, cancelling any file still being played. */
+  auto playFile(const std::string& text, const std::string& path) -> void;
+
+ private:
+  auto openAndDecode(const std::string& text, const std::string& path) -> void;
+  auto decodeToSink(const codecs::ICodec::OutputFormat& format,
+                    std::unique_ptr<codecs::ICodec> codec) -> void;
+
+  tasks::WorkerPool& bg_;
+  audio::FatfsStreamFactory& stream_factory_;
+  drivers::PcmBuffer& output_;
+  std::mutex new_stream_mutex_;
+  std::atomic<bool> stream_playing_;
+  std::atomic<bool> stream_cancelled_;
+};
+
+}  // namespace tts
diff --git a/src/tangara/tts/provider.cpp b/src/tangara/tts/provider.cpp
index 7d33bae6..d19500e0 100644
--- a/src/tangara/tts/provider.cpp
+++ b/src/tangara/tts/provider.cpp
@@ -5,21 +5,39 @@
  */
 
 #include "tts/provider.hpp"
+#include <stdint.h>
 
+#include <ios>
 #include <optional>
+#include <sstream>
 #include <string>
 #include <variant>
 
+#include "drivers/storage.hpp"
 #include "esp_log.h"
 
+#include "komihash.h"
 #include "tts/events.hpp"
 
 namespace tts {
 
 [[maybe_unused]] static constexpr char kTag[] = "tts";
 
+static const char* kTtsPath = "/.tangara-tts/";
+
+/* Maps `text` to the path under kTtsPath named by the text's hex komihash. */
+static auto textToFile(const std::string& text) -> std::optional<std::string> {
+  std::stringstream path;
+  path << kTtsPath << std::hex << komihash(text.data(), text.size(), 0);
+  return path.str();
+}
+
 Provider::Provider() {}
 
+auto Provider::player(std::unique_ptr<Player> p) -> void {
+  player_ = std::move(p);  // Takes ownership; any previous player is freed.
+}
+
 auto Provider::feed(const Event& e) -> void {
   if (std::holds_alternative<SimpleEvent>(e)) {
     // ESP_LOGI(kTag, "context changed");
@@ -31,6 +49,19 @@ auto Provider::feed(const Event& e) -> void {
       // ESP_LOGI(kTag, "new selection: '%s', interactive? %i",
       // ev.new_selection->description.value_or("").c_str(),
       // ev.new_selection->is_interactive);
+      auto text = ev.new_selection->description;
+      if (!text) {
+        ESP_LOGW(kTag, "missing description for element");
+        return;
+      }
+      auto file = textToFile(*text);
+      if (!file) {
+        return;
+      }
+
+      if (player_) {
+        player_->playFile(*text, *file);
+      }
     }
   }
 }
diff --git a/src/tangara/tts/provider.hpp b/src/tangara/tts/provider.hpp
index 59f61a6c..8fe143cc 100644
--- a/src/tangara/tts/provider.hpp
+++ b/src/tangara/tts/provider.hpp
@@ -6,18 +6,35 @@
 
 #pragma once
 
+#include <memory>
 #include <optional>
 #include <string>
 #include <variant>
 
 #include "tts/events.hpp"
+#include "tts/player.hpp"
 
 namespace tts {
 
+/*
+ * A TTS Provider is responsible for receiving system events that may be
+ * relevant to TTS, and digesting them into discrete 'utterances' that can be
+ * used to generate audio feedback.
+ */
 class Provider {
  public:
   Provider();
+  /* Sets the Player used for playback, replacing any previous one. */
+  auto player(std::unique_ptr<Player>) -> void;
+
   auto feed(const Event&) -> void;
+
+  // Not copyable or movable.
+  Provider(const Provider&) = delete;
+  Provider& operator=(const Provider&) = delete;
+
+ private:
+  std::unique_ptr<Player> player_;
 };
 
 }  // namespace tts