1 files changed, 260 insertions, 0 deletions
diff --git a/src/tangara/database/records.cpp b/src/tangara/database/records.cpp
new file mode 100644
index 00000000..b086be3b
--- /dev/null
+++ b/src/tangara/database/records.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright 2023 jacqueline <me@jacqueline.id.au>
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#include "records.hpp"
+
+#include <stdint.h>
+#include <sys/_stdint.h>
+
+#include <functional>
+#include <iomanip>
+#include <iostream>
+#include <memory_resource>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "cppbor.h"
+#include "cppbor_parse.h"
+#include "esp_log.h"
+
+#include "index.hpp"
+#include "komihash.h"
+#include "memory_resource.hpp"
+#include "track.hpp"
+
+// As LevelDB is a key-value store, each record in the database consists of a
+// key and an optional value.
+//
+// Values, when present, are always cbor-encoded. This is fast, compact, and
+// very easy to evolve over time due to its inclusion of type information.
+//
+// Keys have a more complicated scheme, as for performance we rely heavily on
+// LevelDB's sorted storage format. We must therefore worry about clustering of
+// similar records, and the sortability of our encoding format.
+//    Each kind of key consists of a single-byte prefix, then one or more
+// fields separated by null (0) bytes. Each field may be cbor-encoded, or may
+// use some bespoke encoding; it depends on whether we want to be able to sort
+// by that field.
+//    For debugging and discussion purposes, we represent field separators
+// textually as '/', and write each field as its hex encoding. e.g. a data key
+// for the track with id 17 would be written as 'D / 0x11'.
+
+namespace database {
+
+[[maybe_unused]] static const char* kTag = "RECORDS";
+
+static const char kPathPrefix = 'P';       // First byte of path keys.
+static const char kDataPrefix = 'D';       // First byte of track data keys.
+static const char kHashPrefix = 'H';       // First byte of hash keys.
+static const char kTagHashPrefix = 'T';    // First byte of tag hash keys.
+static const char kIndexPrefix = 'I';      // First byte of index keys.
+static const char kFieldSeparator = '\0';  // Written between a key's fields.
+
+// Builds the start of a key: the prefix byte followed by a field separator.
+static constexpr auto makePrefix(char p) -> std::string {
+  std::string prefix(1, p);
+  prefix.push_back(kFieldSeparator);
+  return prefix;
+}
+
+/* 'P/ <path>' -- the path's bytes are appended verbatim after the prefix. */
+auto EncodePathKey(std::string_view path) -> std::string {
+  std::string key = makePrefix(kPathPrefix);
+  key.append(path);
+  return key;
+}
+
+/* 'D/' -- the bytes with which every key made by EncodeDataKey begins. */
+auto EncodeDataPrefix() -> std::string {
+  return makePrefix(kDataPrefix);
+}
+
+/* 'D/ 0xACAB' -- the data prefix followed by TrackIdToBytes(id). */
+auto EncodeDataKey(const TrackId& id) -> std::string {
+  return EncodeDataPrefix() + TrackIdToBytes(id);
+}
+
+// Encodes a track's data as a seven-element cbor array (see ParseDataValue).
+auto EncodeDataValue(const TrackData& track) -> std::string {
+  // Raw pointer on purpose: the array built below takes ownership and frees it.
+  auto* hashes = new cppbor::Map{};
+  for (const auto& [tag, hash] : track.individual_tag_hashes) {
+    hashes->add(cppbor::Uint{static_cast<uint32_t>(tag)}, cppbor::Uint{hash});
+  }
+  return cppbor::Array{
+      cppbor::Uint{track.id},
+      cppbor::Tstr{track.filepath},
+      cppbor::Uint{track.tags_hash},
+      cppbor::Bool{track.is_tombstoned},
+      cppbor::Uint{track.modified_at.first},
+      cppbor::Uint{track.modified_at.second},
+      hashes,
+  }.toString();
+}
+
+// Decodes a value written by EncodeDataValue. Returns nullptr when the bytes
+// are not a cbor array with the expected length and element types.
+auto ParseDataValue(const leveldb::Slice& slice) -> std::shared_ptr<TrackData> {
+  auto [item, unused, err] = cppbor::parseWithViews(
+      reinterpret_cast<const uint8_t*>(slice.data()), slice.size());
+  auto* vals = item ? item->asArray() : nullptr;
+  if (!vals || vals->size() != 7 || vals->get(0)->type() != cppbor::UINT ||
+      vals->get(1)->type() != cppbor::TSTR ||
+      vals->get(2)->type() != cppbor::UINT ||
+      vals->get(3)->type() != cppbor::SIMPLE || !vals->get(3)->asBool() ||
+      vals->get(4)->type() != cppbor::UINT ||
+      vals->get(5)->type() != cppbor::UINT ||
+      vals->get(6)->type() != cppbor::MAP) {
+    return nullptr;
+  }
+  auto res = std::make_shared<TrackData>();
+  res->id = vals->get(0)->asUint()->unsignedValue();
+  res->filepath = vals->get(1)->asViewTstr()->view();
+  res->tags_hash = vals->get(2)->asUint()->unsignedValue();
+  res->is_tombstoned = vals->get(3)->asBool()->value();
+  res->modified_at = std::make_pair<uint16_t, uint16_t>(
+      vals->get(4)->asUint()->unsignedValue(),
+      vals->get(5)->asUint()->unsignedValue());
+  for (const auto& [key, val] : *vals->get(6)->asMap()) {
+    if (key->type() != cppbor::UINT || val->type() != cppbor::UINT) {
+      return nullptr;  // Malformed entry; asUint() would be null below.
+    }
+    auto tag = static_cast<Tag>(key->asUint()->unsignedValue());
+    res->individual_tag_hashes[tag] = val->asUint()->unsignedValue();
+  }
+  return res;
+}
+
+/* 'H/ 0xBEEF' -- the hash prefix followed by `hash` as a cbor uint. */
+auto EncodeHashKey(const uint64_t& hash) -> std::string {
+  return makePrefix(kHashPrefix) + cppbor::Uint{hash}.toString();
+}
+
+auto ParseHashValue(const leveldb::Slice& slice) -> std::optional<TrackId> {
+  return BytesToTrackId({slice.data(), slice.size()});  // Value is a cbor uint.
+}
+
+auto EncodeHashValue(TrackId id) -> std::string {
+  return TrackIdToBytes(id);  // The value is just the encoded track id.
+}
+
+/* 'T/ 0xBEEF' -- the tag hash prefix followed by `hash` as a cbor uint. */
+auto EncodeTagHashKey(const uint64_t& hash) -> std::string {
+  return makePrefix(kTagHashPrefix) + cppbor::Uint{hash}.toString();
+}
+
+/* 'I/' -- the bytes with which every index key begins, whatever its header. */
+auto EncodeAllIndexesPrefix() -> std::string {
+  return makePrefix(kIndexPrefix);
+}
+
+// 'I/ <header> /' -- the header is a cbor array of the index id, the depth and
+// the components hash. EncodeIndexKey starts every key with this prefix, so
+// keys that share a header also share these leading bytes.
+auto EncodeIndexPrefix(const IndexKey::Header& header) -> std::string {
+  cppbor::Array header_cbor{
+      cppbor::Uint{header.id},
+      cppbor::Uint{header.depth},
+      cppbor::Uint{header.components_hash},
+  };
+  return makePrefix(kIndexPrefix) + header_cbor.toString() + kFieldSeparator;
+}
+
+/*
+ * 'I/0xa2/0x686921/0xb9'
+ *                   ^ --- trailer
+ *          ^ --- component ("hi!")
+ *     ^ -------- header
+ *
+ *  The components *must* be encoded in a way that is easy to sort
+ *  lexicographically. The header and trailer do not have this restriction, so
+ *  cbor is fine.
+ *
+ *  We store grouping information within the header; which index, filtered
+ *  components. We store disambiguation information in the trailer; just a track
+ *  id for now, but could reasonably be something like 'release year' as well.
+ */
+auto EncodeIndexKey(const IndexKey& key) -> std::string {
+  std::string encoded = EncodeIndexPrefix(key.header);
+
+  // The component is expected to be UTF-8 already, so its bytes go in as-is.
+  if (key.item) {
+    encoded.append(key.item->data(), key.item->size());
+    encoded += kFieldSeparator;
+  }
+
+  // The trailer: the track id, in the form produced by TrackIdToBytes.
+  if (key.track) {
+    encoded += TrackIdToBytes(*key.track);
+  }
+
+  return encoded;
+}
+
+// Decodes a key written by EncodeIndexKey. Returns nullopt unless the key has
+// the index prefix and a well-formed header; item and track are optional.
+auto ParseIndexKey(const leveldb::Slice& slice) -> std::optional<IndexKey> {
+  IndexKey result{};
+
+  auto prefix = EncodeAllIndexesPrefix();
+  if (!slice.starts_with(prefix)) {
+    return {};
+  }
+
+  std::string key_data = slice.ToString().substr(prefix.size());
+  auto [key, end_of_key, err] = cppbor::parseWithViews(
+      reinterpret_cast<const uint8_t*>(key_data.data()), key_data.size());
+  if (!key || key->type() != cppbor::ARRAY) {
+    return {};
+  }
+  auto as_array = key->asArray();
+  if (as_array->size() != 3 || as_array->get(0)->type() != cppbor::UINT ||
+      as_array->get(1)->type() != cppbor::UINT ||
+      as_array->get(2)->type() != cppbor::UINT) {
+    return {};
+  }
+  result.header.id = as_array->get(0)->asUint()->unsignedValue();
+  result.header.depth = as_array->get(1)->asUint()->unsignedValue();
+  result.header.components_hash = as_array->get(2)->asUint()->unsignedValue();
+
+  size_t header_length =
+      reinterpret_cast<const char*>(end_of_key) - key_data.data();
+  if (header_length == 0 || header_length >= key_data.size()) {
+    return {};
+  }
+
+  // Split on raw bytes rather than istream::get(): get() sets failbit on an
+  // empty item and stops at '\n' (0x0a), which a cbor track id may contain.
+  const std::string rest = key_data.substr(header_length + 1);
+  const size_t item_end = rest.find(kFieldSeparator);
+  const std::string item = rest.substr(0, item_end);
+  if (!item.empty()) {
+    result.item = item;
+  }
+
+  // Whatever follows the item's separator is the encoded track id.
+  if (item_end != std::string::npos && item_end + 1 < rest.size()) {
+    result.track = BytesToTrackId(rest.substr(item_end + 1));
+  }
+
+  return result;
+}
+
+auto TrackIdToBytes(TrackId id) -> std::string {
+  return cppbor::Uint{id}.toString();  // The id as a single cbor uint.
+}
+
+auto BytesToTrackId(std::span<const char> bytes) -> std::optional<TrackId> {
+  const auto* begin = reinterpret_cast<const uint8_t*>(bytes.data());
+  auto [parsed, rest, error] = cppbor::parse(begin, bytes.size());
+  if (parsed && parsed->type() == cppbor::UINT) {
+    return parsed->asUint()->unsignedValue();  // Inverse of TrackIdToBytes.
+  }
+  return {};  // Unparseable, or some cbor item other than a uint.
+}
+
+}  // namespace database