add indexing to the database

idk man i wrote most of this in a fugue state whilst high on the couch with my cat
author: jacqueline <me@jacqueline.id.au> 2023-06-23 15:32:11 +1000
committer: jacqueline <me@jacqueline.id.au> 2023-06-23 15:32:11 +1000
commit: 245d9ff4b9cde1f487beed76085a52f3f2d6d26c (patch)
tree: 0730e6cda4c03a92c0d5e6b2e31fe27bfa021f69 /src/database/records.cpp
parent: aee0474191aa6b4e4505e3f5a74b4ac8cc48063b (diff)
download: tangara-fw-245d9ff4b9cde1f487beed76085a52f3f2d6d26c.tar.gz
1 files changed, 206 insertions, 5 deletions
diff --git a/src/database/records.cpp b/src/database/records.cpp
index 49e5db0b..72608eb0 100644
--- a/src/database/records.cpp
+++ b/src/database/records.cpp
@@ -8,20 +8,43 @@
 
 #include <stdint.h>
 
+#include <iomanip>
 #include <sstream>
+#include <string>
 #include <vector>
 
 #include "cbor.h"
 #include "esp_log.h"
 
+#include "index.hpp"
+#include "komihash.h"
+#include "shared_string.h"
 #include "track.hpp"
 
+// As LevelDB is a key-value store, each record in the database consists of a
+// key and an optional value.
+//
+// Values, when present, are always cbor-encoded. This is fast, compact, and
+// very easy to evolve over time due to its inclusion of type information.
+//
+// Keys have a more complicated scheme, as for performance we rely heavily on
+// LevelDB's sorted storage format. We must therefore worry about clustering of
+// similar records, and the sortability of our encoding format.
+//    Each kind of key consists of a a single-byte prefix, then one or more
+// fields separated by null (0) bytes. Each field may be cbor-encoded, or may
+// use some bespoke encoding; it depends on whether we want to be able to sort
+// by that field.
+//    For debugging and discussion purposes, we represent field separators
+// textually as '/', and write each field as its hex encoding. e.g. a data key
+// for the track with id 17 would be written as 'D / 0x11'.
+
 namespace database {
 
 static const char* kTag = "RECORDS";
 
 static const char kDataPrefix = 'D';
 static const char kHashPrefix = 'H';
+static const char kIndexPrefix = 'I';
 static const char kFieldSeparator = '\0';
 
 /*
@@ -39,6 +62,8 @@ static const char kFieldSeparator = '\0';
 template <typename T>
 auto cbor_encode(uint8_t** out_buf, T fn) -> std::size_t {
   // First pass: work out how many bytes we will encode into.
+  // FIXME: With benchmarking to help, we could consider preallocting a small
+  // buffer here to do the whole encoding in one pass.
   CborEncoder size_encoder;
   cbor_encoder_init(&size_encoder, NULL, 0, 0);
   std::invoke(fn, &size_encoder);
@@ -55,19 +80,21 @@ auto cbor_encode(uint8_t** out_buf, T fn) -> std::size_t {
 
 OwningSlice::OwningSlice(std::string d) : data(d), slice(data) {}
 
-auto CreateDataPrefix() -> OwningSlice {
+/* 'D/' */
+auto EncodeDataPrefix() -> OwningSlice {
   char data[2] = {kDataPrefix, kFieldSeparator};
   return OwningSlice({data, 2});
 }
 
-auto CreateDataKey(const TrackId& id) -> OwningSlice {
+/* 'D/ 0xACAB' */
+auto EncodeDataKey(const TrackId& id) -> OwningSlice {
   std::ostringstream output;
   output.put(kDataPrefix).put(kFieldSeparator);
   output << TrackIdToBytes(id).data;
   return OwningSlice(output.str());
 }
 
-auto CreateDataValue(const TrackData& track) -> OwningSlice {
+auto EncodeDataValue(const TrackData& track) -> OwningSlice {
   uint8_t* buf;
   std::size_t buf_len = cbor_encode(&buf, [&](CborEncoder* enc) {
     CborEncoder array_encoder;
@@ -179,7 +206,8 @@ auto ParseDataValue(const leveldb::Slice& slice) -> std::optional<TrackData> {
   return TrackData(id, path, hash, play_count, is_tombstoned);
 }
 
-auto CreateHashKey(const uint64_t& hash) -> OwningSlice {
+/* 'H/ 0xBEEF' */
+auto EncodeHashKey(const uint64_t& hash) -> OwningSlice {
   std::ostringstream output;
   output.put(kHashPrefix).put(kFieldSeparator);
 
@@ -197,10 +225,183 @@ auto ParseHashValue(const leveldb::Slice& slice) -> std::optional<TrackId> {
   return BytesToTrackId(slice.ToString());
 }
 
-auto CreateHashValue(TrackId id) -> OwningSlice {
+auto EncodeHashValue(TrackId id) -> OwningSlice {
   return TrackIdToBytes(id);
 }
 
+/* 'I/' */
+auto EncodeAllIndexesPrefix() -> OwningSlice {
+  char data[2] = {kIndexPrefix, kFieldSeparator};
+  return OwningSlice({data, 2});
+}
+
+auto AppendIndexHeader(const IndexKey::Header& header, std::ostringstream* out)
+    -> void {
+  *out << kIndexPrefix << kFieldSeparator;
+
+  // Construct the header.
+  uint8_t* buf;
+  std::size_t buf_len = cbor_encode(&buf, [&](CborEncoder* enc) {
+    CborEncoder array_encoder;
+    CborError err;
+    err = cbor_encoder_create_array(enc, &array_encoder, 3);
+    if (err != CborNoError && err != CborErrorOutOfMemory) {
+      ESP_LOGE(kTag, "encoding err %u", err);
+      return;
+    }
+    err = cbor_encode_uint(&array_encoder, header.id);
+    if (err != CborNoError && err != CborErrorOutOfMemory) {
+      ESP_LOGE(kTag, "encoding err %u", err);
+      return;
+    }
+    err = cbor_encode_uint(&array_encoder, header.depth);
+    if (err != CborNoError && err != CborErrorOutOfMemory) {
+      ESP_LOGE(kTag, "encoding err %u", err);
+      return;
+    }
+    err = cbor_encode_uint(&array_encoder, header.components_hash);
+    if (err != CborNoError && err != CborErrorOutOfMemory) {
+      ESP_LOGE(kTag, "encoding err %u", err);
+      return;
+    }
+    err = cbor_encoder_close_container(enc, &array_encoder);
+    if (err != CborNoError && err != CborErrorOutOfMemory) {
+      ESP_LOGE(kTag, "encoding err %u", err);
+      return;
+    }
+  });
+  std::string encoded{reinterpret_cast<char*>(buf), buf_len};
+  delete buf;
+  *out << encoded << kFieldSeparator;
+}
+
+auto EncodeIndexPrefix(const IndexKey::Header& header) -> OwningSlice {
+  std::ostringstream out;
+  AppendIndexHeader(header, &out);
+  return OwningSlice(out.str());
+}
+
+/*
+ * 'I/0xa2/0x686921/0xb9'
+ *                   ^ --- trailer
+ *          ^ --- component ("hi!")
+ *     ^ -------- header
+ *
+ *  The components *must* be encoded in a way that is easy to sort
+ *  lexicographically. The header and footer do not have this restriction, so
+ *  cbor is fine.
+ *
+ *  We store grouping information within the header; which index, filtered
+ *  components. We store disambiguation information in the trailer; just a track
+ *  id for now, but could reasonably be something like 'release year' as well.
+ */
+auto EncodeIndexKey(const IndexKey& key) -> OwningSlice {
+  std::ostringstream out;
+
+  // Construct the header.
+  AppendIndexHeader(key.header, &out);
+
+  // The component should already be UTF-8 encoded, so just write it.
+  if (key.item) {
+    out << *key.item;
+  }
+
+  // Construct the footer.
+  out << kFieldSeparator;
+  if (key.track) {
+    out << TrackIdToBytes(*key.track).data;
+  }
+  return OwningSlice(out.str());
+}
+
+auto ParseIndexKey(const leveldb::Slice& slice) -> std::optional<IndexKey> {
+  IndexKey result{};
+
+  auto prefix = EncodeAllIndexesPrefix();
+  if (!slice.starts_with(prefix.data)) {
+    return {};
+  }
+
+  std::string key_data = slice.ToString().substr(prefix.data.size());
+  std::size_t header_length = 0;
+  {
+    CborParser parser;
+    CborValue container;
+    CborError err;
+    err = cbor_parser_init(reinterpret_cast<const uint8_t*>(key_data.data()),
+                           key_data.size(), 0, &parser, &container);
+    if (err != CborNoError || !cbor_value_is_container(&container)) {
+      return {};
+    }
+
+    CborValue val;
+    err = cbor_value_enter_container(&container, &val);
+    if (err != CborNoError || !cbor_value_is_unsigned_integer(&val)) {
+      return {};
+    }
+
+    uint64_t raw_int;
+    err = cbor_value_get_uint64(&val, &raw_int);
+    if (err != CborNoError) {
+      return {};
+    }
+    result.header.id = raw_int;
+    err = cbor_value_advance(&val);
+    if (err != CborNoError || !cbor_value_is_unsigned_integer(&val)) {
+      return {};
+    }
+
+    err = cbor_value_get_uint64(&val, &raw_int);
+    if (err != CborNoError) {
+      return {};
+    }
+    result.header.depth = raw_int;
+    err = cbor_value_advance(&val);
+    if (err != CborNoError || !cbor_value_is_unsigned_integer(&val)) {
+      return {};
+    }
+
+    err = cbor_value_get_uint64(&val, &raw_int);
+    if (err != CborNoError) {
+      return {};
+    }
+    result.header.components_hash = raw_int;
+    err = cbor_value_advance(&val);
+    if (err != CborNoError || !cbor_value_at_end(&val)) {
+      return {};
+    }
+
+    const uint8_t* next_byte = cbor_value_get_next_byte(&val);
+    header_length =
+        next_byte - reinterpret_cast<const uint8_t*>(key_data.data());
+  }
+
+  if (header_length == 0) {
+    return {};
+  }
+
+  if (header_length >= key_data.size()) {
+    return {};
+  }
+
+  std::istringstream in(key_data.substr(header_length + 1));
+  std::stringbuf buffer{};
+
+  in.get(buffer, kFieldSeparator);
+  if (buffer.str().size() > 0) {
+    result.item = buffer.str();
+  }
+
+  buffer = {};
+  in.get(buffer);
+  if (buffer.str().size() > 1) {
+    std::string raw_id = buffer.str().substr(1);
+    result.track = BytesToTrackId(raw_id);
+  }
+
+  return result;
+}
+
 auto TrackIdToBytes(TrackId id) -> OwningSlice {
   uint8_t buf[8];
   CborEncoder enc;
author	jacqueline <me@jacqueline.id.au>	2023-06-23 15:32:11 +1000
committer	jacqueline <me@jacqueline.id.au>	2023-06-23 15:32:11 +1000
commit	245d9ff4b9cde1f487beed76085a52f3f2d6d26c (patch)
tree	0730e6cda4c03a92c0d5e6b2e31fe27bfa021f69 /src/database/records.cpp
parent	aee0474191aa6b4e4505e3f5a74b4ac8cc48063b (diff)
download	tangara-fw-245d9ff4b9cde1f487beed76085a52f3f2d6d26c.tar.gz