summaryrefslogtreecommitdiff
path: root/src/database/records.cpp
diff options
context:
space:
mode:
authorjacqueline <me@jacqueline.id.au>2023-06-23 15:32:11 +1000
committerjacqueline <me@jacqueline.id.au>2023-06-23 15:32:11 +1000
commit245d9ff4b9cde1f487beed76085a52f3f2d6d26c (patch)
tree0730e6cda4c03a92c0d5e6b2e31fe27bfa021f69 /src/database/records.cpp
parentaee0474191aa6b4e4505e3f5a74b4ac8cc48063b (diff)
downloadtangara-fw-245d9ff4b9cde1f487beed76085a52f3f2d6d26c.tar.gz
add indexing to the database
idk man i wrote most of this in a fugue state whilst high on the couch with my cat
Diffstat (limited to 'src/database/records.cpp')
-rw-r--r--src/database/records.cpp211
1 files changed, 206 insertions, 5 deletions
diff --git a/src/database/records.cpp b/src/database/records.cpp
index 49e5db0b..72608eb0 100644
--- a/src/database/records.cpp
+++ b/src/database/records.cpp
@@ -8,20 +8,43 @@
#include <stdint.h>
+#include <iomanip>
#include <sstream>
+#include <string>
#include <vector>
#include "cbor.h"
#include "esp_log.h"
+#include "index.hpp"
+#include "komihash.h"
+#include "shared_string.h"
#include "track.hpp"
+// As LevelDB is a key-value store, each record in the database consists of a
+// key and an optional value.
+//
+// Values, when present, are always cbor-encoded. This is fast, compact, and
+// very easy to evolve over time due to its inclusion of type information.
+//
+// Keys have a more complicated scheme, as for performance we rely heavily on
+// LevelDB's sorted storage format. We must therefore worry about clustering of
+// similar records, and the sortability of our encoding format.
+// Each kind of key consists of a a single-byte prefix, then one or more
+// fields separated by null (0) bytes. Each field may be cbor-encoded, or may
+// use some bespoke encoding; it depends on whether we want to be able to sort
+// by that field.
+// For debugging and discussion purposes, we represent field separators
+// textually as '/', and write each field as its hex encoding. e.g. a data key
+// for the track with id 17 would be written as 'D / 0x11'.
+
namespace database {
static const char* kTag = "RECORDS";
static const char kDataPrefix = 'D';
static const char kHashPrefix = 'H';
+static const char kIndexPrefix = 'I';
static const char kFieldSeparator = '\0';
/*
@@ -39,6 +62,8 @@ static const char kFieldSeparator = '\0';
template <typename T>
auto cbor_encode(uint8_t** out_buf, T fn) -> std::size_t {
// First pass: work out how many bytes we will encode into.
+ // FIXME: With benchmarking to help, we could consider preallocting a small
+ // buffer here to do the whole encoding in one pass.
CborEncoder size_encoder;
cbor_encoder_init(&size_encoder, NULL, 0, 0);
std::invoke(fn, &size_encoder);
@@ -55,19 +80,21 @@ auto cbor_encode(uint8_t** out_buf, T fn) -> std::size_t {
OwningSlice::OwningSlice(std::string d) : data(d), slice(data) {}
-auto CreateDataPrefix() -> OwningSlice {
+/* 'D/' */
+auto EncodeDataPrefix() -> OwningSlice {
char data[2] = {kDataPrefix, kFieldSeparator};
return OwningSlice({data, 2});
}
-auto CreateDataKey(const TrackId& id) -> OwningSlice {
+/* 'D/ 0xACAB' */
+auto EncodeDataKey(const TrackId& id) -> OwningSlice {
std::ostringstream output;
output.put(kDataPrefix).put(kFieldSeparator);
output << TrackIdToBytes(id).data;
return OwningSlice(output.str());
}
-auto CreateDataValue(const TrackData& track) -> OwningSlice {
+auto EncodeDataValue(const TrackData& track) -> OwningSlice {
uint8_t* buf;
std::size_t buf_len = cbor_encode(&buf, [&](CborEncoder* enc) {
CborEncoder array_encoder;
@@ -179,7 +206,8 @@ auto ParseDataValue(const leveldb::Slice& slice) -> std::optional<TrackData> {
return TrackData(id, path, hash, play_count, is_tombstoned);
}
-auto CreateHashKey(const uint64_t& hash) -> OwningSlice {
+/* 'H/ 0xBEEF' */
+auto EncodeHashKey(const uint64_t& hash) -> OwningSlice {
std::ostringstream output;
output.put(kHashPrefix).put(kFieldSeparator);
@@ -197,10 +225,183 @@ auto ParseHashValue(const leveldb::Slice& slice) -> std::optional<TrackId> {
return BytesToTrackId(slice.ToString());
}
-auto CreateHashValue(TrackId id) -> OwningSlice {
+auto EncodeHashValue(TrackId id) -> OwningSlice {
return TrackIdToBytes(id);
}
+/* 'I/' */
+auto EncodeAllIndexesPrefix() -> OwningSlice {
+ char data[2] = {kIndexPrefix, kFieldSeparator};
+ return OwningSlice({data, 2});
+}
+
+auto AppendIndexHeader(const IndexKey::Header& header, std::ostringstream* out)
+ -> void {
+ *out << kIndexPrefix << kFieldSeparator;
+
+ // Construct the header.
+ uint8_t* buf;
+ std::size_t buf_len = cbor_encode(&buf, [&](CborEncoder* enc) {
+ CborEncoder array_encoder;
+ CborError err;
+ err = cbor_encoder_create_array(enc, &array_encoder, 3);
+ if (err != CborNoError && err != CborErrorOutOfMemory) {
+ ESP_LOGE(kTag, "encoding err %u", err);
+ return;
+ }
+ err = cbor_encode_uint(&array_encoder, header.id);
+ if (err != CborNoError && err != CborErrorOutOfMemory) {
+ ESP_LOGE(kTag, "encoding err %u", err);
+ return;
+ }
+ err = cbor_encode_uint(&array_encoder, header.depth);
+ if (err != CborNoError && err != CborErrorOutOfMemory) {
+ ESP_LOGE(kTag, "encoding err %u", err);
+ return;
+ }
+ err = cbor_encode_uint(&array_encoder, header.components_hash);
+ if (err != CborNoError && err != CborErrorOutOfMemory) {
+ ESP_LOGE(kTag, "encoding err %u", err);
+ return;
+ }
+ err = cbor_encoder_close_container(enc, &array_encoder);
+ if (err != CborNoError && err != CborErrorOutOfMemory) {
+ ESP_LOGE(kTag, "encoding err %u", err);
+ return;
+ }
+ });
+ std::string encoded{reinterpret_cast<char*>(buf), buf_len};
+ delete buf;
+ *out << encoded << kFieldSeparator;
+}
+
+auto EncodeIndexPrefix(const IndexKey::Header& header) -> OwningSlice {
+ std::ostringstream out;
+ AppendIndexHeader(header, &out);
+ return OwningSlice(out.str());
+}
+
+/*
+ * 'I/0xa2/0x686921/0xb9'
+ * ^ --- trailer
+ * ^ --- component ("hi!")
+ * ^ -------- header
+ *
+ * The components *must* be encoded in a way that is easy to sort
+ * lexicographically. The header and footer do not have this restriction, so
+ * cbor is fine.
+ *
+ * We store grouping information within the header; which index, filtered
+ * components. We store disambiguation information in the trailer; just a track
+ * id for now, but could reasonably be something like 'release year' as well.
+ */
+auto EncodeIndexKey(const IndexKey& key) -> OwningSlice {
+ std::ostringstream out;
+
+ // Construct the header.
+ AppendIndexHeader(key.header, &out);
+
+ // The component should already be UTF-8 encoded, so just write it.
+ if (key.item) {
+ out << *key.item;
+ }
+
+ // Construct the footer.
+ out << kFieldSeparator;
+ if (key.track) {
+ out << TrackIdToBytes(*key.track).data;
+ }
+ return OwningSlice(out.str());
+}
+
+auto ParseIndexKey(const leveldb::Slice& slice) -> std::optional<IndexKey> {
+ IndexKey result{};
+
+ auto prefix = EncodeAllIndexesPrefix();
+ if (!slice.starts_with(prefix.data)) {
+ return {};
+ }
+
+ std::string key_data = slice.ToString().substr(prefix.data.size());
+ std::size_t header_length = 0;
+ {
+ CborParser parser;
+ CborValue container;
+ CborError err;
+ err = cbor_parser_init(reinterpret_cast<const uint8_t*>(key_data.data()),
+ key_data.size(), 0, &parser, &container);
+ if (err != CborNoError || !cbor_value_is_container(&container)) {
+ return {};
+ }
+
+ CborValue val;
+ err = cbor_value_enter_container(&container, &val);
+ if (err != CborNoError || !cbor_value_is_unsigned_integer(&val)) {
+ return {};
+ }
+
+ uint64_t raw_int;
+ err = cbor_value_get_uint64(&val, &raw_int);
+ if (err != CborNoError) {
+ return {};
+ }
+ result.header.id = raw_int;
+ err = cbor_value_advance(&val);
+ if (err != CborNoError || !cbor_value_is_unsigned_integer(&val)) {
+ return {};
+ }
+
+ err = cbor_value_get_uint64(&val, &raw_int);
+ if (err != CborNoError) {
+ return {};
+ }
+ result.header.depth = raw_int;
+ err = cbor_value_advance(&val);
+ if (err != CborNoError || !cbor_value_is_unsigned_integer(&val)) {
+ return {};
+ }
+
+ err = cbor_value_get_uint64(&val, &raw_int);
+ if (err != CborNoError) {
+ return {};
+ }
+ result.header.components_hash = raw_int;
+ err = cbor_value_advance(&val);
+ if (err != CborNoError || !cbor_value_at_end(&val)) {
+ return {};
+ }
+
+ const uint8_t* next_byte = cbor_value_get_next_byte(&val);
+ header_length =
+ next_byte - reinterpret_cast<const uint8_t*>(key_data.data());
+ }
+
+ if (header_length == 0) {
+ return {};
+ }
+
+ if (header_length >= key_data.size()) {
+ return {};
+ }
+
+ std::istringstream in(key_data.substr(header_length + 1));
+ std::stringbuf buffer{};
+
+ in.get(buffer, kFieldSeparator);
+ if (buffer.str().size() > 0) {
+ result.item = buffer.str();
+ }
+
+ buffer = {};
+ in.get(buffer);
+ if (buffer.str().size() > 1) {
+ std::string raw_id = buffer.str().substr(1);
+ result.track = BytesToTrackId(raw_id);
+ }
+
+ return result;
+}
+
auto TrackIdToBytes(TrackId id) -> OwningSlice {
uint8_t buf[8];
CborEncoder enc;