diff options
| author | jacqueline <me@jacqueline.id.au> | 2023-06-23 15:32:11 +1000 |
|---|---|---|
| committer | jacqueline <me@jacqueline.id.au> | 2023-06-23 15:32:11 +1000 |
| commit | 245d9ff4b9cde1f487beed76085a52f3f2d6d26c (patch) | |
| tree | 0730e6cda4c03a92c0d5e6b2e31fe27bfa021f69 /src/database/records.cpp | |
| parent | aee0474191aa6b4e4505e3f5a74b4ac8cc48063b (diff) | |
| download | tangara-fw-245d9ff4b9cde1f487beed76085a52f3f2d6d26c.tar.gz | |
add indexing to the database
idk man i wrote most of this in a fugue state whilst high on the couch
with my cat
Diffstat (limited to 'src/database/records.cpp')
| -rw-r--r-- | src/database/records.cpp | 211 |
1 files changed, 206 insertions, 5 deletions
diff --git a/src/database/records.cpp b/src/database/records.cpp index 49e5db0b..72608eb0 100644 --- a/src/database/records.cpp +++ b/src/database/records.cpp @@ -8,20 +8,43 @@ #include <stdint.h> +#include <iomanip> #include <sstream> +#include <string> #include <vector> #include "cbor.h" #include "esp_log.h" +#include "index.hpp" +#include "komihash.h" +#include "shared_string.h" #include "track.hpp" +// As LevelDB is a key-value store, each record in the database consists of a +// key and an optional value. +// +// Values, when present, are always cbor-encoded. This is fast, compact, and +// very easy to evolve over time due to its inclusion of type information. +// +// Keys have a more complicated scheme, as for performance we rely heavily on +// LevelDB's sorted storage format. We must therefore worry about clustering of +// similar records, and the sortability of our encoding format. +// Each kind of key consists of a a single-byte prefix, then one or more +// fields separated by null (0) bytes. Each field may be cbor-encoded, or may +// use some bespoke encoding; it depends on whether we want to be able to sort +// by that field. +// For debugging and discussion purposes, we represent field separators +// textually as '/', and write each field as its hex encoding. e.g. a data key +// for the track with id 17 would be written as 'D / 0x11'. + namespace database { static const char* kTag = "RECORDS"; static const char kDataPrefix = 'D'; static const char kHashPrefix = 'H'; +static const char kIndexPrefix = 'I'; static const char kFieldSeparator = '\0'; /* @@ -39,6 +62,8 @@ static const char kFieldSeparator = '\0'; template <typename T> auto cbor_encode(uint8_t** out_buf, T fn) -> std::size_t { // First pass: work out how many bytes we will encode into. + // FIXME: With benchmarking to help, we could consider preallocting a small + // buffer here to do the whole encoding in one pass. CborEncoder size_encoder; cbor_encoder_init(&size_encoder, NULL, 0, 0); std::invoke(fn, &size_encoder); @@ -55,19 +80,21 @@ auto cbor_encode(uint8_t** out_buf, T fn) -> std::size_t { OwningSlice::OwningSlice(std::string d) : data(d), slice(data) {} -auto CreateDataPrefix() -> OwningSlice { +/* 'D/' */ +auto EncodeDataPrefix() -> OwningSlice { char data[2] = {kDataPrefix, kFieldSeparator}; return OwningSlice({data, 2}); } -auto CreateDataKey(const TrackId& id) -> OwningSlice { +/* 'D/ 0xACAB' */ +auto EncodeDataKey(const TrackId& id) -> OwningSlice { std::ostringstream output; output.put(kDataPrefix).put(kFieldSeparator); output << TrackIdToBytes(id).data; return OwningSlice(output.str()); } -auto CreateDataValue(const TrackData& track) -> OwningSlice { +auto EncodeDataValue(const TrackData& track) -> OwningSlice { uint8_t* buf; std::size_t buf_len = cbor_encode(&buf, [&](CborEncoder* enc) { CborEncoder array_encoder; @@ -179,7 +206,8 @@ auto ParseDataValue(const leveldb::Slice& slice) -> std::optional<TrackData> { return TrackData(id, path, hash, play_count, is_tombstoned); } -auto CreateHashKey(const uint64_t& hash) -> OwningSlice { +/* 'H/ 0xBEEF' */ +auto EncodeHashKey(const uint64_t& hash) -> OwningSlice { std::ostringstream output; output.put(kHashPrefix).put(kFieldSeparator); @@ -197,10 +225,183 @@ auto ParseHashValue(const leveldb::Slice& slice) -> std::optional<TrackId> { return BytesToTrackId(slice.ToString()); } -auto CreateHashValue(TrackId id) -> OwningSlice { +auto EncodeHashValue(TrackId id) -> OwningSlice { return TrackIdToBytes(id); } +/* 'I/' */ +auto EncodeAllIndexesPrefix() -> OwningSlice { + char data[2] = {kIndexPrefix, kFieldSeparator}; + return OwningSlice({data, 2}); +} + +auto AppendIndexHeader(const IndexKey::Header& header, std::ostringstream* out) + -> void { + *out << kIndexPrefix << kFieldSeparator; + + // Construct the header. + uint8_t* buf; + std::size_t buf_len = cbor_encode(&buf, [&](CborEncoder* enc) { + CborEncoder array_encoder; + CborError err; + err = cbor_encoder_create_array(enc, &array_encoder, 3); + if (err != CborNoError && err != CborErrorOutOfMemory) { + ESP_LOGE(kTag, "encoding err %u", err); + return; + } + err = cbor_encode_uint(&array_encoder, header.id); + if (err != CborNoError && err != CborErrorOutOfMemory) { + ESP_LOGE(kTag, "encoding err %u", err); + return; + } + err = cbor_encode_uint(&array_encoder, header.depth); + if (err != CborNoError && err != CborErrorOutOfMemory) { + ESP_LOGE(kTag, "encoding err %u", err); + return; + } + err = cbor_encode_uint(&array_encoder, header.components_hash); + if (err != CborNoError && err != CborErrorOutOfMemory) { + ESP_LOGE(kTag, "encoding err %u", err); + return; + } + err = cbor_encoder_close_container(enc, &array_encoder); + if (err != CborNoError && err != CborErrorOutOfMemory) { + ESP_LOGE(kTag, "encoding err %u", err); + return; + } + }); + std::string encoded{reinterpret_cast<char*>(buf), buf_len}; + delete buf; + *out << encoded << kFieldSeparator; +} + +auto EncodeIndexPrefix(const IndexKey::Header& header) -> OwningSlice { + std::ostringstream out; + AppendIndexHeader(header, &out); + return OwningSlice(out.str()); +} + +/* + * 'I/0xa2/0x686921/0xb9' + * ^ --- trailer + * ^ --- component ("hi!") + * ^ -------- header + * + * The components *must* be encoded in a way that is easy to sort + * lexicographically. The header and footer do not have this restriction, so + * cbor is fine. + * + * We store grouping information within the header; which index, filtered + * components. We store disambiguation information in the trailer; just a track + * id for now, but could reasonably be something like 'release year' as well. + */ +auto EncodeIndexKey(const IndexKey& key) -> OwningSlice { + std::ostringstream out; + + // Construct the header. + AppendIndexHeader(key.header, &out); + + // The component should already be UTF-8 encoded, so just write it. + if (key.item) { + out << *key.item; + } + + // Construct the footer. + out << kFieldSeparator; + if (key.track) { + out << TrackIdToBytes(*key.track).data; + } + return OwningSlice(out.str()); +} + +auto ParseIndexKey(const leveldb::Slice& slice) -> std::optional<IndexKey> { + IndexKey result{}; + + auto prefix = EncodeAllIndexesPrefix(); + if (!slice.starts_with(prefix.data)) { + return {}; + } + + std::string key_data = slice.ToString().substr(prefix.data.size()); + std::size_t header_length = 0; + { + CborParser parser; + CborValue container; + CborError err; + err = cbor_parser_init(reinterpret_cast<const uint8_t*>(key_data.data()), + key_data.size(), 0, &parser, &container); + if (err != CborNoError || !cbor_value_is_container(&container)) { + return {}; + } + + CborValue val; + err = cbor_value_enter_container(&container, &val); + if (err != CborNoError || !cbor_value_is_unsigned_integer(&val)) { + return {}; + } + + uint64_t raw_int; + err = cbor_value_get_uint64(&val, &raw_int); + if (err != CborNoError) { + return {}; + } + result.header.id = raw_int; + err = cbor_value_advance(&val); + if (err != CborNoError || !cbor_value_is_unsigned_integer(&val)) { + return {}; + } + + err = cbor_value_get_uint64(&val, &raw_int); + if (err != CborNoError) { + return {}; + } + result.header.depth = raw_int; + err = cbor_value_advance(&val); + if (err != CborNoError || !cbor_value_is_unsigned_integer(&val)) { + return {}; + } + + err = cbor_value_get_uint64(&val, &raw_int); + if (err != CborNoError) { + return {}; + } + result.header.components_hash = raw_int; + err = cbor_value_advance(&val); + if (err != CborNoError || !cbor_value_at_end(&val)) { + return {}; + } + + const uint8_t* next_byte = cbor_value_get_next_byte(&val); + header_length = + next_byte - reinterpret_cast<const uint8_t*>(key_data.data()); + } + + if (header_length == 0) { + return {}; + } + + if (header_length >= key_data.size()) { + return {}; + } + + std::istringstream in(key_data.substr(header_length + 1)); + std::stringbuf buffer{}; + + in.get(buffer, kFieldSeparator); + if (buffer.str().size() > 0) { + result.item = buffer.str(); + } + + buffer = {}; + in.get(buffer); + if (buffer.str().size() > 1) { + std::string raw_id = buffer.str().substr(1); + result.track = BytesToTrackId(raw_id); + } + + return result; +} + auto TrackIdToBytes(TrackId id) -> OwningSlice { uint8_t buf[8]; CborEncoder enc; |
