diff options
Diffstat (limited to 'src/tangara/database/records.cpp')
| -rw-r--r-- | src/tangara/database/records.cpp | 260 |
1 files changed, 260 insertions, 0 deletions
diff --git a/src/tangara/database/records.cpp b/src/tangara/database/records.cpp new file mode 100644 index 00000000..b086be3b --- /dev/null +++ b/src/tangara/database/records.cpp @@ -0,0 +1,260 @@ +/* + * Copyright 2023 jacqueline <me@jacqueline.id.au> + * + * SPDX-License-Identifier: GPL-3.0-only + */ + +#include "records.hpp" + +#include <stdint.h> +#include <sys/_stdint.h> + +#include <functional> +#include <iomanip> +#include <iostream> +#include <memory_resource> +#include <sstream> +#include <string> +#include <vector> + +#include "cppbor.h" +#include "cppbor_parse.h" +#include "esp_log.h" + +#include "index.hpp" +#include "komihash.h" +#include "memory_resource.hpp" +#include "track.hpp" + +// As LevelDB is a key-value store, each record in the database consists of a +// key and an optional value. +// +// Values, when present, are always cbor-encoded. This is fast, compact, and +// very easy to evolve over time due to its inclusion of type information. +// +// Keys have a more complicated scheme, as for performance we rely heavily on +// LevelDB's sorted storage format. We must therefore worry about clustering of +// similar records, and the sortability of our encoding format. +// Each kind of key consists of a a single-byte prefix, then one or more +// fields separated by null (0) bytes. Each field may be cbor-encoded, or may +// use some bespoke encoding; it depends on whether we want to be able to sort +// by that field. +// For debugging and discussion purposes, we represent field separators +// textually as '/', and write each field as its hex encoding. e.g. a data key +// for the track with id 17 would be written as 'D / 0x11'. + +namespace database { + +[[maybe_unused]] static const char* kTag = "RECORDS"; + +static const char kPathPrefix = 'P'; +static const char kDataPrefix = 'D'; +static const char kHashPrefix = 'H'; +static const char kTagHashPrefix = 'T'; +static const char kIndexPrefix = 'I'; +static const char kFieldSeparator = '\0'; + +static constexpr auto makePrefix(char p) -> std::string { + std::string str; + str += p; + str += kFieldSeparator; + return str; +} + +auto EncodePathKey(std::string_view path) -> std::string { + std::stringstream out{}; + out << makePrefix(kPathPrefix); + out << path; + return out.str(); +} + +/* 'D/' */ +auto EncodeDataPrefix() -> std::string { + return makePrefix(kDataPrefix); +} + +/* 'D/ 0xACAB' */ +auto EncodeDataKey(const TrackId& id) -> std::string { + return EncodeDataPrefix() + TrackIdToBytes(id); +} + +auto EncodeDataValue(const TrackData& track) -> std::string { + auto* tag_hashes = new cppbor::Map{}; // Free'd by Array's dtor. + for (const auto& entry : track.individual_tag_hashes) { + tag_hashes->add(cppbor::Uint{static_cast<uint32_t>(entry.first)}, + cppbor::Uint{entry.second}); + } + cppbor::Array val{ + cppbor::Uint{track.id}, + cppbor::Tstr{track.filepath}, + cppbor::Uint{track.tags_hash}, + cppbor::Bool{track.is_tombstoned}, + cppbor::Uint{track.modified_at.first}, + cppbor::Uint{track.modified_at.second}, + tag_hashes, + }; + return val.toString(); +} + +auto ParseDataValue(const leveldb::Slice& slice) -> std::shared_ptr<TrackData> { + auto [item, unused, err] = cppbor::parseWithViews( + reinterpret_cast<const uint8_t*>(slice.data()), slice.size()); + if (!item || item->type() != cppbor::ARRAY) { + return nullptr; + } + auto vals = item->asArray(); + if (vals->size() != 7 || vals->get(0)->type() != cppbor::UINT || + vals->get(1)->type() != cppbor::TSTR || + vals->get(2)->type() != cppbor::UINT || + vals->get(3)->type() != cppbor::SIMPLE || + vals->get(4)->type() != cppbor::UINT || + vals->get(5)->type() != cppbor::UINT || + vals->get(6)->type() != cppbor::MAP) { + return {}; + } + auto res = std::make_shared<TrackData>(); + res->id = vals->get(0)->asUint()->unsignedValue(); + res->filepath = vals->get(1)->asViewTstr()->view(); + res->tags_hash = vals->get(2)->asUint()->unsignedValue(); + res->is_tombstoned = vals->get(3)->asBool()->value(); + res->modified_at = std::make_pair<uint16_t, uint16_t>( + vals->get(4)->asUint()->unsignedValue(), + vals->get(5)->asUint()->unsignedValue()); + + auto tag_hashes = vals->get(6)->asMap(); + for (const auto& entry : *tag_hashes) { + auto tag = static_cast<Tag>(entry.first->asUint()->unsignedValue()); + res->individual_tag_hashes[tag] = entry.second->asUint()->unsignedValue(); + } + return res; +} + +/* 'H/ 0xBEEF' */ +auto EncodeHashKey(const uint64_t& hash) -> std::string { + return makePrefix(kHashPrefix) + cppbor::Uint{hash}.toString(); +} + +auto ParseHashValue(const leveldb::Slice& slice) -> std::optional<TrackId> { + return BytesToTrackId({slice.data(), slice.size()}); +} + +auto EncodeHashValue(TrackId id) -> std::string { + return TrackIdToBytes(id); +} + +/* 'T/ 0xBEEF' */ +auto EncodeTagHashKey(const uint64_t& hash) -> std::string { + return makePrefix(kTagHashPrefix) + cppbor::Uint{hash}.toString(); +} + +/* 'I/' */ +auto EncodeAllIndexesPrefix() -> std::string { + return makePrefix(kIndexPrefix); +} + +auto EncodeIndexPrefix(const IndexKey::Header& header) -> std::string { + std::ostringstream out; + out << makePrefix(kIndexPrefix); + cppbor::Array val{ + cppbor::Uint{header.id}, + cppbor::Uint{header.depth}, + cppbor::Uint{header.components_hash}, + }; + out << val.toString() << kFieldSeparator; + return out.str(); +} + +/* + * 'I/0xa2/0x686921/0xb9' + * ^ --- trailer + * ^ --- component ("hi!") + * ^ -------- header + * + * The components *must* be encoded in a way that is easy to sort + * lexicographically. The header and footer do not have this restriction, so + * cbor is fine. + * + * We store grouping information within the header; which index, filtered + * components. We store disambiguation information in the trailer; just a track + * id for now, but could reasonably be something like 'release year' as well. + */ +auto EncodeIndexKey(const IndexKey& key) -> std::string { + std::ostringstream out{}; + + out << EncodeIndexPrefix(key.header); + + // The component should already be UTF-8 encoded, so just write it. + if (key.item) { + out << *key.item << kFieldSeparator; + } + + if (key.track) { + out << TrackIdToBytes(*key.track); + } + + return out.str(); +} + +auto ParseIndexKey(const leveldb::Slice& slice) -> std::optional<IndexKey> { + IndexKey result{}; + + auto prefix = EncodeAllIndexesPrefix(); + if (!slice.starts_with(prefix)) { + return {}; + } + + std::string key_data = slice.ToString().substr(prefix.size()); + auto [key, end_of_key, err] = cppbor::parseWithViews( + reinterpret_cast<const uint8_t*>(key_data.data()), key_data.size()); + if (!key || key->type() != cppbor::ARRAY) { + return {}; + } + auto as_array = key->asArray(); + if (as_array->size() != 3 || as_array->get(0)->type() != cppbor::UINT || + as_array->get(1)->type() != cppbor::UINT || + as_array->get(2)->type() != cppbor::UINT) { + return {}; + } + result.header.id = as_array->get(0)->asUint()->unsignedValue(); + result.header.depth = as_array->get(1)->asUint()->unsignedValue(); + result.header.components_hash = as_array->get(2)->asUint()->unsignedValue(); + + size_t header_length = + reinterpret_cast<const char*>(end_of_key) - key_data.data(); + + if (header_length == 0 || header_length >= key_data.size()) { + return {}; + } + + std::istringstream in(key_data.substr(header_length + 1)); + std::stringbuf buffer{}; + + in.get(buffer, kFieldSeparator); + if (buffer.str().size() > 0) { + result.item = buffer.str(); + } + + buffer = {}; + in.get(buffer); + std::string id_str = buffer.str(); + if (id_str.size() > 1) { + result.track = BytesToTrackId(id_str.substr(1)); + } + + return result; +} + +auto TrackIdToBytes(TrackId id) -> std::string { + return cppbor::Uint{id}.toString(); +} + +auto BytesToTrackId(std::span<const char> bytes) -> std::optional<TrackId> { + auto [res, unused, err] = cppbor::parse( + reinterpret_cast<const uint8_t*>(bytes.data()), bytes.size()); + if (!res || res->type() != cppbor::UINT) { + return {}; + } + return res->asUint()->unsignedValue(); +} + +} // namespace database |
