summaryrefslogtreecommitdiff
path: root/src/database/records.cpp
blob: a0aac69aca0cf0edf0448362fb2a7f400f3f3588 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
/*
 * Copyright 2023 jacqueline <me@jacqueline.id.au>
 *
 * SPDX-License-Identifier: GPL-3.0-only
 */

#include "records.hpp"

#include <stdint.h>
#include <sys/_stdint.h>

#include <functional>
#include <iomanip>
#include <memory_resource>
#include <sstream>
#include <string>
#include <vector>

#include "cppbor.h"
#include "cppbor_parse.h"
#include "esp_log.h"

#include "index.hpp"
#include "komihash.h"
#include "memory_resource.hpp"
#include "track.hpp"

// As LevelDB is a key-value store, each record in the database consists of a
// key and an optional value.
//
// Values, when present, are always cbor-encoded. This is fast, compact, and
// very easy to evolve over time due to its inclusion of type information.
//
// Keys have a more complicated scheme, as for performance we rely heavily on
// LevelDB's sorted storage format. We must therefore worry about clustering of
// similar records, and the sortability of our encoding format.
//    Each kind of key consists of a single-byte prefix, then one or more
// fields separated by null (0) bytes. Each field may be cbor-encoded, or may
// use some bespoke encoding; it depends on whether we want to be able to sort
// by that field.
//    For debugging and discussion purposes, we represent field separators
// textually as '/', and write each field as its hex encoding. e.g. a data key
// for the track with id 17 would be written as 'D / 0x11'.

namespace database {

// Presumably the tag handed to the ESP_LOG* macros; nothing in this file
// references it at the moment, hence the attribute.
[[maybe_unused]] static const char* kTag = "RECORDS";

// Every key begins with one of these single-byte prefixes, which identifies
// the kind of record the key belongs to.
static constexpr char kDataPrefix = 'D';
static constexpr char kHashPrefix = 'H';
[[maybe_unused]] static constexpr char kTagHashPrefix = 'T';
static constexpr char kIndexPrefix = 'I';

// Byte placed between the fields of a key (written as '/' in comments).
static constexpr char kFieldSeparator = '\0';

/* 'D/' */
auto EncodeDataPrefix() -> std::string {
  return {kDataPrefix, kFieldSeparator};
}

/*
 * 'D/ 0xACAB'
 *
 * Builds the key for the data record of the given track: the data prefix
 * followed by the byte encoding of the track id.
 */
auto EncodeDataKey(const TrackId& id) -> std::string {
  std::string key = EncodeDataPrefix();
  key += TrackIdToBytes(id);
  return key;
}

auto EncodeDataValue(const TrackData& track) -> std::string {
  auto* tag_hashes = new cppbor::Map{};  // Free'd by Array's dtor.
  for (const auto& entry : track.individual_tag_hashes) {
    tag_hashes->add(cppbor::Uint{static_cast<uint32_t>(entry.first)},
                    cppbor::Uint{entry.second});
  }
  cppbor::Array val{
      cppbor::Uint{track.id},
      cppbor::Tstr{track.filepath},
      cppbor::Uint{track.tags_hash},
      cppbor::Bool{track.is_tombstoned},
      cppbor::Uint{track.modified_at.first},
      cppbor::Uint{track.modified_at.second},
      tag_hashes,
  };
  return val.toString();
}

/*
 * Decodes a track data record from its cbor representation.
 *
 * The value must be a cbor array of exactly seven elements: uint id, text
 * filepath, uint tags hash, boolean tombstone flag, two uints making up
 * modified_at, and a map of uint tag -> uint hash.
 *
 * Returns nullptr if the bytes do not parse as cbor or do not have this
 * shape. Malformed records are rejected as a whole rather than partially
 * decoded.
 */
auto ParseDataValue(const leveldb::Slice& slice) -> std::shared_ptr<TrackData> {
  auto [item, unused, err] = cppbor::parseWithViews(
      reinterpret_cast<const uint8_t*>(slice.data()), slice.size());
  if (!item || item->type() != cppbor::ARRAY) {
    return nullptr;
  }
  auto vals = item->asArray();
  if (vals->size() != 7 || vals->get(0)->type() != cppbor::UINT ||
      vals->get(1)->type() != cppbor::TSTR ||
      vals->get(2)->type() != cppbor::UINT ||
      vals->get(3)->type() != cppbor::SIMPLE ||
      vals->get(4)->type() != cppbor::UINT ||
      vals->get(5)->type() != cppbor::UINT ||
      vals->get(6)->type() != cppbor::MAP) {
    return {};
  }

  // The SIMPLE major type is not limited to booleans, so the downcast has to
  // be checked before it is dereferenced.
  const auto* is_tombstoned = vals->get(3)->asBool();
  if (is_tombstoned == nullptr) {
    return {};
  }

  auto res = std::make_shared<TrackData>();
  res->id = vals->get(0)->asUint()->unsignedValue();
  res->filepath = vals->get(1)->asViewTstr()->view();
  res->tags_hash = vals->get(2)->asUint()->unsignedValue();
  res->is_tombstoned = is_tombstoned->value();
  res->modified_at = std::make_pair<uint16_t, uint16_t>(
      vals->get(4)->asUint()->unsignedValue(),
      vals->get(5)->asUint()->unsignedValue());

  auto tag_hashes = vals->get(6)->asMap();
  for (const auto& entry : *tag_hashes) {
    // The shape check above only establishes that element 6 is a map; the
    // types of its keys and values still need to be verified.
    const auto* tag = entry.first->asUint();
    const auto* hash = entry.second->asUint();
    if (tag == nullptr || hash == nullptr) {
      return {};
    }
    res->individual_tag_hashes[static_cast<Tag>(tag->unsignedValue())] =
        hash->unsignedValue();
  }
  return res;
}

/*
 * 'H/ 0xBEEF'
 *
 * Builds the key for a hash record: the hash prefix byte, a field separator,
 * then the hash encoded as a cbor unsigned integer.
 */
auto EncodeHashKey(const uint64_t& hash) -> std::string {
  std::string key{kHashPrefix, kFieldSeparator};
  key += cppbor::Uint{hash}.toString();
  return key;
}

/*
 * Decodes the value of a hash record into a track id by handing the slice's
 * bytes to BytesToTrackId. Yields whatever BytesToTrackId yields for them.
 */
auto ParseHashValue(const leveldb::Slice& slice) -> std::optional<TrackId> {
  const char* raw = slice.data();
  const size_t length = slice.size();
  return BytesToTrackId({raw, length});
}

/*
 * Produces the bytes stored as the value of a hash record: the byte encoding
 * of the given track id.
 */
auto EncodeHashValue(TrackId id) -> std::string {
  std::string value = TrackIdToBytes(id);
  return value;
}

/*
 * 'T/ 0xBEEF'
 *
 * Builds the key for a tag hash record: the tag hash prefix byte, a field
 * separator, then the hash encoded as a cbor unsigned integer.
 */
auto EncodeTagHashKey(const uint64_t& hash) -> std::string {
  std::string key{kTagHashPrefix, kFieldSeparator};
  key += cppbor::Uint{hash}.toString();
  return key;
}

/*
 * 'I/'
 *
 * Builds the two-byte prefix that starts every index key, regardless of which
 * index it belongs to: the index prefix byte followed by a field separator.
 */
auto EncodeAllIndexesPrefix() -> std::string {
  std::string prefix;
  prefix.push_back(kIndexPrefix);
  prefix.push_back(kFieldSeparator);
  return prefix;
}

/*
 * Builds the leading part of an index key that is common to every key with
 * the given header: the index prefix byte, a field separator, the header as a
 * cbor array of [id, depth, components_hash], and a closing field separator.
 */
auto EncodeIndexPrefix(const IndexKey::Header& header) -> std::string {
  cppbor::Array encoded_header{
      cppbor::Uint{header.id},
      cppbor::Uint{header.depth},
      cppbor::Uint{header.components_hash},
  };

  std::string prefix{kIndexPrefix, kFieldSeparator};
  prefix += encoded_header.toString();
  prefix.push_back(kFieldSeparator);
  return prefix;
}

/*
 * 'I/0xa2/0x686921/0xb9'
 *                   ^ --- trailer
 *          ^ --- component ("hi!")
 *     ^ -------- header
 *
 *  The components *must* be encoded in a way that is easy to sort
 *  lexicographically. The header and trailer do not have this restriction, so
 *  cbor is fine.
 *
 *  We store grouping information within the header; which index, filtered
 *  components. We store disambiguation information in the trailer; just a track
 *  id for now, but could reasonably be something like 'release year' as well.
 */
// Serialises an index key: the header prefix, then the item text followed by
// a field separator (only if an item is set), then the track id bytes (only
// if a track is set).
auto EncodeIndexKey(const IndexKey& key) -> std::string {
  std::string encoded = EncodeIndexPrefix(key.header);

  if (key.item) {
    // The component should already be UTF-8 encoded, so its bytes are copied
    // through unchanged.
    encoded.append(key.item->data(), key.item->size());
    encoded.push_back(kFieldSeparator);
  }

  if (key.track) {
    encoded += TrackIdToBytes(*key.track);
  }

  return encoded;
}

// Decodes an index key of the form written by EncodeIndexKey.
//
// Returns an empty optional if the slice does not start with the index
// prefix, if the header is not a cbor array of three unsigned integers, or if
// nothing follows the header. Otherwise the header is always populated; the
// item is set when the text between the header and the next field separator
// is non-empty, and the track is set when bytes follow that separator and
// BytesToTrackId accepts them.
auto ParseIndexKey(const leveldb::Slice& slice) -> std::optional<IndexKey> {
  IndexKey result{};

  auto prefix = EncodeAllIndexesPrefix();
  if (!slice.starts_with(prefix)) {
    return {};
  }

  std::string key_data = slice.ToString().substr(prefix.size());
  auto [key, end_of_key, err] = cppbor::parseWithViews(
      reinterpret_cast<const uint8_t*>(key_data.data()), key_data.size());
  if (!key || key->type() != cppbor::ARRAY) {
    return {};
  }
  auto as_array = key->asArray();
  if (as_array->size() != 3 || as_array->get(0)->type() != cppbor::UINT ||
      as_array->get(1)->type() != cppbor::UINT ||
      as_array->get(2)->type() != cppbor::UINT) {
    return {};
  }
  result.header.id = as_array->get(0)->asUint()->unsignedValue();
  result.header.depth = as_array->get(1)->asUint()->unsignedValue();
  result.header.components_hash = as_array->get(2)->asUint()->unsignedValue();

  size_t header_length =
      reinterpret_cast<const char*>(end_of_key) - key_data.data();

  if (header_length == 0 || header_length >= key_data.size()) {
    return {};
  }

  // Step over the byte that follows the header; EncodeIndexPrefix always
  // writes a field separator there.
  const std::string fields = key_data.substr(header_length + 1);

  // The fields are split by hand rather than with istream::get(). That call
  // sets failbit when it extracts nothing (an empty item), which made the
  // trailer unreadable, and its default '\n' delimiter cut short any track id
  // whose encoding contains the byte 0x0A.
  const size_t item_end = fields.find(kFieldSeparator);
  if (item_end != 0 && !fields.empty()) {
    // With no separator present, substr(0, npos) takes everything remaining.
    result.item = fields.substr(0, item_end);
  }

  if (item_end != std::string::npos && item_end + 1 < fields.size()) {
    result.track = BytesToTrackId(fields.substr(item_end + 1));
  }

  return result;
}

// Encodes a track id as the bytes of a cbor unsigned integer.
auto TrackIdToBytes(TrackId id) -> std::string {
  cppbor::Uint encoded{id};
  return encoded.toString();
}

// Decodes a track id from bytes holding a cbor unsigned integer. Yields an
// empty optional when the bytes do not parse, or parse to some other type.
auto BytesToTrackId(cpp::span<const char> bytes) -> std::optional<TrackId> {
  const auto* raw = reinterpret_cast<const uint8_t*>(bytes.data());
  auto [parsed, end_of_item, error] = cppbor::parse(raw, bytes.size());

  if (!parsed) {
    return {};
  }
  if (parsed->type() != cppbor::UINT) {
    return {};
  }
  return parsed->asUint()->unsignedValue();
}

}  // namespace database