diff options
| author | jacqueline <me@jacqueline.id.au> | 2023-10-30 15:47:38 +1100 |
|---|---|---|
| committer | jacqueline <me@jacqueline.id.au> | 2023-10-30 15:52:26 +1100 |
| commit | b58c08150853b8055093dc116d407ffd543f8ec8 (patch) | |
| tree | 9b82d130d2c833fc234bca0f12f0fba1b7202c4d /src/locale | |
| parent | 18d90051c9145ead86d4da701a2bc54f45e4fb66 (diff) | |
| download | tangara-fw-b58c08150853b8055093dc116d407ffd543f8ec8.tar.gz | |
add locale-aware colation to db indexes
Diffstat (limited to 'src/locale')
| -rw-r--r-- | src/locale/CMakeLists.txt | 11 | ||||
| -rw-r--r-- | src/locale/collation.cpp | 92 | ||||
| -rw-r--r-- | src/locale/include/collation.hpp | 49 | ||||
| -rw-r--r-- | src/locale/include/strxfrm.h | 35 | ||||
| -rw-r--r-- | src/locale/priv_include/weight.h | 160 | ||||
| -rw-r--r-- | src/locale/strxfrm_l.c | 641 |
6 files changed, 988 insertions, 0 deletions
diff --git a/src/locale/CMakeLists.txt b/src/locale/CMakeLists.txt new file mode 100644 index 00000000..627ca314 --- /dev/null +++ b/src/locale/CMakeLists.txt @@ -0,0 +1,11 @@ +# Copyright 2023 jacqueline <me@jacqueline.id.au> +# +# SPDX-License-Identifier: GPL-3.0-only + +idf_component_register( + SRCS "collation.cpp" "strxfrm_l.c" + INCLUDE_DIRS "include" + PRIV_INCLUDE_DIRS "priv_include" + REQUIRES "span" "esp_partition" "spi_flash") + +target_compile_options(${COMPONENT_LIB} PRIVATE ${EXTRA_WARNINGS}) diff --git a/src/locale/collation.cpp b/src/locale/collation.cpp new file mode 100644 index 00000000..f5e8272a --- /dev/null +++ b/src/locale/collation.cpp @@ -0,0 +1,92 @@ +/* + * Copyright 2023 jacqueline <me@jacqueline.id.au> + * + * SPDX-License-Identifier: GPL-3.0-only + */ + +#include "collation.hpp" + +#include <stdint.h> +#include <memory> + +#include "esp_flash_spi_init.h" +#include "esp_log.h" +#include "esp_partition.h" +#include "hal/spi_flash_types.h" +#include "spi_flash_mmap.h" +#include "strxfrm.h" + +namespace locale { + +static constexpr char kTag[] = "collate"; + +static constexpr esp_partition_type_t kLocalePartitionType = + static_cast<esp_partition_type_t>(0x40); +static constexpr esp_partition_subtype_t kLcCollateSubtype = + static_cast<esp_partition_subtype_t>(0x0); + +auto CreateCollator() -> std::unique_ptr<ICollator> { + std::unique_ptr<ICollator> ret{GLibCollator::create()}; + if (!ret) { + ret.reset(new NoopCollator()); + } + return ret; +} + +auto GLibCollator::create() -> GLibCollator* { + uint32_t data_pages = spi_flash_mmap_get_free_pages(SPI_FLASH_MMAP_DATA); + ESP_LOGI(kTag, "free data pages: %lu (%lu KiB)", data_pages, data_pages * 64); + + const esp_partition_t* partition = + esp_partition_find_first(kLocalePartitionType, kLcCollateSubtype, NULL); + if (partition == NULL) { + ESP_LOGW(kTag, "no LC_COLLATE partition found"); + } + + ESP_LOGI(kTag, "found LC_COLLATE partition of size %lu", partition->size); + if (partition->size > data_pages * 64 * 1024) { + ESP_LOGE(kTag, "not enough free pages to map LC_COLLATE partition!"); + return nullptr; + } + + const void* region; + esp_partition_mmap_handle_t handle; + esp_err_t err = esp_partition_mmap(partition, 0, partition->size, + ESP_PARTITION_MMAP_DATA, ®ion, &handle); + if (err != ESP_OK) { + ESP_LOGE(kTag, "LC_COLLATE mmap failed"); + return nullptr; + } + + auto data = std::make_unique<locale_data_t>(); + if (!parse_locale_data(region, partition->size, data.get())) { + ESP_LOGE(kTag, "parsing locale data failed"); + esp_partition_munmap(handle); + return nullptr; + } + + return new GLibCollator(handle, std::move(data)); +} + +GLibCollator::GLibCollator(const esp_partition_mmap_handle_t handle, + std::unique_ptr<locale_data_t> locale) + : handle_(handle), locale_data_(std::move(locale)) {} + +GLibCollator::~GLibCollator() { + esp_partition_munmap(handle_); +} + +auto GLibCollator::Transform(const std::string& in) -> std::string { + char dest[256]; + size_t size = glib_strxfrm(dest, in.c_str(), 256, locale_data_.get()); + if (size >= 256) { + char* larger_dest = new char[size + 1]{0}; + glib_strxfrm(larger_dest, in.c_str(), size, locale_data_.get()); + std::string out{larger_dest, size}; + delete[] larger_dest; + return out; + } + return {dest, size}; +} + +} // namespace locale diff --git a/src/locale/include/collation.hpp b/src/locale/include/collation.hpp new file mode 100644 index 00000000..e8b6fa17 --- /dev/null +++ b/src/locale/include/collation.hpp @@ -0,0 +1,49 @@ +/* + * Copyright 2023 jacqueline <me@jacqueline.id.au> + * + * SPDX-License-Identifier: GPL-3.0-only + */ + +#pragma once + +#include <cstddef> +#include <memory> +#include <string> + +#include "esp_partition.h" +#include "span.hpp" + +#include "strxfrm.h" + +namespace locale { + +class ICollator { + public: + virtual ~ICollator() {} + + virtual auto Transform(const std::string&) -> std::string = 0; +}; + +class NoopCollator : public ICollator { + public: + auto Transform(const std::string& in) -> std::string override { return in; } +}; + +auto CreateCollator() -> std::unique_ptr<ICollator>; + +class GLibCollator : public ICollator { + public: + static auto create() -> GLibCollator*; + ~GLibCollator(); + + auto Transform(const std::string& in) -> std::string override; + + private: + GLibCollator(const esp_partition_mmap_handle_t, + std::unique_ptr<locale_data_t>); + + const esp_partition_mmap_handle_t handle_; + std::unique_ptr<locale_data_t> locale_data_; +}; + +} // namespace locale diff --git a/src/locale/include/strxfrm.h b/src/locale/include/strxfrm.h new file mode 100644 index 00000000..a40f7596 --- /dev/null +++ b/src/locale/include/strxfrm.h @@ -0,0 +1,35 @@ +/* + * Copyright 2023 jacqueline <me@jacqueline.id.au> + * + * SPDX-License-Identifier: GPL-3.0-only + */ + +#pragma once + +#include <stddef.h> +#include <stdint.h> +#include <stdbool.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + uint_fast32_t nrules; + unsigned char* rulesets; + unsigned char* weights; + int32_t* table; + unsigned char* extra; + int32_t* indirect; +} locale_data_t; + +bool parse_locale_data(const void* raw_data, size_t size, locale_data_t* out); + +size_t glib_strxfrm(char* dest, + const char* src, + size_t n, + locale_data_t* locale); + +#ifdef __cplusplus +} +#endif diff --git a/src/locale/priv_include/weight.h b/src/locale/priv_include/weight.h new file mode 100644 index 00000000..34465d90 --- /dev/null +++ b/src/locale/priv_include/weight.h @@ -0,0 +1,160 @@ +/* Copyright (C) 1996-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Ulrich Drepper, <drepper@cygnus.com>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#pragma once + +#include <stdint.h> +#include <stddef.h> + +/* This alignment is used for 32-bit integers in locale files, both + those that are explicitly int32_t or uint32_t and those that are + wchar_t, regardless of the (possibly smaller) alignment required + for such integers on a particular host. */ +#define LOCFILE_ALIGN sizeof (int32_t) +#define LOCFILE_ALIGN_MASK (LOCFILE_ALIGN - 1) +#define LOCFILE_ALIGN_UP(x) (((x) + LOCFILE_ALIGN - 1) \ + & ~LOCFILE_ALIGN_MASK) +#define LOCFILE_ALIGNED_P(x) (((x) & LOCFILE_ALIGN_MASK) == 0) + +/* Find index of weight. */ +static inline int32_t __attribute__ ((always_inline)) +findidx (const int32_t *table, + const int32_t *indirect, + const unsigned char *extra, + const unsigned char **cpp, size_t len) +{ + int_fast32_t i = table[*(*cpp)++]; + const unsigned char *cp; + const unsigned char *usrc; + + if (i >= 0) + /* This is an index into the weight table. Cool. */ + return i; + + /* Oh well, more than one sequence starting with this byte. + Search for the correct one. */ + cp = &extra[-i]; + usrc = *cpp; + --len; + while (1) + { + size_t nhere; + + /* The first thing is the index. */ + i = *((const int32_t *) cp); + cp += sizeof (int32_t); + + /* Next is the length of the byte sequence. These are always + short byte sequences so there is no reason to call any + function (even if they are inlined). */ + nhere = *cp++; + + if (i >= 0) + { + /* It is a single character. If it matches we found our + index. Note that at the end of each list there is an + entry of length zero which represents the single byte + sequence. The first (and here only) byte was tested + already. */ + size_t cnt; + + /* With GCC 5.3 when compiling with -Os the compiler warns + that seq2.back_us, which becomes usrc, might be used + uninitialized. This can't be true because we pass a length + of -1 for len at the same time which means that this loop + never executes. */ + for (cnt = 0; cnt < nhere && cnt < len; ++cnt) + if (cp[cnt] != usrc[cnt]) + break; + + if (cnt == nhere) + { + /* Found it. */ + *cpp += nhere; + return i; + } + + /* Up to the next entry. */ + cp += nhere; + if (!LOCFILE_ALIGNED_P (1 + nhere)) + cp += LOCFILE_ALIGN - (1 + nhere) % LOCFILE_ALIGN; + } + else + { + /* This is a range of characters. First decide whether the + current byte sequence lies in the range. */ + size_t cnt; + size_t offset = 0; + + for (cnt = 0; cnt < nhere && cnt < len; ++cnt) + if (cp[cnt] != usrc[cnt]) + break; + + if (cnt != nhere) + { + if (cnt == len || cp[cnt] > usrc[cnt]) + { + /* Cannot be in this range. */ + cp += 2 * nhere; + if (!LOCFILE_ALIGNED_P (1 + 2 * nhere)) + cp += (LOCFILE_ALIGN + - (1 + 2 * nhere) % LOCFILE_ALIGN); + continue; + } + + /* Test against the end of the range. */ + for (cnt = 0; cnt < nhere; ++cnt) + if (cp[nhere + cnt] != usrc[cnt]) + break; + + if (cnt != nhere && cp[nhere + cnt] < usrc[cnt]) + { + /* Cannot be in this range. */ + cp += 2 * nhere; + if (!LOCFILE_ALIGNED_P (1 + 2 * nhere)) + cp += (LOCFILE_ALIGN + - (1 + 2 * nhere) % LOCFILE_ALIGN); + continue; + } + + /* This range matches the next characters. Now find + the offset in the indirect table. */ + for (cnt = 0; cp[cnt] == usrc[cnt]; ++cnt); + + do + { + offset <<= 8; + /* With GCC 7 when compiling with -Os the compiler + warns that seq1.back_us and seq2.back_us, which + become usrc, might be used uninitialized. This + is impossible for the same reason as described + above. */ + offset += usrc[cnt] - cp[cnt]; + } + while (++cnt < nhere); + } + + *cpp += nhere; + return indirect[-i + offset]; + } + } + + /* NOTREACHED */ + return 0x43219876; +} + diff --git a/src/locale/strxfrm_l.c b/src/locale/strxfrm_l.c new file mode 100644 index 00000000..198b59b3 --- /dev/null +++ b/src/locale/strxfrm_l.c @@ -0,0 +1,641 @@ +/* Copyright (C) 1995-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Ulrich Drepper <drepper@gnu.org>, 1995. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "strxfrm.h" + +#include <assert.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <sys/param.h> + +#include "esp_log.h" + +#include "weight.h" + +static const char kTag[] = "strxfrm"; + +#ifndef STRING_TYPE +#define STRING_TYPE char +#define USTRING_TYPE unsigned char +#define STRLEN strlen +#define STPNCPY stpncpy +#define L(arg) arg +#endif + +#define CONCAT(a, b) CONCAT1(a, b) +#define CONCAT1(a, b) a##b + +/* Maximum string size that is calculated with cached indices. Right now this + is an arbitrary value open to optimizations. SMALL_STR_SIZE * 4 has to be + lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. */ +#define SMALL_STR_SIZE 4095 + +/* We know three kinds of collation sorting rules. */ +enum coll_sort_rule { + illegal_0__, + sort_forward, + sort_backward, + illegal_3__, + sort_position, + sort_forward_position, + sort_backward_position, + sort_mask +}; + +enum collate_element { + COLLATE_NRULES = 0, + COLLATE_RULESETS, + COLLATE_TABLEMB, + COLLATE_WEIGHTMB, + COLLATE_EXTRAMB, + COLLATE_INDIRECTMB, + COLLATE_GAP1, + COLLATE_GAP2, + COLLATE_GAP3, + COLLATE_TABLEWC, + COLLATE_WEIGHTWC, + COLLATE_EXTRAWC, + COLLATE_INDIRECTWC, + COLLATE_SYMB_HASH_SIZEMB, + COLLATE_SYMB_TABLEMB, + COLLATE_SYMB_EXTRAMB, + COLLATE_COLLSEQMB, + COLLATE_COLLSEQWC, + COLLATE_CODESET, + + // Not a real element; used to know how many elements there are. + COLLATE_LAST +}; +typedef enum collate_element collate_element_t; + +bool parse_locale_data(const void* raw_data, size_t size, locale_data_t* out) { + const struct { + unsigned int magic; + unsigned int nstrings; + unsigned int strindex[0]; + }* const header = raw_data; + + if (header->magic != 0x20051017) { + ESP_LOGE(kTag, "file magic incorrect (was %x)", header->magic); + return false; + } + + if (sizeof(*header) + header->nstrings * sizeof(unsigned int) >= size) { + ESP_LOGE(kTag, "file was too small to contain header"); + return false; + } + + if (header->nstrings != COLLATE_LAST) { + ESP_LOGE(kTag, "file has incorrect number of elements (was %u, wanted %u)", + header->nstrings, COLLATE_LAST); + return false; + } + + // The LC_COLLATE partition appears to contain data in the correct shape. + // Pull out pointers to the various attributes it contains. + const void* offsets[COLLATE_LAST]; + for (size_t i = 0; i < header->nstrings; i++) { + size_t offset = header->strindex[i]; + if (offset > size) { + ESP_LOGE(kTag, "element offset (%u) exceeds file size", offset); + return false; + } + offsets[i] = (raw_data + offset); + } + + // Now parse those pointers into the output struct. + out->nrules = *(const unsigned int*)offsets[COLLATE_NRULES]; + + out->rulesets = (unsigned char*)offsets[COLLATE_RULESETS]; + out->table = (int32_t*)offsets[COLLATE_TABLEMB]; + out->weights = (unsigned char*)offsets[COLLATE_WEIGHTMB]; + out->extra = (unsigned char*)offsets[COLLATE_EXTRAMB]; + out->indirect = (int32_t*)offsets[COLLATE_INDIRECTMB]; + + assert(((uintptr_t)out->table) % __alignof__(out->table[0]) == 0); + assert(((uintptr_t)out->weights) % __alignof__(out->weights[0]) == 0); + assert(((uintptr_t)out->extra) % __alignof__(out->extra[0]) == 0); + assert(((uintptr_t)out->indirect) % __alignof__(out->indirect[0]) == 0); + + return true; +} + +/* We need UTF-8 encoding of numbers. */ +static int utf8_encode(char* buf, int val) { + int retval; + + if (val < 0x80) { + *buf++ = (char)val; + retval = 1; + } else { + int step; + + for (step = 2; step < 6; ++step) + if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0) + break; + retval = step; + + *buf = (unsigned char)(~0xff >> step); + --step; + do { + buf[step] = 0x80 | (val & 0x3f); + val >>= 6; + } while (--step > 0); + *buf |= val; + } + + return retval; +} + +/* Find next weight and rule index. Inlined since called for every char. */ +static __always_inline size_t find_idx(const USTRING_TYPE** us, + int32_t* weight_idx, + unsigned char* rule_idx, + const locale_data_t* l_data, + const int pass) { + int32_t tmp = findidx(l_data->table, l_data->indirect, l_data->extra, us, -1); + *rule_idx = tmp >> 24; + int32_t idx = tmp & 0xffffff; + size_t len = l_data->weights[idx++]; + + /* Skip over indices of previous levels. */ + for (int i = 0; i < pass; i++) { + idx += len; + len = l_data->weights[idx++]; + } + + *weight_idx = idx; + return len; +} + +static int find_position(const USTRING_TYPE* us, + const locale_data_t* l_data, + const int pass) { + int32_t weight_idx; + unsigned char rule_idx; + const USTRING_TYPE* usrc = us; + + find_idx(&usrc, &weight_idx, &rule_idx, l_data, pass); + return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position; +} + +/* Do the transformation. */ +static size_t do_xfrm(const USTRING_TYPE* usrc, + STRING_TYPE* dest, + size_t n, + const locale_data_t* l_data) { + int32_t weight_idx; + unsigned char rule_idx; + uint_fast32_t pass; + size_t needed = 0; + size_t last_needed; + + /* Now the passes over the weights. */ + for (pass = 0; pass < l_data->nrules; ++pass) { + size_t backw_len = 0; + last_needed = needed; + const USTRING_TYPE* cur = usrc; + const USTRING_TYPE* backw_start = NULL; + + /* We assume that if a rule has defined `position' in one section + this is true for all of them. */ + int position = find_position(cur, l_data, pass); + + if (position == 0) { + while (*cur != L('\0')) { + const USTRING_TYPE* pos = cur; + size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass); + int rule = l_data->rulesets[rule_idx * l_data->nrules + pass]; + + if ((rule & sort_forward) != 0) { + /* Handle the pushed backward sequence. */ + if (backw_start != NULL) { + for (size_t i = backw_len; i > 0;) { + int32_t weight_idx; + unsigned char rule_idx; + size_t len = + find_idx(&backw_start, &weight_idx, &rule_idx, l_data, pass); + if (needed + i < n) + for (size_t j = len; j > 0; j--) + dest[needed + i - j] = l_data->weights[weight_idx++]; + + i -= len; + } + + needed += backw_len; + backw_start = NULL; + backw_len = 0; + } + + /* Now handle the forward element. */ + if (needed + len < n) + while (len-- > 0) + dest[needed++] = l_data->weights[weight_idx++]; + else + /* No more characters fit into the buffer. */ + needed += len; + } else { + /* Remember start of the backward sequence & track length. */ + if (backw_start == NULL) + backw_start = pos; + backw_len += len; + } + } + + /* Handle the pushed backward sequence. */ + if (backw_start != NULL) { + for (size_t i = backw_len; i > 0;) { + size_t len = + find_idx(&backw_start, &weight_idx, &rule_idx, l_data, pass); + if (needed + i < n) + for (size_t j = len; j > 0; j--) + dest[needed + i - j] = l_data->weights[weight_idx++]; + + i -= len; + } + + needed += backw_len; + } + } else { + int val = 1; + char buf[7]; + size_t buflen; + size_t i; + + while (*cur != L('\0')) { + const USTRING_TYPE* pos = cur; + size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass); + int rule = l_data->rulesets[rule_idx * l_data->nrules + pass]; + + if ((rule & sort_forward) != 0) { + /* Handle the pushed backward sequence. */ + if (backw_start != NULL) { + for (size_t p = backw_len; p > 0; p--) { + size_t len; + int32_t weight_idx; + unsigned char rule_idx; + const USTRING_TYPE* backw_cur = backw_start; + + /* To prevent a warning init the used vars. */ + len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); + + for (i = 1; i < p; i++) + len = + find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); + + if (len != 0) { + buflen = utf8_encode(buf, val); + if (needed + buflen + len < n) { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = l_data->weights[weight_idx + i]; + } + needed += buflen + len; + val = 1; + } else + ++val; + } + + backw_start = NULL; + backw_len = 0; + } + + /* Now handle the forward element. */ + if (len != 0) { + buflen = utf8_encode(buf, val); + if (needed + buflen + len < n) { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = l_data->weights[weight_idx + i]; + } + needed += buflen + len; + val = 1; + } else + ++val; + } else { + /* Remember start of the backward sequence & track length. */ + if (backw_start == NULL) + backw_start = pos; + backw_len++; + } + } + + /* Handle the pushed backward sequence. */ + if (backw_start != NULL) { + for (size_t p = backw_len; p > 0; p--) { + size_t len; + int32_t weight_idx; + unsigned char rule_idx; + const USTRING_TYPE* backw_cur = backw_start; + + /* To prevent a warning init the used vars. */ + len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); + + for (i = 1; i < p; i++) + len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); + + if (len != 0) { + buflen = utf8_encode(buf, val); + if (needed + buflen + len < n) { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = l_data->weights[weight_idx + i]; + } + needed += buflen + len; + val = 1; + } else + ++val; + } + } + } + + /* Finally store the byte to separate the passes or terminate + the string. */ + if (needed < n) + dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0'); + ++needed; + } + + /* This is a little optimization: many collation specifications have + a `position' rule at the end and if no non-ignored character + is found the last \1 byte is immediately followed by a \0 byte + signalling this. We can avoid the \1 byte(s). */ + if (needed > 2 && needed == last_needed + 1) { + /* Remove the \1 byte. */ + if (--needed <= n) + dest[needed - 1] = L('\0'); + } + + /* Return the number of bytes/words we need, but don't count the NUL + byte/word at the end. */ + return needed - 1; +} + +/* Do the transformation using weight-index and rule cache. */ +static size_t do_xfrm_cached(STRING_TYPE* dest, + size_t n, + const locale_data_t* l_data, + size_t idxmax, + int32_t* idxarr, + const unsigned char* rulearr) { + uint_fast32_t nrules = l_data->nrules; + unsigned char* rulesets = l_data->rulesets; + USTRING_TYPE* weights = l_data->weights; + uint_fast32_t pass; + size_t needed = 0; + size_t last_needed; + size_t idxcnt; + + /* Now the passes over the weights. */ + for (pass = 0; pass < nrules; ++pass) { + size_t backw_stop = ~0ul; + int rule = rulesets[rulearr[0] * nrules + pass]; + /* We assume that if a rule has defined `position' in one section + this is true for all of them. */ + int position = rule & sort_position; + + last_needed = needed; + if (position == 0) { + for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) { + if ((rule & sort_forward) != 0) { + size_t len; + + if (backw_stop != ~0ul) { + /* Handle the pushed elements now. */ + size_t backw; + + for (backw = idxcnt; backw > backw_stop;) { + --backw; + len = weights[idxarr[backw]++]; + + if (needed + len < n) + while (len-- > 0) + dest[needed++] = weights[idxarr[backw]++]; + else { + /* No more characters fit into the buffer. */ + needed += len; + idxarr[backw] += len; + } + } + + backw_stop = ~0ul; + } + + /* Now handle the forward element. */ + len = weights[idxarr[idxcnt]++]; + if (needed + len < n) + while (len-- > 0) + dest[needed++] = weights[idxarr[idxcnt]++]; + else { + /* No more characters fit into the buffer. */ + needed += len; + idxarr[idxcnt] += len; + } + } else { + /* Remember where the backwards series started. */ + if (backw_stop == ~0ul) + backw_stop = idxcnt; + } + + rule = rulesets[rulearr[idxcnt + 1] * nrules + pass]; + } + + if (backw_stop != ~0ul) { + /* Handle the pushed elements now. */ + size_t backw; + + backw = idxcnt; + while (backw > backw_stop) { + size_t len = weights[idxarr[--backw]++]; + + if (needed + len < n) + while (len-- > 0) + dest[needed++] = weights[idxarr[backw]++]; + else { + /* No more characters fit into the buffer. */ + needed += len; + idxarr[backw] += len; + } + } + } + } else { + int val = 1; + char buf[7]; + size_t buflen; + size_t i; + + for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) { + if ((rule & sort_forward) != 0) { + size_t len; + + if (backw_stop != ~0ul) { + /* Handle the pushed elements now. */ + size_t backw; + + for (backw = idxcnt; backw > backw_stop;) { + --backw; + len = weights[idxarr[backw]++]; + if (len != 0) { + buflen = utf8_encode(buf, val); + if (needed + buflen + len < n) { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = weights[idxarr[backw] + i]; + } + needed += buflen + len; + idxarr[backw] += len; + val = 1; + } else + ++val; + } + + backw_stop = ~0ul; + } + + /* Now handle the forward element. */ + len = weights[idxarr[idxcnt]++]; + if (len != 0) { + buflen = utf8_encode(buf, val); + if (needed + buflen + len < n) { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = weights[idxarr[idxcnt] + i]; + } + needed += buflen + len; + idxarr[idxcnt] += len; + val = 1; + } else + /* Note that we don't have to increment `idxarr[idxcnt]' + since the length is zero. */ + ++val; + } else { + /* Remember where the backwards series started. */ + if (backw_stop == ~0ul) + backw_stop = idxcnt; + } + + rule = rulesets[rulearr[idxcnt + 1] * nrules + pass]; + } + + if (backw_stop != ~0ul) { + /* Handle the pushed elements now. */ + size_t backw; + + backw = idxmax - 1; + while (backw > backw_stop) { + size_t len = weights[idxarr[--backw]++]; + if (len != 0) { + buflen = utf8_encode(buf, val); + if (needed + buflen + len < n) { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = weights[idxarr[backw] + i]; + } + needed += buflen + len; + idxarr[backw] += len; + val = 1; + } else + ++val; + } + } + } + + /* Finally store the byte to separate the passes or terminate + the string. */ + if (needed < n) + dest[needed] = pass + 1 < nrules ? L('\1') : L('\0'); + ++needed; + } + + /* This is a little optimization: many collation specifications have + a `position' rule at the end and if no non-ignored character + is found the last \1 byte is immediately followed by a \0 byte + signalling this. We can avoid the \1 byte(s). */ + if (needed > 2 && needed == last_needed + 1) { + /* Remove the \1 byte. */ + if (--needed <= n) + dest[needed - 1] = L('\0'); + } + + /* Return the number of bytes/words we need, but don't count the NUL + byte/word at the end. */ + return needed - 1; +} + +size_t glib_strxfrm(char* dest, + const char* src, + size_t n, + locale_data_t* locale) { + /* Handle byte comparison case. */ + if (locale->nrules == 0) { + size_t srclen = strlen(src); + + if (n != 0) { + strncpy(dest, src, MIN(srclen + 1, n)); + } + + return srclen; + } + + /* Handle an empty string, code hereafter relies on strlen (src) > 0. */ + if (*src == L('\0')) { + if (n != 0) + *dest = L('\0'); + return 0; + } + + /* We need the elements of the string as unsigned values since they + are used as indeces. */ + const USTRING_TYPE* usrc = (const USTRING_TYPE*)src; + + /* Allocate cache for small strings on the stack and fill it with weight and + rule indices. If the cache size is not sufficient, continue with the + uncached xfrm version. */ + size_t idxmax = 0; + const USTRING_TYPE* cur = usrc; + int32_t* idxarr = alloca(SMALL_STR_SIZE * sizeof(int32_t)); + unsigned char* rulearr = alloca(SMALL_STR_SIZE + 1); + + do { + int32_t tmp = + findidx(locale->table, locale->indirect, locale->extra, &cur, -1); + rulearr[idxmax] = tmp >> 24; + idxarr[idxmax] = tmp & 0xffffff; + + ++idxmax; + } while (*cur != L('\0') && idxmax < SMALL_STR_SIZE); + + /* This element is only read, the value never used but to determine + another value which then is ignored. */ + rulearr[idxmax] = '\0'; + + /* Do the transformation. */ + if (*cur == L('\0')) + return do_xfrm_cached(dest, n, locale, idxmax, idxarr, rulearr); + else + return do_xfrm(usrc, dest, n, locale); +} |
