add locale-aware colation to db indexes

author: jacqueline <me@jacqueline.id.au> 2023-10-30 15:47:38 +1100
committer: jacqueline <me@jacqueline.id.au> 2023-10-30 15:52:26 +1100
commit: b58c08150853b8055093dc116d407ffd543f8ec8 (patch)
tree: 9b82d130d2c833fc234bca0f12f0fba1b7202c4d /src/locale
parent: 18d90051c9145ead86d4da701a2bc54f45e4fb66 (diff)
download: tangara-fw-b58c08150853b8055093dc116d407ffd543f8ec8.tar.gz
6 files changed, 988 insertions, 0 deletions
diff --git a/src/locale/CMakeLists.txt b/src/locale/CMakeLists.txt
new file mode 100644
index 00000000..627ca314
--- /dev/null
+++ b/src/locale/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Copyright 2023 jacqueline <me@jacqueline.id.au>
+#
+# SPDX-License-Identifier: GPL-3.0-only
+
+idf_component_register(
+  SRCS "collation.cpp" "strxfrm_l.c"
+  INCLUDE_DIRS "include"
+  PRIV_INCLUDE_DIRS "priv_include"
+  REQUIRES "span" "esp_partition" "spi_flash")
+
+target_compile_options(${COMPONENT_LIB} PRIVATE ${EXTRA_WARNINGS})
diff --git a/src/locale/collation.cpp b/src/locale/collation.cpp
new file mode 100644
index 00000000..f5e8272a
--- /dev/null
+++ b/src/locale/collation.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2023 jacqueline <me@jacqueline.id.au>
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#include "collation.hpp"
+
+#include <stdint.h>
+#include <memory>
+
+#include "esp_flash_spi_init.h"
+#include "esp_log.h"
+#include "esp_partition.h"
+#include "hal/spi_flash_types.h"
+#include "spi_flash_mmap.h"
+#include "strxfrm.h"
+
+namespace locale {
+
+static constexpr char kTag[] = "collate";
+
+static constexpr esp_partition_type_t kLocalePartitionType =
+    static_cast<esp_partition_type_t>(0x40);
+static constexpr esp_partition_subtype_t kLcCollateSubtype =
+    static_cast<esp_partition_subtype_t>(0x0);
+
+auto CreateCollator() -> std::unique_ptr<ICollator> {
+  std::unique_ptr<ICollator> ret{GLibCollator::create()};
+  if (!ret) {
+    ret.reset(new NoopCollator());
+  }
+  return ret;
+}
+
+auto GLibCollator::create() -> GLibCollator* {
+  uint32_t data_pages = spi_flash_mmap_get_free_pages(SPI_FLASH_MMAP_DATA);
+  ESP_LOGI(kTag, "free data pages: %lu (%lu KiB)", data_pages, data_pages * 64);
+
+  const esp_partition_t* partition =
+      esp_partition_find_first(kLocalePartitionType, kLcCollateSubtype, NULL);
+  if (partition == NULL) {
+    ESP_LOGW(kTag, "no LC_COLLATE partition found");
+  }
+
+  ESP_LOGI(kTag, "found LC_COLLATE partition of size %lu", partition->size);
+  if (partition->size > data_pages * 64 * 1024) {
+    ESP_LOGE(kTag, "not enough free pages to map LC_COLLATE partition!");
+    return nullptr;
+  }
+
+  const void* region;
+  esp_partition_mmap_handle_t handle;
+  esp_err_t err = esp_partition_mmap(partition, 0, partition->size,
+                                     ESP_PARTITION_MMAP_DATA, &region, &handle);
+  if (err != ESP_OK) {
+    ESP_LOGE(kTag, "LC_COLLATE mmap failed");
+    return nullptr;
+  }
+
+  auto data = std::make_unique<locale_data_t>();
+  if (!parse_locale_data(region, partition->size, data.get())) {
+    ESP_LOGE(kTag, "parsing locale data failed");
+    esp_partition_munmap(handle);
+    return nullptr;
+  }
+
+  return new GLibCollator(handle, std::move(data));
+}
+
+GLibCollator::GLibCollator(const esp_partition_mmap_handle_t handle,
+                           std::unique_ptr<locale_data_t> locale)
+    : handle_(handle), locale_data_(std::move(locale)) {}
+
+GLibCollator::~GLibCollator() {
+  esp_partition_munmap(handle_);
+}
+
+auto GLibCollator::Transform(const std::string& in) -> std::string {
+  char dest[256];
+  size_t size = glib_strxfrm(dest, in.c_str(), 256, locale_data_.get());
+  if (size >= 256) {
+    char* larger_dest = new char[size + 1]{0};
+    glib_strxfrm(larger_dest, in.c_str(), size, locale_data_.get());
+    std::string out{larger_dest, size};
+    delete[] larger_dest;
+    return out;
+  }
+  return {dest, size};
+}
+
+}  // namespace locale
diff --git a/src/locale/include/collation.hpp b/src/locale/include/collation.hpp
new file mode 100644
index 00000000..e8b6fa17
--- /dev/null
+++ b/src/locale/include/collation.hpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2023 jacqueline <me@jacqueline.id.au>
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <memory>
+#include <string>
+
+#include "esp_partition.h"
+#include "span.hpp"
+
+#include "strxfrm.h"
+
+namespace locale {
+
+class ICollator {
+ public:
+  virtual ~ICollator() {}
+
+  virtual auto Transform(const std::string&) -> std::string = 0;
+};
+
+class NoopCollator : public ICollator {
+ public:
+  auto Transform(const std::string& in) -> std::string override { return in; }
+};
+
+auto CreateCollator() -> std::unique_ptr<ICollator>;
+
+class GLibCollator : public ICollator {
+ public:
+  static auto create() -> GLibCollator*;
+  ~GLibCollator();
+
+  auto Transform(const std::string& in) -> std::string override;
+
+ private:
+  GLibCollator(const esp_partition_mmap_handle_t,
+               std::unique_ptr<locale_data_t>);
+
+  const esp_partition_mmap_handle_t handle_;
+  std::unique_ptr<locale_data_t> locale_data_;
+};
+
+}  // namespace locale
diff --git a/src/locale/include/strxfrm.h b/src/locale/include/strxfrm.h
new file mode 100644
index 00000000..a40f7596
--- /dev/null
+++ b/src/locale/include/strxfrm.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2023 jacqueline <me@jacqueline.id.au>
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  uint_fast32_t nrules;
+  unsigned char* rulesets;
+  unsigned char* weights;
+  int32_t* table;
+  unsigned char* extra;
+  int32_t* indirect;
+} locale_data_t;
+
+bool parse_locale_data(const void* raw_data, size_t size, locale_data_t* out);
+
+size_t glib_strxfrm(char* dest,
+                    const char* src,
+                    size_t n,
+                    locale_data_t* locale);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/locale/priv_include/weight.h b/src/locale/priv_include/weight.h
new file mode 100644
index 00000000..34465d90
--- /dev/null
+++ b/src/locale/priv_include/weight.h
@@ -0,0 +1,160 @@
+/* Copyright (C) 1996-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Written by Ulrich Drepper, <drepper@cygnus.com>.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#pragma once
+
+#include <stdint.h>
+#include <stddef.h>
+
+/* This alignment is used for 32-bit integers in locale files, both
+   those that are explicitly int32_t or uint32_t and those that are
+   wchar_t, regardless of the (possibly smaller) alignment required
+   for such integers on a particular host.  */
+#define LOCFILE_ALIGN		sizeof (int32_t)
+#define LOCFILE_ALIGN_MASK	(LOCFILE_ALIGN - 1)
+#define LOCFILE_ALIGN_UP(x)	(((x) + LOCFILE_ALIGN - 1)	\
+				 & ~LOCFILE_ALIGN_MASK)
+#define LOCFILE_ALIGNED_P(x)	(((x) & LOCFILE_ALIGN_MASK) == 0)
+
+/* Find index of weight.  */
+static inline int32_t __attribute__ ((always_inline))
+findidx (const int32_t *table,
+	 const int32_t *indirect,
+	 const unsigned char *extra,
+	 const unsigned char **cpp, size_t len)
+{
+  int_fast32_t i = table[*(*cpp)++];
+  const unsigned char *cp;
+  const unsigned char *usrc;
+
+  if (i >= 0)
+    /* This is an index into the weight table.  Cool.  */
+    return i;
+
+  /* Oh well, more than one sequence starting with this byte.
+     Search for the correct one.  */
+  cp = &extra[-i];
+  usrc = *cpp;
+  --len;
+  while (1)
+    {
+      size_t nhere;
+
+      /* The first thing is the index.  */
+      i = *((const int32_t *) cp);
+      cp += sizeof (int32_t);
+
+      /* Next is the length of the byte sequence.  These are always
+	 short byte sequences so there is no reason to call any
+	 function (even if they are inlined).  */
+      nhere = *cp++;
+
+      if (i >= 0)
+	{
+	  /* It is a single character.  If it matches we found our
+	     index.  Note that at the end of each list there is an
+	     entry of length zero which represents the single byte
+	     sequence.  The first (and here only) byte was tested
+	     already.  */
+	  size_t cnt;
+
+	  /* With GCC 5.3 when compiling with -Os the compiler warns
+	     that seq2.back_us, which becomes usrc, might be used
+	     uninitialized.  This can't be true because we pass a length
+	     of -1 for len at the same time which means that this loop
+	     never executes.  */
+	  for (cnt = 0; cnt < nhere && cnt < len; ++cnt)
+	    if (cp[cnt] != usrc[cnt])
+	      break;
+
+	  if (cnt == nhere)
+	    {
+	      /* Found it.  */
+	      *cpp += nhere;
+	      return i;
+	    }
+
+	  /* Up to the next entry.  */
+	  cp += nhere;
+	  if (!LOCFILE_ALIGNED_P (1 + nhere))
+	    cp += LOCFILE_ALIGN - (1 + nhere) % LOCFILE_ALIGN;
+	}
+      else
+	{
+	  /* This is a range of characters.  First decide whether the
+	     current byte sequence lies in the range.  */
+	  size_t cnt;
+	  size_t offset = 0;
+
+	  for (cnt = 0; cnt < nhere && cnt < len; ++cnt)
+	    if (cp[cnt] != usrc[cnt])
+	      break;
+
+	  if (cnt != nhere)
+	    {
+	      if (cnt == len || cp[cnt] > usrc[cnt])
+		{
+		  /* Cannot be in this range.  */
+		  cp += 2 * nhere;
+		  if (!LOCFILE_ALIGNED_P (1 + 2 * nhere))
+		    cp += (LOCFILE_ALIGN
+			   - (1 + 2 * nhere) % LOCFILE_ALIGN);
+		  continue;
+		}
+
+	      /* Test against the end of the range.  */
+	      for (cnt = 0; cnt < nhere; ++cnt)
+		if (cp[nhere + cnt] != usrc[cnt])
+		  break;
+
+	      if (cnt != nhere && cp[nhere + cnt] < usrc[cnt])
+		{
+		  /* Cannot be in this range.  */
+		  cp += 2 * nhere;
+		  if (!LOCFILE_ALIGNED_P (1 + 2 * nhere))
+		    cp += (LOCFILE_ALIGN
+			   - (1 + 2 * nhere) % LOCFILE_ALIGN);
+		  continue;
+		}
+
+	      /* This range matches the next characters.  Now find
+		 the offset in the indirect table.  */
+	      for (cnt = 0; cp[cnt] == usrc[cnt]; ++cnt);
+
+	      do
+		{
+		  offset <<= 8;
+		  /* With GCC 7 when compiling with -Os the compiler
+		     warns that seq1.back_us and seq2.back_us, which
+		     become usrc, might be used uninitialized.  This
+		     is impossible for the same reason as described
+		     above.  */
+		  offset += usrc[cnt] - cp[cnt];
+		}
+	      while (++cnt < nhere);
+	    }
+
+	  *cpp += nhere;
+	  return indirect[-i + offset];
+	}
+    }
+
+  /* NOTREACHED */
+  return 0x43219876;
+}
+
diff --git a/src/locale/strxfrm_l.c b/src/locale/strxfrm_l.c
new file mode 100644
index 00000000..198b59b3
--- /dev/null
+++ b/src/locale/strxfrm_l.c
@@ -0,0 +1,641 @@
+/* Copyright (C) 1995-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Written by Ulrich Drepper <drepper@gnu.org>, 1995.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "strxfrm.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/param.h>
+
+#include "esp_log.h"
+
+#include "weight.h"
+
+static const char kTag[] = "strxfrm";
+
+#ifndef STRING_TYPE
+#define STRING_TYPE char
+#define USTRING_TYPE unsigned char
+#define STRLEN strlen
+#define STPNCPY stpncpy
+#define L(arg) arg
+#endif
+
+#define CONCAT(a, b) CONCAT1(a, b)
+#define CONCAT1(a, b) a##b
+
+/* Maximum string size that is calculated with cached indices.  Right now this
+   is an arbitrary value open to optimizations.  SMALL_STR_SIZE * 4 has to be
+   lower than __MAX_ALLOCA_CUTOFF.  Keep localedata/xfrm-test.c in sync.  */
+#define SMALL_STR_SIZE 4095
+
+/* We know three kinds of collation sorting rules.  */
+enum coll_sort_rule {
+  illegal_0__,
+  sort_forward,
+  sort_backward,
+  illegal_3__,
+  sort_position,
+  sort_forward_position,
+  sort_backward_position,
+  sort_mask
+};
+
+enum collate_element {
+  COLLATE_NRULES = 0,
+  COLLATE_RULESETS,
+  COLLATE_TABLEMB,
+  COLLATE_WEIGHTMB,
+  COLLATE_EXTRAMB,
+  COLLATE_INDIRECTMB,
+  COLLATE_GAP1,
+  COLLATE_GAP2,
+  COLLATE_GAP3,
+  COLLATE_TABLEWC,
+  COLLATE_WEIGHTWC,
+  COLLATE_EXTRAWC,
+  COLLATE_INDIRECTWC,
+  COLLATE_SYMB_HASH_SIZEMB,
+  COLLATE_SYMB_TABLEMB,
+  COLLATE_SYMB_EXTRAMB,
+  COLLATE_COLLSEQMB,
+  COLLATE_COLLSEQWC,
+  COLLATE_CODESET,
+
+  // Not a real element; used to know how many elements there are.
+  COLLATE_LAST
+};
+typedef enum collate_element collate_element_t;
+
+bool parse_locale_data(const void* raw_data, size_t size, locale_data_t* out) {
+  const struct {
+    unsigned int magic;
+    unsigned int nstrings;
+    unsigned int strindex[0];
+  }* const header = raw_data;
+
+  if (header->magic != 0x20051017) {
+    ESP_LOGE(kTag, "file magic incorrect (was %x)", header->magic);
+    return false;
+  }
+
+  if (sizeof(*header) + header->nstrings * sizeof(unsigned int) >= size) {
+    ESP_LOGE(kTag, "file was too small to contain header");
+    return false;
+  }
+
+  if (header->nstrings != COLLATE_LAST) {
+    ESP_LOGE(kTag, "file has incorrect number of elements (was %u, wanted %u)",
+             header->nstrings, COLLATE_LAST);
+    return false;
+  }
+
+  // The LC_COLLATE partition appears to contain data in the correct shape.
+  // Pull out pointers to the various attributes it contains.
+  const void* offsets[COLLATE_LAST];
+  for (size_t i = 0; i < header->nstrings; i++) {
+    size_t offset = header->strindex[i];
+    if (offset > size) {
+      ESP_LOGE(kTag, "element offset (%u) exceeds file size", offset);
+      return false;
+    }
+    offsets[i] = (raw_data + offset);
+  }
+
+  // Now parse those pointers into the output struct.
+  out->nrules = *(const unsigned int*)offsets[COLLATE_NRULES];
+
+  out->rulesets = (unsigned char*)offsets[COLLATE_RULESETS];
+  out->table = (int32_t*)offsets[COLLATE_TABLEMB];
+  out->weights = (unsigned char*)offsets[COLLATE_WEIGHTMB];
+  out->extra = (unsigned char*)offsets[COLLATE_EXTRAMB];
+  out->indirect = (int32_t*)offsets[COLLATE_INDIRECTMB];
+
+  assert(((uintptr_t)out->table) % __alignof__(out->table[0]) == 0);
+  assert(((uintptr_t)out->weights) % __alignof__(out->weights[0]) == 0);
+  assert(((uintptr_t)out->extra) % __alignof__(out->extra[0]) == 0);
+  assert(((uintptr_t)out->indirect) % __alignof__(out->indirect[0]) == 0);
+
+  return true;
+}
+
+/* We need UTF-8 encoding of numbers.  */
+static int utf8_encode(char* buf, int val) {
+  int retval;
+
+  if (val < 0x80) {
+    *buf++ = (char)val;
+    retval = 1;
+  } else {
+    int step;
+
+    for (step = 2; step < 6; ++step)
+      if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
+        break;
+    retval = step;
+
+    *buf = (unsigned char)(~0xff >> step);
+    --step;
+    do {
+      buf[step] = 0x80 | (val & 0x3f);
+      val >>= 6;
+    } while (--step > 0);
+    *buf |= val;
+  }
+
+  return retval;
+}
+
+/* Find next weight and rule index.  Inlined since called for every char.  */
+static __always_inline size_t find_idx(const USTRING_TYPE** us,
+                                       int32_t* weight_idx,
+                                       unsigned char* rule_idx,
+                                       const locale_data_t* l_data,
+                                       const int pass) {
+  int32_t tmp = findidx(l_data->table, l_data->indirect, l_data->extra, us, -1);
+  *rule_idx = tmp >> 24;
+  int32_t idx = tmp & 0xffffff;
+  size_t len = l_data->weights[idx++];
+
+  /* Skip over indices of previous levels.  */
+  for (int i = 0; i < pass; i++) {
+    idx += len;
+    len = l_data->weights[idx++];
+  }
+
+  *weight_idx = idx;
+  return len;
+}
+
+static int find_position(const USTRING_TYPE* us,
+                         const locale_data_t* l_data,
+                         const int pass) {
+  int32_t weight_idx;
+  unsigned char rule_idx;
+  const USTRING_TYPE* usrc = us;
+
+  find_idx(&usrc, &weight_idx, &rule_idx, l_data, pass);
+  return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
+}
+
+/* Do the transformation.  */
+static size_t do_xfrm(const USTRING_TYPE* usrc,
+                      STRING_TYPE* dest,
+                      size_t n,
+                      const locale_data_t* l_data) {
+  int32_t weight_idx;
+  unsigned char rule_idx;
+  uint_fast32_t pass;
+  size_t needed = 0;
+  size_t last_needed;
+
+  /* Now the passes over the weights.  */
+  for (pass = 0; pass < l_data->nrules; ++pass) {
+    size_t backw_len = 0;
+    last_needed = needed;
+    const USTRING_TYPE* cur = usrc;
+    const USTRING_TYPE* backw_start = NULL;
+
+    /* We assume that if a rule has defined `position' in one section
+      this is true for all of them.  */
+    int position = find_position(cur, l_data, pass);
+
+    if (position == 0) {
+      while (*cur != L('\0')) {
+        const USTRING_TYPE* pos = cur;
+        size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass);
+        int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
+
+        if ((rule & sort_forward) != 0) {
+          /* Handle the pushed backward sequence.  */
+          if (backw_start != NULL) {
+            for (size_t i = backw_len; i > 0;) {
+              int32_t weight_idx;
+              unsigned char rule_idx;
+              size_t len =
+                  find_idx(&backw_start, &weight_idx, &rule_idx, l_data, pass);
+              if (needed + i < n)
+                for (size_t j = len; j > 0; j--)
+                  dest[needed + i - j] = l_data->weights[weight_idx++];
+
+              i -= len;
+            }
+
+            needed += backw_len;
+            backw_start = NULL;
+            backw_len = 0;
+          }
+
+          /* Now handle the forward element.  */
+          if (needed + len < n)
+            while (len-- > 0)
+              dest[needed++] = l_data->weights[weight_idx++];
+          else
+            /* No more characters fit into the buffer.  */
+            needed += len;
+        } else {
+          /* Remember start of the backward sequence & track length.  */
+          if (backw_start == NULL)
+            backw_start = pos;
+          backw_len += len;
+        }
+      }
+
+      /* Handle the pushed backward sequence.  */
+      if (backw_start != NULL) {
+        for (size_t i = backw_len; i > 0;) {
+          size_t len =
+              find_idx(&backw_start, &weight_idx, &rule_idx, l_data, pass);
+          if (needed + i < n)
+            for (size_t j = len; j > 0; j--)
+              dest[needed + i - j] = l_data->weights[weight_idx++];
+
+          i -= len;
+        }
+
+        needed += backw_len;
+      }
+    } else {
+      int val = 1;
+      char buf[7];
+      size_t buflen;
+      size_t i;
+
+      while (*cur != L('\0')) {
+        const USTRING_TYPE* pos = cur;
+        size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass);
+        int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
+
+        if ((rule & sort_forward) != 0) {
+          /* Handle the pushed backward sequence.  */
+          if (backw_start != NULL) {
+            for (size_t p = backw_len; p > 0; p--) {
+              size_t len;
+              int32_t weight_idx;
+              unsigned char rule_idx;
+              const USTRING_TYPE* backw_cur = backw_start;
+
+              /* To prevent a warning init the used vars.  */
+              len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass);
+
+              for (i = 1; i < p; i++)
+                len =
+                    find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass);
+
+              if (len != 0) {
+                buflen = utf8_encode(buf, val);
+                if (needed + buflen + len < n) {
+                  for (i = 0; i < buflen; ++i)
+                    dest[needed + i] = buf[i];
+                  for (i = 0; i < len; ++i)
+                    dest[needed + buflen + i] = l_data->weights[weight_idx + i];
+                }
+                needed += buflen + len;
+                val = 1;
+              } else
+                ++val;
+            }
+
+            backw_start = NULL;
+            backw_len = 0;
+          }
+
+          /* Now handle the forward element.  */
+          if (len != 0) {
+            buflen = utf8_encode(buf, val);
+            if (needed + buflen + len < n) {
+              for (i = 0; i < buflen; ++i)
+                dest[needed + i] = buf[i];
+              for (i = 0; i < len; ++i)
+                dest[needed + buflen + i] = l_data->weights[weight_idx + i];
+            }
+            needed += buflen + len;
+            val = 1;
+          } else
+            ++val;
+        } else {
+          /* Remember start of the backward sequence & track length.  */
+          if (backw_start == NULL)
+            backw_start = pos;
+          backw_len++;
+        }
+      }
+
+      /* Handle the pushed backward sequence.  */
+      if (backw_start != NULL) {
+        for (size_t p = backw_len; p > 0; p--) {
+          size_t len;
+          int32_t weight_idx;
+          unsigned char rule_idx;
+          const USTRING_TYPE* backw_cur = backw_start;
+
+          /* To prevent a warning init the used vars.  */
+          len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass);
+
+          for (i = 1; i < p; i++)
+            len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass);
+
+          if (len != 0) {
+            buflen = utf8_encode(buf, val);
+            if (needed + buflen + len < n) {
+              for (i = 0; i < buflen; ++i)
+                dest[needed + i] = buf[i];
+              for (i = 0; i < len; ++i)
+                dest[needed + buflen + i] = l_data->weights[weight_idx + i];
+            }
+            needed += buflen + len;
+            val = 1;
+          } else
+            ++val;
+        }
+      }
+    }
+
+    /* Finally store the byte to separate the passes or terminate
+       the string.  */
+    if (needed < n)
+      dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
+    ++needed;
+  }
+
+  /* This is a little optimization: many collation specifications have
+     a `position' rule at the end and if no non-ignored character
+     is found the last \1 byte is immediately followed by a \0 byte
+     signalling this.  We can avoid the \1 byte(s).  */
+  if (needed > 2 && needed == last_needed + 1) {
+    /* Remove the \1 byte.  */
+    if (--needed <= n)
+      dest[needed - 1] = L('\0');
+  }
+
+  /* Return the number of bytes/words we need, but don't count the NUL
+     byte/word at the end.  */
+  return needed - 1;
+}
+
+/* Do the transformation using weight-index and rule cache.  */
+static size_t do_xfrm_cached(STRING_TYPE* dest,
+                             size_t n,
+                             const locale_data_t* l_data,
+                             size_t idxmax,
+                             int32_t* idxarr,
+                             const unsigned char* rulearr) {
+  uint_fast32_t nrules = l_data->nrules;
+  unsigned char* rulesets = l_data->rulesets;
+  USTRING_TYPE* weights = l_data->weights;
+  uint_fast32_t pass;
+  size_t needed = 0;
+  size_t last_needed;
+  size_t idxcnt;
+
+  /* Now the passes over the weights.  */
+  for (pass = 0; pass < nrules; ++pass) {
+    size_t backw_stop = ~0ul;
+    int rule = rulesets[rulearr[0] * nrules + pass];
+    /* We assume that if a rule has defined `position' in one section
+       this is true for all of them.  */
+    int position = rule & sort_position;
+
+    last_needed = needed;
+    if (position == 0) {
+      for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) {
+        if ((rule & sort_forward) != 0) {
+          size_t len;
+
+          if (backw_stop != ~0ul) {
+            /* Handle the pushed elements now.  */
+            size_t backw;
+
+            for (backw = idxcnt; backw > backw_stop;) {
+              --backw;
+              len = weights[idxarr[backw]++];
+
+              if (needed + len < n)
+                while (len-- > 0)
+                  dest[needed++] = weights[idxarr[backw]++];
+              else {
+                /* No more characters fit into the buffer.  */
+                needed += len;
+                idxarr[backw] += len;
+              }
+            }
+
+            backw_stop = ~0ul;
+          }
+
+          /* Now handle the forward element.  */
+          len = weights[idxarr[idxcnt]++];
+          if (needed + len < n)
+            while (len-- > 0)
+              dest[needed++] = weights[idxarr[idxcnt]++];
+          else {
+            /* No more characters fit into the buffer.  */
+            needed += len;
+            idxarr[idxcnt] += len;
+          }
+        } else {
+          /* Remember where the backwards series started.  */
+          if (backw_stop == ~0ul)
+            backw_stop = idxcnt;
+        }
+
+        rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
+      }
+
+      if (backw_stop != ~0ul) {
+        /* Handle the pushed elements now.  */
+        size_t backw;
+
+        backw = idxcnt;
+        while (backw > backw_stop) {
+          size_t len = weights[idxarr[--backw]++];
+
+          if (needed + len < n)
+            while (len-- > 0)
+              dest[needed++] = weights[idxarr[backw]++];
+          else {
+            /* No more characters fit into the buffer.  */
+            needed += len;
+            idxarr[backw] += len;
+          }
+        }
+      }
+    } else {
+      int val = 1;
+      char buf[7];
+      size_t buflen;
+      size_t i;
+
+      for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) {
+        if ((rule & sort_forward) != 0) {
+          size_t len;
+
+          if (backw_stop != ~0ul) {
+            /* Handle the pushed elements now.  */
+            size_t backw;
+
+            for (backw = idxcnt; backw > backw_stop;) {
+              --backw;
+              len = weights[idxarr[backw]++];
+              if (len != 0) {
+                buflen = utf8_encode(buf, val);
+                if (needed + buflen + len < n) {
+                  for (i = 0; i < buflen; ++i)
+                    dest[needed + i] = buf[i];
+                  for (i = 0; i < len; ++i)
+                    dest[needed + buflen + i] = weights[idxarr[backw] + i];
+                }
+                needed += buflen + len;
+                idxarr[backw] += len;
+                val = 1;
+              } else
+                ++val;
+            }
+
+            backw_stop = ~0ul;
+          }
+
+          /* Now handle the forward element.  */
+          len = weights[idxarr[idxcnt]++];
+          if (len != 0) {
+            buflen = utf8_encode(buf, val);
+            if (needed + buflen + len < n) {
+              for (i = 0; i < buflen; ++i)
+                dest[needed + i] = buf[i];
+              for (i = 0; i < len; ++i)
+                dest[needed + buflen + i] = weights[idxarr[idxcnt] + i];
+            }
+            needed += buflen + len;
+            idxarr[idxcnt] += len;
+            val = 1;
+          } else
+            /* Note that we don't have to increment `idxarr[idxcnt]'
+               since the length is zero.  */
+            ++val;
+        } else {
+          /* Remember where the backwards series started.  */
+          if (backw_stop == ~0ul)
+            backw_stop = idxcnt;
+        }
+
+        rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
+      }
+
+      if (backw_stop != ~0ul) {
+        /* Handle the pushed elements now.  */
+        size_t backw;
+
+        backw = idxmax - 1;
+        while (backw > backw_stop) {
+          size_t len = weights[idxarr[--backw]++];
+          if (len != 0) {
+            buflen = utf8_encode(buf, val);
+            if (needed + buflen + len < n) {
+              for (i = 0; i < buflen; ++i)
+                dest[needed + i] = buf[i];
+              for (i = 0; i < len; ++i)
+                dest[needed + buflen + i] = weights[idxarr[backw] + i];
+            }
+            needed += buflen + len;
+            idxarr[backw] += len;
+            val = 1;
+          } else
+            ++val;
+        }
+      }
+    }
+
+    /* Finally store the byte to separate the passes or terminate
+       the string.  */
+    if (needed < n)
+      dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
+    ++needed;
+  }
+
+  /* This is a little optimization: many collation specifications have
+     a `position' rule at the end and if no non-ignored character
+     is found the last \1 byte is immediately followed by a \0 byte
+     signalling this.  We can avoid the \1 byte(s).  */
+  if (needed > 2 && needed == last_needed + 1) {
+    /* Remove the \1 byte.  */
+    if (--needed <= n)
+      dest[needed - 1] = L('\0');
+  }
+
+  /* Return the number of bytes/words we need, but don't count the NUL
+     byte/word at the end.  */
+  return needed - 1;
+}
+
+size_t glib_strxfrm(char* dest,
+                    const char* src,
+                    size_t n,
+                    locale_data_t* locale) {
+  /* Handle byte comparison case.  */
+  if (locale->nrules == 0) {
+    size_t srclen = strlen(src);
+
+    if (n != 0) {
+      strncpy(dest, src, MIN(srclen + 1, n));
+    }
+
+    return srclen;
+  }
+
+  /* Handle an empty string, code hereafter relies on strlen (src) > 0.  */
+  if (*src == L('\0')) {
+    if (n != 0)
+      *dest = L('\0');
+    return 0;
+  }
+
+  /* We need the elements of the string as unsigned values since they
+     are used as indeces.  */
+  const USTRING_TYPE* usrc = (const USTRING_TYPE*)src;
+
+  /* Allocate cache for small strings on the stack and fill it with weight and
+     rule indices.  If the cache size is not sufficient, continue with the
+     uncached xfrm version.  */
+  size_t idxmax = 0;
+  const USTRING_TYPE* cur = usrc;
+  int32_t* idxarr = alloca(SMALL_STR_SIZE * sizeof(int32_t));
+  unsigned char* rulearr = alloca(SMALL_STR_SIZE + 1);
+
+  do {
+    int32_t tmp =
+        findidx(locale->table, locale->indirect, locale->extra, &cur, -1);
+    rulearr[idxmax] = tmp >> 24;
+    idxarr[idxmax] = tmp & 0xffffff;
+
+    ++idxmax;
+  } while (*cur != L('\0') && idxmax < SMALL_STR_SIZE);
+
+  /* This element is only read, the value never used but to determine
+     another value which then is ignored.  */
+  rulearr[idxmax] = '\0';
+
+  /* Do the transformation.  */
+  if (*cur == L('\0'))
+    return do_xfrm_cached(dest, n, locale, idxmax, idxarr, rulearr);
+  else
+    return do_xfrm(usrc, dest, n, locale);
+}
author	jacqueline <me@jacqueline.id.au>	2023-10-30 15:47:38 +1100
committer	jacqueline <me@jacqueline.id.au>	2023-10-30 15:52:26 +1100
commit	b58c08150853b8055093dc116d407ffd543f8ec8 (patch)
tree	9b82d130d2c833fc234bca0f12f0fba1b7202c4d /src/locale
parent	18d90051c9145ead86d4da701a2bc54f45e4fb66 (diff)
download	tangara-fw-b58c08150853b8055093dc116d407ffd543f8ec8.tar.gz