|
|
a43681 |
From ee7ba570f7c555f93f41badefb63397737ef7810 Mon Sep 17 00:00:00 2001
|
|
|
a43681 |
From: Peter Jones <pjones@redhat.com>
|
|
|
a43681 |
Date: Tue, 18 Jun 2019 13:12:39 -0400
|
|
|
a43681 |
Subject: [PATCH 35/63] ucs2: document things a little better
|
|
|
a43681 |
|
|
|
a43681 |
Signed-off-by: Peter Jones <pjones@redhat.com>
|
|
|
a43681 |
---
|
|
|
a43681 |
src/ucs2.h | 135 +++++++++++++++++++++++++++++++++++++++--------------
|
|
|
a43681 |
1 file changed, 100 insertions(+), 35 deletions(-)
|
|
|
a43681 |
|
|
|
a43681 |
diff --git a/src/ucs2.h b/src/ucs2.h
|
|
|
a43681 |
index 478de23b23f..3f8a41d8ccc 100644
|
|
|
a43681 |
--- a/src/ucs2.h
|
|
|
a43681 |
+++ b/src/ucs2.h
|
|
|
a43681 |
@@ -22,11 +22,20 @@
|
|
|
a43681 |
#define ev_bits(val, mask, shift) \
|
|
|
a43681 |
(((val) & ((mask) << (shift))) >> (shift))
|
|
|
a43681 |
|
|
|
a43681 |
+/*
|
|
|
a43681 |
+ * ucs2len(): Count the number of characters in a UCS-2 string.
|
|
|
a43681 |
+ * s: a UCS-2 string
|
|
|
a43681 |
+ * limit: the maximum number of uint16_t bytepairs to examine
|
|
|
a43681 |
+ *
|
|
|
a43681 |
+ * returns the number of characters before NUL is found (i.e., excluding
|
|
|
a43681 |
+ * the NUL character). If limit is non-negative, no character index above
|
|
|
a43681 |
+ * limit will be accessed, and the maximum return value is limit.
|
|
|
a43681 |
+ */
|
|
|
a43681 |
static inline size_t UNUSED
|
|
|
a43681 |
-ucs2len(const void *vs, ssize_t limit)
|
|
|
a43681 |
+ucs2len(const void *s, ssize_t limit)
|
|
|
a43681 |
{
|
|
|
a43681 |
ssize_t i;
|
|
|
a43681 |
- const uint8_t *s8 = vs;
|
|
|
a43681 |
+ const uint8_t *s8 = s;
|
|
|
a43681 |
|
|
|
a43681 |
for (i = 0;
|
|
|
a43681 |
i < (limit >= 0 ? limit : i+1) && !(s8[0] == 0 && s8[1] == 0);
|
|
|
a43681 |
@@ -35,6 +44,15 @@ ucs2len(const void *vs, ssize_t limit)
|
|
|
a43681 |
return i;
|
|
|
a43681 |
}
|
|
|
a43681 |
|
|
|
a43681 |
+/*
|
|
|
a43681 |
+ * ucs2size(): count the number of bytes in use by a UCS-2 string.
|
|
|
a43681 |
+ * s: a UCS-2 string
|
|
|
a43681 |
+ * limit: the maximum number of uint16_t bytepairs to examine
|
|
|
a43681 |
+ *
|
|
|
a43681 |
+ * returns the number of bytes, including NUL, in the UCS-2 string s. If
|
|
|
a43681 |
+ * limit is non-negative, no character index above limit will be accessed,
|
|
|
a43681 |
+ * and the maximum return value is limit.
|
|
|
a43681 |
+ */
|
|
|
a43681 |
static inline size_t UNUSED
|
|
|
a43681 |
ucs2size(const void *s, ssize_t limit)
|
|
|
a43681 |
{
|
|
|
a43681 |
@@ -46,6 +64,18 @@ ucs2size(const void *s, ssize_t limit)
|
|
|
a43681 |
return rc;
|
|
|
a43681 |
}
|
|
|
a43681 |
|
|
|
a43681 |
+/*
|
|
|
a43681 |
+ * utf8len(): Count the number of characters in a UTF-8 string.
|
|
|
a43681 |
+ * s: a UTF-8 string
|
|
|
a43681 |
+ * limit: the maximum number of bytes to examine
|
|
|
a43681 |
+ *
|
|
|
a43681 |
+ * returns the number of UTF-8 characters before NUL is found (i.e.,
|
|
|
a43681 |
+ * excluding the NUL character). If limit is non-negative, no character
|
|
|
a43681 |
+ * index above limit will be accessed, and the maximum return value is
|
|
|
a43681 |
+ * limit.
|
|
|
a43681 |
+ *
|
|
|
a43681 |
+ * Caveat: only good up to 3-byte sequences.
|
|
|
a43681 |
+ */
|
|
|
a43681 |
static inline size_t UNUSED NONNULL(1)
|
|
|
a43681 |
utf8len(const unsigned char *s, ssize_t limit)
|
|
|
a43681 |
{
|
|
|
a43681 |
@@ -63,6 +93,15 @@ utf8len(const unsigned char *s, ssize_t limit)
|
|
|
a43681 |
return j;
|
|
|
a43681 |
}
|
|
|
a43681 |
|
|
|
a43681 |
+/*
|
|
|
a43681 |
+ * utf8size(): count the number of bytes in use by a UTF-8 string.
|
|
|
a43681 |
+ * s: a UTF-8 string
|
|
|
a43681 |
+ * limit: the maximum number of bytes to examine
|
|
|
a43681 |
+ *
|
|
|
a43681 |
+ * returns the number of bytes, including NUL, in the UTF-8 string s.
|
|
|
a43681 |
+ * If limit is non-negative, no character index above limit will be
|
|
|
a43681 |
+ * accessed, and the maximum return value is limit.
|
|
|
a43681 |
+ */
|
|
|
a43681 |
static inline size_t UNUSED NONNULL(1)
|
|
|
a43681 |
utf8size(const unsigned char *s, ssize_t limit)
|
|
|
a43681 |
{
|
|
|
a43681 |
@@ -72,68 +111,94 @@ utf8size(const unsigned char *s, ssize_t limit)
|
|
|
a43681 |
return ret;
|
|
|
a43681 |
}
|
|
|
a43681 |
|
|
|
a43681 |
+/*
|
|
|
a43681 |
+ * ucs2_to_utf8(): convert UCS-2 to UTF-8
|
|
|
a43681 |
+ * s: the UCS-2 string
|
|
|
a43681 |
+ * limit: the maximum number of characters to copy from s, including the
|
|
|
a43681 |
+ * NUL terminator, or -1 for no limit.
|
|
|
a43681 |
+ *
|
|
|
a43681 |
+ * returns an allocated string, into which at most limit - 1 characters of
|
|
|
a43681 |
+ * UTF-8 are translated from UCS-2. The return value is *always*
|
|
|
a43681 |
+ * NUL-terminated.
|
|
|
a43681 |
+ */
|
|
|
a43681 |
static inline unsigned char * UNUSED
|
|
|
a43681 |
-ucs2_to_utf8(const void * const voidchars, ssize_t limit)
|
|
|
a43681 |
+ucs2_to_utf8(const void * const s, ssize_t limit)
|
|
|
a43681 |
{
|
|
|
a43681 |
ssize_t i, j;
|
|
|
a43681 |
- unsigned char *ret;
|
|
|
a43681 |
- const uint16_t * const chars = voidchars;
|
|
|
a43681 |
+ unsigned char *out, *ret;
|
|
|
a43681 |
+ const uint16_t * const chars = s;
|
|
|
a43681 |
|
|
|
a43681 |
if (limit < 0)
|
|
|
a43681 |
limit = ucs2len(chars, -1);
|
|
|
a43681 |
- ret = malloc(limit * 6 + 1);
|
|
|
a43681 |
- if (!ret)
|
|
|
a43681 |
+ out = malloc(limit * 6 + 1);
|
|
|
a43681 |
+ if (!out)
|
|
|
a43681 |
return NULL;
|
|
|
a43681 |
- memset(ret, 0, limit * 6 +1);
|
|
|
a43681 |
+ memset(out, 0, limit * 6 +1);
|
|
|
a43681 |
|
|
|
a43681 |
for (i=0, j=0; chars[i] && i < (limit >= 0 ? limit : i+1); i++,j++) {
|
|
|
a43681 |
if (chars[i] <= 0x7f) {
|
|
|
a43681 |
- ret[j] = chars[i];
|
|
|
a43681 |
+ out[j] = chars[i];
|
|
|
a43681 |
} else if (chars[i] > 0x7f && chars[i] <= 0x7ff) {
|
|
|
a43681 |
- ret[j++] = 0xc0 | ev_bits(chars[i], 0x1f, 6);
|
|
|
a43681 |
- ret[j] = 0x80 | ev_bits(chars[i], 0x3f, 0);
|
|
|
a43681 |
+ out[j++] = 0xc0 | ev_bits(chars[i], 0x1f, 6);
|
|
|
a43681 |
+ out[j] = 0x80 | ev_bits(chars[i], 0x3f, 0);
|
|
|
a43681 |
#if 1
|
|
|
a43681 |
} else if (chars[i] > 0x7ff) {
|
|
|
a43681 |
- ret[j++] = 0xe0 | ev_bits(chars[i], 0xf, 12);
|
|
|
a43681 |
- ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6);
|
|
|
a43681 |
- ret[j] = 0x80| ev_bits(chars[i], 0x3f, 0);
|
|
|
a43681 |
+ out[j++] = 0xe0 | ev_bits(chars[i], 0xf, 12);
|
|
|
a43681 |
+ out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6);
|
|
|
a43681 |
+ out[j] = 0x80| ev_bits(chars[i], 0x3f, 0);
|
|
|
a43681 |
}
|
|
|
a43681 |
#else
|
|
|
a43681 |
} else if (chars[i] > 0x7ff && chars[i] < 0x10000) {
|
|
|
a43681 |
- ret[j++] = 0xe0 | ev_bits(chars[i], 0xf, 12);
|
|
|
a43681 |
- ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6);
|
|
|
a43681 |
- ret[j] = 0x80| ev_bits(chars[i], 0x3f, 0);
|
|
|
a43681 |
+ out[j++] = 0xe0 | ev_bits(chars[i], 0xf, 12);
|
|
|
a43681 |
+ out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6);
|
|
|
a43681 |
+ out[j] = 0x80| ev_bits(chars[i], 0x3f, 0);
|
|
|
a43681 |
} else if (chars[i] > 0xffff && chars[i] < 0x200000) {
|
|
|
a43681 |
- ret[j++] = 0xf0 | ev_bits(chars[i], 0x7, 18);
|
|
|
a43681 |
- ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 12);
|
|
|
a43681 |
- ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6);
|
|
|
a43681 |
- ret[j] = 0x80| ev_bits(chars[i], 0x3f, 0);
|
|
|
a43681 |
+ out[j++] = 0xf0 | ev_bits(chars[i], 0x7, 18);
|
|
|
a43681 |
+ out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 12);
|
|
|
a43681 |
+ out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6);
|
|
|
a43681 |
+ out[j] = 0x80| ev_bits(chars[i], 0x3f, 0);
|
|
|
a43681 |
} else if (chars[i] > 0x1fffff && chars[i] < 0x4000000) {
|
|
|
a43681 |
- ret[j++] = 0xf8 | ev_bits(chars[i], 0x3, 24);
|
|
|
a43681 |
- ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 18);
|
|
|
a43681 |
- ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 12);
|
|
|
a43681 |
- ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6);
|
|
|
a43681 |
- ret[j] = 0x80 | ev_bits(chars[i], 0x3f, 0);
|
|
|
a43681 |
+ out[j++] = 0xf8 | ev_bits(chars[i], 0x3, 24);
|
|
|
a43681 |
+ out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 18);
|
|
|
a43681 |
+ out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 12);
|
|
|
a43681 |
+ out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6);
|
|
|
a43681 |
+ out[j] = 0x80 | ev_bits(chars[i], 0x3f, 0);
|
|
|
a43681 |
} else if (chars[i] > 0x3ffffff) {
|
|
|
a43681 |
- ret[j++] = 0xfc | ev_bits(chars[i], 0x1, 30);
|
|
|
a43681 |
- ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 24);
|
|
|
a43681 |
- ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 18);
|
|
|
a43681 |
- ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 12);
|
|
|
a43681 |
- ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6);
|
|
|
a43681 |
- ret[j] = 0x80 | ev_bits(chars[i], 0x3f, 0);
|
|
|
a43681 |
+ out[j++] = 0xfc | ev_bits(chars[i], 0x1, 30);
|
|
|
a43681 |
+ out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 24);
|
|
|
a43681 |
+ out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 18);
|
|
|
a43681 |
+ out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 12);
|
|
|
a43681 |
+ out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6);
|
|
|
a43681 |
+ out[j] = 0x80 | ev_bits(chars[i], 0x3f, 0);
|
|
|
a43681 |
}
|
|
|
a43681 |
#endif
|
|
|
a43681 |
}
|
|
|
a43681 |
- ret[j] = '\0';
|
|
|
a43681 |
+ out[j++] = '\0';
|
|
|
a43681 |
+ ret = realloc(out, j);
|
|
|
a43681 |
+ if (!ret) {
|
|
|
a43681 |
+ free(out);
|
|
|
a43681 |
+ return NULL;
|
|
|
a43681 |
+ }
|
|
|
a43681 |
return ret;
|
|
|
a43681 |
}
|
|
|
a43681 |
|
|
|
a43681 |
+/*
|
|
|
a43681 |
+ * utf8_to_ucs2(): convert UTF-8 to UCS-2
|
|
|
a43681 |
+ * s: the destination buffer to write to.
|
|
|
a43681 |
+ * size: the size of the allocation to write to
|
|
|
a43681 |
+ * terminate: whether or not to add a terminator to the string
|
|
|
a43681 |
+ * utf8: the utf8 source
|
|
|
a43681 |
+ *
|
|
|
a43681 |
+ * returns the number of characters written to s, including the NUL
|
|
|
a43681 |
+ * terminator if "terminate" is true, or -1 on error. In the case of an
|
|
|
a43681 |
+ * error, the buffer will not be modified.
|
|
|
a43681 |
+ */
|
|
|
a43681 |
static inline ssize_t UNUSED NONNULL(4)
|
|
|
a43681 |
-utf8_to_ucs2(void *ucs2void, ssize_t size, int terminate, const unsigned char *utf8)
|
|
|
a43681 |
+utf8_to_ucs2(void *s, ssize_t size, bool terminate, const unsigned char *utf8)
|
|
|
a43681 |
{
|
|
|
a43681 |
ssize_t req;
|
|
|
a43681 |
ssize_t i, j;
|
|
|
a43681 |
- uint16_t *ucs2 = ucs2void;
|
|
|
a43681 |
+ uint16_t *ucs2 = s;
|
|
|
a43681 |
uint16_t val16;
|
|
|
a43681 |
|
|
|
a43681 |
if (!ucs2 && size > 0) {
|
|
|
a43681 |
--
|
|
|
a43681 |
2.26.2
|
|
|
a43681 |
|