daandemeyer / rpms / systemd

Forked from rpms/systemd 2 years ago
Clone
a4b143
From 1bd14ab0b60de49c8546f22d90996b95423b9eeb Mon Sep 17 00:00:00 2001
a4b143
From: Dave Reisner <dreisner@archlinux.org>
a4b143
Date: Tue, 17 Sep 2013 15:39:09 -0400
a4b143
Subject: [PATCH] move utf8 functions from libudev-private.h to utf8.h
a4b143
a4b143
There's now some more obvious overlap between the two utf8 validation
a4b143
functions, but no more than there was previously.
a4b143
a4b143
This also adds some menial tests for anyone who wants to do more
a4b143
merging of these two in the future.
a4b143
---
a4b143
 .gitignore                 |   2 +
a4b143
 Makefile.am                |  10 +++
a4b143
 src/libudev/libudev-util.c | 171 +--------------------------------------------
a4b143
 src/shared/utf8.c          | 151 +++++++++++++++++++++++++++++++++++++++
a4b143
 src/shared/utf8.h          |   4 ++
a4b143
 src/test/test-utf8.c       |  59 ++++++++++++++++
a4b143
 6 files changed, 229 insertions(+), 168 deletions(-)
a4b143
 create mode 100644 src/test/test-utf8.c
a4b143
a4b143
diff --git a/.gitignore b/.gitignore
a4b143
index 61bc2a3..deeee53 100644
a4b143
--- a/.gitignore
a4b143
+++ b/.gitignore
a4b143
@@ -124,6 +124,7 @@
a4b143
 /test-list
a4b143
 /test-log
a4b143
 /test-login
a4b143
+/test-login-shared
a4b143
 /test-loopback
a4b143
 /test-mmap-cache
a4b143
 /test-ns
a4b143
@@ -143,6 +144,7 @@
a4b143
 /test-udev
a4b143
 /test-unit-file
a4b143
 /test-unit-name
a4b143
+/test-utf8
a4b143
 /test-util
a4b143
 /test-watchdog
a4b143
 /timedatectl
a4b143
diff --git a/Makefile.am b/Makefile.am
a4b143
index 25bfd91..6dd33ad 100644
a4b143
--- a/Makefile.am
a4b143
+++ b/Makefile.am
a4b143
@@ -1121,6 +1121,7 @@ tests += \
a4b143
 	test-strxcpyx \
a4b143
 	test-unit-name \
a4b143
 	test-unit-file \
a4b143
+	test-utf8 \
a4b143
 	test-util \
a4b143
 	test-date \
a4b143
 	test-sleep \
a4b143
@@ -1223,6 +1224,15 @@ test_unit_file_CFLAGS = \
a4b143
 test_unit_file_LDADD = \
a4b143
 	libsystemd-core.la
a4b143
 
a4b143
+test_utf8_SOURCES = \
a4b143
+	src/test/test-utf8.c
a4b143
+
a4b143
+test_utf8_CFLAGS = \
a4b143
+	$(AM_CFLAGS)
a4b143
+
a4b143
+test_utf8_LDADD = \
a4b143
+	libsystemd-shared.la
a4b143
+
a4b143
 test_util_SOURCES = \
a4b143
 	src/test/test-util.c
a4b143
 
a4b143
diff --git a/src/libudev/libudev-util.c b/src/libudev/libudev-util.c
a4b143
index 714dc50..d54430c 100644
a4b143
--- a/src/libudev/libudev-util.c
a4b143
+++ b/src/libudev/libudev-util.c
a4b143
@@ -34,6 +34,7 @@
a4b143
 
a4b143
 #include "libudev.h"
a4b143
 #include "libudev-private.h"
a4b143
+#include "utf8.h"
a4b143
 
a4b143
 /**
a4b143
  * SECTION:libudev-util
a4b143
@@ -306,129 +307,6 @@ void util_remove_trailing_chars(char *path, char c)
a4b143
                 path[--len] = '\0';
a4b143
 }
a4b143
 
a4b143
-/* count of characters used to encode one unicode char */
a4b143
-static int utf8_encoded_expected_len(const char *str)
a4b143
-{
a4b143
-        unsigned char c = (unsigned char)str[0];
a4b143
-
a4b143
-        if (c < 0x80)
a4b143
-                return 1;
a4b143
-        if ((c & 0xe0) == 0xc0)
a4b143
-                return 2;
a4b143
-        if ((c & 0xf0) == 0xe0)
a4b143
-                return 3;
a4b143
-        if ((c & 0xf8) == 0xf0)
a4b143
-                return 4;
a4b143
-        if ((c & 0xfc) == 0xf8)
a4b143
-                return 5;
a4b143
-        if ((c & 0xfe) == 0xfc)
a4b143
-                return 6;
a4b143
-        return 0;
a4b143
-}
a4b143
-
a4b143
-/* decode one unicode char */
a4b143
-static int utf8_encoded_to_unichar(const char *str)
a4b143
-{
a4b143
-        int unichar;
a4b143
-        int len;
a4b143
-        int i;
a4b143
-
a4b143
-        len = utf8_encoded_expected_len(str);
a4b143
-        switch (len) {
a4b143
-        case 1:
a4b143
-                return (int)str[0];
a4b143
-        case 2:
a4b143
-                unichar = str[0] & 0x1f;
a4b143
-                break;
a4b143
-        case 3:
a4b143
-                unichar = (int)str[0] & 0x0f;
a4b143
-                break;
a4b143
-        case 4:
a4b143
-                unichar = (int)str[0] & 0x07;
a4b143
-                break;
a4b143
-        case 5:
a4b143
-                unichar = (int)str[0] & 0x03;
a4b143
-                break;
a4b143
-        case 6:
a4b143
-                unichar = (int)str[0] & 0x01;
a4b143
-                break;
a4b143
-        default:
a4b143
-                return -1;
a4b143
-        }
a4b143
-
a4b143
-        for (i = 1; i < len; i++) {
a4b143
-                if (((int)str[i] & 0xc0) != 0x80)
a4b143
-                        return -1;
a4b143
-                unichar <<= 6;
a4b143
-                unichar |= (int)str[i] & 0x3f;
a4b143
-        }
a4b143
-
a4b143
-        return unichar;
a4b143
-}
a4b143
-
a4b143
-/* expected size used to encode one unicode char */
a4b143
-static int utf8_unichar_to_encoded_len(int unichar)
a4b143
-{
a4b143
-        if (unichar < 0x80)
a4b143
-                return 1;
a4b143
-        if (unichar < 0x800)
a4b143
-                return 2;
a4b143
-        if (unichar < 0x10000)
a4b143
-                return 3;
a4b143
-        if (unichar < 0x200000)
a4b143
-                return 4;
a4b143
-        if (unichar < 0x4000000)
a4b143
-                return 5;
a4b143
-        return 6;
a4b143
-}
a4b143
-
a4b143
-/* check if unicode char has a valid numeric range */
a4b143
-static int utf8_unichar_valid_range(int unichar)
a4b143
-{
a4b143
-        if (unichar > 0x10ffff)
a4b143
-                return 0;
a4b143
-        if ((unichar & 0xfffff800) == 0xd800)
a4b143
-                return 0;
a4b143
-        if ((unichar > 0xfdcf) && (unichar < 0xfdf0))
a4b143
-                return 0;
a4b143
-        if ((unichar & 0xffff) == 0xffff)
a4b143
-                return 0;
a4b143
-        return 1;
a4b143
-}
a4b143
-
a4b143
-/* validate one encoded unicode char and return its length */
a4b143
-static int utf8_encoded_valid_unichar(const char *str)
a4b143
-{
a4b143
-        int len;
a4b143
-        int unichar;
a4b143
-        int i;
a4b143
-
a4b143
-        len = utf8_encoded_expected_len(str);
a4b143
-        if (len == 0)
a4b143
-                return -1;
a4b143
-
a4b143
-        /* ascii is valid */
a4b143
-        if (len == 1)
a4b143
-                return 1;
a4b143
-
a4b143
-        /* check if expected encoded chars are available */
a4b143
-        for (i = 0; i < len; i++)
a4b143
-                if ((str[i] & 0x80) != 0x80)
a4b143
-                        return -1;
a4b143
-
a4b143
-        unichar = utf8_encoded_to_unichar(str);
a4b143
-
a4b143
-        /* check if encoded length matches encoded value */
a4b143
-        if (utf8_unichar_to_encoded_len(unichar) != len)
a4b143
-                return -1;
a4b143
-
a4b143
-        /* check if value has valid range */
a4b143
-        if (!utf8_unichar_valid_range(unichar))
a4b143
-                return -1;
a4b143
-
a4b143
-        return len;
a4b143
-}
a4b143
-
a4b143
 int util_replace_whitespace(const char *str, char *to, size_t len)
a4b143
 {
a4b143
         size_t i, j;
a4b143
@@ -457,17 +335,6 @@ int util_replace_whitespace(const char *str, char *to, size_t len)
a4b143
         return 0;
a4b143
 }
a4b143
 
a4b143
-static int is_whitelisted(char c, const char *white)
a4b143
-{
a4b143
-        if ((c >= '0' && c <= '9') ||
a4b143
-            (c >= 'A' && c <= 'Z') ||
a4b143
-            (c >= 'a' && c <= 'z') ||
a4b143
-            strchr("#+-.:=@_", c) != NULL ||
a4b143
-            (white != NULL && strchr(white, c) != NULL))
a4b143
-                return 1;
a4b143
-        return 0;
a4b143
-}
a4b143
-
a4b143
 /* allow chars in whitelist, plain ascii, hex-escaping and valid utf8 */
a4b143
 int util_replace_chars(char *str, const char *white)
a4b143
 {
a4b143
@@ -477,7 +344,7 @@ int util_replace_chars(char *str, const char *white)
a4b143
         while (str[i] != '\0') {
a4b143
                 int len;
a4b143
 
a4b143
-                if (is_whitelisted(str[i], white)) {
a4b143
+                if (is_utf8_encoding_whitelisted(str[i], white)) {
a4b143
                         i++;
a4b143
                         continue;
a4b143
                 }
a4b143
@@ -525,39 +392,7 @@ int util_replace_chars(char *str, const char *white)
a4b143
  **/
a4b143
 _public_ int udev_util_encode_string(const char *str, char *str_enc, size_t len)
a4b143
 {
a4b143
-        size_t i, j;
a4b143
-
a4b143
-        if (str == NULL || str_enc == NULL)
a4b143
-                return -1;
a4b143
-
a4b143
-        for (i = 0, j = 0; str[i] != '\0'; i++) {
a4b143
-                int seqlen;
a4b143
-
a4b143
-                seqlen = utf8_encoded_valid_unichar(&str[i]);
a4b143
-                if (seqlen > 1) {
a4b143
-                        if (len-j < (size_t)seqlen)
a4b143
-                                goto err;
a4b143
-                        memcpy(&str_enc[j], &str[i], seqlen);
a4b143
-                        j += seqlen;
a4b143
-                        i += (seqlen-1);
a4b143
-                } else if (str[i] == '\\' || !is_whitelisted(str[i], NULL)) {
a4b143
-                        if (len-j < 4)
a4b143
-                                goto err;
a4b143
-                        sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]);
a4b143
-                        j += 4;
a4b143
-                } else {
a4b143
-                        if (len-j < 1)
a4b143
-                                goto err;
a4b143
-                        str_enc[j] = str[i];
a4b143
-                        j++;
a4b143
-                }
a4b143
-        }
a4b143
-        if (len-j < 1)
a4b143
-                goto err;
a4b143
-        str_enc[j] = '\0';
a4b143
-        return 0;
a4b143
-err:
a4b143
-        return -1;
a4b143
+        return udev_encode_string(str, str_enc, len);
a4b143
 }
a4b143
 
a4b143
 /*
a4b143
diff --git a/src/shared/utf8.c b/src/shared/utf8.c
a4b143
index 655cc77..1a68394 100644
a4b143
--- a/src/shared/utf8.c
a4b143
+++ b/src/shared/utf8.c
a4b143
@@ -317,3 +317,154 @@ char *utf16_to_utf8(const void *s, size_t length) {
a4b143
 
a4b143
         return r;
a4b143
 }
a4b143
+
a4b143
+/* count of characters used to encode one unicode char */
a4b143
+static int utf8_encoded_expected_len(const char *str) {
a4b143
+        unsigned char c = (unsigned char)str[0];
a4b143
+
a4b143
+        if (c < 0x80)
a4b143
+                return 1;
a4b143
+        if ((c & 0xe0) == 0xc0)
a4b143
+                return 2;
a4b143
+        if ((c & 0xf0) == 0xe0)
a4b143
+                return 3;
a4b143
+        if ((c & 0xf8) == 0xf0)
a4b143
+                return 4;
a4b143
+        if ((c & 0xfc) == 0xf8)
a4b143
+                return 5;
a4b143
+        if ((c & 0xfe) == 0xfc)
a4b143
+                return 6;
a4b143
+        return 0;
a4b143
+}
a4b143
+
a4b143
+/* decode one unicode char */
a4b143
+static int utf8_encoded_to_unichar(const char *str) {
a4b143
+        int unichar;
a4b143
+        int len;
a4b143
+        int i;
a4b143
+
a4b143
+        len = utf8_encoded_expected_len(str);
a4b143
+        switch (len) {
a4b143
+        case 1:
a4b143
+                return (int)str[0];
a4b143
+        case 2:
a4b143
+                unichar = str[0] & 0x1f;
a4b143
+                break;
a4b143
+        case 3:
a4b143
+                unichar = (int)str[0] & 0x0f;
a4b143
+                break;
a4b143
+        case 4:
a4b143
+                unichar = (int)str[0] & 0x07;
a4b143
+                break;
a4b143
+        case 5:
a4b143
+                unichar = (int)str[0] & 0x03;
a4b143
+                break;
a4b143
+        case 6:
a4b143
+                unichar = (int)str[0] & 0x01;
a4b143
+                break;
a4b143
+        default:
a4b143
+                return -1;
a4b143
+        }
a4b143
+
a4b143
+        for (i = 1; i < len; i++) {
a4b143
+                if (((int)str[i] & 0xc0) != 0x80)
a4b143
+                        return -1;
a4b143
+                unichar <<= 6;
a4b143
+                unichar |= (int)str[i] & 0x3f;
a4b143
+        }
a4b143
+
a4b143
+        return unichar;
a4b143
+}
a4b143
+
a4b143
+/* expected size used to encode one unicode char */
a4b143
+static int utf8_unichar_to_encoded_len(int unichar) {
a4b143
+        if (unichar < 0x80)
a4b143
+                return 1;
a4b143
+        if (unichar < 0x800)
a4b143
+                return 2;
a4b143
+        if (unichar < 0x10000)
a4b143
+                return 3;
a4b143
+        if (unichar < 0x200000)
a4b143
+                return 4;
a4b143
+        if (unichar < 0x4000000)
a4b143
+                return 5;
a4b143
+        return 6;
a4b143
+}
a4b143
+
a4b143
+/* validate one encoded unicode char and return its length */
a4b143
+int utf8_encoded_valid_unichar(const char *str) {
a4b143
+        int len;
a4b143
+        int unichar;
a4b143
+        int i;
a4b143
+
a4b143
+        len = utf8_encoded_expected_len(str);
a4b143
+        if (len == 0)
a4b143
+                return -1;
a4b143
+
a4b143
+        /* ascii is valid */
a4b143
+        if (len == 1)
a4b143
+                return 1;
a4b143
+
a4b143
+        /* check if expected encoded chars are available */
a4b143
+        for (i = 0; i < len; i++)
a4b143
+                if ((str[i] & 0x80) != 0x80)
a4b143
+                        return -1;
a4b143
+
a4b143
+        unichar = utf8_encoded_to_unichar(str);
a4b143
+
a4b143
+        /* check if encoded length matches encoded value */
a4b143
+        if (utf8_unichar_to_encoded_len(unichar) != len)
a4b143
+                return -1;
a4b143
+
a4b143
+        /* check if value has valid range */
a4b143
+        if (!is_unicode_valid(unichar))
a4b143
+                return -1;
a4b143
+
a4b143
+        return len;
a4b143
+}
a4b143
+
a4b143
+int is_utf8_encoding_whitelisted(char c, const char *white) {
a4b143
+        if ((c >= '0' && c <= '9') ||
a4b143
+            (c >= 'A' && c <= 'Z') ||
a4b143
+            (c >= 'a' && c <= 'z') ||
a4b143
+            strchr("#+-.:=@_", c) != NULL ||
a4b143
+            (white != NULL && strchr(white, c) != NULL))
a4b143
+                return 1;
a4b143
+        return 0;
a4b143
+}
a4b143
+
a4b143
+int udev_encode_string(const char *str, char *str_enc, size_t len) {
a4b143
+        size_t i, j;
a4b143
+
a4b143
+        if (str == NULL || str_enc == NULL)
a4b143
+                return -1;
a4b143
+
a4b143
+        for (i = 0, j = 0; str[i] != '\0'; i++) {
a4b143
+                int seqlen;
a4b143
+
a4b143
+                seqlen = utf8_encoded_valid_unichar(&str[i]);
a4b143
+                if (seqlen > 1) {
a4b143
+                        if (len-j < (size_t)seqlen)
a4b143
+                                goto err;
a4b143
+                        memcpy(&str_enc[j], &str[i], seqlen);
a4b143
+                        j += seqlen;
a4b143
+                        i += (seqlen-1);
a4b143
+                } else if (str[i] == '\\' || !is_utf8_encoding_whitelisted(str[i], NULL)) {
a4b143
+                        if (len-j < 4)
a4b143
+                                goto err;
a4b143
+                        sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]);
a4b143
+                        j += 4;
a4b143
+                } else {
a4b143
+                        if (len-j < 1)
a4b143
+                                goto err;
a4b143
+                        str_enc[j] = str[i];
a4b143
+                        j++;
a4b143
+                }
a4b143
+        }
a4b143
+        if (len-j < 1)
a4b143
+                goto err;
a4b143
+        str_enc[j] = '\0';
a4b143
+        return 0;
a4b143
+err:
a4b143
+        return -1;
a4b143
+}
a4b143
diff --git a/src/shared/utf8.h b/src/shared/utf8.h
a4b143
index f805ea6..7a5608c 100644
a4b143
--- a/src/shared/utf8.h
a4b143
+++ b/src/shared/utf8.h
a4b143
@@ -34,3 +34,7 @@ char *utf8_filter(const char *s);
a4b143
 char *ascii_filter(const char *s);
a4b143
 
a4b143
 char *utf16_to_utf8(const void *s, size_t length);
a4b143
+
a4b143
+int utf8_encoded_valid_unichar(const char *str);
a4b143
+int is_utf8_encoding_whitelisted(char c, const char *white);
a4b143
+int udev_encode_string(const char *str, char *str_enc, size_t len);
a4b143
diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c
a4b143
new file mode 100644
a4b143
index 0000000..d2b9771
a4b143
--- /dev/null
a4b143
+++ b/src/test/test-utf8.c
a4b143
@@ -0,0 +1,59 @@
a4b143
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
a4b143
+
a4b143
+/***
a4b143
+  This file is part of systemd.
a4b143
+
a4b143
+  Copyright 2013 Dave Reisner
a4b143
+
a4b143
+  systemd is free software; you can redistribute it and/or modify it
a4b143
+  under the terms of the GNU Lesser General Public License as published by
a4b143
+  the Free Software Foundation; either version 2.1 of the License, or
a4b143
+  (at your option) any later version.
a4b143
+
a4b143
+  systemd is distributed in the hope that it will be useful, but
a4b143
+  WITHOUT ANY WARRANTY; without even the implied warranty of
a4b143
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
a4b143
+  Lesser General Public License for more details.
a4b143
+
a4b143
+  You should have received a copy of the GNU Lesser General Public License
a4b143
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
a4b143
+***/
a4b143
+
a4b143
+
a4b143
+#include "utf8.h"
a4b143
+#include "util.h"
a4b143
+
a4b143
+/* helpers for test_udev_encode_string */
a4b143
+static char *do_encode_string(const char *in) {
a4b143
+        size_t out_len = strlen(in) * 4;
a4b143
+        char *out = malloc(out_len);
a4b143
+
a4b143
+        assert_se(out);
a4b143
+        assert_se(udev_encode_string(in, out, out_len) >= 0);
a4b143
+        puts(out);
a4b143
+
a4b143
+        return out;
a4b143
+}
a4b143
+
a4b143
+static bool expect_encoded_as(const char *in, const char *expected) {
a4b143
+        _cleanup_free_ char *encoded = do_encode_string(in);
a4b143
+        return streq(encoded, expected);
a4b143
+}
a4b143
+
a4b143
+static void test_udev_encode_string(void) {
a4b143
+        assert_se(expect_encoded_as("systemd sucks", "systemd\\x20sucks"));
a4b143
+        assert_se(expect_encoded_as("pinkiepie", "pinkiepie"));
a4b143
+        assert_se(expect_encoded_as("valíd\\ųtf8", "valíd\\x5cųtf8"));
a4b143
+        assert_se(expect_encoded_as("s/ash/ng", "s\\x2fash\\x2fng"));
a4b143
+}
a4b143
+
a4b143
+static void test_utf8_is_valid(void) {
a4b143
+        assert_se(utf8_is_valid("ascii is valid unicode"));
a4b143
+        assert_se(utf8_is_valid("\341\204\242"));
a4b143
+        assert_se(!utf8_is_valid("\341\204"));
a4b143
+}
a4b143
+
a4b143
+int main(int argc, char *argv[]) {
a4b143
+        test_utf8_is_valid();
a4b143
+        test_udev_encode_string();
a4b143
+}