summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGabriel Krisman Bertazi <krisman@collabora.co.uk>2019-04-25 13:51:22 -0400
committerTheodore Ts'o <tytso@mit.edu>2019-04-25 13:51:22 -0400
commit9d53690f0d4e5686e80f034ea584b7a822b356d3 (patch)
treec123dcc3f2e193f584acc2560a1358d546e66810
parenta8384c68797ee022f5fd7bcef5f4cc57863d4042 (diff)
unicode: implement higher level API for string handling
This patch integrates the utf8n patches with some higher level API to perform UTF-8 string comparison, normalization and casefolding operations. Implemented is a variation of NFD, and casefold is performed by doing full casefold on top of NFD. These algorithms are based on the core implemented by Olaf Weber from SGI. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
-rw-r--r--fs/unicode/Makefile4
-rw-r--r--fs/unicode/utf8-core.c187
-rw-r--r--fs/unicode/utf8-norm.c6
-rw-r--r--fs/unicode/utf8n.h1
-rw-r--r--include/linux/unicode.h30
5 files changed, 227 insertions, 1 deletions
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index 16d43d180416..bfb0360687df 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -1,6 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_UNICODE) += utf8-norm.o
+obj-$(CONFIG_UNICODE) += unicode.o
+
+unicode-y := utf8-norm.o utf8-core.o
# This rule is not invoked during the kernel compilation. It is used to
# regenerate the utf8data.h header file.
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
new file mode 100644
index 000000000000..6afab4fdce90
--- /dev/null
+++ b/fs/unicode/utf8-core.c
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/errno.h>
+#include <linux/unicode.h>
+
+#include "utf8n.h"
+
+int utf8_validate(const struct unicode_map *um, const struct qstr *str)
+{
+ const struct utf8data *data = utf8nfdi(um->version);
+
+ if (utf8nlen(data, str->name, str->len) < 0)
+ return -1;
+ return 0;
+}
+EXPORT_SYMBOL(utf8_validate);
+
+int utf8_strncmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ const struct utf8data *data = utf8nfdi(um->version);
+ struct utf8cursor cur1, cur2;
+ int c1, c2;
+
+ if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ return -EINVAL;
+
+ if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+ return -EINVAL;
+
+ do {
+ c1 = utf8byte(&cur1);
+ c2 = utf8byte(&cur2);
+
+ if (c1 < 0 || c2 < 0)
+ return -EINVAL;
+ if (c1 != c2)
+ return 1;
+ } while (c1);
+
+ return 0;
+}
+EXPORT_SYMBOL(utf8_strncmp);
+
+int utf8_strncasecmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur1, cur2;
+ int c1, c2;
+
+ if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ return -EINVAL;
+
+ if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+ return -EINVAL;
+
+ do {
+ c1 = utf8byte(&cur1);
+ c2 = utf8byte(&cur2);
+
+ if (c1 < 0 || c2 < 0)
+ return -EINVAL;
+ if (c1 != c2)
+ return 1;
+ } while (c1);
+
+ return 0;
+}
+EXPORT_SYMBOL(utf8_strncasecmp);
+
+int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur;
+ size_t nlen = 0;
+
+ if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ return -EINVAL;
+
+ for (nlen = 0; nlen < dlen; nlen++) {
+ int c = utf8byte(&cur);
+
+ dest[nlen] = c;
+ if (!c)
+ return nlen;
+ if (c == -1)
+ break;
+ }
+ return -EINVAL;
+}
+
+EXPORT_SYMBOL(utf8_casefold);
+
+int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ const struct utf8data *data = utf8nfdi(um->version);
+ struct utf8cursor cur;
+ ssize_t nlen = 0;
+
+ if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ return -EINVAL;
+
+ for (nlen = 0; nlen < dlen; nlen++) {
+ int c = utf8byte(&cur);
+
+ dest[nlen] = c;
+ if (!c)
+ return nlen;
+ if (c == -1)
+ break;
+ }
+ return -EINVAL;
+}
+
+EXPORT_SYMBOL(utf8_normalize);
+
+static int utf8_parse_version(const char *version, unsigned int *maj,
+ unsigned int *min, unsigned int *rev)
+{
+ substring_t args[3];
+ char version_string[12];
+ const struct match_token token[] = {
+ {1, "%d.%d.%d"},
+ {0, NULL}
+ };
+
+ strncpy(version_string, version, sizeof(version_string));
+
+ if (match_token(version_string, token, args) != 1)
+ return -EINVAL;
+
+ if (match_int(&args[0], maj) || match_int(&args[1], min) ||
+ match_int(&args[2], rev))
+ return -EINVAL;
+
+ return 0;
+}
+
+struct unicode_map *utf8_load(const char *version)
+{
+ struct unicode_map *um = NULL;
+ int unicode_version;
+
+ if (version) {
+ unsigned int maj, min, rev;
+
+ if (utf8_parse_version(version, &maj, &min, &rev) < 0)
+ return ERR_PTR(-EINVAL);
+
+ if (!utf8version_is_supported(maj, min, rev))
+ return ERR_PTR(-EINVAL);
+
+ unicode_version = UNICODE_AGE(maj, min, rev);
+ } else {
+ unicode_version = utf8version_latest();
+ printk(KERN_WARNING"UTF-8 version not specified. "
+ "Assuming latest supported version (%d.%d.%d).",
+ (unicode_version >> 16) & 0xff,
+ (unicode_version >> 8) & 0xff,
+ (unicode_version & 0xff));
+ }
+
+ um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
+ if (!um)
+ return ERR_PTR(-ENOMEM);
+
+ um->charset = "UTF-8";
+ um->version = unicode_version;
+
+ return um;
+}
+EXPORT_SYMBOL(utf8_load);
+
+void utf8_unload(struct unicode_map *um)
+{
+ kfree(um);
+}
+EXPORT_SYMBOL(utf8_unload);
+
+MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c
index 848b93e97f50..20d440c3f2db 100644
--- a/fs/unicode/utf8-norm.c
+++ b/fs/unicode/utf8-norm.c
@@ -38,6 +38,12 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev)
}
EXPORT_SYMBOL(utf8version_is_supported);
+int utf8version_latest(void)
+{
+ return utf8vers;
+}
+EXPORT_SYMBOL(utf8version_latest);
+
/*
* UTF-8 valid ranges.
*
diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h
index b63a9091dc39..a120638014c1 100644
--- a/fs/unicode/utf8n.h
+++ b/fs/unicode/utf8n.h
@@ -32,6 +32,7 @@
/* Highest unicode version supported by the data tables. */
extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
+extern int utf8version_latest(void);
/*
* Look for the correct const struct utf8data for a unicode version.
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
new file mode 100644
index 000000000000..aec2c6d800aa
--- /dev/null
+++ b/include/linux/unicode.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNICODE_H
+#define _LINUX_UNICODE_H
+
+#include <linux/init.h>
+#include <linux/dcache.h>
+
+struct unicode_map {
+ const char *charset;
+ int version;
+};
+
+int utf8_validate(const struct unicode_map *um, const struct qstr *str);
+
+int utf8_strncmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2);
+
+int utf8_strncasecmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2);
+
+int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen);
+
+int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen);
+
+struct unicode_map *utf8_load(const char *version);
+void utf8_unload(struct unicode_map *um);
+
+#endif /* _LINUX_UNICODE_H */