Merge branch 'obsd-master'

This commit is contained in:
Thomas Adam
2015-11-12 12:01:17 +00:00
30 changed files with 451 additions and 340 deletions

78
utf8.c
View File

@ -393,6 +393,8 @@ utf8_open(struct utf8_data *utf8data, u_char ch)
int
utf8_append(struct utf8_data *utf8data, u_char ch)
{
/* XXX this should do validity checks too! */
if (utf8data->have >= utf8data->size)
fatalx("UTF-8 character overflow");
if (utf8data->size > sizeof utf8data->data)
@ -466,18 +468,46 @@ utf8_combine(const struct utf8_data *utf8data)
case 3:
value = utf8data->data[2] & 0x3f;
value |= (utf8data->data[1] & 0x3f) << 6;
value |= (utf8data->data[0] & 0x0f) << 12;
value |= (utf8data->data[0] & 0xf) << 12;
break;
case 4:
value = utf8data->data[3] & 0x3f;
value |= (utf8data->data[2] & 0x3f) << 6;
value |= (utf8data->data[1] & 0x3f) << 12;
value |= (utf8data->data[0] & 0x07) << 18;
value |= (utf8data->data[0] & 0x7) << 18;
break;
}
return (value);
}
/*
 * Split a Unicode code point into its UTF-8 byte sequence.
 *
 * The encoded bytes are written to utf8data->data, the number of bytes to
 * utf8data->size and the result of utf8_width() to utf8data->width. Returns 0
 * on success, or -1 if uc does not fit in a four-byte sequence.
 *
 * The range limits are inclusive: 0x7f, 0x7ff and 0xffff are the largest
 * values representable in one, two and three bytes, so they must be encoded
 * in that length rather than spilling into the next (overlong) form.
 */
int
utf8_split(u_int uc, struct utf8_data *utf8data)
{
	if (uc <= 0x7f) {
		/* 0xxxxxxx */
		utf8data->size = 1;
		utf8data->data[0] = uc;
	} else if (uc <= 0x7ff) {
		/* 110xxxxx 10xxxxxx */
		utf8data->size = 2;
		utf8data->data[0] = 0xc0 | ((uc >> 6) & 0x1f);
		utf8data->data[1] = 0x80 | (uc & 0x3f);
	} else if (uc <= 0xffff) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		utf8data->size = 3;
		utf8data->data[0] = 0xe0 | ((uc >> 12) & 0xf);
		utf8data->data[1] = 0x80 | ((uc >> 6) & 0x3f);
		utf8data->data[2] = 0x80 | (uc & 0x3f);
	} else if (uc <= 0x1fffff) {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		utf8data->size = 4;
		utf8data->data[0] = 0xf0 | ((uc >> 18) & 0x7);
		utf8data->data[1] = 0x80 | ((uc >> 12) & 0x3f);
		utf8data->data[2] = 0x80 | ((uc >> 6) & 0x3f);
		utf8data->data[3] = 0x80 | (uc & 0x3f);
	} else
		return (-1);
	utf8data->width = utf8_width(utf8data);
	return (0);
}
/* Split a two-byte UTF-8 character. */
u_int
utf8_split2(u_int uc, u_char *ptr)
@ -554,6 +584,50 @@ utf8_strvis(char *dst, const char *src, size_t len, int flag)
return (dst - start);
}
/*
 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
 * the returned string.
 *
 * Each complete UTF-8 character is replaced by one '_' per column of its
 * width. Printable ASCII (0x20 to 0x7e) is copied unchanged. Any other byte
 * (control characters, DEL, or bytes of a sequence cut short by the end of
 * the string) is replaced by a single '_' so that every byte of the result is
 * initialised.
 */
char *
utf8_sanitize(const char *src)
{
	char			*dst;
	size_t			 n;
	int			 more;
	struct utf8_data	 utf8data;
	u_int			 i;

	dst = NULL;

	n = 0;
	while (*src != '\0') {
		/* Make sure there is room for at least one more byte. */
		dst = xreallocarray(dst, n + 1, sizeof *dst);
		if (utf8_open(&utf8data, *src)) {
			more = 1;
			/*
			 * src is advanced before more is tested, so when the
			 * last append reports completion src already points
			 * just past the character.
			 */
			while (*++src != '\0' && more)
				more = utf8_append(&utf8data, *src);
			if (!more) {
				/*
				 * Ask for one element beyond the width so
				 * the request is never for zero elements
				 * (n == 0 with a zero-width character).
				 */
				dst = xreallocarray(dst,
				    n + utf8data.width + 1, sizeof *dst);
				for (i = 0; i < utf8data.width; i++)
					dst[n++] = '_';
				continue;
			}
			/*
			 * The string ended inside the sequence. Step back by
			 * the bytes collected so far (presumably what have
			 * counts - verify against utf8_open/utf8_append) and
			 * treat them as ordinary bytes below.
			 */
			src -= utf8data.have;
		}
		if (*src > 0x1f && *src < 0x7f)
			dst[n++] = *src;
		else
			dst[n++] = '_';
		src++;
	}

	dst = xreallocarray(dst, n + 1, sizeof *dst);
	dst[n] = '\0';
	return (dst);
}
/*
* Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
* Caller frees.