Sync OpenBSD patchset 421:

Try to reduce the UTF-8 mess.

Get rid of passing around u_char[4]s and define a struct utf8_data which has
character data, size (sequence length) and width. Move UTF-8 character
collection into two functions utf8_open/utf8_append in utf8.c which fill in
this struct and use these functions from input.c and the various functions in
screen-write.c.

Space for rather more data than is necessary for one UTF-8 sequence is in the
utf8_data struct because screen_write_copy is still nasty and needs to reinject
the character (after combining) into screen_write_cell.
This commit is contained in:
Tiago Cunha 2009-10-23 17:16:25 +00:00
parent c643ac4827
commit f41a3914a5
5 changed files with 200 additions and 185 deletions

42
input.c
View File

@ -1,4 +1,4 @@
/* $Id: input.c,v 1.97 2009-10-15 01:53:48 tcunha Exp $ */ /* $Id: input.c,v 1.98 2009-10-23 17:16:24 tcunha Exp $ */
/* /*
* Copyright (c) 2007 Nicholas Marriott <nicm@users.sourceforge.net> * Copyright (c) 2007 Nicholas Marriott <nicm@users.sourceforge.net>
@ -572,15 +572,14 @@ input_state_string_escape(u_char ch, struct input_ctx *ictx)
void void
input_state_utf8(u_char ch, struct input_ctx *ictx) input_state_utf8(u_char ch, struct input_ctx *ictx)
{ {
log_debug2("-- un %zu: %hhu (%c)", ictx->off, ch, ch); log_debug2("-- utf8 next: %zu: %hhu (%c)", ictx->off, ch, ch);
ictx->utf8_buf[ictx->utf8_off++] = ch; if (utf8_append(&ictx->utf8data, ch))
if (--ictx->utf8_len != 0) return; /* more to come */
return;
input_state(ictx, input_state_first); input_state(ictx, input_state_first);
ictx->cell.flags |= GRID_FLAG_UTF8; ictx->cell.flags |= GRID_FLAG_UTF8;
screen_write_cell(&ictx->ctx, &ictx->cell, ictx->utf8_buf); screen_write_cell(&ictx->ctx, &ictx->cell, &ictx->utf8data);
ictx->cell.flags &= ~GRID_FLAG_UTF8; ictx->cell.flags &= ~GRID_FLAG_UTF8;
} }
@ -590,40 +589,17 @@ input_handle_character(u_char ch, struct input_ctx *ictx)
struct window_pane *wp = ictx->wp; struct window_pane *wp = ictx->wp;
if (ch > 0x7f && options_get_number(&wp->window->options, "utf8")) { if (ch > 0x7f && options_get_number(&wp->window->options, "utf8")) {
/* if (utf8_open(&ictx->utf8data, ch)) {
* UTF-8 sequence. log_debug2("-- utf8 size %u: %zu: %hhu (%c)",
* ictx->utf8data.size, ictx->off, ch, ch);
* 11000010-11011111 C2-DF start of 2-byte sequence
* 11100000-11101111 E0-EF start of 3-byte sequence
* 11110000-11110100 F0-F4 start of 4-byte sequence
*/
memset(ictx->utf8_buf, 0xff, sizeof ictx->utf8_buf);
ictx->utf8_buf[0] = ch;
ictx->utf8_off = 1;
if (ch >= 0xc2 && ch <= 0xdf) {
log_debug2("-- u2 %zu: %hhu (%c)", ictx->off, ch, ch);
input_state(ictx, input_state_utf8); input_state(ictx, input_state_utf8);
ictx->utf8_len = 1;
return;
}
if (ch >= 0xe0 && ch <= 0xef) {
log_debug2("-- u3 %zu: %hhu (%c)", ictx->off, ch, ch);
input_state(ictx, input_state_utf8);
ictx->utf8_len = 2;
return;
}
if (ch >= 0xf0 && ch <= 0xf4) {
log_debug2("-- u4 %zu: %hhu (%c)", ictx->off, ch, ch);
input_state(ictx, input_state_utf8);
ictx->utf8_len = 3;
return; return;
} }
} }
log_debug2("-- ch %zu: %hhu (%c)", ictx->off, ch, ch); log_debug2("-- ch %zu: %hhu (%c)", ictx->off, ch, ch);
ictx->cell.data = ch; ictx->cell.data = ch;
screen_write_cell(&ictx->ctx, &ictx->cell, ictx->utf8_buf); screen_write_cell(&ictx->ctx, &ictx->cell, NULL);
} }
void void

View File

@ -1,4 +1,4 @@
/* $Id: screen-write.c,v 1.82 2009-10-23 17:13:10 tcunha Exp $ */ /* $Id: screen-write.c,v 1.83 2009-10-23 17:16:24 tcunha Exp $ */
/* /*
* Copyright (c) 2007 Nicholas Marriott <nicm@users.sourceforge.net> * Copyright (c) 2007 Nicholas Marriott <nicm@users.sourceforge.net>
@ -24,7 +24,8 @@
void screen_write_initctx(struct screen_write_ctx *, struct tty_ctx *, int); void screen_write_initctx(struct screen_write_ctx *, struct tty_ctx *, int);
void screen_write_overwrite(struct screen_write_ctx *); void screen_write_overwrite(struct screen_write_ctx *);
int screen_write_combine(struct screen_write_ctx *, u_char *); int screen_write_combine(
struct screen_write_ctx *, const struct utf8_data *);
/* Initialise writing with a window. */ /* Initialise writing with a window. */
void void
@ -92,10 +93,11 @@ screen_write_cstrlen(int utf8flag, const char *fmt, ...)
size_t printflike2 size_t printflike2
screen_write_strlen(int utf8flag, const char *fmt, ...) screen_write_strlen(int utf8flag, const char *fmt, ...)
{ {
va_list ap; va_list ap;
char *msg; char *msg;
u_char *ptr, utf8buf[4]; struct utf8_data utf8data;
size_t left, size = 0; u_char *ptr;
size_t left, size = 0;
va_start(ap, fmt); va_start(ap, fmt);
xvasprintf(&msg, fmt, ap); xvasprintf(&msg, fmt, ap);
@ -103,24 +105,17 @@ screen_write_strlen(int utf8flag, const char *fmt, ...)
ptr = msg; ptr = msg;
while (*ptr != '\0') { while (*ptr != '\0') {
if (utf8flag && *ptr > 0x7f) { if (utf8flag && *ptr > 0x7f && utf8_open(&utf8data, *ptr)) {
memset(utf8buf, 0xff, sizeof utf8buf); ptr++;
left = strlen(ptr); left = strlen(ptr);
if (*ptr >= 0xc2 && *ptr <= 0xdf && left >= 2) { if (left < utf8data.size - 1)
memcpy(utf8buf, ptr, 2); break;
ptr += 2; while (utf8_append(&utf8data, *ptr))
} else if (*ptr >= 0xe0 && *ptr <= 0xef && left >= 3) {
memcpy(utf8buf, ptr, 3);
ptr += 3;
} else if (*ptr >= 0xf0 && *ptr <= 0xf4 && left >= 4) {
memcpy(utf8buf, ptr, 4);
ptr += 4;
} else {
*utf8buf = *ptr;
ptr++; ptr++;
} ptr++;
size += utf8_width(utf8buf);
size += utf8data.width;
} else { } else {
size++; size++;
ptr++; ptr++;
@ -159,47 +154,38 @@ void
screen_write_vnputs(struct screen_write_ctx *ctx, ssize_t maxlen, screen_write_vnputs(struct screen_write_ctx *ctx, ssize_t maxlen,
struct grid_cell *gc, int utf8flag, const char *fmt, va_list ap) struct grid_cell *gc, int utf8flag, const char *fmt, va_list ap)
{ {
char *msg; char *msg;
u_char *ptr, utf8buf[4]; struct utf8_data utf8data;
size_t left, size = 0; u_char *ptr;
int width; size_t left, size = 0;
xvasprintf(&msg, fmt, ap); xvasprintf(&msg, fmt, ap);
ptr = msg; ptr = msg;
while (*ptr != '\0') { while (*ptr != '\0') {
if (utf8flag && *ptr > 0x7f) { if (utf8flag && *ptr > 0x7f && utf8_open(&utf8data, *ptr)) {
memset(utf8buf, 0xff, sizeof utf8buf); ptr++;
left = strlen(ptr); left = strlen(ptr);
if (*ptr >= 0xc2 && *ptr <= 0xdf && left >= 2) { if (left < utf8data.size - 1)
memcpy(utf8buf, ptr, 2); break;
ptr += 2; while (utf8_append(&utf8data, *ptr))
} else if (*ptr >= 0xe0 && *ptr <= 0xef && left >= 3) {
memcpy(utf8buf, ptr, 3);
ptr += 3;
} else if (*ptr >= 0xf0 && *ptr <= 0xf4 && left >= 4) {
memcpy(utf8buf, ptr, 4);
ptr += 4;
} else {
*utf8buf = *ptr;
ptr++; ptr++;
} ptr++;
width = utf8_width(utf8buf); if (maxlen > 0 &&
if (maxlen > 0 && size + width > (size_t) maxlen) { size + utf8data.width > (size_t) maxlen) {
while (size < (size_t) maxlen) { while (size < (size_t) maxlen) {
screen_write_putc(ctx, gc, ' '); screen_write_putc(ctx, gc, ' ');
size++; size++;
} }
break; break;
} }
size += width; size += utf8data.width;
gc->flags |= GRID_FLAG_UTF8; gc->flags |= GRID_FLAG_UTF8;
screen_write_cell(ctx, gc, utf8buf); screen_write_cell(ctx, gc, &utf8data);
gc->flags &= ~GRID_FLAG_UTF8; gc->flags &= ~GRID_FLAG_UTF8;
} else { } else {
if (maxlen > 0 && size + 1 > (size_t) maxlen) if (maxlen > 0 && size + 1 > (size_t) maxlen)
break; break;
@ -219,11 +205,11 @@ screen_write_cnputs(struct screen_write_ctx *ctx,
ssize_t maxlen, struct grid_cell *gc, int utf8flag, const char *fmt, ...) ssize_t maxlen, struct grid_cell *gc, int utf8flag, const char *fmt, ...)
{ {
struct grid_cell lgc; struct grid_cell lgc;
struct utf8_data utf8data;
va_list ap; va_list ap;
char *msg; char *msg;
u_char *ptr, *last, utf8buf[4]; u_char *ptr, *last;
size_t left, size = 0; size_t left, size = 0;
int width;
va_start(ap, fmt); va_start(ap, fmt);
xvasprintf(&msg, fmt, ap); xvasprintf(&msg, fmt, ap);
@ -247,38 +233,29 @@ screen_write_cnputs(struct screen_write_ctx *ctx,
continue; continue;
} }
if (utf8flag && *ptr > 0x7f) { if (utf8flag && *ptr > 0x7f && utf8_open(&utf8data, *ptr)) {
memset(utf8buf, 0xff, sizeof utf8buf); ptr++;
left = strlen(ptr); left = strlen(ptr);
if (*ptr >= 0xc2 && *ptr <= 0xdf && left >= 2) { if (left < utf8data.size - 1)
memcpy(utf8buf, ptr, 2); break;
ptr += 2; while (utf8_append(&utf8data, *ptr))
} else if (*ptr >= 0xe0 && *ptr <= 0xef && left >= 3) {
memcpy(utf8buf, ptr, 3);
ptr += 3;
} else if (*ptr >= 0xf0 && *ptr <= 0xf4 && left >= 4) {
memcpy(utf8buf, ptr, 4);
ptr += 4;
} else {
*utf8buf = *ptr;
ptr++; ptr++;
} ptr++;
width = utf8_width(utf8buf); if (maxlen > 0 &&
if (maxlen > 0 && size + width > (size_t) maxlen) { size + utf8data.width > (size_t) maxlen) {
while (size < (size_t) maxlen) { while (size < (size_t) maxlen) {
screen_write_putc(ctx, gc, ' '); screen_write_putc(ctx, gc, ' ');
size++; size++;
} }
break; break;
} }
size += width; size += utf8data.width;
lgc.flags |= GRID_FLAG_UTF8; lgc.flags |= GRID_FLAG_UTF8;
screen_write_cell(ctx, &lgc, utf8buf); screen_write_cell(ctx, &lgc, &utf8data);
lgc.flags &= ~GRID_FLAG_UTF8; lgc.flags &= ~GRID_FLAG_UTF8;
} else { } else {
if (maxlen > 0 && size + 1 > (size_t) maxlen) if (maxlen > 0 && size + 1 > (size_t) maxlen)
break; break;
@ -375,8 +352,9 @@ screen_write_copy(struct screen_write_ctx *ctx,
struct grid *gd = src->grid; struct grid *gd = src->grid;
struct grid_line *gl; struct grid_line *gl;
const struct grid_cell *gc; const struct grid_cell *gc;
u_char *udata; const struct grid_utf8 *gu;
u_int xx, yy, cx, cy, ax, bx; struct utf8_data utf8data;
u_int xx, yy, cx, cy, ax, bx, i;
cx = s->cx; cx = s->cx;
cy = s->cy; cy = s->cy;
@ -397,21 +375,30 @@ screen_write_copy(struct screen_write_ctx *ctx,
bx = gl->cellsize; bx = gl->cellsize;
else else
bx = px + nx; bx = px + nx;
for (xx = ax; xx < bx; xx++) { for (xx = ax; xx < bx; xx++) {
udata = NULL;
if (xx >= gl->cellsize) if (xx >= gl->cellsize)
gc = &grid_default_cell; gc = &grid_default_cell;
else { else
gc = &gl->celldata[xx]; gc = &gl->celldata[xx];
if (gc->flags & GRID_FLAG_UTF8) if (gc->flags & GRID_FLAG_UTF8) {
udata = gl->utf8data[xx].data; gu = &gl->utf8data[xx];
memcpy(utf8data.data,
gu->data, sizeof utf8data.data);
utf8data.width = gu->width;
utf8data.size = 0;
for (i = 0; i < UTF8_SIZE; i++) {
if (gu->data[i] == 0xff)
break;
utf8data.size++;
}
} }
screen_write_cell(ctx, gc, udata); screen_write_cell(ctx, gc, &utf8data);
} }
if (px + nx == gd->sx && px + nx > gl->cellsize) if (px + nx == gd->sx && px + nx > gl->cellsize)
screen_write_clearendofline(ctx); screen_write_clearendofline(ctx);
} else } else
screen_write_clearline(ctx); screen_write_clearline(ctx);
cy++; cy++;
screen_write_cursormove(ctx, cx, cy); screen_write_cursormove(ctx, cx, cy);
} }
@ -972,8 +959,8 @@ screen_write_clearscreen(struct screen_write_ctx *ctx)
/* Write cell data. */ /* Write cell data. */
void void
screen_write_cell( screen_write_cell(struct screen_write_ctx *ctx,
struct screen_write_ctx *ctx, const struct grid_cell *gc, u_char *udata) const struct grid_cell *gc, const struct utf8_data *utf8data)
{ {
struct screen *s = ctx->s; struct screen *s = ctx->s;
struct grid *gd = s->grid; struct grid *gd = s->grid;
@ -988,12 +975,9 @@ screen_write_cell(
return; return;
/* Find character width. */ /* Find character width. */
if (gc->flags & GRID_FLAG_UTF8) { if (gc->flags & GRID_FLAG_UTF8)
width = utf8_width(udata); width = utf8data->width;
else
gu.width = width;
memcpy(&gu.data, udata, sizeof gu.data);
} else
width = 1; width = 1;
/* /*
@ -1009,7 +993,7 @@ screen_write_cell(
* there is space. * there is space.
*/ */
if (width == 0) { if (width == 0) {
if (screen_write_combine(ctx, udata) == 0) { if (screen_write_combine(ctx, utf8data) == 0) {
screen_write_initctx(ctx, &ttyctx, 0); screen_write_initctx(ctx, &ttyctx, 0);
tty_write(tty_cmd_utf8character, &ttyctx); tty_write(tty_cmd_utf8character, &ttyctx);
} }
@ -1028,11 +1012,6 @@ screen_write_cell(
/* Check this will fit on the current line and wrap if not. */ /* Check this will fit on the current line and wrap if not. */
if (s->cx > screen_size_x(s) - width) { if (s->cx > screen_size_x(s) - width) {
/*
* Don't update the terminal now, just update the screen and
* leave the cursor to scroll naturally, unless this is only
* part of the screen width.
*/
screen_write_linefeed(ctx, 1); screen_write_linefeed(ctx, 1);
s->cx = 0; /* carriage return */ s->cx = 0; /* carriage return */
} }
@ -1056,8 +1035,15 @@ screen_write_cell(
/* Set the cell. */ /* Set the cell. */
grid_view_set_cell(gd, s->cx, s->cy, gc); grid_view_set_cell(gd, s->cx, s->cy, gc);
if (gc->flags & GRID_FLAG_UTF8) if (gc->flags & GRID_FLAG_UTF8) {
/* Construct UTF-8 and write it. */
gu.width = utf8data->width;
memset(gu.data, 0xff, sizeof gu.data);
if (utf8data->size > sizeof gu.data)
fatalx("UTF-8 data overflow");
memcpy(gu.data, utf8data->data, utf8data->size);
grid_view_set_utf8(gd, s->cx, s->cy, &gu); grid_view_set_utf8(gd, s->cx, s->cy, &gu);
}
/* Move the cursor. */ /* Move the cursor. */
s->cx += width; s->cx += width;
@ -1085,13 +1071,14 @@ screen_write_cell(
/* Combine a UTF-8 zero-width character onto the previous. */ /* Combine a UTF-8 zero-width character onto the previous. */
int int
screen_write_combine(struct screen_write_ctx *ctx, u_char *udata) screen_write_combine(
struct screen_write_ctx *ctx, const struct utf8_data *utf8data)
{ {
struct screen *s = ctx->s; struct screen *s = ctx->s;
struct grid *gd = s->grid; struct grid *gd = s->grid;
struct grid_cell *gc; struct grid_cell *gc;
struct grid_utf8 *gu, tmp_gu; struct grid_utf8 *gu, tmp_gu;
u_int i, old_size, new_size; u_int i, old_size;
/* Can't combine if at 0. */ /* Can't combine if at 0. */
if (s->cx == 0) if (s->cx == 0)
@ -1108,23 +1095,15 @@ screen_write_combine(struct screen_write_ctx *ctx, u_char *udata)
gc->flags |= GRID_FLAG_UTF8; gc->flags |= GRID_FLAG_UTF8;
} }
/* Get the previous cell's UTF-8 data. */ /* Get the previous cell's UTF-8 data and its size. */
gu = grid_view_get_utf8(gd, s->cx - 1, s->cy); gu = grid_view_get_utf8(gd, s->cx - 1, s->cy);
/* Find the new size. */
for (new_size = 0; new_size < UTF8_SIZE; new_size++) {
if (udata[new_size] == 0xff)
break;
}
/* And the old size. */
for (old_size = 0; old_size < UTF8_SIZE; old_size++) { for (old_size = 0; old_size < UTF8_SIZE; old_size++) {
if (gu->data[old_size] == 0xff) if (gu->data[old_size] == 0xff)
break; break;
} }
/* If there isn't space, scrap this character. */ /* If there isn't space, scrap this character. */
if (old_size + new_size > UTF8_SIZE) { if (old_size + utf8data->size > UTF8_SIZE) {
for (i = 0; i < gu->width && i != UTF8_SIZE; i++) for (i = 0; i < gu->width && i != UTF8_SIZE; i++)
gu->data[i] = '_'; gu->data[i] = '_';
if (i != UTF8_SIZE) if (i != UTF8_SIZE)
@ -1133,9 +1112,9 @@ screen_write_combine(struct screen_write_ctx *ctx, u_char *udata)
} }
/* Otherwise save the character. */ /* Otherwise save the character. */
memcpy(gu->data + old_size, udata, new_size); memcpy(gu->data + old_size, utf8data->data, utf8data->size);
if (old_size + new_size != UTF8_SIZE) if (old_size + utf8data->size != UTF8_SIZE)
gu->data[old_size + new_size] = 0xff; gu->data[old_size + utf8data->size] = 0xff;
return (0); return (0);
} }

31
tmux.h
View File

@ -1,4 +1,4 @@
/* $Id: tmux.h,v 1.480 2009-10-23 17:13:10 tcunha Exp $ */ /* $Id: tmux.h,v 1.481 2009-10-23 17:16:24 tcunha Exp $ */
/* /*
* Copyright (c) 2007 Nicholas Marriott <nicm@users.sourceforge.net> * Copyright (c) 2007 Nicholas Marriott <nicm@users.sourceforge.net>
@ -475,6 +475,23 @@ struct mode_key_table {
#define MODE_KKEYPAD 0x8 #define MODE_KKEYPAD 0x8
#define MODE_MOUSE 0x10 #define MODE_MOUSE 0x10
/*
* A single UTF-8 character.
*
* The data member in this must be UTF8_SIZE to allow screen_write_copy to
* reinject stored UTF-8 data back into screen_write_cell after combining (ugh
* XXX XXX).
*/
#define UTF8_SIZE 9
struct utf8_data {
/* Raw bytes of the sequence; utf8_append() stores each incoming byte here. */
u_char data[UTF8_SIZE];
/* Number of bytes stored in data so far (advanced by utf8_append()). */
size_t have;
/* Total sequence length in bytes; utf8_open() derives it from the lead byte. */
size_t size;
/* Display width; utf8_append() sets it from utf8_width() once have == size. */
u_int width;
};
/* Grid output. */ /* Grid output. */
#if defined(DEBUG) && \ #if defined(DEBUG) && \
((defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ ((defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
@ -514,7 +531,6 @@ struct grid_cell {
} __packed; } __packed;
/* Grid cell UTF-8 data. Used instead of data in grid_cell for UTF-8 cells. */ /* Grid cell UTF-8 data. Used instead of data in grid_cell for UTF-8 cells. */
#define UTF8_SIZE 9
struct grid_utf8 { struct grid_utf8 {
u_char width; u_char width;
u_char data[UTF8_SIZE]; u_char data[UTF8_SIZE];
@ -670,9 +686,7 @@ struct input_ctx {
#define STRING_APPLICATION 1 #define STRING_APPLICATION 1
#define STRING_NAME 2 #define STRING_NAME 2
u_char utf8_buf[4]; struct utf8_data utf8data;
u_int utf8_len;
u_int utf8_off;
u_char intermediate; u_char intermediate;
void *(*state)(u_char, struct input_ctx *); void *(*state)(u_char, struct input_ctx *);
@ -1680,8 +1694,8 @@ void screen_write_kkeypadmode(struct screen_write_ctx *, int);
void screen_write_clearendofscreen(struct screen_write_ctx *); void screen_write_clearendofscreen(struct screen_write_ctx *);
void screen_write_clearstartofscreen(struct screen_write_ctx *); void screen_write_clearstartofscreen(struct screen_write_ctx *);
void screen_write_clearscreen(struct screen_write_ctx *); void screen_write_clearscreen(struct screen_write_ctx *);
void screen_write_cell( void screen_write_cell(struct screen_write_ctx *,
struct screen_write_ctx *, const struct grid_cell *, u_char *); const struct grid_cell *, const struct utf8_data *);
/* screen-redraw.c */ /* screen-redraw.c */
void screen_redraw_screen(struct client *, int); void screen_redraw_screen(struct client *, int);
@ -1836,7 +1850,8 @@ void session_group_synchronize1(struct session *, struct session *);
/* utf8.c */ /* utf8.c */
void utf8_build(void); void utf8_build(void);
int utf8_width(const u_char *); int utf8_open(struct utf8_data *, u_char);
int utf8_append(struct utf8_data *, u_char);
/* osdep-*.c */ /* osdep-*.c */
char *osdep_get_name(int, char *); char *osdep_get_name(int, char *);

7
tty.c
View File

@ -1,4 +1,4 @@
/* $Id: tty.c,v 1.157 2009-10-23 17:13:10 tcunha Exp $ */ /* $Id: tty.c,v 1.158 2009-10-23 17:16:24 tcunha Exp $ */
/* /*
* Copyright (c) 2007 Nicholas Marriott <nicm@users.sourceforge.net> * Copyright (c) 2007 Nicholas Marriott <nicm@users.sourceforge.net>
@ -347,7 +347,7 @@ tty_putc(struct tty *tty, u_char ch)
void void
tty_pututf8(struct tty *tty, const struct grid_utf8 *gu) tty_pututf8(struct tty *tty, const struct grid_utf8 *gu)
{ {
u_int i, width; u_int i;
for (i = 0; i < UTF8_SIZE; i++) { for (i = 0; i < UTF8_SIZE; i++) {
if (gu->data[i] == 0xff) if (gu->data[i] == 0xff)
@ -357,8 +357,7 @@ tty_pututf8(struct tty *tty, const struct grid_utf8 *gu)
write(tty->log_fd, &gu->data[i], 1); write(tty->log_fd, &gu->data[i], 1);
} }
width = utf8_width(gu->data); tty->cx += gu->width;
tty->cx += width;
} }
void void

118
utf8.c
View File

@ -1,4 +1,4 @@
/* $Id: utf8.c,v 1.9 2009-06-25 16:21:32 nicm Exp $ */ /* $Id: utf8.c,v 1.10 2009-10-23 17:16:25 tcunha Exp $ */
/* /*
* Copyright (c) 2008 Nicholas Marriott <nicm@users.sourceforge.net> * Copyright (c) 2008 Nicholas Marriott <nicm@users.sourceforge.net>
@ -196,9 +196,56 @@ struct utf8_width_entry utf8_width_table[] = {
struct utf8_width_entry *utf8_width_root = NULL; struct utf8_width_entry *utf8_width_root = NULL;
int utf8_overlap(struct utf8_width_entry *, struct utf8_width_entry *); int utf8_overlap(struct utf8_width_entry *, struct utf8_width_entry *);
void utf8_print(struct utf8_width_entry *, int); u_int utf8_combine(const struct utf8_data *);
u_int utf8_combine(const u_char *); u_int utf8_width(const struct utf8_data *);
/*
* Open UTF-8 sequence.
*
* 11000010-11011111 C2-DF start of 2-byte sequence
* 11100000-11101111 E0-EF start of 3-byte sequence
* 11110000-11110100 F0-F4 start of 4-byte sequence
*
* Returns 1 if more UTF-8 to come, 0 if not UTF-8.
*/
int
utf8_open(struct utf8_data *utf8data, u_char ch)
{
	size_t	expected;

	/* Start from a clean slate: no bytes collected, size and width zero. */
	memset(utf8data, 0, sizeof *utf8data);

	/* Anything outside C2-F4 cannot start a sequence. */
	if (ch < 0xc2 || ch > 0xf4)
		return (0);

	/* The lead byte alone decides how long the sequence will be. */
	if (ch <= 0xdf)
		expected = 2;
	else if (ch <= 0xef)
		expected = 3;
	else
		expected = 4;
	utf8data->size = expected;

	/* The lead byte is itself the first byte of the sequence. */
	utf8_append(utf8data, ch);
	return (1);
}
/*
* Append character to UTF-8, closing if finished.
*
* Returns 1 if more UTF-8 data to come, 0 if finished.
*/
int
utf8_append(struct utf8_data *utf8data, u_char ch)
{
	/* Sanity checks: never store beyond the announced length or the buffer. */
	if (utf8data->have >= utf8data->size)
		fatalx("UTF-8 character overflow");
	if (utf8data->size > sizeof utf8data->data)
		fatalx("UTF-8 character size too large");

	utf8data->data[utf8data->have] = ch;
	utf8data->have++;

	if (utf8data->have == utf8data->size) {
		/* Sequence complete: record its display width. */
		utf8data->width = utf8_width(utf8data);
		return (0);
	}
	return (1);	/* still waiting for more bytes */
}
/* Check if two width tree entries overlap. */
int int
utf8_overlap( utf8_overlap(
struct utf8_width_entry *item1, struct utf8_width_entry *item2) struct utf8_width_entry *item1, struct utf8_width_entry *item2)
@ -214,6 +261,7 @@ utf8_overlap(
return (0); return (0);
} }
/* Build UTF-8 width tree. */
void void
utf8_build(void) utf8_build(void)
{ {
@ -240,52 +288,50 @@ utf8_build(void)
} }
} }
void /* Combine UTF-8 into 32-bit Unicode. */
utf8_print(struct utf8_width_entry *node, int n)
{
log_debug("%*s%04x -> %04x", n, " ", node->first, node->last);
if (node->left != NULL)
utf8_print(node->left, n + 1);
if (node->right != NULL)
utf8_print(node->right, n + 1);
}
u_int u_int
utf8_combine(const u_char *data) utf8_combine(const struct utf8_data *utf8data)
{ {
u_int uvalue; u_int value;
if (data[1] == 0xff) value = 0xff;
uvalue = data[0]; switch (utf8data->size) {
else if (data[2] == 0xff) { case 1:
uvalue = data[1] & 0x3f; value = utf8data->data[0];
uvalue |= (data[0] & 0x1f) << 6; break;
} else if (data[3] == 0xff) { case 2:
uvalue = data[2] & 0x3f; value = utf8data->data[1] & 0x3f;
uvalue |= (data[1] & 0x3f) << 6; value |= (utf8data->data[0] & 0x1f) << 6;
uvalue |= (data[0] & 0x0f) << 12; break;
} else { case 3:
uvalue = data[3] & 0x3f; value = utf8data->data[2] & 0x3f;
uvalue |= (data[2] & 0x3f) << 6; value |= (utf8data->data[1] & 0x3f) << 6;
uvalue |= (data[1] & 0x3f) << 12; value |= (utf8data->data[0] & 0x0f) << 12;
uvalue |= (data[0] & 0x3f) << 18; break;
case 4:
value = utf8data->data[3] & 0x3f;
value |= (utf8data->data[2] & 0x3f) << 6;
value |= (utf8data->data[1] & 0x3f) << 12;
value |= (utf8data->data[0] & 0x3f) << 18;
break;
} }
return (uvalue); return (value);
} }
int /* Lookup width of UTF-8 data in tree. */
utf8_width(const u_char *udata) u_int
utf8_width(const struct utf8_data *utf8data)
{ {
struct utf8_width_entry *item; struct utf8_width_entry *item;
u_int uvalue; u_int value;
uvalue = utf8_combine(udata); value = utf8_combine(utf8data);
item = utf8_width_root; item = utf8_width_root;
while (item != NULL) { while (item != NULL) {
if (uvalue < item->first) if (value < item->first)
item = item->left; item = item->left;
else if (uvalue > item->last) else if (value > item->last)
item = item->right; item = item->right;
else else
return (item->width); return (item->width);