From 1af09d6330ddba2d3ffc9c15d056fe8c4321f17e Mon Sep 17 00:00:00 2001 From: Nicholas Marriott Date: Tue, 20 Oct 2009 19:18:28 +0000 Subject: [PATCH] Try to reduce the UTF-8 mess. Get rid of passing around u_char[4]s and define a struct utf8_data which has character data, size (sequence length) and width. Move UTF-8 character collection into two functions utf8_open/utf8_append in utf8.c which fill in this struct and use these functions from input.c and the various functions in screen-write.c. Space for rather more data than is necessary for one UTF-8 sequence is in the utf8_data struct because screen_write_copy is still nasty and needs to reinject the character (after combining) into screen_write_cell. --- input.c | 40 +++-------- screen-write.c | 185 ++++++++++++++++++++++--------------------------- tmux.h | 29 ++++++-- tty.c | 5 +- utf8.c | 116 +++++++++++++++++++++---------- 5 files changed, 195 insertions(+), 180 deletions(-) diff --git a/input.c b/input.c index fa7661f7..4035b2c8 100644 --- a/input.c +++ b/input.c @@ -572,15 +572,14 @@ input_state_string_escape(u_char ch, struct input_ctx *ictx) void input_state_utf8(u_char ch, struct input_ctx *ictx) { - log_debug2("-- un %zu: %hhu (%c)", ictx->off, ch, ch); + log_debug2("-- utf8 next: %zu: %hhu (%c)", ictx->off, ch, ch); - ictx->utf8_buf[ictx->utf8_off++] = ch; - if (--ictx->utf8_len != 0) - return; + if (utf8_append(&ictx->utf8data, ch)) + return; /* more to come */ input_state(ictx, input_state_first); ictx->cell.flags |= GRID_FLAG_UTF8; - screen_write_cell(&ictx->ctx, &ictx->cell, ictx->utf8_buf); + screen_write_cell(&ictx->ctx, &ictx->cell, &ictx->utf8data); ictx->cell.flags &= ~GRID_FLAG_UTF8; } @@ -590,40 +589,17 @@ input_handle_character(u_char ch, struct input_ctx *ictx) struct window_pane *wp = ictx->wp; if (ch > 0x7f && options_get_number(&wp->window->options, "utf8")) { - /* - * UTF-8 sequence. 
- * - * 11000010-11011111 C2-DF start of 2-byte sequence - * 11100000-11101111 E0-EF start of 3-byte sequence - * 11110000-11110100 F0-F4 start of 4-byte sequence - */ - memset(ictx->utf8_buf, 0xff, sizeof ictx->utf8_buf); - ictx->utf8_buf[0] = ch; - ictx->utf8_off = 1; - - if (ch >= 0xc2 && ch <= 0xdf) { - log_debug2("-- u2 %zu: %hhu (%c)", ictx->off, ch, ch); + if (utf8_open(&ictx->utf8data, ch)) { + log_debug2("-- utf8 size %u: %zu: %hhu (%c)", + ictx->utf8data.size, ictx->off, ch, ch); input_state(ictx, input_state_utf8); - ictx->utf8_len = 1; - return; - } - if (ch >= 0xe0 && ch <= 0xef) { - log_debug2("-- u3 %zu: %hhu (%c)", ictx->off, ch, ch); - input_state(ictx, input_state_utf8); - ictx->utf8_len = 2; - return; - } - if (ch >= 0xf0 && ch <= 0xf4) { - log_debug2("-- u4 %zu: %hhu (%c)", ictx->off, ch, ch); - input_state(ictx, input_state_utf8); - ictx->utf8_len = 3; return; } } log_debug2("-- ch %zu: %hhu (%c)", ictx->off, ch, ch); ictx->cell.data = ch; - screen_write_cell(&ictx->ctx, &ictx->cell, ictx->utf8_buf); + screen_write_cell(&ictx->ctx, &ictx->cell, NULL); } void diff --git a/screen-write.c b/screen-write.c index 9ed8f93b..8f10976f 100644 --- a/screen-write.c +++ b/screen-write.c @@ -24,7 +24,8 @@ void screen_write_initctx(struct screen_write_ctx *, struct tty_ctx *, int); void screen_write_overwrite(struct screen_write_ctx *); -int screen_write_combine(struct screen_write_ctx *, u_char *); +int screen_write_combine( + struct screen_write_ctx *, const struct utf8_data *); /* Initialise writing with a window. */ void @@ -92,10 +93,11 @@ screen_write_cstrlen(int utf8flag, const char *fmt, ...) size_t printflike2 screen_write_strlen(int utf8flag, const char *fmt, ...) 
{ - va_list ap; - char *msg; - u_char *ptr, utf8buf[4]; - size_t left, size = 0; + va_list ap; + char *msg; + struct utf8_data utf8data; + u_char *ptr; + size_t left, size = 0; va_start(ap, fmt); xvasprintf(&msg, fmt, ap); @@ -103,24 +105,17 @@ screen_write_strlen(int utf8flag, const char *fmt, ...) ptr = msg; while (*ptr != '\0') { - if (utf8flag && *ptr > 0x7f) { - memset(utf8buf, 0xff, sizeof utf8buf); + if (utf8flag && *ptr > 0x7f && utf8_open(&utf8data, *ptr)) { + ptr++; left = strlen(ptr); - if (*ptr >= 0xc2 && *ptr <= 0xdf && left >= 2) { - memcpy(utf8buf, ptr, 2); - ptr += 2; - } else if (*ptr >= 0xe0 && *ptr <= 0xef && left >= 3) { - memcpy(utf8buf, ptr, 3); - ptr += 3; - } else if (*ptr >= 0xf0 && *ptr <= 0xf4 && left >= 4) { - memcpy(utf8buf, ptr, 4); - ptr += 4; - } else { - *utf8buf = *ptr; + if (left < utf8data.size - 1) + break; + while (utf8_append(&utf8data, *ptr)) ptr++; - } - size += utf8_width(utf8buf); + ptr++; + + size += utf8data.width; } else { size++; ptr++; @@ -159,47 +154,38 @@ void screen_write_vnputs(struct screen_write_ctx *ctx, ssize_t maxlen, struct grid_cell *gc, int utf8flag, const char *fmt, va_list ap) { - char *msg; - u_char *ptr, utf8buf[4]; - size_t left, size = 0; - int width; + char *msg; + struct utf8_data utf8data; + u_char *ptr; + size_t left, size = 0; xvasprintf(&msg, fmt, ap); ptr = msg; while (*ptr != '\0') { - if (utf8flag && *ptr > 0x7f) { - memset(utf8buf, 0xff, sizeof utf8buf); + if (utf8flag && *ptr > 0x7f && utf8_open(&utf8data, *ptr)) { + ptr++; left = strlen(ptr); - if (*ptr >= 0xc2 && *ptr <= 0xdf && left >= 2) { - memcpy(utf8buf, ptr, 2); - ptr += 2; - } else if (*ptr >= 0xe0 && *ptr <= 0xef && left >= 3) { - memcpy(utf8buf, ptr, 3); - ptr += 3; - } else if (*ptr >= 0xf0 && *ptr <= 0xf4 && left >= 4) { - memcpy(utf8buf, ptr, 4); - ptr += 4; - } else { - *utf8buf = *ptr; + if (left < utf8data.size - 1) + break; + while (utf8_append(&utf8data, *ptr)) ptr++; - } + ptr++; - width = utf8_width(utf8buf); - if 
(maxlen > 0 && size + width > (size_t) maxlen) { + if (maxlen > 0 && + size + utf8data.width > (size_t) maxlen) { while (size < (size_t) maxlen) { screen_write_putc(ctx, gc, ' '); size++; } break; } - size += width; - + size += utf8data.width; + gc->flags |= GRID_FLAG_UTF8; - screen_write_cell(ctx, gc, utf8buf); + screen_write_cell(ctx, gc, &utf8data); gc->flags &= ~GRID_FLAG_UTF8; - } else { if (maxlen > 0 && size + 1 > (size_t) maxlen) break; @@ -219,11 +205,11 @@ screen_write_cnputs(struct screen_write_ctx *ctx, ssize_t maxlen, struct grid_cell *gc, int utf8flag, const char *fmt, ...) { struct grid_cell lgc; + struct utf8_data utf8data; va_list ap; char *msg; - u_char *ptr, *last, utf8buf[4]; + u_char *ptr, *last; size_t left, size = 0; - int width; va_start(ap, fmt); xvasprintf(&msg, fmt, ap); @@ -247,38 +233,29 @@ screen_write_cnputs(struct screen_write_ctx *ctx, continue; } - if (utf8flag && *ptr > 0x7f) { - memset(utf8buf, 0xff, sizeof utf8buf); + if (utf8flag && *ptr > 0x7f && utf8_open(&utf8data, *ptr)) { + ptr++; left = strlen(ptr); - if (*ptr >= 0xc2 && *ptr <= 0xdf && left >= 2) { - memcpy(utf8buf, ptr, 2); - ptr += 2; - } else if (*ptr >= 0xe0 && *ptr <= 0xef && left >= 3) { - memcpy(utf8buf, ptr, 3); - ptr += 3; - } else if (*ptr >= 0xf0 && *ptr <= 0xf4 && left >= 4) { - memcpy(utf8buf, ptr, 4); - ptr += 4; - } else { - *utf8buf = *ptr; + if (left < utf8data.size - 1) + break; + while (utf8_append(&utf8data, *ptr)) ptr++; - } + ptr++; - width = utf8_width(utf8buf); - if (maxlen > 0 && size + width > (size_t) maxlen) { + if (maxlen > 0 && + size + utf8data.width > (size_t) maxlen) { while (size < (size_t) maxlen) { screen_write_putc(ctx, gc, ' '); size++; } break; } - size += width; + size += utf8data.width; lgc.flags |= GRID_FLAG_UTF8; - screen_write_cell(ctx, &lgc, utf8buf); + screen_write_cell(ctx, &lgc, &utf8data); lgc.flags &= ~GRID_FLAG_UTF8; - } else { if (maxlen > 0 && size + 1 > (size_t) maxlen) break; @@ -375,8 +352,9 @@ 
screen_write_copy(struct screen_write_ctx *ctx, struct grid *gd = src->grid; struct grid_line *gl; const struct grid_cell *gc; - u_char *udata; - u_int xx, yy, cx, cy, ax, bx; + const struct grid_utf8 *gu; + struct utf8_data utf8data; + u_int xx, yy, cx, cy, ax, bx, i; cx = s->cx; cy = s->cy; @@ -397,21 +375,30 @@ screen_write_copy(struct screen_write_ctx *ctx, bx = gl->cellsize; else bx = px + nx; + for (xx = ax; xx < bx; xx++) { - udata = NULL; if (xx >= gl->cellsize) gc = &grid_default_cell; - else { + else gc = &gl->celldata[xx]; - if (gc->flags & GRID_FLAG_UTF8) - udata = gl->utf8data[xx].data; + if (gc->flags & GRID_FLAG_UTF8) { + gu = &gl->utf8data[xx]; + memcpy(utf8data.data, + gu->data, sizeof utf8data.data); + utf8data.width = gu->width; + utf8data.size = 0; + for (i = 0; i < UTF8_SIZE; i++) { + if (gu->data[i] == 0xff) + break; + utf8data.size++; + } } - screen_write_cell(ctx, gc, udata); + screen_write_cell(ctx, gc, &utf8data); } if (px + nx == gd->sx && px + nx > gl->cellsize) screen_write_clearendofline(ctx); } else - screen_write_clearline(ctx); + screen_write_clearline(ctx); cy++; screen_write_cursormove(ctx, cx, cy); } @@ -972,8 +959,8 @@ screen_write_clearscreen(struct screen_write_ctx *ctx) /* Write cell data. */ void -screen_write_cell( - struct screen_write_ctx *ctx, const struct grid_cell *gc, u_char *udata) +screen_write_cell(struct screen_write_ctx *ctx, + const struct grid_cell *gc, const struct utf8_data *utf8data) { struct screen *s = ctx->s; struct grid *gd = s->grid; @@ -988,12 +975,9 @@ screen_write_cell( return; /* Find character width. */ - if (gc->flags & GRID_FLAG_UTF8) { - width = utf8_width(udata); - - gu.width = width; - memcpy(&gu.data, udata, sizeof gu.data); - } else + if (gc->flags & GRID_FLAG_UTF8) + width = utf8data->width; + else width = 1; /* @@ -1009,7 +993,7 @@ screen_write_cell( * there is space. 
*/ if (width == 0) { - if (screen_write_combine(ctx, udata) == 0) { + if (screen_write_combine(ctx, utf8data) == 0) { screen_write_initctx(ctx, &ttyctx, 0); tty_write(tty_cmd_utf8character, &ttyctx); } @@ -1028,11 +1012,6 @@ screen_write_cell( /* Check this will fit on the current line and wrap if not. */ if (s->cx > screen_size_x(s) - width) { - /* - * Don't update the terminal now, just update the screen and - * leave the cursor to scroll naturally, unless this is only - * part of the screen width. - */ screen_write_linefeed(ctx, 1); s->cx = 0; /* carriage return */ } @@ -1056,8 +1035,15 @@ screen_write_cell( /* Set the cell. */ grid_view_set_cell(gd, s->cx, s->cy, gc); - if (gc->flags & GRID_FLAG_UTF8) + if (gc->flags & GRID_FLAG_UTF8) { + /* Construct UTF-8 and write it. */ + gu.width = utf8data->width; + memset(gu.data, 0xff, sizeof gu.data); + if (utf8data->size > sizeof gu.data) + fatalx("UTF-8 data overflow"); + memcpy(gu.data, utf8data->data, utf8data->size); grid_view_set_utf8(gd, s->cx, s->cy, &gu); + } /* Move the cursor. */ s->cx += width; @@ -1085,13 +1071,14 @@ screen_write_cell( /* Combine a UTF-8 zero-width character onto the previous. */ int -screen_write_combine(struct screen_write_ctx *ctx, u_char *udata) +screen_write_combine( + struct screen_write_ctx *ctx, const struct utf8_data *utf8data) { struct screen *s = ctx->s; struct grid *gd = s->grid; struct grid_cell *gc; struct grid_utf8 *gu, tmp_gu; - u_int i, old_size, new_size; + u_int i, old_size; /* Can't combine if at 0. */ if (s->cx == 0) @@ -1108,23 +1095,15 @@ screen_write_combine(struct screen_write_ctx *ctx, u_char *udata) gc->flags |= GRID_FLAG_UTF8; } - /* Get the previous cell's UTF-8 data. */ + /* Get the previous cell's UTF-8 data and its size. */ gu = grid_view_get_utf8(gd, s->cx - 1, s->cy); - - /* Find the new size. */ - for (new_size = 0; new_size < UTF8_SIZE; new_size++) { - if (udata[new_size] == 0xff) - break; - } - - /* And the old size. 
*/ for (old_size = 0; old_size < UTF8_SIZE; old_size++) { if (gu->data[old_size] == 0xff) break; } /* If there isn't space, scrap this character. */ - if (old_size + new_size > UTF8_SIZE) { + if (old_size + utf8data->size > UTF8_SIZE) { for (i = 0; i < gu->width && i != UTF8_SIZE; i++) gu->data[i] = '_'; if (i != UTF8_SIZE) @@ -1133,9 +1112,9 @@ screen_write_combine(struct screen_write_ctx *ctx, u_char *udata) } /* Otherwise save the character. */ - memcpy(gu->data + old_size, udata, new_size); - if (old_size + new_size != UTF8_SIZE) - gu->data[old_size + new_size] = 0xff; + memcpy(gu->data + old_size, utf8data->data, utf8data->size); + if (old_size + utf8data->size != UTF8_SIZE) + gu->data[old_size + utf8data->size] = 0xff; return (0); } diff --git a/tmux.h b/tmux.h index a61040b6..850c2f5b 100644 --- a/tmux.h +++ b/tmux.h @@ -477,6 +477,23 @@ struct mode_key_table { #define MODE_KKEYPAD 0x8 #define MODE_MOUSE 0x10 +/* + * A single UTF-8 character. + * + * The data member in this must be UTF8_SIZE to allow screen_write_copy to + * reinject stored UTF-8 data back into screen_write_cell after combining (ugh + * XXX XXX). + */ +#define UTF8_SIZE 9 +struct utf8_data { + u_char data[UTF8_SIZE]; + + size_t have; + size_t size; + + u_int width; +}; + /* Grid output. */ #if defined(DEBUG) && \ ((defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ @@ -516,7 +533,6 @@ struct grid_cell { } __packed; /* Grid cell UTF-8 data. Used instead of data in grid_cell for UTF-8 cells. 
*/ -#define UTF8_SIZE 9 struct grid_utf8 { u_char width; u_char data[UTF8_SIZE]; @@ -672,9 +688,7 @@ struct input_ctx { #define STRING_APPLICATION 1 #define STRING_NAME 2 - u_char utf8_buf[4]; - u_int utf8_len; - u_int utf8_off; + struct utf8_data utf8data; u_char intermediate; void *(*state)(u_char, struct input_ctx *); @@ -1682,8 +1696,8 @@ void screen_write_kkeypadmode(struct screen_write_ctx *, int); void screen_write_clearendofscreen(struct screen_write_ctx *); void screen_write_clearstartofscreen(struct screen_write_ctx *); void screen_write_clearscreen(struct screen_write_ctx *); -void screen_write_cell( - struct screen_write_ctx *, const struct grid_cell *, u_char *); +void screen_write_cell(struct screen_write_ctx *, + const struct grid_cell *, const struct utf8_data *); /* screen-redraw.c */ void screen_redraw_screen(struct client *, int); @@ -1838,7 +1852,8 @@ void session_group_synchronize1(struct session *, struct session *); /* utf8.c */ void utf8_build(void); -int utf8_width(const u_char *); +int utf8_open(struct utf8_data *, u_char); +int utf8_append(struct utf8_data *, u_char); /* procname.c */ char *get_proc_name(int, char *); diff --git a/tty.c b/tty.c index 9ddffb95..ee725edf 100644 --- a/tty.c +++ b/tty.c @@ -342,7 +342,7 @@ tty_putc(struct tty *tty, u_char ch) void tty_pututf8(struct tty *tty, const struct grid_utf8 *gu) { - u_int i, width; + u_int i; for (i = 0; i < UTF8_SIZE; i++) { if (gu->data[i] == 0xff) @@ -352,8 +352,7 @@ tty_pututf8(struct tty *tty, const struct grid_utf8 *gu) write(tty->log_fd, &gu->data[i], 1); } - width = utf8_width(gu->data); - tty->cx += width; + tty->cx += gu->width; } void diff --git a/utf8.c b/utf8.c index 9b499d1c..52cc709e 100644 --- a/utf8.c +++ b/utf8.c @@ -196,9 +196,56 @@ struct utf8_width_entry utf8_width_table[] = { struct utf8_width_entry *utf8_width_root = NULL; int utf8_overlap(struct utf8_width_entry *, struct utf8_width_entry *); -void utf8_print(struct utf8_width_entry *, int); -u_int 
utf8_combine(const u_char *); +u_int utf8_combine(const struct utf8_data *); +u_int utf8_width(const struct utf8_data *); +/* + * Open UTF-8 sequence. + * + * 11000010-11011111 C2-DF start of 2-byte sequence + * 11100000-11101111 E0-EF start of 3-byte sequence + * 11110000-11110100 F0-F4 start of 4-byte sequence + * + * Returns 1 if more UTF-8 to come, 0 if not UTF-8. + */ +int +utf8_open(struct utf8_data *utf8data, u_char ch) +{ + memset(utf8data, 0, sizeof *utf8data); + if (ch >= 0xc2 && ch <= 0xdf) + utf8data->size = 2; + else if (ch >= 0xe0 && ch <= 0xef) + utf8data->size = 3; + else if (ch >= 0xf0 && ch <= 0xf4) + utf8data->size = 4; + else + return (0); + utf8_append(utf8data, ch); + return (1); +} + +/* + * Append character to UTF-8, closing if finished. + * + * Returns 1 if more UTF-8 data to come, 0 if finished. + */ +int +utf8_append(struct utf8_data *utf8data, u_char ch) +{ + if (utf8data->have >= utf8data->size) + fatalx("UTF-8 character overflow"); + if (utf8data->size > sizeof utf8data->data) + fatalx("UTF-8 character size too large"); + + utf8data->data[utf8data->have++] = ch; + if (utf8data->have != utf8data->size) + return (1); + + utf8data->width = utf8_width(utf8data); + return (0); +} + +/* Check if two width tree entries overlap. */ int utf8_overlap( struct utf8_width_entry *item1, struct utf8_width_entry *item2) @@ -214,6 +261,7 @@ utf8_overlap( return (0); } +/* Build UTF-8 width tree. */ void utf8_build(void) { @@ -240,52 +288,50 @@ utf8_build(void) } } -void -utf8_print(struct utf8_width_entry *node, int n) -{ - log_debug("%*s%04x -> %04x", n, " ", node->first, node->last); - if (node->left != NULL) - utf8_print(node->left, n + 1); - if (node->right != NULL) - utf8_print(node->right, n + 1); -} - +/* Combine UTF-8 into 32-bit Unicode. 
*/ u_int -utf8_combine(const u_char *data) +utf8_combine(const struct utf8_data *utf8data) { - u_int uvalue; + u_int value; - if (data[1] == 0xff) - uvalue = data[0]; - else if (data[2] == 0xff) { - uvalue = data[1] & 0x3f; - uvalue |= (data[0] & 0x1f) << 6; - } else if (data[3] == 0xff) { - uvalue = data[2] & 0x3f; - uvalue |= (data[1] & 0x3f) << 6; - uvalue |= (data[0] & 0x0f) << 12; - } else { - uvalue = data[3] & 0x3f; - uvalue |= (data[2] & 0x3f) << 6; - uvalue |= (data[1] & 0x3f) << 12; - uvalue |= (data[0] & 0x3f) << 18; + value = 0xff; + switch (utf8data->size) { + case 1: + value = utf8data->data[0]; + break; + case 2: + value = utf8data->data[1] & 0x3f; + value |= (utf8data->data[0] & 0x1f) << 6; + break; + case 3: + value = utf8data->data[2] & 0x3f; + value |= (utf8data->data[1] & 0x3f) << 6; + value |= (utf8data->data[0] & 0x0f) << 12; + break; + case 4: + value = utf8data->data[3] & 0x3f; + value |= (utf8data->data[2] & 0x3f) << 6; + value |= (utf8data->data[1] & 0x3f) << 12; + value |= (utf8data->data[0] & 0x3f) << 18; + break; } - return (uvalue); + return (value); } -int -utf8_width(const u_char *udata) +/* Lookup width of UTF-8 data in tree. */ +u_int +utf8_width(const struct utf8_data *utf8data) { struct utf8_width_entry *item; - u_int uvalue; + u_int value; - uvalue = utf8_combine(udata); + value = utf8_combine(utf8data); item = utf8_width_root; while (item != NULL) { - if (uvalue < item->first) + if (value < item->first) item = item->left; - else if (uvalue > item->last) + else if (value > item->last) item = item->right; else return (item->width);