Change UTF-8 combining to inspect the previous character at the cursor
position rather than keeping the last character from the input stream.
This is how most terminals work, and it fixes problems with displaying
these characters in vim. GitHub issue 3600.
This commit is contained in:
nicm 2023-09-15 15:49:05 +00:00
parent d394293ba5
commit f09cde2542
6 changed files with 367 additions and 1219 deletions

View File

@ -32,8 +32,8 @@ static void screen_write_collect_flush(struct screen_write_ctx *, int,
const char *); const char *);
static int screen_write_overwrite(struct screen_write_ctx *, static int screen_write_overwrite(struct screen_write_ctx *,
struct grid_cell *, u_int); struct grid_cell *, u_int);
static const struct grid_cell *screen_write_combine(struct screen_write_ctx *, static int screen_write_combine(struct screen_write_ctx *,
const struct utf8_data *, u_int *, u_int *); const struct grid_cell *);
struct screen_write_citem { struct screen_write_citem {
u_int x; u_int x;
@ -1742,7 +1742,6 @@ screen_write_collect_end(struct screen_write_ctx *ctx)
if (ci->used == 0) if (ci->used == 0)
return; return;
ctx->flags &= ~SCREEN_WRITE_COMBINE;
before = screen_write_collect_trim(ctx, s->cy, s->cx, ci->used, before = screen_write_collect_trim(ctx, s->cy, s->cx, ci->used,
&wrapped); &wrapped);
@ -1841,65 +1840,22 @@ screen_write_cell(struct screen_write_ctx *ctx, const struct grid_cell *gc)
{ {
struct screen *s = ctx->s; struct screen *s = ctx->s;
struct grid *gd = s->grid; struct grid *gd = s->grid;
struct grid_cell copy; const struct utf8_data *ud = &gc->data;
const struct utf8_data *ud = &gc->data, *previous = NULL, *combine;
struct grid_line *gl; struct grid_line *gl;
struct grid_cell_entry *gce; struct grid_cell_entry *gce;
struct grid_cell tmp_gc, now_gc; struct grid_cell tmp_gc, now_gc;
struct tty_ctx ttyctx; struct tty_ctx ttyctx;
u_int sx = screen_size_x(s), sy = screen_size_y(s); u_int sx = screen_size_x(s), sy = screen_size_y(s);
u_int width = ud->width, xx, last, cx, cy; u_int width = ud->width, xx, not_wrap;
int selected, skip = 1; int selected, skip = 1;
/* Ignore padding cells. */ /* Ignore padding cells. */
if (gc->flags & GRID_FLAG_PADDING) if (gc->flags & GRID_FLAG_PADDING)
return; return;
/* Check if this cell needs to be combined with the previous cell. */ /* Get the previous cell to check for combining. */
if (ctx->flags & SCREEN_WRITE_COMBINE) if (screen_write_combine(ctx, gc) != 0)
previous = &ctx->previous;
switch (utf8_try_combined(ud, previous, &combine, &width)) {
case UTF8_DISCARD_NOW:
log_debug("%s: UTF8_DISCARD_NOW (width %u)", __func__, width);
ctx->flags &= ~SCREEN_WRITE_COMBINE;
return; return;
case UTF8_WRITE_NOW:
log_debug("%s: UTF8_WRITE_NOW (width %u)", __func__, width);
ctx->flags &= ~SCREEN_WRITE_COMBINE;
break;
case UTF8_COMBINE_NOW:
log_debug("%s: UTF8_COMBINE_NOW (width %u)", __func__, width);
screen_write_collect_flush(ctx, 0, __func__);
gc = screen_write_combine(ctx, combine, &xx, &cx);
if (gc != NULL) {
cx = s->cx; cy = s->cy;
screen_write_set_cursor(ctx, xx, s->cy);
screen_write_initctx(ctx, &ttyctx, 0);
ttyctx.cell = gc;
tty_write(tty_cmd_cell, &ttyctx);
s->cx = cx; s->cy = cy;
}
ctx->flags &= ~SCREEN_WRITE_COMBINE;
return;
case UTF8_WRITE_MAYBE_COMBINE:
log_debug("%s: UTF8_WRITE_MAYBE_COMBINE (width %u)", __func__,
width);
utf8_copy(&ctx->previous, ud);
ctx->flags |= SCREEN_WRITE_COMBINE;
break;
case UTF8_DISCARD_MAYBE_COMBINE:
log_debug("%s: UTF8_DISCARD_MAYBE_COMBINE (width %u)", __func__,
width);
utf8_copy(&ctx->previous, ud);
ctx->flags |= SCREEN_WRITE_COMBINE;
return;
}
if (width != ud->width) {
memcpy(&copy, gc, sizeof copy);
copy.data.width = width;
gc = &copy;
}
ud = NULL;
/* Flush any existing scrolling. */ /* Flush any existing scrolling. */
screen_write_collect_flush(ctx, 1, __func__); screen_write_collect_flush(ctx, 1, __func__);
@ -1991,11 +1947,11 @@ screen_write_cell(struct screen_write_ctx *ctx, const struct grid_cell *gc)
* Move the cursor. If not wrapping, stick at the last character and * Move the cursor. If not wrapping, stick at the last character and
* replace it. * replace it.
*/ */
last = !(s->mode & MODE_WRAP); not_wrap = !(s->mode & MODE_WRAP);
if (s->cx <= sx - last - width) if (s->cx <= sx - not_wrap - width)
screen_write_set_cursor(ctx, s->cx + width, -1); screen_write_set_cursor(ctx, s->cx + width, -1);
else else
screen_write_set_cursor(ctx, sx - last, -1); screen_write_set_cursor(ctx, sx - not_wrap, -1);
/* Create space for character in insert mode. */ /* Create space for character in insert mode. */
if (s->mode & MODE_INSERT) { if (s->mode & MODE_INSERT) {
@ -2015,65 +1971,98 @@ screen_write_cell(struct screen_write_ctx *ctx, const struct grid_cell *gc)
} }
} }
/* Combine a UTF-8 zero-width character onto the previous. */ /* Combine a UTF-8 zero-width character onto the previous if necessary. */
static const struct grid_cell * static int
screen_write_combine(struct screen_write_ctx *ctx, const struct utf8_data *ud, screen_write_combine(struct screen_write_ctx *ctx, const struct grid_cell *gc)
u_int *xx, u_int *cx)
{ {
struct screen *s = ctx->s; struct screen *s = ctx->s;
struct grid *gd = s->grid; struct grid *gd = s->grid;
static struct grid_cell gc; const struct utf8_data *ud = &gc->data;
u_int n, width; u_int n, cx = s->cx, cy = s->cy;
struct grid_cell last;
struct tty_ctx ttyctx;
int force_wide = 0, zero_width = 0;
/* Can't combine if at 0. */ /*
if (s->cx == 0) { * Is this a character which makes no sense without being combined? If
*xx = 0; * this is true then flag it here and discard the character (return 1)
return (NULL); * if we cannot combine it.
*/
if (utf8_is_zwj(ud))
zero_width = 1;
else if (utf8_is_vs(ud))
zero_width = force_wide = 1;
else if (ud->width == 0)
zero_width = 1;
/* Cannot combine empty character or at left. */
if (ud->size < 2 || cx == 0)
return (zero_width);
log_debug("%s: character %.*s at %u,%u (width %u)", __func__,
(int)ud->size, ud->data, cx, cy, ud->width);
/* Find the cell to combine with. */
n = 1;
grid_view_get_cell(gd, cx - n, cy, &last);
if (cx != 1 && (last.flags & GRID_FLAG_PADDING)) {
n = 2;
grid_view_get_cell(gd, cx - n, cy, &last);
} }
*xx = s->cx; if (n != last.data.width || (last.flags & GRID_FLAG_PADDING))
return (zero_width);
/* Empty data is out. */ /*
if (ud->size == 0) * Check if we need to combine characters. This could be zero width
fatalx("UTF-8 data empty"); * (set above), a modifier character (with an existing Unicode
* character) or a previous ZWJ.
/* Retrieve the previous cell. */ */
for (n = 1; n <= s->cx; n++) { if (!zero_width) {
grid_view_get_cell(gd, s->cx - n, s->cy, &gc); if (utf8_is_modifier(ud)) {
if (~gc.flags & GRID_FLAG_PADDING) if (last.data.size < 2)
break; return (0);
force_wide = 1;
} else if (!utf8_has_zwj(&last.data))
return (0);
} }
if (n > s->cx)
return (NULL);
/* Check there is enough space. */ /* Combining; flush any pending output. */
if (gc.data.size + ud->size > sizeof gc.data.data) screen_write_collect_flush(ctx, 0, __func__);
return (NULL);
(*xx) -= n;
log_debug("%s: %.*s onto %.*s at %u,%u (width %u)", __func__, log_debug("%s: %.*s -> %.*s at %u,%u (offset %u, width %u)", __func__,
(int)ud->size, ud->data, (int)gc.data.size, gc.data.data, *xx, (int)ud->size, ud->data, (int)last.data.size, last.data.data,
s->cy, gc.data.width); cx - n, cy, n, last.data.width);
/* Append the data. */ /* Append the data. */
memcpy(gc.data.data + gc.data.size, ud->data, ud->size); memcpy(last.data.data + last.data.size, ud->data, ud->size);
gc.data.size += ud->size; last.data.size += ud->size;
width = gc.data.width;
/* If this is U+FE0F VARIATION SELECTOR-16, force the width to 2. */ /* Force the width to 2 for modifiers and variation selector. */
if (gc.data.width == 1 && if (last.data.width == 1 && force_wide) {
ud->size == 3 && last.data.width = 2;
memcmp(ud->data, "\357\270\217", 3) == 0) { n = 2;
grid_view_set_padding(gd, (*xx) + 1, s->cy); cx++;
gc.data.width = 2; } else
width += 2; force_wide = 0;
}
/* Set the new cell. */ /* Set the new cell. */
grid_view_set_cell(gd, *xx, s->cy, &gc); grid_view_set_cell(gd, cx - n, cy, &last);
if (force_wide)
grid_view_set_padding(gd, cx, cy);
*cx = (*xx) + width; /*
log_debug("%s: character at %u; cursor at %u", __func__, *xx, *cx); * Redraw the combined cell. If forcing the cell to width 2, reset the
return (&gc); * cached cursor position in the tty, since we don't really know
* whether the terminal thought the character was width 1 or width 2
* and what it is going to do now.
*/
screen_write_set_cursor(ctx, cx - n, cy);
screen_write_initctx(ctx, &ttyctx, 0);
ttyctx.cell = &last;
ttyctx.num = force_wide; /* reset cached cursor position */
tty_write(tty_cmd_cell, &ttyctx);
screen_write_set_cursor(ctx, cx, cy);
return (1);
} }
/* /*

View File

@ -205,7 +205,6 @@ server_start(struct tmuxproc *client, int flags, struct event_base *base,
fatal("pledge failed"); fatal("pledge failed");
input_key_build(); input_key_build();
utf8_build_combined();
RB_INIT(&windows); RB_INIT(&windows);
RB_INIT(&all_window_panes); RB_INIT(&all_window_panes);
TAILQ_INIT(&clients); TAILQ_INIT(&clients);

22
tmux.h
View File

@ -30,6 +30,7 @@
#include <stdint.h> #include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include <termios.h> #include <termios.h>
#include <wchar.h>
#include "tmux-protocol.h" #include "tmux-protocol.h"
#include "xmalloc.h" #include "xmalloc.h"
@ -619,15 +620,6 @@ enum utf8_state {
UTF8_ERROR UTF8_ERROR
}; };
/* UTF-8 combine state. */
enum utf8_combine_state {
UTF8_DISCARD_NOW, /* discard immediately */
UTF8_WRITE_NOW, /* do not combine, write immediately */
UTF8_COMBINE_NOW, /* combine immediately */
UTF8_WRITE_MAYBE_COMBINE, /* write but try to combine the next */
UTF8_DISCARD_MAYBE_COMBINE /* discard but try to combine the next */
};
/* Colour flags. */ /* Colour flags. */
#define COLOUR_FLAG_256 0x01000000 #define COLOUR_FLAG_256 0x01000000
#define COLOUR_FLAG_RGB 0x02000000 #define COLOUR_FLAG_RGB 0x02000000
@ -900,7 +892,6 @@ struct screen_write_ctx {
int flags; int flags;
#define SCREEN_WRITE_SYNC 0x1 #define SCREEN_WRITE_SYNC 0x1
#define SCREEN_WRITE_COMBINE 0x2
screen_write_init_ctx_cb init_ctx_cb; screen_write_init_ctx_cb init_ctx_cb;
void *arg; void *arg;
@ -908,7 +899,6 @@ struct screen_write_ctx {
struct screen_write_citem *item; struct screen_write_citem *item;
u_int scrolled; u_int scrolled;
u_int bg; u_int bg;
struct utf8_data previous;
}; };
/* Box border lines option. */ /* Box border lines option. */
@ -3277,6 +3267,8 @@ u_int session_group_attached_count(struct session_group *);
void session_renumber_windows(struct session *); void session_renumber_windows(struct session *);
/* utf8.c */ /* utf8.c */
enum utf8_state utf8_towc (const struct utf8_data *, wchar_t *);
int utf8_in_table(wchar_t, const wchar_t *, u_int);
utf8_char utf8_build_one(u_char); utf8_char utf8_build_one(u_char);
enum utf8_state utf8_from_data(const struct utf8_data *, utf8_char *); enum utf8_state utf8_from_data(const struct utf8_data *, utf8_char *);
void utf8_to_data(utf8_char, struct utf8_data *); void utf8_to_data(utf8_char, struct utf8_data *);
@ -3299,10 +3291,10 @@ char *utf8_rpadcstr(const char *, u_int);
int utf8_cstrhas(const char *, const struct utf8_data *); int utf8_cstrhas(const char *, const struct utf8_data *);
/* utf8-combined.c */ /* utf8-combined.c */
void utf8_build_combined(void); int utf8_has_zwj(const struct utf8_data *);
int utf8_try_combined(const struct utf8_data *, int utf8_is_zwj(const struct utf8_data *);
const struct utf8_data *, const struct utf8_data **, int utf8_is_vs(const struct utf8_data *);
u_int *width); int utf8_is_modifier(const struct utf8_data *);
/* procname.c */ /* procname.c */
char *get_proc_name(int, char *); char *get_proc_name(int, char *);

3
tty.c
View File

@ -2091,6 +2091,9 @@ tty_cmd_cell(struct tty *tty, const struct tty_ctx *ctx)
tty_cell(tty, ctx->cell, &ctx->defaults, ctx->palette, tty_cell(tty, ctx->cell, &ctx->defaults, ctx->palette,
ctx->s->hyperlinks); ctx->s->hyperlinks);
if (ctx->num == 1)
tty_invalidate(tty);
} }
void void

File diff suppressed because it is too large Load Diff

218
utf8.c
View File

@ -23,10 +23,174 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <vis.h> #include <vis.h>
#include <wchar.h>
#include "tmux.h" #include "tmux.h"
/*
 * Characters that utf8_width() reports as width 2 without consulting
 * wcwidth(). The entries must stay in ascending order because the table is
 * searched with bsearch() by utf8_in_table().
 */
static const wchar_t utf8_force_wide[] = {
	0x0261D, 0x026F9, 0x0270A, 0x0270B, 0x0270C, 0x0270D,
	0x1F1E6, 0x1F1E7, 0x1F1E8, 0x1F1E9, 0x1F1EA, 0x1F1EB,
	0x1F1EC, 0x1F1ED, 0x1F1EE, 0x1F1EF, 0x1F1F0, 0x1F1F1,
	0x1F1F2, 0x1F1F3, 0x1F1F4, 0x1F1F5, 0x1F1F6, 0x1F1F7,
	0x1F1F8, 0x1F1F9, 0x1F1FA, 0x1F1FB, 0x1F1FC, 0x1F1FD,
	0x1F1FE, 0x1F1FF, 0x1F385, 0x1F3C2, 0x1F3C3, 0x1F3C4,
	0x1F3C7, 0x1F3CA, 0x1F3CB, 0x1F3CC, 0x1F3FB, 0x1F3FC,
	0x1F3FD, 0x1F3FE, 0x1F3FF, 0x1F442, 0x1F443, 0x1F446,
	0x1F447, 0x1F448, 0x1F449, 0x1F44A, 0x1F44B, 0x1F44C,
	0x1F44D, 0x1F44E, 0x1F44F, 0x1F450, 0x1F466, 0x1F467,
	0x1F468, 0x1F469, 0x1F46B, 0x1F46C, 0x1F46D, 0x1F46E,
	0x1F470, 0x1F471, 0x1F472, 0x1F473, 0x1F474, 0x1F475,
	0x1F476, 0x1F477, 0x1F478, 0x1F47C, 0x1F481, 0x1F482,
	0x1F483, 0x1F485, 0x1F486, 0x1F487, 0x1F48F, 0x1F491,
	0x1F4AA, 0x1F574, 0x1F575, 0x1F57A, 0x1F590, 0x1F595,
	0x1F596, 0x1F645, 0x1F646, 0x1F647, 0x1F64B, 0x1F64C,
	0x1F64D, 0x1F64E, 0x1F64F, 0x1F6A3, 0x1F6B4, 0x1F6B5,
	0x1F6B6, 0x1F6C0, 0x1F6CC, 0x1F90C, 0x1F90F, 0x1F918,
	0x1F919, 0x1F91A, 0x1F91B, 0x1F91C, 0x1F91D, 0x1F91E,
	0x1F91F, 0x1F926, 0x1F930, 0x1F931, 0x1F932, 0x1F933,
	0x1F934, 0x1F935, 0x1F936, 0x1F937, 0x1F938, 0x1F939,
	0x1F93D, 0x1F93E, 0x1F977, 0x1F9B5, 0x1F9B6, 0x1F9B8,
	0x1F9B9, 0x1F9BB, 0x1F9CD, 0x1F9CE, 0x1F9CF, 0x1F9D1,
	0x1F9D2, 0x1F9D3, 0x1F9D4, 0x1F9D5, 0x1F9D6, 0x1F9D7,
	0x1F9D8, 0x1F9D9, 0x1F9DA, 0x1F9DB, 0x1F9DC, 0x1F9DD,
	0x1FAC3, 0x1FAC4, 0x1FAC5, 0x1FAF0, 0x1FAF1, 0x1FAF2,
	0x1FAF3, 0x1FAF4, 0x1FAF5, 0x1FAF6, 0x1FAF7, 0x1FAF8
};
struct utf8_item { struct utf8_item {
RB_ENTRY(utf8_item) index_entry; RB_ENTRY(utf8_item) index_entry;
u_int index; u_int index;
@ -123,6 +287,28 @@ utf8_put_item(const u_char *data, size_t size, u_int *index)
return (0); return (0);
} }
/*
 * Comparison callback handed to bsearch() by utf8_in_table(). Both arguments
 * point at a wchar_t; the result is -1, 0 or 1 when the first is less than,
 * equal to or greater than the second.
 */
static int
utf8_table_cmp(const void *vp1, const void *vp2)
{
	const wchar_t	lhs = *(const wchar_t *)vp1;
	const wchar_t	rhs = *(const wchar_t *)vp2;

	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
	return ((lhs > rhs) - (lhs < rhs));
}
/*
 * Check if character in table. Returns 1 when find is one of the count
 * entries in table and 0 otherwise. The lookup is a bsearch() using
 * utf8_table_cmp(), so table must be sorted in ascending order.
 */
int
utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
{
	const void	*hit;

	hit = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
	if (hit == NULL)
		return (0);
	return (1);
}
/* Get UTF-8 character from data. */ /* Get UTF-8 character from data. */
enum utf8_state enum utf8_state
utf8_from_data(const struct utf8_data *ud, utf8_char *uc) utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
@ -217,16 +403,13 @@ utf8_width(struct utf8_data *ud, int *width)
{ {
wchar_t wc; wchar_t wc;
switch (mbtowc(&wc, ud->data, ud->size)) { if (utf8_towc(ud, &wc) != UTF8_DONE)
case -1:
log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
errno);
mbtowc(NULL, NULL, MB_CUR_MAX);
return (UTF8_ERROR);
case 0:
return (UTF8_ERROR); return (UTF8_ERROR);
if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
*width = 2;
return (UTF8_DONE);
} }
log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)wc);
*width = wcwidth(wc); *width = wcwidth(wc);
log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width); log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
if (*width < 0) { if (*width < 0) {
@ -241,6 +424,23 @@ utf8_width(struct utf8_data *ud, int *width)
return (UTF8_ERROR); return (UTF8_ERROR);
} }
/*
 * Convert UTF-8 character to wide character.
 *
 * The bytes in ud are passed to mbtowc(). On success the wide character is
 * stored in *wc, logged, and UTF8_DONE is returned. A result of 0 is treated
 * as an error. A result of -1 is logged together with errno, and mbtowc() is
 * then called with a null string, which asks it to return to its initial
 * conversion state, before UTF8_ERROR is returned.
 */
enum utf8_state
utf8_towc(const struct utf8_data *ud, wchar_t *wc)
{
	int	n;

	n = mbtowc(wc, ud->data, ud->size);
	if (n == 0)
		return (UTF8_ERROR);
	if (n == -1) {
		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
		    errno);
		mbtowc(NULL, NULL, MB_CUR_MAX);
		return (UTF8_ERROR);
	}

	log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
	return (UTF8_DONE);
}
/* /*
* Open UTF-8 sequence. * Open UTF-8 sequence.
* *