Change UTF-8 combining to inspect the previous character at the cursor

position rather than keeping the last character from the input stream;
this is how most terminals work, and it fixes problems with displaying
these characters in vim. GitHub issue 3600.
This commit is contained in:
nicm
2023-09-15 15:49:05 +00:00
parent d394293ba5
commit f09cde2542
6 changed files with 367 additions and 1219 deletions

218
utf8.c
View File

@ -23,10 +23,174 @@
#include <stdlib.h>
#include <string.h>
#include <vis.h>
#include <wchar.h>
#include "tmux.h"
/*
 * Code points that utf8_width() reports as two cells wide without asking
 * wcwidth(). The entries must stay in ascending order because the table is
 * searched with bsearch() via utf8_in_table().
 */
static const wchar_t utf8_force_wide[] = {
	0x0261D, 0x026F9, 0x0270A, 0x0270B, 0x0270C, 0x0270D,
	0x1F1E6, 0x1F1E7, 0x1F1E8, 0x1F1E9, 0x1F1EA, 0x1F1EB,
	0x1F1EC, 0x1F1ED, 0x1F1EE, 0x1F1EF, 0x1F1F0, 0x1F1F1,
	0x1F1F2, 0x1F1F3, 0x1F1F4, 0x1F1F5, 0x1F1F6, 0x1F1F7,
	0x1F1F8, 0x1F1F9, 0x1F1FA, 0x1F1FB, 0x1F1FC, 0x1F1FD,
	0x1F1FE, 0x1F1FF, 0x1F385, 0x1F3C2, 0x1F3C3, 0x1F3C4,
	0x1F3C7, 0x1F3CA, 0x1F3CB, 0x1F3CC, 0x1F3FB, 0x1F3FC,
	0x1F3FD, 0x1F3FE, 0x1F3FF, 0x1F442, 0x1F443, 0x1F446,
	0x1F447, 0x1F448, 0x1F449, 0x1F44A, 0x1F44B, 0x1F44C,
	0x1F44D, 0x1F44E, 0x1F44F, 0x1F450, 0x1F466, 0x1F467,
	0x1F468, 0x1F469, 0x1F46B, 0x1F46C, 0x1F46D, 0x1F46E,
	0x1F470, 0x1F471, 0x1F472, 0x1F473, 0x1F474, 0x1F475,
	0x1F476, 0x1F477, 0x1F478, 0x1F47C, 0x1F481, 0x1F482,
	0x1F483, 0x1F485, 0x1F486, 0x1F487, 0x1F48F, 0x1F491,
	0x1F4AA, 0x1F574, 0x1F575, 0x1F57A, 0x1F590, 0x1F595,
	0x1F596, 0x1F645, 0x1F646, 0x1F647, 0x1F64B, 0x1F64C,
	0x1F64D, 0x1F64E, 0x1F64F, 0x1F6A3, 0x1F6B4, 0x1F6B5,
	0x1F6B6, 0x1F6C0, 0x1F6CC, 0x1F90C, 0x1F90F, 0x1F918,
	0x1F919, 0x1F91A, 0x1F91B, 0x1F91C, 0x1F91D, 0x1F91E,
	0x1F91F, 0x1F926, 0x1F930, 0x1F931, 0x1F932, 0x1F933,
	0x1F934, 0x1F935, 0x1F936, 0x1F937, 0x1F938, 0x1F939,
	0x1F93D, 0x1F93E, 0x1F977, 0x1F9B5, 0x1F9B6, 0x1F9B8,
	0x1F9B9, 0x1F9BB, 0x1F9CD, 0x1F9CE, 0x1F9CF, 0x1F9D1,
	0x1F9D2, 0x1F9D3, 0x1F9D4, 0x1F9D5, 0x1F9D6, 0x1F9D7,
	0x1F9D8, 0x1F9D9, 0x1F9DA, 0x1F9DB, 0x1F9DC, 0x1F9DD,
	0x1FAC3, 0x1FAC4, 0x1FAC5, 0x1FAF0, 0x1FAF1, 0x1FAF2,
	0x1FAF3, 0x1FAF4, 0x1FAF5, 0x1FAF6, 0x1FAF7, 0x1FAF8
};
struct utf8_item {
RB_ENTRY(utf8_item) index_entry;
u_int index;
@ -123,6 +287,28 @@ utf8_put_item(const u_char *data, size_t size, u_int *index)
return (0);
}
/*
 * Three-way comparison of two wchar_t values, in the form bsearch() expects:
 * returns -1, 0 or 1 when the first value is less than, equal to or greater
 * than the second.
 */
static int
utf8_table_cmp(const void *vp1, const void *vp2)
{
	const wchar_t	lhs = *(const wchar_t *)vp1;
	const wchar_t	rhs = *(const wchar_t *)vp2;

	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
	return ((lhs > rhs) - (lhs < rhs));
}
/*
 * Check if character in table. The table holds count entries and is searched
 * with bsearch(), so it must be sorted in ascending order. Returns nonzero
 * if find is present and zero otherwise.
 */
int
utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
{
	return (bsearch(&find, table, count, sizeof *table,
	    utf8_table_cmp) != NULL);
}
/* Get UTF-8 character from data. */
enum utf8_state
utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
@ -217,16 +403,13 @@ utf8_width(struct utf8_data *ud, int *width)
{
wchar_t wc;
switch (mbtowc(&wc, ud->data, ud->size)) {
case -1:
log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
errno);
mbtowc(NULL, NULL, MB_CUR_MAX);
return (UTF8_ERROR);
case 0:
if (utf8_towc(ud, &wc) != UTF8_DONE)
return (UTF8_ERROR);
if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
*width = 2;
return (UTF8_DONE);
}
log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)wc);
*width = wcwidth(wc);
log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
if (*width < 0) {
@ -241,6 +424,23 @@ utf8_width(struct utf8_data *ud, int *width)
return (UTF8_ERROR);
}
/*
 * Convert UTF-8 character to wide character.
 *
 * Decodes the ud->size bytes in ud->data with mbtowc() and stores the result
 * in *wc. Returns UTF8_DONE on success, or UTF8_ERROR if mbtowc() fails or
 * returns 0.
 */
enum utf8_state
utf8_towc(const struct utf8_data *ud, wchar_t *wc)
{
	int	n;

	n = mbtowc(wc, ud->data, ud->size);
	if (n == 0)
		return (UTF8_ERROR);
	if (n == -1) {
		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
		    errno);
		/* A NULL string argument resets mbtowc()'s shift state. */
		mbtowc(NULL, NULL, MB_CUR_MAX);
		return (UTF8_ERROR);
	}

	log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
	return (UTF8_DONE);
}
/*
* Open UTF-8 sequence.
*