From d90b414223021378966f8b6988b725872e237793 Mon Sep 17 00:00:00 2001
From: nicm
Date: Sat, 1 Nov 2025 16:44:24 +0000
Subject: [PATCH] Handle regional indicators and emoji modifiers in a better way, GitHub issue 3998.

---
 screen-write.c  |   6 +--
 tmux.h          |   3 +-
 utf8-combined.c | 124 +++++++++++++++++++++++++++++++++---------------
 utf8.c          |  52 ++++++++++----------
 4 files changed, 117 insertions(+), 68 deletions(-)

diff --git a/screen-write.c b/screen-write.c
index f1b37421..1014ae80 100644
--- a/screen-write.c
+++ b/screen-write.c
@@ -2050,11 +2050,9 @@ screen_write_combine(struct screen_write_ctx *ctx, const struct grid_cell *gc)
 		case HANGULJAMO_STATE_COMPOSABLE:
 			break;
 		case HANGULJAMO_STATE_NOT_HANGULJAMO:
-			if (utf8_is_modifier(ud)) {
-				if (last.data.size < 2)
-					return (0);
+			if (utf8_should_combine(&last.data, ud))
 				force_wide = 1;
-			} else if (!utf8_has_zwj(&last.data))
+			else if (!utf8_has_zwj(&last.data))
 				return (0);
 			break;
 		}
diff --git a/tmux.h b/tmux.h
index 2ec3bf23..aa25a408 100644
--- a/tmux.h
+++ b/tmux.h
@@ -3499,7 +3499,8 @@ int		 utf8_cstrhas(const char *, const struct utf8_data *);
 int		 utf8_has_zwj(const struct utf8_data *);
 int		 utf8_is_zwj(const struct utf8_data *);
 int		 utf8_is_vs(const struct utf8_data *);
-int		 utf8_is_modifier(const struct utf8_data *);
+int		 utf8_should_combine(const struct utf8_data *,
+		     const struct utf8_data *);
 enum hanguljamo_state hanguljamo_check_state(const struct utf8_data *,
 		     const struct utf8_data *);
 
diff --git a/utf8-combined.c b/utf8-combined.c
index 885dd6a4..91ddaf75 100644
--- a/utf8-combined.c
+++ b/utf8-combined.c
@@ -73,49 +73,99 @@ utf8_is_vs(const struct utf8_data *ud)
 	return (memcmp(ud->data, "\357\270\217", 3) == 0);
 }
 
-/* Is this in the modifier table? */
+/* Should the new character (add) combine with the existing one (with)? */
 int
-utf8_is_modifier(const struct utf8_data *ud)
+utf8_should_combine(const struct utf8_data *with, const struct utf8_data *add)
 {
-	wchar_t	wc;
+	wchar_t	w, a;
 
-	if (utf8_towc(ud, &wc) != UTF8_DONE)
+	if (utf8_towc(with, &w) != UTF8_DONE)
 		return (0);
-	switch (wc) {
-	case 0x1F1E6:
-	case 0x1F1E7:
-	case 0x1F1E8:
-	case 0x1F1E9:
-	case 0x1F1EA:
-	case 0x1F1EB:
-	case 0x1F1EC:
-	case 0x1F1ED:
-	case 0x1F1EE:
-	case 0x1F1EF:
-	case 0x1F1F0:
-	case 0x1F1F1:
-	case 0x1F1F2:
-	case 0x1F1F3:
-	case 0x1F1F4:
-	case 0x1F1F5:
-	case 0x1F1F6:
-	case 0x1F1F7:
-	case 0x1F1F8:
-	case 0x1F1F9:
-	case 0x1F1FA:
-	case 0x1F1FB:
-	case 0x1F1FC:
-	case 0x1F1FD:
-	case 0x1F1FE:
-	case 0x1F1FF:
-	case 0x1F3FB:
-	case 0x1F3FC:
-	case 0x1F3FD:
-	case 0x1F3FE:
-	case 0x1F3FF:
+	if (utf8_towc(add, &a) != UTF8_DONE)
+		return (0);
+
+	/* Regional indicators: both characters must be in the range. */
+	if ((a >= 0x1F1E6 && a <= 0x1F1FF) && (w >= 0x1F1E6 && w <= 0x1F1FF))
 		return (1);
+
+	/* Emoji skin tone modifiers: with is the base, add is the modifier. */
+	switch (w) {
+	case 0x1F44B:
+	case 0x1F44C:
+	case 0x1F44D:
+	case 0x1F44E:
+	case 0x1F44F:
+	case 0x1F450:
+	case 0x1F466:
+	case 0x1F467:
+	case 0x1F468:
+	case 0x1F469:
+	case 0x1F46E:
+	case 0x1F470:
+	case 0x1F471:
+	case 0x1F472:
+	case 0x1F473:
+	case 0x1F474:
+	case 0x1F475:
+	case 0x1F476:
+	case 0x1F477:
+	case 0x1F478:
+	case 0x1F47C:
+	case 0x1F481:
+	case 0x1F482:
+	case 0x1F485:
+	case 0x1F486:
+	case 0x1F487:
+	case 0x1F4AA:
+	case 0x1F575:
+	case 0x1F57A:
+	case 0x1F590:
+	case 0x1F595:
+	case 0x1F596:
+	case 0x1F645:
+	case 0x1F646:
+	case 0x1F647:
+	case 0x1F64B:
+	case 0x1F64C:
+	case 0x1F64D:
+	case 0x1F64E:
+	case 0x1F64F:
+	case 0x1F6B4:
+	case 0x1F6B5:
+	case 0x1F6B6:
+	case 0x1F926:
+	case 0x1F937:
+	case 0x1F938:
+	case 0x1F939:
+	case 0x1F93D:
+	case 0x1F93E:
+	case 0x1F9B5:
+	case 0x1F9B6:
+	case 0x1F9B8:
+	case 0x1F9B9:
+	case 0x1F9CD:
+	case 0x1F9CE:
+	case 0x1F9CF:
+	case 0x1F9D1:
+	case 0x1F9D2:
+	case 0x1F9D3:
+	case 0x1F9D4:
+	case 0x1F9D5:
+	case 0x1F9D6:
+	case 0x1F9D7:
+	case 0x1F9D8:
+	case 0x1F9D9:
+	case 0x1F9DA:
+	case 0x1F9DB:
+	case 0x1F9DC:
+	case 0x1F9DD:
+	case 0x1F9DE:
+	case 0x1F9DF:
+		if (a >= 0x1F3FB && a <= 0x1F3FF)
+			return (1);
+		break;
 	}
-	return (0);
+	return (0);
 }
 
 static enum hanguljamo_subclass
diff --git a/utf8.c b/utf8.c
index d9e69df4..e877f2d0 100644
--- a/utf8.c
+++ b/utf8.c
@@ -56,32 +56,32 @@ static struct utf8_width_item utf8_default_width_cache[] = {
 	{ .wc = 0x0270B, .width = 2 },
 	{ .wc = 0x0270C, .width = 2 },
 	{ .wc = 0x0270D, .width = 2 },
-	{ .wc = 0x1F1E6, .width = 2 },
-	{ .wc = 0x1F1E7, .width = 2 },
-	{ .wc = 0x1F1E8, .width = 2 },
-	{ .wc = 0x1F1E9, .width = 2 },
-	{ .wc = 0x1F1EA, .width = 2 },
-	{ .wc = 0x1F1EB, .width = 2 },
-	{ .wc = 0x1F1EC, .width = 2 },
-	{ .wc = 0x1F1ED, .width = 2 },
-	{ .wc = 0x1F1EE, .width = 2 },
-	{ .wc = 0x1F1EF, .width = 2 },
-	{ .wc = 0x1F1F0, .width = 2 },
-	{ .wc = 0x1F1F1, .width = 2 },
-	{ .wc = 0x1F1F2, .width = 2 },
-	{ .wc = 0x1F1F3, .width = 2 },
-	{ .wc = 0x1F1F4, .width = 2 },
-	{ .wc = 0x1F1F5, .width = 2 },
-	{ .wc = 0x1F1F6, .width = 2 },
-	{ .wc = 0x1F1F7, .width = 2 },
-	{ .wc = 0x1F1F8, .width = 2 },
-	{ .wc = 0x1F1F9, .width = 2 },
-	{ .wc = 0x1F1FA, .width = 2 },
-	{ .wc = 0x1F1FB, .width = 2 },
-	{ .wc = 0x1F1FC, .width = 2 },
-	{ .wc = 0x1F1FD, .width = 2 },
-	{ .wc = 0x1F1FE, .width = 2 },
-	{ .wc = 0x1F1FF, .width = 2 },
+	{ .wc = 0x1F1E6, .width = 1 },
+	{ .wc = 0x1F1E7, .width = 1 },
+	{ .wc = 0x1F1E8, .width = 1 },
+	{ .wc = 0x1F1E9, .width = 1 },
+	{ .wc = 0x1F1EA, .width = 1 },
+	{ .wc = 0x1F1EB, .width = 1 },
+	{ .wc = 0x1F1EC, .width = 1 },
+	{ .wc = 0x1F1ED, .width = 1 },
+	{ .wc = 0x1F1EE, .width = 1 },
+	{ .wc = 0x1F1EF, .width = 1 },
+	{ .wc = 0x1F1F0, .width = 1 },
+	{ .wc = 0x1F1F1, .width = 1 },
+	{ .wc = 0x1F1F2, .width = 1 },
+	{ .wc = 0x1F1F3, .width = 1 },
+	{ .wc = 0x1F1F4, .width = 1 },
+	{ .wc = 0x1F1F5, .width = 1 },
+	{ .wc = 0x1F1F6, .width = 1 },
+	{ .wc = 0x1F1F7, .width = 1 },
+	{ .wc = 0x1F1F8, .width = 1 },
+	{ .wc = 0x1F1F9, .width = 1 },
+	{ .wc = 0x1F1FA, .width = 1 },
+	{ .wc = 0x1F1FB, .width = 1 },
+	{ .wc = 0x1F1FC, .width = 1 },
+	{ .wc = 0x1F1FD, .width = 1 },
+	{ .wc = 0x1F1FE, .width = 1 },
+	{ .wc = 0x1F1FF, .width = 1 },
 	{ .wc = 0x1F385, .width = 2 },
 	{ .wc = 0x1F3C2, .width = 2 },
 	{ .wc = 0x1F3C3, .width = 2 },