Tidy utf8.c a little: build table on first use, and make utf8_width take

a u_int rather than splitting and then combining again in utf8_split.
2026-05-31 22:56:16 +00:00 · 2015-11-12 12:19:57 +00:00
parent 0cc812ae34
commit d6daf37df4
3 changed files with 30 additions and 54 deletions
--- a/server.c
+++ b/server.c
@@ -186,7 +186,6 @@ server_start(struct event_base *base, int lockfd, char *lockfile)
 	TAILQ_INIT(&session_groups);
 	mode_key_init_trees();
 	key_bindings_init();
 	utf8_build();
 	start_time = time(NULL);
--- a/tmux.h
+++ b/tmux.h
@@ -2181,7 +2181,7 @@ void		 session_group_synchronize1(struct session *, struct session *);
 void		 session_renumber_windows(struct session *);
 /* utf8.c */
-void		 utf8_build(void);
+u_int		 utf8_width(u_int);
 void		 utf8_set(struct utf8_data *, u_char);
 int		 utf8_open(struct utf8_data *, u_char);
 int		 utf8_append(struct utf8_data *, u_char);
--- a/utf8.c
+++ b/utf8.c
@@ -35,7 +35,7 @@ struct utf8_width_entry {
 };
 /* Sorted, then repeatedly split in the middle to balance the tree. */
-struct utf8_width_entry utf8_width_table[] = {
+static struct utf8_width_entry utf8_width_table[] = {
 	{ 0x00b41, 0x00b44, 0, NULL, NULL },
 	{ 0x008e4, 0x00902, 0, NULL, NULL },
 	{ 0x006d6, 0x006dd, 0, NULL, NULL },
@@ -344,12 +344,9 @@ struct utf8_width_entry utf8_width_table[] = {
 	{ 0xe0100, 0xe01ef, 0, NULL, NULL },
 	{ 0x100000, 0x10fffd, 0, NULL, NULL },
 };
 static struct utf8_width_entry	*utf8_width_root = NULL;
-struct utf8_width_entry	*utf8_width_root = NULL;
+static void	utf8_build(void);
 int	utf8_overlap(struct utf8_width_entry *, struct utf8_width_entry *);
 u_int	utf8_combine(const struct utf8_data *);
 u_int	utf8_width(const struct utf8_data *);
 /* Set a single character. */
 void
@@ -405,40 +402,20 @@ utf8_append(struct utf8_data *utf8data, u_char ch)
 	if (utf8data->have != utf8data->size)
 		return (1);
-	utf8data->width = utf8_width(utf8data);
+	utf8data->width = utf8_width(utf8_combine(utf8data));
 	return (0);
 }
 /* Check if two width tree entries overlap. */
 int
 utf8_overlap(struct utf8_width_entry *item1, struct utf8_width_entry *item2)
 {
 	if (item1->first >= item2->first && item1->first <= item2->last)
 		return (1);
 	if (item1->last >= item2->first && item1->last <= item2->last)
 		return (1);
 	if (item2->first >= item1->first && item2->first <= item1->last)
 		return (1);
 	if (item2->last >= item1->first && item2->last <= item1->last)
 		return (1);
 	return (0);
 }
 /* Build UTF-8 width tree. */
-void
+static void
 utf8_build(void)
 {
 	struct utf8_width_entry	**ptr, *item, *node;
-	u_int			  i, j;
+	u_int			  i;
 	for (i = 0; i < nitems(utf8_width_table); i++) {
 		item = &utf8_width_table[i];
 		for (j = 0; j < nitems(utf8_width_table); j++) {
 			if (i != j && utf8_overlap(item, &utf8_width_table[j]))
 				log_fatalx("utf8 overlap: %u %u", i, j);
 		}
 		ptr = &utf8_width_root;
 		while (*ptr != NULL) {
 			node = *ptr;
@@ -451,6 +428,27 @@ utf8_build(void)
 	}
 }
 /* Lookup width of UTF-8 data in tree. */
 u_int
 utf8_width(u_int uc)
 {
 	struct utf8_width_entry	*item;
 	if (utf8_width_root == NULL)
 		utf8_build();
 	item = utf8_width_root;
 	while (item != NULL) {
 		if (uc < item->first)
 			item = item->left;
 		else if (uc > item->last)
 			item = item->right;
 		else
 			return (item->width);
 	}
 	return (1);
 }
 /* Combine UTF-8 into 32-bit Unicode. */
 u_int
 utf8_combine(const struct utf8_data *utf8data)
@@ -481,7 +479,7 @@ utf8_combine(const struct utf8_data *utf8data)
 	return (value);
 }
-/* Split a UTF-8 character. */
+/* Split 32-bit Unicode into UTF-8. */
 int
 utf8_split(u_int uc, struct utf8_data *utf8data)
 {
@@ -505,7 +503,7 @@ utf8_split(u_int uc, struct utf8_data *utf8data)
 		utf8data->data[3] = 0x80 | (uc & 0x3f);
 	} else
 		return (-1);
-	utf8data->width = utf8_width(utf8data);
+	utf8data->width = utf8_width(uc);
 	return (0);
 }
@@ -522,27 +520,6 @@ utf8_split2(u_int uc, u_char *ptr)
 	return (1);
 }
 /* Lookup width of UTF-8 data in tree. */
 u_int
 utf8_width(const struct utf8_data *utf8data)
 {
 	struct utf8_width_entry	*item;
 	u_int			 value;
 	value = utf8_combine(utf8data);
 	item = utf8_width_root;
 	while (item != NULL) {
 		if (value < item->first)
 			item = item->left;
 		else if (value > item->last)
 			item = item->right;
 		else
 			return (item->width);
 	}
 	return (1);
 }
 /*
 * Encode len characters from src into dst, which is guaranteed to have four
 * bytes available for each character from src (for \abc or UTF-8) plus space