Instead of storing all UTF-8 characters in the extended cell, which means
that 14 bytes are wasted for each character in the BMP, only store
characters of three bytes or less in the cell itself and store others
(outside the BMP or with combining characters) in a separate global
tree. This can reduce grid memory use for heavy Unicode users by around
30%.
This commit is contained in:
nicm 2020-05-25 09:32:10 +00:00
parent 14a9fd58d5
commit 3a5219c6d0
4 changed files with 250 additions and 34 deletions

51
grid.c
View File

@ -100,11 +100,11 @@ grid_get_extended_cell(struct grid_line *gl, struct grid_cell_entry *gce,
}
/* Set cell as extended. */
static struct grid_cell *
static struct grid_extd_entry *
grid_extended_cell(struct grid_line *gl, struct grid_cell_entry *gce,
const struct grid_cell *gc)
{
struct grid_cell *gcp;
struct grid_extd_entry *gee;
int flags = (gc->flags & ~GRID_FLAG_CLEARED);
if (~gce->flags & GRID_FLAG_EXTENDED)
@ -113,10 +113,14 @@ grid_extended_cell(struct grid_line *gl, struct grid_cell_entry *gce,
fatalx("offset too big");
gl->flags |= GRID_LINE_EXTENDED;
gcp = &gl->extddata[gce->offset];
memcpy(gcp, gc, sizeof *gcp);
gcp->flags = flags;
return (gcp);
gee = &gl->extddata[gce->offset];
gee->data = utf8_map_big(&gc->data);
gee->attr = gc->attr;
gee->flags = flags;
gee->fg = gc->fg;
gee->bg = gc->bg;
gee->us = gc->us;
return (gee);
}
/* Free up unused extended cells. */
@ -124,9 +128,9 @@ static void
grid_compact_line(struct grid_line *gl)
{
int new_extdsize = 0;
struct grid_cell *new_extddata;
struct grid_extd_entry *new_extddata;
struct grid_cell_entry *gce;
struct grid_cell *gc;
struct grid_extd_entry *gee;
u_int px, idx;
if (gl->extdsize == 0)
@ -150,8 +154,8 @@ grid_compact_line(struct grid_line *gl)
for (px = 0; px < gl->cellsize; px++) {
gce = &gl->celldata[px];
if (gce->flags & GRID_FLAG_EXTENDED) {
gc = &gl->extddata[gce->offset];
memcpy(&new_extddata[idx], gc, sizeof *gc);
gee = &gl->extddata[gce->offset];
memcpy(&new_extddata[idx], gee, sizeof *gee);
gce->offset = idx++;
}
}
@ -181,17 +185,14 @@ grid_clear_cell(struct grid *gd, u_int px, u_int py, u_int bg)
{
struct grid_line *gl = &gd->linedata[py];
struct grid_cell_entry *gce = &gl->celldata[px];
struct grid_cell *gc;
struct grid_extd_entry *gee;
memcpy(gce, &grid_cleared_entry, sizeof *gce);
if (bg != 8) {
if (bg & COLOUR_FLAG_RGB) {
grid_get_extended_cell(gl, gce, gce->flags);
gl->flags |= GRID_LINE_EXTENDED;
gc = &gl->extddata[gce->offset];
memcpy(gc, &grid_cleared_cell, sizeof *gc);
gc->bg = bg;
gee = grid_extended_cell(gl, gce, &grid_cleared_cell);
gee->bg = bg;
} else {
if (bg & COLOUR_FLAG_256)
gce->flags |= GRID_FLAG_BG256;
@ -483,12 +484,20 @@ static void
grid_get_cell1(struct grid_line *gl, u_int px, struct grid_cell *gc)
{
struct grid_cell_entry *gce = &gl->celldata[px];
struct grid_extd_entry *gee;
if (gce->flags & GRID_FLAG_EXTENDED) {
if (gce->offset >= gl->extdsize)
memcpy(gc, &grid_default_cell, sizeof *gc);
else
memcpy(gc, &gl->extddata[gce->offset], sizeof *gc);
else {
gee = &gl->extddata[gce->offset];
gc->flags = gee->flags;
gc->attr = gee->attr;
gc->fg = gee->fg;
gc->bg = gee->bg;
gc->us = gee->us;
utf8_get_big(gee->data, &gc->data);
}
return;
}
@ -545,7 +554,7 @@ grid_set_cells(struct grid *gd, u_int px, u_int py, const struct grid_cell *gc,
{
struct grid_line *gl;
struct grid_cell_entry *gce;
struct grid_cell *gcp;
struct grid_extd_entry *gee;
u_int i;
if (grid_check_y(gd, __func__, py) != 0)
@ -560,8 +569,8 @@ grid_set_cells(struct grid *gd, u_int px, u_int py, const struct grid_cell *gc,
for (i = 0; i < slen; i++) {
gce = &gl->celldata[px + i];
if (grid_need_extended_cell(gce, gc)) {
gcp = grid_extended_cell(gl, gce, gc);
utf8_set(&gcp->data, s[i]);
gee = grid_extended_cell(gl, gce, gc);
gee->data = utf8_set_big(s[i], 1);
} else
grid_store_cell(gce, gc, s[i]);
}

31
tmux.h
View File

@ -597,11 +597,11 @@ struct msg_write_close {
#define MOTION_MOUSE_MODES (MODE_MOUSE_BUTTON|MODE_MOUSE_ALL)
/*
* A single UTF-8 character. UTF8_SIZE must be big enough to hold
* combining characters as well, currently at most five (of three
* bytes) are supported.
*/
#define UTF8_SIZE 18
* A single UTF-8 character. UTF8_SIZE must be big enough to hold combining
* characters as well. It can't be more than 32 bytes without changes to how
* big characters are stored.
*/
#define UTF8_SIZE 21
struct utf8_data {
u_char data[UTF8_SIZE];
@ -609,7 +609,7 @@ struct utf8_data {
u_char size;
u_char width; /* 0xff if invalid */
} __packed;
};
enum utf8_state {
UTF8_MORE,
UTF8_DONE,
@ -663,13 +663,25 @@ enum utf8_state {
/* Grid cell data. */
struct grid_cell {
struct utf8_data data; /* 21 bytes */
struct utf8_data data;
u_short attr;
u_char flags;
int fg;
int bg;
int us;
};
/*
 * Grid extended cell entry. This carries the same attribute, flag and colour
 * fields as struct grid_cell, but the character is held as a packed 32-bit
 * value (see utf8_map_big and utf8_get_big) rather than a full struct
 * utf8_data.
 */
struct grid_extd_entry {
/* Packed character, as returned by utf8_map_big or utf8_set_big. */
uint32_t data;
u_short attr;
u_char flags;
int fg;
int bg;
int us;
} __packed;
/* Grid cell entry. */
struct grid_cell_entry {
u_char flags;
union {
@ -690,7 +702,7 @@ struct grid_line {
struct grid_cell_entry *celldata;
u_int extdsize;
struct grid_cell *extddata;
struct grid_extd_entry *extddata;
int flags;
} __packed;
@ -2877,6 +2889,9 @@ u_int session_group_attached_count(struct session_group *);
void session_renumber_windows(struct session *);
/* utf8.c */
uint32_t utf8_set_big(char, u_int);
uint32_t utf8_map_big(const struct utf8_data *);
void utf8_get_big(uint32_t, struct utf8_data *);
void utf8_set(struct utf8_data *, u_char);
void utf8_copy(struct utf8_data *, const struct utf8_data *);
enum utf8_state utf8_open(struct utf8_data *, u_char);

174
utf8.c
View File

@ -29,6 +29,180 @@
static int utf8_width(wchar_t);
/*
 * One stored "big" UTF-8 character: the raw bytes, how many of them are in
 * use, and the index assigned to the item when it was added.
 */
struct utf8_big_item {
/* Position of this item in the list of big items. */
u_int index;
/* Linkage for the tree of big items. */
RB_ENTRY(utf8_big_item) entry;
/* Character bytes; only the first size bytes are meaningful. */
char data[UTF8_SIZE];
/* Number of bytes used in data. */
u_char size;
};
/* Head type for the tree of big items. */
RB_HEAD(utf8_big_tree, utf8_big_item);
/*
 * Tree comparison for big items: shorter items sort before longer ones, and
 * items of equal size are ordered by comparing their bytes.
 */
static int
utf8_big_cmp(struct utf8_big_item *bi1, struct utf8_big_item *bi2)
{
	if (bi1->size != bi2->size)
		return (bi1->size < bi2->size ? -1 : 1);
	return (memcmp(bi1->data, bi2->data, bi1->size));
}
/* Generate the static tree functions, with items ordered by utf8_big_cmp. */
RB_GENERATE_STATIC(utf8_big_tree, utf8_big_item, entry, utf8_big_cmp);
/* Tree of big items, used to find an existing item from its bytes. */
static struct utf8_big_tree utf8_big_tree = RB_INITIALIZER(utf8_big_tree);
/* Array of big items; an item's position in the array is its index. */
static struct utf8_big_item *utf8_big_list;
/* Number of items allocated in utf8_big_list. */
static u_int utf8_big_list_size;
/* Number of items in utf8_big_list currently in use. */
static u_int utf8_big_list_used;
/*
 * Packed 32-bit form of a character: one flags byte followed by three data
 * bytes. The same storage can be read as a single uint32_t through value.
 */
union utf8_big_map {
uint32_t value;
struct {
/* Low five bits are the size in bytes; UTF8_BIG_WIDTH2 marks width 2. */
u_char flags;
#define UTF8_BIG_SIZE 0x1f
#define UTF8_BIG_WIDTH2 0x20
/*
 * Either the character bytes themselves (size of three or less) or a
 * 24-bit index into the list of big items, least significant byte first.
 */
u_char data[3];
};
} __packed;
/* Packed form of a single space: size 1, width 1. */
static const union utf8_big_map utf8_big_space1 = {
.flags = 1,
.data = " "
};
/*
 * Packed form of a width 2 replacement: two spaces. The flags declare a size
 * of two bytes, so the data must actually contain two spaces; with only one,
 * the second byte would be a NUL.
 */
static const union utf8_big_map utf8_big_space2 = {
	.flags = UTF8_BIG_WIDTH2|2,
	.data = "  "
};
/* Get a big item by index. */
static struct utf8_big_item *
utf8_get_big_item(const char *data, size_t size)
{
struct utf8_big_item bi;
memcpy(bi.data, data, size);
bi.size = size;
return (RB_FIND(utf8_big_tree, &utf8_big_tree, &bi));
}
/*
 * Add a big item, or find the existing one with the same bytes. On success
 * the item's index is stored in *index and 0 is returned. Returns -1 if the
 * data is too large for an item or the list has reached its maximum size
 * (indexes must fit in 24 bits).
 */
static int
utf8_put_big_item(const char *data, size_t size, u_int *index)
{
	struct utf8_big_item	*bi;
	u_int			 i;

	/* An item stores at most UTF8_SIZE bytes. */
	if (size > UTF8_SIZE)
		return (-1);

	bi = utf8_get_big_item(data, size);
	if (bi != NULL) {
		*index = bi->index;
		log_debug("%s: have %.*s at %u", __func__, (int)size, data,
		    *index);
		return (0);
	}

	if (utf8_big_list_used == utf8_big_list_size) {
		if (utf8_big_list_size == 0xffffff)
			return (-1);
		if (utf8_big_list_size == 0)
			utf8_big_list_size = 256;
		else if (utf8_big_list_size > 0x7fffff)
			utf8_big_list_size = 0xffffff;
		else
			utf8_big_list_size *= 2;
		utf8_big_list = xreallocarray(utf8_big_list, utf8_big_list_size,
		    sizeof *utf8_big_list);

		/*
		 * The tree nodes are embedded in the array, so growing the
		 * array may have moved them and left every parent and child
		 * pointer stale. Rebuild the tree from the items at their new
		 * addresses; RB_INSERT resets each node's links.
		 */
		RB_INIT(&utf8_big_tree);
		for (i = 0; i < utf8_big_list_used; i++) {
			RB_INSERT(utf8_big_tree, &utf8_big_tree,
			    &utf8_big_list[i]);
		}
	}
	*index = utf8_big_list_used++;

	bi = &utf8_big_list[*index];
	bi->index = *index;
	memcpy(bi->data, data, size);
	bi->size = size;
	RB_INSERT(utf8_big_tree, &utf8_big_tree, bi);

	log_debug("%s: added %.*s at %u", __func__, (int)size, data, *index);
	return (0);
}
/* Get UTF-8 as index into buffer. */
uint32_t
utf8_map_big(const struct utf8_data *ud)
{
union utf8_big_map m = { .value = 0 };
u_int o;
const char *data = ud->data;
size_t size = ud->size;
if (ud->width != 1 && ud->width != 2)
return (utf8_big_space1.value);
if (size > UTF8_BIG_SIZE)
goto fail;
if (size == 1)
return (utf8_set_big(data[0], 1));
m.flags = size;
if (ud->width == 2)
m.flags |= UTF8_BIG_WIDTH2;
if (size <= 3) {
memcpy(&m.data, data, size);
return (m.value);
}
if (utf8_put_big_item(data, size, &o) != 0)
goto fail;
m.data[0] = (o & 0xff);
m.data[1] = (o >> 8) & 0xff;
m.data[2] = (o >> 16);
return (m.value);
fail:
if (ud->width == 1)
return (utf8_big_space1.value);
return (utf8_big_space2.value);
}
/* Get UTF-8 from index into buffer. */
void
utf8_get_big(uint32_t v, struct utf8_data *ud)
{
union utf8_big_map m = { .value = v };
struct utf8_big_item *bi;
u_int o;
memset(ud, 0, sizeof *ud);
ud->size = ud->have = (m.flags & UTF8_BIG_SIZE);
if (m.flags & UTF8_BIG_WIDTH2)
ud->width = 2;
else
ud->width = 1;
if (ud->size <= 3) {
memcpy(ud->data, m.data, ud->size);
return;
}
o = ((uint32_t)m.data[2] << 16)|((uint32_t)m.data[1] << 8)|m.data[0];
if (o >= utf8_big_list_used)
memset(ud->data, ' ', ud->size);
else {
bi = &utf8_big_list[o];
memcpy(ud->data, bi->data, ud->size);
}
}
/*
 * Build the packed value for a one byte character. The size is always 1; the
 * width 2 flag is set only when width is 2.
 */
uint32_t
utf8_set_big(char c, u_int width)
{
	union utf8_big_map	packed = { .value = 0 };

	packed.flags = (width == 2) ? (UTF8_BIG_WIDTH2|1) : 1;
	packed.data[0] = c;
	return (packed.value);
}
/* Set a single character. */
void
utf8_set(struct utf8_data *ud, u_char ch)

View File

@ -2551,23 +2551,33 @@ window_copy_search_rl_regex(struct grid *gd, u_int *ppx, u_int *psx, u_int py,
}
static const char *
window_copy_cellstring(const struct grid_line *gl, u_int px, size_t *size)
window_copy_cellstring(const struct grid_line *gl, u_int px, size_t *size,
int *allocated)
{
static struct utf8_data ud;
struct grid_cell_entry *gce;
char *copy;
if (px >= gl->cellsize) {
*size = 1;
*allocated = 0;
return (" ");
}
gce = &gl->celldata[px];
if (~gce->flags & GRID_FLAG_EXTENDED) {
*size = 1;
*allocated = 0;
return (&gce->data.data);
}
*size = gl->extddata[gce->offset].data.size;
return (gl->extddata[gce->offset].data.data);
utf8_get_big(gl->extddata[gce->offset].data, &ud);
*size = ud.size;
*allocated = 1;
copy = xmalloc(ud.size);
memcpy(copy, ud.data, ud.size);
return (copy);
}
/* Find last match in given range. */
@ -2630,6 +2640,7 @@ window_copy_stringify(struct grid *gd, u_int py, u_int first, u_int last,
const struct grid_line *gl;
const char *d;
size_t bufsize = 1024, dlen;
int allocated;
while (bufsize < newsize)
bufsize *= 2;
@ -2638,7 +2649,7 @@ window_copy_stringify(struct grid *gd, u_int py, u_int first, u_int last,
gl = grid_peek_line(gd, py);
bx = *size - 1;
for (ax = first; ax < last; ax++) {
d = window_copy_cellstring(gl, ax, &dlen);
d = window_copy_cellstring(gl, ax, &dlen, &allocated);
newsize += dlen;
while (bufsize < newsize) {
bufsize *= 2;
@ -2650,6 +2661,8 @@ window_copy_stringify(struct grid *gd, u_int py, u_int first, u_int last,
memcpy(buf + bx, d, dlen);
bx += dlen;
}
if (allocated)
free((void *)d);
}
buf[newsize - 1] = '\0';
@ -2670,6 +2683,7 @@ window_copy_cstrtocellpos(struct grid *gd, u_int ncells, u_int *ppx, u_int *ppy,
struct {
const char *d;
size_t dlen;
int allocated;
} *cells;
/* Populate the array of cell data. */
@ -2680,7 +2694,7 @@ window_copy_cstrtocellpos(struct grid *gd, u_int ncells, u_int *ppx, u_int *ppy,
gl = grid_peek_line(gd, pywrap);
while (cell < ncells) {
cells[cell].d = window_copy_cellstring(gl, px,
&cells[cell].dlen);
&cells[cell].dlen, &cells[cell].allocated);
cell++;
px++;
if (px == gd->sx) {
@ -2738,6 +2752,10 @@ window_copy_cstrtocellpos(struct grid *gd, u_int ncells, u_int *ppx, u_int *ppy,
*ppy = pywrap;
/* Free cell data. */
for (cell = 0; cell < ncells; cell++) {
if (cells[cell].allocated)
free((void *)cells[cell].d);
}
free(cells);
}