From 86153fbba9cebaf4ecc95fa43a0853ef65c737e6 Mon Sep 17 00:00:00 2001 From: Nicholas Marriott Date: Fri, 1 Aug 2025 09:45:18 +0100 Subject: [PATCH] Add UTF-8 regress test. --- regress/UTF-8-test.txt | Bin 0 -> 22781 bytes regress/utf8-test.result | 301 +++++++++++++++++++++++++++++++++++++++ regress/utf8-test.sh | 21 +++ 3 files changed, 322 insertions(+) create mode 100644 regress/UTF-8-test.txt create mode 100644 regress/utf8-test.result create mode 100644 regress/utf8-test.sh diff --git a/regress/UTF-8-test.txt b/regress/UTF-8-test.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5b5d50e6b61eb9a3b751b3954f83e61bb59db9b GIT binary patch literal 22781 zcmdU1X_Fh*b>CYjiIrHEbIyIeVd1bb#8j*y4MbNJE8ngr{T%> zH=?M0*NGF|Zns=*maBHFY*)*j-4j3B+Sy$_dEy6TNmiFvPA)BPEUj+fmviUj>}zDb zylhzyHeB;;sk==Fw0Y8Snr+$lJK|ijTdwCUO2hB+4}n}98At=QQVnO2tbk<@(9!SA*KTMb-()7zJ}9Yk)m)3ovMPE_?< z4&m`=AYfdi2gL9x(zW;T2&>e#!>~kZOLg0AmhhcCgBHcvb3FXf@9Z>qKS;P_9xq{M zX9r^v+X4}FC$KAEXd?8A)3Eozr9kXRLJ%VDrme$0ABAGOEs(JYzJ!rugIk~^6$2>n zcEzq>1dFOMR23y23`tPI^(hkaJx~+w1GHs>5#U|33BkDdJ8tNBov>p!@zi!F0^_$X zwVlAF6hNWW!L)ctp%pbicFV`;>TVE5OOcBa*d6d8P>GKyMu;y#v2jP!v2ly zk#^vNEf_7C=(s`3piW3-P;OK9WvZqTur#5BA;#K-8dm*1m=Jj3)$CxO%20DT9xyLe zwsDmL*Fxz!%_e`wy4l5ZQiI8()+YdS1n&OdNLR+dk#pITbEYjt^fc~L?miBLAnr&X@2<+Unk z1xxnas$;=~9Ct4@5j@+*5Y%*_V9+xuWkXKLj~OeW4FN%?jpW=YyARqX)rK9|RoI{z z6*L^m6}lKPRobrk?FftsJC)FhN~FyyrMJYKuwgkgxUNh6oN`}GFDwzt;8txQTCib0 zpaX3vNd^8-vO$_rm*s*-zPDPw2N~g;P^31r#;!I1WQ{2rZPN#c8k3rQ#uD?@lrel9 zc&guQIdCZHh-Bg7@$K#H&0~wwZMb+&)Ax7DrV9Ahba&}X@N9^jV}PH6M+Px?o*Hkbs)2c49sb4l;8pIYPlARu zoGPT(Z8+q>K>~n^Evy`)jv=&wc~r>Af(RF7VRiKHP%*^JY6M2Y=RqsH9fW|J=p5L9 znCI@cwPR{0b+~K4%VEB*<45q|vDS%Qr|sIF1&&Z+V?K-_EmasQ z8kByYLD(%L0K%By=V1!rzE^DozUTgkfsVl1(8MKiTZq~&WQ?&SPutJLna1mUxDM_8 zP)ZNpy(L@sZI)m$I*2Ai09`mdo-eM(DM*@X6gpHP8)DPgmBh+wV22I*AB0J!9+fy; z8{>>!u-$}rV^pxJHiRsQ=#+ES2`J?=&=eCklnU@e6AC)5L_(+U%sfB{lLwa<+D)c0 z@KC(hsyj~Ym{ISVi82sR3hY2obbeCmm0&TKs!kix5mfO0s0&XAKVcanR?92p6&3Aj z8SfK619~GH1bV)=)P}u8YO7}+ICuU$#}Wm8Gqf%qN3>YhoB8(iHYkdjXatJ~CgW3j zY>f4iQ(Nk~HBd)WUqlpu?vm2{m{U#~UP-4-nIT8$v^)WO76SEQ@N+P=V`Y;of&qRv z)cla%oCy!KxVqci&#Y-mCc=zBS!*Ss((AM;P-3Qwi3bb$e1=vyB2^fTxfonxrVgS2 zoKKuoB7(+ABNaY2vzm!32iYF<<33-#7gY~A0sMvCghhF zq-e|{8C-$Ugm&8k%c>);(`u^J2$kppjT%GKyf1X4j@nzOxnwR`JcchcJNi(NkUAXU zsBwD@$v}mutT zh$~}5e&&(OGeVveIhlv#Txx{K7vFe!R?Jf(r*fF<3Yq8RORop!41|tK>vn&mbRg_| zm`Ke*yX7d<8wxo^y-#oF)cePZYNK$x)71O)b{_BPK~X?uV0lu#G3m<#4hJ(Lj;r?y z0rQC89Lx$irrv9cn1}opAZH|VRK3AOy>gQIHZf-ayJ7t0~3rOy}M?jxyqUWGiivnIv=ZUUoW)yJ@%RT>A zZ|6_n7AGs61L#Y#T@(gG2L*alV=(oPCW|Yqyd_s}%50DMoQs02-}TDL_lo|U|9IgU zC=ru;a?(B`3|YRkv3!@5islZ`Ka<&Mu%8qz@G5oS74LhKL_mwUeB`g-0 z885ud+RgBaa?Su6Jh+(ko8rwxp}|7{J$ZAyStvAkr{x|eJ@}@Vx<*jdf`#awGu6MuZz3+SfqlZB<_ZYm#KJdX0efT3E z{n*Dp@ySnp`tc{O+%Dc{KKqNm^vj?7mHw}O{tLhM#V4=MWD%p!p1(ZSbgF~$IZ9|@ zg!S8ed2saM1%5jc`snXjU|w4_tD$k&MOa#0(%M{7dBDVc`BIg7T&ttBKjlj4(;A;C z%OM8p(pcEW0&zC_v`#PuCjC>2m;UAU-Y8z_ntVyVEMJkY%CF0B$ZyJT$#2W=$nVPU z$?wZk^0gTrVd(n<`9t|5`D6JL`BV8b`E&UT`AhjL`D^(b`CIurd0L7ms`DQ&^p6@y z`L3KAm+#sX`8t%Z^NN=5b@{sdz5Ijxqx_Tnv;2$vtNfe%yZndzr`(Wl$ngJN9>_N+ zkBxD0tWOa~o#LpUo^Ebg{P?c6w`b&8c}|{}7v#U>zvY`V@PFJCkH$WbuTdWB&64GgT zysDZJJZ*k%+6;6}KSAQkmiF_oRc4|oB7=iG(#2q13uZ9^c@>u(QBsJ|L8XgMXgXOq zm#rG3WWd$cxU&sS8R<2D{;i>eh5TVo7);F63rof4=zI7naP@a<-sQ;!47FNLX26 z$t>m86H2;}Fo~I(<6PcQ(on~9X!b$vkUblwOT$G!=a?r8i#%}tK7u}$E(?52e5$%z z*gDHJG1+X$Lc>MgN8*O8*Ce66mo|5>-H8DsEtVFV6DByx&-srR$fax7FT;7z+SNRt z9uA?!e{cTuJqZ2D`v2HZW++5+qqcUo+YTB| z5%$vF(4+}4($33K3g=HB_{TU{ceuFp^0>{9KK<6e(WXyw-d)#qrSHrK4`C4W@n1a# ze);9^9145>{DTL(fKE5hGUG_Lmbq~a<|`Q&!6_2p=}QBWMm2ea0_}P(FGrd?Dc08U2^rTakV#X z(Xa@!XT)ZkA}VQUq(i@Jt;5YC4X0I>Gg1m|sD$YQ%kbjn(^BG&1Lw8KC-o-N)2xl) z*{Cy)XwN6?Gk^8aUTcTgmT;Fg4`YW4w{vK-y7n%!r@hIhc*HIwuXW+JhA~|rCz7kD zn_I9`2oX=0$j~ z30Q8Z3fn^)s82;V(zUh!)zHLRu!j}ZoYp3d(oJZC#sC`E$OSyN?QmdVhS@=Y=}{-( zrZe5oLhIafo$g}7J)};MR=h6_ACB8crtjR2W%^xdvJl5O|J8_AObq4NWVKM1)X&S53})42gHvR~Io zCU5;QAF6wAcnkNvJ|GL31u)a|b4=={0}XCp4!onNK0W z_(((4S{kxYI#EJ5wb`r%y)(juZ6z1&m-~8h>Gr6A7gfNC(|{@KZm0;o+q!9y_RIk$ z7vs^vE!Ql8fE`MnZe=scf6dnoFWAP%kJAe|{{-W{H~n<>H}jV-^bZ=y^2RC~gR0n} zkc;=A5NYAHD~0p8o<+db4A9Y@pgbl3uV0=OaVhN4#N5@hu%i?v24T zJ(F?mj%30|pjAJ?#2fmEhk#j-o4F^<;Vp>-Cod#(nAA=*Rwo7g`!3#-%}x${>ipI5 z!O2`BF>Lq(lPBm7t^@2UBe@rEL(U*LH;rLfodHTRS5xE6G8Hmm;>xXhvJ0b73K;#4*}HwR>dIZnepaCUAE6hTQO%}vGzUetHYUmh$4KVTMi zNfR2UFij8q9n>fC1Idl)ptnatpK)KC*w6+8W0gf`1F*Flj}by43 zC48);eEiH|X~d!}__&>n8g^^E;Ek(?MZ~f#5tC7yM2Vv=mLH8tX5`n~{)LiWmHTZs zfIPlSt;OW-$JJWE$W++IXh3%+^~WyXxV7MQ%qHF^n5l3w%v3jld22x#%-CXJgClgR zDUQ(j_@FSIvCSCFjLsA@r4yJ%=`>bD%O0UqReglcCyMja*iDR?(V1eVbON&|oyKzL zFGlE8s1Z7^6z3;;9sm|&W^|^QDV@M9Lg#Ip;0OTqXie_JvV5|DfU-W8I1Zch%s+aK zW;-RE7Lt;R(lwWfN<5k-*n@yrRL~=$aGDH@-g?Z^*m(16PpHNsaT!^{^P4=S4s$u4 zn4_g-PG6ET$^-GbU5eSWO3BPePsM`!RsujH>i4x$Ilc89&MIe9-DdQnA3T!WrWLlMIw zp{lN#KWL=_YYXWDsV)UqntpW`df>;3Z59c&tui(!b9f~fie^igAJ#+iN8yc(*PM%o zgQ+lQ#JPjq$ur$Tjq@=p!HGQLh0Y9+wa(;}L%?JP$Ax_^nZjg87@pla_1*mCaesQs zBt{CNc)%f4raCGP@*pJw5#o&C&kib4X2*-!d*E^m64IZa^19wsZbc|NX=Mqr?7V z!~O?`{SOZN9~$;QJnVmD*#GFT|FL2J|ZGz&1f`h z_Eg0#9_P<(SztEeVN5IK^2&0qE&E{3&!4{dd8KC_9bCn0@XQtbyn>$t{6x|G*@p*L z@ftjf&+!^Oi_h^IJd4lq8a#*3@fsXVxA`=q{yCuGHFyqacnzKd8eW6vfri)Md3=u7 p;CXzG*Wh`4j@JOEDqJmkvO@N>BDCH5J3T8tzwg#xE%c3^^?x0SNE-kE literal 0 HcmV?d00001 diff --git a/regress/utf8-test.result b/regress/utf8-test.result new file mode 100644 index 00000000..e700cb17 --- /dev/null +++ b/regress/utf8-test.result @@ -0,0 +1,301 @@ +UTF-8 decoder capability and stress test +---------------------------------------- + +Markus Kuhn - 2015-08-28 - CC BY 4.0 + +This test file can help you examine, how your UTF-8 decoder handles +various types of correct, malformed, or otherwise interesting UTF-8 +sequences. This file is not meant to be a conformance test. It does +not prescribe any particular outcome. Therefore, there is no way to +"pass" or "fail" this test file, even though the text does suggest a +preferable decoder behaviour at some places. Its aim is, instead, to +help you think about, and test, the behaviour of your UTF-8 decoder on a +systematic collection of unusual inputs. Experience so far suggests +that most first-time authors of UTF-8 decoders find at least one +serious problem in their decoder using this file. + +The test lines below cover boundary conditions, malformed UTF-8 +sequences, as well as correctly encoded UTF-8 sequences of Unicode code +points that should never occur in a correct UTF-8 file. + +According to ISO 10646-1:2000, sections D.7 and 2.3c, a device +receiving UTF-8 shall interpret a "malformed sequence in the same way +that it interprets a character that is outside the adopted subset" and +"characters that are not within the adopted subset shall be indicated +to the user" by a receiving device. One commonly used approach in +UTF-8 decoders is to replace any malformed UTF-8 sequence by a +replacement character (U+FFFD), which looks a bit like an inverted +question mark, or a similar symbol. It might be a good idea to +visually distinguish a malformed UTF-8 sequence from a correctly +encoded Unicode character that is just not available in the current +font but otherwise fully legal, even though ISO 10646-1 doesn't +mandate this. In any case, just ignoring malformed sequences or +unavailable characters does not conform to ISO 10646, will make +debugging more difficult, and can lead to user confusion. + +Please check, whether a malformed UTF-8 sequence is (1) represented at +all, (2) represented by exactly one single replacement character (or +equivalent signal), and (3) the following quotation mark after an +illegal UTF-8 sequence is correctly displayed, i.e. proper +resynchronization takes place immediately after any malformed +sequence. This file says "THE END" in the last line, so if you don't +see that, your decoder crashed somehow before, which should always be +cause for concern. + +All lines in this file are exactly 79 characters long (plus the line +feed). In addition, all lines end with "|", except for the two test +lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls +U+0000 and U+007F. If you display this file with a fixed-width font, +these "|" characters should all line up in column 79 (right margin). +This allows you to test quickly, whether your UTF-8 decoder finds the +correct number of characters in every line, that is whether each +malformed sequences is replaced by a single replacement character. + +Note that, as an alternative to the notion of malformed sequence used +here, it is also a perfectly acceptable (and in some situations even +preferable) solution to represent each individual byte of a malformed +sequence with a replacement character. If you follow this strategy in +your decoder, then please ignore the "|" column. + + +Here come the tests: | + | +1 Some correct UTF-8 text | + | +You should see the Greek word 'kosme': "κόσμε" | + | +2 Boundary condition test cases | + | +2.1 First possible sequence of a certain length | + | +2.1.1 1 byte (U-00000000): "" +2.1.2 2 bytes (U-00000080): "€" | +2.1.3 3 bytes (U-00000800): "ࠀ" | +2.1.4 4 bytes (U-00010000): "𐀀" | +2.1.5 5 bytes (U-00200000): "�����" | +2.1.6 6 bytes (U-04000000): "������" | + | +2.2 Last possible sequence of a certain length | + | +2.2.1 1 byte (U-0000007F): "" +2.2.2 2 bytes (U-000007FF): "߿" | +2.2.3 3 bytes (U-0000FFFF): "￿" | +2.2.4 4 bytes (U-001FFFFF): "����" | +2.2.5 5 bytes (U-03FFFFFF): "�����" | +2.2.6 6 bytes (U-7FFFFFFF): "������" | + | +2.3 Other boundary conditions | + | +2.3.1 U-0000D7FF = ed 9f bf = "퟿" | +2.3.2 U-0000E000 = ee 80 80 = "" | +2.3.3 U-0000FFFD = ef bf bd = "�" | +2.3.4 U-0010FFFF = f4 8f bf bf = "􏿿" | +2.3.5 U-00110000 = f4 90 80 80 = "�" | + | +3 Malformed sequences | + | +3.1 Unexpected continuation bytes | + | +Each unexpected continuation byte should be separately signalled as a | +malformed sequence of its own. | + | +3.1.1 First continuation byte 0x80: "�" | +3.1.2 Last continuation byte 0xbf: "�" | + | +3.1.3 2 continuation bytes: "��" | +3.1.4 3 continuation bytes: "���" | +3.1.5 4 continuation bytes: "����" | +3.1.6 5 continuation bytes: "�����" | +3.1.7 6 continuation bytes: "������" | +3.1.8 7 continuation bytes: "�������" | + | +3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf): | + | + "���������������� | + ���������������� | + ���������������� | + ����������������" | + | +3.2 Lonely start characters | + | +3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf), | + each followed by a space character: | + | + "� � � � � � � � � � � � � � � � | + � � � � � � � � � � � � � � � � " | + | +3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef), | + each followed by a space character: | + | + "� � � � � � � � � � � � � � � � " | + | +3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7), | + each followed by a space character: | + | + "� � � � � � � � " | + | +3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb), | + each followed by a space character: | + | + "� � � � " | + | +3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd), | + each followed by a space character: | + | + "� � " | + | +3.3 Sequences with last continuation byte missing | + | +All bytes of an incomplete sequence should be signalled as a single | +malformed sequence, i.e., you should see only a single replacement | +character in each of the next 10 tests. (Characters as in section 2) | + | +3.3.1 2-byte sequence with last byte missing (U+0000): "�" | +3.3.2 3-byte sequence with last byte missing (U+0000): "�" | +3.3.3 4-byte sequence with last byte missing (U+0000): "�" | +3.3.4 5-byte sequence with last byte missing (U+0000): "����" | +3.3.5 6-byte sequence with last byte missing (U+0000): "�����" | +3.3.6 2-byte sequence with last byte missing (U-000007FF): "�" | +3.3.7 3-byte sequence with last byte missing (U-0000FFFF): "�" | +3.3.8 4-byte sequence with last byte missing (U-001FFFFF): "���" | +3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): "����" | +3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "�����" | + | +3.4 Concatenation of incomplete sequences | + | +All the 10 sequences of 3.3 concatenated, you should see 10 malformed | +sequences being signalled: | + | + "���������������������������" | + | +3.5 Impossible bytes | + | +The following two bytes cannot appear in a correct UTF-8 string | + | +3.5.1 fe = "�" | +3.5.2 ff = "�" | +3.5.3 fe fe ff ff = "����" | + | +4 Overlong sequences | + | +The following sequences are not malformed according to the letter of | +the Unicode 2.0 standard. However, they are longer then necessary and | +a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8 | +decoder" should reject them just like malformed sequences for two | +reasons: (1) It helps to debug applications if overlong sequences are | +not treated as valid representations of characters, because this helps | +to spot problems more quickly. (2) Overlong sequences provide | +alternative representations of characters, that could maliciously be | +used to bypass filters that check only for ASCII characters. For | +instance, a 2-byte encoded line feed (LF) would not be caught by a | +line counter that counts only 0x0a bytes, but it would still be | +processed as a line feed by an unsafe UTF-8 decoder later in the | +pipeline. From a security point of view, ASCII compatibility of UTF-8 | +sequences means also, that ASCII characters are *only* allowed to be | +represented by ASCII bytes in the range 0x00-0x7f. To ensure this | +aspect of ASCII compatibility, use only "safe UTF-8 decoders" that | +reject overlong UTF-8 sequences for which a shorter encoding exists. | + | +4.1 Examples of an overlong ASCII character | + | +With a safe UTF-8 decoder, all of the following five overlong | +representations of the ASCII character slash ("/") should be rejected | +like a malformed UTF-8 sequence, for instance by substituting it with | +a replacement character. If you see a slash below, you do not have a | +safe UTF-8 decoder! | + | +4.1.1 U+002F = c0 af = "��" | +4.1.2 U+002F = e0 80 af = "�" | +4.1.3 U+002F = f0 80 80 af = "�" | +4.1.4 U+002F = f8 80 80 80 af = "�����" | +4.1.5 U+002F = fc 80 80 80 80 af = "������" | + | +4.2 Maximum overlong sequences | + | +Below you see the highest Unicode value that is still resulting in an | +overlong sequence if represented with the given number of bytes. This | +is a boundary test for safe UTF-8 decoders. All five characters should | +be rejected like malformed UTF-8 sequences. | + | +4.2.1 U-0000007F = c1 bf = "��" | +4.2.2 U-000007FF = e0 9f bf = "�" | +4.2.3 U-0000FFFF = f0 8f bf bf = "�" | +4.2.4 U-001FFFFF = f8 87 bf bf bf = "�����" | +4.2.5 U-03FFFFFF = fc 83 bf bf bf bf = "������" | + | +4.3 Overlong representation of the NUL character | + | +The following five sequences should also be rejected like malformed | +UTF-8 sequences and should not be treated like the ASCII NUL | +character. | + | +4.3.1 U+0000 = c0 80 = "��" | +4.3.2 U+0000 = e0 80 80 = "�" | +4.3.3 U+0000 = f0 80 80 80 = "�" | +4.3.4 U+0000 = f8 80 80 80 80 = "�����" | +4.3.5 U+0000 = fc 80 80 80 80 80 = "������" | + | +5 Illegal code positions | + | +The following UTF-8 sequences should be rejected like malformed | +sequences, because they never represent valid ISO 10646 characters and | +a UTF-8 decoder that accepts them might introduce security problems | +comparable to overlong UTF-8 sequences. | + | +5.1 Single UTF-16 surrogates | + | +5.1.1 U+D800 = ed a0 80 = "�" | +5.1.2 U+DB7F = ed ad bf = "�" | +5.1.3 U+DB80 = ed ae 80 = "�" | +5.1.4 U+DBFF = ed af bf = "�" | +5.1.5 U+DC00 = ed b0 80 = "�" | +5.1.6 U+DF80 = ed be 80 = "�" | +5.1.7 U+DFFF = ed bf bf = "�" | + | +5.2 Paired UTF-16 surrogates | + | +5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = "��" | +5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = "��" | +5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = "��" | +5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = "��" | +5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = "��" | +5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = "��" | +5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = "��" | +5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = "��" | + | +5.3 Noncharacter code positions | + | +The following "noncharacters" are "reserved for internal use" by | +applications, and according to older versions of the Unicode Standard | +"should never be interchanged". Unicode Corrigendum #9 dropped the | +latter restriction. Nevertheless, their presence in incoming UTF-8 data | +can remain a potential security risk, depending on what use is made of | +these codes subsequently. Examples of such internal use: | + | + - Some file APIs with 16-bit characters may use the integer value -1 | + = U+FFFF to signal an end-of-file (EOF) or error condition. | + | + - In some UTF-16 receivers, code point U+FFFE might trigger a | + byte-swap operation (to convert between UTF-16LE and UTF-16BE). | + | +With such internal use of noncharacters, it may be desirable and safer | +to block those code points in UTF-8 decoders, as they should never | +occur legitimately in incoming UTF-8 data, and could trigger unsafe | +behaviour in subsequent processing. | + | +Particularly problematic noncharacters in 16-bit applications: | + | +5.3.1 U+FFFE = ef bf be = "￾" | +5.3.2 U+FFFF = ef bf bf = "￿" | + | +Other noncharacters: | + | +5.3.3 U+FDD0 .. U+FDEF = "﷐﷑﷒﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞﷟﷠﷡﷢﷣﷤﷥﷦﷧﷨﷩﷪﷫﷬﷭﷮﷯"| + | +5.3.4 U+nFFFE U+nFFFF (for n = 1..10) | + | + "🿾🿿𯿾𯿿𿿾𿿿񏿾񏿿񟿾񟿿񯿾񯿿񿿾񿿿򏿾򏿿 | + 򟿾򟿿򯿾򯿿򿿾򿿿󏿾󏿿󟿾󟿿󯿾󯿿󿿾󿿿􏿾􏿿" | + | +THE END | + diff --git a/regress/utf8-test.sh b/regress/utf8-test.sh new file mode 100644 index 00000000..3b2b22c5 --- /dev/null +++ b/regress/utf8-test.sh @@ -0,0 +1,21 @@ +#!/bin/sh + +PATH=/bin:/usr/bin +TERM=screen + +[ -z "$TEST_TMUX" ] && TEST_TMUX=$(readlink -f ../tmux) +TMUX="$TEST_TMUX -Ltest" +TMP=$(mktemp) +trap "rm -f $TMP" 0 1 15 +$TMUX kill-server 2>/dev/null + +$TMUX -f/dev/null \ + set -g remain-on-exit on \; \ + set -g remain-on-exit-format '' \; \ + new -d -- cat UTF-8-test.txt +sleep 1 +$TMUX capturep -pCeJS- >$TMP +$TMUX kill-server + +cmp -s $TMP utf8-test.result || exit 1 +exit 0