diff --git a/runtime/doc/pattern.txt b/runtime/doc/pattern.txt index f11040c4ec..857a3e6484 100644 --- a/runtime/doc/pattern.txt +++ b/runtime/doc/pattern.txt @@ -1,4 +1,4 @@ -*pattern.txt* For Vim version 9.1. Last change: 2025 Mar 21 +*pattern.txt* For Vim version 9.1. Last change: 2025 Mar 28 VIM REFERENCE MANUAL by Bram Moolenaar @@ -1222,7 +1222,8 @@ x A single character, with no special meaning, matches itself \o40 octal number of character up to 0o377 \x20 hexadecimal number of character up to 0xff \u20AC hex. number of multibyte character up to 0xffff - \U1234 hex. number of multibyte character up to 0xffffffff + \U1234 hex. number of multibyte character up to 8 characters + 0xffffffff |E1541| NOTE: The other backslash codes mentioned above do not work inside []! - Matching with a collection can be slow, because each character in @@ -1263,7 +1264,8 @@ x A single character, with no special meaning, matches itself \%u20AC Matches the character specified with up to four hexadecimal characters. \%U1234abcd Matches the character specified with up to eight hexadecimal - characters, up to 0x7fffffff + characters, up to 0x7fffffff (the maximum allowed value is INT_MAX + |E1541|, but the maximum valid Unicode codepoint is U+10FFFF). ============================================================================== 7. Ignoring case in a pattern */ignorecase* diff --git a/runtime/doc/tags b/runtime/doc/tags index 75b00aae14..7d54ee9b85 100644 --- a/runtime/doc/tags +++ b/runtime/doc/tags @@ -4621,6 +4621,7 @@ E1538 eval.txt /*E1538* E1539 vim9.txt /*E1539* E154 helphelp.txt /*E154* E1540 eval.txt /*E1540* +E1541 vi_diff.txt /*E1541* E155 sign.txt /*E155* E156 sign.txt /*E156* E157 sign.txt /*E157* diff --git a/runtime/doc/vi_diff.txt b/runtime/doc/vi_diff.txt index b96f77907c..46db57a458 100644 --- a/runtime/doc/vi_diff.txt +++ b/runtime/doc/vi_diff.txt @@ -1,4 +1,4 @@ -*vi_diff.txt* For Vim version 9.1. Last change: 2024 Nov 10 +*vi_diff.txt* For Vim version 9.1. Last change: 2025 Mar 28 VIM REFERENCE MANUAL by Bram Moolenaar @@ -91,8 +91,11 @@ Maximum display width Unix and Win32: 1024 characters, otherwise 255 Maximum lhs of a mapping 50 characters. Number of different highlighting types: over 30000 Range of a Number variable: -2147483648 to 2147483647 (might be more on 64 - bit systems) + bit systems) See also: |v:numbermax|, + |v:numbermin| and |v:numbersize| Maximum length of a line in a tags file: 512 bytes. + *E1541* +Maximum value for |/\U| and |/\%U|: 2147483647 (for 32bit integer). Information for undo and text in registers is kept in memory, thus when making (big) changes the amount of (virtual) memory available limits the number of diff --git a/src/errors.h b/src/errors.h index 9331484ac5..6e2782df5b 100644 --- a/src/errors.h +++ b/src/errors.h @@ -3716,3 +3716,5 @@ EXTERN char e_variadic_tuple_must_end_with_list_type_str[] EXTERN char e_cannot_use_variadic_tuple_in_concatenation[] INIT(= N_("E1540: Cannot use a variadic tuple in concatenation")); #endif +EXTERN char e_unicode_val_too_large[] + INIT(= N_("E1541: Value too large, max Unicode codepoint is U+10FFFF")); diff --git a/src/regexp.c b/src/regexp.c index ea6079b008..32a721f9f3 100644 --- a/src/regexp.c +++ b/src/regexp.c @@ -427,9 +427,9 @@ static void skipchr_keepstart(void); static int peekchr(void); static void skipchr(void); static void ungetchr(void); -static long gethexchrs(int maxinputlen); +static vimlong_T gethexchrs(int maxinputlen); static long getoctchrs(void); -static long getdecchrs(void); +static vimlong_T getdecchrs(void); static int coll_get_char(void); static int prog_magic_wrong(void); static int cstrncmp(char_u *s1, char_u *s2, int *n); @@ -979,7 +979,7 @@ ungetchr(void) * The parameter controls the maximum number of input characters. This will be * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence. */ - static long + static vimlong_T gethexchrs(int maxinputlen) { long_u nr = 0; @@ -998,14 +998,14 @@ gethexchrs(int maxinputlen) if (i == 0) return -1; - return (long)nr; + return nr; } /* * Get and return the value of the decimal string immediately after the * current position. Return -1 for invalid. Consumes all digits. */ - static long + static vimlong_T getdecchrs(void) { long_u nr = 0; @@ -1025,7 +1025,7 @@ getdecchrs(void) if (i == 0) return -1; - return (long)nr; + return nr; } /* diff --git a/src/regexp_bt.c b/src/regexp_bt.c index 16dac730de..f4bd6c36d2 100644 --- a/src/regexp_bt.c +++ b/src/regexp_bt.c @@ -1589,7 +1589,7 @@ regatom(int *flagp) case 'u': // %uabcd hex 4 case 'U': // %U1234abcd hex 8 { - long i; + vimlong_T i; switch (c) { @@ -1612,7 +1612,7 @@ regatom(int *flagp) if (i == 0) regc(0x0a); else - regmbc(i); + regmbc((int)i); regc(NUL); *flagp |= HASWIDTH; break; @@ -1831,6 +1831,10 @@ collection: || *regparse == 'U') { startc = coll_get_char(); + // max UTF-8 Codepoint is U+10FFFF, + // but allow values until INT_MAX + if (startc == INT_MAX) + EMSG_RET_NULL(_(e_unicode_val_too_large)); if (startc == 0) regc(0x0a); else @@ -2131,7 +2135,7 @@ regpiece(int *flagp) int lop = END; long nr; - nr = getdecchrs(); + nr = (long)getdecchrs(); switch (no_Magic(getchr())) { case '=': lop = MATCH; break; // \@= @@ -2610,7 +2614,7 @@ vim_regcomp_had_eol(void) static int coll_get_char(void) { - long nr = -1; + vimlong_T nr = -1; switch (*regparse++) { @@ -2620,13 +2624,15 @@ coll_get_char(void) case 'u': nr = gethexchrs(4); break; case 'U': nr = gethexchrs(8); break; } - if (nr < 0 || nr > INT_MAX) + if (nr < 0) { // If getting the number fails be backwards compatible: the character // is a backslash. --regparse; nr = '\\'; } + if (nr > INT_MAX) + nr = INT_MAX; return nr; } diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c index 557d0e1aac..6ad682bcf8 100644 --- a/src/regexp_nfa.c +++ b/src/regexp_nfa.c @@ -1560,7 +1560,7 @@ nfa_regatom(void) case 'u': // %uabcd hex 4 case 'U': // %U1234abcd hex 8 { - long nr; + vimlong_T nr; switch (c) { @@ -1577,7 +1577,7 @@ nfa_regatom(void) reg_magic == MAGIC_ALL); // A NUL is stored in the text as NL // TODO: what if a composing character follows? - EMIT(nr == 0 ? 0x0a : nr); + EMIT(nr == 0 ? 0x0a : (long)nr); } break; @@ -1953,6 +1953,10 @@ collection: { // TODO(RE) This needs more testing startc = coll_get_char(); + // max UTF-8 Codepoint is U+10FFFF, + // but allow values until INT_MAX + if (startc == INT_MAX) + EMSG_RET_FAIL(_(e_unicode_val_too_large)); got_coll_char = TRUE; MB_PTR_BACK(old_regparse, regparse); } @@ -2218,7 +2222,7 @@ nfa_regpiece(void) break; case Magic('@'): - c2 = getdecchrs(); + c2 = (long)getdecchrs(); op = no_Magic(getchr()); i = 0; switch(op) diff --git a/src/testdir/test_search.vim b/src/testdir/test_search.vim index 708aca2a82..75291750f4 100644 --- a/src/testdir/test_search.vim +++ b/src/testdir/test_search.vim @@ -1541,17 +1541,46 @@ func Test_large_hex_chars2() try /[\Ufffffc1f] catch - call assert_match('E486:', v:exception) + call assert_match('E1541:', v:exception) endtry try set re=1 /[\Ufffffc1f] catch - call assert_match('E486:', v:exception) + call assert_match('E1541:', v:exception) endtry set re& endfunc +func Test_large_hex_chars3() + " Validate max number of Unicode char + try + /[\UFFFFFFFF] + catch + call assert_match('E1541:', v:exception) + endtry + try + /[\UFFFFFFF] + catch + call assert_match('E486:', v:exception) + endtry + try + /\%#=2[\d32-\UFFFFFFFF] + catch + call assert_match('E1541:', v:exception) + endtry + try + /\%#=1[\UFFFFFFFF] + catch + call assert_match('E1541:', v:exception) + endtry + try + /\%#=1[\d32-\UFFFFFFFF] + catch + call assert_match('E945:', v:exception) + endtry +endfunc + func Test_one_error_msg() " This was also giving an internal error call assert_fails('call search(" \\((\\v[[=P=]]){185}+ ")', 'E871:') diff --git a/src/version.c b/src/version.c index 7d6c7e3f9b..22357447fa 100644 --- a/src/version.c +++ b/src/version.c @@ -704,6 +704,8 @@ static char *(features[]) = static int included_patches[] = { /* Add new patch number below this line */ +/**/ + 1258, /**/ 1257, /**/