From f614069b67c53b37e0b168b0c29ce0a12fcca1fa Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 25 Nov 2024 07:12:24 -0700 Subject: [PATCH 1/7] utf8_to_uv_msgs(): Move dTHX to only block used in This is only needed if a problem was found. --- utf8.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utf8.c b/utf8.c index a7b48994ac07..cf375699f734 100644 --- a/utf8.c +++ b/utf8.c @@ -1640,8 +1640,6 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, Size_t expectlen; /* How long should this sequence be? */ Size_t avail_len; /* When input is too short, gives what that is */ - dTHX; - /* Here, is one of: * a) malformed; * b) a problematic code point (surrogate, non-unicode, or nonchar); or @@ -1955,6 +1953,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, bool success = true; if (UNLIKELY(possible_problems)) { + dTHX; /* Here, the input sequence is potentially problematic. The code here * determines if that is indeed the case and how to handle it. The From b5263a4ed0ea17ce519aa5ac571b8d45b31485b2 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 25 Nov 2024 07:50:08 -0700 Subject: [PATCH 2/7] utf8_to_uv_msgs: Move declaration to first need There are several paths through the code that don't need this copy to be made. Move to just before it is really needed. --- utf8.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/utf8.c b/utf8.c index cf375699f734..d1be07635191 100644 --- a/utf8.c +++ b/utf8.c @@ -1629,11 +1629,6 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, const U8 * s = s0; - /* The ending position, plus 1, of the first character in the sequence - * beginning at s0. 
In other words, 'e', adjusted down to to be no more - * than a single character */ - const U8 * send = e; - U32 possible_problems; /* A bit is set here for each potential problem found as we go along */ UV uv = 0; @@ -1659,8 +1654,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, } /* Each of the affected Hanguls starts with \xED */ - - if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */ + if (is_HANGUL_ED_utf8_safe(s0, e)) { /* Always false on EBCDIC */ if (advance_p) { *advance_p = 3; } @@ -1675,10 +1669,10 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely * what it is intended to do, and that no flaws in it are masked by * dropping down and executing the code below - assert(! isUTF8_CHAR(s0, send) - || UTF8_IS_SURROGATE(s0, send) - || UTF8_IS_SUPER(s0, send) - || UTF8_IS_NONCHAR(s0,send)); + assert(! isUTF8_CHAR(s0, e) + || UTF8_IS_SURROGATE(s0, e) + || UTF8_IS_SUPER(s0, e) + || UTF8_IS_NONCHAR(s0, e)); */ s = s0; @@ -1720,6 +1714,11 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, * allowed one, we could allow in something that shouldn't have been. */ + /* The ending position, plus 1, of the first character in the sequence + * beginning at s0. In other words, 'e', adjusted down to to be no more + * than a single character */ + const U8 * send = e; + Size_t curlen; if (UNLIKELY(s0 >= send)) { possible_problems |= UTF8_GOT_EMPTY; From 4ef351c7d8864c795672f0977e12c7c943555af9 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 27 Nov 2024 21:30:10 -0700 Subject: [PATCH 3/7] utf8_to_uv_msgs: Combine duplicate statements By deferring to the end of the function the setting of a variable returned to the caller, we can eliminate one copy of that setting. The early returns from this function that necessitated the early setting have been removed in previous commits. 
--- utf8.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/utf8.c b/utf8.c index d1be07635191..2188cd2b0b6b 100644 --- a/utf8.c +++ b/utf8.c @@ -1733,15 +1733,6 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, /* This is a helper function; invariants should have been handled before * calling it */ assert(! NATIVE_BYTE_IS_INVARIANT(*s0)); - - /* A well-formed UTF-8 character, as the vast majority of calls to this - * function will be for, has this expected length. For efficiency, set - * things up here to return it. It will be overridden only in those rare - * cases where a malformation is found */ - if (advance_p) { - *advance_p = expectlen; - } - /* A continuation character can't start a valid sequence */ if (UNLIKELY(UTF8_IS_CONTINUATION(*s0))) { possible_problems |= UTF8_GOT_CONTINUATION; @@ -2550,13 +2541,6 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, } } /* End of 'while (possible_problems)' */ - /* Since there was a possible problem, the returned length may need to - * be changed from the one stored at the beginning of this function. - * Instead of trying to figure out if it has changed, just do it. */ - if (advance_p) { - *advance_p = curlen; - } - if (msgs_return) { *msgs = msgs_return; } @@ -2575,6 +2559,10 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, } } /* End of there was a possible problem */ + if (advance_p) { + *advance_p = curlen; + } + *cp_p = UNI_TO_NATIVE(uv); return success; } From db51311003f32ef53dfa54234418d04b37234659 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 4 Dec 2024 06:46:18 -0700 Subject: [PATCH 4/7] utf8_to_uv_msgs: Rationalize use of two variables This makes these two variables always contain the value their names indicate. 
--- utf8.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/utf8.c b/utf8.c index 2188cd2b0b6b..84a7379cd347 100644 --- a/utf8.c +++ b/utf8.c @@ -1633,7 +1633,6 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, found as we go along */ UV uv = 0; Size_t expectlen; /* How long should this sequence be? */ - Size_t avail_len; /* When input is too short, gives what that is */ /* Here, is one of: * a) malformed; @@ -1678,8 +1677,6 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, s = s0; possible_problems = 0; expectlen = 0; - avail_len = 0; - if (errors) { *errors = 0; } @@ -1714,18 +1711,23 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, * allowed one, we could allow in something that shouldn't have been. */ + Size_t curlen = 0; /* How many bytes have we processed so far */ + + /* Gives how many bytes are available, which may turn out to be less than + * the expected length */ + Size_t avail_len; + /* The ending position, plus 1, of the first character in the sequence * beginning at s0. In other words, 'e', adjusted down to to be no more * than a single character */ const U8 * send = e; - Size_t curlen; if (UNLIKELY(s0 >= send)) { possible_problems |= UTF8_GOT_EMPTY; - curlen = 0; + avail_len = 0; goto ready_to_handle_errors; } - curlen = send - s0; + avail_len = send - s0; /* We now know we can examine the first byte of the input */ expectlen = UTF8SKIP(s0); @@ -1752,12 +1754,12 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, /* Setup the loop end point, making sure to not look past the end of the * input string, and flag it as too short if the size isn't big enough. 
*/ - if (UNLIKELY(curlen < expectlen)) { + if (UNLIKELY(avail_len < expectlen)) { possible_problems |= UTF8_GOT_SHORT; - avail_len = curlen; } else { send = (U8*) s0 + expectlen; + avail_len = expectlen; } /* Now, loop through the remaining bytes in the character's sequence, From f12cc173dc24e9adf385aaf4a8d24910b8d91394 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 4 Dec 2024 06:56:35 -0700 Subject: [PATCH 5/7] utf8_to_uv_msgs: Move an assert This is a small detail, but this moves this assert to after a conditional that would exclude it. That is, if the conditional is true, the assert is pointless. So move the assert to where we know the conditional is false. --- utf8.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/utf8.c b/utf8.c index 84a7379cd347..775185763cd0 100644 --- a/utf8.c +++ b/utf8.c @@ -1732,9 +1732,6 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, /* We now know we can examine the first byte of the input */ expectlen = UTF8SKIP(s0); - /* This is a helper function; invariants should have been handled before - * calling it */ - assert(! NATIVE_BYTE_IS_INVARIANT(*s0)); /* A continuation character can't start a valid sequence */ if (UNLIKELY(UTF8_IS_CONTINUATION(*s0))) { possible_problems |= UTF8_GOT_CONTINUATION; @@ -1742,6 +1739,10 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, goto ready_to_handle_errors; } + /* This is a helper function; invariants should have been handled before + * calling it */ + assert(! NATIVE_BYTE_IS_INVARIANT(*s0)); + /* Here is not a continuation byte, nor an invariant. The only thing left * is a start byte (possibly for an overlong). 
(We can't use UTF8_IS_START * to check for sure because it excludes start bytes like \xC0 that always From c495aae317df8db3ae52884fcb449cbc5e92f06d Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 12 Jan 2025 17:04:51 -0700 Subject: [PATCH 6/7] utf8_to_uv_msgs: Move decls and inits closer to first use C99 allows us to declare anywhere; so move these to where its more logical. It also makes sure some variables are initialized before the goto that jumps to the end of the program, and which currently doesn't rely on these values, but could be changed to do so someday without the coder realizing it. This prevents a problem in case that happens. --- utf8.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/utf8.c b/utf8.c index 775185763cd0..67cbe3cceebf 100644 --- a/utf8.c +++ b/utf8.c @@ -1627,13 +1627,6 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, { PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS_HELPER_; - const U8 * s = s0; - - U32 possible_problems; /* A bit is set here for each potential problem - found as we go along */ - UV uv = 0; - Size_t expectlen; /* How long should this sequence be? */ - /* Here, is one of: * a) malformed; * b) a problematic code point (surrogate, non-unicode, or nonchar); or @@ -1674,9 +1667,6 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, || UTF8_IS_NONCHAR(s0, e)); */ - s = s0; - possible_problems = 0; - expectlen = 0; if (errors) { *errors = 0; } @@ -1711,7 +1701,10 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, * allowed one, we could allow in something that shouldn't have been. */ + Size_t expectlen = 0; /* How long should this sequence be? 
*/ Size_t curlen = 0; /* How many bytes have we processed so far */ + UV uv = 0; /* The accumulated code point, so far */ + const U8 * s = s0; /* Our current position examining the sequence */ /* Gives how many bytes are available, which may turn out to be less than * the expected length */ @@ -1722,6 +1715,11 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, * than a single character */ const U8 * send = e; + /* A bit is set here for each potential problem found as we go along */ + U32 possible_problems = 0; + + /* The above variables have to be initialized before the 'goto' */ + if (UNLIKELY(s0 >= send)) { possible_problems |= UTF8_GOT_EMPTY; avail_len = 0; @@ -1729,10 +1727,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, } avail_len = send - s0; - /* We now know we can examine the first byte of the input */ - expectlen = UTF8SKIP(s0); - - /* A continuation character can't start a valid sequence */ + /* We now know we can examine the first byte of the input. A continuation + * character can't start a valid sequence */ if (UNLIKELY(UTF8_IS_CONTINUATION(*s0))) { possible_problems |= UTF8_GOT_CONTINUATION; curlen = 1; @@ -1744,9 +1740,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, assert(! NATIVE_BYTE_IS_INVARIANT(*s0)); /* Here is not a continuation byte, nor an invariant. The only thing left - * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START - * to check for sure because it excludes start bytes like \xC0 that always - * lead to overlongs.) */ + * is a start byte (possibly for an overlong). */ + expectlen = UTF8SKIP(s0); /* How long should this sequence be? 
*/ /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits * that indicate the number of bytes in the character's whole UTF-8 From 56b1fa0b391fa2bfa733728c97bd1e81246f2f2e Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 4 Dec 2024 11:59:35 -0700 Subject: [PATCH 7/7] utf8_to_uv_msgs: Remove duplicate code This was introduced in a rebasing error --- utf8.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/utf8.c b/utf8.c index 67cbe3cceebf..89f3f89df79a 100644 --- a/utf8.c +++ b/utf8.c @@ -1667,10 +1667,6 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, || UTF8_IS_NONCHAR(s0, e)); */ - if (errors) { - *errors = 0; - } - /* Accumulate the code point translation of the input byte sequence * s0 .. e-1, looking for malformations. *