Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

utf8_to_uv_msgs: Do some code cleanup #22819

Merged
merged 7 commits into from
Jan 16, 2025
102 changes: 41 additions & 61 deletions utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -1627,21 +1627,6 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
{
PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS_HELPER_;

const U8 * s = s0;

/* The ending position, plus 1, of the first character in the sequence
* beginning at s0. In other words, 'e', adjusted down to be no more
* than a single character */
const U8 * send = e;

U32 possible_problems; /* A bit is set here for each potential problem
found as we go along */
UV uv = 0;
Size_t expectlen; /* How long should this sequence be? */
Size_t avail_len; /* When input is too short, gives what that is */

dTHX;

/* Here, is one of:
* a) malformed;
* b) a problematic code point (surrogate, non-unicode, or nonchar); or
Expand All @@ -1661,8 +1646,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
}

/* Each of the affected Hanguls starts with \xED */

if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */
if (is_HANGUL_ED_utf8_safe(s0, e)) { /* Always false on EBCDIC */
if (advance_p) {
*advance_p = 3;
}
Expand All @@ -1677,21 +1661,12 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
* APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
* what it is intended to do, and that no flaws in it are masked by
* dropping down and executing the code below
assert(! isUTF8_CHAR(s0, send)
|| UTF8_IS_SURROGATE(s0, send)
|| UTF8_IS_SUPER(s0, send)
|| UTF8_IS_NONCHAR(s0,send));
assert(! isUTF8_CHAR(s0, e)
|| UTF8_IS_SURROGATE(s0, e)
|| UTF8_IS_SUPER(s0, e)
|| UTF8_IS_NONCHAR(s0, e));
*/

s = s0;
possible_problems = 0;
expectlen = 0;
avail_len = 0;

if (errors) {
*errors = 0;
}

/* Accumulate the code point translation of the input byte sequence
* s0 .. e-1, looking for malformations.
*
Expand Down Expand Up @@ -1722,40 +1697,47 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
* allowed one, we could allow in something that shouldn't have been.
*/

Size_t curlen;
if (UNLIKELY(s0 >= send)) {
possible_problems |= UTF8_GOT_EMPTY;
curlen = 0;
goto ready_to_handle_errors;
}
curlen = send - s0;
Size_t expectlen = 0; /* How long should this sequence be? */
Size_t curlen = 0; /* How many bytes have we processed so far */
UV uv = 0; /* The accumulated code point, so far */
const U8 * s = s0; /* Our current position examining the sequence */

/* We now know we can examine the first byte of the input */
expectlen = UTF8SKIP(s0);
/* Gives how many bytes are available, which may turn out to be less than
* the expected length */
Size_t avail_len;

/* This is a helper function; invariants should have been handled before
* calling it */
assert(! NATIVE_BYTE_IS_INVARIANT(*s0));
/* The ending position, plus 1, of the first character in the sequence
* beginning at s0. In other words, 'e', adjusted down to be no more
* than a single character */
const U8 * send = e;

/* A well-formed UTF-8 character, as the vast majority of calls to this
* function will be for, has this expected length. For efficiency, set
* things up here to return it. It will be overridden only in those rare
* cases where a malformation is found */
if (advance_p) {
*advance_p = expectlen;
/* A bit is set here for each potential problem found as we go along */
U32 possible_problems = 0;

/* The above variables have to be initialized before the 'goto' */

if (UNLIKELY(s0 >= send)) {
possible_problems |= UTF8_GOT_EMPTY;
avail_len = 0;
goto ready_to_handle_errors;
}
avail_len = send - s0;

/* A continuation character can't start a valid sequence */
/* We now know we can examine the first byte of the input. A continuation
* character can't start a valid sequence */
if (UNLIKELY(UTF8_IS_CONTINUATION(*s0))) {
possible_problems |= UTF8_GOT_CONTINUATION;
curlen = 1;
goto ready_to_handle_errors;
}

/* This is a helper function; invariants should have been handled before
* calling it */
assert(! NATIVE_BYTE_IS_INVARIANT(*s0));

/* Here is not a continuation byte, nor an invariant. The only thing left
* is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
* to check for sure because it excludes start bytes like \xC0 that always
* lead to overlongs.) */
* is a start byte (possibly for an overlong). */
expectlen = UTF8SKIP(s0); /* How long should this sequence be? */

/* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
* that indicate the number of bytes in the character's whole UTF-8
Expand All @@ -1764,12 +1746,12 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,

/* Setup the loop end point, making sure to not look past the end of the
* input string, and flag it as too short if the size isn't big enough. */
if (UNLIKELY(curlen < expectlen)) {
if (UNLIKELY(avail_len < expectlen)) {
possible_problems |= UTF8_GOT_SHORT;
avail_len = curlen;
}
else {
send = (U8*) s0 + expectlen;
avail_len = expectlen;
}

/* Now, loop through the remaining bytes in the character's sequence,
Expand Down Expand Up @@ -1955,6 +1937,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
bool success = true;

if (UNLIKELY(possible_problems)) {
dTHX;

/* Here, the input sequence is potentially problematic. The code here
* determines if that is indeed the case and how to handle it. The
Expand Down Expand Up @@ -2552,13 +2535,6 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
}
} /* End of 'while (possible_problems)' */

/* Since there was a possible problem, the returned length may need to
* be changed from the one stored at the beginning of this function.
* Instead of trying to figure out if it has changed, just do it. */
if (advance_p) {
*advance_p = curlen;
}

if (msgs_return) {
*msgs = msgs_return;
}
Expand All @@ -2577,6 +2553,10 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
}
} /* End of there was a possible problem */

if (advance_p) {
*advance_p = curlen;
}

*cp_p = UNI_TO_NATIVE(uv);
return success;
}
Expand Down
Loading