Perl · khwilliamson · Jan 16, 2025 · Nov 25, 2024 · Nov 25, 2024 · Nov 28, 2024
diff --git a/utf8.c b/utf8.c
@@ -1627,21 +1627,6 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
 {
     PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS_HELPER_;
 
-    const U8 * s = s0;
-
-    /* The ending position, plus 1, of the first character in the sequence
-     * beginning at s0.  In other words, 'e', adjusted down to to be no more
-     * than a single character */
-    const U8 * send = e;
-
-    U32 possible_problems;  /* A bit is set here for each potential problem
-                               found as we go along */
-    UV uv = 0;
-    Size_t expectlen;    /* How long should this sequence be? */
-    Size_t avail_len;    /* When input is too short, gives what that is */
-
-    dTHX;
-
     /* Here, is one of:
      *  a)  malformed;
      *  b)  a problematic code point (surrogate, non-unicode, or nonchar); or
@@ -1661,8 +1646,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
     }
 
     /* Each of the affected Hanguls starts with \xED */
-
-    if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */
+    if (is_HANGUL_ED_utf8_safe(s0, e)) { /* Always false on EBCDIC */
         if (advance_p) {
             *advance_p = 3;
         }
@@ -1677,21 +1661,12 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
      * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
      * what it is intended to do, and that no flaws in it are masked by
      * dropping down and executing the code below
-    assert(! isUTF8_CHAR(s0, send)
-          || UTF8_IS_SURROGATE(s0, send)
-          || UTF8_IS_SUPER(s0, send)
-          || UTF8_IS_NONCHAR(s0,send));
+    assert(! isUTF8_CHAR(s0, e)
+          || UTF8_IS_SURROGATE(s0, e)
+          || UTF8_IS_SUPER(s0, e)
+          || UTF8_IS_NONCHAR(s0, e));
     */
 
-    s = s0;
-    possible_problems = 0;
-    expectlen = 0;
-    avail_len = 0;
-
-    if (errors) {
-        *errors = 0;
-    }
-
     /* Accumulate the code point translation of the input byte sequence
      * s0 .. e-1, looking for malformations.
      *
@@ -1722,40 +1697,47 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
      * allowed one, we could allow in something that shouldn't have been.
      */
 
-    Size_t curlen;
-    if (UNLIKELY(s0 >= send)) {
-        possible_problems |= UTF8_GOT_EMPTY;
-        curlen = 0;
-        goto ready_to_handle_errors;
-    }
-    curlen = send - s0;
+    Size_t expectlen = 0;   /* How long should this sequence be? */
+    Size_t curlen = 0;      /* How many bytes have we processed so far */
+    UV uv = 0;              /* The accumulated code point, so far */
+    const U8 * s = s0;      /* Our current position examining the sequence */
 
-    /* We now know we can examine the first byte of the input */
-    expectlen = UTF8SKIP(s0);
+    /* Gives how many bytes are available, which may turn out to be less than
+     * the expected length */
+    Size_t avail_len;
 
-    /* This is a helper function; invariants should have been handled before
-     * calling it */
-    assert(! NATIVE_BYTE_IS_INVARIANT(*s0));
+    /* The ending position, plus 1, of the first character in the sequence
+     * beginning at s0.  In other words, 'e', adjusted down to be no more
+     * than a single character */
+    const U8 * send = e;
 
-    /* A well-formed UTF-8 character, as the vast majority of calls to this
-     * function will be for, has this expected length.  For efficiency, set
-     * things up here to return it.  It will be overridden only in those rare
-     * cases where a malformation is found */
-    if (advance_p) {
-        *advance_p = expectlen;
+    /* A bit is set here for each potential problem found as we go along */
+    U32 possible_problems = 0;
+
+    /* The above variables have to be initialized before the 'goto' */
+
+    if (UNLIKELY(s0 >= send)) {
+        possible_problems |= UTF8_GOT_EMPTY;
+        avail_len = 0;
+        goto ready_to_handle_errors;
     }
+    avail_len = send - s0;
 
-    /* A continuation character can't start a valid sequence */
+    /* We now know we can examine the first byte of the input.  A continuation
+     * character can't start a valid sequence */
     if (UNLIKELY(UTF8_IS_CONTINUATION(*s0))) {
         possible_problems |= UTF8_GOT_CONTINUATION;
         curlen = 1;
         goto ready_to_handle_errors;
     }
 
+    /* This is a helper function; invariants should have been handled before
+     * calling it */
+    assert(! NATIVE_BYTE_IS_INVARIANT(*s0));
+
     /* Here is not a continuation byte, nor an invariant.  The only thing left
-     * is a start byte (possibly for an overlong).  (We can't use UTF8_IS_START
-     * to check for sure because it excludes start bytes like \xC0 that always
-     * lead to overlongs.) */
+     * is a start byte (possibly for an overlong). */
+    expectlen = UTF8SKIP(s0); /* How long should this sequence be? */
 
     /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
      * that indicate the number of bytes in the character's whole UTF-8
@@ -1764,12 +1746,12 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
 
     /* Setup the loop end point, making sure to not look past the end of the
      * input string, and flag it as too short if the size isn't big enough. */
-    if (UNLIKELY(curlen < expectlen)) {
+    if (UNLIKELY(avail_len < expectlen)) {
         possible_problems |= UTF8_GOT_SHORT;
-        avail_len = curlen;
     }
     else {
         send = (U8*) s0 + expectlen;
+        avail_len = expectlen;
     }
 
     /* Now, loop through the remaining bytes in the character's sequence,
@@ -1955,6 +1937,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
     bool success = true;
 
     if (UNLIKELY(possible_problems)) {
+        dTHX;
 
         /* Here, the input sequence is potentially problematic.  The code here
          * determines if that is indeed the case and how to handle it.  The
@@ -2552,13 +2535,6 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
             }
         }   /* End of 'while (possible_problems)' */
 
-        /* Since there was a possible problem, the returned length may need to
-         * be changed from the one stored at the beginning of this function.
-         * Instead of trying to figure out if it has changed, just do it. */
-        if (advance_p) {
-            *advance_p = curlen;
-        }
-
         if (msgs_return) {
             *msgs = msgs_return;
         }
@@ -2577,6 +2553,10 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
         }
     } /* End of there was a possible problem */
 
+    if (advance_p) {
+        *advance_p = curlen;
+    }
+
     *cp_p = UNI_TO_NATIVE(uv);
     return success;
 }