Fix length calculations when using multibyte characters

Fix tokenEnd and cursor for string literals, template literals, XML literals, and multiline comments when the source contains multibyte characters (for example, characters written as UTF-16 surrogate pairs).
mozilla · Dec 3, 2024 · 75e9c9f · 75e9c9f
1 parent 8b45873
commit 75e9c9f
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 2 deletions.
diff --git a/rhino/src/main/java/org/mozilla/javascript/TokenStream.java b/rhino/src/main/java/org/mozilla/javascript/TokenStream.java
@@ -1127,6 +1127,8 @@ final int getToken() throws IOException {
 
                 String str = getStringFromBuffer();
                 this.string = internString(str);
+                cursor = sourceCursor;
+                tokenEnd = cursor;
                 return Token.STRING;
             }
 
@@ -1330,6 +1332,7 @@ && matchChar('.')) {
                                 lookForSlash = true;
                             } else if (c == '/') {
                                 if (lookForSlash) {
+                                    cursor = sourceCursor;
                                     tokenEnd = cursor;
                                     return Token.COMMENT;
                                 }
@@ -1653,6 +1656,8 @@ int readTemplateLiteral(boolean isTaggedLiteral) throws IOException {
                 case '`':
                     rawString.setLength(rawString.length() - 1); // don't include "`"
                     this.string = hasInvalidEscapeSequences ? null : getStringFromBuffer();
+                    cursor = sourceCursor;
+                    tokenEnd = cursor;
                     return Token.TEMPLATE_LITERAL;
                 case '$':
                     if (matchTemplateLiteralChar('{')) {
@@ -1907,6 +1912,8 @@ int getNextXMLToken() throws IOException {
 
                 if (!xmlIsTagContent && xmlOpenTagsCount == 0) {
                     this.string = getStringFromBuffer();
+                    cursor = sourceCursor;
+                    tokenEnd = cursor;
                     return Token.XMLEND;
                 }
             } else {

diff --git a/tests/src/test/java/org/mozilla/javascript/tests/ParserTest.java b/tests/src/test/java/org/mozilla/javascript/tests/ParserTest.java
@@ -48,11 +48,15 @@
 import org.mozilla.javascript.ast.StringLiteral;
 import org.mozilla.javascript.ast.SwitchCase;
 import org.mozilla.javascript.ast.SwitchStatement;
+import org.mozilla.javascript.ast.TemplateCharacters;
+import org.mozilla.javascript.ast.TemplateLiteral;
 import org.mozilla.javascript.ast.TryStatement;
 import org.mozilla.javascript.ast.UpdateExpression;
 import org.mozilla.javascript.ast.VariableDeclaration;
 import org.mozilla.javascript.ast.VariableInitializer;
 import org.mozilla.javascript.ast.WithStatement;
+import org.mozilla.javascript.ast.XmlFragment;
+import org.mozilla.javascript.ast.XmlLiteral;
 import org.mozilla.javascript.testing.TestErrorReporter;
 
 public class ParserTest {
@@ -1203,14 +1207,52 @@ public void parseUnicodeFormatName() {
     }
 
     @Test
-    public void testParseUnicodeMultibyteCharacter() {
+    public void parseUnicodeMultibyteCharacter() {
         AstRoot root = parse("\uD842\uDFB7");
         AstNode first = ((ExpressionStatement) root.getFirstChild()).getExpression();
         assertEquals("𠮷", first.getString());
     }
 
     @Test
-    public void testParseUnicodeIdentifierPartWhichIsNotJavaIdentifierPart() {
+    public void parseMultibyteCharacter_StringLiteral() {
+        AstRoot root = parse("'\uD83C\uDF1F'");
+        StringLiteral first =
+                (StringLiteral) ((ExpressionStatement) root.getFirstChild()).getExpression();
+        assertEquals(4, first.getLength());
+        assertEquals("'🌟'", first.getValue(true));
+    }
+
+    @Test
+    public void parseMultibyteCharacter_TemplateLiteral() {
+        AstRoot root = parse("`\uD83C\uDF1F`");
+        TemplateLiteral first =
+                (TemplateLiteral) ((ExpressionStatement) root.getFirstChild()).getExpression();
+        TemplateCharacters templateCharacter = (TemplateCharacters) first.getElement(0);
+        assertEquals(2, templateCharacter.getLength());
+        assertEquals("🌟", templateCharacter.getValue());
+        assertEquals(4, first.getLength());
+    }
+
+    @Test
+    public void parseMultibyteCharacter_XMLLiteral() {
+        AstRoot root = parse("<xml>\uD83C\uDF1F</xml>");
+        XmlLiteral first =
+                (XmlLiteral) ((ExpressionStatement) root.getFirstChild()).getExpression();
+        XmlFragment fragment = first.getFragments().get(0);
+        assertEquals(13, fragment.getLength());
+        assertEquals("<xml>🌟</xml>", fragment.toSource());
+    }
+
+    @Test
+    public void parseMultibyteCharacter_Comment() {
+        AstRoot root = parse("/*\uD83C\uDF1F*/");
+        Comment comment = root.getComments().first();
+        assertEquals(6, comment.getLength());
+        assertEquals("/*🌟*/", comment.getValue());
+    }
+
+    @Test
+    public void parseUnicodeIdentifierPartWhichIsNotJavaIdentifierPart() {
         // On the JDK 11 I'm using, Character.isUnicodeIdentifierPart(U+9FEB) returns true
         // but Character.isJavaIdentifierPart(U+9FEB) returns false. On a JDK 17 results
         // seem to vary, but I think it's enough to verify that TokenStream uses