From 75e9c9f721f80e332aea32ec5702218dd67463af Mon Sep 17 00:00:00 2001 From: meraedit Date: Tue, 3 Dec 2024 02:46:51 +0200 Subject: [PATCH] Fix length calculations when using multibyte characters fix tokenEnd and cursor for string, template, xml literals and multiline comments when using multibyte characters --- .../org/mozilla/javascript/TokenStream.java | 7 +++ .../mozilla/javascript/tests/ParserTest.java | 46 ++++++++++++++++++- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/rhino/src/main/java/org/mozilla/javascript/TokenStream.java b/rhino/src/main/java/org/mozilla/javascript/TokenStream.java index f133ac8f31..f7676e3a06 100644 --- a/rhino/src/main/java/org/mozilla/javascript/TokenStream.java +++ b/rhino/src/main/java/org/mozilla/javascript/TokenStream.java @@ -1127,6 +1127,8 @@ final int getToken() throws IOException { String str = getStringFromBuffer(); this.string = internString(str); + cursor = sourceCursor; + tokenEnd = cursor; return Token.STRING; } @@ -1330,6 +1332,7 @@ && matchChar('.')) { lookForSlash = true; } else if (c == '/') { if (lookForSlash) { + cursor = sourceCursor; tokenEnd = cursor; return Token.COMMENT; } @@ -1653,6 +1656,8 @@ int readTemplateLiteral(boolean isTaggedLiteral) throws IOException { case '`': rawString.setLength(rawString.length() - 1); // don't include "`" this.string = hasInvalidEscapeSequences ? 
null : getStringFromBuffer(); + cursor = sourceCursor; + tokenEnd = cursor; return Token.TEMPLATE_LITERAL; case '$': if (matchTemplateLiteralChar('{')) { @@ -1907,6 +1912,8 @@ int getNextXMLToken() throws IOException { if (!xmlIsTagContent && xmlOpenTagsCount == 0) { this.string = getStringFromBuffer(); + cursor = sourceCursor; + tokenEnd = cursor; return Token.XMLEND; } } else { diff --git a/tests/src/test/java/org/mozilla/javascript/tests/ParserTest.java b/tests/src/test/java/org/mozilla/javascript/tests/ParserTest.java index c9b6aa9480..336dcf2c9a 100644 --- a/tests/src/test/java/org/mozilla/javascript/tests/ParserTest.java +++ b/tests/src/test/java/org/mozilla/javascript/tests/ParserTest.java @@ -48,11 +48,15 @@ import org.mozilla.javascript.ast.StringLiteral; import org.mozilla.javascript.ast.SwitchCase; import org.mozilla.javascript.ast.SwitchStatement; +import org.mozilla.javascript.ast.TemplateCharacters; +import org.mozilla.javascript.ast.TemplateLiteral; import org.mozilla.javascript.ast.TryStatement; import org.mozilla.javascript.ast.UpdateExpression; import org.mozilla.javascript.ast.VariableDeclaration; import org.mozilla.javascript.ast.VariableInitializer; import org.mozilla.javascript.ast.WithStatement; +import org.mozilla.javascript.ast.XmlFragment; +import org.mozilla.javascript.ast.XmlLiteral; import org.mozilla.javascript.testing.TestErrorReporter; public class ParserTest { @@ -1203,14 +1207,52 @@ public void parseUnicodeFormatName() { } @Test - public void testParseUnicodeMultibyteCharacter() { + public void parseUnicodeMultibyteCharacter() { AstRoot root = parse("\uD842\uDFB7"); AstNode first = ((ExpressionStatement) root.getFirstChild()).getExpression(); assertEquals("𠮷", first.getString()); } @Test - public void testParseUnicodeIdentifierPartWhichIsNotJavaIdentifierPart() { + public void parseMultibyteCharacter_StringLiteral() { + AstRoot root = parse("'\uD83C\uDF1F'"); + StringLiteral first = + (StringLiteral) ((ExpressionStatement) 
root.getFirstChild()).getExpression(); + assertEquals(4, first.getLength()); + assertEquals("'🌟'", first.getValue(true)); + } + + @Test + public void parseMultibyteCharacter_TemplateLiteral() { + AstRoot root = parse("`\uD83C\uDF1F`"); + TemplateLiteral first = + (TemplateLiteral) ((ExpressionStatement) root.getFirstChild()).getExpression(); + TemplateCharacters templateCharacter = (TemplateCharacters) first.getElement(0); + assertEquals(2, templateCharacter.getLength()); + assertEquals("🌟", templateCharacter.getValue()); + assertEquals(4, first.getLength()); + } + + @Test + public void parseMultibyteCharacter_XMLLiteral() { + AstRoot root = parse("<xml>\uD83C\uDF1F</xml>"); + XmlLiteral first = + (XmlLiteral) ((ExpressionStatement) root.getFirstChild()).getExpression(); + XmlFragment fragment = first.getFragments().get(0); + assertEquals(13, fragment.getLength()); + assertEquals("<xml>🌟</xml>", fragment.toSource()); + } + + @Test + public void parseMultibyteCharacter_Comment() { + AstRoot root = parse("/*\uD83C\uDF1F*/"); + Comment comment = root.getComments().first(); + assertEquals(6, comment.getLength()); + assertEquals("/*🌟*/", comment.getValue()); + } + + @Test + public void parseUnicodeIdentifierPartWhichIsNotJavaIdentifierPart() { // On the JDK 11 I'm using, Character.isUnicodeIdentifierPart(U+9FEB) returns true // but Character.isJavaIdentifierPart(U+9FEB) returns false. On a JDK 17 results // seem to vary, but I think it's enough to verify that TokenStream uses