Merge pull request #3414 from Earlopain/parser-translator-better-strings

Further refine string handling in the parser translator
ruby · Jan 16, 2025 · 6fdee87 · 6fdee87
2 parents 9794bf2 + 4edfe9d
commit 6fdee87
Show file tree

Hide file tree

Showing 10 changed files with 724 additions and 241 deletions.
diff --git a/lib/prism/translation/parser/compiler.rb b/lib/prism/translation/parser/compiler.rb
@@ -74,7 +74,29 @@ def visit_and_node(node)
         # []
         # ^^
         def visit_array_node(node)
-          builder.array(token(node.opening_loc), visit_all(node.elements), token(node.closing_loc))
+          # Percent arrays (%w, %W, %i, %I) get their elements built by hand;
+          # every other array simply visits its elements.
+          if node.opening&.start_with?("%w", "%W", "%i", "%I")
+            elements = node.elements.flat_map do |element|
+              if element.is_a?(StringNode)
+                # Content containing a newline is handed to
+                # string_nodes_from_line_continuations, otherwise a single
+                # string node covering the content location is built.
+                if element.content.include?("\n")
+                  string_nodes_from_line_continuations(element.unescaped, element.content, element.content_loc.start_offset, node.opening)
+                else
+                  [builder.string_internal([element.unescaped, srange(element.content_loc)])]
+                end
+              elsif element.is_a?(InterpolatedStringNode)
+                # The array's opening (not the element's) is passed along when
+                # building the interpolated element's children.
+                builder.string_compose(
+                  token(element.opening_loc),
+                  string_nodes_from_interpolation(element, node.opening),
+                  token(element.closing_loc)
+                )
+              else
+                [visit(element)]
+              end
+            end
+          else
+            elements = visit_all(node.elements)
+          end
+
+          builder.array(token(node.opening_loc), elements, token(node.closing_loc))
         end
 
         # foo => [bar]
@@ -1085,19 +1107,9 @@ def visit_interpolated_string_node(node)
             return visit_heredoc(node) { |children, closing| builder.string_compose(token(node.opening_loc), children, closing) }
           end
 
-          parts = node.parts.flat_map do |part|
-            # When the content of a string node is split across multiple lines, the
-            # parser gem creates individual string nodes for each line the content is part of.
-            if part.type == :string_node && part.content.include?("\n") && part.opening_loc.nil?
-              string_nodes_from_line_continuations(part.unescaped, part.content, part.content_loc.start_offset, node.opening)
-            else
-              visit(part)
-            end
-          end
-
           builder.string_compose(
             token(node.opening_loc),
-            parts,
+            string_nodes_from_interpolation(node, node.opening),
             token(node.closing_loc)
           )
         end
@@ -1116,14 +1128,14 @@ def visit_interpolated_symbol_node(node)
         # ^^^^^^^^^^^^
         def visit_interpolated_x_string_node(node)
           if node.heredoc?
-            visit_heredoc(node) { |children, closing| builder.xstring_compose(token(node.opening_loc), children, closing) }
-          else
-            builder.xstring_compose(
-              token(node.opening_loc),
-              visit_all(node.parts),
-              token(node.closing_loc)
-            )
+            # Heredocs are built by visit_heredoc; return early so the code
+            # below only deals with the non-heredoc form.
+            return visit_heredoc(node) { |children, closing| builder.xstring_compose(token(node.opening_loc), children, closing) }
           end
+
+          # The children now come from string_nodes_from_interpolation instead
+          # of a plain visit_all(node.parts).
+          builder.xstring_compose(
+            token(node.opening_loc),
+            string_nodes_from_interpolation(node, node.opening),
+            token(node.closing_loc)
+          )
         end
 
         # -> { it }
@@ -2011,13 +2023,6 @@ def visit_block(call, block)
           end
         end
 
-        # The parser gem automatically converts \r\n to \n, meaning our offsets
-        # need to be adjusted to always subtract 1 from the length.
-        def chomped_bytesize(line)
-          chomped = line.chomp
-          chomped.bytesize + (chomped == line ? 0 : 1)
-        end
-
         # Visit a heredoc that can be either a string or an xstring.
         def visit_heredoc(node)
           children = Array.new
@@ -2086,55 +2091,88 @@ def within_pattern
           end
         end
 
+        # Build the children for the parts of an interpolated node.
+        #
+        # When the content of a string node is split across multiple lines, the
+        # parser gem creates individual string nodes for each line the content is part of.
+        # That splitting is applied to string parts that have no opening of their
+        # own and whose content contains a newline; every other part is visited
+        # as usual.
+        #
+        # `opening` is forwarded to string_nodes_from_line_continuations.
+        def string_nodes_from_interpolation(node, opening)
+          node.parts.flat_map do |part|
+            if part.type == :string_node && part.content.include?("\n") && part.opening_loc.nil?
+              string_nodes_from_line_continuations(part.unescaped, part.content, part.content_loc.start_offset, opening)
+            else
+              visit(part)
+            end
+          end
+        end
+
         # Create parser string nodes from a single prism node. The parser gem
         # "glues" strings together when a line continuation is encountered.
+        #
+        # unescaped and escaped are the unescaped and the source form of the
+        # content; both are split into lines here. start_offset is the offset the
+        # content starts at and is advanced as nodes are emitted. opening is the
+        # opening delimiter (may be nil) and selects between the
+        # non-interpolating and the interpolating code path.
         def string_nodes_from_line_continuations(unescaped, escaped, start_offset, opening)
           unescaped = unescaped.lines
           escaped = escaped.lines
+          percent_array = opening&.start_with?("%w", "%W", "%i", "%I")
+
+          # Non-interpolating strings
+          if opening&.end_with?("'") || opening&.start_with?("%q", "%s", "%w", "%i")
+            current_length = 0
+            current_line = +""
+
+            escaped.filter_map.with_index do |escaped_line, index|
+              unescaped_line = unescaped.fetch(index, "")
+              current_length += escaped_line.bytesize
+              current_line << unescaped_line
 
-          escaped_lengths = []
-          normalized_lengths = []
-          # Keeps track of where an unescaped line should start a new token. An unescaped
-          # \n would otherwise be indistinguishable from the actual newline at the end of
-          # of the line. The parser gem only emits a new string node at "real" newlines,
-          # line continuations don't start a new node as well.
-          do_next_tokens = []
-
-          if opening&.end_with?("'")
-            escaped.each do |line|
-              escaped_lengths << line.bytesize
-              normalized_lengths << chomped_bytesize(line)
-              do_next_tokens << true
+              # Glue line continuations together. Only %w and %i arrays can contain these.
+              # The whole run of backslashes is captured: a repeated group such as
+              # (\\)* only keeps its last iteration, which would make the length
+              # always 1 and the parity check meaningless.
+              if percent_array && escaped_line[/(\\*)\n$/, 1]&.length&.odd?
+                next unless index == escaped.count - 1
+              end
+              s = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_length)])
+              # Advance past everything this node covers, including lines that
+              # were glued onto it (equal to the last line's size when nothing
+              # was glued).
+              start_offset += current_length
+              current_line = +""
+              current_length = 0
+              s
             end
           else
+            escaped_lengths = []
+            normalized_lengths = []
+            # Keeps track of where an unescaped line should start a new token. An unescaped
+            # \n would otherwise be indistinguishable from the actual newline at the end
+            # of the line. The parser gem only emits a new string node at "real" newlines;
+            # line continuations don't start a new node either.
+            do_next_tokens = []
+
             escaped
               .chunk_while { |before, after| before[/(\\*)\r?\n$/, 1]&.length&.odd? || false }
               .each do |lines|
                 escaped_lengths << lines.sum(&:bytesize)
-                normalized_lengths << lines.sum { |line| chomped_bytesize(line) }
                 unescaped_lines_count = lines.sum do |line|
                   line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false }
                 end
-                do_next_tokens.concat(Array.new(unescaped_lines_count + 1, false))
+                extra = 1
+                extra = lines.count if percent_array # Account for line continuations in percent arrays
+
+                normalized_lengths.concat(Array.new(unescaped_lines_count + extra, 0))
+                normalized_lengths[-1] = lines.sum { |line| line.bytesize }
+                do_next_tokens.concat(Array.new(unescaped_lines_count + extra, false))
                 do_next_tokens[-1] = true
               end
-          end
-
-          current_line = +""
-          current_normalized_length = 0
 
-          unescaped.filter_map.with_index do |unescaped_line, index|
-            current_line << unescaped_line
-            current_normalized_length += normalized_lengths.fetch(index, 0)
-
-            if do_next_tokens[index]
-              inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
-              start_offset += escaped_lengths.fetch(index, 0)
-              current_line = +""
-              current_normalized_length = 0
-              inner_part
-            else
-              nil
+            current_line = +""
+            current_normalized_length = 0
+
+            emitted_count = 0
+            unescaped.filter_map.with_index do |unescaped_line, index|
+              current_line << unescaped_line
+              current_normalized_length += normalized_lengths.fetch(index, 0)
+
+              if do_next_tokens[index]
+                inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
+                start_offset += escaped_lengths.fetch(emitted_count, 0)
+                current_line = +""
+                current_normalized_length = 0
+                emitted_count += 1
+                inner_part
+              else
+                nil
+              end
             end
           end
         end

diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
@@ -353,11 +353,15 @@ def to_a
                 location = range(next_location.start_offset, next_location.end_offset)
                 index += 1
               elsif value.start_with?("'", '"', "%")
-                if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
-                  # the parser gem doesn't simplify strings when its value ends in a newline
-                  if !(string_value = next_token.value).end_with?("\n") && basic_quotes
+                if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
+                  string_value = next_token.value
+                  if simplify_string?(string_value, value)
                     next_location = token.location.join(next_next_token.location)
-                    value = unescape_string(string_value, value)
+                    if percent_array?(value)
+                      value = percent_array_unescape(string_value)
+                    else
+                      value = unescape_string(string_value, value)
+                    end
                     type = :tSTRING
                     location = range(next_location.start_offset, next_location.end_offset)
                     index += 2
@@ -399,17 +403,31 @@ def to_a
               is_percent_array = percent_array?(quote_stack.last)
 
               if (lines = token.value.lines).one?
-                # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
-                is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
-                # The parser gem only removes indentation when the heredoc is not nested
-                not_nested = heredoc_stack.size == 1
-                if is_percent_array
-                  value = percent_array_unescape(value)
-                elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
-                  value = trim_heredoc_whitespace(value, current_heredoc)
-                end
+                # Prism usually emits a single token for strings with line continuations.
+                # For squiggly heredocs they are not joined so we do that manually here.
+                current_string = +""
+                current_length = 0
+                start_offset = token.location.start_offset
+                while token.type == :STRING_CONTENT
+                  current_length += token.value.bytesize
+                  # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
+                  is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
+                  # The parser gem only removes indentation when the heredoc is not nested
+                  not_nested = heredoc_stack.size == 1
+                  if is_percent_array
+                    value = percent_array_unescape(token.value)
+                  elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
+                    value = trim_heredoc_whitespace(token.value, current_heredoc)
+                  end
 
-                value = unescape_string(value, quote_stack.last)
+                  current_string << unescape_string(value, quote_stack.last)
+                  if (backslash_count = token.value[/(\\{1,})\n/, 1]&.length).nil? || backslash_count.even? || !interpolation?(quote_stack.last)
+                    tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
+                    break
+                  end
+                  token = lexed[index][0]
+                  index += 1
+                end
               else
                 # When the parser gem encounters a line continuation inside of a multiline string,
                 # it emits a single string node. The backslash (and remaining newline) is removed.
@@ -447,8 +465,8 @@ def to_a
                     adjustment = 0
                   end
                 end
-                next
               end
+              next
             when :tSTRING_DVAR
               value = nil
             when :tSTRING_END
@@ -571,20 +589,21 @@ def calculate_heredoc_whitespace(heredoc_token_index)
           while (lexed[next_token_index] && next_token = lexed[next_token_index][0])
             next_token_index += 1
             next_next_token = lexed[next_token_index] && lexed[next_token_index][0]
+            first_token_on_line = next_token.location.start_column == 0
 
             # String content inside nested heredocs and interpolation is ignored
             if next_token.type == :HEREDOC_START || next_token.type == :EMBEXPR_BEGIN
               # When interpolation is the first token of a line there is no string
               # content to check against. There will be no common whitespace.
-              if nesting_level == 0 && next_token.location.start_column == 0
+              if nesting_level == 0 && first_token_on_line
                 result = 0
               end
               nesting_level += 1
             elsif next_token.type == :HEREDOC_END || next_token.type == :EMBEXPR_END
               nesting_level -= 1
               # When we encountered the matching heredoc end, we can exit
               break if nesting_level == -1
-            elsif next_token.type == :STRING_CONTENT && nesting_level == 0
+            elsif next_token.type == :STRING_CONTENT && nesting_level == 0 && first_token_on_line
               common_whitespace = 0
               next_token.value[/^\s*/].each_char do |char|
                 if char == "\t"
@@ -674,8 +693,11 @@ def unescape_string(string, quote)
               # Append what was just skipped over, excluding the found backslash.
               result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
 
-              # Simple single-character escape sequences like \n
-              if (replacement = ESCAPES[scanner.peek(1)])
+              if scanner.peek(1) == "\n"
+                # Line continuation
+                scanner.pos += 1
+              elsif (replacement = ESCAPES[scanner.peek(1)])
+                # Simple single-character escape sequences like \n
                 result.append_as_bytes(replacement)
                 scanner.pos += 1
               elsif (octal = scanner.check(/[0-7]{1,3}/))
@@ -714,6 +736,23 @@ def unescape_string(string, quote)
           end
         end
 
+        # Certain strings are merged into a single string token.
+        #
+        # value is the content of the string and quote is its opening
+        # delimiter. Returns true when the string should be simplified into a
+        # single token, false otherwise.
+        def simplify_string?(value, quote)
+          case quote
+          when "'"
+            # Only simplify 'foo'
+            !value.include?("\n")
+          when '"'
+            # Simplify when every line ends with a line continuation, or it is the last line
+            # (a line continuation being an odd number of backslashes before the newline).
+            value.lines.all? do |line|
+              !line.end_with?("\n") || line[/(\\*)$/, 1]&.length&.odd?
+            end
+          else
+            # %q and similar are never simplified
+            false
+          end
+        end
+
         # In a percent array, certain whitespace can be preceeded with a backslash,
         # causing the following characters to be part of the previous element.
         def percent_array_unescape(string)
@@ -737,7 +776,7 @@ def percent_array_leading_whitespace(string)
 
         # Determine if characters preceeded by a backslash should be escaped or not
         def interpolation?(quote)
-          quote != "'" && !quote.start_with?("%q", "%w", "%i")
+          # Quotes ending in ' or starting with %q, %w, %i or %s are treated as
+          # non-interpolating.
+          !quote.end_with?("'") && !quote.start_with?("%q", "%w", "%i", "%s")
         end
 
         # Regexp allow interpolation but are handled differently during unescaping