Skip to content

Commit

Permalink
Merge pull request #3414 from Earlopain/parser-translator-better-strings
Browse files Browse the repository at this point in the history
Further refine string handling in the parser translator
  • Loading branch information
kddnewton authored Jan 16, 2025
2 parents 9794bf2 + 4edfe9d commit 6fdee87
Show file tree
Hide file tree
Showing 10 changed files with 724 additions and 241 deletions.
152 changes: 95 additions & 57 deletions lib/prism/translation/parser/compiler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,29 @@ def visit_and_node(node)
# []
# ^^
def visit_array_node(node)
builder.array(token(node.opening_loc), visit_all(node.elements), token(node.closing_loc))
if node.opening&.start_with?("%w", "%W", "%i", "%I")
elements = node.elements.flat_map do |element|
if element.is_a?(StringNode)
if element.content.include?("\n")
string_nodes_from_line_continuations(element.unescaped, element.content, element.content_loc.start_offset, node.opening)
else
[builder.string_internal([element.unescaped, srange(element.content_loc)])]
end
elsif element.is_a?(InterpolatedStringNode)
builder.string_compose(
token(element.opening_loc),
string_nodes_from_interpolation(element, node.opening),
token(element.closing_loc)
)
else
[visit(element)]
end
end
else
elements = visit_all(node.elements)
end

builder.array(token(node.opening_loc), elements, token(node.closing_loc))
end

# foo => [bar]
Expand Down Expand Up @@ -1085,19 +1107,9 @@ def visit_interpolated_string_node(node)
return visit_heredoc(node) { |children, closing| builder.string_compose(token(node.opening_loc), children, closing) }
end

parts = node.parts.flat_map do |part|
# When the content of a string node is split across multiple lines, the
# parser gem creates individual string nodes for each line the content is part of.
if part.type == :string_node && part.content.include?("\n") && part.opening_loc.nil?
string_nodes_from_line_continuations(part.unescaped, part.content, part.content_loc.start_offset, node.opening)
else
visit(part)
end
end

builder.string_compose(
token(node.opening_loc),
parts,
string_nodes_from_interpolation(node, node.opening),
token(node.closing_loc)
)
end
Expand All @@ -1116,14 +1128,14 @@ def visit_interpolated_symbol_node(node)
# ^^^^^^^^^^^^
def visit_interpolated_x_string_node(node)
if node.heredoc?
visit_heredoc(node) { |children, closing| builder.xstring_compose(token(node.opening_loc), children, closing) }
else
builder.xstring_compose(
token(node.opening_loc),
visit_all(node.parts),
token(node.closing_loc)
)
return visit_heredoc(node) { |children, closing| builder.xstring_compose(token(node.opening_loc), children, closing) }
end

builder.xstring_compose(
token(node.opening_loc),
string_nodes_from_interpolation(node, node.opening),
token(node.closing_loc)
)
end

# -> { it }
Expand Down Expand Up @@ -2011,13 +2023,6 @@ def visit_block(call, block)
end
end

# The parser gem automatically converts \r\n to \n, meaning our offsets
# need to be adjusted to always subtract 1 from the length.
def chomped_bytesize(line)
chomped = line.chomp
chomped.bytesize + (chomped == line ? 0 : 1)
end

# Visit a heredoc that can be either a string or an xstring.
def visit_heredoc(node)
children = Array.new
Expand Down Expand Up @@ -2086,55 +2091,88 @@ def within_pattern
end
end

# When the content of a string node is split across multiple lines, the
# parser gem creates individual string nodes for each line the content is part of.
def string_nodes_from_interpolation(node, opening)
node.parts.flat_map do |part|
if part.type == :string_node && part.content.include?("\n") && part.opening_loc.nil?
string_nodes_from_line_continuations(part.unescaped, part.content, part.content_loc.start_offset, opening)
else
visit(part)
end
end
end

# Create parser string nodes from a single prism node. The parser gem
# "glues" strings together when a line continuation is encountered.
def string_nodes_from_line_continuations(unescaped, escaped, start_offset, opening)
unescaped = unescaped.lines
escaped = escaped.lines
percent_array = opening&.start_with?("%w", "%W", "%i", "%I")

# Non-interpolating strings
if opening&.end_with?("'") || opening&.start_with?("%q", "%s", "%w", "%i")
current_length = 0
current_line = +""

escaped.filter_map.with_index do |escaped_line, index|
unescaped_line = unescaped.fetch(index, "")
current_length += escaped_line.bytesize
current_line << unescaped_line

escaped_lengths = []
normalized_lengths = []
# Keeps track of where an unescaped line should start a new token. An unescaped
# \n would otherwise be indistinguishable from the actual newline at the end of
# of the line. The parser gem only emits a new string node at "real" newlines,
# line continuations don't start a new node as well.
do_next_tokens = []

if opening&.end_with?("'")
escaped.each do |line|
escaped_lengths << line.bytesize
normalized_lengths << chomped_bytesize(line)
do_next_tokens << true
# Glue line continuations together. Only %w and %i arrays can contain these.
if percent_array && escaped_line[/(\\)*\n$/, 1]&.length&.odd?
next unless index == escaped.count - 1
end
s = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_length)])
start_offset += escaped_line.bytesize
current_line = +""
current_length = 0
s
end
else
escaped_lengths = []
normalized_lengths = []
# Keeps track of where an unescaped line should start a new token. An unescaped
# \n would otherwise be indistinguishable from the actual newline at the end of
# of the line. The parser gem only emits a new string node at "real" newlines,
# line continuations don't start a new node as well.
do_next_tokens = []

escaped
.chunk_while { |before, after| before[/(\\*)\r?\n$/, 1]&.length&.odd? || false }
.each do |lines|
escaped_lengths << lines.sum(&:bytesize)
normalized_lengths << lines.sum { |line| chomped_bytesize(line) }
unescaped_lines_count = lines.sum do |line|
line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false }
end
do_next_tokens.concat(Array.new(unescaped_lines_count + 1, false))
extra = 1
extra = lines.count if percent_array # Account for line continuations in percent arrays

normalized_lengths.concat(Array.new(unescaped_lines_count + extra, 0))
normalized_lengths[-1] = lines.sum { |line| line.bytesize }
do_next_tokens.concat(Array.new(unescaped_lines_count + extra, false))
do_next_tokens[-1] = true
end
end

current_line = +""
current_normalized_length = 0

unescaped.filter_map.with_index do |unescaped_line, index|
current_line << unescaped_line
current_normalized_length += normalized_lengths.fetch(index, 0)

if do_next_tokens[index]
inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
start_offset += escaped_lengths.fetch(index, 0)
current_line = +""
current_normalized_length = 0
inner_part
else
nil
current_line = +""
current_normalized_length = 0

emitted_count = 0
unescaped.filter_map.with_index do |unescaped_line, index|
current_line << unescaped_line
current_normalized_length += normalized_lengths.fetch(index, 0)

if do_next_tokens[index]
inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
start_offset += escaped_lengths.fetch(emitted_count, 0)
current_line = +""
current_normalized_length = 0
emitted_count += 1
inner_part
else
nil
end
end
end
end
Expand Down
79 changes: 59 additions & 20 deletions lib/prism/translation/parser/lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -353,11 +353,15 @@ def to_a
location = range(next_location.start_offset, next_location.end_offset)
index += 1
elsif value.start_with?("'", '"', "%")
if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
# the parser gem doesn't simplify strings when its value ends in a newline
if !(string_value = next_token.value).end_with?("\n") && basic_quotes
if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
string_value = next_token.value
if simplify_string?(string_value, value)
next_location = token.location.join(next_next_token.location)
value = unescape_string(string_value, value)
if percent_array?(value)
value = percent_array_unescape(string_value)
else
value = unescape_string(string_value, value)
end
type = :tSTRING
location = range(next_location.start_offset, next_location.end_offset)
index += 2
Expand Down Expand Up @@ -399,17 +403,31 @@ def to_a
is_percent_array = percent_array?(quote_stack.last)

if (lines = token.value.lines).one?
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
# The parser gem only removes indentation when the heredoc is not nested
not_nested = heredoc_stack.size == 1
if is_percent_array
value = percent_array_unescape(value)
elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
value = trim_heredoc_whitespace(value, current_heredoc)
end
# Prism usually emits a single token for strings with line continuations.
# For squiggly heredocs they are not joined so we do that manually here.
current_string = +""
current_length = 0
start_offset = token.location.start_offset
while token.type == :STRING_CONTENT
current_length += token.value.bytesize
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
# The parser gem only removes indentation when the heredoc is not nested
not_nested = heredoc_stack.size == 1
if is_percent_array
value = percent_array_unescape(token.value)
elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
value = trim_heredoc_whitespace(token.value, current_heredoc)
end

value = unescape_string(value, quote_stack.last)
current_string << unescape_string(value, quote_stack.last)
if (backslash_count = token.value[/(\\{1,})\n/, 1]&.length).nil? || backslash_count.even? || !interpolation?(quote_stack.last)
tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
break
end
token = lexed[index][0]
index += 1
end
else
# When the parser gem encounters a line continuation inside of a multiline string,
# it emits a single string node. The backslash (and remaining newline) is removed.
Expand Down Expand Up @@ -447,8 +465,8 @@ def to_a
adjustment = 0
end
end
next
end
next
when :tSTRING_DVAR
value = nil
when :tSTRING_END
Expand Down Expand Up @@ -571,20 +589,21 @@ def calculate_heredoc_whitespace(heredoc_token_index)
while (lexed[next_token_index] && next_token = lexed[next_token_index][0])
next_token_index += 1
next_next_token = lexed[next_token_index] && lexed[next_token_index][0]
first_token_on_line = next_token.location.start_column == 0

# String content inside nested heredocs and interpolation is ignored
if next_token.type == :HEREDOC_START || next_token.type == :EMBEXPR_BEGIN
# When interpolation is the first token of a line there is no string
# content to check against. There will be no common whitespace.
if nesting_level == 0 && next_token.location.start_column == 0
if nesting_level == 0 && first_token_on_line
result = 0
end
nesting_level += 1
elsif next_token.type == :HEREDOC_END || next_token.type == :EMBEXPR_END
nesting_level -= 1
# When we encountered the matching heredoc end, we can exit
break if nesting_level == -1
elsif next_token.type == :STRING_CONTENT && nesting_level == 0
elsif next_token.type == :STRING_CONTENT && nesting_level == 0 && first_token_on_line
common_whitespace = 0
next_token.value[/^\s*/].each_char do |char|
if char == "\t"
Expand Down Expand Up @@ -674,8 +693,11 @@ def unescape_string(string, quote)
# Append what was just skipped over, excluding the found backslash.
result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))

# Simple single-character escape sequences like \n
if (replacement = ESCAPES[scanner.peek(1)])
if scanner.peek(1) == "\n"
# Line continuation
scanner.pos += 1
elsif (replacement = ESCAPES[scanner.peek(1)])
# Simple single-character escape sequences like \n
result.append_as_bytes(replacement)
scanner.pos += 1
elsif (octal = scanner.check(/[0-7]{1,3}/))
Expand Down Expand Up @@ -714,6 +736,23 @@ def unescape_string(string, quote)
end
end

# Certain strings are merged into a single string token.
def simplify_string?(value, quote)
case quote
when "'"
# Only simplify 'foo'
!value.include?("\n")
when '"'
# Simplify when every line ends with a line continuation, or it is the last line
value.lines.all? do |line|
!line.end_with?("\n") || line[/(\\*)$/, 1]&.length&.odd?
end
else
# %q and similar are never simplified
false
end
end

# In a percent array, certain whitespace can be preceeded with a backslash,
# causing the following characters to be part of the previous element.
def percent_array_unescape(string)
Expand All @@ -737,7 +776,7 @@ def percent_array_leading_whitespace(string)

# Determine if characters preceeded by a backslash should be escaped or not
def interpolation?(quote)
quote != "'" && !quote.start_with?("%q", "%w", "%i")
!quote.end_with?("'") && !quote.start_with?("%q", "%w", "%i", "%s")
end

# Regexp allow interpolation but are handled differently during unescaping
Expand Down
Loading

0 comments on commit 6fdee87

Please sign in to comment.