Skip to content

Commit

Permalink
Handle control and meta escapes in parser translation
Browse files Browse the repository at this point in the history
  • Loading branch information
kddnewton committed Jan 16, 2025
1 parent 6fdee87 commit 09c59a3
Showing 1 changed file with 50 additions and 34 deletions.
84 changes: 50 additions & 34 deletions lib/prism/translation/parser/lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -692,44 +692,12 @@ def unescape_string(string, quote)
while (skipped = scanner.skip_until(/\\/))
# Append what was just skipped over, excluding the found backslash.
result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))

if scanner.peek(1) == "\n"
# Line continuation
scanner.pos += 1
elsif (replacement = ESCAPES[scanner.peek(1)])
# Simple single-character escape sequences like \n
result.append_as_bytes(replacement)
scanner.pos += 1
elsif (octal = scanner.check(/[0-7]{1,3}/))
# \nnn
result.append_as_bytes(octal.to_i(8).chr)
scanner.pos += octal.bytesize
elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/))
# \xnn
result.append_as_bytes(hex[1..].to_i(16).chr)
scanner.pos += hex.bytesize
elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/))
# \unnnn
result.append_as_bytes(unicode[1..].hex.chr(Encoding::UTF_8))
scanner.pos += unicode.bytesize
elsif scanner.peek(3) == "u{}"
# https://github.com/whitequark/parser/issues/856
scanner.pos += 3
elsif (unicode_parts = scanner.check(/u{.*}/))
# \u{nnnn ...}
unicode_parts[2..-2].split.each do |unicode|
result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
end
scanner.pos += unicode_parts.bytesize
end
escape_read(result, scanner, false, false)
end

# Add remainging chars
# Add remaining chars
result.append_as_bytes(string.byteslice(scanner.pos..))

result.force_encoding(source_buffer.source.encoding)

result
else
delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
string.gsub(/\\([\\#{delimiters}])/, '\1')
Expand All @@ -753,6 +721,54 @@ def simplify_string?(value, quote)
end
end

# Escape a byte value, given the control and meta flags.
def escape_build(value, control, meta)
value &= 0x9f if control
value |= 0x80 if meta
value.chr
end

# Read an escape out of the string scanner, given the control and meta
# flags, and push the unescaped value into the result.
def escape_read(result, scanner, control, meta)
if scanner.skip("\n")
# Line continuation
elsif (value = ESCAPES[scanner.peek(1)])
# Simple single-character escape sequences like \n
result.append_as_bytes(value)
scanner.pos += 1
elsif (value = scanner.scan(/[0-7]{1,3}/))
# \nnn
result.append_as_bytes(escape_build(value.to_i(8), control, meta))
elsif (value = scanner.scan(/x[0-9a-fA-F]{1,2}/))
# \xnn
result.append_as_bytes(escape_build(value[1..].to_i(16), control, meta))
elsif (value = scanner.scan(/u[0-9a-fA-F]{4}/))
# \unnnn
result.append_as_bytes(value[1..].hex.chr(Encoding::UTF_8))
elsif scanner.skip("u{}")
# https://github.com/whitequark/parser/issues/856
elsif (value = scanner.scan(/u{.*?}/))
# \u{nnnn ...}
value[2..-2].split.each do |unicode|
result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
end
elsif (value = scanner.scan(/c\\?(?=[[:print:]])|C-\\?(?=[[:print:]])/))
# \cx or \C-x where x is an ASCII printable character
escape_read(result, scanner, true, meta)
elsif (value = scanner.scan(/M-\\?(?=[[:print:]])/))
# \M-x where x is an ASCII printable character
escape_read(result, scanner, control, true)
elsif (byte = scanner.get_byte)
# Something else after an escape.
if control && byte == "?"
result.append_as_bytes(escape_build(0x7f, false, meta))
else
result.append_as_bytes(escape_build(byte.ord, control, meta))
end
end
end

# In a percent array, certain whitespace can be preceeded with a backslash,
# causing the following characters to be part of the previous element.
def percent_array_unescape(string)
Expand Down

0 comments on commit 09c59a3

Please sign in to comment.