Skip to content

Commit 0eaddf2

Browse files
committed
[text-format] Fix parsing of string literals
This renames `next_byte_value` to `next_str_lit_bytes`, which may now return 1 to 4 bytes per call, reflecting the variable-length nature of the UTF-8 encoding.
1 parent 16c9dc5 commit 0eaddf2

File tree

2 files changed

+60
-22
lines changed

2 files changed

+60
-22
lines changed

protobuf-support/src/lexer/lexer_impl.rs

Lines changed: 56 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,15 @@ impl From<ProtobufFloatParseError> for LexerError {
6767
}
6868
}
6969

70+
/// The raw bytes for a single char or escape sequence in a string literal
71+
///
72+
/// The raw bytes are available via an `into_iter` implementation.
73+
pub struct DecodedBytes {
74+
// a single char can be up to 4 bytes when encoded in UTF-8
75+
buf: [u8; 4],
76+
len: u8,
77+
}
78+
7079
#[derive(Copy, Clone)]
7180
pub struct Lexer<'a> {
7281
language: ParserLanguage,
@@ -440,24 +449,24 @@ impl<'a> Lexer<'a> {
440449
// octEscape = '\' octalDigit octalDigit octalDigit
441450
// charEscape = '\' ( "a" | "b" | "f" | "n" | "r" | "t" | "v" | '\' | "'" | '"' )
442451
// quote = "'" | '"'
443-
pub fn next_byte_value(&mut self) -> LexerResult<u8> {
452+
pub fn next_str_lit_bytes(&mut self) -> LexerResult<DecodedBytes> {
444453
match self.next_char()? {
445454
'\\' => {
446455
match self.next_char()? {
447-
'\'' => Ok(b'\''),
448-
'"' => Ok(b'"'),
449-
'\\' => Ok(b'\\'),
450-
'a' => Ok(b'\x07'),
451-
'b' => Ok(b'\x08'),
452-
'f' => Ok(b'\x0c'),
453-
'n' => Ok(b'\n'),
454-
'r' => Ok(b'\r'),
455-
't' => Ok(b'\t'),
456-
'v' => Ok(b'\x0b'),
456+
'\'' => Ok(b'\''.into()),
457+
'"' => Ok(b'"'.into()),
458+
'\\' => Ok(b'\\'.into()),
459+
'a' => Ok(b'\x07'.into()),
460+
'b' => Ok(b'\x08'.into()),
461+
'f' => Ok(b'\x0c'.into()),
462+
'n' => Ok(b'\n'.into()),
463+
'r' => Ok(b'\r'.into()),
464+
't' => Ok(b'\t'.into()),
465+
'v' => Ok(b'\x0b'.into()),
457466
'x' => {
458467
let d1 = self.next_hex_digit()? as u8;
459468
let d2 = self.next_hex_digit()? as u8;
460-
Ok(((d1 << 4) | d2) as u8)
469+
Ok((((d1 << 4) | d2) as u8).into())
461470
}
462471
d if d >= '0' && d <= '7' => {
463472
let mut r = d as u8 - b'0';
@@ -467,16 +476,14 @@ impl<'a> Lexer<'a> {
467476
Ok(d) => r = (r << 3) + d as u8,
468477
}
469478
}
470-
Ok(r)
479+
Ok(r.into())
471480
}
472481
// https://github.com/google/protobuf/issues/4562
473-
// TODO: overflow
474-
c => Ok(c as u8),
482+
c => Ok(c.into()),
475483
}
476484
}
477485
'\n' | '\0' => Err(LexerError::IncorrectInput),
478-
// TODO: check overflow
479-
c => Ok(c as u8),
486+
c => Ok(c.into()),
480487
}
481488
}
482489

@@ -530,7 +537,7 @@ impl<'a> Lexer<'a> {
530537
};
531538
first = false;
532539
while self.lookahead_char() != Some(q) {
533-
self.next_byte_value()?;
540+
self.next_str_lit_bytes()?;
534541
}
535542
self.next_char_expect_eq(q)?;
536543

@@ -663,6 +670,37 @@ impl<'a> Lexer<'a> {
663670
}
664671
}
665672

673+
impl From<u8> for DecodedBytes {
674+
fn from(value: u8) -> Self {
675+
DecodedBytes {
676+
buf: [value, 0, 0, 0],
677+
len: 1,
678+
}
679+
}
680+
}
681+
682+
impl From<char> for DecodedBytes {
683+
fn from(value: char) -> Self {
684+
let mut this = DecodedBytes {
685+
buf: [0; 4],
686+
len: 0,
687+
};
688+
let len = value.encode_utf8(&mut this.buf).len();
689+
this.len = len as _;
690+
this
691+
}
692+
}
693+
694+
// Implementing `IntoIterator` means that `DecodedBytes` works with `Vec::extend`.
695+
impl IntoIterator for DecodedBytes {
696+
type Item = u8;
697+
type IntoIter = std::iter::Take<std::array::IntoIter<u8, 4>>;
698+
699+
fn into_iter(self) -> Self::IntoIter {
700+
self.buf.into_iter().take(self.len as _)
701+
}
702+
}
703+
666704
#[cfg(test)]
667705
mod test {
668706
use super::*;

protobuf-support/src/lexer/str_lit.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ impl StrLit {
3232
let mut lexer = Lexer::new(&self.escaped, ParserLanguage::Json);
3333
let mut r = Vec::new();
3434
while !lexer.eof() {
35-
r.push(
35+
r.extend(
3636
lexer
37-
.next_byte_value()
37+
.next_str_lit_bytes()
3838
.map_err(|_| StrLitDecodeError::OtherError)?,
3939
);
4040
}
@@ -45,9 +45,9 @@ impl StrLit {
4545
let mut lexer = Lexer::new(&self.escaped, ParserLanguage::Json);
4646
let mut r = Vec::new();
4747
while !lexer.eof() {
48-
r.push(
48+
r.extend(
4949
lexer
50-
.next_byte_value()
50+
.next_str_lit_bytes()
5151
.map_err(|_| StrLitDecodeError::OtherError)?,
5252
);
5353
}

0 commit comments

Comments
 (0)