Skip to content

Commit

Permalink
Follow Hunspell's way of parsing flags with large Unicode scalar values
Browse files Browse the repository at this point in the history
Unicode flags that take more than 16 bits to represent are rejected by
Nuspell but are accepted by Hunspell. When parsing single flags (like in
a PFX rule for example), Hunspell takes the first (high surrogate) of the
two UTF-16 code units, discarding the second (low surrogate). When parsing
a flag set (like in a .dic line), Hunspell takes both code units.

I haven't updated all parsing. In particular I don't think that compound
rules (using wildcards '*' and '?') would work accurately if used with
flags with high scalar values. It may be worthwhile to emit an error in
those cases instead of silently behaving unpredictably.
  • Loading branch information
the-mikedavis committed Mar 4, 2025
1 parent 0335a52 commit a3b8e02
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 17 deletions.
55 changes: 38 additions & 17 deletions src/aff/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use core::{
hash::BuildHasher,
iter::{Enumerate, Peekable, TakeWhile},
num::NonZeroU16,
str::{Chars, FromStr, SplitWhitespace},
str::{FromStr, SplitWhitespace},
};

use hashbrown::HashMap;
Expand Down Expand Up @@ -569,7 +569,7 @@ fn parse_compound_syllable<'aff>(

fn parse_flag_aliases(cx: &mut AffLineParser, lines: &mut Lines) -> ParseResult {
lines.parse_table1("AF", |alias| {
let flagset = parse_flags_from_chars(cx.flag_type, alias.chars())?;
let flagset = parse_flags_from_str(cx.flag_type, alias)?;
cx.flag_aliases.push(flagset);
Ok(())
})
Expand Down Expand Up @@ -1146,22 +1146,28 @@ fn parse_flag_from_str(
try_flag_from_u16(number)
}
FlagType::Utf8 => {
let mut chars = input.chars();
let ch = chars.next().expect("asserted to be non-empty above");
try_flag_from_char(ch)
// A u16 is not large enough to fit any Unicode scalar. Nuspell rejects scalars with
// codepoint values above `u16::MAX` but Hunspell accepts them. Hunspell converts the
// input string into UTF-16 and then takes the first u16.
let u16 = input
.encode_utf16()
.next()
.expect("asserted to be non-empty above");
try_flag_from_u16(u16)
}
}
}

fn parse_flags_from_chars(
fn parse_flags_from_str(
flag_type: FlagType,
mut chars: Chars,
input: &str,
) -> core::result::Result<FlagSet, ParseFlagError> {
use ParseFlagError::*;

match flag_type {
FlagType::Short => {
let flagset = chars
let flagset = input
.chars()
.map(|ch| {
if ch.is_ascii() {
// The flag is ASCII: it's a valid `u8` so it can fit into a `u16`.
Expand All @@ -1174,6 +1180,7 @@ fn parse_flags_from_chars(
Ok(flagset.into())
}
FlagType::Long => {
let mut chars = input.chars();
let mut flags = Vec::new();
while let Some(c1) = chars.next() {
let c2 = match chars.next() {
Expand All @@ -1189,7 +1196,7 @@ fn parse_flags_from_chars(
let mut flags = Vec::new();
let mut number = String::new();
let mut separated = false;
for ch in chars.by_ref() {
for ch in input.chars() {
if ch == ',' {
if separated {
return Err(DuplicateComma);
Expand All @@ -1209,8 +1216,12 @@ fn parse_flags_from_chars(
Ok(flags.into())
}
FlagType::Utf8 => {
let flags = chars
.map(try_flag_from_char)
// Using the UTF-16 encoding looks funny here... Nuspell rejects Unicode flags that
// take more than 16 bits to represent, but Hunspell silently accepts them (though it
// might lead to weird behavior down the line.)
let flags = input
.encode_utf16()
.map(try_flag_from_u16)
.collect::<core::result::Result<Vec<Flag>, _>>()?;
Ok(flags.into())
}
Expand All @@ -1232,8 +1243,7 @@ fn decode_flagset(
) -> core::result::Result<FlagSet, ParseFlagError> {
// Fast lane for numeric flag-types and empty aliases.
if matches!(flag_type, FlagType::Numeric) || aliases.is_empty() {
// TODO: refactor this function to take a str
return parse_flags_from_chars(flag_type, input.chars());
return parse_flags_from_str(flag_type, input);
}

if let Some(index) = input
Expand All @@ -1245,7 +1255,7 @@ fn decode_flagset(
// NOTE: the aliases are 1-indexed.
Ok(aliases[index - 1].clone())
} else {
parse_flags_from_chars(flag_type, input.chars())
parse_flags_from_str(flag_type, input)
}
}

Expand Down Expand Up @@ -1839,17 +1849,28 @@ mod test {
);
assert_eq!(Ok(flag!(1)), parse_flag_from_str(FlagType::Numeric, "1"));

// U+1F52D '🔭' is four bytes in UTF8 and two code units in UTF-16. Nuspell rejects flags
// like this but Hunspell accepts them by discarding the lower code unit.
let telescope_flag =
parse_flag_from_str(FlagType::Utf8, "🔭").expect("can parse 🔭 UTF-8 flag");
// A consequence of this is that flags describing large Unicode scalar values are not
// precise and two emojis (for example) may "collide" to reuse the same flag value, for
// example the above telescope U+1F52D and the next scalar, U+1F52E crystal ball.
let crystal_ball_flag =
parse_flag_from_str(FlagType::Utf8, "🔮").expect("can parse 🔮 UTF-8 flag");
assert_eq!(telescope_flag, crystal_ball_flag);

assert_eq!(
Ok(flagset![1]),
parse_flags_from_chars(FlagType::Numeric, "1".chars())
parse_flags_from_str(FlagType::Numeric, "1")
);
assert_eq!(
Ok(flagset![1001, 2002]),
parse_flags_from_chars(FlagType::Numeric, "1001,2002".chars())
parse_flags_from_str(FlagType::Numeric, "1001,2002")
);
assert_eq!(
Ok(flagset![214, 216, 54321]),
parse_flags_from_chars(FlagType::Numeric, "214,216,54321".chars())
parse_flags_from_str(FlagType::Numeric, "214,216,54321")
);
}

Expand Down
20 changes: 20 additions & 0 deletions src/checker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2720,4 +2720,24 @@ mod test {
assert!(EN_US.checker().check_lower_as_title(true).check("alice"));
assert!(EN_US.checker().check_lower_as_upper(true).check("rsvp"));
}

// Regression test for `FLAG UTF-8` dictionaries that use a flag whose Unicode scalar value does
// not fit in 16 bits (here 🔭, U+1F52D). The same emoji is used as the flag of a PFX rule in the
// `.aff` source and inside the flag set of a stem in the `.dic` source; the prefixed word must be
// accepted by the checker.
#[test]
fn emoji_pfx_flag_test() {
// See <https://github.com/titoBouzout/Dictionaries/blob/80a5112e41b21ade9d00b837c05b0d06280f138f/Spanish.aff#L75-L77>
// Affix source: declares UTF-8 flags and a two-rule prefix group keyed by the 🔭 flag.
let aff = r#"
FLAG UTF-8
PFX 🔭 Y 2
PFX 🔭 0 macro [^r]
PFX 🔭 0 macror r
"#;

// Dictionary source: a single stem whose flag set (`hS🔭`) includes the emoji flag.
let dic = r#"1
concierto/hS🔭
"#;

// Building the dictionary must succeed, i.e. the emoji flag must be accepted by the parser
// rather than rejected.
let dict = Dictionary::new(aff, dic).unwrap();

// "concierto" carries the 🔭 flag, so the "macro" prefix is expected to apply to it
// (presumably via the `[^r]` rule, since the stem does not start with 'r').
assert!(dict.check("macroconcierto"));
}
}

0 comments on commit a3b8e02

Please sign in to comment.