From 6fbb7defe4c4d8331bfb19ae72b815b5d9933bb4 Mon Sep 17 00:00:00 2001
From: Harrison Burt <57491488+ChillFish8@users.noreply.github.com>
Date: Sat, 25 Jun 2022 13:03:38 +0100
Subject: [PATCH] LNX-NONE: Move to compose lib over symspell fork. (#96)

* Add compose over personal fork

* Add unicode normalizing tokenizer

* Reformat code
---
 lnx-engine/search-index/Cargo.toml         |   3 +-
 lnx-engine/search-index/src/corrections.rs |  24 +--
 lnx-engine/search-index/src/lib.rs         |   1 +
 lnx-engine/search-index/src/query.rs       |   7 +-
 lnx-engine/search-index/src/schema.rs      |   2 +-
 lnx-engine/search-index/src/structures.rs  |   5 +
 lnx-engine/search-index/src/tokenizer.rs   | 192 +++++++++++++++++++++
 7 files changed, 219 insertions(+), 15 deletions(-)
 create mode 100644 lnx-engine/search-index/src/tokenizer.rs

diff --git a/lnx-engine/search-index/Cargo.toml b/lnx-engine/search-index/Cargo.toml
index 75d414cc..5258a660 100644
--- a/lnx-engine/search-index/Cargo.toml
+++ b/lnx-engine/search-index/Cargo.toml
@@ -13,8 +13,9 @@
 serde = { version = "1", features = ["derive"] }
 sled = { version = "0.34.7", features = ["compression"] }
 hashbrown = { version = "0.11", features = ["serde"] }
 tokio = { version = "1.12", features = ["sync", "fs", "rt"] }
-symspell = { git = "https://github.com/lnx-search/symspell", tag = "v0.5.0" }
+compose = { git = "https://github.com/lnx-search/compose.git", tag = "0.1.0" }
+deunicode = "1.3.1"
 tantivy = "0.18.0"
 tracing = "0.1.29"
 tracing-futures = "0.2.5"
diff --git a/lnx-engine/search-index/src/corrections.rs b/lnx-engine/search-index/src/corrections.rs
index 089aaad6..1796ad49 100644
--- a/lnx-engine/search-index/src/corrections.rs
+++ b/lnx-engine/search-index/src/corrections.rs
@@ -2,20 +2,20 @@ use std::fmt::{Debug, Formatter};
 use std::sync::Arc;
 
 use arc_swap::ArcSwap;
+use compose::{Suggestion, SymSpell, Verbosity};
 use hashbrown::HashMap;
-use symspell::{AsciiStringStrategy, Suggestion, SymSpell, Verbosity};
 
 pub(crate) type SymSpellCorrectionManager = Arc<SymSpellManager>;
 
 /// The manager around the sym spell fuzzy searching system.
 pub(crate) struct SymSpellManager {
-    sym: Arc<ArcSwap<SymSpell<AsciiStringStrategy>>>,
+    sym: ArcSwap<SymSpell>,
 }
 
 impl SymSpellManager {
     pub(crate) fn new() -> Self {
         let sym = SymSpell::default();
-        let sym = Arc::new(ArcSwap::from_pointee(sym));
+        let sym = ArcSwap::from_pointee(sym);
         Self { sym }
     }
 
@@ -38,13 +38,17 @@ impl SymSpellManager {
     pub(crate) fn adjust_index_frequencies(&self, frequencies: &HashMap<String, u32>) {
         info!("adjusting spell correction system to new frequency count, this may take a while...");
 
-        let mut symspell: SymSpell<AsciiStringStrategy> = SymSpell::default();
-        symspell.using_dictionary_frequencies(
-            frequencies
-                .into_iter()
-                .map(|(k, v)| (k.clone(), *v as i64))
-                .collect(),
-        );
+        let frequencies = frequencies
+            .into_iter()
+            .map(|(k, v)| (k.clone(), *v as i64))
+            .collect();
+
+        let mut symspell = SymSpell::default();
+
+        // SAFETY:
+        // This is safe as long as the keys being passed are ASCII. If this uses UTF-8 characters
+        // there is a chance this can make the algorithm become UB when accessing the wordmap.
+        unsafe { symspell.using_dictionary_frequencies(frequencies, false) };
 
         self.sym.store(Arc::from(symspell))
     }
diff --git a/lnx-engine/search-index/src/lib.rs b/lnx-engine/search-index/src/lib.rs
index 53c9eaf4..fc2ce6e3 100644
--- a/lnx-engine/search-index/src/lib.rs
+++ b/lnx-engine/search-index/src/lib.rs
@@ -15,6 +15,7 @@ mod stop_words;
 mod storage;
 pub mod structures;
 mod synonyms;
+mod tokenizer;
 mod writer;
 
 pub use helpers::cr32_hash;
diff --git a/lnx-engine/search-index/src/query.rs b/lnx-engine/search-index/src/query.rs
index 863b2c58..c58aa5b9 100644
--- a/lnx-engine/search-index/src/query.rs
+++ b/lnx-engine/search-index/src/query.rs
@@ -28,13 +28,14 @@ use tantivy::schema::{
     IndexRecordOption,
     Schema,
 };
-use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};
+use tantivy::tokenizer::TokenStream;
 use tantivy::{DateTime, Index, Score, Term};
 
 use crate::corrections::SymSpellCorrectionManager;
 use crate::stop_words::StopWordManager;
 use crate::structures::DocumentValue;
 use crate::synonyms::SynonymsManager;
+use crate::tokenizer::SimpleUnicodeTokenizer;
 
 pub type DocumentId = u64;
 
@@ -413,7 +414,7 @@ pub(crate) struct QueryBuilder {
     pool: crate::ReaderExecutor,
 
     /// A basic word tokenizers for fuzzy queries.
-    tokenizer: TextAnalyzer,
+    tokenizer: SimpleUnicodeTokenizer,
 }
 
 impl QueryBuilder {
@@ -427,7 +428,7 @@ impl QueryBuilder {
         pool: crate::ReaderExecutor,
     ) -> Self {
         let parser = get_parser(&ctx, index);
-        let tokenizer = TextAnalyzer::from(SimpleTokenizer).filter(LowerCaser);
+        let tokenizer = SimpleUnicodeTokenizer::with_limit(16);
 
         Self {
             ctx: Arc::new(ctx),
diff --git a/lnx-engine/search-index/src/schema.rs b/lnx-engine/search-index/src/schema.rs
index e4ba6b8f..7a29c1f6 100644
--- a/lnx-engine/search-index/src/schema.rs
+++ b/lnx-engine/search-index/src/schema.rs
@@ -287,7 +287,7 @@ impl SchemaContext {
     }
 
     /// Generates a new schema from the given fields.
-    pub fn as_tantivy_schema(&self) -> tantivy::schema::Schema {
+    pub fn as_tantivy_schema(&self) -> Schema {
         let mut schema = SchemaBuilder::new();
 
         schema.add_u64_field(PRIMARY_KEY, FAST | STORED | INDEXED);
diff --git a/lnx-engine/search-index/src/structures.rs b/lnx-engine/search-index/src/structures.rs
index c90d4d33..63b27092 100644
--- a/lnx-engine/search-index/src/structures.rs
+++ b/lnx-engine/search-index/src/structures.rs
@@ -23,6 +23,7 @@ use crate::schema::{SchemaContext, PRIMARY_KEY};
 use crate::stop_words::StopWordManager;
 use crate::storage::{OpenType, SledBackedDirectory, StorageBackend};
 use crate::synonyms::SynonymsManager;
+use crate::tokenizer::SimpleUnicodeTokenizer;
 use crate::writer::WriterContext;
 use crate::DocumentId;
 
@@ -184,6 +185,10 @@ impl IndexDeclaration {
         let corrections = Arc::new(SymSpellManager::new());
         let storage = StorageBackend::using_conn(dir);
 
+        index
+            .tokenizers()
+            .register("default", SimpleUnicodeTokenizer::default());
+
         Ok(IndexContext {
             name: self.name.clone(),
             storage,
diff --git a/lnx-engine/search-index/src/tokenizer.rs b/lnx-engine/search-index/src/tokenizer.rs
new file mode 100644
index 00000000..8da3356e
--- /dev/null
+++ b/lnx-engine/search-index/src/tokenizer.rs
@@ -0,0 +1,192 @@
+use deunicode::deunicode_char;
+use tantivy::tokenizer::{
+    BoxTokenStream,
+    SimpleTokenizer,
+    Token,
+    TokenStream,
+    Tokenizer,
+};
+
+#[derive(Clone)]
+pub struct SimpleUnicodeTokenizer {
+    limit: usize,
+}
+
+impl Default for SimpleUnicodeTokenizer {
+    fn default() -> Self {
+        Self { limit: usize::MAX }
+    }
+}
+
+impl SimpleUnicodeTokenizer {
+    pub fn with_limit(num_words: usize) -> Self {
+        Self { limit: num_words }
+    }
+
+    pub fn token_stream(&self, text: &str) -> SimpleTokenStream {
+        let tokens = produce_tokens(text, self.limit);
+
+        SimpleTokenStream { tokens, pointer: 0 }
+    }
+}
+
+impl Tokenizer for SimpleUnicodeTokenizer {
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
+        let tokens = produce_tokens(text, self.limit);
+        BoxTokenStream::from(SimpleTokenStream { tokens, pointer: 0 })
+    }
+}
+
+pub fn produce_tokens(text: &str, num_tokens: usize) -> Vec<Token> {
+    let mut characters = String::with_capacity(text.len());
+    for char in text.chars() {
+        if let Some(ascii) = deunicode_char(char) {
+            if ascii.len() > 1 {
+                characters.push(' ');
+            }
+            characters.push_str(&ascii.to_lowercase());
+        }
+    }
+
+    let simple = SimpleTokenizer {};
+    let mut stream = simple.token_stream(&characters);
+
+    let mut tokens = vec![];
+    while let Some(token) = stream.next() {
+        if tokens.len() >= num_tokens {
+            break;
+        }
+
+        tokens.push(token.clone());
+    }
+
+    tokens
+}
+
+pub struct SimpleTokenStream {
+    tokens: Vec<Token>,
+    pointer: usize,
+}
+
+impl TokenStream for SimpleTokenStream {
+    fn advance(&mut self) -> bool {
+        if self.pointer < self.tokens.len() {
+            self.pointer += 1;
+            true
+        } else {
+            false
+        }
+    }
+
+    fn token(&self) -> &Token {
+        // safe because our pointer cannot go beyond bounds
+        unsafe { self.tokens.get_unchecked(self.pointer - 1) }
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        // safe because our pointer cannot go beyond bounds
+        unsafe { self.tokens.get_unchecked_mut(self.pointer - 1) }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn parse_and_compare(text: &str, expected: Vec<&str>) {
+        let tokenizer = SimpleUnicodeTokenizer::default();
+        let mut stream = tokenizer.token_stream(text);
+
+        let mut tokens = vec![];
+        while let Some(token) = stream.next() {
+            tokens.push(token.text.to_string());
+        }
+
+        assert_eq!(tokens, expected);
expected); + } + + #[test] + fn test_plain_english() { + let text = "hello world, I couldn't be more proud!"; + let tokens = vec!["hello", "world", "i", "couldn", "t", "be", "more", "proud"]; + parse_and_compare(text, tokens); + } + + #[test] + fn test_mixed() { + let text = "Ôóű, 🦄☣ in 北亰"; + let tokens = vec!["oou", "unicorn", "biohazard", "in", "bei", "jing"]; + parse_and_compare(text, tokens); + } + + #[test] + fn test_accents() { + let text = "étude"; + let tokens = vec!["etude"]; + parse_and_compare(text, tokens); + } + + #[test] + fn test_greek() { + let text = "Æneid"; + let tokens = vec!["aeneid"]; + parse_and_compare(text, tokens); + } + + #[test] + fn test_other() { + let text = "ᔕᓇᓇ"; + let tokens = vec!["sha", "na", "na"]; + parse_and_compare(text, tokens); + } + + #[test] + /// Note about this test: + /// We don't really do much clever tokenizing here for CJK languages, this is + /// mostly just testing the normalization rather than the tokenization ability. + fn test_chinese_simplified() { + let text = "你好,世界,我感到无比自豪! "; + let tokens = vec![ + "ni", "hao", "shi", "jie", "wo", "gan", "dao", "wu", "bi", "zi", "hao", + ]; + parse_and_compare(text, tokens); + } + + #[test] + /// Note about this test: + /// We don't really do much clever tokenizing here for CJK languages, this is + /// mostly just testing the normalization rather than the tokenization ability. + fn test_chinese_traditional() { + let text = "你好,世界,我感到無比自豪! "; + let tokens = vec![ + "ni", "hao", "shi", "jie", "wo", "gan", "dao", "wu", "bi", "zi", "hao", + ]; + parse_and_compare(text, tokens); + } + + #[test] + /// Note about this test: + /// We don't really do much clever tokenizing here for CJK languages, this is + /// mostly just testing the normalization rather than the tokenization ability. + fn test_japanese() { + let text = "Hello world、これ以上誇りに思うことはできません! "; + let tokens = vec![ + "hello", "world", "ko", "re", "yi", "shang", "kua", "ri", "ni", "si", "u", + "ko", "to", "ha", "de", "ki", "ma", "sen", + ]; + parse_and_compare(text, tokens); + } + + #[test] + /// Note about this test: + /// We don't really do much clever tokenizing here for CJK languages, this is + /// mostly just testing the normalization rather than the tokenization ability. + fn test_korean() { + let text = "안녕하세요 세상, 이보다 더 자랑스러울 수 없습니다! "; + let tokens = vec![ + "an", "nyeong", "ha", "se", "yo", "se", "sang", "i", "bo", "da", "deo", + "ja", "rang", "seu", "reo", "ul", "su", "eobs", "seub", "ni", "da", + ]; + parse_and_compare(text, tokens); + } +}