LNX-NONE: Move to compose lib over symspell fork. (#96)
* Add compose over personal fork
* Add unicode normalizing tokenizer
* Reformat code
1 parent 66f4d48 · commit 6fbb7de
Showing 7 changed files with 219 additions and 15 deletions.
@@ -0,0 +1,192 @@
use deunicode::deunicode_char;
use tantivy::tokenizer::{
    BoxTokenStream,
    SimpleTokenizer,
    Token,
    TokenStream,
    Tokenizer,
};

#[derive(Clone)]
pub struct SimpleUnicodeTokenizer {
    limit: usize,
}

impl Default for SimpleUnicodeTokenizer {
    fn default() -> Self {
        Self { limit: usize::MAX }
    }
}

impl SimpleUnicodeTokenizer {
    pub fn with_limit(num_words: usize) -> Self {
        Self { limit: num_words }
    }

    pub fn token_stream(&self, text: &str) -> SimpleTokenStream {
        let tokens = produce_tokens(text, self.limit);

        SimpleTokenStream { tokens, pointer: 0 }
    }
}

impl Tokenizer for SimpleUnicodeTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        let tokens = produce_tokens(text, self.limit);
        BoxTokenStream::from(SimpleTokenStream { tokens, pointer: 0 })
    }
}

pub fn produce_tokens(text: &str, num_tokens: usize) -> Vec<Token> {
    let mut characters = String::with_capacity(text.len());
    for char in text.chars() {
        if let Some(ascii) = deunicode_char(char) {
            if ascii.len() > 1 {
                characters.push(' ');
            }
            characters.push_str(&ascii.to_lowercase());
        }
    }

    let simple = SimpleTokenizer {};
    let mut stream = simple.token_stream(&characters);

    let mut tokens = vec![];
    while let Some(token) = stream.next() {
        if tokens.len() >= num_tokens {
            break;
        }

        tokens.push(token.clone());
    }

    tokens
}

pub struct SimpleTokenStream {
    tokens: Vec<Token>,
    pointer: usize,
}

impl TokenStream for SimpleTokenStream {
    fn advance(&mut self) -> bool {
        if self.pointer < self.tokens.len() {
            self.pointer += 1;
            true
        } else {
            false
        }
    }

    fn token(&self) -> &Token {
        // safe because our pointer cannot go beyond bounds
        unsafe { self.tokens.get_unchecked(self.pointer - 1) }
    }

    fn token_mut(&mut self) -> &mut Token {
        // safe because our pointer cannot go beyond bounds
        unsafe { self.tokens.get_unchecked_mut(self.pointer - 1) }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn parse_and_compare(text: &str, expected: Vec<&str>) {
        let tokenizer = SimpleUnicodeTokenizer::default();
        let mut stream = tokenizer.token_stream(text);

        let mut tokens = vec![];
        while let Some(token) = stream.next() {
            tokens.push(token.text.to_string());
        }

        assert_eq!(tokens, expected);
    }

    #[test]
    fn test_plain_english() {
        let text = "hello world, I couldn't be more proud!";
        let tokens = vec!["hello", "world", "i", "couldn", "t", "be", "more", "proud"];
        parse_and_compare(text, tokens);
    }

    #[test]
    fn test_mixed() {
        let text = "Ôóű, 🦄☣ in 北亰";
        let tokens = vec!["oou", "unicorn", "biohazard", "in", "bei", "jing"];
        parse_and_compare(text, tokens);
    }

    #[test]
    fn test_accents() {
        let text = "étude";
        let tokens = vec!["etude"];
        parse_and_compare(text, tokens);
    }

    #[test]
    fn test_greek() {
        let text = "Æneid";
        let tokens = vec!["aeneid"];
        parse_and_compare(text, tokens);
    }

    #[test]
    fn test_other() {
        let text = "ᔕᓇᓇ";
        let tokens = vec!["sha", "na", "na"];
        parse_and_compare(text, tokens);
    }

    #[test]
    /// Note about this test:
    /// We don't really do much clever tokenizing here for CJK languages, this is
    /// mostly just testing the normalization rather than the tokenization ability.
    fn test_chinese_simplified() {
        let text = "你好,世界,我感到无比自豪! ";
        let tokens = vec![
            "ni", "hao", "shi", "jie", "wo", "gan", "dao", "wu", "bi", "zi", "hao",
        ];
        parse_and_compare(text, tokens);
    }

    #[test]
    /// Note about this test:
    /// We don't really do much clever tokenizing here for CJK languages, this is
    /// mostly just testing the normalization rather than the tokenization ability.
    fn test_chinese_traditional() {
        let text = "你好,世界,我感到無比自豪! ";
        let tokens = vec![
            "ni", "hao", "shi", "jie", "wo", "gan", "dao", "wu", "bi", "zi", "hao",
        ];
        parse_and_compare(text, tokens);
    }

    #[test]
    /// Note about this test:
    /// We don't really do much clever tokenizing here for CJK languages, this is
    /// mostly just testing the normalization rather than the tokenization ability.
    fn test_japanese() {
        let text = "Hello world、これ以上誇りに思うことはできません! ";
        let tokens = vec![
            "hello", "world", "ko", "re", "yi", "shang", "kua", "ri", "ni", "si", "u",
            "ko", "to", "ha", "de", "ki", "ma", "sen",
        ];
        parse_and_compare(text, tokens);
    }

    #[test]
    /// Note about this test:
    /// We don't really do much clever tokenizing here for CJK languages, this is
    /// mostly just testing the normalization rather than the tokenization ability.
    fn test_korean() {
        let text = "안녕하세요 세상, 이보다 더 자랑스러울 수 없습니다! ";
        let tokens = vec![
            "an", "nyeong", "ha", "se", "yo", "se", "sang", "i", "bo", "da", "deo",
            "ja", "rang", "seu", "reo", "ul", "su", "eobs", "seub", "ni", "da",
        ];
        parse_and_compare(text, tokens);
    }
}
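For reference, a minimal sketch of driving the new tokenizer outside the test module. It is not part of this commit and assumes SimpleUnicodeTokenizer and produce_tokens from the file above are in scope, plus tantivy's TokenStream trait, which supplies the next() helper the tests already rely on:

use tantivy::tokenizer::TokenStream;

fn demo() {
    // Keep at most five tokens; SimpleUnicodeTokenizer::default() applies no cap.
    let tokenizer = SimpleUnicodeTokenizer::with_limit(5);

    let mut stream = tokenizer.token_stream("Ôóű, 🦄☣ in 北亰");
    while let Some(token) = stream.next() {
        // Prints: oou unicorn biohazard in bei
        print!("{} ", token.text);
    }

    // produce_tokens can also be called directly when no stream is needed.
    let tokens = produce_tokens("étude", usize::MAX);
    assert_eq!(tokens[0].text, "etude");
}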