Skip to content

Commit

Permalink
Merge #257
Browse files Browse the repository at this point in the history
257: normalize Ð and Đ into d r=ManyTheFish a=ngdbao

# Pull Request

## Related issue
Fixes issue #245

## What does this PR do?
- Add Vietnamese normalizer

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?



Co-authored-by: ngdbao <ngdbao94@Gmail.com>
  • Loading branch information
meili-bors[bot] and ngdbao committed Jan 24, 2024
2 parents d1cfce8 + 4103a73 commit 286ef64
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 1 deletion.
5 changes: 4 additions & 1 deletion charabia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ litemap = "0.6.1"
zerovec = "0.9.3"

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer"]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]

# allow chinese specialized tokenization
chinese = ["dep:pinyin", "dep:jieba-rs"]
Expand Down Expand Up @@ -65,6 +65,9 @@ latin-camelcase = ["dep:finl_unicode"]

khmer = []

# allow vietnamese specialized tokenization
vietnamese = []

# allow splitting snake_case latin words
latin-snakecase = ["dep:finl_unicode"]

Expand Down
5 changes: 5 additions & 0 deletions charabia/src/normalizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ pub use self::japanese::JapaneseNormalizer;
pub use self::lowercase::LowercaseNormalizer;
use self::nonspacing_mark::NonspacingMarkNormalizer;
use self::quote::QuoteNormalizer;
// Gate the re-export exactly like the `mod vietnamese` declaration below;
// without this cfg the `pub use` fails to compile when the "vietnamese"
// feature is disabled.
#[cfg(feature = "vietnamese")]
pub use self::vietnamese::VietnameseNormalizer;
use crate::segmenter::SegmentedTokenIter;
use crate::Token;

Expand All @@ -31,6 +32,8 @@ mod japanese;
mod lowercase;
mod nonspacing_mark;
mod quote;
#[cfg(feature = "vietnamese")]
mod vietnamese;

/// List of [`Normalizer`]s used by [`Normalize::normalize`] that are not considered lossy.
pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
Expand All @@ -54,6 +57,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
Box::new(GreekNormalizer),
Box::new(ArabicNormalizer),
Box::new(NonspacingMarkNormalizer),
#[cfg(feature = "vietnamese")]
Box::new(VietnameseNormalizer),
]
});

Expand Down
22 changes: 22 additions & 0 deletions charabia/src/normalizer/vietnamese.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
use super::{CharNormalizer, CharOrStr};
use crate::Script;
use crate::Token;

/// Normalizer folding the crossed-D letters (Ð, Đ, đ) to plain `d`.
///
/// Registered among the lossy normalizers; useful for Vietnamese but also
/// for other European languages that use these letters.
pub struct VietnameseNormalizer;

impl CharNormalizer for VietnameseNormalizer {
fn normalize_char(&self, c: char) -> Option<CharOrStr> {
match c {
'Ð' | 'Đ' | 'đ' => Some("d".to_string().into()), // not only Vietnamese, but also many European countries use these letters
_ => None,
}
}

fn should_normalize(&self, token: &Token) -> bool {
token.script == Script::Latin && token.lemma.chars().any(is_should_normalize)
}
}

/// Returns `true` for the characters that [`VietnameseNormalizer`] rewrites
/// to `d`.
///
/// Lowercase eth (ð, U+00F0) is included alongside Ð/Đ/đ for consistency:
/// uppercase Ð was already handled, and lowercasing turns Ð into ð, which
/// would otherwise slip through the normalizer untouched.
fn is_should_normalize(c: char) -> bool {
    matches!(c, 'Ð' | 'ð' | 'Đ' | 'đ')
}

0 comments on commit 286ef64

Please sign in to comment.