Skip to content

Commit

Permalink
Merge #270
Browse files Browse the repository at this point in the history
270: Vietnamese: Add lacking tests and fix bug r=ManyTheFish a=ManyTheFish

The pre-implemented tests were removed during the implementation of the Vietnamese normalizer, which allowed this bug to go unnoticed.


Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Many the fish <many@meilisearch.com>
  • Loading branch information
meili-bors[bot] and ManyTheFish committed Feb 13, 2024
2 parents b140e49 + ed34dd6 commit a869338
Showing 1 changed file with 159 additions and 5 deletions.
164 changes: 159 additions & 5 deletions charabia/src/normalizer/vietnamese.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
use super::{CharNormalizer, CharOrStr};
use crate::{Script, Token};

pub struct VietnameseNormalizer;

impl CharNormalizer for VietnameseNormalizer {
fn normalize_char(&self, c: char) -> Option<CharOrStr> {
match c {
'Ð' | 'Đ' | 'đ' => Some("d".to_string().into()), // not only Vietnamese, but also many European countries use these letters
_ => None,
'Ð' | 'Đ' | 'đ' | 'ð' => Some("d".to_string().into()), // not only Vietnamese, but also many European countries use these letters
_ => Some(c.into()),
}
}

Expand All @@ -18,5 +17,160 @@ impl CharNormalizer for VietnameseNormalizer {
}

fn is_should_normalize(c: char) -> bool {
matches!(c, 'Ð' | 'Đ' | 'đ')
matches!(c, 'Ð' | 'Đ' | 'đ' | 'ð')
}

#[cfg(test)]
mod test {
use std::borrow::Cow::Owned;

use crate::normalizer::test::test_normalizer;
use crate::normalizer::{Normalizer, NormalizerOption};
use crate::token::TokenKind;

// base tokens to normalize.
fn tokens() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("Ðại Việt".to_string()),
char_end: 8,
byte_end: 13,
script: Script::Latin,
..Default::default()
},
Token {
lemma: Owned("Đại Việt".to_string()),
char_end: 8,
byte_end: 13,
script: Script::Latin,
..Default::default()
},
Token {
lemma: Owned("đại Việt".to_string()),
char_end: 8,
byte_end: 13,
script: Script::Latin,
..Default::default()
},
]
}

// expected result of the current Normalizer.
fn normalizer_result() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("dại Việt".to_string()),
char_end: 8,
byte_end: 13,
char_map: Some(vec![
(2, 1),
(3, 3),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(3, 3),
(1, 1),
]),
script: Script::Latin,
..Default::default()
},
Token {
lemma: Owned("dại Việt".to_string()),
char_end: 8,
byte_end: 13,
char_map: Some(vec![
(2, 1),
(3, 3),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(3, 3),
(1, 1),
]),
script: Script::Latin,
..Default::default()
},
Token {
lemma: Owned("dại Việt".to_string()),
char_end: 8,
byte_end: 13,
char_map: Some(vec![
(2, 1),
(3, 3),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(3, 3),
(1, 1),
]),
script: Script::Latin,
..Default::default()
},
]
}

// expected result of the complete Normalizer pieline.
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
kind: TokenKind::Word,
lemma: Owned("dai viet".to_string()),
char_end: 8,
byte_end: 13,
char_map: Some(vec![
(2, 1),
(3, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(3, 1),
(1, 1),
]),
script: Script::Latin,
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Owned("dai viet".to_string()),
char_end: 8,
byte_end: 13,
char_map: Some(vec![
(2, 1),
(3, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(3, 1),
(1, 1),
]),
script: Script::Latin,
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Owned("dai viet".to_string()),
char_end: 8,
byte_end: 13,
char_map: Some(vec![
(2, 1),
(3, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(3, 1),
(1, 1),
]),
script: Script::Latin,
..Default::default()
},
]
}

test_normalizer!(VietnameseNormalizer, tokens(), normalizer_result(), normalized_tokens());
}

0 comments on commit a869338

Please sign in to comment.