Skip to content

Commit

Permalink
Merge #257
Browse files Browse the repository at this point in the history
257: normalize Ð and Đ into d r=ManyTheFish a=ngdbao

# Pull Request

## Related issue
Fixes issue #245

## What does this PR do?
- Add Vietnamese normalizer

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?



Co-authored-by: ngdbao <ngdbao94@Gmail.com>
  • Loading branch information
meili-bors[bot] and ngdbao committed Jan 24, 2024
2 parents d1cfce8 + 4103a73 commit 286ef64
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 1 deletion.
5 changes: 4 additions & 1 deletion charabia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ litemap = "0.6.1"
zerovec = "0.9.3"

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer"]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]

# allow chinese specialized tokenization
chinese = ["dep:pinyin", "dep:jieba-rs"]
Expand Down Expand Up @@ -65,6 +65,9 @@ latin-camelcase = ["dep:finl_unicode"]

khmer = []

# allow vietnamese specialized tokenization
vietnamese = []

# allow splitting snake_case latin words
latin-snakecase = ["dep:finl_unicode"]

Expand Down
5 changes: 5 additions & 0 deletions charabia/src/normalizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ pub use self::japanese::JapaneseNormalizer;
pub use self::lowercase::LowercaseNormalizer;
use self::nonspacing_mark::NonspacingMarkNormalizer;
use self::quote::QuoteNormalizer;
// Gate the re-export exactly like the `mod vietnamese` declaration below;
// without this cfg the `pub use` fails to compile when the "vietnamese"
// feature is disabled.
#[cfg(feature = "vietnamese")]
pub use self::vietnamese::VietnameseNormalizer;
use crate::segmenter::SegmentedTokenIter;
use crate::Token;

Expand All @@ -31,6 +32,8 @@ mod japanese;
mod lowercase;
mod nonspacing_mark;
mod quote;
#[cfg(feature = "vietnamese")]
mod vietnamese;

/// List of [`Normalizer`]s used by [`Normalize::normalize`] that are not considered lossy.
pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
Expand All @@ -54,6 +57,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
Box::new(GreekNormalizer),
Box::new(ArabicNormalizer),
Box::new(NonspacingMarkNormalizer),
#[cfg(feature = "vietnamese")]
Box::new(VietnameseNormalizer),
]
});

Expand Down
22 changes: 22 additions & 0 deletions charabia/src/normalizer/vietnamese.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
use super::{CharNormalizer, CharOrStr};
use crate::Script;
use crate::Token;

/// Normalizer folding the crossed-D letters (Ð, Đ, đ) to plain `d`.
///
/// Registered among the lossy normalizers; useful for Vietnamese but also
/// for other European languages that use these letters.
pub struct VietnameseNormalizer;

impl CharNormalizer for VietnameseNormalizer {
fn normalize_char(&self, c: char) -> Option<CharOrStr> {
match c {
'Ð' | 'Đ' | 'đ' => Some("d".to_string().into()), // not only Vietnamese, but also many European countries use these letters
_ => None,
}
}

fn should_normalize(&self, token: &Token) -> bool {
token.script == Script::Latin && token.lemma.chars().any(is_should_normalize)
}
}

/// Returns `true` for the characters that [`VietnameseNormalizer`] rewrites
/// to `d`.
///
/// Lowercase eth (ð, U+00F0) is included alongside Ð/Đ/đ for consistency:
/// uppercase Ð was already handled, and lowercasing turns Ð into ð, which
/// would otherwise slip through the normalizer untouched.
fn is_should_normalize(c: char) -> bool {
    matches!(c, 'Ð' | 'ð' | 'Đ' | 'đ')
}

0 comments on commit 286ef64

Please sign in to comment.