-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Taqi Jaffri
committed
Mar 13, 2024
1 parent
f4c49c9
commit db5f98a
Showing
1 changed file
with
56 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import pytest | ||
from docugami_dfm_benchmarks.utils.text import normalize, get_tokens | ||
|
||
|
||
def test_normalize_basic():
    """Check normalize() on a plain sentence.

    The expected value is lowercase, has no trailing period, and omits
    the article "an".
    """
    raw = "This is an example."
    expected = "this is example"
    assert normalize(raw) == expected
|
||
|
||
def test_normalize_with_punctuation():
    """Check that normalize() drops the comma and exclamation mark."""
    raw = "Hello, world!"
    expected = "hello world"
    assert normalize(raw) == expected
|
||
|
||
def test_normalize_with_articles():
    """Check that normalize() strips articles from a sentence.

    The input contains a leading "A" and an inner "the"; neither appears
    in the expected output.
    """
    sentence = "A quick brown fox jumps over the lazy dog."
    expected = "quick brown fox jumps over lazy dog"
    assert normalize(sentence) == expected
|
||
|
||
def test_normalize_with_extra_whitespace():
    """Check that normalize() discards surrounding whitespace.

    The input is padded with a space on each side; the expected output
    has none, and the article "a" and the period are gone as well.
    """
    padded = " This is a test. "
    expected = "this is test"
    assert normalize(padded) == expected
|
||
|
||
def test_get_tokens_empty():
    """Check that get_tokens() yields an empty list for an empty string."""
    tokens = get_tokens("")
    assert tokens == []
|
||
|
||
def test_get_tokens_basic():
    """Check get_tokens() on a short sentence.

    The expected tokens are lowercase, exclude the article "a", and
    carry no trailing period.
    """
    tokens = get_tokens("This is a test.")
    assert tokens == ["this", "is", "test"]
|
||
|
||
def test_get_tokens_complex():
    """Check get_tokens() on a sentence with mixed punctuation.

    The input mixes commas, periods, an ellipsis, "!" and "?"; the
    expected result is the bare lowercase words with the article "a"
    left out.
    """
    sentence = "This, is a complex test. With punctuation! And... extra whitespace?"
    # Splitting on whitespace produces the same nine-element list as
    # spelling each token out individually.
    expected = "this is complex test with punctuation and extra whitespace".split()
    assert get_tokens(sentence) == expected