-
-
Notifications
You must be signed in to change notification settings - Fork 229
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Better content hashes and default file names (#314)
* CLI: content hash as file name by default
* change BOW generator, sha1 → blake2, separate function for file names
* add hashing module, replace fingerprint by simhash hex string
* refine and add type annotations
* revise input and sampling
* use int.bit_count() if available
* meta: add function to clear caches
* CLI: add deprecation warning
* test: existing_hash is int
- Loading branch information
Showing
10 changed files
with
362 additions
and
153 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
# pylint:disable-msg=I1101,W1401 | ||
""" | ||
Unit tests for the trafilatura's text filters and cache. | ||
""" | ||
|
||
# language detection | ||
try: | ||
import py3langid | ||
LANGID_FLAG = True | ||
except ImportError: | ||
LANGID_FLAG = False | ||
|
||
|
||
from lxml import etree, html | ||
|
||
from trafilatura import extract | ||
|
||
import trafilatura.filters | ||
|
||
from trafilatura.core import Extractor | ||
from trafilatura.filters import check_html_lang, duplicate_test, language_filter | ||
from trafilatura.lru import LRUCache | ||
from trafilatura.metadata import Document | ||
from trafilatura.settings import DEFAULT_CONFIG | ||
|
||
|
||
# Shared fixtures for the tests below.

# Blank document metadata handed to language_filter().
SAMPLE_META = Document()

# NOTE: ZERO_CONFIG is bound to the very same object as DEFAULT_CONFIG (no copy
# is made), so the two size thresholds set to '0' here are written into
# DEFAULT_CONFIG as well.
ZERO_CONFIG = DEFAULT_CONFIG
ZERO_CONFIG['DEFAULT']['MIN_OUTPUT_SIZE'] = '0'
ZERO_CONFIG['DEFAULT']['MIN_EXTRACTED_SIZE'] = '0'

# Extractor options built from eleven positional False values, using the
# (shared) default configuration.
DEFAULT_OPTIONS = Extractor(*([False] * 11))
DEFAULT_OPTIONS.config = DEFAULT_CONFIG
|
||
|
||
def test_filters():
    """Check the language, URL-blacklist, tree-size, duplicate and HTML-lang filters."""

    def repeated_body(fragment, count):
        """Parse a fresh HTML page whose body holds `fragment` repeated `count` times."""
        return html.fromstring('<html><body>' + fragment*count + '</body></html>')

    # --- language filter on main text and comments ---
    if LANGID_FLAG:
        german_text = 'Hier ist ein Text auf Deutsch'
        # main text
        assert language_filter(german_text, '', 'de', SAMPLE_META)[0] is False
        assert language_filter(german_text, '', 'en', SAMPLE_META)[0] is True
        # comments
        assert language_filter('Hier ist ein Text.', 'Die Kommentare sind aber etwas länger.', 'de', SAMPLE_META)[0] is False
        # language detection on the extracted content (same tree used twice)
        english_doc = html.fromstring('<html><body><article><p>How many ages hence/Shall this our lofty scene be acted over,/In states unborn and accents yet unknown!</p></article></body></html>')
        assert extract(english_doc, config=ZERO_CONFIG, target_language='de') is None
        assert extract(english_doc, config=ZERO_CONFIG, target_language='en') is not None
    else:
        # py3langid is not installed: no detection
        assert language_filter('Hier ist ein Text.', '', 'en', SAMPLE_META)[0] is False

    # --- URL blacklist ---
    blacklisted_page = '<html><head><link rel="canonical" href="https://example.org"/></head><body></body></html>'
    assert extract(blacklisted_page, output_format='xml', url_blacklist={'https://example.org'}) is None

    # --- recursion limit (max_tree_size) ---
    plain_p = '<p>abc</p>'
    assert extract(repeated_body(plain_p, 50), max_tree_size=500) is not None
    assert extract(repeated_body(plain_p, 501), max_tree_size=500) is None
    formatted_p = '<p><hi rend="#i">abc</hi></p>'
    assert extract(repeated_body(formatted_p, 501), include_formatting=True, max_tree_size=500) is None
    assert extract(repeated_body(formatted_p, 499), include_formatting=True, max_tree_size=500) is not None

    # --- deduplication, document level ---
    # The same tree is submitted four times against a fresh cache;
    # only the last submission is expected to be rejected.
    duplicated_doc = repeated_body(formatted_p, 50)
    trafilatura.filters.LRU_TEST = LRUCache(maxsize=2)
    for _ in range(3):
        assert extract(duplicated_doc, deduplicate=True) is not None
    assert extract(duplicated_doc, deduplicate=True) is None

    # --- deduplication, paragraph level ---
    trafilatura.filters.LRU_TEST = LRUCache(maxsize=2)
    paragraph = etree.fromstring('<p>' + 'abc'*50 + '</p>')
    # the module-level options object itself is switched to dedup mode
    DEFAULT_OPTIONS.dedup = True
    for _ in range(3):
        assert trafilatura.htmlprocessing.process_node(paragraph, DEFAULT_OPTIONS) is not None
    assert trafilatura.htmlprocessing.process_node(paragraph, DEFAULT_OPTIONS) is None

    # --- HTML lang filter ---
    # no language information at all
    assert check_html_lang(html.fromstring('<html><body></body></html>'), target_language='en') is True

    # text + lang attribute
    verse = '<p>In sleep a king, but waking no such matter.</p>'
    tagged_english = '<html lang="en-US"><body>' + verse*50 + '</body></html>'
    tagged_german = '<html lang="de-DE"><body>' + verse*50 + '</body></html>'
    if LANGID_FLAG:
        assert extract(html.fromstring(tagged_english), no_fallback=True, target_language='en') is not None
        assert extract(html.fromstring(tagged_english), no_fallback=True, target_language='de') is None
        # caught: English text behind a German lang attribute
        assert extract(html.fromstring(tagged_german), no_fallback=False, target_language='de') is None
    else:
        # not caught, HTML tag used
        assert extract(html.fromstring(tagged_german), no_fallback=False, target_language='de') is not None

    # several languages declared in the lang attribute
    bilingual = '<html lang="de_DE, en_US"><body></body></html>'
    for wanted in ('de', 'en'):
        assert check_html_lang(html.fromstring(bilingual), target_language=wanted) is True
    for wanted in ('de', 'en'):
        assert check_html_lang(html.fromstring(bilingual), target_language=wanted, strict=True) is True

    # content-language meta element
    meta_lower = '<html><head><meta http-equiv="content-language" content="en"></head><body></body></html>'
    assert check_html_lang(html.fromstring(meta_lower), target_language='en') is True
    assert check_html_lang(html.fromstring(meta_lower), target_language='de') is False
    meta_upper = '<html><head><meta http-equiv="content-language" content="DE"></head><body></body></html>'
    assert check_html_lang(html.fromstring(meta_upper), target_language='de') is True

    # html lang attribute superseded by og:locale
    og_locale = '<html lang="en-US"><head><meta property="og:locale" content="de_DE" /></head><body></body></html>'
    assert check_html_lang(html.fromstring(og_locale), target_language='de') is True
    assert check_html_lang(html.fromstring(og_locale), target_language='en') is False

    # strict vs. lenient handling of a plain lang attribute
    lang_only = '<html lang="en"><body></body></html>'
    assert check_html_lang(html.fromstring(lang_only), target_language='it', strict=True) is False
    assert check_html_lang(html.fromstring(lang_only), target_language='it', strict=False) is True

    # og:locale match holds in both modes
    assert check_html_lang(html.fromstring(og_locale), target_language='de', strict=False) is True
    assert check_html_lang(html.fromstring(og_locale), target_language='de', strict=True) is True
|
||
|
||
def test_lrucache():
    """Check basic duplicate detection backed by an LRU cache of size two."""
    cache = LRUCache(maxsize=2)
    trafilatura.filters.LRU_TEST = cache

    def paragraph_in_body(markup):
        """Return (body, paragraph): a new <body> with the parsed paragraph appended."""
        body = etree.Element('body')
        paragraph = html.fromstring(markup)
        body.append(paragraph)
        return body, paragraph

    def expect(node, verdict):
        """Assert that duplicate_test() returns exactly `verdict` for `node`."""
        assert duplicate_test(node, DEFAULT_CONFIG) is verdict

    # NOTE: a check for elements that are too short used to live here and is disabled.

    # first element: flagged only on the last of these four checks
    my_body, my_element = paragraph_in_body('<p>AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB</p>')
    for node, verdict in ((my_element, False), (my_element, False), (my_body, False), (my_element, True)):
        expect(node, verdict)

    # second element, same pattern starting with its body
    other_body, other_element = paragraph_in_body('<p>CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD</p>')
    for node, verdict in ((other_body, False), (other_element, False), (other_body, False), (other_element, True)):
        expect(node, verdict)

    # third element: three checks through its body only
    yet_another_body, yet_another_element = paragraph_in_body('<p>EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF</p>')
    for _ in range(3):
        expect(yet_another_body, False)

    # 2 elements in cache, original element has been cleared?
    expect(other_element, True)
    expect(yet_another_element, True)
    expect(my_element, False)

    # after clearing the cache a known element is no longer flagged
    cache.clear()
    expect(other_element, False)

    # looking up an unknown key yields -1
    assert cache.get('tralala') == -1
|
||
|
||
# Allow running this test module directly, without a test runner.
if __name__ == '__main__':
    for _test_function in (test_filters, test_lrucache):
        _test_function()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
|
||
|
||
from trafilatura.hashing import Simhash, content_fingerprint, generate_hash_filename | ||
|
||
|
||
|
||
def test_hashes():
    """Pin the fingerprint and the hash-based file name of a fixed sample text."""
    sample = "abcde ijk l, "*10
    fingerprint = content_fingerprint(sample)
    filename = generate_hash_filename(sample)
    assert fingerprint == "528497a1d07b66d6"
    assert filename == "42LNugG3Sc95646i"
|
||
|
||
|
||
def test_simhash():
    """Check Simhash similarity ordering, hash round trips and input handling."""
    # https://en.wiktionary.org/wiki/put_lipstick_on_a_pig
    factor = 1
    sentences = (
        "This is like putting lipstick on a pig.",
        # "This is like putting lipstick on a pig.123",
        "This is just like putting lipstick on a pig.",
        "Putting lipstick on a pig is what this is about.",
        "The words are completely different but let's see.",
    )
    hashes = [Simhash(sentence*factor) for sentence in sentences]
    reference = hashes[0]

    # the reference equals itself; the last sentence is the least similar one
    sims = [reference.similarity(candidate) for candidate in hashes]
    assert sims[0] == 1.0
    assert min(sims) == sims[-1]

    # sanity checks: round trips through the hex representation
    assert Simhash(existing_hash=reference.to_hex()).hash == reference.hash
    assert int(hex(reference.hash)[2:], 16) == reference.hash
    assert Simhash(existing_hash=reference.to_hex()).hash == reference.hash

    # re-hashed: values that are not accepted as an existing hash
    assert Simhash(existing_hash="aghj").hash == 18446744073709551615
    assert Simhash(existing_hash="18446744073709551615").hash == 18446744073709551615
    assert Simhash(existing_hash=123).hash != 123
    assert Simhash(existing_hash=18446744073709551615).hash == 18446744073709551615
    assert Simhash(existing_hash=None).hash == Simhash().hash

    # similarity
    base = Simhash("abcde")
    assert base.similarity(Simhash("abcde")) == 1.0
    assert Simhash("abcde").similarity(Simhash("abcde", length=2)) != 1.0
    assert Simhash("abcde").similarity(Simhash("fghij")) < 0.6
    assert Simhash("abcde "*100).similarity(Simhash("abcde")) == 1.0
|
||
|
||
|
||
# Allow running this test module directly, without a test runner.
if __name__ == "__main__":
    for _test_function in (test_hashes, test_simhash):
        _test_function()
Oops, something went wrong.