Skip to content

Commit

Permalink
Better content hashes and default file names (#314)
Browse files Browse the repository at this point in the history
* CLI: content hash as file name by default

* change BOW generator, sha1 → blake2, separate function for file names

* add hashing module, replace fingerprint by simhash hex string

* refine and add type annotations

* revise input and sampling

* use int.bit_count() if available

* meta: add function to cleared caches

* CLI: add deprecation warning

* test: existing_hash is int
  • Loading branch information
adbar committed Apr 13, 2023
1 parent 55237a7 commit 56421cc
Show file tree
Hide file tree
Showing 10 changed files with 362 additions and 153 deletions.
6 changes: 2 additions & 4 deletions tests/cli_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def test_parser():
assert e.value.code == 0
assert re.match(r'Trafilatura [0-9]\.[0-9]\.[0-9] - Python [0-9]\.[0-9]+\.[0-9]', f.getvalue())
# test future deprecations
testargs = ['', '--inputfile', 'test.txt', '--with-metadata', '--nocomments', '--notables']
testargs = ['', '--inputfile', 'test.txt', '--with-metadata', '--nocomments', '--notables', '--hash-as-name']
with patch.object(sys, 'argv', testargs):
args = cli.map_args(cli.parse_args(testargs))
assert args.no_comments is False and args.no_tables is False and args.only_with_metadata and args.input_file == 'test.txt'
Expand Down Expand Up @@ -199,12 +199,10 @@ def test_sysoutput():
filepath, destdir = cli_utils.determine_output_path(args, 'testfile.txt', '')
assert filepath == 'test/testfile.txt'
# test hash as output file name
assert args.hash_as_name is False
args.hash_as_name = True
assert args.keep_dirs is True
args.keep_dirs = False
filepath, destdir = cli_utils.determine_output_path(args, 'testfile.txt', '')
assert filepath == 'test/2jmj7l5rSw0yVb-vlWAYkK-YBwk.txt'
assert filepath == 'test/uOHdo6wKo4IK0pkL.txt'


def test_download():
Expand Down
154 changes: 154 additions & 0 deletions tests/filters_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# pylint:disable-msg=I1101,W1401
"""
Unit tests for trafilatura's text filters and cache.
"""

# language detection
try:
import py3langid
LANGID_FLAG = True
except ImportError:
LANGID_FLAG = False


from lxml import etree, html

from trafilatura import extract

import trafilatura.filters

from trafilatura.core import Extractor
from trafilatura.filters import check_html_lang, duplicate_test, language_filter
from trafilatura.lru import LRUCache
from trafilatura.metadata import Document
from trafilatura.settings import DEFAULT_CONFIG


# Shared fixtures for the filter tests.
# NOTE(review): this is a plain assignment, so ZERO_CONFIG is the same object
# as DEFAULT_CONFIG and the two writes below also change DEFAULT_CONFIG.
# Confirm whether a copy was intended before relying on untouched defaults.
ZERO_CONFIG = DEFAULT_CONFIG
ZERO_CONFIG['DEFAULT']['MIN_OUTPUT_SIZE'] = '0'
ZERO_CONFIG['DEFAULT']['MIN_EXTRACTED_SIZE'] = '0'

# Extractor built with eleven positional False arguments; its config
# attribute is then set to the (already modified) DEFAULT_CONFIG.
DEFAULT_OPTIONS = Extractor(*[False]*11)
DEFAULT_OPTIONS.config = DEFAULT_CONFIG

# Document created without arguments, passed to language_filter() in the tests.
SAMPLE_META = Document()


def test_filters():
    """Run the content filters: language, URL blacklist, tree size, duplicates, HTML lang."""

    def build_page(paragraph, count, attrs=''):
        # parse a page whose body holds `count` copies of `paragraph`
        return html.fromstring('<html' + attrs + '><body>' + paragraph*count + '</body></html>')

    if LANGID_FLAG is True:
        # main text
        german_text = 'Hier ist ein Text auf Deutsch'
        assert language_filter(german_text, '', 'de', SAMPLE_META)[0] is False
        assert language_filter(german_text, '', 'en', SAMPLE_META)[0] is True
        # comments
        assert language_filter('Hier ist ein Text.', 'Die Kommentare sind aber etwas länger.', 'de', SAMPLE_META)[0] is False
        # lang detection on the content
        english_doc = html.fromstring('<html><body><article><p>How many ages hence/Shall this our lofty scene be acted over,/In states unborn and accents yet unknown!</p></article></body></html>')
        assert extract(english_doc, config=ZERO_CONFIG, target_language='de') is None
        assert extract(english_doc, config=ZERO_CONFIG, target_language='en') is not None
    else:
        # no detection
        assert language_filter('Hier ist ein Text.', '', 'en', SAMPLE_META)[0] is False

    # test URL blacklist
    blacklisted_page = '<html><head><link rel="canonical" href="https://example.org"/></head><body></body></html>'
    assert extract(blacklisted_page, output_format='xml', url_blacklist={'https://example.org'}) is None

    ## recursion limit
    plain_p = '<p>abc</p>'
    assert extract(build_page(plain_p, 50), max_tree_size=500) is not None
    assert extract(build_page(plain_p, 501), max_tree_size=500) is None
    formatted_p = '<p><hi rend="#i">abc</hi></p>'
    assert extract(build_page(formatted_p, 501), include_formatting=True, max_tree_size=500) is None
    assert extract(build_page(formatted_p, 499), include_formatting=True, max_tree_size=500) is not None

    ## deduplication
    repeated_doc = build_page(formatted_p, 50)
    trafilatura.filters.LRU_TEST = LRUCache(maxsize=2)
    for _ in range(3):
        assert extract(repeated_doc, deduplicate=True) is not None
    assert extract(repeated_doc, deduplicate=True) is None

    # paragraph level
    trafilatura.filters.LRU_TEST = LRUCache(maxsize=2)
    long_p = etree.fromstring('<p>' + 'abc'*50 + '</p>')
    options = DEFAULT_OPTIONS
    options.dedup = True
    for _ in range(3):
        assert trafilatura.htmlprocessing.process_node(long_p, options) is not None
    assert trafilatura.htmlprocessing.process_node(long_p, options) is None

    # HTML lang filter
    # no lang
    assert check_html_lang(html.fromstring('<html><body></body></html>'), target_language='en') is True
    # text + lang
    english_p = '<p>In sleep a king, but waking no such matter.</p>'
    if LANGID_FLAG is True:
        assert extract(build_page(english_p, 50, ' lang="en-US"'), no_fallback=True, target_language='en') is not None
        assert extract(build_page(english_p, 50, ' lang="en-US"'), no_fallback=True, target_language='de') is None
        # caught
        assert extract(build_page(english_p, 50, ' lang="de-DE"'), no_fallback=False, target_language='de') is None
    else:
        # not caught, HTML tag used
        assert extract(build_page(english_p, 50, ' lang="de-DE"'), no_fallback=False, target_language='de') is not None

    # language declarations in the markup: (markup, keyword arguments, expected result)
    multi_lang = '<html lang="de_DE, en_US"><body></body></html>'
    meta_en = '<html><head><meta http-equiv="content-language" content="en"></head><body></body></html>'
    meta_de_upper = '<html><head><meta http-equiv="content-language" content="DE"></head><body></body></html>'
    og_locale_de = '<html lang="en-US"><head><meta property="og:locale" content="de_DE" /></head><body></body></html>'
    lang_en = '<html lang="en"><body></body></html>'
    lang_cases = [
        (multi_lang, {'target_language': 'de'}, True),
        (multi_lang, {'target_language': 'en'}, True),
        (multi_lang, {'target_language': 'de', 'strict': True}, True),
        (multi_lang, {'target_language': 'en', 'strict': True}, True),
        (meta_en, {'target_language': 'en'}, True),
        (meta_en, {'target_language': 'de'}, False),
        (meta_de_upper, {'target_language': 'de'}, True),
        # html lang attribute superseded by og:locale
        (og_locale_de, {'target_language': 'de'}, True),
        (og_locale_de, {'target_language': 'en'}, False),
        (lang_en, {'target_language': 'it', 'strict': True}, False),
        (lang_en, {'target_language': 'it', 'strict': False}, True),
        (og_locale_de, {'target_language': 'de', 'strict': False}, True),
        (og_locale_de, {'target_language': 'de', 'strict': True}, True),
    ]
    for markup, kwargs, expected in lang_cases:
        # a fresh tree is parsed for every check
        assert check_html_lang(html.fromstring(markup), **kwargs) is expected


def test_lrucache():
    """Check duplicate detection against a two-slot LRU cache."""
    cache = LRUCache(maxsize=2)
    trafilatura.filters.LRU_TEST = cache

    def seen(element):
        # shorthand for the duplicate check with the default configuration
        return duplicate_test(element, DEFAULT_CONFIG)

    ### cached element
    first_body = etree.Element('body')
    first_par = html.fromstring('<p>AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB</p>')
    first_body.append(first_par)
    for _ in range(2):
        assert seen(first_par) is False
    assert seen(first_body) is False
    assert seen(first_par) is True

    second_body = etree.Element('body')
    second_par = html.fromstring('<p>CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD</p>')
    second_body.append(second_par)
    assert seen(second_body) is False
    assert seen(second_par) is False
    assert seen(second_body) is False
    assert seen(second_par) is True

    third_body = etree.Element('body')
    third_par = html.fromstring('<p>EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF</p>')
    third_body.append(third_par)
    for _ in range(3):
        assert seen(third_body) is False

    # 2 elements in cache, original element has been cleared?
    assert seen(second_par) is True
    assert seen(third_par) is True
    assert seen(first_par) is False

    # clear the cache
    cache.clear()
    assert seen(second_par) is False
    # get wrong key
    assert cache.get('tralala') == -1


if __name__ == '__main__':
    # run the tests of this module in order when executed as a script
    for test_function in (test_filters, test_lrucache):
        test_function()
50 changes: 50 additions & 0 deletions tests/hashing_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@


from trafilatura.hashing import Simhash, content_fingerprint, generate_hash_filename



def test_hashes():
    """Check the content fingerprint and the hash-based file name of a sample text."""
    sample = "abcde ijk l, " * 10
    expected_fingerprint = "528497a1d07b66d6"
    expected_filename = "42LNugG3Sc95646i"
    assert content_fingerprint(sample) == expected_fingerprint
    assert generate_hash_filename(sample) == expected_filename



def test_simhash():
    """Exercise Simhash creation, reuse of existing hash values and similarity scores."""
    # https://en.wiktionary.org/wiki/put_lipstick_on_a_pig
    factor = 1
    sentences = (
        "This is like putting lipstick on a pig.",
        "This is just like putting lipstick on a pig.",
        "Putting lipstick on a pig is what this is about.",
        "The words are completely different but let's see.",
    )
    hashes = [Simhash(sentence*factor) for sentence in sentences]
    reference = hashes[0]

    # the first sentence matches itself, the last one is the least similar
    sims = [reference.similarity(candidate) for candidate in hashes]
    assert sims[0] == 1.0
    assert min(sims) == sims[-1]

    # sanity checks
    assert Simhash(existing_hash=reference.to_hex()).hash == reference.hash
    assert int(hex(reference.hash)[2:], 16) == reference.hash
    assert Simhash(existing_hash=reference.to_hex()).hash == reference.hash

    # re-hashed
    max_hash = 18446744073709551615  # 2**64 - 1
    assert Simhash(existing_hash="aghj").hash == max_hash
    assert Simhash(existing_hash="18446744073709551615").hash == max_hash
    assert Simhash(existing_hash=123).hash != 123
    assert Simhash(existing_hash=max_hash).hash == max_hash
    assert Simhash(existing_hash=None).hash == Simhash().hash

    # similarity
    assert Simhash("abcde").similarity(Simhash("abcde")) == 1.0
    assert Simhash("abcde").similarity(Simhash("abcde", length=2)) != 1.0
    assert Simhash("abcde").similarity(Simhash("fghij")) < 0.6
    assert Simhash("abcde "*100).similarity(Simhash("abcde")) == 1.0



if __name__ == "__main__":
    # run the tests of this module in order when executed as a script
    for test_function in (test_hashes, test_simhash):
        test_function()

0 comments on commit 56421cc

Please sign in to comment.