Skip to content

Commit

Permalink
fix: [language] crawled items, force gcld3 detection
Browse files Browse the repository at this point in the history
  • Loading branch information
Terrtia committed Feb 5, 2024
1 parent 99fedf9 commit aa56e71
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 5 deletions.
4 changes: 2 additions & 2 deletions bin/lib/Language.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,9 +357,9 @@ def detect_libretranslate(self, content):
languages.append(language)
return languages

- def detect(self, content):
+ def detect(self, content, force_gcld3=False):
# gcld3
- if len(content) >= 200 or not self.lt:
+ if len(content) >= 200 or not self.lt or force_gcld3:
language = self.detect_gcld3(content)
# libretranslate
else:
Expand Down
4 changes: 2 additions & 2 deletions bin/lib/objects/Items.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,9 +339,9 @@ def get_meta_lines(self, content=None):
return {'nb': nb_line, 'max_length': max_length}

# TODO RENAME ME
- def get_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
+ def get_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7, force_gcld3=False):
ld = LanguagesDetector(nb_langs=num_langs, min_proportion=min_proportion, min_probability=min_probability, min_len=min_len)
- return ld.detect(self.get_content())
+ return ld.detect(self.get_content(), force_gcld3=force_gcld3)

def get_mimetype(self, content=None):
if not content:
Expand Down
2 changes: 1 addition & 1 deletion bin/modules/Languages.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def compute(self, message):
if obj.type == 'item':
if obj.is_crawled():
domain = Domain(obj.get_domain())
- for lang in obj.get_languages(min_probability=0.8):
+ for lang in obj.get_languages(min_probability=0.8, force_gcld3=True):
print(lang)
domain.add_language(lang)

Expand Down

0 comments on commit aa56e71

Please sign in to comment.