Support OMW 1.4 (#2899)
* Support OMW 1.4

* Adapt license, citation and readme functions

* Add 'lang' parameter to all_synsets()

* Create helper method for Synset.definition and Synset.examples

* Update wordnet doctests to support OMW 1.4

Co-authored-by: Tom Aarsen <>
ekaf committed Dec 8, 2021
1 parent ba989e5 commit 8ed8b70
Showing 2 changed files with 137 additions and 85 deletions.
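
For orientation, a minimal usage sketch of the API surface this commit touches (an illustration only, assuming the OMW 1.4 data is installed, e.g. via nltk.download('omw-1.4')):

    from nltk.corpus import wordnet as wn

    wn.synset('dog.n.01').definition()            # English gloss, as before
    wn.synset('dog.n.01').definition(lang='ita')  # Italian gloss, or None if absent
    wn.synset('dog.n.01').examples(lang='jpn')    # Japanese examples, or None
    list(wn.all_synsets(lang='jpn'))              # synsets with Japanese entries
    wn.citation(lang='jpn')                       # per-language citation.bib
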
205 changes: 128 additions & 77 deletions nltk/corpus/reader/wordnet.py
@@ -443,11 +443,29 @@ def name(self):
def frame_ids(self):
return self._frame_ids

def definition(self):
return self._definition
def _doc(self, doc_type, default, lang="eng"):
"""Helper method for Synset.definition and Synset.examples"""
corpus = self._wordnet_corpus_reader
if lang not in corpus.langs():
return None
elif lang == "eng":
return default
else:
corpus._load_lang_data(lang)
of = corpus.ss2of(self)
i = corpus.lg_attrs.index(doc_type)
if of in corpus._lang_data[lang][i].keys():
return corpus._lang_data[lang][i][of]
else:
return None

def definition(self, lang="eng"):
"""Return definition in specified language"""
return self._doc("def", self._definition, lang=lang)

def examples(self):
return self._examples
def examples(self, lang="eng"):
"""Return examples in specified language"""
return self._doc("exe", self._examples, lang=lang)

def lexname(self):
return self._lexname
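
The helper above gives both accessors a uniform fallback: English requests return the preloaded attribute, other supported languages are looked up in the per-language tables, and anything else yields None. A sketch of the resulting behaviour (outputs depend on the installed OMW data, so none are claimed here):

    ss = wn.synset('spy.n.01')
    ss.definition()            # English default, unchanged behaviour
    ss.definition(lang='jpn')  # OMW 1.4 'def' entry for this offset, or None
    ss.examples(lang='jpn')    # OMW 1.4 'exe' entries, or None
    ss.definition(lang='xxx')  # unsupported language code -> None
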
@@ -1132,10 +1150,6 @@ def __init__(self, root, omw_reader):
Construct a new wordnet corpus reader, with the given root
directory.
"""
if omw_reader is None:
warnings.warn(
"The multilingual functions are not available with this Wordnet version"
)

super().__init__(root, self._FILES, encoding=self._ENCODING)

@@ -1154,6 +1168,13 @@ def __init__(self, root, omw_reader):
# Corpus reader containing omw data.
self._omw_reader = omw_reader

if self._omw_reader is None:
warnings.warn(
"The multilingual functions are not available with this Wordnet version"
)
else:
self.provenances = self.omw_prov()

# A cache to store the wordnet data of multiple languages
self._lang_data = defaultdict(list)

@@ -1179,20 +1200,22 @@ def __init__(self, root, omw_reader):
# map from WordNet 3.0 for OMW data
self.map30 = self.map_wn30()

# Language data attributes
self.lg_attrs = ["lemma", "none", "def", "exe"]

def corpus2sk(self, corpus=None):
"""Read sense key to synset id mapping,
from index.sense file in corpus directory"""
fn = "index.sense"
if corpus:
fn = os.path.join(os.pardir, corpus, fn)
fp = self.open(fn)
sk_map = {}
for line in fp:
items = line.strip().split(" ")
sk = items[0]
pos = self._pos_names[int(sk.split("%")[1].split(":")[0])]
sk_map[sk] = f"{items[1]}-{pos}"
fp.close()
with self.open(fn) as fp:
sk_map = {}
for line in fp:
items = line.strip().split(" ")
sk = items[0]
pos = self._pos_names[int(sk.split("%")[1].split(":")[0])]
sk_map[sk] = f"{items[1]}-{pos}"
return sk_map
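
Each index.sense line pairs a sense key with its synset offset, and the part of speech is recovered from the ss_type digit embedded in the key. An illustrative line (field values hypothetical):

    line = 'abandon%2:40:01:: 02228031 1 10'  # sense_key offset sense_number tag_cnt
    sk = line.split(' ')[0]
    ss_type = int(sk.split('%')[1].split(':')[0])  # 2 is the verb ss_type code
    # corpus2sk would record: sk_map['abandon%2:40:01::'] == '02228031-v'
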

def map_wn30(self):
@@ -1250,20 +1273,31 @@ def _load_lang_data(self, lang):
if lang not in self.langs():
raise WordNetError("Language is not supported.")

with self._omw_reader.open("{0:}/wn-data-{0:}.tab".format(lang)) as fp:
with self._omw_reader.open(
f"{self.provenances[lang]}/wn-data-{lang.split('_')[0]}.tab"
) as fp:
self.custom_lemmas(fp, lang)

def langs(self):
"""return a list of languages supported by Multilingual Wordnet"""

langs = ["eng"]
def omw_prov(self):
"""Return a provenance dictionary of the languages in Multilingual Wordnet"""
provdict = {}
provdict["eng"] = ""
fileids = self._omw_reader.fileids()
for fileid in fileids:
file_name, file_extension = os.path.splitext(fileid)
prov, langfile = os.path.split(fileid)
file_name, file_extension = os.path.splitext(langfile)
if file_extension == ".tab":
langs.append(file_name.split("-")[-1])
lang = file_name.split("-")[-1]
if lang in provdict.keys():
# We already have another resource for this lang,
# so we need to further specify the lang id:
lang = f"{lang}_{prov}"
provdict[lang] = prov
return provdict

return langs
def langs(self):
"""return a list of languages supported by Multilingual Wordnet"""
return self.provenances.keys()
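
Since OMW 1.4 can ship the same language from more than one project, the provenance directory doubles as a disambiguator: the first resource keeps the bare ISO code, any later one gets a suffixed id. A sketch of the resulting mapping, with hypothetical directory names (cf. 'ita' and 'ita_iwn' in the doctest changes below):

    # provdict after scanning two Italian resources (directory names hypothetical):
    # {'eng': '', ..., 'ita': 'mwn', 'ita_iwn': 'iwn', ...}
    # langs() returns the keys, so both Italian wordnets stay addressable.
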

def _load_lemma_pos_offset_map(self):
for suffix in self._FILEMAP.values():
@@ -1732,11 +1766,32 @@ def all_lemma_names(self, pos=None, lang="eng"):
lemma = iter(set(lemma))
return lemma

def all_synsets(self, pos=None):
def all_omw_synsets(self, pos=None, lang=None):
if lang not in self.langs():
return None
self._load_lang_data(lang)
for of in self._lang_data[lang][0].keys():
try:
ss = self.of2ss(of)
yield ss
except:
# A few OMW offsets don't exist in Wordnet 3.0.
# Additionally, when mapped to later Wordnets,
# increasing numbers of synsets are lost in the mapping.
# warnings.warn(f"Language {lang}: no synset found for {of}")
pass

def all_synsets(self, pos=None, lang="eng"):
"""Iterate over all synsets with a given part of speech tag.
If no pos is specified, all synsets for all parts of speech
will be loaded.
"""
if lang == "eng":
return self.all_eng_synsets(pos=pos)
else:
return self.all_omw_synsets(pos=pos, lang=lang)

def all_eng_synsets(self, pos=None):
if pos is None:
pos_tags = self._FILEMAP.keys()
else:
@@ -1794,59 +1849,38 @@ def words(self, lang="eng"):
"""return lemmas of the given language as list of words"""
return self.all_lemma_names(lang=lang)

def doc(self, file="README", lang="eng"):
"""Return the contents of readme, license or citation file
use lang=lang to get the file for an individual language"""
if lang == "eng":
reader = self
else:
reader = self._omw_reader
if lang in self.langs():
file = f"{os.path.join(self.provenances[lang],file)}"
try:
with reader.open(file) as fp:
return fp.read()
except:
if lang in self._lang_data:
return f"Cannot determine {file} for {lang}"
else:
return f"Language {lang} is not supported."

def license(self, lang="eng"):
"""Return the contents of LICENSE (for omw)
use lang=lang to get the license for an individual language"""
if lang == "eng":
with self.open("LICENSE") as fp:
return fp.read()
elif lang in self.langs():
with self._omw_reader.open(f"{lang}/LICENSE") as fp:
return fp.read()
elif lang == "omw":
# under the assumption you don't mean Omwunra-Toqura
with self._omw_reader.open("LICENSE") as fp:
return fp.read()
elif lang in self._lang_data:
raise WordNetError("Cannot determine license for user-provided tab file")
else:
raise WordNetError("Language is not supported.")
return self.doc(file="LICENSE", lang=lang)

def readme(self, lang="omw"):
def readme(self, lang="eng"):
"""Return the contents of README (for omw)
use lang=lang to get the readme for an individual language"""
if lang == "eng":
with self.open("README") as fp:
return fp.read()
elif lang in self.langs():
with self._omw_reader.open(f"{lang}/README") as fp:
return fp.read()
elif lang == "omw":
# under the assumption you don't mean Omwunra-Toqura
with self._omw_reader.open("README") as fp:
return fp.read()
elif lang in self._lang_data:
raise WordNetError("No README for user-provided tab file")
else:
raise WordNetError("Language is not supported.")
return self.doc(file="README", lang=lang)

def citation(self, lang="omw"):
def citation(self, lang="eng"):
"""Return the contents of citation.bib file (for omw)
use lang=lang to get the citation for an individual language"""
if lang == "eng":
with self.open("citation.bib") as fp:
return fp.read()
elif lang in self.langs():
with self._omw_reader.open(f"{lang}/citation.bib") as fp:
return fp.read()
elif lang == "omw":
# under the assumption you don't mean Omwunra-Toqura
with self._omw_reader.open("citation.bib") as fp:
return fp.read()
elif lang in self._lang_data:
raise WordNetError("citation not known for user-provided tab file")
else:
raise WordNetError("Language is not supported.")
return self.doc(file="citation.bib", lang=lang)

#############################################################
# Misc
Expand Down Expand Up @@ -2088,17 +2122,26 @@ def custom_lemmas(self, tab_file, lang):
:type: lang str
:param: lang ISO 639-3 code of the language of the tab file
"""
if len(lang) != 3:
lg = lang.split("_")[0]
if len(lg) != 3:
raise ValueError("lang should be a (3 character) ISO 639-3 code")
self._lang_data[lang] = [defaultdict(list), defaultdict(list)]
self._lang_data[lang] = [
defaultdict(list),
defaultdict(list),
defaultdict(list),
defaultdict(list),
]
for line in tab_file.readlines():
if isinstance(line, bytes):
# Support byte-stream files (e.g. as returned by Python 2's
# open() function) as well as text-stream ones
line = line.decode("utf-8")
if not line.startswith("#"):
offset_pos, lemma_type, lemma = line.strip().split("\t")
lemma = lemma.strip().replace(" ", "_")
triple = line.strip().split("\t")
if len(triple) < 3:
continue
offset_pos, label = triple[:2]
val = triple[-1]
if self.map30:
if offset_pos in self.map30.keys():
# Map offset_pos to current Wordnet version:
@@ -2107,11 +2150,19 @@ def custom_lemmas(self, tab_file, lang):
# Synsets with no mapping keep their Wordnet 3.0 offset
# warnings.warn(f"No map for {offset_pos}, {lang}: {lemma}")
pass
self._lang_data[lang][0][offset_pos].append(lemma)
self._lang_data[lang][1][lemma.lower()].append(offset_pos)
pair = label.split(":")
attr = pair[-1]
if len(pair) == 1 or pair[0] == lg:
if attr == "lemma":
val = val.strip().replace(" ", "_")
self._lang_data[lang][1][val.lower()].append(offset_pos)
if attr in self.lg_attrs:
self._lang_data[lang][self.lg_attrs.index(attr)][
offset_pos
].append(val)
# Make sure no more entries are accidentally added subsequently
self._lang_data[lang][0].default_factory = None
self._lang_data[lang][1].default_factory = None
for n in range(len(self.lg_attrs)):
self._lang_data[lang][n].default_factory = None
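
For reference, OMW 1.4 tab files carry one attribute per line, keyed by synset offset, and custom_lemmas() routes each recognized attribute into the matching lg_attrs slot. Illustrative tab-separated lines (content hypothetical):

    # 02084071-n    ita:lemma    cane
    # 02084071-n    ita:def      <Italian gloss of dog.n.01>
    # 02084071-n    ita:exe      <Italian example sentence>
    # 'lemma' entries also populate the reverse index _lang_data[lang][1],
    # which backs lemma-to-synset lookups such as wn.lemmas('cane', lang='ita').
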

######################################################################
# Visualize WordNet relation graphs using Graphviz
17 changes: 9 additions & 8 deletions nltk/test/wordnet.doctest
@@ -48,19 +48,20 @@ The WordNet corpus reader gives access to the Open Multilingual
WordNet, using ISO-639 language codes.

>>> sorted(wn.langs())
['als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eng', 'eus', 'fas',
'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'ita', 'jpn', 'nld', 'nno',
'nob', 'pol', 'por', 'qcn', 'slv', 'spa', 'swe', 'tha', 'zsm']
['als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eng', 'eus',
'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'isl', 'ita', 'ita_iwn',
'jpn', 'lit', 'nld', 'nno', 'nob', 'pol', 'por', 'ron', 'slk',
'slv', 'spa', 'swe', 'tha', 'zsm']
>>> wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn')
[Synset('dog.n.01'), Synset('spy.n.01')]

>>> wn.synset('spy.n.01').lemma_names('jpn')
['いぬ', 'スパイ', '回者', '回し者', '密偵', '工作員',
'廻者', '廻し者', '探', '探り', '犬', '秘密捜査員',
'まわし者', '諜報員', '諜者', '間者', '間諜', '隠密']
['いぬ', 'まわし者', 'スパイ', '回し者', '回者', '密偵',
'工作員', '廻し者', '廻者', '探', '探り', '犬', '秘密捜査員',
'諜報員', '諜者', '間者', '間諜', '隠密']

>>> wn.synset('dog.n.01').lemma_names('ita')
['cane', 'Canis_familiaris']
['Canis_familiaris', 'cane']
>>> wn.lemmas('cane', lang='ita')
[Lemma('dog.n.01.cane'), Lemma('cramp.n.02.cane'), Lemma('hammer.n.01.cane'), Lemma('bad_person.n.01.cane'),
Lemma('incompetent.n.01.cane')]
@@ -77,7 +78,7 @@ WordNet, using ISO-639 language codes.
>>> dog_lemma.lang()
'por'
>>> len(list(wordnet.all_lemma_names(pos='n', lang='jpn')))
64797
66031

-------
Synsets
