diff --git a/.gitmodules b/.gitmodules
index d1337b3..31495b9 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "spacy-dev-resources"]
 	path = spacy-dev-resources
-	url = git@github.com:howl-anderson/spacy-dev-resources.git
+	url = https://github.com/howl-anderson/spacy-dev-resources.git
 [submodule "third-part/brown-cluster"]
 	path = third-part/brown-cluster
-	url = https://github.com/percyliang/brown-cluster.git
+	url = https://github.com/howl-anderson/brown-cluster.git
diff --git a/all_in_one.bash b/all_in_one.bash
new file mode 100644
index 0000000..5b22596
--- /dev/null
+++ b/all_in_one.bash
@@ -0,0 +1,17 @@
+./create_wikipedia_corpus.bash
+./move_wikipedia_corpus.bash
+./compute_words_freq.bash
+./merge_all_text_files.bash
+./download_and_compile_brown_cluster.bash
+./compute_plain_word_vec.bash
+./create_init_model.bash
+./update_model_meta.py
+./download_UD_Chinese-GSD_corpus.bash
+./extract_UD_Chinese-GSD_corpus.bash
+./convert_UD_Chinese-GSD_corpus.bash
+./format_convertor.bash
+./init_model.bash
+./train_model.bash
+./onto_to_spacy_json.bash
+./train_ner.bash
+./merge_submodel.py
diff --git a/compute_plain_word_vec.bash b/compute_plain_word_vec.bash
index 7483360..63dfcbf 100755
--- a/compute_plain_word_vec.bash
+++ b/compute_plain_word_vec.bash
@@ -3,4 +3,4 @@
 cpu_count=`nproc --all`
 process_count=$(expr $cpu_count - 1)
 
-python ./spacy-dev-resources/training/plain_word_vectors.py -i 200 -n ${process_count} ../chinese-wikipedia-corpus-creator/token_cleaned_plain_files WORDS_VECS.txt
+python ./spacy-dev-resources/training/plain_word_vectors.py -i 200 -n ${process_count} ./WORDS.txt WORDS_VECS.txt
diff --git a/create_init_model.bash b/create_init_model.bash
index 4087d32..ed0ca62 100755
--- a/create_init_model.bash
+++ b/create_init_model.bash
@@ -1,3 +1,3 @@
 #!/bin/bash
 
-python -m spacy init-model -c ./WORDS-c1000-p1.out/paths -v WORDS_VECS.txt zh zh_wiki_core WORDS_FREQ.txt
+python -m spacy init-model zh spacy_models/base_model --jsonl-loc ./spacy_corpus.jsonl --vectors-loc WORDS_VECS.txt --vectors-name zh_core_web_sm.vectors
diff --git a/create_jsonl_corpus.bash b/create_jsonl_corpus.bash
new file mode 100755
index 0000000..d50b4c3
--- /dev/null
+++ b/create_jsonl_corpus.bash
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+python ./create_jsonl_vocabulary.py zh spacy_corpus.jsonl WORDS_FREQ.txt ./WORDS-c1000-p1.out/paths
diff --git a/create_jsonl_vocabulary.py b/create_jsonl_vocabulary.py
new file mode 100644
index 0000000..4ad5b9f
--- /dev/null
+++ b/create_jsonl_vocabulary.py
@@ -0,0 +1,269 @@
+import json
+import math
+import string
+from ast import literal_eval
+from pathlib import Path
+
+import ftfy
+import jsonlines
+import plac
+import validators
+from preshed.counter import PreshCounter
+from spacy.lang.en import stop_words as en_stop_words
+from spacy.lang.zh import stop_words as zh_stop_words
+from tqdm import tqdm
+
+
+class Word:
+    counter = -1
+
+    def __init__(self, word_str, cluster, probs):
+        self._word = word_str
+        self._cluster = cluster
+        self._probs = probs
+
+        chinese_punct = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
+        self._punct_list = list(set(string.punctuation + chinese_punct))
+
+        chinese_whitespace = "　"
+        self._whitespace_list = list(set(string.whitespace + chinese_whitespace))
+
+        english_stopword = en_stop_words.STOP_WORDS
+        chinese_stopword = zh_stop_words.STOP_WORDS
+        self._stopword_list = {*english_stopword, *chinese_stopword}
+
+        chinese_quote = "“”‘’"
+        english_quote = "\"'"
+        self._quote_list = list(set(english_quote + chinese_quote))
+
+        chinese_left_punct = "<([{"
+        english_left_punct = "<([「『【〔〖〘〚{"
+        self._left_punct_list = list(set(english_left_punct + chinese_left_punct))
+
+        chinese_right_punct = ">)]}"
+        english_right_punct = ">)]」』】〕〗〙〛}"
+        self._right_punct_list = list(set(english_right_punct + chinese_right_punct))
+
+    @property
+    def orth(self):
+        return self._word
+
+    @property
+    def id(self):
+        self.__class__.counter += 1
+
+        return self.__class__.counter
+
+    @property
+    def lower(self):
+        return self._word.lower()
+
+    @property
+    def norm(self):
+        return self._word
+
+    @property
+    def shape(self):
+        return "".join(map(lambda x: "X" if x.isupper() else "x", self._word))
+
+    @property
+    def prefix(self):
+        return self._word[0]
+
+    @property
+    def suffix(self):
+        return self._word[-1]
+
+    @property
+    def length(self):
+        return len(self._word)
+
+    @property
+    def cluster(self):
+        return self._cluster
+
+    @property
+    def prob(self):
+        return self._probs.get(self._word, 0)
+
+    @property
+    def is_alpha(self):
+        return self._word.isalpha()
+
+    @property
+    def is_ascii(self):
+        # str.isascii() is only available on Python 3.7+
+        # return self._word.isascii()
+        try:
+            self._word.encode('ascii')
+        except UnicodeEncodeError:
+            return False
+
+        return True
+
+    @property
+    def is_digit(self):
+        return self._word.isdigit()
+
+    @property
+    def is_lower(self):
+        return self._word.islower()
+
+    @property
+    def is_punct(self):
+        return self._word in self._punct_list
+
+    @property
+    def is_space(self):
+        return self._word in self._whitespace_list
+
+    @property
+    def is_title(self):
+        return self._word.istitle()
+
+    @property
+    def is_upper(self):
+        return self._word.isupper()
+
+    @property
+    def like_url(self):
+        return bool(validators.url(self._word))
+
+    @property
+    def like_num(self):
+        # TODO(howl-anderson): fix it later
+        return False
+
+    @property
+    def like_email(self):
+        return bool(validators.email(self._word))
+
+    @property
+    def is_stop(self):
+        return self._word in self._stopword_list
+
+    @property
+    def is_oov(self):
+        return self._word not in self._probs
+
+    @property
+    def is_quote(self):
+        return self._word in self._quote_list
+
+    @property
+    def is_left_punct(self):
+        return self._word in self._left_punct_list
+
+    @property
+    def is_right_punct(self):
+        return self._word in self._right_punct_list
+
+
+def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
+    print("Counting frequencies...")
+    counts = PreshCounter()
+    total = 0
+    with freqs_loc.open() as f:
+        for i, line in enumerate(f):
+            freq, doc_freq, key = line.rstrip().split("\t", 2)
+            freq = int(freq)
+            counts.inc(i + 1, freq)
+            total += freq
+    counts.smooth()
+    log_total = math.log(total)
+    probs = {}
+    with freqs_loc.open() as f:
+        for line in tqdm(f):
+            freq, doc_freq, key = line.rstrip().split("\t", 2)
+            doc_freq = int(doc_freq)
+            freq = int(freq)
+            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
+                word = literal_eval(key)
+                smooth_count = counts.smoother(int(freq))
+                probs[word] = math.log(smooth_count) - log_total
+    oov_prob = math.log(counts.smoother(0)) - log_total
+    return probs, oov_prob
+
+
+def read_clusters(clusters_loc):
+    print("Reading clusters...")
+    clusters = {}
+    with clusters_loc.open() as f:
+        for line in tqdm(f):
+            try:
+                cluster, word, freq = line.split()
+                word = ftfy.fix_text(word)
+            except ValueError:
+                continue
+            # If the clusterer has only seen the word a few times, its
+            # cluster is unreliable.
+            if int(freq) >= 3:
+                clusters[word] = cluster
+            else:
+                clusters[word] = "0"
+    # Expand clusters with re-casing
+    for word, cluster in list(clusters.items()):
+        if word.lower() not in clusters:
+            clusters[word.lower()] = cluster
+        if word.title() not in clusters:
+            clusters[word.title()] = cluster
+        if word.upper() not in clusters:
+            clusters[word.upper()] = cluster
+    return clusters
+
+
+@plac.annotations(
+    lang=("model language", "positional", None, str),
+    output_loc=("model output directory", "positional", None, str),
+    freqs_loc=("location of words frequencies file", "positional", None, Path),
+    clusters_loc=("location of brown clusters data", "positional", None, Path),
+)
+def main(lang, output_loc, freqs_loc, clusters_loc):
+    clusters = read_clusters(clusters_loc)
+    probs, oov_prob = read_freqs(freqs_loc)
+
+    with jsonlines.open(output_loc, mode="w") as writer:
+        header = {"lang": lang, "settings": {"oov_prob": oov_prob}}
+
+        writer.write(header)
+
+        for word_str, cluster in clusters.items():
+
+            if not word_str:
+                continue
+
+            word = Word(word_str, cluster, probs)
+            row = {
+                "orth": word.orth,  # the word text
+                "id": word.id,  # can correspond to row in vectors table
+                "lower": word.lower,
+                "norm": word.norm,
+                "shape": word.shape,
+                "prefix": word.prefix,
+                "suffix": word.suffix,
+                "length": word.length,
+                "cluster": word.cluster,
+                "prob": word.prob,
+                "is_alpha": word.is_alpha,
+                "is_ascii": word.is_ascii,
+                "is_digit": word.is_digit,
+                "is_lower": word.is_lower,
+                "is_punct": word.is_punct,
+                "is_space": word.is_space,
+                "is_title": word.is_title,
+                "is_upper": word.is_upper,
+                "like_url": word.like_url,
+                "like_num": word.like_num,
+                "like_email": word.like_email,
+                "is_stop": word.is_stop,
+                "is_oov": word.is_oov,
+                "is_quote": word.is_quote,
+                "is_left_punct": word.is_left_punct,
+                "is_right_punct": word.is_right_punct,
+            }
+
+            writer.write(row)
+
+
+if __name__ == "__main__":
+    plac.call(main)
diff --git a/create_model_package.bash b/create_model_package.bash
new file mode 100755
index 0000000..6d81fdd
--- /dev/null
+++ b/create_model_package.bash
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+python -m spacy package spacy_models/final_model spacy_models/model_package --force
+
+cd spacy_models/model_package/zh_core_web_sm-0.1.0
+python ./setup.py sdist
+
diff --git a/create_wikipedia_corpus.bash b/create_wikipedia_corpus.bash
new file mode 100644
index 0000000..beaf722
--- /dev/null
+++ b/create_wikipedia_corpus.bash
@@ -0,0 +1,2 @@
+cd chinese-wikipedia-corpus-creator
+bash ./allinone_process.bash
diff --git a/merge_all_text_files.py b/merge_all_text_files.py
index 4943dfa..0e6e24e 100644
--- a/merge_all_text_files.py
+++ b/merge_all_text_files.py
@@ -9,10 +9,10 @@
 
 output_path = pathlib.Path(output_file)
 
-with output_path.open('wt') as outfile:
+with output_path.open("wt") as outfile:
     for fname in input_files:
-        with fname.open('rt') as infile:
+        with fname.open("rt") as infile:
            for line in infile:
-                if not line.endswith('\n'):
-                    line = line + '\n'
+                if not line.endswith("\n"):
+                    line = line + "\n"
                 outfile.write(line)
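Side note (not part of the patch): the JSONL vocabulary written by create_jsonl_vocabulary.py above is what create_init_model.bash feeds to `spacy init-model` via `--jsonl-loc`. A minimal sketch, assuming the file was generated as `spacy_corpus.jsonl` in the working directory, of how the output can be sanity-checked; the keys used here are exactly the ones written by the script:

    # Illustrative check of spacy_corpus.jsonl (file name taken from
    # create_jsonl_corpus.bash / create_init_model.bash above).
    import json

    with open("spacy_corpus.jsonl", encoding="utf-8") as fd:
        header = json.loads(fd.readline())     # {"lang": "zh", "settings": {"oov_prob": ...}}
        first_row = json.loads(fd.readline())  # one record per vocabulary entry

    print(header["lang"], header["settings"]["oov_prob"])
    print(first_row["orth"], first_row["cluster"], first_row["prob"], first_row["is_oov"])
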
diff --git a/merge_submodel.py b/merge_submodel.py
new file mode 100644
index 0000000..01053ed
--- /dev/null
+++ b/merge_submodel.py
@@ -0,0 +1,51 @@
+#!/usr/bin/python3
+
+import shutil
+import json
+from pathlib import Path
+
+
+def read_pipeline(meta_file):
+    with open(meta_file) as fd:
+        data = json.load(fd)
+    return data["pipeline"]
+
+
+def update_pipeline(meta_file, pipeline):
+    with open(meta_file) as fd:
+        data = json.load(fd)
+
+    data["pipeline"] = pipeline
+
+    with open(meta_file, "w") as fd:
+        json.dump(data, fd)
+
+
+def copy_tree(src: Path, dst: Path, folder: str):
+    shutil.copytree(src / folder, dst / folder)
+
+
+def main():
+    target_dir = Path("./spacy_models/final_model")
+    target_dir.mkdir(exist_ok=True)
+
+    pipeline = []
+
+    source_dir = Path("./spacy_models/dependency_model/model-best")
+    copy_tree(source_dir, target_dir, "parser")
+    copy_tree(source_dir, target_dir, "tagger")
+    copy_tree(source_dir, target_dir, "vocab")
+
+    pipeline.extend(read_pipeline(source_dir / "meta.json"))
+
+    source_dir = Path("./spacy_models/ner_model/model-best")
+    copy_tree(source_dir, target_dir, "ner")
+    shutil.copy(source_dir / "meta.json", target_dir / "meta.json")
+
+    pipeline.extend(read_pipeline(source_dir / "meta.json"))
+
+    update_pipeline(target_dir / "meta.json", pipeline)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/meta.json b/meta.json
index 7c175e5..d7765fb 100644
--- a/meta.json
+++ b/meta.json
@@ -1,40 +1,14 @@
-{
-    "lang":"zh",
-    "pipeline":[
-        "tagger",
-        "parser",
-        "ner"
-    ],
-    "accuracy":{
-        "token_acc":99.8698372794,
-        "ents_p":84.9664503965,
-        "ents_r":85.6312524451,
-        "uas":91.7237657538,
-        "tags_acc":97.0403350292,
-        "ents_f":85.2975560875,
-        "las":89.800872413
-    },
-    "name":"core_web_sm",
-    "license":"CC BY-SA 3.0",
-    "author":"Xiaoquan Kong",
-    "url":"https://xiaoquankong.ai",
-    "vectors":{
-        "keys":0,
-        "width":0,
-        "vectors":0
-    },
-    "sources":[
-        "OntoNotes 5",
-        "Common Crawl"
-    ],
-    "version":"2.0.0",
-    "spacy_version":">=2.0.0a18",
-    "parent_package":"spacy",
-    "speed":{
-        "gpu":null,
-        "nwords":291344,
-        "cpu":5122.3040471407
-    },
-    "email":"u1mail2me@gmail.com",
-    "description":"Chinese multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities."
-}
+ {
+    "name": "core_web_sm",
+    "version": "0.1.0",
+    "license": "CC BY-SA 3.0",
+    "author": "Xiaoquan Kong",
+    "url": "https://xiaoquankong.ai",
+    "sources": [
+        "OntoNotes 5",
+        "Common Crawl",
+        "Universal Dependencies"
+    ],
+    "email": "u1mail2me@gmail.com",
+    "description": "Chinese multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities."
+ }
diff --git a/move_wikipedia_corpus.bash b/move_wikipedia_corpus.bash
new file mode 100644
index 0000000..84d4db4
--- /dev/null
+++ b/move_wikipedia_corpus.bash
@@ -0,0 +1 @@
+cp -r chinese-wikipedia-corpus-creator/token_cleaned_plain_files token_cleaned_plain_files
diff --git a/onto_to_spacy_json.bash b/onto_to_spacy_json.bash
index 7777ff9..9c84bfa 100755
--- a/onto_to_spacy_json.bash
+++ b/onto_to_spacy_json.bash
@@ -1,3 +1,3 @@
 #!/usr/bin/env bash
 
-python onto_to_spacy_json.py -i "/home/howl/Datasets/ontonotes-release-5.0/data/files/data/chinese/annotations/" -t "china_ner_train.json" -e "china_ner_eval.json" -v 0.1
+python onto_to_spacy_json.py -i "./ontonotes-release-5.0/data/files/data/chinese/annotations/" -t "china_ner_train.json" -e "china_ner_eval.json" -v 0.05
diff --git a/onto_to_spacy_json.py b/onto_to_spacy_json.py
index 0a4adfa..914a349 100644
--- a/onto_to_spacy_json.py
+++ b/onto_to_spacy_json.py
@@ -22,12 +22,12 @@ def get_root_filename(onto_dir):
 
 
 def split_sentence(text):
-    text = text.strip().split('\n')[1:-1]
+    text = text.strip().split("\n")[1:-1]
     return text
 
 
 def split_doc(text):
-    text_list = text.strip().split('\s\s', t)[0] for t in text_list] text_list = [re.sub('', "", t).strip() for t in text_list] return ids, text_list
@@ -35,7 +35,7 @@ def split_doc(text):
 
 
 def clean_ent(ent):
     tag = re.findall('TYPE="(.+?)"', ent)[0]
-    text = re.findall('>(.+)', ent)[0]
+    text = re.findall(">(.+)", ent)[0]
     text = re.sub("\$", "\$", text)
     return (text, tag)
@@ -78,10 +78,11 @@ def onf_to_raw(onf_file):
     """
     with open(onf_file, "r") as f:
         onf = f.read()
-    sentences = re.findall("Plain sentence\:\n\-+?\n(.+?)Treebanked sentence",
-                           onf, re.DOTALL)
+    sentences = re.findall(
+        "Plain sentence\:\n\-+?\n(.+?)Treebanked sentence", onf, re.DOTALL
+    )
     sentences = [re.sub("\n+?\s*", " ", i).strip() for i in sentences]
-    paragraph = ' '.join(sentences)
+    paragraph = " ".join(sentences)
     return paragraph
@@ -98,16 +99,18 @@ def name_to_sentences(ner_filename):
     for sent in onto_sents:
         offsets = text_to_spacy(sent)
         doc = nlp(offsets[0])
-        tags = biluo_tags_from_offsets(doc, offsets[1]['entities'])
+        tags = biluo_tags_from_offsets(doc, offsets[1]["entities"])
         ner_info = list(zip(doc, tags))
         tokens = []
         for n, i in enumerate(ner_info):
-            token = {"head": 0,
-                     "dep": "",
-                     "tag": "",
-                     "orth": i[0].string,
-                     "ner": i[1],
-                     "id": n}
+            token = {
+                "head": 0,
+                "dep": "",
+                "tag": "",
+                "orth": i[0].string,
+                "ner": i[1],
+                "id": n,
+            }
             tokens.append(token)
         sentences.append({"tokens": tokens})
     return sentences
@@ -124,10 +127,7 @@ def dir_to_annotation(onto_dir):
         try:
             raw = onf_to_raw(onf_filename)
             sentences = name_to_sentences(ner_filename)
-            final = {"id": "fake",
-                     "paragraphs": [
-                         {"raw": raw,
-                          "sentences": sentences}]}
+            final = {"id": "fake", "paragraphs": [{"raw": raw, "sentences": sentences}]}
             all_annotations.append(final)
         except Exception as e:
             print("Error formatting ", fn, e)
@@ -138,7 +138,8 @@
     onto_dir=("Directory of OntoNotes data to traverse", "option", "i", str),
     train_file=("File to write training spaCy JSON out to", "option", "t", str),
     val_file=("File to write validation spaCy JSON out to", "option", "e", str),
-    val_split=("Percentage to use for evaluation", "option", "v", float))
+    val_split=("Percentage to use for evaluation", "option", "v", float),
+)
 def main(onto_dir, train_file, val_file, val_split=0.75):
     print("Reading and formatting annotations")
     all_annotations = dir_to_annotation(onto_dir)
@@ -147,8 +148,11 @@ def main(onto_dir, train_file, val_file, val_split=0.75):
     val = all_annotations[:cutpoint]
     train = all_annotations[cutpoint:]
-    print("Saving {0} training examples and {1} validation examples".format(
-        len(train), len(val)))
+    print(
+        "Saving {0} training examples and {1} validation examples".format(
+            len(train), len(val)
+        )
+    )
     with open(train_file, "w") as f:
         json.dump(train, f, ensure_ascii=False, indent=4)
     with open(val_file, "w") as f:
diff --git a/plain_word_vectors.py b/plain_word_vectors.py
new file mode 100644
index 0000000..1f9759f
--- /dev/null
+++ b/plain_word_vectors.py
@@ -0,0 +1,51 @@
+import plac
+import gensim
+from gensim import utils
+
+
+class Corpus:
+    def __init__(self, corpus_file):
+        self.corpus_file = corpus_file
+
+    def __iter__(self):
+        with open(self.corpus_file) as fd:
+            for line in fd:
+                yield utils.simple_preprocess(line)
+
+
+@plac.annotations(
+    in_dir=("Location of input directory"),
+    out_loc=("Location of output file"),
+    n_workers=("Number of workers", "option", "n", int),
+    size=("Dimension of the word vectors", "option", "d", int),
+    window=("Context window size", "option", "w", int),
+    min_count=("Min count", "option", "m", int),
+    negative=("Number of negative samples", "option", "g", int),
+    nr_iter=("Number of iterations", "option", "i", int),
+)
+def main(
+    in_dir,
+    out_loc,
+    negative=5,
+    n_workers=4,
+    window=5,
+    size=128,
+    min_count=10,
+    nr_iter=2,
+):
+    sentences = Corpus(in_dir)
+    model = gensim.models.Word2Vec(
+        sentences=sentences,
+        size=size,
+        window=window,
+        min_count=min_count,
+        workers=n_workers,
+        sample=1e-5,
+        negative=negative,
+        iter=nr_iter,
+    )
+    model.wv.save_word2vec_format(out_loc, binary=False)
+
+
+if __name__ == "__main__":
+    plac.call(main)
diff --git a/requirements.txt b/requirements.txt
index 3f23f45..d155217 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,5 @@ plac
 spacy
 pandas
 jieba
+ftfy
+validators
diff --git a/spacy-dev-resources b/spacy-dev-resources
deleted file mode 160000
index 4ba4bea..0000000
--- a/spacy-dev-resources
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4ba4bea947c4bb6779066f4ef6a30decaad304e4
diff --git a/spacy-dev-resources b/spacy-dev-resources
new file mode 120000
index 0000000..89e29cd
--- /dev/null
+++ b/spacy-dev-resources
@@ -0,0 +1 @@
+../spacy-dev-resources
\ No newline at end of file
diff --git a/test_as_model_dir.py b/test_as_model_dir.py
new file mode 100755
index 0000000..5b3a5d6
--- /dev/null
+++ b/test_as_model_dir.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+
+from spacy import displacy
+import spacy
+
+nlp = spacy.load("./spacy_models/final_model")
+
+
+def main():
+    doc = nlp("王小明在北京的清华大学读书")
+    for token in doc:
+        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
+              token.shape_, token.is_alpha, token.is_stop, token.has_vector,
+              token.ent_iob_, token.ent_type_,
+              token.vector_norm, token.is_oov)
+
+    # displacy.serve(doc)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/train_model.bash b/train_model.bash
index 5f6f38e..4ccf4de 100755
--- a/train_model.bash
+++ b/train_model.bash
@@ -1,3 +1,3 @@
 #!/bin/bash
 
-python -m spacy train zh depedency_model corpus/spacy/zh-simplified-ud-train.json corpus/spacy/zh-simplified-ud-dev.json -v zh_model -m ./meta.json --no-entities
+python -m spacy train zh spacy_models/dependency_model corpus/spacy/zh-simplified-ud-train.json corpus/spacy/zh-simplified-ud-dev.json --pipeline tagger,parser -v spacy_models/base_model -m meta.json -V 0.1.0 -n 1
diff --git a/train_ner.bash b/train_ner.bash
index f034f39..630857e 100755
--- a/train_ner.bash
+++ b/train_ner.bash
@@ -1,3 +1,3 @@
 #!/bin/bash
 
-python -m spacy train zh ner_model china_ner_train.json china_ner_eval.json --no-tagger --no-parser -verbose True -g 0 --vectors ./zh_model
+python -m spacy train zh spacy_models/ner_model ./china_ner_train.json ./china_ner_eval.json --pipeline ner -m meta.json -v ./spacy_models/dependency_model/model-best -n 1
diff --git a/update_model_meta.py b/update_model_meta.py
new file mode 100644
index 0000000..b43e701
--- /dev/null
+++ b/update_model_meta.py
@@ -0,0 +1,15 @@
+import json
+
+
+def main():
+    with open("./spacy_models/base_model/meta.json") as fd:
+        data = json.load(fd)
+
+    data["name"] = "core_web_sm"
+
+    with open("./spacy_models/base_model/meta.json", "wt") as fd:
+        json.dump(data, fd)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/workflow.md b/workflow.md
index a005de1..17dc154 100644
--- a/workflow.md
+++ b/workflow.md
@@ -2,6 +2,16 @@
 
 ## get preprocessed Chinese Wikipedia corpus
 see project [chinese-wikipedia-corpus-creator](https://github.com/howl-anderson/chinese-wikipedia-corpus-creator) for more details.
+
+### produce Wikipedia corpus
+ * input: -
+ * output: `token_cleaned_plain_files/`
+ * script: `create_wikipedia_corpus.bash`
+
+### copy corpus to workspace
+ * input: `chinese-wikipedia-corpus-creator/token_cleaned_plain_files/`
+ * output: `token_cleaned_plain_files/`
+ * script: `move_wikipedia_corpus.bash`
 
 ## computing word frequency
  * input: `token_cleaned_plain_files/*`
@@ -32,10 +42,18 @@
  * output: `WORDS_VECS.txt`
  * script: `compute_plain_word_vec.bash`
 
-## initial SpaCy model [TODO: may be removed]
+## initial SpaCy model
+
+### build base model
  * input: `./WORDS-c1000-p1.out/paths WORDS_VECS.txt WORDS_FREQ.txt`
- * output: `zh_wiki_core/**/*`
+ * output: `spacy_models/base_model/**/*`
  * script: `create_init_model.bash`
+
+### modify model name
+ * input: `spacy_models/base_model/meta.json`
+ * output: `spacy_models/base_model/meta.json`
+ * script: `update_model_meta.py`
+
 
 ## getting UD_Chinese-GSD corpus
@@ -78,3 +96,8 @@
  * input: `zh_model china_ner_train.json china_ner_eval.json`
  * output: `ner_model`
  * script: `train_ner.bash`
+
+## merge sub-models
+ * input: `spacy_models/dependency_model`, `spacy_models/ner_model`
+ * output: `spacy_models/final_model`
+ * script: `merge_submodel.py`
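End-to-end smoke test (a sketch, not part of the patch): assuming the sdist built by create_model_package.bash has been installed with pip, the merged pipeline can be loaded by its package name. The name `zh_core_web_sm` comes from meta.json ("zh" + "core_web_sm"), and the tagger/parser/ner order follows from merge_submodel.py; the exact dist path below is an assumption based on standard setuptools layout.

    # pip install spacy_models/model_package/zh_core_web_sm-0.1.0/dist/zh_core_web_sm-0.1.0.tar.gz
    import spacy

    nlp = spacy.load("zh_core_web_sm")
    print(nlp.pipe_names)  # expected: ['tagger', 'parser', 'ner'] after merge_submodel.py

    doc = nlp("王小明在北京的清华大学读书")
    print([(ent.text, ent.label_) for ent in doc.ents])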