Commit

fix: update for SpaCy 2.2.x

howl-anderson committed Mar 12, 2020
1 parent 3db4160 commit 03d1eea

Showing 22 changed files with 514 additions and 74 deletions.
4 changes: 2 additions & 2 deletions .gitmodules
@@ -1,6 +1,6 @@
[submodule "spacy-dev-resources"]
path = spacy-dev-resources
-url = git@github.com:howl-anderson/spacy-dev-resources.git
+url = https://github.com/howl-anderson/spacy-dev-resources.git
[submodule "third-part/brown-cluster"]
path = third-part/brown-cluster
-url = https://github.com/percyliang/brown-cluster.git
+url = https://github.com/howl-anderson/brown-cluster.git
17 changes: 17 additions & 0 deletions all_in_one.bash
@@ -0,0 +1,17 @@
./create_wikipedia_corpus.bash
./move_wikipedia_corpus.bash
./compute_words_freq.bash
./merge_all_text_files.bash
./download_and_compile_brown_cluster.bash
./compute_plain_word_vec.bash
./create_init_model.bash
./update_model_meta.py
./download_UD_Chinese-GSD_corpus.bash
./extract_UD_Chinese-GSD_corpus.bash
./convert_UD_Chinese-GSD_corpus.bash
./format_convertor.bash
./init_model.bash
./train_model.bash
./onto_to_spacy_json.bash
./train_ner.bash
./merge_submodel.py
2 changes: 1 addition & 1 deletion compute_plain_word_vec.bash
@@ -3,4 +3,4 @@
cpu_count=`nproc --all`
process_count=$(expr $cpu_count - 1)

-python ./spacy-dev-resources/training/plain_word_vectors.py -i 200 -n ${process_count} ../chinese-wikipedia-corpus-creator/token_cleaned_plain_files WORDS_VECS.txt
+python ./spacy-dev-resources/training/plain_word_vectors.py -i 200 -n ${process_count} ./WORDS.txt WORDS_VECS.txt
2 changes: 1 addition & 1 deletion create_init_model.bash
@@ -1,3 +1,3 @@
#!/bin/bash

-python -m spacy init-model -c ./WORDS-c1000-p1.out/paths -v WORDS_VECS.txt zh zh_wiki_core WORDS_FREQ.txt
+python -m spacy init-model zh spacy_models/base_model --jsonl-loc ./spacy_corpus.jsonl --vectors-loc WORDS_VECS.txt --vectors-name zh_core_web_sm.vectors
3 changes: 3 additions & 0 deletions create_jsonl_corpus.bash
@@ -0,0 +1,3 @@
#!/bin/bash

python ./create_jsonl_vocabulary.py zh spacy_corpus.jsonl WORDS_FREQ.txt ./WORDS-c1000-p1.out/paths
269 changes: 269 additions & 0 deletions create_jsonl_vocabulary.py
@@ -0,0 +1,269 @@
import json
import math
import string
from ast import literal_eval
from pathlib import Path

import ftfy
import jsonlines
import plac
import validators
from preshed.counter import PreshCounter
from spacy.lang.en import stop_words as en_stop_words
from spacy.lang.zh import stop_words as zh_stop_words
from tqdm import tqdm


class Word:
counter = -1

def __init__(self, word_str, cluster, probs):
self._word = word_str
self._cluster = cluster
self._probs = probs

chinese_punct = "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
self._punct_list = list(set(string.punctuation + chinese_punct))

chinese_whitespace = "\u3000"  # full-width (ideographic) space
self._whitespace_list = list(set(string.whitespace + chinese_whitespace))

english_stopword = en_stop_words.STOP_WORDS
chinese_stopword = zh_stop_words.STOP_WORDS
self._stopword_list = {*english_stopword, *chinese_stopword}

chinese_quote = "“”‘’"
english_quote = "\"'"
self._quote_list = list(set(english_quote + chinese_quote))

english_left_punct = "<([{"
chinese_left_punct = "<([「『【〔〖〘〚{"
self._left_punct_list = list(set(english_left_punct + chinese_left_punct))

english_right_punct = ">)]}"
chinese_right_punct = ">)]」』】〕〗〙〛}"
self._right_punct_list = list(set(english_right_punct + chinese_right_punct))

@property
def orth(self):
return self._word

@property
def id(self):
self.__class__.counter += 1

return self.__class__.counter

@property
def lower(self):
return self._word.lower()

@property
def norm(self):
return self._word

@property
def shape(self):
return "".join(map(lambda x: "X" if x.isupper() else "x", self._word))

@property
def prefix(self):
return self._word[0]

@property
def suffix(self):
return self._word[-1]

@property
def length(self):
return len(self._word)

@property
def cluster(self):
return self._cluster

@property
def prob(self):
return self._probs.get(self._word, 0)

@property
def is_alpha(self):
return self._word.isalpha()

@property
def is_ascii(self):
# str.isascii() is available only on Python 3.7+
# return self._word.isascii()
try:
self._word.encode('ascii')
except UnicodeEncodeError:
return False

return True

@property
def is_digit(self):
return self._word.isdigit()

@property
def is_lower(self):
return self._word.islower()

@property
def is_punct(self):
return self._word in self._punct_list

@property
def is_space(self):
return self._word in self._whitespace_list

@property
def is_title(self):
return self._word.istitle()

@property
def is_upper(self):
return self._word.isupper()

@property
def like_url(self):
return bool(validators.url(self._word))

@property
def like_num(self):
# TODO(howl-anderson): fix it later
return False

@property
def like_email(self):
return bool(validators.email(self._word))

@property
def is_stop(self):
return self._word in self._stopword_list

@property
def is_oov(self):
return self._word not in self._probs

@property
def is_quote(self):
return self._word in self._quote_list

@property
def is_left_punct(self):
return self._word in self._left_punct_list

@property
def is_right_punct(self):
return self._word in self._right_punct_list


def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
print("Counting frequencies...")
counts = PreshCounter()
total = 0
with freqs_loc.open() as f:
for i, line in enumerate(f):
freq, doc_freq, key = line.rstrip().split("\t", 2)
freq = int(freq)
counts.inc(i + 1, freq)
total += freq
counts.smooth()
log_total = math.log(total)
probs = {}
with freqs_loc.open() as f:
for line in tqdm(f):
freq, doc_freq, key = line.rstrip().split("\t", 2)
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
word = literal_eval(key)
smooth_count = counts.smoother(int(freq))
probs[word] = math.log(smooth_count) - log_total
oov_prob = math.log(counts.smoother(0)) - log_total
return probs, oov_prob


def read_clusters(clusters_loc):
print("Reading clusters...")
clusters = {}
with clusters_loc.open() as f:
for line in tqdm(f):
try:
cluster, word, freq = line.split()
word = ftfy.fix_text(word)
except ValueError:
continue
# If the clusterer has only seen the word a few times, its
# cluster is unreliable.
if int(freq) >= 3:
clusters[word] = cluster
else:
clusters[word] = "0"
# Expand clusters with re-casing
for word, cluster in list(clusters.items()):
if word.lower() not in clusters:
clusters[word.lower()] = cluster
if word.title() not in clusters:
clusters[word.title()] = cluster
if word.upper() not in clusters:
clusters[word.upper()] = cluster
return clusters


@plac.annotations(
lang=("model language", "positional", None, str),
output_loc=("model output directory", "positional", None, str),
freqs_loc=("location of words frequencies file", "positional", None, Path),
clusters_loc=("location of brown clusters data", "positional", None, Path),
)
def main(lang, output_loc, freqs_loc, clusters_loc):
clusters = read_clusters(clusters_loc)
probs, oov_prob = read_freqs(freqs_loc)

with jsonlines.open(output_loc, mode="w") as writer:
header = {"lang": lang, "settings": {"oov_prob": oov_prob}}

writer.write(header)

for word_str, cluster in clusters.items():

if not word_str:
continue

word = Word(word_str, cluster, probs)
row = {
"orth": word.orth, # the word text
"id": word.id, # can correspond to row in vectors table
"lower": word.lower,
"norm": word.norm,
"shape": word.shape,
"prefix": word.prefix,
"suffix": word.suffix,
"length": word.length,
"cluster": word.cluster,
"prob": word.prob,
"is_alpha": word.is_alpha,
"is_ascii": word.is_ascii,
"is_digit": word.is_digit,
"is_lower": word.is_lower,
"is_punct": word.is_punct,
"is_space": word.is_space,
"is_title": word.is_title,
"is_upper": word.is_upper,
"like_url": word.like_url,
"like_num": word.like_num,
"like_email": word.like_email,
"is_stop": word.is_stop,
"is_oov": word.is_oov,
"is_quote": word.is_quote,
"is_left_punct": word.is_left_punct,
"is_right_punct": word.is_right_punct,
}

writer.write(row)


if __name__ == "__main__":
plac.call(main)
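For orientation, a minimal sketch of the JSONL this script writes (the concrete values below are made up for illustration, and the entry is abridged): the first line is a header carrying the language code and the smoothed OOV probability, and every following line is one vocabulary entry whose keys mirror the row built in main().

{"lang": "zh", "settings": {"oov_prob": -19.5}}
{"orth": "中国", "id": 0, "lower": "中国", "norm": "中国", "shape": "xx", "prefix": "中", "suffix": "国", "length": 2, "cluster": "1010", "prob": -8.2, "is_alpha": true, "is_digit": false, "is_punct": false, "is_stop": false, "is_oov": false}

This is the file that create_jsonl_corpus.bash produces and that create_init_model.bash then hands to spacy init-model via --jsonl-loc.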
7 changes: 7 additions & 0 deletions create_model_package.bash
@@ -0,0 +1,7 @@
#!/bin/bash

python -m spacy package spacy_models/final_model spacy_models/model_package --force

cd spacy_models/model_package/zh_core_web_sm-0.1.0
python ./setup.py sdist

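If the package step succeeds, the resulting sdist can be installed and loaded by name; a rough sketch of the follow-up (the zh_core_web_sm-0.1.0 name comes from the directory above, while the dist/ path and archive name assume the usual setuptools sdist convention):

pip install spacy_models/model_package/zh_core_web_sm-0.1.0/dist/zh_core_web_sm-0.1.0.tar.gz
python -c "import spacy; nlp = spacy.load('zh_core_web_sm'); print(nlp.pipe_names)"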
2 changes: 2 additions & 0 deletions create_wikipedia_corpus.bash
@@ -0,0 +1,2 @@
cd chinese-wikipedia-corpus-creator
bash ./allinone_process.bash
8 changes: 4 additions & 4 deletions merge_all_text_files.py
@@ -9,10 +9,10 @@
output_path = pathlib.Path(output_file)


-with output_path.open('wt') as outfile:
+with output_path.open("wt") as outfile:
for fname in input_files:
-with fname.open('rt') as infile:
+with fname.open("rt") as infile:
for line in infile:
-if not line.endswith('\n'):
-line = line + '\n'
+if not line.endswith("\n"):
+line = line + "\n"
outfile.write(line)
51 changes: 51 additions & 0 deletions merge_submodel.py
@@ -0,0 +1,51 @@
#!/usr/bin/python3

import shutil
import json
from pathlib import Path


def read_pipeline(meta_file):
with open(meta_file) as fd:
data = json.load(fd)
return data["pipeline"]


def update_pipeline(meta_file, pipeline):
with open(meta_file) as fd:
data = json.load(fd)

data["pipeline"] = pipeline

with open(meta_file, "w") as fd:
json.dump(data, fd)


def copy_tree(src: Path, dst: Path, folder: str):
shutil.copytree(src / folder, dst / folder)


def main():
target_dir = Path("./spacy_models/final_model")
target_dir.mkdir(exist_ok=True)

pipeline = []

source_dir = Path("./spacy_models/dependency_model/model-best")
copy_tree(source_dir, target_dir, "parser")
copy_tree(source_dir, target_dir, "tagger")
copy_tree(source_dir, target_dir, "vocab")

pipeline.extend(read_pipeline(source_dir / "meta.json"))

source_dir = Path("./spacy_models/ner_model/model-best")
copy_tree(source_dir, target_dir, "ner")
shutil.copy(source_dir / "meta.json", target_dir / "meta.json")

pipeline.extend(read_pipeline(source_dir / "meta.json"))

update_pipeline(target_dir / "meta.json", pipeline)


if __name__ == "__main__":
main()
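As a quick sanity check that the merged directory is a loadable pipeline before packaging, something like the following should work (a sketch; it assumes spaCy 2.2 and the zh tokenizer dependencies are installed, and that the dependency/NER sub-models have already been trained by the scripts above):

import spacy

nlp = spacy.load("./spacy_models/final_model")
doc = nlp("这是一个测试。")
print([(t.text, t.pos_, t.dep_) for t in doc])        # tagger + parser output
print([(ent.text, ent.label_) for ent in doc.ents])   # NER output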
