-
Notifications
You must be signed in to change notification settings - Fork 212
/
get_iwslt14_bpe.sh
executable file
·124 lines (105 loc) · 3.72 KB
/
get_iwslt14_bpe.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env bash
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
# Adapted from https://github.com/pytorch/fairseq/blob/master/examples/translation/prepare-iwslt14.sh
############## https://github.com/joeynmt/joeynmt/pull/216
# Usage:
# $ cd /path/to/joeynmt/scripts # Call this script from /path/to/joeynmt/scripts dir
# $ bash get_iwslt14_bpe.sh # This will create /path/to/joeynmt/test/data/iwslt14/{train | valid | test}.{en | de}
# # Make sure that /path/to/joeynmt/test/data/iwslt14/bpe.32000 exists, too.
# $ cd .. # now back to /path/to/joeynmt/
#
# Train: comment out the `voc_file` lines in the data section -> vocab files will be created in the training process
# $ python -m joeynmt train configs/iwslt14_deen_bpe.yaml --skip-test
##############

# Abort on any error, on use of an unset variable, and on failures inside pipelines.
set -euo pipefail

# Moses supplies the tokenizer, lowercaser and corpus-cleaning perl scripts.
git clone https://github.com/moses-smt/mosesdecoder.git
MOSES="$(pwd)/mosesdecoder"
SCRIPTS="${MOSES}/scripts"
TOKENIZER="${SCRIPTS}/tokenizer/tokenizer.perl"
LC="${SCRIPTS}/tokenizer/lowercase.perl"
CLEAN="${SCRIPTS}/training/clean-corpus-n.perl"

URL="http://dl.fbaipublicfiles.com/fairseq/data/iwslt14/de-en.tgz"
GZ=de-en.tgz
merge_ops=32000               # number of BPE merge operations
src=de
tgt=en
lang=de-en
prep="../test/data/iwslt14"   # final output directory
tmp=${prep}/tmp               # intermediate files (removed at the end)
orig=orig                     # raw download / extraction directory
mkdir -p "${orig}" "${tmp}" "${prep}"
echo "Downloading data from ${URL}..."
cd "${orig}"
# -L follows redirects; --fail makes curl return non-zero on HTTP errors
# instead of silently saving an error page as the archive.
if curl -L --fail -O "${URL}" && [ -f "${GZ}" ]; then
    echo "Data successfully downloaded."
else
    echo "Data not successfully downloaded." >&2
    # NOTE: the original script used a bare `exit`, which exited with the
    # status of the preceding `echo` (0), falsely signalling success.
    exit 1
fi
tar zxvf "${GZ}"
cd ..
echo "pre-processing train data..."
for l in ${src} ${tgt}; do
    f=train.tags.${lang}.${l}
    tok=train.tags.${lang}.tok.${l}
    # Strip the TED XML markup in a single sed pass (drop metadata lines,
    # unwrap <title>/<description>), then tokenize with the Moses tokenizer.
    sed -e '/<url>/d' \
        -e '/<talkid>/d' \
        -e '/<keywords>/d' \
        -e 's/<title>//g' \
        -e 's/<\/title>//g' \
        -e 's/<description>//g' \
        -e 's/<\/description>//g' \
        "${orig}/${lang}/${f}" \
        | perl "${TOKENIZER}" -threads 8 -l "${l}" > "${tmp}/${tok}"
    echo ""
done
# Drop sentence pairs whose length ratio exceeds 1.5 or whose length is
# outside [1, 80] tokens.
perl "${CLEAN}" -ratio 1.5 "${tmp}/train.tags.${lang}.tok" ${src} ${tgt} "${tmp}/train.tags.${lang}.clean" 1 80
for l in ${src} ${tgt}; do
    perl "${LC}" < "${tmp}/train.tags.${lang}.clean.${l}" > "${tmp}/train.tags.${lang}.${l}"
done
echo "pre-processing valid/test data..."
for l in ${src} ${tgt}; do
    # Glob directly instead of parsing `ls` output (safe with any filename).
    for o in "${orig}/${lang}"/IWSLT14.TED*."${l}".xml; do
        fname=${o##*/}           # basename of the XML file
        f=${tmp}/${fname%.*}     # output path: strip the trailing .xml
        echo "$o" "$f"
        # Extract <seg> contents, normalize curly apostrophes, tokenize,
        # then lowercase.
        grep '<seg id' "$o" | \
            sed -e 's/<seg id="[0-9]*">\s*//g' \
                -e 's/\s*<\/seg>\s*//g' \
                -e "s/\’/\'/g" | \
            perl "${TOKENIZER}" -threads 8 -l "${l}" | \
            perl "${LC}" > "${f}"
        echo ""
    done
done
echo "creating train, valid, test..."
for l in ${src} ${tgt}; do
    # Hold out every 23rd sentence as the validation set; the rest is train.
    # (Uses ${lang} instead of the hard-coded "de-en" of the original.)
    awk 'NR % 23 == 0' "${tmp}/train.tags.${lang}.${l}" > "${tmp}/valid.${l}"
    awk 'NR % 23 != 0' "${tmp}/train.tags.${lang}.${l}" > "${tmp}/train.${l}"
    # The test set is the concatenation of the official IWSLT14 dev/test releases.
    cat "${tmp}/IWSLT14.TED.dev2010.${lang}.${l}" \
        "${tmp}/IWSLT14.TEDX.dev2012.${lang}.${l}" \
        "${tmp}/IWSLT14.TED.tst2010.${lang}.${l}" \
        "${tmp}/IWSLT14.TED.tst2011.${lang}.${l}" \
        "${tmp}/IWSLT14.TED.tst2012.${lang}.${l}" \
        > "${tmp}/test.${l}"
done
echo "learning * joint * BPE..."
codes_file="${tmp}/bpe.${merge_ops}"
# Learn a single, joint BPE model over the concatenation of both language
# sides so source and target share one subword vocabulary.
cat "${tmp}/train.${src}" "${tmp}/train.${tgt}" > "${tmp}/train.tmp"
python3 -m subword_nmt.learn_bpe -s "${merge_ops}" -i "${tmp}/train.tmp" -o "${codes_file}"
rm "${tmp}/train.tmp"
echo "applying BPE..."
# Segment every split on both language sides with the learned merge operations.
for split in train valid test; do
    for side in ${src} ${tgt}; do
        python3 -m subword_nmt.apply_bpe \
            -c "${codes_file}" \
            -i "${tmp}/${split}.${side}" \
            -o "${prep}/${split}.bpe.${merge_ops}.${side}"
    done
done
# Move the plain-text splits and the BPE codes file into the output directory.
for l in ${src} ${tgt}; do
    for p in train valid test; do
        mv "${tmp}/${p}.${l}" "${prep}/"
    done
done
mv "${codes_file}" "${prep}/"
# Clean up: Moses checkout, intermediate files, raw download.
# `--` guards against any path ever starting with '-'.
rm -rf -- "${MOSES}"
rm -rf -- "${tmp}"
rm -rf -- "${orig}"