Skip to content
This repository has been archived by the owner on Oct 31, 2023. It is now read-only.

Update compilation instructions for fastBPE #84

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
26 changes: 13 additions & 13 deletions NMT/get_data_enfr.sh
Expand Up @@ -90,7 +90,7 @@ cd $TOOLS_PATH
# Build the fastBPE binary only when no regular file exists at "$FASTBPE".
# The `cd` below is not undone here, so the working directory stays at
# $FASTBPE_DIR for whatever follows when the branch is taken.
# NOTE(review): this text is a diff rendering whose +/- markers were stripped.
# The two g++ lines look like the removed and the added form of one command
# (the diff header reports 13 additions & 13 deletions) — confirm against the
# actual script before treating both as live code.
if [ ! -f "$FASTBPE" ]; then
echo "Compiling fastBPE..."
cd $FASTBPE_DIR
# Presumably the pre-change command: builds `fast` from fast.cc.
g++ -std=c++11 -pthread -O3 fast.cc -o fast
# Presumably the post-change command: builds `fast` from fastBPE/main.cc and
# adds the fastBPE directory to the include path (-IfastBPE).
g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
fi
# Printed whether or not a compile was attempted; the g++ exit status is not
# checked in the visible lines.
echo "fastBPE compiled in: $FASTBPE"

Expand Down Expand Up @@ -171,8 +171,8 @@ if ! [[ "$(wc -l < $TGT_RAW)" -eq "$N_MONO" ]]; then echo "ERROR: Number of line
# tokenize data
# Runs when at least one of $SRC_TOK / $TGT_TOK is not a regular file; in that
# case both outputs are rewritten. Each raw file is piped through $NORM_PUNC and
# then $TOKENIZER with the matching language flag (-l en for source, -l fr for
# target), using $N_THREADS tokenizer threads and -no-escape.
# NOTE(review): diff rendering with +/- markers stripped. The first two pipelines
# execute $NORM_PUNC / $TOKENIZER directly; the last two run the same scripts via
# an explicit `perl`. They look like the old and new versions of the same two
# commands — confirm which pair the real script contains.
if ! [[ -f "$SRC_TOK" && -f "$TGT_TOK" ]]; then
echo "Tokenize monolingual data..."
# Presumably pre-change: relies on the scripts being directly executable.
cat $SRC_RAW | $NORM_PUNC -l en | $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_TOK
cat $TGT_RAW | $NORM_PUNC -l fr | $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_TOK
# Presumably post-change: invokes each script through the perl interpreter.
cat $SRC_RAW | perl $NORM_PUNC -l en | perl $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_TOK
cat $TGT_RAW | perl $NORM_PUNC -l fr | perl $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_TOK
fi
# Status messages are printed regardless of whether tokenization ran.
echo "EN monolingual data tokenized in: $SRC_TOK"
echo "FR monolingual data tokenized in: $TGT_TOK"
Expand Down Expand Up @@ -207,8 +207,8 @@ echo "Full vocab in: $FULL_VOCAB"
# binarize data
# Runs when at least one of "$SRC_TOK.$CODES.pth" / "$TGT_TOK.$CODES.pth" is
# missing. preprocess.py is called once per language with the shared vocabulary
# $FULL_VOCAB and the BPE-suffixed file ($SRC_TOK.$CODES / $TGT_TOK.$CODES).
# presumably preprocess.py writes "<input>.pth", as the guard and the messages
# below suggest — verify against preprocess.py.
# NOTE(review): diff rendering with +/- markers stripped. The first two calls
# execute preprocess.py directly, the last two through `python`; they look like
# the old and new versions of the same commands — confirm against the script.
if ! [[ -f "$SRC_TOK.$CODES.pth" && -f "$TGT_TOK.$CODES.pth" ]]; then
echo "Binarizing data..."
# Presumably pre-change: requires preprocess.py to be directly executable.
$UMT_PATH/preprocess.py $FULL_VOCAB $SRC_TOK.$CODES
$UMT_PATH/preprocess.py $FULL_VOCAB $TGT_TOK.$CODES
# Presumably post-change: runs the script with whichever `python` is on PATH.
python $UMT_PATH/preprocess.py $FULL_VOCAB $SRC_TOK.$CODES
python $UMT_PATH/preprocess.py $FULL_VOCAB $TGT_TOK.$CODES
fi
# Status messages are printed regardless of whether binarization ran.
echo "EN binarized data in: $SRC_TOK.$CODES.pth"
echo "FR binarized data in: $TGT_TOK.$CODES.pth"
Expand All @@ -233,10 +233,10 @@ if ! [[ -f "$SRC_TEST.sgm" ]]; then echo "$SRC_TEST.sgm is not found!"; exit; fi
# Stop the script when the target-side test SGM file is missing. `exit` is
# given no explicit status here.
if ! [[ -f "$TGT_TEST.sgm" ]]; then echo "$TGT_TEST.sgm is not found!"; exit; fi

# Convert each .sgm file into a tokenized text file. Pipeline per file:
#   $INPUT_FROM_SGM (reads the .sgm on stdin) -> $NORM_PUNC -l <lang>
#   -> $REM_NON_PRINT_CHAR -> $TOKENIZER -l <lang> -no-escape -threads $N_THREADS
# Output overwrites $SRC_VALID / $TGT_VALID / $SRC_TEST / $TGT_TEST; no
# "already exists" guard is visible in this excerpt.
# NOTE(review): diff rendering with +/- markers stripped. The first four
# pipelines run the tools directly, the next four via `perl`; they look like the
# old and new versions of the same four commands — confirm against the script.
echo "Tokenizing valid and test data..."
# Presumably pre-change: tools executed directly.
$INPUT_FROM_SGM < $SRC_VALID.sgm | $NORM_PUNC -l en | $REM_NON_PRINT_CHAR | $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_VALID
$INPUT_FROM_SGM < $TGT_VALID.sgm | $NORM_PUNC -l fr | $REM_NON_PRINT_CHAR | $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_VALID
$INPUT_FROM_SGM < $SRC_TEST.sgm | $NORM_PUNC -l en | $REM_NON_PRINT_CHAR | $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_TEST
$INPUT_FROM_SGM < $TGT_TEST.sgm | $NORM_PUNC -l fr | $REM_NON_PRINT_CHAR | $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_TEST
# Presumably post-change: every stage is run through the perl interpreter.
perl $INPUT_FROM_SGM < $SRC_VALID.sgm | perl $NORM_PUNC -l en | perl $REM_NON_PRINT_CHAR | perl $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_VALID
perl $INPUT_FROM_SGM < $TGT_VALID.sgm | perl $NORM_PUNC -l fr | perl $REM_NON_PRINT_CHAR | perl $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_VALID
perl $INPUT_FROM_SGM < $SRC_TEST.sgm | perl $NORM_PUNC -l en | perl $REM_NON_PRINT_CHAR | perl $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_TEST
perl $INPUT_FROM_SGM < $TGT_TEST.sgm | perl $NORM_PUNC -l fr | perl $REM_NON_PRINT_CHAR | perl $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_TEST

# Apply the BPE codes to the tokenized files. Argument order as used here:
# output file, input file, codes file, vocabulary file.
# NOTE(review): only the first applybpe call is visible; the excerpt is cut by
# the next diff hunk header, so further calls may follow in the real script.
echo "Applying BPE to valid and test files..."
$FASTBPE applybpe $SRC_VALID.$CODES $SRC_VALID $BPE_CODES $SRC_VOCAB
Expand All @@ -246,10 +246,10 @@ $FASTBPE applybpe $TGT_TEST.$CODES $TGT_TEST $BPE_CODES $TGT_VOCAB

# Binarize the BPE-encoded valid/test files. Any existing .pth outputs are
# deleted first (rm -f ignores missing files), then preprocess.py is run once
# per file with the shared vocabulary $FULL_VOCAB.
# NOTE(review): diff rendering with +/- markers stripped. The first four calls
# execute preprocess.py directly, the last four through `python`; they look like
# the old and new versions of the same four commands — confirm against the
# actual script.
echo "Binarizing data..."
rm -f $SRC_VALID.$CODES.pth $TGT_VALID.$CODES.pth $SRC_TEST.$CODES.pth $TGT_TEST.$CODES.pth
# Presumably pre-change: requires preprocess.py to be directly executable.
$UMT_PATH/preprocess.py $FULL_VOCAB $SRC_VALID.$CODES
$UMT_PATH/preprocess.py $FULL_VOCAB $TGT_VALID.$CODES
$UMT_PATH/preprocess.py $FULL_VOCAB $SRC_TEST.$CODES
$UMT_PATH/preprocess.py $FULL_VOCAB $TGT_TEST.$CODES
# Presumably post-change: runs the script with whichever `python` is on PATH.
python $UMT_PATH/preprocess.py $FULL_VOCAB $SRC_VALID.$CODES
python $UMT_PATH/preprocess.py $FULL_VOCAB $TGT_VALID.$CODES
python $UMT_PATH/preprocess.py $FULL_VOCAB $SRC_TEST.$CODES
python $UMT_PATH/preprocess.py $FULL_VOCAB $TGT_TEST.$CODES


#
Expand Down