-
Notifications
You must be signed in to change notification settings - Fork 12
/
config.ini
61 lines (56 loc) · 2.65 KB
/
config.ini
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
[GENERAL]
# verbose output
; verbose =
# source language; default: HINDI
language_1 =
# target language; default: ENGLISH
language_2 =
# choose which stages of the pipeline are going to be run; default: pregcm, gcm
stages_to_run =
# whether to run the pregcm and gcm stages in parallel; default: 0; set to 1 to run in parallel
; parallel_run =
# path of input directory where input files are present; default: data
input_loc =
# path for output directory; default: data
output_loc =
[ALIGNER]
# name of input file with non-english data; default: hi-to-en-input_lang1
source_inp_file =
# name of file with english data; default: hi-to-en-input_lang2
target_inp_file =
# name of output file with non-english data; default: hi-to-en-input_lang1
source_op_file =
# name of output file with english data; default: hi-to-en-input_lang2
target_op_file =
# name of the PFMS file, if present; if not given, a PFMS file will be generated with a default name built from the initials of the two languages, e.g. for hindi and english: hi-to-en_pfms.txt
pfms_file =
# name of output file for saving parallel alignments; default: hi-to-en-input_parallel_alignments
align_op_file =
[PREGCM]
# path of the directory that stores the output of the pregcm stage; it is saved as a sub-directory of output_loc. If not given, a directory name built from the initials of the two languages is used, e.g. for hindi and english: hi-to-en/ inside the output_loc directory.
pregcm_output_loc =
# cut-off value for PFMS score
max_pfms =
# select the parser to be used from available parsers - stanford and benepar; default: benepar
parser =
[GCM]
# directory that contains output of the pregcm; default: data/hi-to-en
gcm_input_loc =
# path of the directory that stores the output of the gcm stage; it is saved as a sub-directory of output_loc. If not given, a directory name built from the initials of the two languages is used, e.g. for hindi and english: hi-to-en-gcm/ inside the output_loc directory.
gcm_output_loc =
# max number of sentences to generate per input sentence; set to -1 to get all generations; default: 5
k = -1
# which theory to use for generating cm sentences: 'ec' (ec theory) or 'ml' (ml theory); default: ec
linguistic_theory =
[OUTPUT]
# word-level language tags in each output code-mixed sentence; default: 0
lid_output =
# visualize the DFAs that were used to make generations; default: 0
; dfa_output =
# sampling technique to use - random or spf; default: random
sampling =
# file name of the input rcm file, which is needed for spf sampling; default: rcm_lang_tagged.txt
rcm_file =
[WEB]
# your Azure subscription key, used to generate translations via the Azure API
# NOTE: this is a secret; do not commit a real key to version control
azure_subscription_key =