-
Notifications
You must be signed in to change notification settings - Fork 4
/
msi_profiler.py
176 lines (168 loc) · 4.2 KB
/
msi_profiler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# Isidro Cortes-Ciriano
# Harvard Medical School
# isidrolauscher@gmail.com
import argparse
import models
def initialize_parser():
    """Build the argparse parser that defines the msi_profiler command line.

    Required options: --tumor_bam, --normal_bam, --bed, --chromosomes,
    --fasta, --reference_set, --output_prefix, --mode, --nprocs and --rus.
    Optional integer options (default in brackets): --min_MS_length (10),
    --max_MS_length (60), --mapping_quality (40), --flank_size (10),
    --min_coverage (10) and --tolerated_mismatches (0).

    The limits quoted in the help texts (repeat units 1 to 6, microsatellite
    length 6 to 60) are only documented there; this parser does not enforce
    them.

    Returns:
        The configured argparse.ArgumentParser; nothing is parsed here.
    """
    profiler_cls = models.MicroSatelliteProfiler

    cli = argparse.ArgumentParser(
        description=(
            'msi_profiler serves to detect microsatellite instability '
            'from sequencing data. Type msi_profiler.py --help for '
            'further instructions.'
        )
    )
    add = cli.add_argument

    # Options are registered in a fixed order because argparse lists them
    # in --help in registration order.

    # --- Required: input alignment / region files ---
    for flag, text in (
        ('--tumor_bam', 'Tumor bam file name'),
        ('--normal_bam', 'Normal bam file name'),
        ('--bed', 'Input bedfile name')
    ):
        add(flag, help=text, required=True)

    # One or more chromosome names.
    add(
        '--chromosomes',
        nargs='+',
        required=True,
        help='Chromosomes to be {}'.format(profiler_cls.PHASED)
    )

    # --- Required: reference data and output location ---
    for flag, text in (
        (
            '--fasta',
            'Path to the directory containing the fasta sequences (one per '
            'chromosome). The expected names for the fasta files are e.g. chr1.fa'
        ),
        (
            '--reference_set',
            'Path to the directory containing the reference '
            'sets of microsatellites'
        ),
        (
            '--output_prefix',
            'Path and prefix for the output files. E.g. '
            'path_to_out_dir/out_prefix'
        )
    ):
        add(flag, help=text, required=True)

    # --- Required: run configuration ---
    # The accepted values are the profiler's own mode constants; the help
    # text shows them title-cased.
    mode_names = [profiler_cls.PHASED, profiler_cls.UNPHASED]
    add(
        '--mode',
        choices=mode_names,
        required=True,
        help='{} or {}'.format(*[name.title() for name in mode_names])
    )
    add('--nprocs', type=int, required=True, help='Number of processes')
    add(
        '--rus',
        type=int,
        nargs='+',
        required=True,
        help=(
            'MS repeat units. Supported from 1 (i.e. mono repeats) to 6 (i.e. '
            'hexarepeats)'
        )
    )

    # --- Optional integer thresholds: (flag, default, help text) ---
    optional_thresholds = (
        (
            '--min_MS_length',
            10,
            'Minimum length of microsatellites to be considered. Minimum '
            'available is 6; default is 10.'
        ),
        (
            '--max_MS_length',
            60,
            'Maximum length of microsatellites to be considered. Maximum '
            'available is 60; default is 60.'
        ),
        (
            '--mapping_quality',
            40,
            'Minimum mapping quality. Default is 40.'
        ),
        (
            '--flank_size',
            10,
            'Minimum length of the flanking regions. Default is 10'
        ),
        (
            '--min_coverage',
            10,
            'Minimum coverage at each MS locus -both in the case and control '
            'bams-. Default is 10'
        ),
        (
            '--tolerated_mismatches',
            0,
            'Maximum number of tolerated mismatches in the flanking regions. '
            'Default is 0'
        )
    )
    for flag, default_value, text in optional_thresholds:
        add(flag, help=text, default=default_value, type=int)

    # NOTE: a '--notation' option is currently disabled. The draft was an
    # optional str argument with choices 'Ensemble' (1,2, .. Y) or
    # 'UCSC' (chr1, chr2, .. chrY), defaulting to "Ensemble".

    return cli
def main():
    """Parse the command line, build a MicroSatelliteProfiler and run it.

    The parsed argparse namespace is handed unchanged to
    models.MicroSatelliteProfiler, whose run() method is then called.
    """
    cli_args = initialize_parser().parse_args()
    models.MicroSatelliteProfiler(cli_args).run()
# Run the profiler only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()