-
Notifications
You must be signed in to change notification settings - Fork 12
/
LTR_FINDER_parallel
executable file
·263 lines (234 loc) · 9.81 KB
/
LTR_FINDER_parallel
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
#!/usr/bin/env perl
use strict;
use warnings;
use threads;
use Thread::Queue;
use File::Basename;
use FindBin;
use Pod::Usage;
my $version = "v1.2";
# customized parameters
my $seq_path = ""; #specify the sequence file
my $finder_para = "-w 2 -C -D 15000 -d 1000 -L 7000 -l 100 -p 20 -M 0.85"; #specify ltr_finder parameters. The "-w 2" parameter is required.
my $harvest_format = 0; #0, print out format is in LTR_finder table output format (default); 1, LTRharvest screen output format
my $size_of_each_piece = 5000000; #5 Mb per piece
my $timeout = 1500; #set maximum time for a thread to run. After $timeout seconds, the child thread is killed.
my $try1 = 1; #1, further split to 50 Kb regions if thread killed by timeout. 0, no further split.
my $next = 0; #0, run LTR_FINDER. 1, only summarize results if the .finder folder is retained.
my $verbose = 0; #0, remove the .finder folder after finished; 1, retain the folder for later use.
my $threads = 4; #specify thread number to run ltr_finder
my $annotator = "LTR_FINDER_parallel"; #the annotator name used in gff output
# dependencies
#my $script_path = dirname(__FILE__);
my $script_path = $FindBin::Bin;
my $cut = "$script_path/bin/cut.pl"; #the program to cut sequence
my $convert = "$script_path/bin/convert_ltr_finder2.pl"; #the program to convert finder -w2 to harvest scn format
my $ltr_finder = ""; #the path to ltr_finder
my $check_dependencies = undef;
my $help = undef;
my $usage = "
~ ~ ~ Run LTR_FINDER in parallel ~ ~ ~
Author: Shujun Ou (shujun.ou.1\@gmail.com)
Date: 09/19/2018
Update: 01/28/2020
Version: $version
Usage: perl LTR_FINDER_parallel -seq [file] -size [int] -threads [int]
Options: -seq [file] Specify the sequence file.
-size [int] Specify the size you want to split the genome sequence.
Please make it large enough to avoid spliting too many LTR elements. Default 5000000 (bp)
-time [int] Specify the maximum time to run a subregion (a thread).
This helps to skip simple repeat regions that take a substantial of time to run. Default: 1500 (seconds).
Suggestion: 300 for -size 1000000. Increase -time when -size increased.
-try1 [0|1] If a region requires more time than the specified -time (timeout), decide:
0, discard the entire region.
1, further split to 50 Kb regions to salvage LTR candidates (default);
-harvest_out Output LTRharvest format if specified. Default: output LTR_FINDER table format.
-next Only summarize the results for previous jobs without rerunning LTR_FINDER (for -v).
-verbose|-v Retain LTR_FINDER outputs for each sequence piece.
-finder [file] The path to the program LTR_FINDER (default v1.0.7, included in this package).
-threads|-t [int] Indicate how many CPU/threads you want to run LTR_FINDER.
-check_dependencies Check if dependencies are fullfiled and quit
-help|-h Display this help information.
\n";
# read user parameters
my $i=0;
foreach (@ARGV){
$seq_path = $ARGV[$i+1] if /^-seq$/i;
$size_of_each_piece = $ARGV[$i+1] if /^-size$/i;
$timeout = $ARGV[$i+1] if /^-time$/i;
$try1 = $ARGV[$i+1] if /^-try1$/i;
$harvest_format = 1 if /^-harvest_out$/i;
$next = 1 if /^-next$/i;
$verbose = 1 if /^-verbose$|^-v$/i;
$cut = $ARGV[$i+1] if /^-cut$/i;
$ltr_finder = $ARGV[$i+1] if /^-finder$/i;
$threads = $ARGV[$i+1] if /^-threads$|^-t$/i;
$check_dependencies = 1 if /^-check_dependencies$/i;
$help = 1 if /^-help$|^-h$/i;
$i++;
}
# check parameters
if ($help) {
pod2usage( { -verbose => 0,
-exitval => 0,
-message => "$usage\n" } );
}
$ltr_finder=`which ltr_finder 2>/dev/null` if $ltr_finder eq '';
$ltr_finder = "$script_path/bin/LTR_FINDER.x86_64-1.0.7/ltr_finder" if $ltr_finder eq ''; #use default ltr_finder if not ENV
$ltr_finder=~s/ltr_finder\n?//;
$ltr_finder="$ltr_finder/" if $ltr_finder ne '' and $ltr_finder !~ /\/$/;
die "LTR_FINDER is not exist in the path $ltr_finder!\n" unless -X "${ltr_finder}ltr_finder";
print "\nUsing this LTR_FINDER: $ltr_finder\n"; #test
if ( ! $seq_path and (! $check_dependencies) ){
pod2usage( {
-message => "At least 1 parameter mandatory:\n1) Input fasta file: -seq\n".
"$usage\n\n",
-verbose => 0,
-exitval => 2 } );
}
print "Pass!\n";
exit if $check_dependencies;
# make a softlink to the genome file
my $seq_file = basename($seq_path);
`ln -s $seq_path $seq_file` unless -s $seq_file;
if ($threads == 1){
# run the original single threaded code
`${ltr_finder}ltr_finder $finder_para $seq_file > $seq_file.finder.scn`;
if ($harvest_format == 0){
`mv $seq_file.finder.scn $seq_file.finder.combine.scn`;
} else {
`perl $convert $seq_file.finder.scn > $seq_file.finder.combine.scn`;
}
} else {
## run the paralleled code
# read genome in memory
open SEQ, "<$seq_file" or die $usage;
open GFF, ">$seq_file.finder.combine.gff3" or die $!;
print GFF "##gff-version 3\n";
my %seq; #a hash to store seq name and length info
my %order; #store seq order in genome
my $chr_info; #store seq id in genome
$i=0;
$/ = "\n>";
while (<SEQ>){
s/>//g;
my ($id, $seq) = (split /\n/, $_, 2);
$seq =~ s/\s+//g;
my $len = length $seq;
print GFF "##sequence-region $id 1 $len\n";
$chr_info.="#$id\n";
$seq{$id} = $len;
$order{$id} = $i;
$i++;
}
print GFF "$chr_info";
close SEQ;
$/="\n";
goto Next if $next == 1; #run the next step if all LTR_finder processes are done
# prepre files
`mkdir $seq_file.finder` unless -d "$seq_file.finder";
chdir "$seq_file.finder";
`perl $cut ../$seq_file -s -l $size_of_each_piece`;
##multi-threading using queue, create a worker module for parallel computation
my $process_q=Thread::Queue->new();
sub worker {
while (my $seq = $process_q -> dequeue()){
chomp $seq;
$seq =~ s/\s+//;
print localtime() ." CPU".threads -> self() -> tid().": running on $seq\n";
`timeout $timeout ${ltr_finder}ltr_finder $finder_para $seq > $seq.finder.scn`; #set the max running time for a thread as $timeout seconds
if ($? ne 0 and $try1 ne 0){
print localtime() ." CPU".threads -> self() -> tid().": $seq timeout, process it with the salvage mode\n";
my $in=`perl $script_path/LTR_FINDER_parallel -seq $seq -size 50000 -time 10 -try1 0 -threads 1 -cut $cut -finder $ltr_finder`;
`mv $seq.finder.combine.scn $seq.finder.scn`;
`rm -rf ${seq}.finder`; #remove?
}
}
}
#insert seq names into the worker queue
open List, "<../$seq_file.list" or die $!;
$process_q -> enqueue (<List>);
$process_q -> end(); #stop adding items to the queue
close List;
#work and finish
for (1..$threads){
threads -> create ( \&worker );
}
foreach my $thr (threads -> list()){
$thr -> join();
}
chdir "../";
Next:
#combine split ltr_finder results
open Out, ">$seq_file.finder.combine.scn" or die $!;
#print out headers
if ($harvest_format == 0){
print Out "index SeqID Location LTR len Inserted element len TSR PBS PPT RT IN (core) IN (c-term) RH Strand Score Sharpness Similarity\n";
} else {
print Out "#LTR_FINDER_parallel -seq $seq_file -size $size_of_each_piece -time $timeout -try1 $try1 -harvest_out -threads $threads -cut $cut -finder $ltr_finder
# LTR_FINDER args=$finder_para
# LTR_FINDER_parallel version=$version
# predictions are reported in the following way
# s(ret) e(ret) l(ret) s(lLTR) e(lLTR) l(lLTR) s(rLTR) e(rLTR) l(rLTR) sim(LTRs) seq-nr chr
# where:
# s = starting position
# e = ending position
# l = length
# ret = LTR-retrotransposon
# lLTR = left LTR
# rLTR = right LTR
# sim = similarity
# seq-nr = sequence order\n";
}
# process each candidates
open List, "<$seq_file.list" or die $!;
my $count = 1; #count repeats in gff
foreach my $seq (<List>){
$seq =~ s/\s+//;
my ($base, $order) = ($seq, 1);
($base, $order) = ($1, $2) if $seq =~ /(.*)_sub([0-9]+)$/;
my $coord_adj = ($order - 1) * $size_of_each_piece;
next unless -e "$seq_file.finder/$seq.finder.scn";
open Scn, "<$seq_file.finder/$seq.finder.scn" or die $!;
while (<Scn>){
next unless /^\[/;
s/^\[\s+[0-9]+\]/\[NA\]/;
my ($index, $id, $loc, $ltr_len, $ele_len, $TSR, $PBS, $PPT, $RT, $IN_core, $IN_cterm, $RH, $strand, $score, $sharpness, $sim) = (split);
#convert coordinates back to the genome scale
my @coord = ($loc, $PBS, $PPT, $RT, $IN_core, $IN_cterm, $RH);
my $i = -1;
foreach (@coord) {
$i++;
next if /^N-N$/;
my ($start, $end) = ($1+$coord_adj, $2+$coord_adj) if /([0-9]+)\-([0-9]+)/;
$coord[$i] = "$start-$end";
}
my ($start, $end) = ($1, $2) if $coord[0] =~ /([0-9]+)\-([0-9]+)/;
my ($lltr, $rltr) = ($1, $2) if $ltr_len=~/([0-9]+),([0-9]+)/;
my ($lltr_e, $rltr_s) = ($start+$lltr-1, $end-$rltr+1);
#output LTRharvest or LTR_FINDER (-w 2) format
if ($harvest_format == 0){
print Out "[NA]\t$base\t$coord[0]\t$ltr_len\t$ele_len\t$TSR\t$coord[1]\t$coord[2]\t$coord[3]\t$coord[4]\t$coord[5]\t$coord[6]\t$strand\t$score\t$sharpness\t$sim\n";
} else {
$sim*=100;
print Out "$start $end $ele_len $start $lltr_e $lltr $rltr_s $end $rltr $sim $order{$base} $base\n";
}
#print GFF format
my $chr = $base;
print GFF "$chr\t$annotator\trepeat_region\t$start\t$end\t.\t$strand\t.\tID=repeat_region$count\n";
#print GFF "$chr\t$annotator\ttarget_site_duplication\t$lTSD\t.\t$strand\t.\tParent=repeat_region$count\n" unless $TSD eq "NA";
print GFF "$chr\t$annotator\tLTR_retrotransposon\t$start\t$end\t.\t$strand\t.\tID=LTR_retrotransposon$count;Parent=repeat_region$count;tsd=$TSR;ltr_identity=$sim;seq_number=$order{$chr}\n";
print GFF "$chr\t$annotator\tlong_terminal_repeat\t$start\t$lltr_e\t.\t$strand\t.\tParent=LTR_retrotransposon$count\n";
print GFF "$chr\t$annotator\tlong_terminal_repeat\t$rltr_s\t$end\t.\t$strand\t.\tParent=LTR_retrotransposon$count\n";
#print GFF "$chr\t$annotator\ttarget_site_duplication\t$rTSD\t.\t$strand\t.\tParent=repeat_region$count\n" unless $TSD eq "NA";
print GFF "###\n";
}
close Scn;
$count++;
}
close List;
#close Scn;
close GFF;
`rm -rf ./$seq_file.finder 2>/dev/null` if $verbose eq 0;
}
print localtime() ." Job finished! Check out $seq_file.finder.combine.scn\n";