/
cds_extractor.pl
executable file
·85 lines (66 loc) · 2.33 KB
/
cds_extractor.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/perl
#
# This program extracts the fields of interest (location, gene, locus_tag and note of each CDS) from a GenBank (GB) file and writes them, tab-separated, to a separate file.
#
# 4/19/2011 - Andrew Pann - Initial development.
use strict;
use warnings;

# Usage: cds_extractor.pl <input filename>
#
# Reads the input file line by line and, for every CDS feature, writes one
# tab-separated line to "<input filename>.extract" containing:
#   start, end, /gene value, /locus_tag value, /note value (quotes kept).
# A CDS block runs from its "CDS" feature line up to the next "gene" feature
# line, the next "CDS" feature line, or end of file, whichever comes first.
#
# NOTE(review): qualifiers of any other feature that sits between a CDS and
# the next gene/CDS line are still attributed to that CDS, as before.

# Feature-key lines are recognised by the key being the first token on the
# line with a short indent (at most 20 whitespace characters).  This assumes
# the usual flat-file layout in which qualifier and continuation lines are
# indented further than feature keys -- confirm for non-standard inputs.
# Anchoring keeps words such as "gene" or "CDS" inside free text from being
# mistaken for block boundaries.
my $CDS_KEY_RE  = qr/^\s{1,20}CDS\s+/;
my $GENE_KEY_RE = qr/^\s{1,20}gene\s+/;

# Returns a reference to a fresh record with every output field set to the
# empty string, so a missing qualifier prints as an empty column.
sub new_record {
    return {
        loc1      => '',
        loc2      => '',
        gene_id   => '',
        locus_tag => '',
        note      => '',
    };
}

# Writes one record to $out_fh as a single tab-separated line.
# Dies if the write fails.
sub write_record {
    my ($out_fh, $record) = @_;
    print {$out_fh}
        join("\t", @{$record}{qw(loc1 loc2 gene_id locus_tag note)}), "\n"
        or die "Unable to write record: $!\n";
    return;
}

# Reads the next line from $in_fh with its line ending (LF or CRLF) removed.
# Returns undef at end of file.
sub read_line {
    my ($in_fh) = @_;
    my $line = <$in_fh>;
    return unless defined $line;
    $line =~ s/\r?\n\z//;
    return $line;
}

# Collects a /note value that may span several lines.
#   $in_fh - handle positioned just after the line that opened the note
#   $text  - whatever followed the opening quote on that first line
# Continuation lines are appended (leading whitespace stripped, joined with a
# single space) until the text ends with a double quote.  Assumes a double
# quote does not appear at the end of a line in the middle of a note.
# Stops at end of file instead of looping forever on an unterminated note.
# Returns the note with its opening quote restored.
sub read_note {
    my ($in_fh, $text) = @_;
    # Testing $text (not the quoted string) means a bare opening quote at the
    # end of the first line is not mistaken for the closing quote.
    until ($text =~ /"\z/) {
        my $more = read_line($in_fh);
        last unless defined $more;
        $more =~ s/^\s+//;
        $text .= " " . $more;
    }
    return qq{"$text};
}

# Scans $in_fh for CDS blocks and writes one record per CDS to $out_fh,
# printing a running count to STDOUT.  Returns the number of records written.
sub extract_cds {
    my ($in_fh, $out_fh) = @_;
    my $cds_count = 0;
    my $record;    # undef while no CDS block is open

    # Emit the open record, close the block and update the progress counter.
    my $flush = sub {
        write_record($out_fh, $record);
        $record = undef;
        $cds_count++;
        print "\rCoding sequences extracted: $cds_count";
    };

    while (defined(my $line = read_line($in_fh))) {
        if ($line =~ $CDS_KEY_RE) {
            # A new CDS while one is still open: emit the open one first so
            # its fields are not silently overwritten.
            $flush->() if $record;
            $record = new_record();
            # First start..end pair on the line; a '<' before the start is
            # skipped by .*? and a '<'/'>' before the end is skipped too.
            if ($line =~ /CDS\s+.*?(\d+)\.\.[<>]?(\d+)/) {
                @{$record}{qw(loc1 loc2)} = ($1, $2);
            }
            next;
        }

        next unless $record;    # outside any CDS block: nothing to do

        if ($line =~ $GENE_KEY_RE) {
            $flush->();         # the next gene feature ends the CDS block
            next;
        }

        if ($line =~ m{/gene="(.+?)"}) {
            $record->{gene_id} = $1;
        }
        elsif ($line =~ m{/locus_tag="(.+?)"}) {
            $record->{locus_tag} = $1;
        }
        elsif ($line =~ m{/note="(.*)}) {
            $record->{note} = read_note($in_fh, $1);
        }
        # Any other line inside the block is of no interest and is ignored.
    }

    # The file may end while a CDS block is still open (no trailing gene
    # feature); emit it rather than dropping the last record.
    $flush->() if $record;

    return $cds_count;
}

# Check the command line before using $ARGV[0].
if (@ARGV < 1) {
    print "Usage: $0 <input filename>\n";
    exit 1;
}

my $infile  = $ARGV[0];
my $outfile = "$infile.extract";

print "Will process $infile.\n";
print "Output file is $outfile.\n";

# Three-argument open with lexical handles: the file name is never parsed
# for mode characters and the handles are scoped to this script.
open my $in_fh, '<', $infile
    or die "Unable to open input file $infile: $!\n";
open my $out_fh, '>', $outfile
    or die "Unable to open output file $outfile: $!\n";

extract_cds($in_fh, $out_fh);

close $in_fh;
# Buffered write errors only surface at close, so check it before reporting
# success.
close $out_fh
    or die "Unable to close output file $outfile: $!\n";

print "\nDone.\n";