/
convertANGSDreadcountsToBinaryReadGZIP.pl
119 lines (98 loc) · 4.71 KB
/
convertANGSDreadcountsToBinaryReadGZIP.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/perl
# For our unbiased estimates of pairwise divergence, we want to run calculations
# on hundreds of millions of sites, where we have read counts for the major and
# minor alleles.
#
# This script does a few things. It takes as input the read counts dumped by
# ANGSD, finds the minor and major read counts for each individual at that locus (from the mafs file),
# then packs them into binary file representations of that data. The binary file
# format is as follows:
# -Every read depth (integer) for the major or minor allele at
# a locus is encoded as an 8-bit unsigned integer (so values can range from 0 to 255).
# In this experiment, we don't want to include loci with read depth greater
# than 255, because with coverage around 1X, such read depth would be an indication
# that something is wrong. In fact, we will cap the maximum read depth to consider
# a locus at a much lower number (10 or 15, set with $maxReadDepth)
# -Every individual gets two integers in the output file for each locus, with
# the major allele read count first, and the minor allele read count second
# -As such, there are N*2 integers per locus, where N is the number of bams
# (individuals) input into ANGSD, in the same order as the bam file list
use strict;
use warnings;
use List::Util qw(sum max);
use Data::Dumper;
use Getopt::Long;
# Command-line options (see the usage message below for details).
my $MAF;             # mafs file output by ANGSD (.gz or uncompressed)
my $countsFile;      # counts file output by ANGSD (.gz or uncompressed)
my $numIndividuals;  # number of bams (individuals) input into ANGSD
my $outFile;         # path for the packed binary read-count output
my $maxReadDepth;    # per-allele read-depth cap; loci exceeding it are skipped
my $help;
GetOptions ("maf=s"         => \$MAF,
            "counts=s"      => \$countsFile,
            "individuals=i" => \$numIndividuals,
            "out=s"         => \$outFile,
            "maxdepth=i"    => \$maxReadDepth,
            "help|man"      => \$help) || die "Couldn't get options with GetOpt::Long: $!\n";
if ($help or !$MAF or !$countsFile or !$outFile or !$numIndividuals or !$maxReadDepth) {
die "Must supply --maf, --counts, --individuals, --out, and --maxdepth.
--maf designates the mafs file output by ANGSD (.gz or uncompressed)
--counts is for the counts file output by ANGSD (.gz or uncompressed)
--individuals is the number of individuals input into ANGSD (the number of files in the bam file list)
--out is the desired name of the output file that holds the binary representation of the readcounts.
--maxdepth is the highest allowed number for a major or minor allele read count (to avoid overweighting repetitive regions.
Also note that the maximum value for --maxdepth is 255";
}
# Depths are packed as unsigned 8-bit integers, so the cap cannot exceed 255.
# Report this case specifically rather than reprinting the generic usage
# (the original folded it into the "must supply ..." message, which hid the
# actual cause of the failure).
if ($maxReadDepth > 255) {
    die "--maxdepth may not exceed 255 (read depths are stored as 8-bit unsigned integers)\n";
}
my $locusCounter = 1;
# Column order of the four per-allele counts within each individual's slice
# of an ANGSD counts line: A, C, G, T.
my %alleleOrderHash = ("A" => 0,
                       "C" => 1,
                       "G" => 2,
                       "T" => 3
                       );
# Open the mafs file, transparently decompressing .gz input. List-form
# pipe open ('-|' with separate arguments) avoids passing the filename
# through a shell, unlike the original interpolated "gunzip -c $file |".
my $MAFfh;
if ($MAF =~ /\.gz$/) {
    open($MAFfh, "-|", "gunzip", "-c", $MAF) || die "can’t open pipe to $MAF: $!\n";
} else {
    open($MAFfh, "<", $MAF) || die "can’t open $MAF: $!\n";
}
# Open the counts file the same way. BUG FIX: the original wrote
# "open(my $countsFH, ...)" in the uncompressed branch, declaring a NEW
# lexical that shadowed this outer $countsFH, so the handle read by the
# main loop remained undef whenever the counts file was not gzipped.
my $countsFH;
if ($countsFile =~ /\.gz$/) {
    open($countsFH, "-|", "gunzip", "-c", $countsFile) || die "Can't open pipe to $countsFile: $!\n";
} else {
    open($countsFH, "<", $countsFile) or die "Couldn't open $countsFile for reading: $!\n";
}
open(my $outFH, ">", $outFile) or die "Couldn't open $outFile for writing: $!\n";
# The output is packed binary, not text: disable any newline translation.
binmode($outFH);
# Main loop: each iteration consumes one line from each of the two parallel
# ANGSD outputs (mafs and counts) and, if the locus passes the depth filter,
# appends N*2 unsigned bytes (major depth, minor depth per individual) to
# the output file.
while ( not eof $countsFH and not eof $MAFfh ) {
# Each iteration of this loop processes a single locus from the ANGSD output
my $MAFline = <$MAFfh>;
my $countsLine = <$countsFH>;
chomp($countsLine);
if ($countsLine =~ /^ind/) { # This is the header line of the counts file, so skip
    # NOTE(review): skipping here also discards the mafs line read above,
    # which is assumed to be that file's own header — verify the two files
    # always carry matching header lines.
    next;
}
my @mafFields = split(/\t/, $MAFline);
my $majorAllele = $mafFields[2];  # mafs column 3: major allele
my $minorAllele = $mafFields[3];  # mafs column 4: minor allele
# BUG FIX: the original unanchored /[ATCG]/ accepted any string merely
# CONTAINING one of those letters (e.g. "AN"). Require exactly one base.
if ($majorAllele !~ /^[ACGT]$/ or $minorAllele !~ /^[ACGT]$/) {
die "Either the major allele ($majorAllele) or the minor allele ($minorAllele) \
does not equal A, C, G, or T\n";
}
# Get the information on the read counts for all the individuals
my @countsFields = split(/\t/, $countsLine);
if (max(@countsFields) > $maxReadDepth) { # Make sure none of the read counts are higher than the max allowed
next;
}
my $tortCounter = 0;
while ($tortCounter < $numIndividuals) {
    # Each individual owns the next 4 columns (A, C, G, T counts, in the
    # order given by %alleleOrderHash).
    my @individualDepths = splice(@countsFields,0,4); # Remove the first 4 elements of the array
    my $majorDepth = $individualDepths[$alleleOrderHash{$majorAllele}];
    my $minorDepth = $individualDepths[$alleleOrderHash{$minorAllele}];
    # Uncomment the line below if you want a text output instead of binary
    #print $outFH "$majorDepth\t$minorDepth\t";
    print $outFH pack("C", $majorDepth); # "C" = 8-bit unsigned integer
    print $outFH pack("C", $minorDepth); # "C" = 8-bit unsigned integer
    $tortCounter++;
}
$locusCounter++;
}
# Buffered-write errors only surface at close time, so check the write handle.
close($outFH) or die "Couldn't close $outFile after writing: $!\n";
close($MAFfh);
close($countsFH);