Skip to content

Commit

Permalink
[misc:feat,test] Add -doGQ 7 to handle non-SNPs
Browse files Browse the repository at this point in the history
Add -doGQ 7 option to treat non-SNP sites as called with highest GQ
score.

Add test for the new option.
  • Loading branch information
isinaltinkaya committed Jan 9, 2024
1 parent c78454b commit 8f175d5
Show file tree
Hide file tree
Showing 8 changed files with 354 additions and 28 deletions.
9 changes: 7 additions & 2 deletions misc/Makefile
Expand Up @@ -44,11 +44,13 @@ all: $(PROGRAM)
CXXSRC = $(wildcard *.cpp)
OBJ = $(CXXSRC:.cpp=.o)

FLAGS := $(FLAGS) $(CXXFLAGS)

-include $(OBJ:.o=.d)

%.o: %.cpp
$(CXX) -c $(CXXFLAGS) $(COMPILEMODE) $*.cpp
$(CXX) -MM $(CXXFLAGS) $(COMPILEMODE) $*.cpp >$*.d
$(CXX) -c $(FLAGS) $*.cpp
$(CXX) -MM $(FLAGS) $*.cpp >$*.d



Expand All @@ -64,3 +66,6 @@ test:
$(info Test 1)
./$(PROGRAM) -i data/call.vcf -t data/truth.vcf -o testwd/out_test.tsv;
diff -s testwd/out_test.tsv reference/test_doGq0.tsv
./$(PROGRAM) -i data/call3.vcf -t data/truth3.vcf -o testwd/out_test3.tsv -doGQ 7;
diff -s testwd/out_test3.tsv reference/test3_doGq7.tsv

9 changes: 0 additions & 9 deletions misc/data/call.vcf
@@ -1,29 +1,20 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##source=tskit 0.4.1
##contig=<ID=chr22,length=51304566>
##fileDate=Thu Oct 12 22:39:39 2023
##source=vcfgl --input sim/sim_vcfgl_2310/model_OutOfAfrica_3G09/contig_chr22/vcf/sim_vcfgl_2310-OutOfAfrica_3G09-chr22-rep0.vcf --output sim/sim_vcfgl_2310/model_OutOfAfrica_3G09/contig_chr22/vcfgl/sim_vcfgl_2310-OutOfAfrica_3G09-chr22-rep0-d1-e0.002-qs1_betavar7 --output-mode b --error-rate 0.002000 --depth 1.000000 --depths-file (null) --pos0 0 --seed 1 --error-qs 1 --beta-variance 1.000000e-07 --rm-invar-sites 0 --trim-alt-alleles 0 --platform 0 -explode 0 -addGL 1 -addGP 1 -addPL 1 -addI16 0 -addQS 1 -addFormatDP 1 -addFormatAD 1 -addFormatADF 0 -addFormatADR 0 -addInfoDP 0 -addInfoAD 1 -addInfoADF 0 -addInfoADR 0
##source=vcfgl version: v0.3.2 71cd0a9-dirty
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Simulated per-sample read depth">
##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype likelihood in log10 likelihood ratio format">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype likelihoods">
##FORMAT=<ID=GP,Number=G,Type=Float,Description="Genotype probabilities">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the REF and ALT alleles">
##INFO=<ID=AD,Number=R,Type=Integer,Description="Total allelic depths for the REF and ALT alleles">
##bcftools_annotateVersion=1.18+htslib-1.18
##bcftools_annotateCommand=annotate -x FORMAT/GT -O b -o sim/sim_vcfgl_2310/model_OutOfAfrica_3G09/contig_chr22/vcfgl/nogt/sim_vcfgl_2310-OutOfAfrica_3G09-chr22-rep0-d1-e0.002-qs1_betavar7.bcf sim/sim_vcfgl_2310/model_OutOfAfrica_3G09/contig_chr22/vcfgl/sim_vcfgl_2310-OutOfAfrica_3G09-chr22-rep0-d1-e0.002-qs1_betavar7.bcf; Date=Fri Oct 13 17:03:36 2023
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Phred-scaled Genotype Quality">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=DP4,Number=4,Type=Integer,Description="Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases">
##INFO=<ID=MQ,Number=1,Type=Integer,Description="Average mapping quality">
##INFO=<ID=PV4,Number=4,Type=Float,Description="P-values for strand bias, baseQ bias, mapQ bias and tail distance bias">
##bcftools_callVersion=1.18+htslib-1.18
##bcftools_callCommand=call --keep-alts --annotate FORMAT/GQ,FORMAT/GP,INFO/PV4 -mv --prior 0.0011 -O b -o sim/sim_vcfgl_2310/model_OutOfAfrica_3G09/contig_chr22/genotype_calling/sim_vcfgl_2310-OutOfAfrica_3G09-chr22-rep0-d1-e0.002-qs1_betavar7-snp.bcf sim/sim_vcfgl_2310/model_OutOfAfrica_3G09/contig_chr22/vcfgl/nogt/sim_vcfgl_2310-OutOfAfrica_3G09-chr22-rep0-d1-e0.002-qs1_betavar7.bcf; Date=Fri Oct 13 17:05:57 2023
##bcftools_viewVersion=1.18-6-gc7cbe0b3+htslib-1.18-7-gac70212
##bcftools_viewCommand=view -s popYRI_ind1 ../../../simulation/sim/sim_vcfgl_2310/model_OutOfAfrica_3G09/contig_chr22/genotype_calling/sim_vcfgl_2310-OutOfAfrica_3G09-chr22-rep0-d1-e0.002-qs1_betavar7-snp.bcf; Date=Thu Nov 9 11:29:14 2023
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT popYRI_ind1
chr22 1 . C A,T,G 71.1385 PASS AD=83,6,0,0;AC=0,0,0;AN=2 GT:DP:GL:PL:GP:AD:GQ 1/1:2:0,-0.601541,-6.45269,-0.601541,-6.45269,-6.45269,-0.601541,-6.45269,-6.45269,-6.45269:0,6,65,6,65,65,6,65,65,65:0.97304,0.0269597,9.35923e-10,0,0,0,0,0,0,0:2,0,0,0:15
chr22 10 . A C 527.955 PASS AD=88,24,0,0;AC=0,0,0;AN=2 GT:DP:GL:PL:GP:AD:GQ 0/1:1:0,-0.3008,-3.27643,-0.3008,-3.27643,-3.27643,-0.3008,-3.27643,-3.27643,-3.27643:0,3,33,3,33,33,3,33,33,33:0.784909,0.215061,2.93931e-05,0,0,0,0,0,0,0:1,0,0,0:6
Expand Down
7 changes: 0 additions & 7 deletions misc/data/call2.vcf
@@ -1,10 +1,7 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##source=tskit 0.4.1
##contig=<ID=chr22,length=51304566>
##fileDate=Tue Dec 19 11:21:41 2023
##source=vcfgl [version: v0.4] [build: Dec 18 2023 17:43:39] [htslib: 1.16]
##source=Command: vcfgl --verbose 0 --threads 1 --seed 1 --input sim/sim_vcfgl_2312/model_OutOfAfrica_3G09/contig_chr22/vcf/sim_vcfgl_2312-OutOfAfrica_3G09-chr22.vcf --output sim/sim_vcfgl_2312/model_OutOfAfrica_3G09/contig_chr22/vcfgl_gl1/sim_vcfgl_2312-OutOfAfrica_3G09-chr22-rep0-d1-e0.002_qs0_0 --output-mode b --depth 1.000000 --error-rate 0.002000 --error-qs 0 --beta-variance -1.000000e+00 --gl-model 1 --gl1-theta 0.830000 --platform 0 --precise-gl 0 --i16-mapq 20 --gvcf-dps (null) -explode 0 --rm-invar-sites 3 --rm-empty-sites 1 -doUnobserved 1 -doGVCF 0 -printPileup 1 -printTruth 1 -addGL 1 -addGP 0 -addPL 1 -addI16 0 -addQS 1 -addFormatDP 1 -addInfoDP 0 -addFormatAD 1 -addInfoAD 1 -addFormatADF 0 -addInfoADF 0 -addFormatADR 0 -addInfoADR 0
##ALT=<ID=*,Description="Symbolic alternate allele representing any possible alternative allele at this location">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Simulated per-sample read depth">
##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype likelihood in log10 likelihood ratio format">
Expand All @@ -19,10 +16,6 @@
##INFO=<ID=DP4,Number=4,Type=Integer,Description="Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases">
##INFO=<ID=MQ,Number=1,Type=Integer,Description="Average mapping quality">
##INFO=<ID=PV4,Number=4,Type=Float,Description="P-values for strand bias, baseQ bias, mapQ bias and tail distance bias">
##bcftools_callVersion=1.18+htslib-1.18
##bcftools_callCommand=call --annotate FORMAT/GQ,FORMAT/GP,INFO/PV4 -mv --prior 0.0011 -O b -o sim/sim_vcfgl_2312/model_OutOfAfrica_3G09/contig_chr22/genotype_calling_gl1/sim_vcfgl_2312-OutOfAfrica_3G09-chr22-rep0-d1-e0.002-qs0_0-snp.bcf sim/sim_vcfgl_2312/model_OutOfAfrica_3G09/contig_chr22/vcfgl_gl1/sim_vcfgl_2312-OutOfAfrica_3G09-chr22-rep0-d1-e0.002_qs0_0.bcf; Date=Tue Dec 19 11:59:11 2023
##bcftools_viewVersion=1.18-6-gc7cbe0b3+htslib-1.18-7-gac70212
##bcftools_viewCommand=view ../../../simulation/sim/sim_vcfgl_2312/model_OutOfAfrica_3G09/contig_chr22/genotype_calling_gl1/sim_vcfgl_2312-OutOfAfrica_3G09-chr22-rep0-d1-e0.002-qs0_0-snp.bcf; Date=Thu Dec 21 14:39:20 2023
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT popYRI_ind1 popYRI_ind2
chr22 42 . A C,T 515.208 PASS . GT:DP:GQ ./.:0:0 1/1:2:7
chr22 44 . A T 515.208 PASS . GT:DP:GQ 0/1:0:127 0/0:2:16
26 changes: 26 additions & 0 deletions misc/data/call3.vcf
@@ -0,0 +1,26 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=chr22,length=10>
##fileDate=Tue Dec 19 11:21:41 2023
##ALT=<ID=*,Description="Symbolic alternate allele representing any possible alternative allele at this location">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Simulated per-sample read depth">
##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype likelihood in log10 likelihood ratio format">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype likelihoods">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the REF and ALT alleles">
##INFO=<ID=AD,Number=R,Type=Integer,Description="Total allelic depths for the REF and ALT alleles">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Phred-scaled Genotype Quality">
##FORMAT=<ID=GP,Number=G,Type=Float,Description="Genotype posterior probabilities in the range 0 to 1">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=DP4,Number=4,Type=Integer,Description="Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases">
##INFO=<ID=MQ,Number=1,Type=Integer,Description="Average mapping quality">
##INFO=<ID=PV4,Number=4,Type=Float,Description="P-values for strand bias, baseQ bias, mapQ bias and tail distance bias">
##source=manually created by isinaltinkaya 240109
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT popYRI_ind1 popYRI_ind2
chr22 2 . A . 515.208 PASS . GT 0/0 0/0
chr22 4 . C A,T 515.208 PASS . GT:DP:GQ ./.:0:0 1/1:2:7
chr22 5 . A C 515.208 PASS . GT:DP:GQ 0/0:10:10 1/0:10:5
chr22 6 . C . 515.208 PASS . GT 0/0 0/0
chr22 8 . A . 515.208 PASS . GT ./. 0/0
chr22 9 . A T 515.208 PASS . GT:DP:GQ 0/1:10:127 1/1:2:26
5 changes: 1 addition & 4 deletions misc/data/truth2.vcf
Expand Up @@ -4,10 +4,7 @@
##contig=<ID=chr22,length=51304566>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##fileDate=Tue Dec 19 11:21:41 2023
##source=vcfgl [version: v0.4] [build: Dec 18 2023 17:43:39] [htslib: 1.16]
##source=Command: vcfgl --verbose 0 --threads 1 --seed 1 --input sim/sim_vcfgl_2312/model_OutOfAfrica_3G09/contig_chr22/vcf/sim_vcfgl_2312-OutOfAfrica_3G09-chr22.vcf --output sim/sim_vcfgl_2312/model_OutOfAfrica_3G09/contig_chr22/vcfgl_gl1/sim_vcfgl_2312-OutOfAfrica_3G09-chr22-rep0-d1-e0.002_qs0_0 --output-mode b --depth 1.000000 --error-rate 0.002000 --error-qs 0 --beta-variance -1.000000e+00 --gl-model 1 --gl1-theta 0.830000 --platform 0 --precise-gl 0 --i16-mapq 20 --gvcf-dps (null) -explode 0 --rm-invar-sites 3 --rm-empty-sites 1 -doUnobserved 1 -doGVCF 0 -printPileup 1 -printTruth 1 -addGL 1 -addGP 0 -addPL 1 -addI16 0 -addQS 1 -addFormatDP 1 -addInfoDP 0 -addFormatAD 1 -addInfoAD 1 -addFormatADF 0 -addInfoADF 0 -addFormatADR 0 -addInfoADR 0
##bcftools_viewVersion=1.18-6-gc7cbe0b3+htslib-1.18-7-gac70212
##bcftools_viewCommand=view ../../../simulation/sim/sim_vcfgl_2312/model_OutOfAfrica_3G09/contig_chr22/vcfgl_gl1/sim_vcfgl_2312-OutOfAfrica_3G09-chr22-rep0-d1-e0.002_qs0_0.truth.bcf; Date=Thu Dec 21 14:40:18 2023
##source=manual edit
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT popYRI_ind1 popYRI_ind2
chr22 42 . A C 515.208 PASS . GT 0/0 1/1
chr22 44 . A C 515.208 PASS . GT 1/0 0/0
13 changes: 13 additions & 0 deletions misc/data/truth3.vcf
@@ -0,0 +1,13 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=chr22,length=10>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##fileDate=Tue Dec 19 11:21:41 2023
##source=manually created by isinaltinkaya 240109
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT popYRI_ind1 popYRI_ind2
chr22 2 . A C 515.208 PASS . GT 0/1 1/1
chr22 4 . A C 515.208 PASS . GT 0/0 0/1
chr22 5 . A C 515.208 PASS . GT 0/0 0/1
chr22 6 . A C 515.208 PASS . GT 1/1 0/0
chr22 8 . A C 515.208 PASS . GT 0/0 0/0
chr22 9 . A C 515.208 PASS . GT 1/0 1/1
55 changes: 49 additions & 6 deletions misc/gtDiscordance.cpp
@@ -1,4 +1,4 @@
// v3
// v4
// isinaltinkaya


Expand Down Expand Up @@ -65,6 +65,7 @@ void usage(void) {
fprintf(stderr, " -doGQ 4 print tidy gq summary with hom/het details\n");
fprintf(stderr, " -doGQ 5 per-sample doGQ 3\n");
fprintf(stderr, " -doGQ 6 per-sample doGQ 4\n");
fprintf(stderr, " -doGQ 7 per-sample doGQ 4, for the sites with missing GQ assume highest GQ value\n");
fprintf(stderr, " -h, --help print help\n");
fprintf(stderr, "\n");

Expand Down Expand Up @@ -132,6 +133,8 @@ int main(int argc, char** argv)
fprintf(stderr, "-doGQ 5. Will do gq and attempt to tidy up the output for each sample.\n");
} else if (doGq == 6) {
fprintf(stderr, "-doGQ 6. Will do gq and attempt to tidy up the output for each sample and add true/call hom/het status information.\n");
} else if (doGq == 7) {
fprintf(stderr, "-doGQ 7. Will do gq and attempt to tidy up the output for each sample and add true/call hom/het status information. For the sites with missing GQ assume highest GQ value.\n");
} else {
ERROR("Unknown doGq:%d\n", doGq);
}
Expand Down Expand Up @@ -255,9 +258,9 @@ int main(int argc, char** argv)
gt_arr2 = NULL;
ngq = 0;
ngq_arr = 0;
if(gq_arr!=NULL){
free(gq_arr);
gq_arr = NULL;
if (gq_arr != NULL) {
free(gq_arr);
gq_arr = NULL;
}


Expand All @@ -276,8 +279,24 @@ int main(int argc, char** argv)
// get genotype quality scores from the input/call file
if (doGq > 0) {
ngq = bcf_get_format_int32(hdrInput, recInput, "GQ", &gq_arr, &ngq_arr);
ASSERT(ngq > 0);
ASSERT(ngq==nSamples);

if (doGq == 7) {

if (gq_arr == NULL) {
// if GQ is missing, assume highest GQ value
gq_arr = (int32_t*)malloc(nSamples * sizeof(int32_t));
for (int i = 0; i < nSamples; i++)
{
gq_arr[i] = MAX_GQ;
}
}


} else {
ASSERT(ngq > 0);
ASSERT(ngq == nSamples);

}
}


Expand Down Expand Up @@ -497,6 +516,30 @@ int main(int argc, char** argv)
}
}

} else if (7 == doGq) {

// per-sample doGq 4 with missing GQ assumed to be highest GQ
for (int i = 0; i < nSamples; i++)
{
// colnames(d)<-c("Sample","GQ","nDiscordant","nHomToHomDiscordant","nHomToHetDiscordant","nHetToHomDiscordant","nHetToHetDiscordant","nConcordant","nHomToHomConcordant","nHetToHetConcordant","nSitesComparedForSample")
// N.B. nOverall == nSitesComparedForSample == nSitesInTotal - nSitesDP0forSample
for (int k = 1;k < GQ_ARR_SIZE;++k) {
fprintf(out_fp, "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n",
i,
k,
gqCounts[i][COUNT_OVERALL][k][T_DISCORDANT],
gqCounts[i][HOM_TO_HOM][k][T_DISCORDANT],
gqCounts[i][HOM_TO_HET][k][T_DISCORDANT],
gqCounts[i][HET_TO_HOM][k][T_DISCORDANT],
gqCounts[i][HET_TO_HET][k][T_DISCORDANT],
gqCounts[i][COUNT_OVERALL][k][T_CONCORDANT],
gqCounts[i][HOM_TO_HOM][k][T_CONCORDANT],
gqCounts[i][HET_TO_HET][k][T_CONCORDANT],
nSites_compared_forSample[i]
);
}
}

} else if (0 == doGq) {
for (int i = 0; i < nSamples; i++)
{
Expand Down

0 comments on commit 8f175d5

Please sign in to comment.