From ebc45024febd7ddbb1a2e4821317a83be05e991d Mon Sep 17 00:00:00 2001 From: Isaac Turner Date: Tue, 4 Oct 2016 14:43:51 +0100 Subject: [PATCH 1/3] Merge develop into master Add disk based fetching of kmers for mccortex server and API webserver Speed up hash table initialisation Add new kmer size experiment Add new breakpoint tests Fix bugs in seq2pdf.sh, ctx_reads.c --- .gitignore | 1 + libs/bcftools | 2 +- libs/bit_array | 2 +- libs/htslib | 2 +- libs/sort_r | 2 +- libs/string_buffer | 2 +- results/kmer_agnostic/Makefile | 95 --- results/kmer_agnostic/list-contig-lengths.sh | 27 - results/kmer_agnostic/plot-kmer-err.R | 79 --- results/kmer_agnostic/plot-link-err.R | 77 --- results/kmer_agnostic/plot-results.R | 107 ---- results/kmer_agnostic/report/Makefile | 13 - .../kmer_agnostic/report/kmer_agnostic.tex | 49 -- results/kmer_agnostic/run.sh | 299 --------- results/kmer_agnostic120/Makefile | 38 -- results/kmer_agnostic120/run.sh | 205 ------ results/kmer_agnostic80/Makefile | 38 -- results/kmer_agnostic80/run.sh | 205 ------ results/kmer_size_experiment/Makefile | 81 +++ .../results/20160912mon/notes.txt | 10 + .../results/20160912mon/perfect.cov.pdf | Bin 0 -> 5615 bytes .../results/20160912mon/perfect.links.csv | 10 + .../results/20160912mon/perfect.plain.csv | 10 + .../results/20160912mon/seqn.errors.csv | 12 + .../results/20160912mon/stoch.cov.pdf | Bin 0 -> 5663 bytes .../results/20160912mon/stoch.links.csv | 10 + .../results/20160912mon/stoch.plain.csv | 10 + .../results/20160912mon/stocherr.cov.pdf | Bin 0 -> 5682 bytes .../results/20160912mon/stocherr.links.csv | 10 + .../results/20160912mon/stocherr.plain.csv | 10 + .../results/20160929thurs/bad.edges.csv | 12 + .../results/20160929thurs/cleaning.table.csv | 12 + .../results/20160929thurs/perfect.cov.pdf | Bin 0 -> 6040 bytes .../results/20160929thurs/perfect.links.csv | 10 + .../results/20160929thurs/perfect.pe.csv | 10 + .../results/20160929thurs/perfect.plain.csv | 10 + 
.../results/20160929thurs/stoch.cov.pdf | Bin 0 -> 6066 bytes .../results/20160929thurs/stoch.links.csv | 10 + .../results/20160929thurs/stoch.pe.csv | 10 + .../results/20160929thurs/stoch.plain.csv | 10 + .../results/20160929thurs/stocherr.cov.pdf | Bin 0 -> 6043 bytes .../results/20160929thurs/stocherr.links.csv | 10 + .../results/20160929thurs/stocherr.pe.csv | 10 + .../results/20160929thurs/stocherr.plain.csv | 10 + .../results/make-cleaning-table.py | 44 ++ .../kmer_size_experiment/results/make-csv.sh | 11 + .../results/make-csvs-and-plots.sh | 19 + .../results/plot-n50-and-errs.R | 84 +++ results/kmer_size_experiment/runk.mk | 97 +++ scripts/mccortex | 6 + scripts/mccortex-server.py | 128 ---- scripts/python/break-contigs-vs-truth.py | 253 ++++++++ scripts/python/count-bad-edges.py | 56 ++ scripts/python/mccortex-server.py | 144 +++++ scripts/python/mccortex.py | 8 +- scripts/python/pyRBT.py | 597 ++++++++++++++++++ scripts/seq2pdf.sh | 22 +- src/commands/ctx_breakpoints.c | 3 +- src/commands/ctx_bubbles.c | 3 +- src/commands/ctx_clean.c | 8 +- src/commands/ctx_contigs.c | 3 +- src/commands/ctx_correct.c | 3 +- src/commands/ctx_dist_matrix.c | 6 +- src/commands/ctx_exp_abc.c | 5 +- src/commands/ctx_health_check.c | 3 +- src/commands/ctx_index.c | 4 +- src/commands/ctx_infer_edges.c | 6 +- src/commands/ctx_pjoin.c | 3 +- src/commands/ctx_pop_bubbles.c | 4 +- src/commands/ctx_pview.c | 3 +- src/commands/ctx_reads.c | 1 + src/commands/ctx_rmsubstr.c | 31 +- src/commands/ctx_server.c | 306 ++++++--- src/commands/ctx_sort.c | 4 +- src/commands/ctx_thread.c | 15 +- src/commands/ctx_uniqkmers.c | 4 +- src/commands/ctx_unitigs.c | 2 +- src/global/cortex_types.h | 3 + src/global/global.h | 2 +- src/graph/binary_kmer.c | 24 +- src/graph/binary_kmer.h | 56 +- src/graph/cmd_mem.c | 2 +- src/graph/db_graph.c | 19 +- src/graph/db_graph.h | 2 +- src/graph/db_node.c | 10 +- src/graph/db_node.h | 18 +- src/graph/db_unitig.c | 16 +- src/graph/graph_cache.c | 4 +- 
src/graph/graph_file_reader.c | 26 +- src/graph/graph_file_reader.h | 16 +- src/graph/graph_search.c | 165 +++++ src/graph/graph_search.h | 26 + src/graph/graph_walker.c | 10 +- src/graph/graph_writer.c | 20 +- src/graph/graphs_load.c | 62 +- src/graph/graphs_load.h | 5 +- src/graph/hash_table.c | 71 +-- src/graph/hash_table.h | 61 +- src/graph/json_hdr.c | 4 +- src/graph/json_hdr.h | 2 +- src/graph/prune_nodes.c | 4 +- src/graph_paths/gpath_checks.c | 8 +- src/graph_paths/gpath_reader.c | 70 +- src/graph_paths/gpath_reader.h | 42 +- src/graph_paths/gpath_save.c | 6 +- src/paths/gpath_hash.h | 2 +- src/paths/gpath_store.c | 7 +- src/paths/gpath_store.h | 2 + src/tests/bkmer_tests.c | 22 +- src/tests/cleaning_tests.c | 42 +- src/tests/corrected_aln_tests.c | 4 +- src/tests/graph_crawler_tests.c | 2 +- src/tests/hash_table_tests.c | 10 +- src/tests/infer_edges_tests.c | 2 +- src/tests/node_tests.c | 2 +- src/tests/repeat_walker_tests.c | 3 +- src/tests/subgraph_tests.c | 11 +- src/tools/assemble_contigs.c | 2 +- src/tools/breakpoint_caller.c | 15 +- src/tools/bubble_caller.c | 7 +- src/tools/clean_graph.c | 6 +- src/tools/correct_reads.c | 4 +- src/tools/generate_paths.c | 2 +- src/tools/genotyping.h | 2 +- src/tools/infer_edges.c | 2 +- src/tools/subgraph.c | 2 +- tests/breakpoint/Makefile | 127 +--- .../breakpoint0}/Makefile | 6 +- tests/breakpoint/breakpoint1/Makefile | 119 ++++ tests/breakpoint/breakpoint2/Makefile | 44 ++ tests/dist_matrix/Makefile | 2 +- 131 files changed, 2618 insertions(+), 1991 deletions(-) delete mode 100644 results/kmer_agnostic/Makefile delete mode 100755 results/kmer_agnostic/list-contig-lengths.sh delete mode 100644 results/kmer_agnostic/plot-kmer-err.R delete mode 100644 results/kmer_agnostic/plot-link-err.R delete mode 100644 results/kmer_agnostic/plot-results.R delete mode 100644 results/kmer_agnostic/report/Makefile delete mode 100644 results/kmer_agnostic/report/kmer_agnostic.tex delete mode 100755 results/kmer_agnostic/run.sh 
delete mode 100644 results/kmer_agnostic120/Makefile delete mode 100755 results/kmer_agnostic120/run.sh delete mode 100644 results/kmer_agnostic80/Makefile delete mode 100755 results/kmer_agnostic80/run.sh create mode 100644 results/kmer_size_experiment/Makefile create mode 100644 results/kmer_size_experiment/results/20160912mon/notes.txt create mode 100644 results/kmer_size_experiment/results/20160912mon/perfect.cov.pdf create mode 100644 results/kmer_size_experiment/results/20160912mon/perfect.links.csv create mode 100644 results/kmer_size_experiment/results/20160912mon/perfect.plain.csv create mode 100644 results/kmer_size_experiment/results/20160912mon/seqn.errors.csv create mode 100644 results/kmer_size_experiment/results/20160912mon/stoch.cov.pdf create mode 100644 results/kmer_size_experiment/results/20160912mon/stoch.links.csv create mode 100644 results/kmer_size_experiment/results/20160912mon/stoch.plain.csv create mode 100644 results/kmer_size_experiment/results/20160912mon/stocherr.cov.pdf create mode 100644 results/kmer_size_experiment/results/20160912mon/stocherr.links.csv create mode 100644 results/kmer_size_experiment/results/20160912mon/stocherr.plain.csv create mode 100644 results/kmer_size_experiment/results/20160929thurs/bad.edges.csv create mode 100644 results/kmer_size_experiment/results/20160929thurs/cleaning.table.csv create mode 100644 results/kmer_size_experiment/results/20160929thurs/perfect.cov.pdf create mode 100644 results/kmer_size_experiment/results/20160929thurs/perfect.links.csv create mode 100644 results/kmer_size_experiment/results/20160929thurs/perfect.pe.csv create mode 100644 results/kmer_size_experiment/results/20160929thurs/perfect.plain.csv create mode 100644 results/kmer_size_experiment/results/20160929thurs/stoch.cov.pdf create mode 100644 results/kmer_size_experiment/results/20160929thurs/stoch.links.csv create mode 100644 results/kmer_size_experiment/results/20160929thurs/stoch.pe.csv create mode 100644 
results/kmer_size_experiment/results/20160929thurs/stoch.plain.csv create mode 100644 results/kmer_size_experiment/results/20160929thurs/stocherr.cov.pdf create mode 100644 results/kmer_size_experiment/results/20160929thurs/stocherr.links.csv create mode 100644 results/kmer_size_experiment/results/20160929thurs/stocherr.pe.csv create mode 100644 results/kmer_size_experiment/results/20160929thurs/stocherr.plain.csv create mode 100755 results/kmer_size_experiment/results/make-cleaning-table.py create mode 100755 results/kmer_size_experiment/results/make-csv.sh create mode 100755 results/kmer_size_experiment/results/make-csvs-and-plots.sh create mode 100755 results/kmer_size_experiment/results/plot-n50-and-errs.R create mode 100644 results/kmer_size_experiment/runk.mk delete mode 100755 scripts/mccortex-server.py create mode 100644 scripts/python/break-contigs-vs-truth.py create mode 100644 scripts/python/count-bad-edges.py create mode 100755 scripts/python/mccortex-server.py create mode 100644 scripts/python/pyRBT.py create mode 100644 src/graph/graph_search.c create mode 100644 src/graph/graph_search.h rename tests/{breakpoint_empty => breakpoint/breakpoint0}/Makefile (84%) create mode 100644 tests/breakpoint/breakpoint1/Makefile create mode 100644 tests/breakpoint/breakpoint2/Makefile diff --git a/.gitignore b/.gitignore index 55d33588..c0447059 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ *.RData *.Rhistory *.DS_Store +*.pyc /commit.txt /notes.txt diff --git a/libs/bcftools b/libs/bcftools index 9691ef38..fd5c871c 160000 --- a/libs/bcftools +++ b/libs/bcftools @@ -1 +1 @@ -Subproject commit 9691ef380276cadd7236e7e4ccc43771f59e7dce +Subproject commit fd5c871c191d520422f7392a998e2e973e9cf76f diff --git a/libs/bit_array b/libs/bit_array index 21ae09d8..3699b652 160000 --- a/libs/bit_array +++ b/libs/bit_array @@ -1 +1 @@ -Subproject commit 21ae09d8d5086ec06a3f86f1965c9ad7cfef21f1 +Subproject commit 3699b6526d7f1f4e48bc8300f5ffcdbfd2793c6b diff --git 
a/libs/htslib b/libs/htslib index e79775f5..4295de42 160000 --- a/libs/htslib +++ b/libs/htslib @@ -1 +1 @@ -Subproject commit e79775f5173ddb68cdfa4abbd6d50dc828c26304 +Subproject commit 4295de423b015b216587a9043f1c0504d5d48f3f diff --git a/libs/sort_r b/libs/sort_r index cf4e60dd..6c822360 160000 --- a/libs/sort_r +++ b/libs/sort_r @@ -1 +1 @@ -Subproject commit cf4e60dd8216f0ac9c697c5e26f642272939e0b5 +Subproject commit 6c822360a09bbb391bb0dfa4d9b0c1f65210f9a8 diff --git a/libs/string_buffer b/libs/string_buffer index 36dbd31c..35dfcd3c 160000 --- a/libs/string_buffer +++ b/libs/string_buffer @@ -1 +1 @@ -Subproject commit 36dbd31ccfcba67f727e0eb01c6ccba77c87cc28 +Subproject commit 35dfcd3c69de5e23b59ad5e853300e9e321b6363 diff --git a/results/kmer_agnostic/Makefile b/results/kmer_agnostic/Makefile deleted file mode 100644 index 5ef32f60..00000000 --- a/results/kmer_agnostic/Makefile +++ /dev/null @@ -1,95 +0,0 @@ -SHELL:=/bin/bash -euo pipefail -# Makefile -# Isaac Turner -# 2014-10-01 -# Using 1Mb of chr22 and error free reads to measure effect of kmer-size - -# Expand these for all k values -DIRS=plots readlen80 readlen100 readlen125 - -LINK_100_CSVS=readlen100/k15/link_cleaning/link_cleaning.csv \ - readlen100/k21/link_cleaning/link_cleaning.csv \ - readlen100/k31/link_cleaning/link_cleaning.csv \ - readlen100/k41/link_cleaning/link_cleaning.csv \ - readlen100/k51/link_cleaning/link_cleaning.csv \ - readlen100/k63/link_cleaning/link_cleaning.csv \ - readlen100/k75/link_cleaning/link_cleaning.csv - -KMER_100_CSVS=readlen100/k15/kmer_cleaning/kmer_cleaning.csv \ - readlen100/k21/kmer_cleaning/kmer_cleaning.csv \ - readlen100/k31/kmer_cleaning/kmer_cleaning.csv \ - readlen100/k41/kmer_cleaning/kmer_cleaning.csv \ - readlen100/k51/kmer_cleaning/kmer_cleaning.csv \ - readlen100/k63/kmer_cleaning/kmer_cleaning.csv \ - readlen100/k75/kmer_cleaning/kmer_cleaning.csv - -RESULT_100_CSVS=readlen100/gP.l0.plain.stats.csv \ - readlen100/gS.l0.plain.stats.csv \ - 
readlen100/gE.l0.plain.stats.csv \ - readlen100/gP.lP.raw.links.stats.csv \ - readlen100/gP.lP.raw.string.stats.csv \ - readlen100/gS.lS.raw.links.stats.csv \ - readlen100/gS.lS.raw.string.stats.csv \ - readlen100/gS.lS.clean.links.stats.csv \ - readlen100/gS.lS.clean.string.stats.csv \ - readlen100/gE.lE.clean.links.stats.csv \ - readlen100/gE.lE.clean.string.stats.csv \ - readlen100/gS.lE.clean.links.stats.csv \ - readlen100/gS.lE.clean.string.stats.csv \ - readlen100/gE.lS.clean.links.stats.csv \ - readlen100/gE.lS.clean.string.stats.csv - -RESULT_80_CSVS =$(subst readlen100,readlen80, $(RESULT_100_CSVS)) -RESULT_125_CSVS=$(subst readlen100,readlen125,$(RESULT_100_CSVS)) - -PLOTS_100=plots/readlen100.pdf plots/link_100.pdf plots/kmer_100.pdf -PLOTS=$(substr 100,80,$(PLOTS_100)) $(PLOTS_100) $(substr 100,125,$(PLOTS_100)) - -all: $(PLOTS) - -$(RESULT_80_CSVS): - mkdir -p readlen80 && cd readlen80 && ../run.sh 80 - -$(RESULT_100_CSVS): - mkdir -p readlen100 && cd readlen100 && ../run.sh 100 - -$(RESULT_125_CSVS): - mkdir -p readlen125 && cd readlen125 && ../run.sh 125 - -plots/readlen80.pdf: $(RESULT_80_CSVS) | $(DIRS) -plots/readlen100.pdf: $(RESULT_100_CSVS) | $(DIRS) -plots/readlen125.pdf: $(RESULT_125_CSVS) | $(DIRS) - -plots/kmer_100.pdf: $(KMER_100_CSVS) | $(DIRS) - R --vanilla -f plot-link-err.R --args $@ $(KMER_100_CSVS) - -plots/link_100.pdf: $(LINK_100_CSVS) | $(DIRS) - R --vanilla -f plot-kmer-err.R --args $@ $(LINK_100_CSVS) - -plots/readlen%.pdf: - R --vanilla -f plot-results.R --args $@ \ - readlen$*/gP.l0.plain.stats.csv \ - readlen$*/gS.l0.plain.stats.csv \ - readlen$*/gE.l0.plain.stats.csv \ - readlen$*/gP.lP.raw.links.stats.csv \ - readlen$*/gP.lP.raw.string.stats.csv \ - readlen$*/gS.lS.raw.links.stats.csv \ - readlen$*/gS.lS.raw.string.stats.csv \ - readlen$*/gS.lS.clean.links.stats.csv \ - readlen$*/gS.lS.clean.string.stats.csv \ - readlen$*/gE.lE.clean.links.stats.csv \ - readlen$*/gE.lE.clean.string.stats.csv \ - 
readlen$*/gS.lE.clean.links.stats.csv \ - readlen$*/gS.lE.clean.string.stats.csv \ - readlen$*/gE.lS.clean.links.stats.csv \ - readlen$*/gE.lS.clean.string.stats.csv - -$(DIRS): - mkdir -p $@ - -clean: - rm -rf $(DIRS) {perf,stoch,stocherr}.{plain,links,string}.stats.csv - -.force: - -.PHONY: all clean plots .force diff --git a/results/kmer_agnostic/list-contig-lengths.sh b/results/kmer_agnostic/list-contig-lengths.sh deleted file mode 100755 index cf4c02d8..00000000 --- a/results/kmer_agnostic/list-contig-lengths.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -if [[ $# -lt 4 ]]; then - echo "usage: $0 " - exit -fi - -out=$1 -shift -strategy=$1 -shift - -# echo "out: $out/" -mkdir -p $out - -for k in "$@" -do - (echo k$k; - dnacat -P k$k/perf.contigs.$strategy.rmdup.fa | \ - awk '{print length($0)}' | \ - sort -nr -;) > $out/$k.txt -done - -a=`echo "{$@}" | tr ' ' ','` -x=`eval echo "$out/$a.txt"` -# echo $x -paste $x diff --git a/results/kmer_agnostic/plot-kmer-err.R b/results/kmer_agnostic/plot-kmer-err.R deleted file mode 100644 index 869f0c51..00000000 --- a/results/kmer_agnostic/plot-kmer-err.R +++ /dev/null @@ -1,79 +0,0 @@ -# R script for plotting experimental results - -# Set fonts to match Latex - library(Cairo) - mainfont <- "Garamond" - CairoFonts(regular = paste(mainfont,"style=Regular",sep=":"), - bold = paste(mainfont,"style=Bold",sep=":"), - italic = paste(mainfont,"style=Italic",sep=":"), - bolditalic = paste(mainfont,"style=Bold Italic,BoldItalic",sep=":")) - pdf <- CairoPDF - png <- CairoPNG -# - -set_margin <- function() { - par(oma=c(0,0,0,0)) - par(mar=c(5,4,1.5,1)) -} - -args <- commandArgs(trailingOnly = TRUE) - -# files=c() -# kmers= c(15,21,31,41,51,63,75,99) -# for(i in 1:length(kmers)) { files[i] = paste('k',kmers[i],'/kmer_cleaning/kmer_cleaning.csv',sep='') } - -out_path=args[1] -files=args[2:length(args)] -nfiles=length(files) -titles=substr(files,1,3) - -# Thresholds used for cleaning, pulled from k*/graphs/stocherr.ctx.log: -# for 
k in k*; do cat $k/graphs/stocherr.ctx.log | grep 'cleaning threshold'; done | grep -o '[0-9]*$' -threshs=c(9,8,8,9,8,7,5) - -cols=rainbow(nfiles); - -data=list() -maxErrKmers=0 -for(i in 1:nfiles) { - data[[i]]=read.csv(file=files[i],as.is=T,header=T) - cat("files[",i,"] = '",files[i],"'\n",sep='') - maxErrKmers=max(maxErrKmers,max(data[[i]][,'errorKmers'])) - - t=threshs[i] - x=which.min(abs(data[[i]][,'kmerThresh']-t)) - act=data[[i]][x,'kmerThresh'] - cat(titles[i],' thresh=',t,'; at ',act,' kmer mismatch = ',data[[i]][x,'errorKmers']/(data[[i]][x,'correctKmers']+data[[i]][x,'errorKmers']),'\n') -} - -pdf(file=out_path,width=10,height=5) - -par(mfrow=c(1,2)) - -i=1 -plot(data[[i]][,'kmerThresh'], 100*data[[i]][,'errorKmers']/data[[i]][,'correctKmers'], - xlim=c(0,30), ylim=c(0,10), - col=cols[i], - xlab='Kmer Threshold', ylab='Match Rate (%)', - main="Kmer kmerThresh vs matching rate",type='b'); - -for(i in 2:nfiles) { - points(data[[i]][,'kmerThresh'], data[[i]][,'errorKmers']/data[[i]][,'correctKmers'], type='b', col=cols[i]); -} - -legend('bottomright',titles,fill=cols) - -i=1 -plot(data[[i]][,'kmerThresh'], data[[i]][,'errorKmers'], - xlim=c(0,30), ylim=c(0,maxErrKmers), - xlab='Kmer Threshold', ylab='Number of bad kmers', - main="kmer-thresh vs # bad kmers",type='b'); - -for(i in 2:nfiles) { - points(data[[i]][,'kmerThresh'],data[[i]][,'errorKmers'], type='b', col=cols[i]); -} - -legend('topright',titles,fill=cols) - -dev.off(); - diff --git a/results/kmer_agnostic/plot-link-err.R b/results/kmer_agnostic/plot-link-err.R deleted file mode 100644 index f8cbeff2..00000000 --- a/results/kmer_agnostic/plot-link-err.R +++ /dev/null @@ -1,77 +0,0 @@ -# R script for plotting experimental results - -# Set fonts to match Latex - library(Cairo) - mainfont <- "Garamond" - CairoFonts(regular = paste(mainfont,"style=Regular",sep=":"), - bold = paste(mainfont,"style=Bold",sep=":"), - italic = paste(mainfont,"style=Italic",sep=":"), - bolditalic = 
paste(mainfont,"style=Bold Italic,BoldItalic",sep=":")) - pdf <- CairoPDF - png <- CairoPNG -# - -set_margin <- function() { - par(oma=c(0,0,0,0)) - par(mar=c(5,4,1.5,1)) -} - -args <- commandArgs(trailingOnly = TRUE) - -# files=c() -# kmers= c(15,21,31,41,51,63,75) -# for(i in 1:length(kmers)) { files[i] = paste('k',kmers[i],'/link_cleaning/link_cleaning.csv',sep='') } - -out_path=args[1] -files=args[2:length(args)] -nfiles=length(files) -titles=substr(files,1,3) - -# Thresholds used for cleaning, pulled from: -# for k in k*; do echo $k `tail -1 $k/links/cleaning.txt`; done -threshs=c(17,16,6,5,4,3,3,0) - -cols=rainbow(nfiles); - -data=list() -maxNumLinks=0 -for(i in 1:nfiles) { - data[[i]]=read.csv(file=files[i],as.is=T,header=T) - cat("files[",i,"] = '",files[i],"'\n",sep='') - maxNumLinks=max(maxNumLinks,max(data[[i]][,'numLinks'])) - - t=threshs[i] - x=which.min(abs(data[[i]][,'linkThresh']-t)) - act=data[[i]][x,'linkThresh'] - cat(titles[i],' thresh=',t,'; at ',act,' link mismatch = ',1-data[[i]][x,'numMatch']/data[[i]][x,'numLinks'],'\n') -} - -pdf(file=out_path,width=10,height=5) - -par(mfrow=c(1,2)) - -i=1 -plot(data[[i]][,'linkThresh'],data[[i]][,'matchRate'],ylim=c(0,100), - col=cols[i], - xlab='Link Threshold', ylab='Match Rate (%)', - main="Link threshold vs matching rate",type='b'); - -for(i in 2:nfiles) { - points(data[[i]][,'linkThresh'],data[[i]][,'matchRate'], type='b', col=cols[i]); -} - -legend('bottomright',titles,fill=cols) - -i=1 -plot(data[[i]][,'linkThresh'],data[[i]][,'numLinks'],ylim=c(0,maxNumLinks), - xlab='Link Threshold', ylab='Number of links', - main="Link threshold vs number of links",type='b'); - -for(i in 2:nfiles) { - points(data[[i]][,'linkThresh'],data[[i]][,'numLinks'], type='b', col=cols[i]); -} - -legend('topright',titles,fill=cols) - -dev.off(); - diff --git a/results/kmer_agnostic/plot-results.R b/results/kmer_agnostic/plot-results.R deleted file mode 100644 index dc360c1a..00000000 --- 
a/results/kmer_agnostic/plot-results.R +++ /dev/null @@ -1,107 +0,0 @@ -# R script for plotting experimental results -# require(tikzDevice) - -# Set fonts to match Latex - library(Cairo) - mainfont <- "Garamond" - CairoFonts(regular = paste(mainfont,"style=Regular",sep=":"), - bold = paste(mainfont,"style=Bold",sep=":"), - italic = paste(mainfont,"style=Italic",sep=":"), - bolditalic = paste(mainfont,"style=Bold Italic,BoldItalic",sep=":")) - pdf <- CairoPDF - png <- CairoPNG -# - -# R --vanilla -f plot-results.R --args -#options(echo=FALSE) -# nfiles=9 -args <- commandArgs(trailingOnly = TRUE) -# if(length(args) != nfiles) { -# message("Usage: R --vanilla -f plot-results.R --args <{perf,stoch,stocherr}.{plain,links,string}.csv>") -# quit() -# } - -out_path=args[1] -files=args[2:length(args)] -nfiles=length(files) - -cat('out_path=',out_path,'\n'); - -titles=c() -for(i in 1:nfiles) { titles[i] = substr(files[i],1,nchar(files[i])-10) } - -set_margin <- function() { - par(oma=c(0,0,0,0)) - par(mar=c(5,4,1.5,1)) -} - -data = list() -for(i in 1:nfiles) { - data[[i]]=read.table(file=files[i], sep=',', as.is=T, row.names=1, header=T) - cat("files[",i,"] = '",files[i],"'\n",sep='') -} - -kmers=as.numeric(data[[1]]['kmer',]) - -# tmpcols=rainbow(3) -# cols=tmpcols[rep(1:3,each=3)] -# linetypes=rep(c(1,2,3),times=3) -cols=rainbow(nfiles) -linetypes=rep(c(1,2,3),length.out=nfiles) -pchtypes=rep(c(0,3,4),length.out=nfiles) - -fields=c('median','mean','N50','sum_length','contigs','med_walk') -descriptions=c("Median contig length","Mean contig length","Contig N50", - "Assembled length","Number of contigs", "Median walking distance") - -# Correct length field to account for kmer overlap -for(f in 1:nfiles) { - data[[f]]['sum_length',] = data[[f]]['length',] - - pmax(as.numeric(data[[f]]['contigs',])-1, 0) * - (kmers-1); -} - -# Generate multi-page pdf -pdf(file=out_path,width=7,height=7) - -for(i in 1:length(fields)) { - cat('run',i,'\n'); - field = fields[i] - description = 
descriptions[i] - - # tikz(paste(path,field,'.tex',sep=''),width=2.5,height=2.5) - set_margin() - - m = 0; - for(j in 1:nfiles) { m = max(m, as.numeric(data[[j]][field,])); } - - plot(kmers,as.numeric(data[[1]][field,]),type='b', - col=cols[1], pch=pchtypes[1], lty=linetypes[1], - ylim=c(0,2*m),axes=F,ylab=description,xlab="kmer", - )#main=paste(description,"vs kmer-size")) - axis(side = 2) - axis(side = 1,at=kmers) - - for(j in 2:nfiles) { - points(kmers,as.numeric(data[[j]][field,]),type='b', - col=cols[j], pch=pchtypes[j], lty=linetypes[j]) - } - - s0=0; sn=0; - for(j in 1:nfiles) { - s0 = s0 + as.numeric(data[[j]][field,1]); - sn = sn + as.numeric(data[[j]][field,length(kmers)-1]); - } - - lpos='topright'; - # if(s0 < sn) { lpos='bottomright' } - - # legtxt = c('Perfect Plain', 'Perfect Links', 'Perfect String', - # 'Stoch. Plain', 'Stoch. Links', 'Stoch. String', - # 'Stoch. Err Plain', 'Stoch. Err Links', 'Stoch. Err String'); - legtxt=titles - - legend(lpos,legtxt,col=cols,pch=pchtypes,lty=linetypes) -} - -dev.off() diff --git a/results/kmer_agnostic/report/Makefile b/results/kmer_agnostic/report/Makefile deleted file mode 100644 index 0a6c4ce1..00000000 --- a/results/kmer_agnostic/report/Makefile +++ /dev/null @@ -1,13 +0,0 @@ - -all: kmer_agnostic.pdf - -kmer_agnostic.pdf: kmer_agnostic.tex .force - pdflatex kmer_agnostic - pdflatex kmer_agnostic - -clean: - rm -rf kmer_agnostic.pdf kmer_agnostic.log kmer_agnostic.aux - -.force: - -.PHONY: all clean .force diff --git a/results/kmer_agnostic/report/kmer_agnostic.tex b/results/kmer_agnostic/report/kmer_agnostic.tex deleted file mode 100644 index 24bb8ebb..00000000 --- a/results/kmer_agnostic/report/kmer_agnostic.tex +++ /dev/null @@ -1,49 +0,0 @@ -\documentclass{article} -\title{Kmer agnostic: simulations with perfect coverage} - -\usepackage{graphicx} -\usepackage{subcaption} - -% \usepackage{fullpage} % smaller margins -\usepackage[margin=1.2in]{geometry} - -\begin{document} - -\begin{figure}[ht] 
-\begin{subfigure}{.5\textwidth} - \centering - \caption{Assembly N50} - \includegraphics[width=2.5in]{../plots/N50.pdf} -\end{subfigure} -\begin{subfigure}{.5\textwidth} - \centering - \caption{Contig mean length} - \includegraphics[width=2.5in]{../plots/mean.pdf} -\end{subfigure} - -\begin{subfigure}{.5\textwidth} - \centering - \caption{Median assembly distance ($n=100$)} - \includegraphics[width=2.5in]{../plots/med_walk.pdf} -\end{subfigure} -\begin{subfigure}{.5\textwidth} - \centering - \caption{Contig median length} - \includegraphics[width=2.5in]{../plots/median.pdf} -\end{subfigure} - -\begin{subfigure}{.5\textwidth} - \centering - \caption{Number of contigs} - \includegraphics[width=2.5in]{../plots/contigs.pdf} -\end{subfigure} -\begin{subfigure}{.5\textwidth} - \centering - \caption{Total assembled length} - \includegraphics[width=2.5in]{../plots/sum_length.pdf} -\end{subfigure} -\caption{Simulations with varied $k$mer-size. Perfect coverage is an error free read starting at each base. Stochastic coverage is 100X coverage with empirical sequencing error. Reads are 100bp single-ended. 
Reference is haploid 1Mbp from GRCh37 chr22:17,000,000-17,999,999.} -\label{fig:kmer_agnostic} -\end{figure} - -\end{document} diff --git a/results/kmer_agnostic/run.sh b/results/kmer_agnostic/run.sh deleted file mode 100755 index 60ee2f3e..00000000 --- a/results/kmer_agnostic/run.sh +++ /dev/null @@ -1,299 +0,0 @@ -#!/bin/bash - -set -euo pipefail -set -o xtrace - -# Turn on with -l -RUN_LINK_ERR=0 -# Turn on with -k -RUN_KMER_ERR=0 - -# Don't print parsing input -set +o xtrace - -function usage { - echo "Usage: $0 [options]" 2>&1 - echo " -k measure kmer error" 2>&1 - echo " -l measure link error" 2>&1 - exit -1 -} - -if [ $# -lt 1 ]; then - usage -fi - -READLEN=$1 -shift - -for arg in "$@" -do - if [ $arg == "-l" ]; then - RUN_KMER_ERR=1 - elif [ $arg == "-k" ]; then - RUN_KMER_ERR=1 - else - usage - fi -done - -# xtrace prints commands as we run them -set -o xtrace - -CTXDIR=../.. -CTXK=$CTXDIR/bin/ctx -READSIM=$CTXDIR/libs/readsim/readsim -ALLREADS=$CTXDIR/libs/seq_file/scripts/perfect_covg.sh -STRCHK=$CTXDIR/libs/bioinf-perl/sim_mutations/sim_substrings.pl -CONTIG_STATS=$CTXDIR/libs/bioinf-perl/fastn_scripts/contig_stats.pl -LINK_PROC=$CTXDIR/scripts/cortex_links.pl -LINK_THRESH_SCRIPT=$CTXDIR/scripts/R/make_link_cutoffs.R -# DNACAT=$CTXDIR/libs/seq_file/bin/dnacat - -REF=$CTXDIR/results/data/chr22/uniq_flanks/chr22.1Mbp.uniq.fa -DEPTH=100 -ERR_RATE=0.005 # 0.5% per base sequencing error - -# How many contigs to pull out to find median walk distance -NSEED_MEDIAN_WALK=100 - -# Memory to use for each command -MEM=5G - -# -# Finished configure -# - -# Get executable for a given kmer size -getctx () { - k="$1" - echo "$CTXK"$[ ($k+31)/32*32-1 ]; -} - -kmers=$(echo 15 21 31 41 51 63 75 99 | tr ' ' '\n' | awk '$0 < '$READLEN) -nkmers=$(echo $kmers | tr ' ' '\n' | awk 'END{print NR}') - -# create directories -for k in $kmers; do - mkdir -p k$k/{,graphs,links,contigs,results,kmer_cleaning,link_cleaning} - [ -x $(getctx $k) ] || ( echo "Please compile cortex with 
'make MAXK=$[ ($k+31)/32*32-1 ]'" 1>&2 && false ) -done - -mkdir -p reads - -# g = graph, l = links -# P = perfect, S = stochastic, E = stochastic+error -# raw = pre-cleaning, clean = after cleaning -# e.g. gPF.lPF.raw.ctp.gz - -name_list=(gP.l0 gS.l0 gE.l0 gP.lP.raw gS.lS.raw gS.lS.clean gE.lE.clean gS.lE.clean gE.lS.clean) -glist=( perf.ctx stoch.ctx stocherr.ctx perf.ctx stoch.ctx stoch.ctx stocherr.ctx stoch.ctx stocherr.ctx) -llist=( '' '' '' gP.lP.raw gS.lS.raw gS.lS.clean gE.lE.clean gS.lE.clean gE.lS.clean) -all_indices=$(echo {0..8}) -plain_indices=$(echo {0..2}) -link_indices=$(echo {3..8}) - -annot_list=$(for i in $plain_indices; do echo ${name_list[$i]}.plain; done; - for i in $link_indices; do echo ${name_list[$i]}.{links,string}; done) - -# Generate reads -# Redirect stderr with 2> -[ ! -f reads/perf.fa.gz ] && ($ALLREADS $READLEN $REF | gzip -c) > reads/perf.fa.gz 2> reads/perf.fa.gz.log -[ ! -f reads/stoch.fa.gz ] && $READSIM -l $READLEN -r $REF -d $DEPTH -s reads/stoch >& reads/stoch.fa.gz.log -[ ! -f reads/stocherr.fa.gz ] && $READSIM -l $READLEN -r $REF -d $DEPTH -e $ERR_RATE -s reads/stocherr >& reads/stocherr.fa.gz.log - -# Cortex build k=$(K) -echo == Building cortex graphs == - -for k in $kmers; do - mkdir -p k$k/graphs - [ ! -f k$k/graphs/perf.ctx ] && `getctx $k` build -m $MEM -k $k --sample chr22_17M_18M --seq reads/perf.fa.gz k$k/graphs/perf.ctx >& k$k/graphs/perf.ctx.log - - if [ $k == 99 ]; then clean_thresh="--supernodes=3"; else clean_thresh=""; fi # Use auto threshold for k<99 - [ ! -f k$k/graphs/stocherr.raw.ctx ] && `getctx $k` build -m $MEM -k $k --sample chr22_17M_18M --seq reads/stocherr.fa.gz k$k/graphs/stocherr.raw.ctx >& k$k/graphs/stocherr.raw.ctx.log - [ ! 
-f k$k/graphs/stocherr.ctx ] && `getctx $k` clean -m $MEM $clean_thresh --covg-before k$k/graphs/stocherr.raw.covg.csv --out k$k/graphs/stocherr.ctx k$k/graphs/stocherr.raw.ctx >& k$k/graphs/stocherr.ctx.log - - thresh=$(cat k$k/graphs/stocherr.ctx.log | grep -m 1 'Removing supernodes with coverage < ' | grep -o '[0-9]*' | tail -1) - [ ! -f k$k/graphs/stoch.raw.ctx ] && `getctx $k` build -m $MEM -k $k --sample chr22_17M_18M --seq reads/stoch.fa.gz k$k/graphs/stoch.raw.ctx >& k$k/graphs/stoch.raw.ctx.log - [ ! -f k$k/graphs/stoch.ctx ] && `getctx $k` clean -m $MEM --tips $[$k*2] --supernodes=$thresh --covg-before k$k/graphs/stoch.raw.covg.csv --out k$k/graphs/stoch.ctx k$k/graphs/stoch.raw.ctx >& k$k/graphs/stoch.ctx.log -done - -echo == Read threading == - -for k in $kmers; do - [ ! -f k$k/links/gP.lP.raw.ctp.gz ] && `getctx $k` thread -m $MEM --out k$k/links/gP.lP.raw.ctp.gz --seq reads/perf.fa.gz k$k/graphs/perf.ctx >& k$k/links/gP.lP.raw.ctp.gz.log - [ ! -f k$k/links/gS.lS.raw.ctp.gz ] && `getctx $k` thread -m $MEM --out k$k/links/gS.lS.raw.ctp.gz --seq reads/stoch.fa.gz k$k/graphs/stoch.ctx >& k$k/links/gS.lS.raw.ctp.gz.log - [ ! -f k$k/links/gE.lE.raw.ctp.gz ] && `getctx $k` thread -m $MEM --out k$k/links/gE.lE.raw.ctp.gz --seq reads/stocherr.fa.gz k$k/graphs/stocherr.ctx >& k$k/links/gE.lE.raw.ctp.gz.log - - [ ! -f k$k/links/gS.lE.raw.ctp.gz ] && `getctx $k` thread -m $MEM --out k$k/links/gS.lE.raw.ctp.gz --seq reads/stocherr.fa.gz k$k/graphs/stoch.ctx >& k$k/links/gS.lE.raw.ctp.gz.log - [ ! -f k$k/links/gE.lS.raw.ctp.gz ] && `getctx $k` thread -m $MEM --out k$k/links/gE.lS.raw.ctp.gz --seq reads/stoch.fa.gz k$k/graphs/stocherr.ctx >& k$k/links/gE.lS.raw.ctp.gz.log -done - -echo == Link Cleaning == - -for k in $kmers; do - - # Pick a threshold using stocherr links on stocherr graph - if [ ! 
-f k$k/links/cleaning.txt ]; then - # Generate table of first 1000 kmers with links - $LINK_PROC list --limit 1000 <(gzip -fcd k$k/links/gE.lE.raw.ctp.gz) k$k/links/gE.lE.raw.ctp.gz.effcovg.csv k$k/links/gE.lE.raw.ctp.gz.links.csv >& k$k/links/gE.lE.raw.ctp.gz.links.csv.log - R --slave --vanilla --quiet -f $LINK_THRESH_SCRIPT --args $k k$k/links/gE.lE.raw.ctp.gz.links.csv > k$k/links/cleaning.txt - fi - - # Use this threshold for all graphs - thresh=$(tail -1 k$k/links/cleaning.txt) - - for f in gP.lP gS.lS gE.lE gS.lE gE.lS; do - if [ ! -f k$k/links/$f.clean.ctp.gz ]; then - ($LINK_PROC clean <(gzip -fcd k$k/links/$f.raw.ctp.gz) $thresh | gzip -c) > k$k/links/$f.clean.ctp.gz 2> k$k/links/$f.clean.ctp.gz.log - fi - done -done - -echo == Assembling contigs for N50 == - -# Used for N50 -for k in $kmers; do - for i in $plain_indices; do - g=${glist[$i]}; o=${name_list[$i]} - [ ! -f k$k/contigs/$o.plain.contigs.fa ] && `getctx $k` contigs -m $MEM -o k$k/contigs/$o.plain.contigs.fa k$k/graphs/$g >& k$k/contigs/$o.plain.contigs.fa.log - done - - for i in $link_indices; do - g=${glist[$i]}; o=${name_list[$i]}; l=${llist[$i]} - [ ! -f k$k/contigs/$o.links.contigs.fa ] && `getctx $k` contigs -m $MEM -o k$k/contigs/$o.links.contigs.fa --confid-step 0.95 -p k$k/links/$o.ctp.gz k$k/graphs/$g >& k$k/contigs/$o.links.contigs.fa.log - [ ! -f k$k/contigs/$o.string.contigs.fa ] && `getctx $k` contigs -m $MEM -o k$k/contigs/$o.string.contigs.fa --use-seed-paths --confid-step 0.95 -p k$k/links/$o.ctp.gz k$k/graphs/$g >& k$k/contigs/$o.string.contigs.fa.log - done - - # Remove duplicates in contigs - for annot in $annot_list; do - [ ! 
-f k$k/contigs/$annot.contigs.rmdup.fa ] && \ - `getctx $k` rmsubstr -k $k -m $MEM -o k$k/contigs/$annot.contigs.rmdup.fa k$k/contigs/$annot.contigs.fa >& k$k/contigs/$annot.rmdup.fa.log - done -done - -echo == Median walk distance == - -# Get median walk distance -# example: med_walk [-p link_file.ctp] -med_walk() { - k="$1"; g="$2"; pathargs="$3"; - ctx=$(getctx $k) - dist=$($ctx contigs -m $MEM --reseed --ncontigs $NSEED_MEDIAN_WALK $pathargs $g 2>&1 | \ - grep -ioE 'Lengths:.*median: [0-9,]*' | grep -oE '[0-9,]+$' | tr -d ',') - printf "med_walk,$dist\n" -} - -for k in $kmers; do - mkdir -p k$k/results - for i in $plain_indices; do - g=${glist[$i]}; o=${name_list[$i]} - [ ! -f k$k/results/$o.plain.medwalk.txt ] && med_walk $k k$k/graphs/$g '' > k$k/results/$o.plain.medwalk.txt - done - for i in $link_indices; do - g=${glist[$i]}; o=${name_list[$i]}; l=${llist[$i]} - [ ! -f k$k/results/$o.links.medwalk.txt ] && med_walk $k k$k/graphs/$g "-p k$k/links/$l.ctp.gz" > k$k/results/$o.links.medwalk.txt - [ ! -f k$k/results/$o.string.medwalk.txt ] && cp k$k/results/$o.links.medwalk.txt k$k/results/$o.string.medwalk.txt - done -done - -echo == Contig stats == - -for k in $kmers; do - mkdir -p k$k/results - - for annot in $annot_list; do - [ ! -f k$k/results/$annot.contigs.rmdup.csv ] && \ - ( $CONTIG_STATS --print-csv k$k/contigs/$annot.contigs.rmdup.fa | \ - cat - k$k/results/$annot.medwalk.txt ) \ - > k$k/results/$annot.contigs.rmdup.csv - done -done - -# Combine CSV files to summarise statistics -echo == Merging CSV files == - -colidx=$(echo $(eval echo '{1,$[{1..'$nkmers'}*2]}') | tr ' ' ','); - -for annot in $annot_list; do - [ ! 
-f $annot.stats.csv ] && \ - (printf "metric,k%s\n" $(echo $kmers | sed 's/ /,k/g'); - printf "kmer,%s\n" $(echo $kmers | tr ' ' ','); - paste -d, k*/results/$annot.contigs.rmdup.csv | \ - cut -d, -f $colidx - | tail -n +2) > $annot.stats.csv -done - -# Stats -echo == Checking contig matches == - -for k in $kmers; do - for annot in $annot_list; do - [ ! -f k$k/results/$annot.contigs.rmdup.fa.txt ] && $STRCHK $k 0.1 k$k/contigs/$annot.contigs.rmdup.fa $REF >& k$k/results/$annot.contigs.rmdup.fa.txt - done -done - -# Check various kmer cleaning thresholds -if [ $RUN_KMER_ERR ]; then - echo == Checking various kmer cleaning thresholds == - - for k in $kmers; do - mkdir -p k$k/kmer_cleaning - thresholds=`seq 1 1 30` - ctx=`getctx $k` - - for i in $thresholds; do - # Clean kmers with given threshold - if [ ! -f k$k/kmer_cleaning/stocherr.clean.$i.ctx ]; then - $ctx clean -m $MEM -o k$k/kmer_cleaning/stocherr.clean.$i.ctx --supernodes=$i --tips $[2*$k] k$k/graphs/stocherr.ctx >& k$k/kmer_cleaning/stocherr.clean.$i.ctx.log - fi - # Get rate of kmer match/mismatch - if [ ! 
-f k$k/kmer_cleaning/stocherr.clean.$i.stats.txt ]; then - ( echo 'kmerThresh,errorKmers,missingKmers,correctKmers'; - $ctx join -q -m $MEM -o - k$k/kmer_cleaning/stocherr.clean.$i.ctx k$k/graphs/perf.ctx | $ctx view -q --kmers - | \ - awk 'BEGIN{OFS=",";a=b=ab=0} {a+=($2>0 && $3==0); b+=($2==0 && $3>0); ab+=($2>0 && $3>0);} END{print '$i',a,b,ab}'; ) \ - > k$k/kmer_cleaning/stocherr.clean.$i.stats.txt - fi - done - - # Generate CSV of results - ( echo 'kmerThresh,errorKmers,missingKmers,correctKmers'; - for i in $thresholds; do - tail -1 k$k/kmer_cleaning/stocherr.clean.$i.stats.txt; - done ) > k$k/kmer_cleaning/kmer_cleaning.csv - done -fi - -# Check various link cleaning thresholds -if [ $RUN_LINK_ERR ]; then - echo == Checking various link cleaning thresholds == - - for k in $kmers; do - if [ $READLEN -gt $[$k+1] ]; then # some kmers have no links - mkdir -p k$k/link_cleaning - thresholds=`seq 0 1 20` - - for i in $thresholds; do - if [ ! -f k$k/link_cleaning/stocherr.clean.$i.ctp ]; then gzip -fcd k$k/links/gE.lE.raw.ctp.gz | $LINK_PROC clean - $i > k$k/link_cleaning/stocherr.clean.$i.ctp 2> k$k/link_cleaning/stocherr.clean.$i.ctp.log; fi - if [ ! 
-f k$k/link_cleaning/stocherr.clean.$i.stats.txt ]; then cat k$k/link_cleaning/stocherr.clean.$i.ctp | grep -o 'seq=[ACGT]*' | tr '=' ' ' | awk '{print ">seq\n"$2"\n"}' | $STRCHK $k 0 - $REF >& k$k/link_cleaning/stocherr.clean.$i.stats.txt; fi - done - - # Make CSV of results - ( echo 'linkThresh,numKmers,numLinks,numMatch,matchRate'; - for i in $thresholds; do - stats=$(cat k$k/link_cleaning/stocherr.clean.$i.stats.txt | grep 'Perfect matches *:' | grep -oE '[0-9,\.]{2,}') - num_match=$(echo $stats | awk '{print $1}' | tr -d ','); - num_links=$(echo $stats | awk '{print $2}' | tr -d ','); - match_rate=$(echo $stats | awk '{print $3}' | tr -d ','); - ctp_num_kmers_with_links=$(grep -m 1 "num_kmers_with_paths" k$k/link_cleaning/stocherr.clean.$i.ctp | grep -o '[0-9]*') - ctp_num_links=$(grep -m 1 "num_paths" k$k/link_cleaning/stocherr.clean.$i.ctp | grep -o '[0-9]*') - [ "$num_links" == "$ctp_num_links" ] || ( echo THIS IS BAD "$i $count $npaths" && false ) - echo "$i,$ctp_num_kmers_with_links,$ctp_num_links,$num_match,$match_rate" - done ) > k$k/link_cleaning/link_cleaning.csv - fi - done -fi - -# Now make plots with: -mkdir -p plots -echo Plot with: -files=$(for a in $annot_list; do echo $a'.stats.csv'; done) -echo " " R --vanilla -f plot-results.R --args $files diff --git a/results/kmer_agnostic120/Makefile b/results/kmer_agnostic120/Makefile deleted file mode 100644 index 9380c4d7..00000000 --- a/results/kmer_agnostic120/Makefile +++ /dev/null @@ -1,38 +0,0 @@ -SHELL:=/bin/bash -euo pipefail -# Makefile -# Isaac Turner -# 2014-10-01 -# Using 1Mb of chr22 and error free reads to measure effect of kmer-size - -KMERS=15 21 31 41 51 63 75 99 - -# Expand these for all k values -DIRS=reads $(foreach K,$(KMERS),k$(K)) plots - -all: - ./run.sh -plots: - mkdir -p plots - R --vanilla -f plot-results.R --args gP.l0.plain.stats.csv \ - gS.l0.plain.stats.csv \ - gE.l0.plain.stats.csv \ - gP.lP.raw.links.stats.csv \ - gP.lP.raw.string.stats.csv \ - gS.lS.raw.links.stats.csv \ 
- gS.lS.raw.string.stats.csv \ - gS.lS.clean.links.stats.csv \ - gS.lS.clean.string.stats.csv \ - gE.lE.clean.links.stats.csv \ - gE.lE.clean.string.stats.csv \ - gS.lE.clean.links.stats.csv \ - gS.lE.clean.string.stats.csv \ - gE.lS.clean.links.stats.csv \ - gE.lS.clean.string.stats.csv - cd report && $(MAKE) - -clean: - rm -rf $(DIRS) {perf,stoch,stocherr}.{plain,links,string}.stats.csv - -.force: - -.PHONY: all clean plots .force diff --git a/results/kmer_agnostic120/run.sh b/results/kmer_agnostic120/run.sh deleted file mode 100755 index 5e2baa9a..00000000 --- a/results/kmer_agnostic120/run.sh +++ /dev/null @@ -1,205 +0,0 @@ -#!/bin/bash - -# xtrace prints commands as we run them -set -euo pipefail -set -o xtrace - -CTXDIR=../.. -CTXK=$CTXDIR/bin/ctx -READSIM=$CTXDIR/libs/readsim/readsim -ALLREADS=$CTXDIR/libs/seq_file/scripts/perfect_covg.sh -STRCHK=$CTXDIR/libs/bioinf-perl/sim_mutations/sim_substrings.pl -CONTIG_STATS=$CTXDIR/libs/bioinf-perl/fastn_scripts/contig_stats.pl -LINK_PROC=$CTXDIR/scripts/cortex_links.pl -LINK_THRESH_SCRIPT=$CTXDIR/scripts/R/make_link_cutoffs.R -# DNACAT=$CTXDIR/libs/seq_file/bin/dnacat - -REF=$CTXDIR/results/data/chr22/uniq_flanks/chr22.1Mbp.uniq.fa -ERR_PROFILE=$CTXDIR/results/data/PhiX/PhiX.1.fq.gz -READLEN=120 -DEPTH=100 -ERR_RATE=0.005 # 0.5% per base sequencing error - -# How many contigs to pull out to find median walk distance -NSEED_MEDIAN_WALK=100 - -# Memory to use for each command -MEM=5G - -# -# Finished configure -# - -# Get executable for a given kmer size -getctx () { - k="$1" - echo "$CTXK"$[ ($k+31)/32*32-1 ]; -} - -kmers=$(echo 15 21 31 41 51 63 75 99) -nkmers=$(echo $kmers | tr ' ' '\n' | awk 'END{print NR}') - -# create directories -for k in $kmers; do - mkdir -p k$k/{,graphs,links,contigs,results,kmer_cleaning,link_cleaning} - [ -x $(getctx $k) ] || ( echo "Please compile cortex with 'make MAXK=$[ ($k+31)/32*32-1 ]'" 1>&2 && false ) -done - -mkdir -p reads - -# g = graph, l = links -# P = perfect, S = 
stochastic, E = stochastic+error -# raw = pre-cleaning, clean = after cleaning -# e.g. gPF.lPF.raw.ctp.gz - -name_list=(gP.l0 gS.l0 gE.l0 gP.lP.raw gS.lS.raw gS.lS.clean gE.lE.clean gS.lE.clean gE.lS.clean) -glist=( perf.ctx stoch.ctx stocherr.ctx perf.ctx stoch.ctx stoch.ctx stocherr.ctx stoch.ctx stocherr.ctx) -llist=( '' '' '' gP.lP.raw gS.lS.raw gS.lS.clean gE.lE.clean gS.lE.clean gE.lS.clean) -all_indices=$(echo {0..8}) -plain_indices=$(echo {0..2}) -link_indices=$(echo {3..8}) - -annot_list=$(for i in $plain_indices; do echo ${name_list[$i]}.plain; done; - for i in $link_indices; do echo ${name_list[$i]}.{links,string}; done) - -# Generate reads -# Redirect stderr with 2> -[ ! -f reads/perf.fa.gz ] && ($ALLREADS $READLEN $REF | gzip -c) > reads/perf.fa.gz 2> reads/perf.fa.gz.log -[ ! -f reads/stoch.fa.gz ] && $READSIM -l $READLEN -r $REF -d $DEPTH -s reads/stoch >& reads/stoch.fa.gz.log -[ ! -f reads/stocherr.fa.gz ] && $READSIM -l $READLEN -r $REF -d $DEPTH -e $ERR_RATE -s reads/stocherr >& reads/stocherr.fa.gz.log - -# Cortex build k=$(K) -echo == Building cortex graphs == - -for k in $kmers; do - mkdir -p k$k/graphs - [ ! -f k$k/graphs/perf.ctx ] && `getctx $k` build -m $MEM -k $k --sample chr22_17M_18M --seq reads/perf.fa.gz k$k/graphs/perf.ctx >& k$k/graphs/perf.ctx.log - - if [ $k == 99 ]; then clean_thresh="--supernodes=3"; else clean_thresh=""; fi # Use auto threshold for k<99 - [ ! -f k$k/graphs/stocherr.raw.ctx ] && `getctx $k` build -m $MEM -k $k --sample chr22_17M_18M --seq reads/stocherr.fa.gz k$k/graphs/stocherr.raw.ctx >& k$k/graphs/stocherr.raw.ctx.log - [ ! -f k$k/graphs/stocherr.ctx ] && `getctx $k` clean -m $MEM $clean_thresh --covg-before k$k/graphs/stocherr.raw.covg.csv --out k$k/graphs/stocherr.ctx k$k/graphs/stocherr.raw.ctx >& k$k/graphs/stocherr.ctx.log - - thresh=$(cat k$k/graphs/stocherr.ctx.log | grep -m 1 'Removing supernodes with coverage < ' | grep -o '[0-9]*' | tail -1) - [ ! 
-f k$k/graphs/stoch.raw.ctx ] && `getctx $k` build -m $MEM -k $k --sample chr22_17M_18M --seq reads/stoch.fa.gz k$k/graphs/stoch.raw.ctx >& k$k/graphs/stoch.raw.ctx.log - [ ! -f k$k/graphs/stoch.ctx ] && `getctx $k` clean -m $MEM --tips $[$k*2] --supernodes=$thresh --covg-before k$k/graphs/stoch.raw.covg.csv --out k$k/graphs/stoch.ctx k$k/graphs/stoch.raw.ctx >& k$k/graphs/stoch.ctx.log -done - -echo == Read threading == - -for k in $kmers; do - [ ! -f k$k/links/gP.lP.raw.ctp.gz ] && `getctx $k` thread -m $MEM --out k$k/links/gP.lP.raw.ctp.gz --seq reads/perf.fa.gz k$k/graphs/perf.ctx >& k$k/links/gP.lP.raw.ctp.gz.log - [ ! -f k$k/links/gS.lS.raw.ctp.gz ] && `getctx $k` thread -m $MEM --out k$k/links/gS.lS.raw.ctp.gz --seq reads/stoch.fa.gz k$k/graphs/stoch.ctx >& k$k/links/gS.lS.raw.ctp.gz.log - [ ! -f k$k/links/gE.lE.raw.ctp.gz ] && `getctx $k` thread -m $MEM --out k$k/links/gE.lE.raw.ctp.gz --seq reads/stocherr.fa.gz k$k/graphs/stocherr.ctx >& k$k/links/gE.lE.raw.ctp.gz.log - - [ ! -f k$k/links/gS.lE.raw.ctp.gz ] && `getctx $k` thread -m $MEM --out k$k/links/gS.lE.raw.ctp.gz --seq reads/stocherr.fa.gz k$k/graphs/stoch.ctx >& k$k/links/gS.lE.raw.ctp.gz.log - [ ! -f k$k/links/gE.lS.raw.ctp.gz ] && `getctx $k` thread -m $MEM --out k$k/links/gE.lS.raw.ctp.gz --seq reads/stoch.fa.gz k$k/graphs/stocherr.ctx >& k$k/links/gE.lS.raw.ctp.gz.log -done - -echo == Link Cleaning == - -for k in $kmers; do - - # Pick a threshold using stocherr links on stocherr graph - if [ ! 
-f k$k/links/cleaning.txt ]; then - # Generate table of first 1000 kmers with links - $LINK_PROC list --limit 1000 <(gzip -fcd k$k/links/gE.lE.raw.ctp.gz) k$k/links/gE.lE.raw.ctp.gz.effcovg.csv k$k/links/gE.lE.raw.ctp.gz.links.csv >& k$k/links/gE.lE.raw.ctp.gz.links.csv.log - R --slave --vanilla --quiet -f $LINK_THRESH_SCRIPT --args $k k$k/links/gE.lE.raw.ctp.gz.links.csv > k$k/links/cleaning.txt - fi - - # Use this threshold for all graphs - thresh=$(tail -1 k$k/links/cleaning.txt) - - for f in gP.lP gS.lS gE.lE gS.lE gE.lS; do - if [ ! -f k$k/links/$f.clean.ctp.gz ]; then - ($LINK_PROC clean <(gzip -fcd k$k/links/$f.raw.ctp.gz) $thresh | gzip -c) > k$k/links/$f.clean.ctp.gz 2> k$k/links/$f.clean.ctp.gz.log - fi - done -done - -echo == Assembling contigs for N50 == - -# Used for N50 -for k in $kmers; do - for i in $plain_indices; do - g=${glist[$i]}; o=${name_list[$i]} - [ ! -f k$k/contigs/$o.plain.contigs.fa ] && `getctx $k` contigs -m $MEM -o k$k/contigs/$o.plain.contigs.fa k$k/graphs/$g >& k$k/contigs/$o.plain.contigs.fa.log - done - - for i in $link_indices; do - g=${glist[$i]}; o=${name_list[$i]}; l=${llist[$i]} - [ ! -f k$k/contigs/$o.links.contigs.fa ] && `getctx $k` contigs -m $MEM -o k$k/contigs/$o.links.contigs.fa --confid-step 0.95 -p k$k/links/$o.ctp.gz k$k/graphs/$g >& k$k/contigs/$o.links.contigs.fa.log - [ ! -f k$k/contigs/$o.string.contigs.fa ] && `getctx $k` contigs -m $MEM -o k$k/contigs/$o.string.contigs.fa --use-seed-paths --confid-step 0.95 -p k$k/links/$o.ctp.gz k$k/graphs/$g >& k$k/contigs/$o.string.contigs.fa.log - done - - # Remove duplicates in contigs - for annot in $annot_list; do - [ ! 
-f k$k/contigs/$annot.contigs.rmdup.fa ] && \ - `getctx $k` rmsubstr -k $k -m $MEM -o k$k/contigs/$annot.contigs.rmdup.fa k$k/contigs/$annot.contigs.fa >& k$k/contigs/$annot.rmdup.fa.log - done -done - -echo == Median walk distance == - -# Get median walk distance -# example: med_walk [-p link_file.ctp] -med_walk() { - k="$1"; g="$2"; pathargs="$3"; - ctx=$(getctx $k) - dist=$($ctx contigs -m $MEM --reseed --ncontigs $NSEED_MEDIAN_WALK $pathargs $g 2>&1 | \ - grep -ioE 'Lengths:.*median: [0-9,]*' | grep -oE '[0-9,]+$' | tr -d ',') - printf "med_walk,$dist\n" -} - -for k in $kmers; do - mkdir -p k$k/results - for i in $plain_indices; do - g=${glist[$i]}; o=${name_list[$i]} - [ ! -f k$k/results/$o.plain.medwalk.txt ] && med_walk $k k$k/graphs/$g '' > k$k/results/$o.plain.medwalk.txt - done - for i in $link_indices; do - g=${glist[$i]}; o=${name_list[$i]}; l=${llist[$i]} - [ ! -f k$k/results/$o.links.medwalk.txt ] && med_walk $k k$k/graphs/$g "-p k$k/links/$l.ctp.gz" > k$k/results/$o.links.medwalk.txt - [ ! -f k$k/results/$o.string.medwalk.txt ] && cp k$k/results/$o.links.medwalk.txt k$k/results/$o.string.medwalk.txt - done -done - -echo == Contig stats == - -for k in $kmers; do - mkdir -p k$k/results - - for annot in $annot_list; do - [ ! -f k$k/results/$annot.contigs.rmdup.csv ] && \ - ( $CONTIG_STATS --print-csv k$k/contigs/$annot.contigs.rmdup.fa | \ - cat - k$k/results/$annot.medwalk.txt ) \ - > k$k/results/$annot.contigs.rmdup.csv - done -done - -# Combine CSV files to summarise statistics -echo == Merging CSV files == - -colidx=$(echo $(eval echo '{1,$[{1..'$nkmers'}*2]}') | tr ' ' ','); - -for annot in $annot_list; do - [ ! 
-f $annot.stats.csv ] && \ - (printf "metric,k%s\n" $(echo $kmers | sed 's/ /,k/g'); - printf "kmer,%s\n" $(echo $kmers | tr ' ' ','); - paste -d, k*/results/$annot.contigs.rmdup.csv | \ - cut -d, -f $colidx - | tail -n +2) > $annot.stats.csv -done - -# Stats -echo == Checking contig matches == - -for k in $kmers; do - for annot in $annot_list; do - [ ! -f k$k/results/$annot.contigs.rmdup.fa.txt ] && $STRCHK $k 0.1 k$k/contigs/$annot.contigs.rmdup.fa $REF >& k$k/results/$annot.contigs.rmdup.fa.txt - done -done - -# Now make plots with: -mkdir -p plots -echo Plot with: -files=$(for a in $annot_list; do echo $a'.stats.csv'; done) -echo " " R --vanilla -f plot-results.R --args $files diff --git a/results/kmer_agnostic80/Makefile b/results/kmer_agnostic80/Makefile deleted file mode 100644 index 9380c4d7..00000000 --- a/results/kmer_agnostic80/Makefile +++ /dev/null @@ -1,38 +0,0 @@ -SHELL:=/bin/bash -euo pipefail -# Makefile -# Isaac Turner -# 2014-10-01 -# Using 1Mb of chr22 and error free reads to measure effect of kmer-size - -KMERS=15 21 31 41 51 63 75 99 - -# Expand these for all k values -DIRS=reads $(foreach K,$(KMERS),k$(K)) plots - -all: - ./run.sh -plots: - mkdir -p plots - R --vanilla -f plot-results.R --args gP.l0.plain.stats.csv \ - gS.l0.plain.stats.csv \ - gE.l0.plain.stats.csv \ - gP.lP.raw.links.stats.csv \ - gP.lP.raw.string.stats.csv \ - gS.lS.raw.links.stats.csv \ - gS.lS.raw.string.stats.csv \ - gS.lS.clean.links.stats.csv \ - gS.lS.clean.string.stats.csv \ - gE.lE.clean.links.stats.csv \ - gE.lE.clean.string.stats.csv \ - gS.lE.clean.links.stats.csv \ - gS.lE.clean.string.stats.csv \ - gE.lS.clean.links.stats.csv \ - gE.lS.clean.string.stats.csv - cd report && $(MAKE) - -clean: - rm -rf $(DIRS) {perf,stoch,stocherr}.{plain,links,string}.stats.csv - -.force: - -.PHONY: all clean plots .force diff --git a/results/kmer_agnostic80/run.sh b/results/kmer_agnostic80/run.sh deleted file mode 100755 index 2ba269d9..00000000 --- 
a/results/kmer_agnostic80/run.sh +++ /dev/null @@ -1,205 +0,0 @@ -#!/bin/bash - -# xtrace prints commands as we run them -set -euo pipefail -set -o xtrace - -CTXDIR=../.. -CTXK=$CTXDIR/bin/ctx -READSIM=$CTXDIR/libs/readsim/readsim -ALLREADS=$CTXDIR/libs/seq_file/scripts/perfect_covg.sh -STRCHK=$CTXDIR/libs/bioinf-perl/sim_mutations/sim_substrings.pl -CONTIG_STATS=$CTXDIR/libs/bioinf-perl/fastn_scripts/contig_stats.pl -LINK_PROC=$CTXDIR/scripts/cortex_links.pl -LINK_THRESH_SCRIPT=$CTXDIR/scripts/R/make_link_cutoffs.R -# DNACAT=$CTXDIR/libs/seq_file/bin/dnacat - -REF=$CTXDIR/results/data/chr22/uniq_flanks/chr22.1Mbp.uniq.fa -ERR_PROFILE=$CTXDIR/results/data/PhiX/PhiX.1.fq.gz -READLEN=80 -DEPTH=100 -ERR_RATE=0.005 # 0.5% per base sequencing error - -# How many contigs to pull out to find median walk distance -NSEED_MEDIAN_WALK=100 - -# Memory to use for each command -MEM=5G - -# -# Finished configure -# - -# Get executable for a given kmer size -getctx () { - k="$1" - echo "$CTXK"$[ ($k+31)/32*32-1 ]; -} - -kmers=$(echo 15 21 31 41 51 63 75 99) -nkmers=$(echo $kmers | tr ' ' '\n' | awk 'END{print NR}') - -# create directories -for k in $kmers; do - mkdir -p k$k/{,graphs,links,contigs,results,kmer_cleaning,link_cleaning} - [ -x $(getctx $k) ] || ( echo "Please compile cortex with 'make MAXK=$[ ($k+31)/32*32-1 ]'" 1>&2 && false ) -done - -mkdir -p reads - -# g = graph, l = links -# P = perfect, S = stochastic, E = stochastic+error -# raw = pre-cleaning, clean = after cleaning -# e.g. 
gPF.lPF.raw.ctp.gz - -name_list=(gP.l0 gS.l0 gE.l0 gP.lP.raw gS.lS.raw gS.lS.clean gE.lE.clean gS.lE.clean gE.lS.clean) -glist=( perf.ctx stoch.ctx stocherr.ctx perf.ctx stoch.ctx stoch.ctx stocherr.ctx stoch.ctx stocherr.ctx) -llist=( '' '' '' gP.lP.raw gS.lS.raw gS.lS.clean gE.lE.clean gS.lE.clean gE.lS.clean) -all_indices=$(echo {0..8}) -plain_indices=$(echo {0..2}) -link_indices=$(echo {3..8}) - -annot_list=$(for i in $plain_indices; do echo ${name_list[$i]}.plain; done; - for i in $link_indices; do echo ${name_list[$i]}.{links,string}; done) - -# Generate reads -# Redirect stderr with 2> -[ ! -f reads/perf.fa.gz ] && ($ALLREADS $READLEN $REF | gzip -c) > reads/perf.fa.gz 2> reads/perf.fa.gz.log -[ ! -f reads/stoch.fa.gz ] && $READSIM -l $READLEN -r $REF -d $DEPTH -s reads/stoch >& reads/stoch.fa.gz.log -[ ! -f reads/stocherr.fa.gz ] && $READSIM -l $READLEN -r $REF -d $DEPTH -e $ERR_RATE -s reads/stocherr >& reads/stocherr.fa.gz.log - -# Cortex build k=$(K) -echo == Building cortex graphs == - -for k in $kmers; do - mkdir -p k$k/graphs - [ ! -f k$k/graphs/perf.ctx ] && `getctx $k` build -m $MEM -k $k --sample chr22_17M_18M --seq reads/perf.fa.gz k$k/graphs/perf.ctx >& k$k/graphs/perf.ctx.log - - if [ $k == 99 ]; then clean_thresh="--supernodes=3"; else clean_thresh=""; fi # Use auto threshold for k<99 - [ ! -f k$k/graphs/stocherr.raw.ctx ] && `getctx $k` build -m $MEM -k $k --sample chr22_17M_18M --seq reads/stocherr.fa.gz k$k/graphs/stocherr.raw.ctx >& k$k/graphs/stocherr.raw.ctx.log - [ ! -f k$k/graphs/stocherr.ctx ] && `getctx $k` clean -m $MEM $clean_thresh --covg-before k$k/graphs/stocherr.raw.covg.csv --out k$k/graphs/stocherr.ctx k$k/graphs/stocherr.raw.ctx >& k$k/graphs/stocherr.ctx.log - - thresh=$(cat k$k/graphs/stocherr.ctx.log | grep -m 1 'Removing supernodes with coverage < ' | grep -o '[0-9]*' | tail -1) - [ ! 
-f k$k/graphs/stoch.raw.ctx ] && `getctx $k` build -m $MEM -k $k --sample chr22_17M_18M --seq reads/stoch.fa.gz k$k/graphs/stoch.raw.ctx >& k$k/graphs/stoch.raw.ctx.log - [ ! -f k$k/graphs/stoch.ctx ] && `getctx $k` clean -m $MEM --tips $[$k*2] --supernodes=$thresh --covg-before k$k/graphs/stoch.raw.covg.csv --out k$k/graphs/stoch.ctx k$k/graphs/stoch.raw.ctx >& k$k/graphs/stoch.ctx.log -done - -echo == Read threading == - -for k in $kmers; do - [ ! -f k$k/links/gP.lP.raw.ctp.gz ] && `getctx $k` thread -m $MEM --out k$k/links/gP.lP.raw.ctp.gz --seq reads/perf.fa.gz k$k/graphs/perf.ctx >& k$k/links/gP.lP.raw.ctp.gz.log - [ ! -f k$k/links/gS.lS.raw.ctp.gz ] && `getctx $k` thread -m $MEM --out k$k/links/gS.lS.raw.ctp.gz --seq reads/stoch.fa.gz k$k/graphs/stoch.ctx >& k$k/links/gS.lS.raw.ctp.gz.log - [ ! -f k$k/links/gE.lE.raw.ctp.gz ] && `getctx $k` thread -m $MEM --out k$k/links/gE.lE.raw.ctp.gz --seq reads/stocherr.fa.gz k$k/graphs/stocherr.ctx >& k$k/links/gE.lE.raw.ctp.gz.log - - [ ! -f k$k/links/gS.lE.raw.ctp.gz ] && `getctx $k` thread -m $MEM --out k$k/links/gS.lE.raw.ctp.gz --seq reads/stocherr.fa.gz k$k/graphs/stoch.ctx >& k$k/links/gS.lE.raw.ctp.gz.log - [ ! -f k$k/links/gE.lS.raw.ctp.gz ] && `getctx $k` thread -m $MEM --out k$k/links/gE.lS.raw.ctp.gz --seq reads/stoch.fa.gz k$k/graphs/stocherr.ctx >& k$k/links/gE.lS.raw.ctp.gz.log -done - -echo == Link Cleaning == - -for k in $kmers; do - - # Pick a threshold using stocherr links on stocherr graph - if [ ! 
-f k$k/links/cleaning.txt ]; then - # Generate table of first 1000 kmers with links - $LINK_PROC list --limit 1000 <(gzip -fcd k$k/links/gE.lE.raw.ctp.gz) k$k/links/gE.lE.raw.ctp.gz.effcovg.csv k$k/links/gE.lE.raw.ctp.gz.links.csv >& k$k/links/gE.lE.raw.ctp.gz.links.csv.log - R --slave --vanilla --quiet -f $LINK_THRESH_SCRIPT --args $k k$k/links/gE.lE.raw.ctp.gz.links.csv > k$k/links/cleaning.txt - fi - - # Use this threshold for all graphs - thresh=$(tail -1 k$k/links/cleaning.txt) - - for f in gP.lP gS.lS gE.lE gS.lE gE.lS; do - if [ ! -f k$k/links/$f.clean.ctp.gz ]; then - ($LINK_PROC clean <(gzip -fcd k$k/links/$f.raw.ctp.gz) $thresh | gzip -c) > k$k/links/$f.clean.ctp.gz 2> k$k/links/$f.clean.ctp.gz.log - fi - done -done - -echo == Assembling contigs for N50 == - -# Used for N50 -for k in $kmers; do - for i in $plain_indices; do - g=${glist[$i]}; o=${name_list[$i]} - [ ! -f k$k/contigs/$o.plain.contigs.fa ] && `getctx $k` contigs -m $MEM -o k$k/contigs/$o.plain.contigs.fa k$k/graphs/$g >& k$k/contigs/$o.plain.contigs.fa.log - done - - for i in $link_indices; do - g=${glist[$i]}; o=${name_list[$i]}; l=${llist[$i]} - [ ! -f k$k/contigs/$o.links.contigs.fa ] && `getctx $k` contigs -m $MEM -o k$k/contigs/$o.links.contigs.fa --confid-step 0.95 -p k$k/links/$o.ctp.gz k$k/graphs/$g >& k$k/contigs/$o.links.contigs.fa.log - [ ! -f k$k/contigs/$o.string.contigs.fa ] && `getctx $k` contigs -m $MEM -o k$k/contigs/$o.string.contigs.fa --use-seed-paths --confid-step 0.95 -p k$k/links/$o.ctp.gz k$k/graphs/$g >& k$k/contigs/$o.string.contigs.fa.log - done - - # Remove duplicates in contigs - for annot in $annot_list; do - [ ! 
-f k$k/contigs/$annot.contigs.rmdup.fa ] && \ - `getctx $k` rmsubstr -k $k -m $MEM -o k$k/contigs/$annot.contigs.rmdup.fa k$k/contigs/$annot.contigs.fa >& k$k/contigs/$annot.rmdup.fa.log - done -done - -echo == Median walk distance == - -# Get median walk distance -# example: med_walk [-p link_file.ctp] -med_walk() { - k="$1"; g="$2"; pathargs="$3"; - ctx=$(getctx $k) - dist=$($ctx contigs -m $MEM --reseed --ncontigs $NSEED_MEDIAN_WALK $pathargs $g 2>&1 | \ - grep -ioE 'Lengths:.*median: [0-9,]*' | grep -oE '[0-9,]+$' | tr -d ',') - printf "med_walk,$dist\n" -} - -for k in $kmers; do - mkdir -p k$k/results - for i in $plain_indices; do - g=${glist[$i]}; o=${name_list[$i]} - [ ! -f k$k/results/$o.plain.medwalk.txt ] && med_walk $k k$k/graphs/$g '' > k$k/results/$o.plain.medwalk.txt - done - for i in $link_indices; do - g=${glist[$i]}; o=${name_list[$i]}; l=${llist[$i]} - [ ! -f k$k/results/$o.links.medwalk.txt ] && med_walk $k k$k/graphs/$g "-p k$k/links/$l.ctp.gz" > k$k/results/$o.links.medwalk.txt - [ ! -f k$k/results/$o.string.medwalk.txt ] && cp k$k/results/$o.links.medwalk.txt k$k/results/$o.string.medwalk.txt - done -done - -echo == Contig stats == - -for k in $kmers; do - mkdir -p k$k/results - - for annot in $annot_list; do - [ ! -f k$k/results/$annot.contigs.rmdup.csv ] && \ - ( $CONTIG_STATS --print-csv k$k/contigs/$annot.contigs.rmdup.fa | \ - cat - k$k/results/$annot.medwalk.txt ) \ - > k$k/results/$annot.contigs.rmdup.csv - done -done - -# Combine CSV files to summarise statistics -echo == Merging CSV files == - -colidx=$(echo $(eval echo '{1,$[{1..'$nkmers'}*2]}') | tr ' ' ','); - -for annot in $annot_list; do - [ ! 
-f $annot.stats.csv ] && \ - (printf "metric,k%s\n" $(echo $kmers | sed 's/ /,k/g'); - printf "kmer,%s\n" $(echo $kmers | tr ' ' ','); - paste -d, k*/results/$annot.contigs.rmdup.csv | \ - cut -d, -f $colidx - | tail -n +2) > $annot.stats.csv -done - -# Stats -echo == Checking contig matches == - -for k in $kmers; do - for annot in $annot_list; do - [ ! -f k$k/results/$annot.contigs.rmdup.fa.txt ] && $STRCHK $k 0.1 k$k/contigs/$annot.contigs.rmdup.fa $REF >& k$k/results/$annot.contigs.rmdup.fa.txt - done -done - -# Now make plots with: -mkdir -p plots -echo Plot with: -files=$(for a in $annot_list; do echo $a'.stats.csv'; done) -echo " " R --vanilla -f plot-results.R --args $files diff --git a/results/kmer_size_experiment/Makefile b/results/kmer_size_experiment/Makefile new file mode 100644 index 00000000..607c679c --- /dev/null +++ b/results/kmer_size_experiment/Makefile @@ -0,0 +1,81 @@ +SHELL=/bin/bash -euo pipefail +# +# Get N50 for assemblies with perfect coverage, stochastic coverage and +# stochastic coverage + sequencing error for k=21,31,41,51,63,75,99 +# +# Fetch data before running: +# cd mccortex/results/data && ./download.sh +# +# Generate plots with: +# cd results && ./make-csvs-and-plots.sh +# +# Isaac Turner 2016-10-28 + +KMERS=21 31 41 51 61 71 81 91 99 +CTXDIR=../../ +DNACAT=$(CTXDIR)/libs/seq_file/bin/dnacat +GENREADS=$(CTXDIR)/scripts/python/generate-reads.py +COUNT_BAD_EDGES=python $(CTXDIR)/scripts/python/count-bad-edges.py +REF=$(CTXDIR)/results/data/chr22/chr22_17M_18M.fa +READS_PERFECT=data/perfect_cov.fa.gz +READS_STOCH=data/stoch_cov.fa.gz +READS_STOCHERR=data/stocherr_cov.fa.gz +MKFILE=runk.mk +FRAGLEN=400 +READLEN=100 +DEPTH=100 +ERRRATE=0.005 +SEED=2380999655 + +# {perfect_cov,stoch_cov,stocherr_cov}/k{21,31,41,51,61,71,81,91,99}/stats.links.txt +NAMES=perfect_cov stoch_cov stocherr_cov +PLAIN_STATS=$(shell for d in $(NAMES); do for k in $(KMERS); do echo $$d/k$$k/stats.plain.txt; done; done) 
+LINKS_STATS=$(PLAIN_STATS:plain.txt=links.txt) +DIRS=data $(NAMES) + +# Keep all files +.SECONDARY: + +PERFECT=$(foreach K,$(KMERS),perfect_k$(K)) +STOCH=$(foreach K,$(KMERS),stoch_k$(K)) +STOCHERR=$(foreach K,$(KMERS),stocherr_k$(K)) + +# all: $(PLAIN_STATS) $(LINKS_STATS) bad.edges.csv +all: $(PERFECT) $(STOCH) $(STOCHERR) bad.edges.csv + +$(REF): + cd ../data && ./download.sh + +$(READS_PERFECT): $(REF) | $(DIRS) + $(DNACAT) -P $(REF) | $(GENREADS) -s $(SEED) -p $(FRAGLEN) -r $(READLEN) | gzip -c > $@ + +$(READS_STOCH): $(REF) | $(DIRS) + $(DNACAT) -P $(REF) | $(GENREADS) -s $(SEED) -p $(FRAGLEN) -r $(READLEN) -d $(DEPTH) | gzip -c > $@ + +$(READS_STOCHERR): $(REF) | $(DIRS) + $(DNACAT) -P $(REF) | $(GENREADS) -s $(SEED) -p $(FRAGLEN) -r $(READLEN) -d $(DEPTH) -e $(ERRRATE) | gzip -c > $@ + +# perfect_cov/k%/stats.plain.txt perfect_cov/k%/stats.links.txt: $(READS_PERFECT) +perfect_k%: $(READS_PERFECT) + $(MAKE) -f $(MKFILE) K=$* REF=$(REF) NAME=perfect_cov INPUT=$(READS_PERFECT) + +# stoch_cov/k%/stats.plain.txt stoch_cov/k%/stats.links.txt: $(READS_STOCH) +stoch_k%: $(READS_STOCH) + $(MAKE) -f $(MKFILE) K=$* REF=$(REF) NAME=stoch_cov INPUT=$(READS_STOCH) + +# stocherr_cov/k%/stats.plain.txt stocherr_cov/k%/stats.links.txt: $(READS_STOCHERR) +stocherr_k%: $(READS_STOCHERR) + $(MAKE) -f $(MKFILE) K=$* REF=$(REF) NAME=stocherr_cov INPUT=$(READS_STOCHERR) CLEAN=1 + +# Find the number of sequencing errors that would add a new edges between two +# existing kmers +bad.edges.csv: $(REF) + $(DNACAT) -P $(REF) | $(COUNT_BAD_EDGES) $(DEPTH) $(ERRRATE) > $@ + +$(DIRS): + mkdir -p $@ + +clean: + rm -rf $(DIRS) bad.edges.csv + +.PHONY: all clean diff --git a/results/kmer_size_experiment/results/20160912mon/notes.txt b/results/kmer_size_experiment/results/20160912mon/notes.txt new file mode 100644 index 00000000..47f7e116 --- /dev/null +++ b/results/kmer_size_experiment/results/20160912mon/notes.txt @@ -0,0 +1,10 @@ +At k=21, ~0.2%[1] of mutations add a new edge between 
existing kmers. +With an error rate of 0.5% and 100X coverage, there are 500,000 seqn errors[2]. +Therefore we add approximately ~1325 new edges[3] to the graph with errors, even +if we clean off all kmers due to sequencing errors. This equates to an edge every +1000bp. + + +[1] 7951 / (3*10**6), see seqn.errors.csv +[2] 100*10**6*0.005, 100X coverage, ref is 1Mbp, 0.5% sequencing error rate +[3] 500000*0.002, 500,000 errors, 0.2% are likely to be edges between existing kmers diff --git a/results/kmer_size_experiment/results/20160912mon/perfect.cov.pdf b/results/kmer_size_experiment/results/20160912mon/perfect.cov.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2d26a632c56eb5d5a14ec81992b32e811fba45ee GIT binary patch literal 5615 zcmZ`-c{tST-?oH|%DzmBNrb|fu};~ICHuY)rZF@bGnyrmu`@!-lI&~t97$!1$et`A zktL)ES)#1vo#~v;@4UbBzTfMb>$#r$`E1W;{EHmpND$k0a5xM3Cw) zcd!+jh$j<9ZoIB@!585vKLkwDmakjlsEu zjK@420*xv}N}s=)NN}b-UPKr_mHu8ub80o`>Im!uqSZcz(G4OVh{#UM>20`uxNy`V zjeo*Z5#u(50UNG77<&m2(yLpWU55axh`_ zKE`5MqM?S|{ zXkQr|wTqrB@S&|!#Z46`^%5; z$9}3$_qed3hCGT|;#^i$b)lxPw};dDwMqI0T8%oqVpckb%!FhW3)Yhag;x8R%OUp{1LWhi;J#ZIw=1+e$y5 z(3l`J_lfOc^T06C*XaQ!!uNdvQw{5l@USF3k zTS*U3>+^6Z;h!&`;kexje}&6wjcdh9R59--#5}5A@{7c@Ub<tL-j`|Eq*L$&9Hw2JnZMo<5_u=&>UjW^%6*#?zB(GN_W1-;lVq zr!R`;)$Ak)%%u8kH|X&$bH5n_)rfQLt}AWS?$B=CprQRe7plDpNBa=I$89jSiLdtK z_bg9#^{0}zzb+1Je%nd6p8hZ%-K5c3?@sPy-O^O^M#WMplv4k=+5!3X*J`!NGQ$FI zcgk|qxUH@1TE5$bUUK#$n)=l=L?TpHnC$Bx=-ZYpKZ+}P)^iN*H6g#U#8fv~i=-$P zcbGn1`83tQ@CRdU*K{*TOQvQ+Wi@CVQBh+)!a0=`xH{ju6RWa{D_c_fQF!i$gp>V` zIWrC2No273?V!=)wYn#=5KU7rSUG|-eYz z_lA3-S}9HbPi{P&)KHPUy;^HfBIGDu?V%x+XdN{OJPyRY^3~lZ@xTl+o|VW#^=@f9 z3QBZzC4U0gzQ3Plf{9WXI5+QMq;TT8B}~#<+Uxi>U*8llX`6G;Z>>7k}?AzBW^>0}K?9dNKdKW&T>b zBpJFRZpN4MPBP~me@=4uT#~$H;FX-fD}oy3i7j51Vu3i(K%9M`m3WeQ7<|2y^0gGP z{Xm~Jr#*VDj61rYCX-}fy+JM(Va;x!hP zW^$(tupc$I+psiy!(8|$)zq`9NqOtX+mCt)Z^bpmsm5Ae<&QX*ByIIvlYY)Yr??^m zK?mMnh5sN4Tux;K;~xGjsa#z-wRh}iI`_$gfsM_DrOB1$S&1&$JC5_g8Pt3CR>6K- 
z3ATO*H9KsbLT}&FsPr#Z`ioTQocSlG{U+5vxfE=S@gRbnL39qa0wHKzizm}K69NL8 zqCGH39lSrtS)O*Lr~sw^5dWZEI&J=@NgBJOtkEQpGnh7C&@!O?NeBQ~pGLVLH8rq4 z6a=9M`9<1bJ%5scHHki6sHxGz(NbdZ1Z!U;>K6hgS{dkolzyB4hvw-JJ)bl9qOPtE zl8E*IDbuYGzXu2U?)M$F=}e8ICjrCJ*DxqFJt)|WHVlCOqI9Sd?Ej{8dQE>*dRB^` zjrEuSwZBoN(ISy6CV9a7vK5oz(MH1rxw8;^+O zz&A7B9(^m;%bkt!$QlDPmSNhrx>`qITc0V7DR49no5s5>*jRrIw6k}1NHea5;UFiu z&QT1)V38U(|8;xJ1t{39s&1btQDfen`c%=hPu{3c%rOjVmqh$3;~e-s4b2JLfJUeR zdx(9r=(V{C$q%<+JWX*4?m)r$!`pSxrZcE;(#0sP4*XmdFwjhyK`L=V?i11411pP4 zl!AuIez?vsZP@L$HVAfj)z90*+1~fcNsdRnI2w4y#X9O%5^wBGf4}K`Z2K$uV=A{! z&q7AifQ~0-2n?S6`;L6qeSnX|E{v+eRONkSeVU|(gAw}~iCfoFTqA;4fN;_6*6;Q~ zX-63z7NMVv?NWU({0xXCMvI0Y)LwTmL(HQl@-fO)k@`SatF~DguXE)S6E4P#Io{IX zh1K1hW5K#^f&ErZldB(*{u@cEi6n@ws|wCDpa|a9)i>+AbL3MFbz79}Cs{;*dtbaM z8l0>L?gbJsF(PBSxUOGO{L>JFI>lkohe)phUTJYfM0RoUTTw|M-L4alA|37s&oQ&;a$B58J}jZD zW5~p>wUldBa-uA1G0$_sc|nSFyq@8xZcVP`Wu9IB0JhWe>_sh354o`{)$Zo1hvOpz zTe{bvs-kjXHO+T@nFG02BFtOwO>;W{G((HIL%6ul=f}!(3%ttgYZ>sL{vxeuP;vn%t!6qlp@K-Zc1FMCH;$e}% zjyPmdmBWMcBPW((jWgHS*rZq^D5+cDgOi}9GSG3u*ae0th>5%6SziPN%%H!H?c(bC-(y7p< zfMoMIYVa}Cc1#~;1Jk`;O^E%<^3i}#ragLAcg8YCYEQ~b3S|>fW{$S58zv9yFFG!& z9@SMSVi~Z)6h6jZ4u34o9}|;h^w=od=+?`t{L&}$gkLi+eBOlaB6rEV7i?v1&25_& z&%9B7bFQrN&8L#AD${z7^Vyl4cCVuOnFyoKPmDw;^(vlLw71%l9yH1sJj>O{r)lA zPE&*2YTfz!eW)lvYf?gAiN_|Sh@?dRL6)fhRv%uk5YQ1|84wxpC17ileOKm(_>Ysj zjl0&Y=UGp&vanXL)*M}rl4<2@MMV9Gs=O6<>r$Mu5E-ZrwNp4I-=~68YVy`mzN+Hr zIp|L4#P|^`3+@X5xw-hrRnX@@bH50uDo94;Ol0{>#C7;*4PdJ%EvRyPv^pQ zSjDyTpEgIvS{#=HM|T7Ti;UBaV|xm&cWi8L9G|~cw^&DTSwa-zj?};Q1 z;S*P1g6GY}>(1PjR5!Y2I+d-KHk)mg@YSYSz=k8cc(-_@u*r@2s zsaI_bxr^BMpidMwpw#MD)iKrgdou5RaZ#24@F<~m$6_Bfv@4GF4~MkDAGt!)KcrYE zZJbd#zh#(B$d<>u*NuHO?UDRS*?8_6>ePaG?VRiU@TohgaQ@}4?Lb*!d4T;=rJR#` zQwP2`^hv}MjCZiqV>uacV4a*N!aTw5b$Wi)x1c5GSpFEjb`|AXY{_`^JE>9CTFgpO znW?C;zl5N1h4DF~_xMk^rtQ1E_IKSV?-C!QAHK@N_S?O(n;BUepQt%0WiQtc79AIP z)JSAQ8Fa_ryKry$nFZ3`vL^PS@grj=pvxEnZSmyEB86=9&G44J>5U4{FGcU(He7G0 zTwWg?vMaW=ciO^Ly07*IJm2kOEr^Ptq*7peBFg$I--CO1>+3ESls^948Z;C1IW(G? 
z(i`8~E1NEhuJA3tPyXuuqNV4(fAG*`2euqvTysx^i7HMN3C`I5@~u^ydgJ!P_{S8l zZG;colY;xai!M{JH+KgmzQiwyXdL%{T+>-t{NYB=>Xzt`x*gHwqyHHDi<|kpIcA@7 zGBxg!7e9ZV8$vZ22drk^R3}z|pW)ZA17jQWf&E#Zx0W@pPkr03;^-5MclAIl*U$Nu z5}Si^Kd|kIo@YJDdVzX^S`WT}SOI3M1kT3n-5s)Ttjul@@clB_wVyDvWce09WmtQv zvAbbs!EU!=zJ`eFpKv4!QflV12ODGd(zk?z6n|797D4J7i|Xgs+*`sgpeCv$HPHu~ zLC+RW?I7DD`rsR<#|=8qP$a>cuG{|m7e|uPirY2M9k}ehnj7*RU)sI)9ZnbLloMvj!!j919`bu$} zc}@`XR>a2Mzea7?-(x7?Fa1gnN0W*$IGTV(A6x&kv&Z1HaYW2-004$#JU!7g(Tc80 zIfE3Hz&;opnFxX@gRheDBs3Q7NdiC=6~P{On$AWfVnARo0`eMd#zmsY^idauA)v@U zo>;U$2u#9YJ!p^*5=Fq{K;TORG~JNqiA14jIv4=#fuW@$Vu&EH91c(NKzo9~gkMFX zXp=t{i~I+EnT+#7639MSB$)&PgUceR4;E3a|5WgPy#&D4 zjR`>0?h686#ChUruQPf-z*cxX d2}D;*fAxk)LJ~;7I-{fjQ2|Iu=$Y#S{s%|Sz7PNa literal 0 HcmV?d00001 diff --git a/results/kmer_size_experiment/results/20160912mon/perfect.links.csv b/results/kmer_size_experiment/results/20160912mon/perfect.links.csv new file mode 100644 index 00000000..4c2a267f --- /dev/null +++ b/results/kmer_size_experiment/results/20160912mon/perfect.links.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,52592,1 +31,52592,1 +41,52592,1 +51,52592,0 +61,52592,0 +71,52592,0 +81,52592,0 +91,52592,0 +99,52592,0 diff --git a/results/kmer_size_experiment/results/20160912mon/perfect.plain.csv b/results/kmer_size_experiment/results/20160912mon/perfect.plain.csv new file mode 100644 index 00000000..337f0f4a --- /dev/null +++ b/results/kmer_size_experiment/results/20160912mon/perfect.plain.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,969,0 +31,1729,0 +41,2909,0 +51,5501,0 +61,13480,0 +71,21068,0 +81,38623,0 +91,48050,0 +99,52592,0 diff --git a/results/kmer_size_experiment/results/20160912mon/seqn.errors.csv b/results/kmer_size_experiment/results/20160912mon/seqn.errors.csv new file mode 100644 index 00000000..ba2a975d --- /dev/null +++ b/results/kmer_size_experiment/results/20160912mon/seqn.errors.csv @@ -0,0 +1,12 @@ +# The number of sequencing errors that would add a new edge between two +# existing kmers. 
Note: there are 3*reflen possible mutations +kmer,reflen,nkmers,nedges,nerror_edges,cov,err_rate,est_bad_edges +21,1000000,927610,933930,7951,100,0.005,1325 +31,1000000,971394,973934,1874,100,0.005,312 +41,1000000,988492,989443,394,100,0.005,65 +51,1000000,994492,994828,85,100,0.005,14 +61,1000000,996793,996939,35,100,0.005,5 +71,1000000,997897,997975,9,100,0.005,1 +81,1000000,998506,998551,4,100,0.005,0 +91,1000000,998891,998921,6,100,0.005,1 +99,1000000,999092,999114,0,100,0.005,0 diff --git a/results/kmer_size_experiment/results/20160912mon/stoch.cov.pdf b/results/kmer_size_experiment/results/20160912mon/stoch.cov.pdf new file mode 100644 index 0000000000000000000000000000000000000000..a2e6d9b4d2eb9bf60d41cfce6987b7758d279396 GIT binary patch literal 5663 zcmZ`-c{r5q+ZJUR*^*^S3?dYd83rS=FC|&ClQ9NUV`iEq`@S}oELjst*~%-agph2> zz7|nPh%AMSwQr{P?ft#q@BO~#IG*D`?~JypYyy=DRX^8d8mRiNGfb9>{D1z z*nkre1cfL<@NSnu8X6!#kBCN+F?gIll8lDP=&QpNp(=_9C=3RLBVeksAixyw{{Q~7 z#*ne}EK7(H5lQgEphyri3YkD5)6>j}cy|g4O=SF4QczNW(y?E)@C-;6bmj~QjdN#o z0sFff2w?3;KtljMBpHdtdqMznq$ipLQKAP}LjIGW`mY2=93z^D#*zOD2lVhbGJQ#c zsQpR@EYT!9g@~f>r~F_0F#>*%<2)MYN%n$3Rbk2yzz~Ba(@y{xV(G)tN2BoWXhz>8 zGJR&=Am5xtdsBQ4oVR|td6j=Z4O`ysxp(?WFeE;RmwKhM_#62|)=$ z;}>|P>{ukiO;iT2^Eyg8RDL+Q|2>HBL6u1^dxgNOk!Q*UTF~I>j@Z2BlYR>q)tYU- zF|#>Zd~jvlK2*vlry>U00eAJtWv)7hWs-jtL5vpA=u2vxGa6 zl($NdKlnwqgV>*ip5Rb_$xkSGH}-NS^UD6^H~PF0yh-J~QL#69!;Z2VA*|+jv44j_KCj7)9tj=}gkA^W-6uHwUBz58<%!kiU%0@yEGu)FcPU7eG- zAlQCG3Rz5eJn?6!_Wl`3kw&HxennBJTJqMq$PgYUjN#$*xQS*FbDUh3#7Xcsih zjsdeHArjS8)#MwTtV^$sFMEd#6~%@Pu;a4WJs`{q9Y@4jtn$qWM^=+6 zhDsi$M)&m)(yUNS?XqxPqcw4L-GoMjfMm4>QLYtB8Z|PH)8*CsY#mwd;Bmo#CP(3E zl;>|em}W&f*e$CW4QUk)H%AdJbV1dM)`Y`_QS5Gs;s|(aQjR5xtS?p|IVh~TFd;wQ zQRB6m_OL)OjnYhWviIp~>cqoDte@|@YN;WRB*l^ChDWw>;wbTasc6nxw&XS$_`R^( z$A&;nF3h5%$ToHht~b%-RSp|Y?y0(*u~$C%RV5z?W|g;m4Wy5(mo{mpnpuxN$XKG7Q^hkEm+d_-Lx7ANhh+O^0duc$(`5v7JUI zhQOT!yLg@NuPP~`>Zv6`{A4nxPnKg30Op!5s$G%P zc$F3e_)Im;yqpnl!0ZLRnG|^cYR~-X4m{+lNtj#)E?X)IZzeW7^v9R)o7uJ#O_%jE 
zdMXq$$18MpV>U?GRTQcR+?CZs0uGs>_Q{1z4<&F9SD+hf~u?uo?yVZDK zs@p{Qz(1aOuVHt>FMD?U$J)@X8=u#+H~Z5zU;n6g6Hg6{e$ljffjjX-gXZ=wd1G%V z3lR{&?(+;mTj1r+MyNIHAIORr_}JZ5$+A2KTnbxsdBFnr-y&{&U0JC{1QXjH=u}IO z+THw~FsgJgr{bxq5#++vJ@Yks z{zPfxh;?|D(3!QzKKqd803Yp)9uVaFa~6F~Gi}>8=D8_9W*m^7uc)8am(e)mBsIvg zQrgX)S|Twvwdiyjav?{#)r;f1TOV; zXx2*+fDLyPyr)wh-z4)QSPRd-}ey!czB&5is zVX9IM95^gZ?b(g!*-Zsr-CH%tIA6c!Q@?idOk7H%r|DhVm=pns8 zOU`*eIsG@Hq!a>e8f+$8qZ`&{-7}6e`^XDXDVGe!{`fo`Z_CN~{;5;1yVUh!=SD8` zrA(<$0r5^DK8V0&@1~yv?tODtP-a?k0QH45O&p=Yoccm;ix6yiaT-2prhtl4YgcU! zF%4`=Yi9rSS5Dg8$5C(xV zHu**JfPpXB$coI6BN`fv?dT=3c%l^niTZ^ABugV*2>f^We`EmzV$^d4&g$vuB1vd> zhzcXh`M1zutbU(Shr#SPMiD?CeGP*`Gd2Y-(8UArmjr;p5i0+u1Yq>^w*J zld~H8_iz#my?9>gjvQUlyU z>YhTcOpl%Tlz=!?AD`?B7Mf+JRm19~Q4!>`(b}!}=?ZY*1r;Wllre=Fl9fAF9+e^k z3zz@o$24Kwercs2u|~GZ^B$b1P-zTDnVQ$P!pzjMntSHaLrm z=}{s2>Bv@$H%5TTd10S-{i)+?{@4Rqy3yLSylfgz)84(2>h$ZQG``++JLdc1N3?@QnPlYnARGbi zCwrHJI9rWaHoM~bvNCUP_k`~S**-tN_ge6HNFvHeANKr^Mfcb29S3G*2$O*}%e?l{ zITrqx`=7n!d(Ol&!NRN~6TAPaD|90KaMC_~h|uBCr+0*JhL^dXC}ySH;Zu%KAqbPh zP!q~(Yzko=uKUOKSt@ephRUvhI<L2-E!fO&7oQotI4E-&w9aR0 zc;#+aU-ujn`II{NcASEt?VVe_$L1>M_}0|61Xv^TQSs13;gItp(gKytmEg*w1Br+z zaBDoWu)^-4`mo3Xm8*sL$PEj91`6N}I23T;bYf71ccd4k!s67$4m_~SYLh&0xfL(1o;^ct#}JU8*xIeut8 z_Chn^Zs_x9yR#bl8fGF=^->nb9>-N=!;Xu4i!YwIEio=p2T5%=bmt*z*q{HPi<8b1 z|9GOo;*Ol1+>xSZhXD0*}~zz zbphG>OLYGe*u@b;gf&9XuaX$|{lI6V!>3wer}QQ*s509!o-!!w$Wk-3RrMfc&~V;< zUX4pnsqjFrCFcGUyjR2%Sph2b_PHnLa?d5axhf!gEdOX9%iNbW*cNh&vUU2R{6({i z_4Cq0Dnlnr%ZFx){ffU0of{k-G93I+qBIyZI51dM;#2Z*KpT^ZiM%p>`D>rrgKuPR z@5By~rJNK*%{6s^)@veal>A%$E&QYW7yUQZIJQph9^XB- zRkvlus?I9TdVsZzwTkOo^r@!9P0rD~(d7y832yN!!W6J3%tnb@v0D`fufL+Ja#hvd zqu-U-br|&qW~wyq?LTr*rBc-d`T%+q7SnaC^T14PSB!kG-~~b5IM%qM=MN2}W;J2a zi4Ohp5%Qe!wWspz!FDZ<-j(xJTVCGy+qiMBptfi?elIJp0o3Y)vMl+x&su+sI(>Kg zSp8u3T2Bzp6W3Pt9_2HTds)n_p|Sq$UPMGe#OwSnWPjgFXkW$fH`cgT5X#%*wvX;< zKSbFz^_jKdkw*Kkf$uhjgbGcvP2xHV{90FOt4C%Ns^_bTP7BWWaR;mPtLku;KF&Ug z0TJ6QJV89UMyYL)MJ1l4vpPMVF2gPo<=o|Ov7SH9{gA~szwfDS4(N@U;{R;5Ncw2o 
zqdHPGt7&Hz<>8v!KF6ewwrCns4_R(Ru}3(EPpel4fUzu)2*o@p{-M6)dY>k zGx=(hTN0lP{NfZ&4cq0^(`biUq;jN;4)tA!UYEbVjL6Sn?Oy8-z5mpt=s9*tVTnJN z4bO&8S$+e|njNo}PCKD_?waX`T#c-$T+8I|)-MIEIdhA)iiYpkU*I%-pZ7ynQ@T!* zB_lcWs!m-)vp%ExrH1-*%x~ z!9laW72g&1H1a9tN{Gx8g;PLawStGUS+Y%E_T7s0;04Dx0jfcZs>*Y0@%z|OnRjZ> zF-wJ|reY?(QbHzWCMVB*#LwXBX=z=yX_taVQ=Xt7b>?GxY&vZwhZo+DRUMPDRpPi*YEP@ zU6{$sIg>`2|MF#e099w=znpVjlT-%0z^`C?M^+JJ3cd2X#BVX^=9PV(e}#asLb zT>HdAGuO9-?_>UQ>O=wK3u}YOLaf z7W(H}@QXR|O=L@CxBjZ+d!sh#pc8~LyUQH$2epHADI(<43&zB@2Qfz$9k zr?s7d@>`dR8{YlwUqUaF->vQ1g3CQzGG17OkS638ynoha`tU1NFIOy|)XEH=+Z0~= zwp0{vmKV&j5xKhkFR6|ATZV%E^1BRhbZZHNqZ?if+4`TAEe5B9BVm36AV44E;en=` zTnrb>5uywSyfHWm34%}ou2S%1G#2ea20@jT0e3vzmm`rd5Wtg&yhhi!NEC%3bx{~1 zisJ2oMf*YkG6w5ThrE#}A|3|;+=ysKAUzX_LeYIQ5a5oXmm*I>}nv6jqLBQ|1Ko}GKr$OjTM0QJ7)()(?hpPK237r6E)1sj?_4D){m${f za}{Awy5afX7~)?TToL-O8gQ7(KYAe(kr*tRNOy4|fEC6cO+POLIE(YZ({E>tc>tDp eJQ>1pSAWfhL`D+Hza|4$Qh|e{qzudqLH`5eyxO$@ literal 0 HcmV?d00001 diff --git a/results/kmer_size_experiment/results/20160912mon/stoch.links.csv b/results/kmer_size_experiment/results/20160912mon/stoch.links.csv new file mode 100644 index 00000000..04e63624 --- /dev/null +++ b/results/kmer_size_experiment/results/20160912mon/stoch.links.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,74568,6 +31,74568,4 +41,60396,3 +51,52592,0 +61,52592,0 +71,52592,0 +81,52592,0 +91,18978,0 +99,108,1 diff --git a/results/kmer_size_experiment/results/20160912mon/stoch.plain.csv b/results/kmer_size_experiment/results/20160912mon/stoch.plain.csv new file mode 100644 index 00000000..1a9be738 --- /dev/null +++ b/results/kmer_size_experiment/results/20160912mon/stoch.plain.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,969,0 +31,1729,0 +41,2909,0 +51,5501,0 +61,13480,0 +71,21068,0 +81,38623,0 +91,18068,0 +99,108,1 diff --git a/results/kmer_size_experiment/results/20160912mon/stocherr.cov.pdf b/results/kmer_size_experiment/results/20160912mon/stocherr.cov.pdf new file mode 100644 index 
0000000000000000000000000000000000000000..98cb1ff6afafdc3654f5f32a17925793874ed694 GIT binary patch literal 5682 zcmZ`-XH-*Zv_+5>kX}UE1wl|sNa#pUXwo}M4Iw}@Br!#LQL3~7v4AuY1S|-GQl*2S z^df>HARr)BqzdS~gc)b%&AfNl%F6ooJ>NO!o_L!eNIvN}}#Gze^t^ZI|^*`Y}o z$}Jnfh=9QRqLDVGa!A^8FjRV76LY>38?C|>|J#88FobGEU9 zm+L87cQbG!GwY){xS1Z?nI7kjj=9+V9|AtviWABQCXgvhR_5?4^K()gtK6|R7}-~` z-bh7=WhD>y-HOws-%59Mv;%!bgmGEfcNO;n4;IE=IXm{aXU>C@6iZcmH9t0)spQap z4?9^}+H8?lGhSfzgvGUCREPy=^SpGNCY2G;U^>kn(FwA;v2~Ftm2-;D1DQL6y2I_o zJQoo+b>kQrfRDtDJlO#?m+QAJuH0nQ_lm|>n)fV7&Hd(?u_5*S+(ghVwp9BDpvl3m z!O0+&s;0_VYn;J8+C-1e?*~vTEac|L2Jg^Zfx9~Yrt#LA2c8O$I z47~6amw|s9P@(*TxmJa4Uo_{|?vh|G4U-s$HS5w^ty(g#`pleC%k?RlB%4C|Ysz%ZB2mS72Ho2;Z52r#g!CC-qSImG zt4wMO7o!$UqH`305v$U;xG_OqYd;qU*3&WBQ?(-HttBdpVna0*x+#WYok(l;F+Nvz ze9XKI)1dUxz=fdVc8gk1;AiEN>ccWkNV<~osRNCr+kjt zXT^+FPZ-U99-gpIANE;6-nqnu@ayO47i)hGJEnI$bw^R;rAY5b)_rNmAZ{;_a);ua zFTU*6o5#xe6fywIBdVsxcGCO`by195E3r{f*;GrC|14oja~UDo-Oj{fQO8AgQrIhK zd6N`h92;#yHjkz+vm!aEt-e5f9PZQ9Or4m)=)`L0>?``3H$?Mirn$3L3C=V_&0aTm zd%jS=n`T-;HkzAVnOkSB-z&p?t#n9_?ugf$q=|gG0~Vm)wDe#94PRrQTtEI=l#h;& zAD zVd2~GPSQ{5IJVIspYowcaNCG@aYA+pu^I}R68Snd)duLwRwbyeg-yDbRQ0K_Db@84 zO{s~>TTH&Jd-(KS;+?07M$jir%lBs$Zwj;RgajX~{}{s+?f-`B?H=1dsdm3-yYJ+v zk9Tkb-9X05m(uw;(~W?t;6_Do%Bi+q`97v6U5I^xPWFPRX(C=F+8i=xky8To{8wSXPnv|=Lu5BEbc2g z2~|W@(#U^EMR3{22nx#(V@&J>a_}z<5C1V%U>|E#KuZ_aE4@w6X@5DoA^Ej`*n586 zg~xA7`-gJe5v6AuTP%XZqF=J|v1R()w>_4-((1ko+u`3+_!W4TMh4?&70a6a{#t06XQiUh%4>UwThSB|9Pth6C>GHJsd)3~B<4CScb17|$!EhT$K*nQsQ4W8XkR6ZqGHw@p)-VmL zqr8o>!_nD1trmED$gX;Ra=czqWm%f%D09P;^a5=Z&Agj~kPNB)!EU(0kF)Qxr*4`? 
z+R|>cg6*?ywpnbBOJ>z}Zk^3*v$Kq}+zvKen3aWQnx**#a7(BTFsr(si^_j=UGvoI265yw-bTtsb+BOO+iC^;`WSxLT0)PC`^C|5kC9BT97J^3j_> z^S!jLQ8Bi8y6^63lXz5F(-7-ub@R`eXAO0X%$mD?CpEq=Z-ngkgoTHF2l*cFFHLC7 z2wqu(Jzo7;^_>xqdYMLH<3EY*PhO_d?jN-Hi z0)Wj?UTB0qE&y;=puAI3gi?R5|6pz^^ZsI5ia{4!h-$$Rd(2-i2l!Hk67BeU2 z2-Z9&o=@}baY+e29*jp=ztYfQXr}NL75aR;hO`;9A)|?y4DMb2`dT*N`~I0Boy2CE z-*a!)oEL}CSfwY3Ti z1~f`jpo`c$jruw}E;W;+&ef2Z>LI{C&$L?uZ8(LDB$>qMzQWBu69}h9z2VsG#5z97#fQ4}X|%EdM&DUCbfQ(vF?d_sG- z0-I{m$!W-FhQN8DPXrq8zJv2Tfqnwz;%7(I)YTOHhr?YDGH>i$x$PDezA6A0 z-EG}=49hr1Q&xz2F}4@$hvubmT{>b__anB)156WN-auvxy)052;%3t}E$bumczoQ& z^hoZd)8K`*ym5`GZ|m%x#(!s>%Z=R?W&BAN z;p04zXov&fG=O&%ngM3FUpRMi$I)l#$LQ|n-rZ$y{aG+pbNl-HUC#d7PP7liPUwb- z&`6)*1ek-pWct@aSzZ~@eeX#a$VkuL@4NOh)Un#^XD6RTcrwxm4z1?0?p?lnaDi3{ zpfS*;ThtX?pyO^jTG_-|O~XD#N2@0tcl5FcWa=7E$`Lrg&lBjb&j*cI(QDDo7IQX8iY`##&iJ`CX zuvBA8j^b~A{S`{#!)sMdxA1f!9IH{5t#_t4oj^KQi#Q`VI5qAkC~)$%80x16NHBAX zJx<{&phfGxPZ22)t%)~IE$VdUVK6X&See~qc_YdjJ$?Uno!@AB5cV)8o#b3GY3@Ykr%}fGXAp9EH~J0LOsh=; z^dz{p;w;+m_amxfoJ=&~nwCP64U*Qz-V!ROuS$sdiLFRwi+>ca2hv^}da)BU&znu? 
zV^8IZO-eOd-#g=U=BV^n9tW9WY0?=N>wv;IM@q2h4p@X&D8L;r`9L%Qg6bkwEUi^~IQ zkJm|Te#u=zt64YHW9P?5qLh1-PAWP2t_}(HXN?Swd>eW*_-Ifgf6FfO=}KPqqk7nW z)OG=zeTTWxy$aZTK`$}})S8ysTkN$7DI_VAe~=|>w`wD66@y*{SqDW2tpx3CGVjU$ zkoY0ISHEY=pur% z<+AE|??Dej4-fJU)LikSU(gtO+Q!H6D!xxXThhRz>pwl#x<0d z(TYJOUl^2&lw*;rlg&LZ;MDHySFu>N=j(^d#(wk-?TGQ@_OjGcINdU1_zoT1J6d*84_NBkbd6T z{`zMF)l05u%x#X0HJ@J&8U4=BUub&QG@&a$@YTld#)3~jL-JD?bEon4}EoQ!Tqt*+@Eb% zh?9FuizfN-djEPYdzNt znzh96#vYy9DRmpi=2_|O^}H;G-p#>?2QN&Esxhnb ztK2z^I7Zx!wKw2-ONp9Ox1_YrT``}?(af06u}S@A*TiSXl2f!-H1eRqg2ntp?)GV| zQ}tSOcT&?Y+Zl5fGVeoQC~iPwE4nMcxx2Q1ZDg<0MD(;1chZ z!uRj$0_#eb*GGpOitHUP>|jeh)_Q^(_j(!fW8y>8L)G_1R18(O!+Z8>YfSPVKL64h z_9^Vk)i}D)p5&e$xw~?x68z%=@;8r`=B~+r@S%^dFpqIXRd+;aV*~k&BENup(PcvY?X5w{mE;L??T8L(IS^ew2aGYd zT))qqYcZFbrG1OM_~py&5VGDhXf5ly7O@2U689C;KejO+(wF^ZXIUq3V(Z`;OD})2 zo0scy?JWKwu_^4{4CB7227@re*;t|2TJTxdRe>DUkm>mSTSJcZr8#wc_?5xVgVaw; z*6(l=#?@l=uj{@qIP8_oR}ry&^q`8su#X-@`F*wsZc4fj@xd)fyqc(Mp3(t%UpDKV3EthDtNtiyKFOA6!MPkw!OvI0}?{yWM$d9ZsB*Tk`w8F5RD7v1aYrnv70* z*uwYYo9nAZiI%xxbURTS`~Q*J>VL^l(4Q8V0hZ!6p|KPXj4E6IJ~^VXdRQX*7XSjo z(ca!Ditj}=wwwVaWw0L_OC|#9D&Wgx90`R%d6Pg8B_*&Ij$+jjiD&@qLqJ@iXj}x6 zOqIGwGyzHW^Twb805A!S@uEO}2qXcA1;CyJ6g7}?6M;lhtTYhV3r#6SL=yq9JQhdt zLU{vV!eNt0it@)`5dXk^$ygr*f$WDtkVya-=YzwdE>Wr>4l@&}%H8!J9^j$Y|0Tr# zFaZxY0kAFElXU0_QbEd!VS(_Yv_#ns4(IYmj4=vx1w}$55g_odxd7CO{?juIAfUWK z3IGHGQuyluRFst!l>u+yFAPd?MyU^g{R@MrQ1+nzjX@y_l-=jwm^x)$`4^_BK=C&J zjZrH83sZnX{u8GFRsRpBtf2Ux8p=?$fAm5kAkY{Tfnw|eU|Vz$it@bx*aYj1qiko? 
id4O$jI1)fLTn}eMBq0c-!^tQss!=RrNdrqm(Ek8F=HuM} literal 0 HcmV?d00001 diff --git a/results/kmer_size_experiment/results/20160912mon/stocherr.links.csv b/results/kmer_size_experiment/results/20160912mon/stocherr.links.csv new file mode 100644 index 00000000..4a8dfd48 --- /dev/null +++ b/results/kmer_size_experiment/results/20160912mon/stocherr.links.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,1335,89 +31,3512,55 +41,5173,70 +51,11842,28 +61,22376,8 +71,31866,0 +81,13029,0 +91,137,3 +99,108675,0 diff --git a/results/kmer_size_experiment/results/20160912mon/stocherr.plain.csv b/results/kmer_size_experiment/results/20160912mon/stocherr.plain.csv new file mode 100644 index 00000000..d4ed109b --- /dev/null +++ b/results/kmer_size_experiment/results/20160912mon/stocherr.plain.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,957,40 +31,1729,1 +41,2909,0 +51,5501,0 +61,13480,0 +71,21068,0 +81,12431,0 +91,137,3 +99,108675,0 diff --git a/results/kmer_size_experiment/results/20160929thurs/bad.edges.csv b/results/kmer_size_experiment/results/20160929thurs/bad.edges.csv new file mode 100644 index 00000000..ba2a975d --- /dev/null +++ b/results/kmer_size_experiment/results/20160929thurs/bad.edges.csv @@ -0,0 +1,12 @@ +# The number of sequencing errors that would add a new edge between two +# existing kmers. 
Note: there are 3*reflen possible mutations +kmer,reflen,nkmers,nedges,nerror_edges,cov,err_rate,est_bad_edges +21,1000000,927610,933930,7951,100,0.005,1325 +31,1000000,971394,973934,1874,100,0.005,312 +41,1000000,988492,989443,394,100,0.005,65 +51,1000000,994492,994828,85,100,0.005,14 +61,1000000,996793,996939,35,100,0.005,5 +71,1000000,997897,997975,9,100,0.005,1 +81,1000000,998506,998551,4,100,0.005,0 +91,1000000,998891,998921,6,100,0.005,1 +99,1000000,999092,999114,0,100,0.005,0 diff --git a/results/kmer_size_experiment/results/20160929thurs/cleaning.table.csv b/results/kmer_size_experiment/results/20160929thurs/cleaning.table.csv new file mode 100644 index 00000000..28715de6 --- /dev/null +++ b/results/kmer_size_experiment/results/20160929thurs/cleaning.table.csv @@ -0,0 +1,12 @@ +# Number of kmers in the perfect, raw and cleaned graphs +# _nreal is the number of real kmers in the raw/cleaned graph +kmer,nkmers,raw_nkmers,raw_nreal,clean_nkmers,clean_nreal +k21,912362,8064108,912361,916474,912340 +k31,962865,10437776,962864,963212,962853 +k41,984331,11684532,984330,984311,984310 +k51,992751,11939641,992749,992711,992711 +k61,996024,11301966,996022,996009,996009 +k71,997500,9841194,997498,997436,997436 +k81,998290,7606657,998286,997359,997359 +k91,998785,4640344,996949,406297,406277 +k99,999043,1484663,703944,2269,2266 diff --git a/results/kmer_size_experiment/results/20160929thurs/perfect.cov.pdf b/results/kmer_size_experiment/results/20160929thurs/perfect.cov.pdf new file mode 100644 index 0000000000000000000000000000000000000000..6cfce26bd6eb0c3c9cd54331927ab49194c4b02d GIT binary patch literal 6040 zcmZ{oc|6o>`^PQWCi|LFh6q{44932%k*sBxX$+=jGmEVmdvPctk;s}Qa%{;KB_w2D zvXvzvvSbg{Gvl1jbDnd4zj@94@%?;1_kCUWHUGTtkEFGci9A?A86X)p6ZSEzCv3nK z4*&xdf!Hfo0a{uBkRcucBOJW8^lCqK_SOx&H#Cra}@9dF8 z6m^v?&=e2D`5@s0pcRRTBN3^~tnpY+5*&f2y;V|BQc$F7hqbUYNd};!13+LrXIMk155ysWAVU}thQfLSLDn#D1Occ-HP{0GyFm3{3ur!?8;`&c|FnY)u^1xtkpNUX 
zTn@5D5U?aXoVuU#KlY;;evRWi0^?2e0YbovU?9i@i6T-@05U;Qhhv0*V?7bHz6nI? z%+P><^kpYYtXnnD`xlp+@|YgUy;zZ~MKMT)Mld7gvzQX)L`-E89WR^}kl$O?%uElg zkXSkg9WxdR3Np-45?Rl%z|VbJ6Q5lp?CwunS?wQ{$>+x(-z^;q|9KU;ySgZTAo-jU z)k{d1oP}IDnT?U39kb?fUbyb zfNyB32!v_9Euv;}rzQm#=!1wyj$%1MOxF1L>yXe9&a(I#OBRoDI+dOp5@Yl-opvkQ zpY9pYole1x-(zu){UQ(|B1Y%5w>??QwXftmJ}@j>=B`Q^i4tq^i{@_=X$qNPXs#@L z_Oc>NzU_-p5;_f?AvyPj4pq;DRyp$`3#Iw;$S6CtfV|K6THknh#WEe=m@13M3Hsbu zn<$wPqHGvLnA12PY`oO}gs)%q2h{%7@9LP*64i0eK74bipuv}1Dcf^jH02anwIu_^ zwa59|f#2wr80o7b>uTBIwy2W6@W_spU}PGmMvmq%{_U#uvIKt zpEW>o>R~%ahO(I6WwoELi&uCKT-o*6jgpi0gkwIg6wdOVtYl+z+ULGaFnR!VN4=F*{E&!|LYh1)6WAp>P_ z%(wFmWAr{xxC?s|yTPWQlvWM+%4-&f>}y{(p?SWMQ-X$yj659M0P3f17QQ_^u_3D-Ww)`I7WBxHyhZ$7d5gI zNQPV-u)nd?gED2sfR8Ap=dV3@rb1Qh9;4e>ulllHQL~g6&5@b^v+rU1*5>hLQWaJ@ z_VnSb)BiZ*VY|ukE(%HHlt_7%rh3)moH5l;g^wY(C_rayv{@A*zw)VXZ{ zVU#z2RD}AStB^cjss0vWalf;1iT5-NS}S$+zDNA$S1q!gMA)nM*$zTyJlS*W6?Ygu z)+mbI8>!$@`f9xOUUAj@nvQk~|LoS5v+w@%U=HDqdBt+v5BSuRW}OjC5}PMKnPWa> z+DYT+ysl_Q=G~?J+6srJE9e_FC)k4kx^0}AW)BPXXXiJiS;|mAbNpH%>sbj_^qq+H zLe|Ac7QE_eF;g~Xk^l8TyBnsDKK?q9mE)BI4F zD@ns6t&PtE6Yyb>zFrGkJZ44xA8}toJh*oXMsXw1kuhL=j)iozTyoJf(ND>JnDORQ z%AZbFJnPDTYH0p^UxpB|#di=VGbz{-k^dZvUrmy!{GK7BEBCk_(NVS!OVBTPDoK7k zkyH!YQu^ha@KLIesa!Svj9BF+1!!+Rg zlCVh5^r&u+hfuSQV)oL|q{6G}0LOzZK63ZuPr=_6+`r?#FOQD;Pam|N-f7joVTtFwYDy-M(&(OP=L4V4@e*G1Qgsl6$Vb z)A=-{gt_3Wg>m<*cZq2V+4i#&fOD;u4rTy6=EkF`d|8PtHAbHXOFHMc4T}KtZybK( zW0zjDx%iH{8gx9{h73!lnJ2prjP#chKl_s}C<|q7oNt@|(K;Xe(IW9NxgC5P^t~e` zxNZIj;j8)!B6;O9d4=jz5XvN2{PJC)`ec9k9^t>L7yRK{SX?d2HTAGZ7(DLIipLyW&y3y}Or z*`e*8_1to^6Ad|U++&Ux<3KTsrWYOcRzI2940^;Ys+yPeP8B2QbQO|)BV>vurSHjH z7O2`SlAbQkxkdHXlwUV)w-U-P{9NS2zC+u8@QEsaE7@jnCM{>-HpIC71R^F!wS&JZ z{wJGpLxIGzzI#`PaAUXXWuwEKVlU#7+nGePs-lLlgGuWgKRO1U1m&vo$$S_w{?urG zY`<)wCC@}E-$Y75UIXet^mHbo8ddA+k9q{{R4hb$_I=$I1=q#9?z|G6TYroIJ=yzI z=6!yuc5w@b$pa`gG8oIc7bJ>NPPrAk#mg&e2Y&Ye?8+Xw_o4>3y%spNdS`c_Nxa(b zET&|}&ujL3l0c-ZHzrQ&Lf{J0u&36k$nJxS>*sCBhR4Gr$G_T@ci 
zwkhZWmD)hR*h@tn`hT()t*5`(>tV9Lz1^r_R8NCygLNWzOj7SPA6o`9js~*?g|lEb zZu6IS=)|E$_BbP(gnU+r^i*iBVM4ih%nf>*@}(ofMuWaM{u9aPJWGTpp9ijXrA>{I-8~K=)P?-OtRVS|@QZo%Q`&^Xt2#uEIj?YT6FzlGRr2_n#?S zb}O29i#vrwF5M-p%ej!ZQxRP18<0pX;Z8#L3}S6|Lh9pfb>7BX2_C{i^Q^nIkj69c z2;#Y$datpw<-+6(P&(8wk}B*ZGL5J6>N#-Y4@3)Bh-;3hsjDfX z<37ePf@?jM^1t`9V4*$a5gk7!67?HtNhj4?Le-^g!WXKact`~fYPWmhlknOssZ0{=N=xbBPZ(Xr{srMi5_k{nTIMkf~ z@kUT0Bo1zB1gYV*>HhNIz?nfANN23axTtqxfsya!k!LS?YUsG87#Z}XqmTG{fTzOw z@6a0oh4@3OvW_K%mv~4OGLy1+lp~tAWb}8N5^4U zon_<0yC)FDa$1oszsb3b2gOwBVWq|z7b(=#z6MbfRS2(sc@M`(=3a@kYEGHraRlgw z74U>|^JwJ6D)I=vGBHR9kYM8xE55_~hykfLcIVV1(b^cZgn~CN{LIG2VC(ZqM>|9X zZp`E)XJ5*O<`&toSU)V(8N4_seHF09V`*|Nr>n1ffsQDrA^b2_!Neget@q?Y$l?;`_l_v(`)NcsCz6HxKcYLBTEV2~C(TXp-f&xDST<5;d zd!0!$juLTthKPHncG*?yLbPnl#VKc;6wU$1S0ntK_XGFpEw0v$wa%}IT@x}_Fsp7{ zrkFI}EwH8(=yVs5|M0_v@!FNfXakP zh@r)nr5=jE7jFRGZ8!1c!fQF5|6qVQlPNYS^};6WtmD}u(rf&eWCx{*XI*Uq@}nJ; z?KkaRpWH8X%nmT}HT(LQ*S+2Bw~`hc=REsdqW$Na{ZAnmM@`i2)eQqH@v-YnpG^7X zTBBzSr)^@S_ocn1;r5Y5RtUS=LDHbfqSK-phoMqFQ?D&D?4Ow2>`r{)>v zw>x|VWKL$E=wn>?yb0NZ?UD90FUnuEy4bjQW(YbYTU0tUR~T6Mb;x{hY{+Er!(*jE z%HY6Y^<)3XlLLCl`^d;^vsb_LsTF@Ea-!opMOM-i)uZMoh9gzFlus)=_^b?w^rjE@ z53dh&^cVGOi0^UNeCDsycb~d*HkH;T;f5rT6F{Dk@Z}T7ZkeCm>N> zC*Ly7)pte7_X=JRG>B!6J#n6Q;BIO&3K8erFCQU)RK8v=(@EH|)dgL-SiR?i#y-Tn z_o1}iyu#;W=Q9A`C@x8rfA{S5_L%Fs>tt>5yk8Fm~0*Nzg@dni+5de%fql&8&x-8Z2jH*3 zeTVYiHs6-Pwv6}Gw_NXyn&JCow@jFH=usW59`f(#`GCDKDHHNEiOW8t>!T4;r`(A|Bd9;ToGnI-rhGHesp z?OzJoAI&J(D;Umeyl~WVJabz{`%Hs2V@kq(Uwbp2e71c^mC^N%cbm@2QhEHWA9>bQC@M7!&0e&vv%WaY*$!SWEJvSi1bqhIS{#4!OP7m;Zom& zobs*UC6`!%7~@t|Xbq}xJbFxeM6Cw7l3!#gY7rnQWKm)vYd(pc!!+*R>vFhvnKG96 z6jAmz8`X2^?WO7ArSXaClhO_fJs{EXQxy#a7Px79T#9DOm*>_n2b=2HGK&feXJOY- zH-vRn)gpytzGZgX!SZ^E*K+>YyZXTT(l1{}1}+s`ba39mlzOao1-;noX3o7CL%B~; z-#-O4QQZ#d+N-NQm;3nX=jPz);Ll;vjFhgpt}gip@`w^#@gvf@$E&8!$$*f7_pedK z*n;YmQw&iOQKv%Ec9*xB^`oxGm&HA$c<;KQS*mg$@h!T3P#?P2FS#7IbV}!Vz|-os zyn>I{J6Cr^2edB{Tt5Yjvc38(hcENOTxPn?J<{Um&$9#Y28*E8^xw1zC7|cnHB|5D 
z#yq*_;pd$%x`7|I4$6;q3&pv6x_zme#XTmx49@zpP+MnFTVu_mDMAxIQ7)y6__-PUd_n9RtTnRRXyf#_Y1)c_FCjA}v020V=D2&`1o4091#9d`Va$0)_A*0>H}3 zAWtkc5G4?hK#(^c=0{~*7@R~yT{sdCC!xJihyWmnh(vi(B{U3<$6|n>D|iIWNL>ko z!>NHK0OX0JmLebtK#&3kOY}r|0YUh~CgD`_N1J{Sx3{7o`dOJ9r%O4&y1j-LV zM8aVJ(66}wX%qc>G8=$LcmWiFU@$=OuMZHaqN1b%^aB2+L8$pN?E}R8rGcT;d(i)C z5M?O!V(|}6MVT7b{zHR8saKePYU=;mPZ0v9Ce8orQ>5nH|I}2J)c#dN1*-hFUWj-Y z5{1B1<8vU$4jF`?o)-u@hw;KvZ)dc5fNZf?B9InqAI^qAgyD&YlTlGsh5#fbjjc=o F{{i@)vE2Xw literal 0 HcmV?d00001 diff --git a/results/kmer_size_experiment/results/20160929thurs/perfect.links.csv b/results/kmer_size_experiment/results/20160929thurs/perfect.links.csv new file mode 100644 index 00000000..4c2a267f --- /dev/null +++ b/results/kmer_size_experiment/results/20160929thurs/perfect.links.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,52592,1 +31,52592,1 +41,52592,1 +51,52592,0 +61,52592,0 +71,52592,0 +81,52592,0 +91,52592,0 +99,52592,0 diff --git a/results/kmer_size_experiment/results/20160929thurs/perfect.pe.csv b/results/kmer_size_experiment/results/20160929thurs/perfect.pe.csv new file mode 100644 index 00000000..fc66e32d --- /dev/null +++ b/results/kmer_size_experiment/results/20160929thurs/perfect.pe.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,42887,5 +31,56913,4 +41,68833,3 +51,102361,3 +61,164617,2 +71,181787,2 +81,181787,1 +91,228304,0 +99,228304,0 diff --git a/results/kmer_size_experiment/results/20160929thurs/perfect.plain.csv b/results/kmer_size_experiment/results/20160929thurs/perfect.plain.csv new file mode 100644 index 00000000..337f0f4a --- /dev/null +++ b/results/kmer_size_experiment/results/20160929thurs/perfect.plain.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,969,0 +31,1729,0 +41,2909,0 +51,5501,0 +61,13480,0 +71,21068,0 +81,38623,0 +91,48050,0 +99,52592,0 diff --git a/results/kmer_size_experiment/results/20160929thurs/stoch.cov.pdf b/results/kmer_size_experiment/results/20160929thurs/stoch.cov.pdf new file mode 100644 index 
0000000000000000000000000000000000000000..8dbf344a061fcbbd126a8e88109547fa566b11e8 GIT binary patch literal 6066 zcmZ{oc|6qZ*T+T3NY=6?Ls_zo8H~Mb4Uv7{O=B=MX2vYG?1XHk5D|%NC9-78mM!bp z%aSc+DNB}!-_aT9}yEBm^4^M9v~7r7y2QT96IEH z2Y`XHKw6 z`gI*-h9qD~cmyS%{D0-62K*Vv6(q);=mCU)A&Ni{4233AUI2ojDZ|l0BCxJVYTpDR zWoDiL-;5Pod#pnhd(gCVvo4bd?^K`A_f($eM;KYg^)#53{BW)!FA!seP>+G&teU%o z`Q&HGXdtU!|MSQnZc3+vqTJn}wF@$}imMmvf-kI+=>@}LvuCY4nHupDyQILSpC6|J zADQH=b+!!$U}QjT9W!Rk4d+I7uO66|l z-hGWm>MSs?t-~@L_XHm*3-uPhxqLnKSfcb}#;h34QZ!qQsPlwQG@7lsYAWNPm8QjZ zp4WxR)$KALXPMr~AXi^L{y2ErY{yOtG9}htutvEiB zp6xQAL(4>4mnrtTEZzszV9kk5@q9(tGLZl3HU`v}TI|<|ex;mv#noA;T4RfCUguhd z7)R&^i#Nkji6ef5k9K0y`q?4X z*CfZuoFOk1;#__B+O+D}h5NUFEIPUUfe3Qw1MBtEZgWyceOBy`#`7FSv0c68AtTed z6CI7r76(1WqGBJhhXYx(XdIb+UurE53>_Wr^u==dyGe(^j)%#!B@(_#`dZEAY49ss zzk*z)Ya@lvB;Imi(HfrH*UyBh-9K*b`NUFd4Cw1BY(vLRzY0)WnGAfLn!*#<5{(qx zXJ|gl*~}MxtW|x+ad|zBWAlz}^4y&H>1X=5uWhb2V$_F4nzw}Dx@>rkfx&D;41{ar zE=IQeF(&*mJkTU_T<&^>p;2?IM7%9zY8YFjVp6iV@ORG8y@MvMq&ipeYsQRJAZ3Vw5!-y@jP;= z-SCaTH7G`_t-r{NU|5>q#i@Tw#7>caV!yw4qU%9P-;<*??`}tv$p`cn;=$qZHJgG0 z)&iU1AGMM-bY8pr1bsAmU8ovde#74=Z*4UB8FLu@j>NJ|=^}V^-QBlN4J*lmrWxK zB(4-R4tFF=B5_56(MpvV_5OM{Zlwx|VVVMvD(ydoDjToczYKVD>xL&U#=WbY*ew<8 z+Px#Wd;-mjxrl}=rle^2tWH8$H0?>6>)V^LBrGOXmLL!FZ< z*12&Xx%Ttx)~E9A&Xh22#w0pKA^GKrSN$rD3_*eW8{2ggt1*aCNySmg0BibyX)bXB za5{9`DI??H+t0a$`}@;Q&egFy`Rg)7u|xHn-Q5ozoWgk~JG--F-uvt(?Cv3zB=bYQz+FGegAI@t`_%i@KRq6Gtv9uM}#b`X24Vwz0Jbq}SYZwp6i8O>K zrErjsqrie)vvX@XF9pg7iTvi7-lNC z6cOsAP@H}f@@$~S)Ra%LsVKv3nqhj}^QEUn(sP!d)#h$PFgebpv4W3v{4Q~PQ_Xu% zQNb)et*W+g|Bku!gw`d2Si$YpZw>c9IW0eYXR<~-ATPdm$N0>yd3}9{;fC%%wm&<% zGWn&p5tH{x{sQe~e#Z_8?UBhMXw8{%XMf@k7v`-Gbl&yV&fbTCGvtun*k%)%4*7BY z_RnaEk`5iVYY!WEK6>}mMA45yl0U2*)FbB7F;XeM>Gat8Nb-s2+uLXL4^->I(thT5 z?ROvi+*oETsLXB;+7I5@+S@zW{s~stZaMUBq#H?6)u7POUtsYU7g4e2HgEug}m@c)n+Rif6j2VK_I)`Am|u0SX?%JC2TQBQx~s7Zw_47CVI2kDJMAgM_~ zMiia{{h=}iW$6D&Wz?Sjrm}}AJ{IOao=8CCW&Enb`E3 z?$8KBbu4f?rippSAmTG2IogRYgkx?VF@5p*D6h`27mkZR<+5ur@ANCbwchk;!JVM% 
zS@q=i?rVXgwKR8rgwwyiV#5@2JllqAcA+ggKGxlZ@yPLY8ag!146eLf>(nlcHjOsm zT^u@%bCcl$2-L-B6yWoM{u|0n{`kc3r%SrvT|%M$`f^x#_{tJle&YbUKbEeOj!fz zM4;E1+5jiB);TG6$)}T(4hBcEuAc`juI*)=2-bEEAe%M3U;7C6-MpuCmk8E&QpC9V z=jpV*>6^oSKmI8zYFCi)C+W;7_8;dOqCrWQbh`7d0I#G}+P~q9rccv~)Y#2>u*=%= zGk2^e<<|Hvdw+@z?GqtBji56$;u7pYrW>x3{cC~D?YeZ|dt(REQXlS>^gcKzhV39}dOw5JM3&@=A;{q1`S=Cy$uPp2!S2zYgfpV2il< zhV7(T6cMQX=5*Ojn>+mTboAQnCa05*iD+x-(Vo=!{K%}}bW!ATw%elpqBxPSmWD;U z>XE4j$KFYQ#}dLO7h|Pk2@%}Qo$C-KL7A|sreqvl0NZMWam)P~b{oLO z(0uj~Hg=WAv9j!^Uc zCvVO@PRX{)hUOHRGMYRrxG-!vEPf5J&29+0{B^*qB^P;Wg zEw(Hio~4%9Wc%uP>3uEaaO%{%RorT7_rxNHXt5MISOKvdgDG1mYx|YsV>jqO>T*f7 zMbBx^n#PFli@S>>EFy}Gk>)kSq+!^y?XnV!wp<>4zZvRD1=b_H;{3^&n1}im`kDIi zU0x^83uN;T&@C=)LH6K#q&-zjX-i|vhGo$a=!j%d$;hVyzk;tL`orTRu;Hmfx#7U! zq2a1RpTg-O4OA*B;`;oxF9S+XzYJ{jr z|5Vo-CEb6@=#*A0LoELlj-h*LEofwd-Jo>1G_!P_RF*BTO`E-E`Eu2shbQ(S=DkN? zN90vb4|9(p#OBlDH0if5+rN!FY&cBUJYDc62V&eY9aR$upP|faf>#@x8{Ym556=mI zlidp+9B2X$ycqq;5Zewwc)C6G(NgtOF7{UWv^6@`Z2KkP-FI&8Jc9=YvE4a-?VG!s zd<*e4%Qbk1&yG(p$E$Rz>M>?Mjy?%D!uRP|16eb5?{!4v7rGZMXp-HXMxBI9PL#Yw zyMNRFb{^Y0L9T1P(H}L(`O$oZFl|j%9IG1f=^{^IZ%&^Nt_Z$!2tDLKVs~URksK)$=-P#*OGEGP%wds zmAQl|myA1?XD%1DjcWHjvVaUa3bs5d3Dixb-MeBsa_U(?0HQ4kV?5tNK|=Gc}#h3cgFn{2dD^u zLjkEh7F$-=CO<|V4r$dXbAmkhkYavsQ&dr9M=ukfDT{Tf8QU=I7TXBiY;+2>Yjzy4 ze`H_!(uMeBp=-~wzv#(Rf9uaBGIr_>?bzPX%7{wT^JdT-2H1^xc*17R{&|w@VW$0^N5VJsB%=aygeBrj$pL2SrvAKA*Y3=UY+L@g)Dl zjqbG_!69`kg2PAOF{anI9&=_HeagzXkW5-$TACk1)EoG(W!zFH6oX!2*U|lBn+pNt zhf6zOF8WPv|9HXN$DQEh>iDI09#=?c3VQT`abHk{L4ZLu>U2~sNY!zbH&ZcSE@nS@ z$hy8Hv+fjbW$?|9#M#fLZ?RK))k5{1b>A1Q_KFv(2pICDErC0*YCdzYK4$;H4u6pR z!3)P_p!$Vnb(M9O=5STS*&|$x4e$1!Q_XkAwq^iPChpit&CF#xujqeTzSCMPPcUuS6yd`c<_g|R?6K148 zdmh%M`f$qCti4#1yqFra_?>6#>uP?SaaIuBPQ>Q^zffEGZwv+eWkFuTP~1KghT=g| zvGqSEYZOKkLqPoz03aQdn;VkiJ5r5Bd!W1m$PUriz?#cgKHtu)kRU4~YLZVSjA`Aal}H;xA8@DpFPqBe*A}CCYa2Yc9WI^pI$8BoT#x z13-V~0;Ep#pPsER9_a><1%kl<*}s24sDgr=0?-Zkw+5j&)6^dj^S1_uQud(#twH3K z{}l&OP^2t3{}l&SqU;y{r740bt^IFJmg4sQTT}j*rXUagR}BTI;y-#J;^8PX5>GMK 
zfgp2~Ka%oZAm}p24NKY1sPh1sVX;IY)$sl`8v+rIC;pm@f+AE2AR=7Nij=DM3O?q$CtU zKu|%DP*OsFo9B4G=NrH0x^`WA-k-ak`>)sMEn*7Ol>$r40Yt(U!V+jsmjIV91CSV3@)VH2rvZX2 z{k)JskT#qEM`PWAAXB(I5)YIm1(*Z>m7wrn3FJ6(G!BU&{1pz;#$pJhS3FShS31ZX ziN_Lg2vR+{|ENa}_`MDzB*vZK0fc~MX@`fptZa=f)FAJM#qi zW`DFb#PaK~v~O0gFy58Y8sF?=HPrwJQ=u7(Iy%90Y+q=eo_hYQMBt>9w>rmBB{^Kt zHD~qkMoY_K*6O3P6}e&+PUY&*dDSm@kheTY<4*NXmE$GH4)xdEt17#z(V9wiLBvU_{ zny_ml=is)zDjbe=u?mBbxw%yjou{^3u`blgaxBJ9Ut%xTUOXku^6uV~^V7rUYBau6 zNl;VK7D`Flq#3kFg%)P4x~`yHRvD7~*J}52b<62PX`PdeT}->!G|y9Maz%z(Q_b=6 zxs)(a9(V21C$+$~+Kp&zfnAy9i~&p&Ps8^|+1-Vj%A{H4gjrlQV?DllLzPdn1Fsi} z=>;!~gx7V!?$8Q_KB81LWbxmbpd_k(O4Qh_gpI~(+@TeIy00AOuE<-&^)aY7748jb z{ZkjhjdDVVt=Acn!_GD(JE=5kZ1uw)(+X340L*&w)bP}}R7pokzLh-Jd7izKD^}*Y z+Comn)*An|SFJS2He;&N&~WM_=bDFdXIUxUQP=|p?7!Yw#?{?f7JkVkET1)u=V(-k z^UPJK5^YJc{kOY%G~tpLN`Skg*HubzMF|5l@$_GX?&Fu5>u0WmFMT_OKvPr zXiRW|5Y9grQkSz;6w?BR&&9q}@&N{pG)Gw6W*YA-(|Rp zSx>wVTumf&ojR|MrgCLNDUnU1zFRcXHweUUmsdG_ea7axxvm*j;>GA)w2^af-(cuf z>b`lg|Dpcb&9*61hN;UqM&^C-vINejZm`Sv^k*l`M$2;}1#iLPtfWHu+0%hiUohb_ z(~Jt|^Pt^BA6#-DW)YSH>arTsR_&)pADqtFKwmC(-)QZsuq zMF}iyd<^U3Pd%VyWEL>55qRA2g@vs*h-ze#q!A{CHh1@rEvmO4nkcu+yxD__iP?9sqT^&4AUM*NXU)2cZCWL zC$6qkBl;azR~FC03X_I|n@t7F`(D4UuAXG}wG8}u%IEv{&DF!jFC|hls^PYa^s9@9 zhcQb_f|H+90^H#5z1KLNw7<{E(W<1;#&T#;GW$d~zP=c_%sDoB-5Z*^0I-DV#r2d$ z%hpDhxtZbl9n0%Qm|r zl4e=QBC)4T+EVK;)tr+i9CXN*L_O}{LJZp_MU%ud@Va$ zhTc=s+OY>Wfy-LBoO71;}8%zO@HgDM~;LE}D0#iXeV7Oq zz|Xl_7CW_#j(1G3J3z|4Mw(<_m?ul}=uoGaJygiKSZHo5BR<7h=y8%_=6UinNcY_w zjQUaRFnago$s3-J-)jf#dJTGMHWGb$Q`?suozEifU*75UMHm4NlCd&$bSI12RLn+d zof!$@-kqh{RtYaqouzEn2@m++@_u~m4CK|JzH25$|8t&qH=2$&Rd}bWfk?nDMjPE+ z>F8V^{IU#_&hS^E|73)`h2J`Fa&IU*m=o2O{m8Q_+s~}I0u;@h)YV#B`vd6Zk6UTr^#rC^S6k^Q;XK{HpGlbnkQ{X;UC!u4lN$Kniz_j=#o`C-3x zZ@Ac7`9iqq+JsobpwJOUxE6VKW@aWh=K1R?QT)@F8h!ljDvGZ&m{%xfrzLlGLdyHq zr!k;Mx!Px#8umY`JNRM+%`N7C&VO?LdQ zAgKe=m*5Bh>5}jWsHzImg#f|iBEJ9&q~lA_vmlUBP*s&&j?@y3#aVd45x*1=-ds-$ zDE~YBUr0ul$o=d=SG2XY;CQ4f5K4}6{0({Jx8L8WNv0|cxd{k{yoEv_$wfg|N#F+h zjbHLg(Eo{FJNkAzj 
zRw?*zlJ6}M0vQ5aB}O-c7<=`ozYoQaW!}p<8VUaqWZh)+qo3Z zctVW$MLFxJ>Uit;#*>tyczYAz#WgmS%2WJnac6pPkFTS_PXGZd0c-)ZDoH^R=NAcH zPZXUURIkQLJioSJ=XKG`+Kau3_bJ;P%Vq*|&-T{fSLA_iAan33^`tx@G4=$8N0Itk z;&7>gyQ8{_hE0aPnitr1W3N88L4l9WXF@m^*nQw8dwVdUSr94fNL>8j`&&wP!R?B?ipj6R93PE1I=o&e<`@~T0s zWSL}!6)^JcH?^SN3bt;qTyR4ih!+rJ*>|4))2E?z3WnecFdrml=-f-%Ppv~SQlnf~ zxwPWx;`xl&##1u8(a0pbS5gsDj8d(VdA7VZJ@%e;tBr>qp4c4BJCC5g= z?TV^Qsn<_>_ue{ubC_?gSiUt9gmK68HO?S>#&d57UVqlv{`yBmL}5gK!4Uk_SO<8l zX6h?ld@lgu>6YW8rQ)YlbxZlv&eU|L?U%qe-?_Pq4euMq4;K3MZXax)T~2IXZN@pQ zIX=XkZiF?qVa$CTeUbtqj;NV~m~-_~`XWop+{>3WN8FvJoP?{Hs$Zks_YC$fVV}>8 zv_20Qjag*>Xt9o;w;oZLZk+UaF*1jZn!glMA9Cj>^d}#s9p!B3r_eHuzK*o8AkCtD zTFpsK=J26#olZ3&FMXH(y*sVjXO~o4r&!FBd4jKNMcl@r#$1oU+MoKR96P5y=+OV;j2-zAbfoQ>h@EZg}Ta z=)(rXk|y+q^agt_J(eDOck=~k+2lgAXxc?JgImUPxvH6qx#r2=EIYU?8FNbxOQs&S zUu86&$=kc6CfcS(osoRc+ftvknBfT0AiE8Tse4%$S64Kcow4o!6#=lxBekdFYg>Ed zrbi}1pTlaMAorKjEmF2c6_oe&b8)#cSeNGMZ^nZczXfeSa|*NTbR4tKw}1S^h466s z#o@Kl@`n}v)@#+$c53at*rBk7$OhES5V3k`Nl;+3w40+zvenrA2Q|CFYxeP+aXLK; z&?a>0Ozc~+H;PTDjpA}+K|@~=Zo?`=34?j;CrtZ6+K_dcbI{wn^~lFB3(zB0FRk8B ztNw$JMOx z4j+A|8Mo_+wf?Dvlexxe9@caYt$6)@{|ftz2H~ zulkRZ->;dy#?I+C3AGKheqXUVtXgiwV@76e@!Ua;A97!{#U0(>=L?oQsc~Eds$E`H zQ{Hmvj8H+$)?8Fa{@e+Ex+3%)-V-?t+de;|*C!fu5v1;P;Cp;!DkZa|M_uBl!_mtR z(>~99k0COgD)K)Zc8swSH%byr@`9=NBe#$K3$>O0#!$dtUZW0%Wb2_YB(sr>t^aznMqxBD zc+_tJ0D__1+>j)Lk?bJa1LfpFo+u0v4^)DJyop!>5{+~t0Kjr`AXhBO6UF0EK#)5Q zev8Doa0HQzx(E~wLG*M(BYlA&0t)R)l04xE92NrvUB@BGfuu}00zvXD0U%ctsTCfD z2ZE$ASb{6k4G6;h8WKSwe>58Y58Z=^afjoGo@h9c00d#(u^8k{QaAXo%6KxlJO0CL z{l)sfLHtj*_18@RWI?=6_+`G5MbZ`HD%_Ja66tpEYcGGs=p)g$kOUM04gme$3y{3g zf10ztIHVgu1_%ZNWd44DPqUHvfA)8IslgpPIa^(tq`khsyn976J~ALL+e`2OS8qK=~s{ t-wOm?!MI^bw=?oSK;~F10Z4YXf9(cOfa3_iHY2YfCl3%2(J|2l{2#t(oP_`Y literal 0 HcmV?d00001 diff --git a/results/kmer_size_experiment/results/20160929thurs/stocherr.links.csv b/results/kmer_size_experiment/results/20160929thurs/stocherr.links.csv new file mode 100644 index 00000000..98281166 --- /dev/null +++ 
b/results/kmer_size_experiment/results/20160929thurs/stocherr.links.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,1552,135 +31,2655,86 +41,5732,71 +51,10941,20 +61,31864,5 +71,31864,1 +81,11749,0 +91,138,1 +99,21,0 diff --git a/results/kmer_size_experiment/results/20160929thurs/stocherr.pe.csv b/results/kmer_size_experiment/results/20160929thurs/stocherr.pe.csv new file mode 100644 index 00000000..0be2a3f6 --- /dev/null +++ b/results/kmer_size_experiment/results/20160929thurs/stocherr.pe.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,1343,127 +31,2755,83 +41,5821,53 +51,14476,22 +61,36204,7 +71,48050,1 +81,12589,0 +91,138,1 +99,21,0 diff --git a/results/kmer_size_experiment/results/20160929thurs/stocherr.plain.csv b/results/kmer_size_experiment/results/20160929thurs/stocherr.plain.csv new file mode 100644 index 00000000..6b0f24bd --- /dev/null +++ b/results/kmer_size_experiment/results/20160929thurs/stocherr.plain.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,961,55 +31,1729,19 +41,2909,1 +51,5501,0 +61,13278,0 +71,21068,0 +81,11700,0 +91,138,1 +99,21,0 diff --git a/results/kmer_size_experiment/results/make-cleaning-table.py b/results/kmer_size_experiment/results/make-cleaning-table.py new file mode 100755 index 00000000..25f97343 --- /dev/null +++ b/results/kmer_size_experiment/results/make-cleaning-table.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +from __future__ import print_function +try: input = raw_input +except: pass +import sys +import csv +import re + +# +# Read in distance matrices between raw, clean and perfect graphs for k=... 
+# output table: kmer,nkmers,raw_nkmers,raw_nreal,clean_nkmers,clean_nreal +# + +def usage(argv,err=None): + if err is not None: print(err,file=sys.stderr) + print("usage: python",argv[0]," ...",file=sys.stderr) + exit(-1) + +def load_csv(csvpath): + m = [] + with open(csvpath, newline='') as csvpath: + csvreader = csv.reader(csvpath, delimiter='\t', quotechar='"') + next(csvreader) # skip first row (column headers) + for row in csvreader: + m.append([ 0 if x == '.' else int(x) for x in row[1:]]) + return m + +def main(argv): + if len(argv) <= 1: usage(argv) + sep = ',' + print("# Number of kmers in the perfect, raw and cleaned graphs") + print("# _nreal is the number of real kmers in the raw/cleaned graph") + print(sep.join(["kmer","nkmers", + "raw_nkmers","raw_nreal", + "clean_nkmers","clean_nreal"])) + for f in argv[1:]: + match = re.search('k([0-9]+)', f) + k = match.group(0) + m = load_csv(f) + r = [k,m[2][2],m[0][0],m[0][2],m[1][1],m[1][2]] + print(sep.join([str(x) for x in r])) + +if __name__ == '__main__': + main(sys.argv) diff --git a/results/kmer_size_experiment/results/make-csv.sh b/results/kmer_size_experiment/results/make-csv.sh new file mode 100755 index 00000000..e8dbf3b9 --- /dev/null +++ b/results/kmer_size_experiment/results/make-csv.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -eou pipefail + +echo "K,NG50,AssemblyErrors" +for f in $@ +do + K=`echo $f | grep -oE 'k[0-9]+' | grep -oE '[0-9]+$'` + NG50=`grep 'NG50:' $f | grep -oE '[0-9]+$'` + ERRORS=`grep 'assembly_errors:' $f | grep -oE '[0-9]+$'` + echo "$K,$NG50,$ERRORS" +done diff --git a/results/kmer_size_experiment/results/make-csvs-and-plots.sh b/results/kmer_size_experiment/results/make-csvs-and-plots.sh new file mode 100755 index 00000000..b071ec69 --- /dev/null +++ b/results/kmer_size_experiment/results/make-csvs-and-plots.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -eou pipefail + +./make-csv.sh ../perfect_cov/k*/stats.plain.txt > perfect.plain.csv +./make-csv.sh ../perfect_cov/k*/stats.links.txt > 
perfect.links.csv +./make-csv.sh ../perfect_cov/k*/stats.pe.txt > perfect.pe.csv +./plot-n50-and-errs.R perfect.plain.csv perfect.links.csv perfect.pe.csv "Perfect coverage (100X, 100bp reads)" perfect.cov.pdf + +./make-csv.sh ../stoch_cov/k*/stats.plain.txt > stoch.plain.csv +./make-csv.sh ../stoch_cov/k*/stats.links.txt > stoch.links.csv +./make-csv.sh ../stoch_cov/k*/stats.pe.txt > stoch.pe.csv +./plot-n50-and-errs.R stoch.plain.csv stoch.links.csv stoch.pe.csv "Stochastic coverage (100X, 100bp reads)" stoch.cov.pdf + +./make-csv.sh ../stocherr_cov/k*/stats.plain.txt > stocherr.plain.csv +./make-csv.sh ../stocherr_cov/k*/stats.links.txt > stocherr.links.csv +./make-csv.sh ../stocherr_cov/k*/stats.pe.txt > stocherr.pe.csv +./plot-n50-and-errs.R stocherr.plain.csv stocherr.links.csv stocherr.pe.csv "Stochastic coverage + Error (100X, 100bp reads, 0.5% err)" stocherr.cov.pdf + +./make-cleaning-table.py ../stocherr_cov/k*/graph.k*.dist.txt > cleaning.table.csv diff --git a/results/kmer_size_experiment/results/plot-n50-and-errs.R b/results/kmer_size_experiment/results/plot-n50-and-errs.R new file mode 100755 index 00000000..e93c9821 --- /dev/null +++ b/results/kmer_size_experiment/results/plot-n50-and-errs.R @@ -0,0 +1,84 @@ +#!/usr/bin/env Rscript --vanilla + +# Isaac Turner 2016-10-12 + +library('ggplot2') +library('reshape') +library('scales') +library('plyr') +library('gridExtra') +library('cowplot') + +args <- commandArgs(trailingOnly=TRUE) +if(length(args) != 5) { + stop("Usage: ./plot-n50-and-errs.R <out.pdf>\n") +} + +plain_csv <- "perfect.plain.csv" +links_csv <- "perfect.links.csv" +pe_csv <- "perfect.pe.csv" +plot_title <- "Perfect coverage (100X, 100bp reads)" + +# plain_csv <- "stoch.plain.csv" +# links_csv <- "stoch.links.csv" +# pe_csv <- "stoch.pe.csv" +# plot_title <- "Stochastic coverage (100X, 100bp reads)" + +# plain_csv <- "stocher.plain.csv" +# links_csv <- "stocher.links.csv" +# pe_csv <- "stocherr.pe.csv" +# plot_title <- "Stochastic coverage 
+ Error (100X, 100bp reads, 1% err)" + +# output_pdf <- "plot.pdf" + +plain_csv <- args[1] +links_csv <- args[2] +pe_csv <- args[3] +plot_title <- args[4] +output_pdf <- args[5] + +a <- read.table(plain_csv,sep=',',head=T,comment.char='#',as.is=T) +a$graph = factor('plain') +b <- read.table(links_csv,sep=',',head=T,comment.char='#',as.is=T) +b$graph = factor('links') +c <- read.table(pe_csv,sep=',',head=T,comment.char='#',as.is=T) +c$graph = factor('pe') +d <- rbind(a,b,c) +d$graph <- factor(d$graph, levels=c('pe','links','plain'), labels=c('links PE','links','plain')) + +# Approach 1 +# Plot contig N50 +p1 <- ggplot(data=d, aes(x=K, y=NG50, color=graph)) + theme_minimal() + + theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + + geom_point(shape=4) + geom_line() + + scale_y_continuous(limits = c(0,250000)) + + ylab("NG50") + ggtitle(plot_title) + + theme(legend.title=element_blank()) + # hide legend title + theme(legend.justification=c(0,1), legend.position=c(0,1)) # legend in plot top left + +# Plot assembly error rate +p2 <- ggplot(data=d, aes(x=K, y=AssemblyErrors, color=graph)) + theme_minimal() + + geom_point(shape=4) + geom_line() + + scale_y_continuous(breaks=seq(0,150,50)) + coord_cartesian(ylim=c(0,150)) + + ylab("Assembly Errors") + + theme(legend.position="none") # hide legend + +# 1a. +# grid.arrange(p1, p2, ncol=1, heights=c(2, 1)) + +# 1b. +g <- plot_grid(p1, p2, align="v", nrow=2, rel_heights=c(3, 1)) + +# 1c. 
+# grid.newpage() +# grid.draw(rbind(ggplotGrob(q), ggplotGrob(s), size = "last")) + +# Approach 2 +# m <- melt(d,measure.vars=c('NG50','AssemblyErrors')) + +# ggplot(m, aes(x=K, y=value)) + +# geom_line(aes(color=graph)) + +# facet_grid(variable ~ ., scales="free_y", space="fixed") + +# xlab("kmer size") + ylab("") + ggtitle(plot_title) + +ggsave(g, file=output_pdf, width=6, height=6) diff --git a/results/kmer_size_experiment/runk.mk b/results/kmer_size_experiment/runk.mk new file mode 100644 index 00000000..ac39def1 --- /dev/null +++ b/results/kmer_size_experiment/runk.mk @@ -0,0 +1,97 @@ +# +# We assume fragment length is 400bp for PE threading +# + +REQFIELDS=INPUT NAME K REF + +ifndef INPUT + $(error "Error: you need to pass the input file INPUT=$(INPUT) ($(REQFIELDS))") +endif + +ifndef NAME + $(error "Error: you need to pass the project dir NAME= ($(REQFIELDS))") +endif + +ifndef K + $(error "Error: you need to pass kmer size K= ($(REQFIELDS))") +endif + +ifndef REF + $(error "Error: you need to pass the ref file REF= ($(REQFIELDS))") +endif + +CTXDIR=../.. 
+MCCORTEX=$(CTXDIR)/bin/mccortex $(K) +MCCORTEX31=$(CTXDIR)/bin/mccortex 31 +DNACAT=$(CTXDIR)/libs/seq_file/bin/dnacat +PYSTATS=python $(CTXDIR)/scripts/python/break-contigs-vs-truth.py + +DIR=$(NAME)/k$(K) +MEM=1G + +RAWGRAPH=$(DIR)/graph.k$(K).raw.ctx +CLEANGRAPH=$(DIR)/graph.k$(K).clean.ctx + +GRAPH=$(RAWGRAPH) +SELINKS=$(DIR)/graph.k$(K).se.raw.ctp.gz +PELINKS=$(DIR)/graph.k$(K).pe.raw.ctp.gz +LINKSTATS=$(DIR)/graph.k$(K).linkstats.txt +PERFECTGRAPH=perfect_cov/k$(K)/graph.k$(K).raw.ctx +TGTS=$(DIR)/stats.plain.txt $(DIR)/stats.links.txt $(DIR)/stats.pe.txt + +ifdef CLEAN + GRAPH=$(CLEANGRAPH) + SELINKS=$(DIR)/graph.k$(K).se.clean.ctp.gz + PELINKS=$(DIR)/graph.k$(K).pe.clean.ctp.gz + TGTS := $(TGTS) $(DIR)/graph.k$(K).dist.txt +endif + +# Keep all files +.SECONDARY: + +all: $(TGTS) + +clean: + rm -rf $(DIR) + +$(DIR)/graph.k$(K).raw.ctx: $(INPUT) | $(DIR) + $(MCCORTEX) build -m $(MEM) -k $(K) -s KmerExperiment -1 $(INPUT) $@ >& $@.log + +$(DIR)/graph.k$(K).clean.ctx: $(DIR)/graph.k$(K).raw.ctx + $(MCCORTEX) clean -m $(MEM) --fallback 3 -o $@ $< >& $@.log + +$(DIR)/graph.k$(K).se.raw.ctp.gz: $(GRAPH) $(INPUT) + $(MCCORTEX) thread -m $(MEM) -o $@ -1 $(INPUT) $(GRAPH) >& $@.log + +$(DIR)/graph.k$(K).pe.raw.ctp.gz: $(GRAPH) $(INPUT) $(SELINKS) + $(MCCORTEX) thread -m $(MEM) -p $(SELINKS) -0 -l 350 -L 450 -o $@ -i $(INPUT) $(GRAPH) >& $@.log + +$(DIR)/graph.k$(K).linkstats.txt: $(DIR)/graph.k$(K).se.raw.ctp.gz + $(MCCORTEX) links -T $@ -L 1000 $< 2> $@.log + +$(DIR)/graph.k$(K).%.clean.ctp.gz: $(DIR)/graph.k$(K).%.raw.ctp.gz $(LINKSTATS) + ( LINK_THRESH=`grep 'suggested_cutoff=' $(LINKSTATS) | grep -oE '[0-9,]+$$'`; \ + $(MCCORTEX) links --clean $$LINK_THRESH -o $@ $< >& $@.log ) + +$(DIR)/graph.k$(K).dist.txt: $(RAWGRAPH) $(CLEANGRAPH) $(PERFECTGRAPH) + $(MCCORTEX) dist -m $(MEM) -o $@ $(RAWGRAPH) $(CLEANGRAPH) $(PERFECTGRAPH) >& $@.log + +$(DIR)/contigs.plain.fa: $(GRAPH) + $(MCCORTEX) contigs -m $(MEM) -o $@ $< >& $@.log + +$(DIR)/contigs.links.fa: $(GRAPH) 
$(SELINKS) + $(MCCORTEX) contigs -m $(MEM) -p $(SELINKS) -o $@ $< >& $@.log + +$(DIR)/contigs.pe.fa: $(GRAPH) $(PELINKS) + $(MCCORTEX) contigs -m $(MEM) -p $(PELINKS) -o $@ $< >& $@.log + +$(DIR)/contigs.%.rmdup.fa: $(DIR)/contigs.%.fa + $(MCCORTEX31) rmsubstr -m $(MEM) -n 50M -k 21 -o $@ $< >& $@.log + +$(DIR)/stats.%.txt: $(DIR)/contigs.%.rmdup.fa + $(DNACAT) -P $(REF) $< | $(PYSTATS) 21 2> $@ 1> $(DIR)/stats.$*.out + +$(DIR): + mkdir -p $(DIR) + +.PHONY: all clean diff --git a/scripts/mccortex b/scripts/mccortex index 33293bc9..3182cdac 100755 --- a/scripts/mccortex +++ b/scripts/mccortex @@ -17,6 +17,12 @@ fi K=$1 shift +if [[ $[ $K & 1 ] -eq 0 || $K -lt 3 ]] +then + echo "kmer is not odd and greater than 2: $K" >&2 + exit -1 +fi + MAXK=$[ (($K+31)/32)*32 - 1 ] PARENTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && cd .. && pwd )" CMD=$PARENTDIR/bin/mccortex$MAXK diff --git a/scripts/mccortex-server.py b/scripts/mccortex-server.py deleted file mode 100755 index 6723b72a..00000000 --- a/scripts/mccortex-server.py +++ /dev/null @@ -1,128 +0,0 @@ -#!/usr/bin/env python - -from __future__ import print_function - -import os -import sys -import time -import signal - -from subprocess import Popen, PIPE - -# These should work on python2 after 'pip install --user future' -from http.server import BaseHTTPRequestHandler -import socketserver - -# -# Start server on port 2306, with link and graph files: -# python mccortex-server.py 2306 --coverages --edges -p l.ctp.gz a.ctx b.ctx -# Query a kmer: -# curl localhost:2096/CAGTGGCCA -# Response: -# { "key": "CAGTGGCCA", "colours": [1], "left": "T", "right": "T", "edges": "88", "links": [] } -# - -def check_mccortex_alive(proc): - if proc.poll() is not None: - print("McCortex quit [%i]" % proc.returncode, file=sys.stdout) - sys.exit(1) - -def query_mccortex(proc,kmer): - print(kmer, file=proc.stdin) - proc.stdin.flush() - check_mccortex_alive(proc) - line = proc.stdout.readline() - # Trim off prompt text - if line[0:2] == "> ": 
line = line[2:len(line)] - check_mccortex_alive(proc) - return line - -# when we start mccortex we set it to ignore interrupt signal, we handle it. -def preexec_function(): - # Ignore the SIGINT signal by setting the handler to the standard - # signal handler SIG_IGN. - signal.signal(signal.SIGINT, signal.SIG_IGN) - -def start_mccortex(extra_args): - script_dir = os.path.dirname(os.path.realpath(__file__)) - - # Adding two lists together appends one to the other - try: - proc = Popen([script_dir+"/../bin/mccortex31", "server", "--single-line"] + extra_args, - stdin=PIPE, stdout=PIPE, universal_newlines=True, - preexec_fn = preexec_function) - except Exception as e: - print("Couldn't start McCortex: ",e,file=sys.stdout) - sys.exit(1) - - # Give test query to check it works - check_mccortex_alive(proc) - resp = query_mccortex(proc, "hi") - - return proc - -def stop_mccortex(mccortex): - print("q\n", file=mccortex.stdin) - mccortex.stdin.close() - # sleep until process has closed - while mccortex.poll() is None: - time.sleep(1) - print("McCortex exited with:",str(mccortex.poll())) - -def test_mccortex(args): - proc = start_mccortex(args) - print("Got: ", query_mccortex(proc, "CACTGATGA"), end='') - print("Got: ", query_mccortex(proc, "CCACTGATG"), end='') - try: - while True: - pass - except (KeyboardInterrupt,SystemExit): - print("Got exit signal") - stop_mccortex(proc) - -def start_web_server(port,args): - mccortex = start_mccortex(args) - - class mccortexHTTPServer(BaseHTTPRequestHandler): - def do_GET(self): - self.send_response(200) - self.send_header("Content-type", "text/html") - self.end_headers() - if len(self.path) < 4 or len(self.path) > 300: - jsonstr = "{\"error\": \"Webserver: bad query\"}\n" - else: - jsonstr = query_mccortex(mccortex, self.path[1:]) - self.wfile.write(jsonstr.encode("UTF-8")) - - try: - httpd = socketserver.TCPServer(("", port), mccortexHTTPServer) - except Exception as e: - print("Cannot start HTTP server: ",e,file=sys.stderr) - 
sys.exit() - - print("serving at port", port) - - try: - httpd.serve_forever() - except (KeyboardInterrupt,SystemExit): - print(" Got exit signal") - - httpd.server_close() - stop_mccortex(mccortex) - print("closed. bye.") - - -def main(): - if len(sys.argv) < 3 or not sys.argv[1].isdigit(): - print("usage: %s <port> [mccortex args]" % (sys.argv[0])) - print(" e.g %s 1888 -m 2G graph.ctx" % (sys.argv[0])) - sys.exit(-1) - - port = int(sys.argv[1]) - args = sys.argv[2:] - - # test_mccortex(args) - start_web_server(port,args) - -if __name__ == '__main__': - main() diff --git a/scripts/python/break-contigs-vs-truth.py b/scripts/python/break-contigs-vs-truth.py new file mode 100644 index 00000000..03ccfd1c --- /dev/null +++ b/scripts/python/break-contigs-vs-truth.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python +from __future__ import print_function +try: input = raw_input +except: pass + +# usage: python break-contigs-vs-truth.py <k> [input.txt] +# input.txt: +# <master> :string +# <query1> :string +# <query2> :string +# ... +# +# Output: +# <contig-id> <ref-strand> <ref-start> <contig-substr> +# ... +# +# For each query string, report all maximal substring alignments between it and +# the master string. Alignments are of length >= k and can include single base +# mismatches as long as they are flanked by k matching bases. +# +# Next we print coverage of ref using maximal alignments to contigs. +# Finally we report NG50 and number of assembly errors. +# +# Time is on average NlogN where N is number of query bases. Worst case N^2 +# e.g. aaaaaaaaaaaa vs aaaaaaaaaaaa +# Memory usage is linear with length of the master string and number of maximal +# substring matches. 
+# +# Isaac Turner 2016-09-10 +# MIT License + + +import fileinput +import pyRBT +import sys +from collections import defaultdict + +class Alignment: + def __init__(self,seqid,start1,start2,length): + self.seqid = seqid + self.start1,self.start2,self.length = start1,start2,length + def __cmp__(x,y): + if x.seqid != y.seqid: return x.seqid - y.seqid + xoff,yoff = x.start1-x.start2,y.start1-y.start2 + if xoff != yoff: return xoff - yoff + if x.start1 >= y.start1+y.length: return 1 + if y.start1 >= x.start1+x.length: return -1 + return 0 # one is within the other + def __lt__(x,y): return x.__cmp__(y) > 0 + def __ge__(x,y): return x.__cmp__(y) >= 0 + def __eq__(x,y): return x.__cmp__(y) == 0 + def __ne__(x,y): return x.__cmp__(y) != 0 + def __le__(x,y): return x.__cmp__(y) <= 0 + def __lt__(x,y): return x.__cmp__(y) < 0 + def __str__(self): + return ("Alignment(id:"+str(self.seqid)+"," + + "starts:("+str(self.start1)+","+str(self.start2)+")," + + "len:"+str(self.length)+")") + +def dna_reverse_complement(s): + h = {'a':'t','c':'g','g':'c','t':'a','A':'T','C':'G','G':'C','T':'A'} + t = [ h.get(c,c) for c in s ] + return ''.join(t[::-1]) + +# upper case +def build_occ_hash(occ,k,seq,seqid): + for i in range(len(seq)-k+1): + occ[seq[i:i+k].upper()].append((seqid,i)) + +# Extend a match as far as we have an exact match or a mismatch flanked by k +# exact matches either side. 
Seeded from an initial matching kmer +def extend_match_kmer_match(a,b,s1,e1,s2,e2,M): + while True: + while s1 > 0 and s2 > 0 and a[s1-1].upper() == b[s2-1].upper(): s1,s2 = s1-1,s2-1 + if s1 > M and s2 > M and a[s1-M-1:s1-1].upper() == b[s2-M-1:s2-1].upper(): + s1 -= M+1; s2 -= M+1 + else: break + while True: + while e1 < len(a) and e2 < len(b) and a[e1].upper() == b[e2].upper(): e1,e2 = e1+1,e2+1 + if e1+M < len(a) and e2+M < len(b) and a[e1+1:e1+1+M].upper() == b[e2+1:e2+1+M].upper(): + e1 += M+1; e2 += M+1 + else: break + return (s1,e1,s2,e2) + +def count_str_matches(a,b): + return sum([ i.upper() == j.upper() for i,j in zip(a,b) ]) + +# Extend a match as far as we have an exact match or N matches within M bases +def extend_match_kmer_match2(a,b,s1,e1,s2,e2,M,N): + while True: + while s1 > 0 and s2 > 0 and a[s1-1].upper() == b[s2-1].upper(): s1,s2 = s1-1,s2-1 + if s1 > M and s2 > M and count_str_matches(a[s1-1-M:s1-1], b[s2-1-M:s2-1]) >= N: + s1 -= M+1; s2 -= M+1 + else: break + while True: + while e1 < len(a) and e2 < len(b) and a[e1].upper() == b[e2].upper(): e1,e2 = e1+1,e2+1 + if e1+M < len(a) and e2+M < len(b) and count_str_matches(a[e1+1:e1+1+M], b[e2+1:e2+1+M]) >= N: + e1 += M+1; e2 += M+1 + else: break + return (s1,e1,s2,e2) + +# Segment tree +# Query if there is an uncovered region in a set of intervals +class GapSegNode: + def __init__(self,start,end,l=None,r=None): + (self.start,self.end,self.l,self.r,self.regs) = (start,end,l,r,[]) + def add_interval(self,reg): + if reg[0] >= self.end or reg[1] <= self.start: return # does not overlap + if reg[0] <= self.start and reg[1] >= self.end: # contained + self.regs.append(reg) + else: + if self.l is not None: self.l.add_interval(reg) + if self.r is not None: self.r.add_interval(reg) + def gap_in_interval(self,reg): + if reg[0] >= self.end or reg[1] <= self.start: return False # does not overlap + if len(self.regs) > 0: return False # this node is covered + if self.l is None and self.r is None: return True # 
leaf node + return ((self.l is not None and self.l.gap_in_interval(reg)) or + (self.r is not None and self.r.gap_in_interval(reg))) + def __str__(self): + return "GapSegNode("+str(self.start)+","+str(self.end)+")" + @staticmethod + def build_tree(start,end): + # build tree bottom up + assert start < end + l = [ GapSegNode(i,i+1) for i in range(start,end) ] + while len(l) > 1: + N = 2*int(len(l)//2) + m = [ GapSegNode(l[i].start,l[i+1].end,l[i],l[i+1]) for i in range(0,N,2) ] + if len(l)%2 == 1: m.append(GapSegNode(l[-1].start, l[-1].end, l[-1])) + l = m + return l[0] + +# `l` is a list of alignments of sequence `seq` against the ref +# for each kmer in seq, keep the longest substring in l that covers it +# discard all substrings that are not kept +def reduce_alignments(seq,l): + # sort alignments by length (longest to shortest) + l.sort(key=lambda x: -x.length) + # iterate over alignments, only taking those that cover uncovered kmers + gst = GapSegNode.build_tree(0,len(seq)) + keep = [] + for aln in l: + reg = (aln.start2, aln.start2+aln.length) + if gst.gap_in_interval(reg): + keep.append(aln) + gst.add_interval(reg) + return keep + +# l is a list of (start,length) contig alignments +# removes substrings from l (BEWARE: they get deleted!) 
+# returns contig_index,contig_length +def ng50_from_coverage(l,reflen): + # sort by start (ascending) and length (descending) to remove substrings + # on the ref + l.sort(key=lambda x: (x[0],-x[1])) + j = end = 0 + for x in l: + if x[0]+x[1] > end: + end = x[0]+x[1] + l[j] = x + j += 1 + del(l[j:]) + # sort by length (descending), start position (ascending) + l.sort(key=lambda x: (-x[1],x[0])) + halflen = reflen//2 + lensum = n = 0 + while lensum < halflen: + if n+1 == len(l): + print("Warning: haven't assembled half of ref, NG50 is underestimate", + file=sys.stderr) + break + lensum += l[n][1] + n += 1 + return (n,l[n][1]) + +# for a given alignment, get left hand position on + strand +def convert_ref_strandpos(aln,reflen): + return aln.start1 if aln.seqid&1 == 0 else reflen-(aln.start1+aln.length) + +def main(k,path): + master = input().strip() + masterrc = dna_reverse_complement(master) + print(master) + print(masterrc) + occ = defaultdict(list) # [ (strand,pos) ... ] + build_occ_hash(occ,k,master,0) + build_occ_hash(occ,k,masterrc,1) + n_asm_errors = n_no_matches = n_output = 0 + m_cov = [] + strands = ["+","-"] + print("# Matching contigs sections") + print("# contig-id contig-substr ref-start ref-strand") + # Iterate over queries + qi = 0 + for q in fileinput.input(path): + q = q.strip() + rbt = pyRBT.pyRBT() + l = [] + for i in range(len(q)-k+1): + kmer = q[i:i+k].upper() + for (seqid,pos) in occ[kmer]: + # create alignment + aln = Alignment(seqid,pos,i,k) + if aln not in rbt: + # extend alignment + s1,s2 = pos,i + m = master if seqid==0 else masterrc + (s1,e1,s2,e2) = extend_match_kmer_match(m,q,s1,s1+k,s2,s2+k,5) + # (s1,e1,s2,e2) = extend_match_kmer_match2(m,q,s1,s1+k,s2,s2+k,10,6) + aln.start1,aln.start2,aln.length = s1,s2,e1-s1 + # store and print + rbt.insert(aln) + l.append(aln) + for x in l: + s = convert_ref_strandpos(x,len(master)) + m_cov.append((s,x.length,qi,x.seqid&1,x.start2)) + l = reduce_alignments(q,l) + for x in l: + s = 
convert_ref_strandpos(x,len(master)) + print(qi,q[x.start2:x.start2+x.length],s,strands[x.seqid&1]) + n_asm_errors += max(0,len(l)-1) + n_no_matches += (len(l) == 0) + n_output += len(l) + qi += 1 + (_,ng50) = ng50_from_coverage(m_cov,len(master)) + print() # empty line separates output, now print ref matches + print("# Ref positions assembled (longest to shortest)") + print("# start positions are 0-based and indicate the left hand position") + print("# ref-start length contig-id contig-start contig-strand") + m_cov.sort() # sort by start, length (both ascending) + for x in m_cov: print(x[0],x[1],x[2],x[4],strands[x[3]]) + print("contigs_read:",qi,file=sys.stderr) + print("contigs_printed:",n_output,file=sys.stderr) + print("assembly_errors:",n_asm_errors,file=sys.stderr) + print("nomatch_contigs:",n_no_matches,file=sys.stderr) + # number of unique segments of the reference that were assembled + print("num_uniq_ref_segs:",len(m_cov),file=sys.stderr) + print("reflen:",len(master),file=sys.stderr) + print("NG50:",ng50,file=sys.stderr) + +def usage(err=None): + if err is not None: print(err,file=sys.stderr) + print("python break-contigs-vs-truth.py <k> [contigs.txt]",file=sys.stderr) + exit(-1) + +if __name__ == '__main__': + if len(sys.argv) < 2 or len(sys.argv) > 3: usage() + try: k = int(sys.argv[1]) + except ValueError: usage("Error: invalid kmer value '"+sys.argv[1]+"'") + path = sys.argv[2] if len(sys.argv) > 2 else "-" + print("k:",k,"path:",path,file=sys.stderr) + main(k,path) diff --git a/scripts/python/count-bad-edges.py b/scripts/python/count-bad-edges.py new file mode 100644 index 00000000..7e361150 --- /dev/null +++ b/scripts/python/count-bad-edges.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +from __future__ import print_function +try: input = raw_input +except: pass +import sys + +# For various kmer sizes, find the number of sequencing errors that would add a +# new edges between two existing kmers +# Note: there are 3*reflen possible mutations + +cov = 
None +seqn_err_rate = None + +def usage(): + print("usage: python count-bad-edges.py [<cov> <seqn-err-rate>]",file=sys.stderr) + exit(-1) + +if len(sys.argv) != 1 and len(sys.argv) != 3: usage() +if len(sys.argv) == 3: + try: cov,seqn_err_rate = int(sys.argv[1]),float(sys.argv[2]) + except: usage() + +s = input().strip().upper() +kmers = [21,31,41,51,61,71,81,91,99] + +def est_num_of_added_edges(reflen,nerror_edges,cov,seqn_err_rate): + nerrors = cov * reflen * seqn_err_rate + bad_error_rate = nerror_edges / (3.0 * reflen) # errs that create new edges + return int(nerrors*bad_error_rate) + +print("# The number of sequencing errors that would add a new edge between two") +print("# existing kmers. Note: there are 3*reflen possible mutations") + +cols = ["kmer","reflen","nkmers","nedges","nerror_edges"] +if cov is not None: cols.extend(["cov","err_rate","est_bad_edges"]) +print(",".join([str(x) for x in cols])) + +for k in kmers: + kmers = set() + edges = set() + for i in range(len(s)-k): + kmers.add(s[i:i+k]) + edges.add(s[i:i+k+1]) + kmers.add(s[-k:]) + err_edges = 0 + pk = s[0:k] # first kmer + for i in range(1,len(s)-k+1): + nextb = s[i+k-1] + for c in "ACGT": + err_edges += (c != nextb and pk[1:]+c in kmers and pk+c not in edges) + pk = s[i:i+k] + cols = [k,len(s),len(kmers),len(edges),err_edges] + if cov is not None: + cols.extend([cov, seqn_err_rate, + est_num_of_added_edges(len(s),err_edges,cov,seqn_err_rate)]) + print(",".join([str(x) for x in cols])) diff --git a/scripts/python/mccortex-server.py b/scripts/python/mccortex-server.py new file mode 100755 index 00000000..c81e3429 --- /dev/null +++ b/scripts/python/mccortex-server.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python + +from __future__ import print_function + +import os +import sys +import time +import signal + +from subprocess import Popen, PIPE + +# These should work on python2 after 'pip install --user future' +from http.server import BaseHTTPRequestHandler +import socketserver + +# Opening browser 
windows +import webbrowser +import threading + +# +# Start server on port 2306, with link and graph files: +# python mccortex-server.py 2306 --coverages --edges -p l.ctp.gz a.ctx b.ctx +# Query a kmer: +# curl localhost:2096/CAGTGGCCA +# Response: +# { "key": "CAGTGGCCA", "colours": [1], "left": "T", "right": "T", "edges": "88", "links": [] } +# + +def usage(err=None): + if err is not None: print(err,file=sys.stderr) + print("usage: %s <port> [-W,--web|-k,--kmer <K>] [mccortex args]" % (sys.argv[0])) + print(" e.g %s 1888 -m 2G graph.ctx" % (sys.argv[0])) + sys.exit(-1) + +# when we start mccortex we set it to ignore interrupt signal, we handle it. +def preexec_function(): + # Ignore the SIGINT signal by setting the handler to the standard + # signal handler SIG_IGN. + signal.signal(signal.SIGINT, signal.SIG_IGN) + +class McCortexProc: + def __init__(self,kmer=31,extra_args=[]): + self.lock = threading.Lock() + script_dir = os.path.dirname(os.path.realpath(__file__)) + try: + self.proc = Popen([script_dir+"/../../bin/mccortex", str(kmer), + "server", "--single-line"] + extra_args, + stdin=PIPE, stdout=PIPE, universal_newlines=True, + preexec_fn = preexec_function) + except Exception as e: + print("Couldn't start McCortex: ",e,file=sys.stdout) + sys.exit(1) + def query_mccortex(self,query): + with self.lock: + print(query, file=self.proc.stdin) + self.proc.stdin.flush() + self.check_mccortex_alive() + line = self.proc.stdout.readline() + # Trim off prompt text + if line[0:2] == "> ": line = line[2:len(line)] + self.check_mccortex_alive() + return line + def check_mccortex_alive(self): + if self.proc.poll() is not None: + print("McCortex quit [%i]" % self.proc.returncode, file=sys.stdout) + sys.exit(1) + def stop_mccortex(self): + print("q\n", file=self.proc.stdin) + self.proc.stdin.close() + # sleep until process has closed + while self.proc.poll() is None: time.sleep(1) + print("McCortex exited with:",str(self.proc.poll())) + +def test_mccortex(args): + mcproc = 
McCortexProc(9,args) + print("Got: ", mcproc.query_mccortex("CACTGATGA"), end='') + print("Got: ", mcproc.query_mccortex("CCACTGATG"), end='') + try: + while True: pass + except (KeyboardInterrupt,SystemExit): + print(" Got exit signal") + mcproc.stop_mccortex() + +def launch_webpage(url,delay=1): + time.sleep(delay) + webbrowser.open(url) + +def start_web_server(port,args,kmer=31,webpage=None): + mcproc = McCortexProc(kmer,args) + + class mccortexHTTPServer(BaseHTTPRequestHandler): + def do_GET(self): + if len(self.path) < 4 or len(self.path) > 300: + jsonstr = "{\"error\": \"Webserver: bad query\"}\n" + else: + jsonstr = mcproc.query_mccortex(self.path[1:]) + self.send_response(200) + self.send_header("Content-type", "text/html") + self.end_headers() + self.wfile.write(jsonstr.encode("utf-8")) + + # Reuse old open port (http://stackoverflow.com/a/10614360/431087) + # to avoid error: [Errno 48] Address already in use + class MyTCPServer(socketserver.TCPServer, socketserver.ThreadingMixIn): + allow_reuse_address = True + + try: + httpd = MyTCPServer(("", port), mccortexHTTPServer) + except Exception as e: + print("Cannot start HTTP server: ",e,file=sys.stderr) + sys.exit() + + # Open webpage if requested, after pausing for 1 sec + if webpage is not None: + threading.Thread(target=launch_webpage,args=(webpage,)).start() + + print("serving at port", port) + try: httpd.serve_forever() + except (KeyboardInterrupt,SystemExit): + print(" Got exit signal") + httpd.server_close() + mcproc.stop_mccortex() + print("Goodbye.") + +def main(): + kmer = 31 + webpage = None + if len(sys.argv) < 3 or not sys.argv[1].isdigit(): usage() + port = int(sys.argv[1]) + while len(sys.argv) > 2: + if sys.argv[2] in ['-W','--web']: + webpage = "http://127.0.0.1:"+str(port) + del(sys.argv[2]) + elif sys.argv[2] in ['-k','--kmer'] and len(sys.argv) > 3: + try: kmer = int(sys.argv[3]) + except ValueError: usage("Invalid --kmer value: "+sys.argv[3]) + del(sys.argv[2:4]) + else: break + args = 
sys.argv[2:] + # test_mccortex(args) + start_web_server(port,args,kmer,webpage) + +if __name__ == '__main__': + main() diff --git a/scripts/python/mccortex.py b/scripts/python/mccortex.py index 1d6caeee..725b1ef2 100644 --- a/scripts/python/mccortex.py +++ b/scripts/python/mccortex.py @@ -49,14 +49,18 @@ def quack(): def reverse_complement(s): complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', \ 'a': 't', 'c': 'g', 'g': 'c', 't': 'a'} - bases = list(s) # get returns the base if not in dict - bases = reversed([complement.get(base,base) for base in bases]) + # get returns the base if not in dict + bases = reversed([complement.get(b,b) for b in s]) return ''.join(bases) def dna_key(s): r = reverse_complement(s) return s if s <= r else r +# given a string s and kmer size `k`, return a list of the kmers (contains dups) +def kmers(s,k): + return [ s[i:i+k] for i in range(len(s)-k+1) ] + def load_fasta(path): """ Load all chromosomes from a FASTA file. diff --git a/scripts/python/pyRBT.py b/scripts/python/pyRBT.py new file mode 100644 index 00000000..fbf741e8 --- /dev/null +++ b/scripts/python/pyRBT.py @@ -0,0 +1,597 @@ +from __future__ import print_function + +# https://en.wikipedia.org/wiki/Red%E2%80%93black_tree +# Invariants: +# 1. A node is either red or black. +# 2. The root is black. +# 3. All leaves are black. +# 4. If a node is red, then both its children are black. +# 5. Every path from a given node to a leaf node has the same no. of black nodes. +# Results in: +# 6. Longest root->leaf path is no more than twice as long as shortest root->leaf +# (i.e. 
roughly balanced) +# +# Black depth of the tree is the number of back nodes from root to any leaf +# Longest path is 2*B-1 nodes where B is the black depth of the tree +# Shortest path is B nodes + +class pyRBT: + class RBLeaf: + def __init__(self): self.size = 0 + def isblack(self): return True + def isred(self): return False + def isleaf(self): return True + def __str__(self): return "RBLeaf" + def __len__(self): return 0 + def treestr(self): return "." + + class RBNode: + def __init__(self,value,black=True): + self.value = value + self.black = black + self.size = 1 + self.l = self.r = pyRBT.leaf + def isblack(self): return self.black + def isred(self): return not self.black + def isleaf(self): return False + def __len__(self): return self.size + def __str__(self): + return "RBNode("+str(self.value)+","+("black" if self.black else "red")+")" + def treestr(self): + col = "B" if self.black else "R" + return "("+self.l.treestr()+","+str(self.value)+":"+col+","+self.r.treestr()+")" + + class RBIterator: + def __init__(self,tree,reverse=False,retpaths=False): + self.tree = tree + self.path = [] + self.fwd = not reverse + self.retpaths = retpaths + def __iter__(self): return self + def next(self): return self.__next__() + def __next__(self): + p = self.path + if len(p) == 0: + if self.tree.root.isleaf(): raise StopIteration() # empty tree + p.append(self.tree.root) + if self.fwd: + while not p[-1].l.isleaf(): p.append(p[-1].l) + else: + while not p[-1].r.isleaf(): p.append(p[-1].r) + elif self.fwd and not p[-1].r.isleaf(): + # Take the secondary fork (right fork when forward) + p.append(p[-1].r) + while not p[-1].l.isleaf(): p.append(p[-1].l) + elif not self.fwd and not p[-1].l.isleaf(): + # Take the secondary fork (left fork when reverse) + p.append(p[-1].l) + while not p[-1].r.isleaf(): p.append(p[-1].r) + else: + # Back up the tree + last = p.pop() # remove last leaf + # find first parent node that used left node + if self.fwd: + while len(p) > 0 and last == 
p[-1].r: last = p.pop() + else: + while len(p) > 0 and last == p[-1].l: last = p.pop() + if len(p) == 0: raise StopIteration() + return p if self.retpaths else p[-1].value + + leaf = RBLeaf() + + def __init__(self): + self.root = pyRBT.leaf + + def __len__(self): + return self.root.size + + # Editing the tree voids any iterators! Do not edit the tree whilst iterating. + def __iter__(self): + return pyRBT.RBIterator(self,False,False) + + # Get a reverse iterator by overriding reversed(...) + def __reversed__(self): + return pyRBT.RBIterator(self,True,False) + + # Iterator that returns paths to each node + def paths(self,reverse=False): + return pyRBT.RBIterator(self,reverse,True) + + # Get a string representation of the tree + def __str__(self): + return self.root.treestr() + + # override the square bracket operator [] to get a value by index + def __getitem__(self,i): + return self.get(i) + + # setitem not defined since we don't map from key -> value + # def __setitem__(self,key,value): + + # override the del operation to delete a value by index + def __delitem__(self,i): + self.pop(i) + + def __contains__(self,item): + return self.find(item) is not None + + def clear(self): + self.root = pyRBT.leaf + + # compare two Red Black Trees lexicographically + # [1] < [2] < [1,1] < [1,2] < [1,2,0] + def __cmp__(x,y): + if len(x) != len(y): return len(x) - len(y) + for (a,b) in zip(x,y): + if a != b: return a-b + return 0 + + def __gt__(x,y): return x.__cmp__(y) > 0 + def __ge__(x,y): return x.__cmp__(y) >= 0 + def __eq__(x,y): return x.__cmp__(y) == 0 + def __ne__(x,y): return x.__cmp__(y) != 0 + def __le__(x,y): return x.__cmp__(y) <= 0 + def __lt__(x,y): return x.__cmp__(y) < 0 + + # p is a path to a node path[-1] + @staticmethod + def _parent(path): + return path[-2] if len(path) >= 2 else None + + @staticmethod + def _grandparent(path): + return path[-3] if len(path) >= 3 else None + + @staticmethod + def _uncle(path): + gp = pyRBT._grandparent(path) + p = 
pyRBT._parent(path) + if gp is None: return None + return gp.r if p == gp.l else gp.l + + @staticmethod + def _sibling(path): + if len(path) < 2: return None + (pa,nd) = (path[-2],path[-1]) + return pa.r if pa.l == nd else pa.l + + def _replace_node(self,pa,ch,newch): + if pa is None: self.root = newch + elif pa.l == ch: pa.l = newch + else: pa.r = newch + + # + # pa -> ch + # / \ / \ + # ch pa + # / \ / \ + # + # input path to parent node, upon return path points to child (new parent) + def _rotate_left(self,path): + pa = path.pop() + ch = pa.r + (pa.r, ch.l) = (ch.l, pa) + pa.size = len(pa.l) + 1 + len(pa.r) + ch.size = len(ch.l) + 1 + len(ch.r) + self._replace_node(path[-1] if len(path) > 0 else None, pa, ch) + path.append(ch) + + # + # pa -> ch + # / \ / \ + # ch pa + # / \ / \ + # + # input path to parent node, upon return path points to child (new parent) + def _rotate_right(self,path): + pa = path.pop() + ch = pa.l + (pa.l, ch.r) = (ch.r, pa) + pa.size = len(pa.l) + 1 + len(pa.r) + ch.size = len(ch.l) + 1 + len(ch.r) + self._replace_node(path[-1] if len(path) > 0 else None, pa, ch) + path.append(ch) + + def _insert_case1(self,path): + # print(" tree:",self) + # print(" _insert_case1:",','.join([str(x) for x in path])) + if len(path) == 1: + self.root = path[0] + self.root.black = True + elif pyRBT._parent(path).isred(): + self._insert_case3(path) + + def _insert_case3(self,path): + # print(" tree:",self) + # print(" _insert_case3:",','.join([str(x) for x in path])) + assert len(path) > 1 and path[-2].isred() + # Assumption: parent exists and is red + # => therefore grandparent also exists and is black + gp = pyRBT._grandparent(path) + pa = pyRBT._parent(path) + un = pyRBT._uncle(path) + if un is not None and un.isred(): + pa.black = True + un.black = True + gp.black = False + path.pop() + path.pop() + self._insert_case1(path) # gp is now red, deal with it + else: + self._insert_case4(path) + + def _insert_case4(self,path): + # print(" tree:",self) + # 
print(" _insert_case4:",','.join([str(x) for x in path])) + gp = pyRBT._grandparent(path) + pa = pyRBT._parent(path) + nd = path.pop() # pop to rotate from parent + if nd == pa.r and pa == gp.l: + self._rotate_left(path) + path.append(nd.l) + elif nd == pa.l and pa == gp.r: + self._rotate_right(path) + path.append(nd.r) + else: + path.append(nd) + self._insert_case5(path) + + def _insert_case5(self,path): + # print(" tree:",self) + # print(" _insert_case5:",','.join([str(x) for x in path])) + gp = pyRBT._grandparent(path) + pa = pyRBT._parent(path) + nd = path[-1] + gp.black = False + pa.black = True + path.pop() # pop to rotate grandparent + path.pop() + if nd == pa.l: self._rotate_right(path) + else: self._rotate_left(path) + + # multiset = True allows multiple insertions of the same value + def insert(self,item,multiset=False): + if len(self) == 0: + self.root = pyRBT.RBNode(item) + else: + p,v = [],self.root + while not v.isleaf(): + p.append(v) + if not multiset and item == v.value: + v.value = item + return + v = (v.l if item < v.value else v.r) + v = pyRBT.RBNode(item,black=False) + if item < p[-1].value: p[-1].l = v + else: p[-1].r = v + # Need to update size + for node in p: node.size += 1 + p.append(v) + self._insert_case1(p) + + def extend(self,l): + for x in l: self.insert(x) + + # remove element from a given index + def pop(self,i=None): + if i is None: i = len(self)-1 + p = self.getpath(i) + return self._delete_path(p) + + # remove a given item + def remove(self,item): + p = self.findpath(item) + if len(p) == 0 or p[-1].value != item: + raise KeyError("RBT key '"+str(item)+"' not found") + return self._delete_path(p) + + def _delete_path(self,p): + nd = p[-1] + val = nd.value # value that is being deleted + # go right since we use the < and >= relations for left/right leaves + if not nd.r.isleaf(): + v = nd.r + p.append(v) + while not v.l.isleaf(): + v = v.l + p.append(v) + elif not nd.l.isleaf(): + v = nd.l + p.append(v) + while not v.r.isleaf(): + v 
= v.r + p.append(v) + nd.value = p[-1].value + for v in p: v.size -= 1 + self._delete_one_child(p) + return val + + def _delete_one_child(self,path): + node = path.pop() + child = (node.l if node.r.isleaf() else node.r) + self._replace_node(path[-1] if len(path) > 0 else None, node, child) + path.append(child) # may be appending a leaf node, this is OK in deletion + if node.isblack(): + if child.isred(): child.black = True + else: self._delete_case2(path) + # `node` is no longer in the tree + + # assume we have a parent + def _delete_case2(self,path): + if len(path) < 2: return + (pa,nd,sb) = (path[-2],path[-1],pyRBT._sibling(path)) + if sb.isred(): + pa.black = False + sb.black = True + path.pop() # pop to rotate parent + if nd == pa.l: self._rotate_left(path) + else: self._rotate_right(path) + path.append(pa) + path.append(nd) + self._delete_case3(path) + + def _delete_case3(self,path): + (pa,nd,sb) = (path[-2],path[-1],pyRBT._sibling(path)) + if pa.isblack() and sb.isblack() and sb.l.isblack() and sb.r.isblack(): + sb.black = False + path.pop() + self._delete_case2(path) # parent + else: + self._delete_case4(path) # node + + def _delete_case4(self,path): + (pa,nd,sb) = (path[-2],path[-1],pyRBT._sibling(path)) + if pa.isred() and sb.isblack() and sb.l.isblack() and sb.r.isblack(): + sb.black = False + pa.black = True + else: + self._delete_case5(path) + + def _delete_case5(self,path): + (pa,nd,sb) = (path[-2],path[-1],pyRBT._sibling(path)) + if sb.isblack(): + path.pop() # pop to rotate sibling + path.append(sb) + if nd == pa.l and sb.r.isblack() and sb.l.isred(): + sb.black = False + sb.l.black = True + self._rotate_right(path) + elif nd == pa.r and sb.l.isblack() and sb.r.isred(): + sb.black = False + sb.r.black = True + self._rotate_left(path) + path.pop() # remove sibling, re-add node + path.append(nd) + self._delete_case6(path) + + def _delete_case6(self,path): + (pa,nd,sb) = (path[-2],path[-1],pyRBT._sibling(path)) + sb.black = pa.black + pa.black = True + 
path.pop() # rotate parent + if nd == pa.l: + sb.r.black = True + self._rotate_left(path) + else: + assert nd == pa.r + sb.l.black = True + self._rotate_right(path) + + def find(self,item): + v = self.root + while not v.isleaf(): + if item == v.value: return item + v = (v.l if item < v.value else v.r) + return None + + # path = tree.findpath(x) + # => + # x == path[-1].value OR + # len(path) <= 1 OR + # x between path[-1].value and path[-2].value + def findpath(self,item,p=None): + if p is None: p,v = [],self.root + else: v = p.pop() + while not v.isleaf(): + p.append(v) + if item == v.value: return p + v = (v.l if item < v.value else v.r) + return p + + # fetch via index + # index is within `start` if passed + def get(self,i,start=None): + v = self.root if start is None else start + if i < 0: i += len(v) # allow negative indices + if i < 0 or i >= len(v): + raise IndexError("index out of range (%d vs 0..%d)" % (i, len(v))) + while not v.isleaf(): + if i < len(v.l): v = v.l + elif i == len(v.l): return v.value + else: + i -= len(v.l) + 1 + v = v.r + raise RuntimeError("Internal pyRBT error") + + # fetch path via index + # index is within `p[-1]` if passed + def getpath(self,i,p=None): + if p is None: p,v = [],self.root + else: v = p[-1] + if i < 0: i += len(v) # allow negative indices + if i < 0 or i >= len(v): + raise IndexError("index out of range (%d vs 0..%d)" % (i, len(v))) + while not v.isleaf(): + if not p or p[-1] is not v: p.append(v) + if i < len(v.l): v = v.l + elif i == len(v.l): return p + else: + i -= len(v.l) + 1 + v = v.r + raise RuntimeError("Internal pyRBT error") + + # Get the index of an item + def index(self,item,start=None): + v = self.root if start is None else start + i = 0 + idx = None + while not v.isleaf(): + if item < v.value: v = v.l + elif item == v.value: + # found one instance, look for earlier ones + idx = i+len(v.l) + v = v.l + else: + i += len(v.l) + 1 + v = v.r + if idx is None: raise KeyError('Key not found: '+str(item)) + return idx + + # Check data structure integrity by 
checking invariants are met + def check(self): + assert (len(self) == 0) == self.root.isleaf() # size is zero only if empty + assert self.root.isblack() # root node is black + nblack = -1 + npaths = 0 + for p in self.paths(): + # print("Check:",'->'.join([str(x) for x in p])) + assert not p[-1].isleaf() or p[-1].isblack() # all leaf nodes are black + if p[-1].isred(): + # all red nodes have only black children + assert p[-1].l.isblack() and p[-1].r.isblack() + # Every path from the root has the same number of black nodes + if p[-1].l.isleaf() or p[-1].r.isleaf(): + ntmpb = sum([ x.isblack() for x in p ]) + 1 + assert nblack == -1 or nblack == ntmpb + nblack = ntmpb + npaths += 1 + assert(npaths == len(self)) + # print('nblack:',nblack,'npaths:',npaths) + + +import random + +def _test_rbt(nums): + tree = pyRBT() + vals = [] # sorted values in the tree + # insert values into the tree + for v in nums: + tree.insert(v,True) + vals.append(v) + vals.sort() + tree.check() + assert sum([ x==y for x,y in zip(vals,iter(tree)) ]) == len(vals) + # Test indexing + for (i,v) in enumerate(vals): assert(tree[i] == v) + for (i,v) in enumerate(vals): assert tree.index(v) == vals.index(v) + # remove the values in a random order + rvals = list(nums) + random.shuffle(rvals) + for v in rvals: + tree.remove(v) + vals.remove(v) + tree.check() + assert sum([ x==y for x,y in zip(vals,iter(tree)) ]) == len(vals) + # Test indexing + for (i,v) in enumerate(vals): assert tree[i] == v + for (i,v) in enumerate(vals): assert tree.index(v) == vals.index(v) + # Re-build sorted remove forwards + sortedvals = sorted(vals) + tree.extend(sortedvals) + for v in sortedvals: assert sortedvals.remove(v) == tree.remove(v) + assert len(tree) == 0 + tree.check() + # Re-build sorted remove backwards + sortedvals = sorted(vals) + tree.extend(sortedvals) + for v in reversed(sortedvals): assert sortedvals.remove(v) == tree.remove(v) + assert len(tree) == 0 + tree.check() + +def _test_rbt_comparison(): + 
print("Testing RBT comparison...") + abc = pyRBT() + xyz = pyRBT() + abc.extend([1,2,3,9]) + xyz.extend([9,3,2,1]) + assert abc == xyz and abc >= xyz and abc <= xyz + assert not (abc > xyz) and not (abc < xyz) and not (abc != xyz) + abc.remove(3) + # abc < xyz + assert abc < xyz and xyz > abc and abc != xyz + assert not (abc > xyz) and not (abc >= xyz) and not (abc == xyz) + assert not (xyz < abc) and not (xyz <= abc) + abc.clear() + xyz.clear() + # abc == xyz + assert abc == xyz and abc <= xyz and abc >= xyz + assert not (abc != xyz) and not (abc < xyz) and not (abc > xyz) + # abc > xyz + abc.insert(1) + assert abc > xyz and xyz < abc and abc != xyz + assert not (abc < xyz) and not (abc <= xyz) and not (abc == xyz) + assert not (xyz > abc) and not (xyz >= abc) + + +def _test_red_black_tree(): + print("Testing RBT") + tree = pyRBT() + + # Insert [1,2,...,N] + print("Doing mixed automated tests...") + vals = list(range(1,100)) + _test_rbt(vals) # 1..N in sorted order + random.shuffle(vals) + _test_rbt(vals) # 1..N in random order + _test_rbt([]) # test empty set + _test_rbt([random.randrange(100) for x in range(200)]) # random numbers with repeats + _test_rbt([random.randrange(1000) for x in range(200)]) # random numbers with repeats + _test_rbt([1]*10) # multiple 1s + + # Test python features + vals = list(range(1,10)) + sortedvals = sorted(list(vals)) # sorted copy + random.shuffle(vals) + for v in vals: + tree.insert(v,True) + tree.check() + # -- Testing iterate paths -- + # print("Testing path iteration...") + # for path in tree.paths(): + # print(' ','->'.join([str(x) for x in path])) + # print("Testing path iterating reversed...") + # for path in tree.paths(reverse=True): + # print(' ','->'.join([str(x) for x in path])) + print("Testing value iteration...") + assert ','.join([str(x) for x in tree]) == ','.join([str(x) for x in sortedvals]) + print("Testing value iteration reversed...") + assert ','.join([str(x) for x in reversed(tree)]) == ','.join([str(x) for 
x in reversed(sortedvals)]) + print("Testing tree[i]...") + for i in range(len(tree)): assert tree[i] == sortedvals[i] + print("Testing find...") + for f in [-1,0.5,len(vals)+1,len(vals)-3.6]: assert tree.find(f) is None + print("Testing tree.index(i)...") + for (i,v) in enumerate(sortedvals): assert tree.index(v) == sortedvals.index(v) + print("Checking tree...") + tree.check() + # Test removing random nodes with pop + while len(sortedvals) > 0: + i = random.randrange(len(sortedvals)) + assert tree.pop(i) == sortedvals.pop(i) + tree.check() + # re-build tree and sorted list + tree.extend(vals) + sortedvals = sorted(list(vals)) # sorted copy + assert sum([ x == y for (x,y) in zip(iter(tree),sortedvals)]) == len(vals) + print("Testing remove...") + for v in vals: + tree.remove(v) + tree.check() + _test_rbt_comparison() + print("Looks like the tests all passed...") + +if __name__ == '__main__': + _test_red_black_tree() + +del _test_rbt +del _test_red_black_tree diff --git a/scripts/seq2pdf.sh b/scripts/seq2pdf.sh index d3eff0d7..e9649919 100755 --- a/scripts/seq2pdf.sh +++ b/scripts/seq2pdf.sh @@ -37,30 +37,22 @@ fi kmer=$1 shift -maxk=$[ ( ($kmer + 31) / 32 ) * 32 - 1 ] - -if [[ $[ $kmer & 1 ] -eq 0 || $kmer -lt 3 ]] -then - echo kmer is not odd and greater than 2 - exit -1 -fi - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -CTX="$DIR/../bin/ctx$maxk" -CTX2GRAPHVIZ="$DIR/cortex_to_graphviz.pl" -if [[ !(-e $CTX) || !(-x $CTX) ]] +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && cd .. && pwd )" +MCCORTEX="$DIR/bin/mccortex" +CTX2GRAPHVIZ="$DIR/scripts/cortex_to_graphviz.pl" +if [[ !(-e $MCCORTEX) || !(-x $MCCORTEX) ]] then - echo "Did you compile for MAXK=$maxk? I cannot run $CTX" + echo "Did you compile McCortex? 
I cannot run \`$MCCORTEX\`" exit -1 fi files=$(printf " --seq %s" $@; printf "\n") if [[ $mkpdf == 1 ]]; then - $CTX build -q -k $kmer --sample seq2pdf $files - | \ + $MCCORTEX $kmer build -q -k $kmer --sample seq2pdf $files - | \ $CTX2GRAPHVIZ -k $kmer $script_args - | \ dot -Tpdf else - $CTX build -q -k $kmer --sample seq2pdf $files - | \ + $MCCORTEX $kmer build -q -k $kmer --sample seq2pdf $files - | \ $CTX2GRAPHVIZ -k $kmer $script_args - fi diff --git a/src/commands/ctx_breakpoints.c b/src/commands/ctx_breakpoints.c index 4c98e62c..7b2e3b1f 100644 --- a/src/commands/ctx_breakpoints.c +++ b/src/commands/ctx_breakpoints.c @@ -172,7 +172,8 @@ int ctx_breakpoints(int argc, char **argv) // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); - path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false); + path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false, + kmers_in_hash, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; diff --git a/src/commands/ctx_bubbles.c b/src/commands/ctx_bubbles.c index 18ca8d70..f9b8025f 100644 --- a/src/commands/ctx_bubbles.c +++ b/src/commands/ctx_bubbles.c @@ -175,7 +175,8 @@ int ctx_bubbles(int argc, char **argv) // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem+thread_mem); - path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false); + path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false, + kmers_in_hash, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; diff --git a/src/commands/ctx_clean.c b/src/commands/ctx_clean.c index 6d39beb4..e4b4ebfb 100644 --- a/src/commands/ctx_clean.c +++ b/src/commands/ctx_clean.c @@ -367,10 +367,10 @@ int ctx_clean(int argc, char **argv) } char num_kmers_str[100]; - ulong_to_str(db_graph.ht.num_kmers, num_kmers_str); + 
ulong_to_str(hash_table_nkmers(&db_graph.ht), num_kmers_str); status("[cleaning] Total kmers loaded: %s\n", num_kmers_str); - size_t initial_nkmers = db_graph.ht.num_kmers; + size_t initial_nkmers = hash_table_nkmers(&db_graph.ht); hash_table_print_stats(&db_graph.ht); uint8_t *visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); @@ -443,7 +443,7 @@ int ctx_clean(int argc, char **argv) } // Print stats on removed kmers - size_t removed_nkmers = initial_nkmers - db_graph.ht.num_kmers; + size_t removed_nkmers = initial_nkmers - hash_table_nkmers(&db_graph.ht); double removed_pct = (100.0 * removed_nkmers) / initial_nkmers; char removed_str[100], init_str[100]; ulong_to_str(removed_nkmers, removed_str); @@ -457,7 +457,7 @@ int ctx_clean(int argc, char **argv) sort_kmers, &db_graph); } - ctx_check(db_graph.ht.num_kmers == hash_table_count_kmers(&db_graph.ht)); + ctx_check(hash_table_nkmers(&db_graph.ht) == hash_table_count_kmers(&db_graph.ht)); // TODO: report kmer coverage for each sample diff --git a/src/commands/ctx_contigs.c b/src/commands/ctx_contigs.c index 3343bfeb..2ecd0cef 100644 --- a/src/commands/ctx_contigs.c +++ b/src/commands/ctx_contigs.c @@ -209,7 +209,8 @@ int ctx_contigs(int argc, char **argv) // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); - path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false); + path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false, + kmers_in_hash, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; diff --git a/src/commands/ctx_correct.c b/src/commands/ctx_correct.c index d9ba3c8f..95fbd322 100644 --- a/src/commands/ctx_correct.c +++ b/src/commands/ctx_correct.c @@ -142,7 +142,8 @@ int ctx_correct(int argc, char **argv) // Paths memory size_t rem_mem = args.memargs.mem_to_use - MIN2(args.memargs.mem_to_use, graph_mem); - path_mem = gpath_reader_mem_req(gpfiles->b, gpfiles->len, 
ncols, rem_mem, false); + path_mem = gpath_reader_mem_req(gpfiles->b, gpfiles->len, ncols, rem_mem, false, + kmers_in_hash, false); cmd_print_mem(path_mem, "paths"); diff --git a/src/commands/ctx_dist_matrix.c b/src/commands/ctx_dist_matrix.c index e3b57c65..6ffdf8fe 100644 --- a/src/commands/ctx_dist_matrix.c +++ b/src/commands/ctx_dist_matrix.c @@ -185,8 +185,10 @@ int ctx_dist_matrix(int argc, char **argv) fprintf(fout, "\n"); for(row = 0; row < ncols; row++) { fprintf(fout, "col%zu", row); - for(col = 0; col < ncols; col++) - fprintf(fout, "\t%zu", (size_t)mat[ncols*row+col]); + for(col = 0; col < ncols; col++) { + if(col < row) fprintf(fout, "\t."); + else fprintf(fout, "\t%zu", (size_t)mat[ncols*row+col]); + } fprintf(fout, "\n"); } diff --git a/src/commands/ctx_exp_abc.c b/src/commands/ctx_exp_abc.c index 68d08e4f..1776c20e 100644 --- a/src/commands/ctx_exp_abc.c +++ b/src/commands/ctx_exp_abc.c @@ -130,7 +130,7 @@ static void print_failed(dBNode node, const dBNodeBuffer *nbuf, { const size_t kmer_size = db_graph->kmer_size; char bkmerstr[MAX_KMER_SIZE+1]; - BinaryKmer bkmer = db_node_get_bkmer(db_graph, node.key); + BinaryKmer bkmer = db_node_get_bkey(db_graph, node.key); binary_kmer_to_str(bkmer, kmer_size, bkmerstr); printf(">%s:%i %s %s\n", bkmerstr, node.orient, is_AB ? "A->B" : "B->C", prime_AB ? 
"prime_AB" : "walk_AB"); @@ -412,7 +412,8 @@ int ctx_exp_abc(int argc, char **argv) // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); - path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false); + path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false, + kmers_in_hash, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; diff --git a/src/commands/ctx_health_check.c b/src/commands/ctx_health_check.c index 678c353c..1bbdd7f3 100644 --- a/src/commands/ctx_health_check.c +++ b/src/commands/ctx_health_check.c @@ -124,7 +124,8 @@ int ctx_health_check(int argc, char **argv) // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); - path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false); + path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false, + kmers_in_hash, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; diff --git a/src/commands/ctx_index.c b/src/commands/ctx_index.c index 0ad36701..0eda7136 100644 --- a/src/commands/ctx_index.c +++ b/src/commands/ctx_index.c @@ -133,10 +133,10 @@ int ctx_index(int argc, char **argv) if(!graph_file_read(&gfile, &bkmer, covgs, edges)) { status("Read kmer failed"); break; } binary_kmer_to_str(bkmer, kmer_size, bkmerstr); - if(nblocks > 0 && !binary_kmer_less_than(prev_bkmer,bkmer)) + if(nblocks > 0 && binary_kmer_ge(prev_bkmer,bkmer)) die("File is not sorted: %s [%s]", bkmerstr, path); // We've already read one kmer entry, read rest of block - bl_bytes = kmer_mem + gfr_fread_bytes(&gfile, tmp_mem, rem_block); + bl_bytes = kmer_mem + graph_file_fread(&gfile, tmp_mem, rem_block); bl_kmers = 1 + bl_bytes / kmer_mem; fprintf(fout, "%zu\t%zu\t%s\t%zu\t%zu\n", bl_byte_offset, bl_byte_offset+bl_bytes, bkmerstr, diff --git a/src/commands/ctx_infer_edges.c b/src/commands/ctx_infer_edges.c 
index 31d668df..18644bf4 100644 --- a/src/commands/ctx_infer_edges.c +++ b/src/commands/ctx_infer_edges.c @@ -303,11 +303,11 @@ int ctx_infer_edges(int argc, char **argv) char modified_str[100], kmers_str[100]; ulong_to_str(num_kmers_edited, modified_str); - ulong_to_str(db_graph.ht.num_kmers, kmers_str); + ulong_to_str(hash_table_nkmers(&db_graph.ht), kmers_str); double modified_rate = 0; - if(db_graph.ht.num_kmers) - modified_rate = (100.0 * num_kmers_edited) / db_graph.ht.num_kmers; + if(hash_table_nkmers(&db_graph.ht)) + modified_rate = (100.0 * num_kmers_edited) / hash_table_nkmers(&db_graph.ht); status("%s of %s (%.2f%%) nodes modified\n", modified_str, kmers_str, modified_rate); diff --git a/src/commands/ctx_pjoin.c b/src/commands/ctx_pjoin.c index 34d79048..69e99efe 100644 --- a/src/commands/ctx_pjoin.c +++ b/src/commands/ctx_pjoin.c @@ -172,7 +172,8 @@ int ctx_pjoin(int argc, char **argv) // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); - path_mem = gpath_reader_mem_req(pfiles, num_pfiles, output_ncols, rem_mem, true); + path_mem = gpath_reader_mem_req(pfiles, num_pfiles, output_ncols, rem_mem, true, + kmers_in_hash, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; diff --git a/src/commands/ctx_pop_bubbles.c b/src/commands/ctx_pop_bubbles.c index 370b8fee..bd935b3f 100644 --- a/src/commands/ctx_pop_bubbles.c +++ b/src/commands/ctx_pop_bubbles.c @@ -169,11 +169,11 @@ int ctx_pop_bubbles(int argc, char **argv) ulong_to_str(npopped, npopped_str); status("Popped %s bubbles", npopped_str); - size_t nkmers0 = db_graph.ht.num_kmers; + size_t nkmers0 = hash_table_nkmers(&db_graph.ht); status("Removing nodes..."); for(i = 0; i < nkwords; i++) rmvbits[i] = ~rmvbits[i]; prune_nodes_lacking_flag(nthreads, rmvbits, &db_graph); - size_t nkmers1 = db_graph.ht.num_kmers; + size_t nkmers1 = hash_table_nkmers(&db_graph.ht); ctx_assert(nkmers1 <= nkmers0); char nkmers0str[50], 
nkmers1str[50], ndiffstr[50]; diff --git a/src/commands/ctx_pview.c b/src/commands/ctx_pview.c index 922a9018..3fdf498d 100644 --- a/src/commands/ctx_pview.c +++ b/src/commands/ctx_pview.c @@ -169,7 +169,8 @@ int ctx_pview(int argc, char **argv) // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); - path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, true); + path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, true, + kmers_in_hash, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; diff --git a/src/commands/ctx_reads.c b/src/commands/ctx_reads.c index b490e8e1..d09c716a 100644 --- a/src/commands/ctx_reads.c +++ b/src/commands/ctx_reads.c @@ -40,6 +40,7 @@ static struct option longopts[] = {"force", no_argument, NULL, 'f'}, {"memory", required_argument, NULL, 'm'}, {"nkmers", required_argument, NULL, 'n'}, + {"threads", required_argument, NULL, 't'}, // command specific {"format", required_argument, NULL, 'F'}, {"invert", no_argument, NULL, 'v'}, diff --git a/src/commands/ctx_rmsubstr.c b/src/commands/ctx_rmsubstr.c index ba2193eb..ceddb002 100644 --- a/src/commands/ctx_rmsubstr.c +++ b/src/commands/ctx_rmsubstr.c @@ -78,7 +78,7 @@ static int _is_substr(const ReadBuffer *rbuf, size_t idx, // read in the list or a complete match with a read before it in the list. 
// That is why we have: (hit->chrom < idx || r->seq.end < r2->seq.end) // since identical strings have equal length - if(hit->chrom < idx || r->seq.end < r2->seq.end) { + if(r->seq.end < r2->seq.end || (r->seq.end == r2->seq.end && idx > hit->chrom)) { if(hit->orient == node.orient) { // potential FORWARD match if(hit->offset >= contig_start && @@ -92,7 +92,7 @@ static int _is_substr(const ReadBuffer *rbuf, size_t idx, // potential REVERSE match // if read is '<NNNN>[kmer]<rem>' rX_rem is the number of chars after // the first valid kmer - size_t r1_rem = r->seq.end - (contig_start + kmer_size); + size_t r1_rem = r->seq.end - (contig_start + kmer_size); size_t r2_rem = r2->seq.end - (hit->offset + kmer_size); if(r1_rem <= hit->offset && r2_rem >= contig_start && @@ -179,20 +179,35 @@ int ctx_rmsubstr(int argc, char **argv) // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; + size_t mem_to_use = memargs.mem_to_use; bits_per_kmer = sizeof(BinaryKmer)*8 + - sizeof(KONodeList) + sizeof(KOccur) + // see kmer_occur.h - 8; // 1 byte per kmer for each base to load sequence files + sizeof(KONodeList) + sizeof(KOccur); // see kmer_occur.h - kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, + if(mem_to_use < (size_t)est_num_bases) { + warn("You probably need at least %zu bytes (> %zu)", + (size_t)est_num_bases, memargs.mem_to_use); + } + else { + mem_to_use -= est_num_bases; + } + + kmers_in_hash = cmd_get_kmers_in_hash(mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, - est_num_bases, est_num_bases, - false, &graph_mem); + 0, est_num_bases, + true, &graph_mem); + + // 1 byte per kmer for each base to load sequence files + size_t total_mem = kmers_in_hash*bits_per_kmer/8 + est_num_bases; + + char memstr[50]; + bytes_to_str(total_mem, 1, memstr); + status("[memory] total mem with input: %s\n", memstr); - cmd_check_mem_limit(memargs.mem_to_use, graph_mem); + cmd_check_mem_limit(memargs.mem_to_use, 
total_mem); // // Open output file diff --git a/src/commands/ctx_server.c b/src/commands/ctx_server.c index 49e137ea..aa2f9ca6 100644 --- a/src/commands/ctx_server.c +++ b/src/commands/ctx_server.c @@ -6,6 +6,7 @@ #include "graphs_load.h" #include "gpath_reader.h" #include "gpath_checks.h" +#include "graph_search.h" #include "json_hdr.h" const char server_usage[] = @@ -23,8 +24,9 @@ const char server_usage[] = " -n, --nkmers <kmers> Number of hash table entries (e.g. 1G ~ 1 billion)\n" " -p, --paths <in.ctp> Load link file (can specify multiple times)\n" " -S, --single-line Reponses on a single line\n" -" -C, --coverages Load per sample coverages\n" +" -C, --coverages Load coverages for kmers+links\n" " -E, --edges Load per sample edges\n" +" -D, --disk Read from disk (one graph only, must be sorted)\n" "\n"; static struct option longopts[] = @@ -37,34 +39,61 @@ static struct option longopts[] = {"single-line", no_argument, NULL, 'S'}, {"coverages", no_argument, NULL, 'C'}, {"edges", no_argument, NULL, 'E'}, + {"disk", no_argument, NULL, 'D'}, {NULL, 0, NULL, 0} }; #define MAX_RANDOM_TRIES 100 -static inline void kmer_response(StrBuf *resp, dBNode node, const char *keystr, - bool pretty, const dBGraph *db_graph) +typedef struct { + dBNode node; + BinaryKmer bkey; + Covg *covgs; + Edges *edges; + size_t ncols, nedges; + bool binary_covgs; +} ServerQuery; + +static void query_alloc(ServerQuery *q, size_t ncols, + bool binary_covgs, bool flatten_edges) { - size_t i, col; + q->covgs = ctx_calloc(ncols, sizeof(Covg)); + q->edges = ctx_calloc(ncols, sizeof(Edges)); + q->ncols = ncols; + q->nedges = flatten_edges ? 
1 : ncols; + q->binary_covgs = binary_covgs; +} + +static void query_dealloc(ServerQuery *q) { + ctx_free(q->covgs); + ctx_free(q->edges); +} + +static inline void kmer_response(StrBuf *resp, ServerQuery q, bool pretty, + const dBGraph *db_graph) +{ + size_t i; + char keystr[MAX_KMER_SIZE+1]; + binary_kmer_to_str(q.bkey, db_graph->kmer_size, keystr); strbuf_append_str(resp, "{"); strbuf_append_str(resp, pretty ? "\n " : " "); strbuf_append_str(resp, "\"key\": \""); strbuf_append_str(resp, keystr); strbuf_append_str(resp, "\", \"colours\": ["); - for(col = 0; col < db_graph->num_of_cols; col++) { - if(col) strbuf_append_char(resp, ','); - Covg covg = db_graph->col_covgs ? db_node_get_covg(db_graph, node.key, col) - : db_node_has_col(db_graph, node.key, col); - strbuf_append_ulong(resp, covg); + strbuf_append_ulong(resp, q.covgs[0]); + for(i = 1; i < q.ncols; i++) { + strbuf_append_char(resp, ','); + strbuf_append_ulong(resp, q.covgs[i]); } strbuf_append_str(resp, "],"); strbuf_append_str(resp, pretty ? "\n " : " "); // Edges - Edges edges = db_node_get_edges_union(db_graph, node.key); + Edges uedges = 0; // get union of edges + for(i = 0; i < q.nedges; i++) uedges |= q.edges[i]; char edgesstr[9], left[5] = {0}, right[5] = {0}, *l = left, *r = right; - db_node_get_edges_str(edges, edgesstr); + db_node_get_edges_str(uedges, edgesstr); for(i = 0; i < 4; i++) if(edgesstr[i] != '.') { *l = toupper(edgesstr[i]); *(++l) = '\0'; } for(i = 4; i < 8; i++) @@ -80,10 +109,8 @@ static inline void kmer_response(StrBuf *resp, dBNode node, const char *keystr, // Sample edges char sedges[3]; - for(i = 0; i < db_graph->num_edge_cols; i++) { - edges_to_char(db_node_get_edges(db_graph, node.key, i), sedges); - strbuf_append_str(resp, sedges); - } + for(i = 0; i < q.nedges; i++) + strbuf_append_str(resp, edges_to_char(q.edges[i], sedges)); strbuf_append_str(resp, "\","); strbuf_append_str(resp, pretty ? 
"\n " : " "); @@ -92,7 +119,7 @@ static inline void kmer_response(StrBuf *resp, dBNode node, const char *keystr, // Links // {"forward": true, "juncs": "ACAA", "colours": [0,0,1]} size_t nlinks; - const GPath *gpath = gpath_store_safe_fetch(&db_graph->gpstore, node.key); + const GPath *gpath = gpath_store_safe_fetch(&db_graph->gpstore, q.node.key); const GPathSet *gpset = &db_graph->gpstore.gpset; for(nlinks = 0; gpath != NULL; gpath = gpath->next, nlinks++) { @@ -109,10 +136,10 @@ static inline void kmer_response(StrBuf *resp, dBNode node, const char *keystr, // counts may be null if user did not specify -C,--coverages uint8_t *counts = gpath_set_get_nseen(gpset, gpath); strbuf_append_str(resp, "\", \"colours\": ["); - for(col = 0; col < db_graph->num_of_cols; col++) { - if(col) strbuf_append_char(resp, ','); - size_t count = counts ? counts[col] - : gpath_has_colour(gpath, gpset->ncols, col); + for(i = 0; i < db_graph->num_of_cols; i++) { + if(i) strbuf_append_char(resp, ','); + size_t count = counts ? counts[i] + : gpath_has_colour(gpath, gpset->ncols, i); strbuf_append_ulong(resp, count); } strbuf_append_str(resp, "]}"); @@ -121,6 +148,29 @@ static inline void kmer_response(StrBuf *resp, dBNode node, const char *keystr, strbuf_append_str(resp, pretty ? "]\n}\n" : "] }\n"); } +static inline void query_fetch_from_graph(ServerQuery *q, const dBGraph *db_graph) +{ + size_t i; + for(i = 0; i < q->ncols; i++) + q->covgs[i] = db_graph->col_covgs ? 
db_node_get_covg(db_graph, q->node.key, i) + : db_node_has_col(db_graph, q->node.key, i); + for(i = 0; i < q->nedges; i++) + q->edges[i] = db_node_get_edges(db_graph, q->node.key, i); +} + +static inline void query_fetch_from_disk(ServerQuery *q) +{ + size_t i; + // Convert coverage to binary if required + if(q->binary_covgs) + for(i = 0; i < q->ncols; i++) + q->covgs[i] = (q->covgs[i] > 0); + // Flatten edges if we aren't outputting per sample edges + if(q->nedges == 1) + for(i = 1; i < q->ncols; i++) + q->edges[0] |= q->edges[i]; +} + /* // Query: ACACCAA { @@ -141,12 +191,11 @@ static inline void kmer_response(StrBuf *resp, dBNode node, const char *keystr, * @param pretty pretty print JSON or one line JSON * @returns true iff query was valid kmer */ -static inline bool query_response(const char *qstr, StrBuf *resp, bool pretty, - const dBGraph *db_graph) +static inline bool query_response(const char *qstr, ServerQuery q, + StrBuf *resp, bool pretty, + GraphFileSearch *disk, const dBGraph *db_graph) { size_t qlen; - dBNode node; - char keystr[MAX_KMER_SIZE+1], *ptr; strbuf_reset(resp); // query must be a kmer @@ -167,42 +216,53 @@ static inline bool query_response(const char *qstr, StrBuf *resp, bool pretty, return false; } - node = db_graph_find_str(db_graph, qstr); - if(node.key == HASH_NOT_FOUND) { - strbuf_set(resp, "{}\n"); - return true; + BinaryKmer bkmer = binary_kmer_from_str(qstr, db_graph->kmer_size); + q.bkey = binary_kmer_get_key(bkmer, db_graph->kmer_size); + q.node.orient = (binary_kmer_eq(bkmer, q.bkey) ? 
FORWARD : REVERSE); + + if(disk == NULL) { + // Fetch from graph + q.node.key = hash_table_find(&db_graph->ht, q.bkey); + if(q.node.key == HASH_NOT_FOUND) { strbuf_set(resp, "{}\n"); return true; } + query_fetch_from_graph(&q, db_graph); + } + else { + if(!graph_search_find(disk, q.bkey, q.covgs, q.edges)) { + strbuf_set(resp, "{}\n"); + return true; + } + query_fetch_from_disk(&q); } - // Get upper case kmer key - memcpy(keystr, qstr, qlen+1); - for(ptr = keystr; *ptr; ptr++) *ptr = toupper(*ptr); - if(node.orient == REVERSE) dna_reverse_complement_str(keystr, qlen); - kmer_response(resp, node, keystr, pretty, db_graph); + kmer_response(resp, q, pretty, db_graph); return true; } // Reply with a random kmer -static inline void request_random(StrBuf *resp, bool pretty, - const dBGraph *db_graph) +static inline void request_random(ServerQuery q, StrBuf *resp, bool pretty, + GraphFileSearch *disk, const dBGraph *db_graph) { - dBNode node; - char keystr[MAX_KMER_SIZE+1]; strbuf_reset(resp); - - hkey_t hkey = db_graph_rand_node(db_graph, MAX_RANDOM_TRIES); - if(hkey == HASH_NOT_FOUND) { strbuf_set(resp, "{}\n"); } - node.key = hkey; - node.orient = FORWARD; - BinaryKmer bkmer = db_node_get_bkmer(db_graph, node.key); - binary_kmer_to_str(bkmer, db_graph->kmer_size, keystr); - kmer_response(resp, node, keystr, pretty, db_graph); + if(disk == NULL) { + q.node.key = db_graph_rand_node(db_graph, MAX_RANDOM_TRIES); + if(q.node.key == HASH_NOT_FOUND) { strbuf_set(resp, "{}\n"); return; } + q.node.orient = FORWARD; + q.bkey = db_node_get_bkey(db_graph, q.node.key); + query_fetch_from_graph(&q, db_graph); + } + else { + graph_search_rand(disk, &q.bkey, q.covgs, q.edges); + query_fetch_from_disk(&q); + } + kmer_response(resp, q, pretty, db_graph); } static char* make_info_json_str(cJSON **hdrs, size_t nhdrs, - bool pretty, const dBGraph *db_graph) + bool pretty, size_t nkmers, + const dBGraph *db_graph) { cJSON *json = cJSON_CreateObject(); - json_hdr_make_std(json, NULL, hdrs, 
nhdrs, db_graph); + json_hdr_make_std(json, NULL, hdrs, nhdrs, db_graph, nkmers); cJSON *paths = cJSON_CreateObject(); cJSON_AddItemToObject(json, "paths", paths); @@ -228,8 +288,9 @@ int ctx_server(int argc, char **argv) gpfile_buf_alloc(&gpfiles, 8); bool pretty = true; - // Per sample coverage and edges - bool load_covgs = false, load_edges = false; + bool binary_covgs = true; // Binary coverage instead of full coverage + bool per_col_edges = false; // Load per sample or pooled edges + bool use_disk = false; // Arg parsing char cmd[100]; @@ -253,8 +314,9 @@ int ctx_server(int argc, char **argv) case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'S': cmd_check(pretty, cmd); pretty = false; break; - case 'C': cmd_check(!load_covgs, cmd); load_covgs = true; break; - case 'E': cmd_check(!load_edges, cmd); load_edges = true; break; + case 'C': cmd_check(binary_covgs, cmd); binary_covgs = false; break; + case 'E': cmd_check(!per_col_edges, cmd); per_col_edges = true; break; + case 'D': cmd_check(!use_disk, cmd); use_disk = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); @@ -270,92 +332,125 @@ int ctx_server(int argc, char **argv) // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; + ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); - size_t i, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; + size_t i, ncols; + size_t ctx_max_kmers = 0, ctx_sum_kmers = 0; + size_t ctp_max_kmers = 0, ctp_sum_kmers = 0; ncols = graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); + gpath_reader_count_kmers(gpfiles.b, gpfiles.len, &ctp_max_kmers, &ctp_sum_kmers); + // Check graph + paths are compatible graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, -1); + if(use_disk && num_gfiles > 1) + cmd_print_usage("Can only 
use --disk with one sorted graph file"); + // // Decide on memory // - size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem = 0; + size_t bits_per_kmer, kmers_in_hash, graph_mem = 0, path_mem = 0; // edges(1bytes) + kmer_paths(8bytes) + in_colour(1bit/col) + - bits_per_kmer = sizeof(BinaryKmer)*8 + // kmer - sizeof(Edges)*8 * (load_edges ? ncols : 1) + // edges - sizeof(Covg)*8 * (load_covgs ? ncols : 0) + // covgs - (gpfiles.len > 0 ? sizeof(GPath*)*8 : 0) + // links - ncols; // in colour - - kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, - memargs.mem_to_use_set, - memargs.num_kmers, - memargs.num_kmers_set, - bits_per_kmer, - ctx_max_kmers, ctx_sum_kmers, - false, &graph_mem); - - if(gpfiles.len) + if(use_disk && gpfiles.len == 0) { - // Paths memory - size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); - path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, - ncols, rem_mem, - load_covgs); // load path counts - - // Shift path store memory from graphs->paths - graph_mem -= sizeof(GPath*)*kmers_in_hash; - path_mem += sizeof(GPath*)*kmers_in_hash; - cmd_print_mem(path_mem, "paths"); + kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, + memargs.mem_to_use_set, + memargs.num_kmers, + memargs.num_kmers_set, + 0, 0, 0, false, &graph_mem); + } + else + { + bits_per_kmer = sizeof(BinaryKmer)*8 + // kmer + sizeof(Edges)*8 * (per_col_edges ? ncols : 1) + // edges + (binary_covgs ? 1 : sizeof(Covg)*8) * ncols + // covgs + (gpfiles.len > 0 ? sizeof(GPath*)*8 : 0); // links + + kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, + memargs.mem_to_use_set, + memargs.num_kmers, + memargs.num_kmers_set, + bits_per_kmer, + use_disk ? ctp_max_kmers : ctx_max_kmers, + use_disk ? 
ctp_sum_kmers : ctx_sum_kmers, + false, &graph_mem); + + if(gpfiles.len) + { + // Paths memory + size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); + path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, + ncols, rem_mem, + !binary_covgs, // load path counts + kmers_in_hash, false); + + // Shift path store memory from graphs->paths + graph_mem -= sizeof(GPath*)*kmers_in_hash; + path_mem += sizeof(GPath*)*kmers_in_hash; + cmd_print_mem(path_mem, "paths"); + } } size_t total_mem = graph_mem + path_mem; cmd_check_mem_limit(memargs.mem_to_use, total_mem); // Allocate memory + int allocflags = DBG_ALLOC_EDGES | (binary_covgs ? DBG_ALLOC_NODE_IN_COL + : DBG_ALLOC_COVGS); + if(use_disk) allocflags = 0; + dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, - ncols, load_edges ? ncols : 1, kmers_in_hash, - DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | - (load_covgs ? DBG_ALLOC_COVGS : 0)); + ncols, per_col_edges ? ncols : 1, kmers_in_hash, + allocflags); // Paths - allocates nothing if gpfiles.len == 0 gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, - path_mem, load_covgs, + path_mem, !binary_covgs, &db_graph); // // Load graphs // - GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); - gprefs.empty_colours = true; + GraphFileSearch *disk = NULL; - for(i = 0; i < num_gfiles; i++) { - graph_load(&gfiles[i], gprefs, NULL); - graph_file_close(&gfiles[i]); - gprefs.empty_colours = false; + if(use_disk) { + // Only load graph info + graph_load_ginfo(&db_graph, &gfiles[0]); + disk = graph_search_new(&gfiles[0]); + } + else { + GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); + gprefs.empty_colours = true; + for(i = 0; i < num_gfiles; i++) { + graph_load(&gfiles[i], gprefs, NULL); + graph_file_close(&gfiles[i]); + gprefs.empty_colours = false; + } } - ctx_free(gfiles); - - hash_table_print_stats(&db_graph.ht); // Load link files + int link_flags = use_disk ? 
GPATH_ADD_MISSING_KMERS : GPATH_DIE_MISSING_KMERS; for(i = 0; i < gpfiles.len; i++) - gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph); + gpath_reader_load(&gpfiles.b[i], link_flags, &db_graph); + + hash_table_print_stats(&db_graph.ht); // Create array of cJSON** from input files cJSON **hdrs = ctx_malloc(gpfiles.len * sizeof(cJSON*)); for(i = 0; i < gpfiles.len; i++) hdrs[i] = gpfiles.b[i].json; // Construct cJSON - char *info_txt = make_info_json_str(hdrs, gpfiles.len, pretty, &db_graph); + size_t nkmers_in_graph = use_disk ? ctx_max_kmers : hash_table_nkmers(&db_graph.ht); + char *info_txt = make_info_json_str(hdrs, gpfiles.len, pretty, + nkmers_in_graph, &db_graph); ctx_free(hdrs); // Close input link files @@ -370,26 +465,31 @@ int ctx_server(int argc, char **argv) size_t nqueries = 0, nbad_queries = 0; bool success; + ServerQuery q; + query_alloc(&q, db_graph.num_of_cols, binary_covgs, !per_col_edges); + // Read from input while(1) { fprintf(stdout, "> "); fflush(stdout); - if(futil_fcheck(strbuf_reset_readline(&line, stdin), stdin, "STDIN") == 0) + if(futil_fcheck(strbuf_reset_readline(&line, stdin), stdin, "STDIN") == 0) { + fprintf(stdout, "\n"); break; + } strbuf_chomp(&line); - if(strcmp(line.b,"q") == 0) { break; } - else if(strcmp(line.b,"info") == 0) { + if(strcasecmp(line.b,"q") == 0 || strcasecmp(line.b,"quit") == 0) { break; } + else if(strcasecmp(line.b,"info") == 0) { fputs(info_txt, stdout); fputc('\n', stdout); fflush(stdout); } - else if(strcmp(line.b,"random") == 0) { - request_random(&response, pretty, &db_graph); + else if(strcasecmp(line.b,"random") == 0) { + request_random(q, &response, pretty, disk, &db_graph); fputs(response.b, stdout); fflush(stdout); } else { - success = query_response(line.b, &response, pretty, &db_graph); + success = query_response(line.b, q, &response, pretty, disk, &db_graph); if(response.end) { fputs(response.b, stdout); fflush(stdout); @@ -404,6 +504,14 @@ int ctx_server(int argc, char 
**argv) ulong_to_str(nbad_queries, badstr); status("Answered %s queries, %s bad queries", nstr, badstr); + query_dealloc(&q); + + if(disk) { + graph_search_destroy(disk); + graph_file_close(&gfiles[0]); + } + ctx_free(gfiles); + free(info_txt); strbuf_dealloc(&line); strbuf_dealloc(&response); diff --git a/src/commands/ctx_sort.c b/src/commands/ctx_sort.c index 74254d3c..e613ce04 100644 --- a/src/commands/ctx_sort.c +++ b/src/commands/ctx_sort.c @@ -115,14 +115,14 @@ int ctx_sort(int argc, char **argv) // Read in whole file // if(graph_file_fseek(gfile, gfile.hdr_size, SEEK_SET) != 0) die("fseek failed"); - size_t nkread = gfr_fread_bytes(&gfile, mem, num_kmers*kmer_mem); + size_t nkread = graph_file_fread(&gfile, mem, num_kmers*kmer_mem); if(nkread != num_kmers*kmer_mem) die("Could only read %zu bytes [<%zu]", nkread, num_kmers*kmer_mem); // check we are at the end of the file char tmpc; - if(gfr_fread_bytes(&gfile, &tmpc, 1) != 0) { + if(graph_file_fread(&gfile, &tmpc, 1) != 0) { die("More kmers in file than believed (kmers: %zu ncols: %zu).", num_kmers, ncols); } diff --git a/src/commands/ctx_thread.c b/src/commands/ctx_thread.c index e103aac1..551c56e3 100644 --- a/src/commands/ctx_thread.c +++ b/src/commands/ctx_thread.c @@ -24,8 +24,8 @@ const char thread_usage[] = " -n, --nkmers <N> Number of hash table entries (e.g. 1G ~ 1 billion)\n" " -t, --threads <T> Number of threads to use [default: "QUOTE_VALUE(DEFAULT_NTHREADS)"]\n" " -p, --paths <in.ctp> Load link file (can specify multiple times)\n" -" -0, --zero-paths Zero counts on initially loaded paths. Use if existing\n" -" paths were built from sequence being re-used by this run\n" +" -0, --zero-paths Zero counts on initially loaded links. 
Use if existing\n" +" links were built from sequence being re-used by this run\n" "\n" " Input:\n" " -1, --seq <in.fa> Thread reads from file (supports sam,bam,fq,*.gz\n" @@ -38,7 +38,7 @@ const char thread_usage[] = " -l, --min-frag-len <bp> Min fragment size for --seq2 [default:"QUOTE_VALUE(DEFAULT_CRTALN_FRAGLEN_MIN)"]\n" " -L, --max-frag-len <bp> Max fragment size for --seq2 [default:"QUOTE_VALUE(DEFAULT_CRTALN_FRAGLEN_MAX)"]\n" "\n" -" Path Params:\n" +" Link Params:\n" " -w, --one-way Use one-way gap filling (conservative) [default]\n" " -W, --two-way Use two-way gap filling (liberal)\n" " -d, --gap-diff-const <d> Set parameters for allowable gap lengths (decimals):\n" @@ -49,12 +49,12 @@ const char thread_usage[] = " -g, --gap-hist <o.csv> Save size distribution of sequence gaps bridged\n" " -G, --frag-hist <o.csv> Save size distribution of PE fragments\n" "\n" -" -u, --use-new-paths Use paths as they are being added (higher err rate) [default: no]\n" +" -u, --use-new-paths Use links as they are being added (higher err rate) [default: no]\n" "\n" " Debugging Options: Probably best not to touch these\n" " -x,--print-contigs -y,--print-paths -z,--print-reads\n" "\n" -" When loading existing paths with -p, use offset (e.g. 2:in.ctp) to specify\n" +" When loading existing links with -p, use offset (e.g. 2:in.ctp) to specify\n" " which colour to load the data into. 
See `"CMD" pjoin` to combine .ctp files\n" "\n"; @@ -139,7 +139,8 @@ int ctx_thread(int argc, char **argv) // Paths memory size_t min_path_mem = 0; - gpath_reader_sum_mem(gpfiles->b, gpfiles->len, 1, true, true, &min_path_mem); + gpath_reader_sum_mem(gpfiles->b, gpfiles->len, 1, true, true, + &min_path_mem, NULL, NULL); if(graph_mem + min_path_mem > args.memargs.mem_to_use) { char buf[50]; @@ -248,7 +249,7 @@ int ctx_thread(int argc, char **argv) correct_aln_dump_stats(aln_stats, load_stats, args.dump_seq_sizes, args.dump_frag_sizes, - db_graph.ht.num_kmers); + hash_table_nkmers(&db_graph.ht)); // Don't need GPathHash anymore gpath_hash_dealloc(&db_graph.gphash); diff --git a/src/commands/ctx_uniqkmers.c b/src/commands/ctx_uniqkmers.c index f165fb97..07cf592e 100644 --- a/src/commands/ctx_uniqkmers.c +++ b/src/commands/ctx_uniqkmers.c @@ -89,7 +89,7 @@ static inline bool _is_valid_flank(BinaryKmer bkmer, const read_t *r, nuc = dna_char_to_nuc(r->seq.b[i-1]); tmp_bkmer = binary_kmer_left_shift_add(tmp_bkmer, kmer_size, nuc); tmp_bkey = binary_kmer_get_key(tmp_bkmer, kmer_size); - if(binary_kmers_are_equal(bkey, tmp_bkey)) return false; + if(binary_kmer_eq(bkey, tmp_bkey)) return false; bkeys[i] = tmp_bkey; } } @@ -98,7 +98,7 @@ static inline bool _is_valid_flank(BinaryKmer bkmer, const read_t *r, nuc = dna_char_to_nuc(r->seq.b[r->seq.end-i]); tmp_bkmer = binary_kmer_right_shift_add(tmp_bkmer, kmer_size, nuc); tmp_bkey = binary_kmer_get_key(tmp_bkmer, kmer_size); - if(binary_kmers_are_equal(bkey, tmp_bkey)) return false; + if(binary_kmer_eq(bkey, tmp_bkey)) return false; bkeys[i] = tmp_bkey; } } diff --git a/src/commands/ctx_unitigs.c b/src/commands/ctx_unitigs.c index 2e94b361..1c8d7c6c 100644 --- a/src/commands/ctx_unitigs.c +++ b/src/commands/ctx_unitigs.c @@ -148,7 +148,7 @@ static inline bool print_edges(hkey_t hkey, size_t threadid, void *arg) // Check if node is an end of a unitig if(uend.assigned) { - BinaryKmer bkey = db_node_get_bkmer(p->db_graph, hkey); + 
BinaryKmer bkey = db_node_get_bkey(p->db_graph, hkey); Edges edges = db_node_get_edges(p->db_graph, hkey, 0); if(uend.left) { diff --git a/src/global/cortex_types.h b/src/global/cortex_types.h index 1a8200e7..f8d96885 100644 --- a/src/global/cortex_types.h +++ b/src/global/cortex_types.h @@ -7,6 +7,9 @@ typedef uint32_t Covg; #define COVG_MAX UINT_MAX +#define SAFE_ADD_COVG(a,b) ((uint64_t)(a)+(b) > COVG_MAX ? COVG_MAX : (a)+(b)) +#define SAFE_SUM_COVG(a,b) ((a) = SAFE_ADD_COVG((a), (b))) + typedef uint8_t Orientation; #define FORWARD 0 #define REVERSE 1 diff --git a/src/global/global.h b/src/global/global.h index 4a782d62..57409de7 100644 --- a/src/global/global.h +++ b/src/global/global.h @@ -40,7 +40,7 @@ #define ONE_MEGABYTE (1<<20) #define MAX_IO_THREADS 10 #define DEFAULT_IO_BUFSIZE (4*ONE_MEGABYTE) -#define CORTEX_URL "https://github.com/mcveanlab/mccortex" +#define MCCORTEX_URL "https://github.com/mcveanlab/mccortex" #include "ctx_assert.h" #include "ctx_alloc.h" // Wrappers for malloc, calloc etc. 
diff --git a/src/graph/binary_kmer.c b/src/graph/binary_kmer.c index 69ee75d8..ac2fbc7f 100644 --- a/src/graph/binary_kmer.c +++ b/src/graph/binary_kmer.c @@ -13,16 +13,6 @@ // This is exported const BinaryKmer zero_bkmer = BINARY_KMER_ZERO_MACRO; -// less than for 1 or 2 bitfields is defined in the header -#if NUM_BKMER_WORDS > 2 -bool binary_kmer_less_than(BinaryKmer left, BinaryKmer right) { - size_t i; - for(i = 0; i < NUM_BKMER_WORDS && left.b[i] == right.b[i]; i++); - return (i < NUM_BKMER_WORDS && left.b[i] < right.b[i]); -} -#endif - - BinaryKmer binary_kmer_from_old(BinaryKmer bkmer, size_t kmer_size) { size_t o = 0, x = 2*(kmer_size&31); @@ -47,18 +37,6 @@ BinaryKmer binary_kmer_to_old(BinaryKmer bkmer, size_t kmer_size) return nbkmer; } -#if NUM_BKMER_WORDS > 1 - int binary_kmers_cmp(BinaryKmer a, BinaryKmer b) - { - size_t i; - for(i = 0; i < NUM_BKMER_WORDS; i++) { - if(a.b[i] < b.b[i]) return -1; - if(a.b[i] > b.b[i]) return 1; - } - return 0; - } -#endif - // For a given kmer, get the BinaryKmer 'key': // the lower of the kmer vs reverse complement of itself // kmer and kmer_key must NOT point to overlapping memory @@ -75,7 +53,7 @@ BinaryKmer binary_kmer_get_key(const BinaryKmer bkmer, size_t kmer_size) // Don't know which is going to be correct -- this will happen 1 in 4 times bkey = binary_kmer_reverse_complement(bkmer, kmer_size); - return (binary_kmer_less_than(bkmer, bkey) ? bkmer : bkey); + return (binary_kmer_lt(bkmer, bkey) ? bkmer : bkey); } #if NUM_BKMER_WORDS > 1 diff --git a/src/graph/binary_kmer.h b/src/graph/binary_kmer.h index 3d0c5e27..12950da9 100644 --- a/src/graph/binary_kmer.h +++ b/src/graph/binary_kmer.h @@ -56,40 +56,60 @@ extern const BinaryKmer zero_bkmer; ((bkmer)->b[NUM_BKMER_WORDS - 1] \ = ((bkmer)->b[NUM_BKMER_WORDS - 1] & 0xfffffffffffffffcUL) | (nuc)) -#if NUM_BKMER_WORDS == 1 - #define binary_kmers_cmp(x,y) ((x).b[0] < (y).b[0] ? 
-1 : (x).b[0] > (y).b[0]) -#else - int binary_kmers_cmp(BinaryKmer a, BinaryKmer b); -#endif +static inline bool binary_kmer_less_than(BinaryKmer x, BinaryKmer y) { + size_t i; + for(i = 0; i < NUM_BKMER_WORDS && x.b[i] == y.b[i]; i++); + return (i < NUM_BKMER_WORDS && x.b[i] < y.b[i]); +} +static inline bool binary_kmer_less_or_eq(BinaryKmer x, BinaryKmer y) { + size_t i; + for(i = 0; i < NUM_BKMER_WORDS && x.b[i] == y.b[i]; i++); + return (i == NUM_BKMER_WORDS || x.b[i] < y.b[i]); +} + +static inline int binary_kmers_compare(BinaryKmer a, BinaryKmer b) +{ + int i, c; + for(i = 0; i < NUM_BKMER_WORDS; i++) + if((c = cmp(a.b[i], b.b[i])) != 0) + return c; + return 0; +} #if NUM_BKMER_WORDS == 1 - #define binary_kmers_are_equal(x,y) ((x).b[0] == (y).b[0]) - #define binary_kmer_is_zero(x) ((x).b[0] == 0UL) - #define binary_kmer_less_than(x,y) ((x).b[0] < (y).b[0]) + #define binary_kmer_eq(x,y) ((x).b[0] == (y).b[0]) + #define binary_kmer_le(x,y) ((x).b[0] <= (y).b[0]) + #define binary_kmer_lt(x,y) ((x).b[0] < (y).b[0]) + #define binary_kmer_cmp(x,y) cmp((x).b[0], (y).b[0]) #elif NUM_BKMER_WORDS == 2 - #define binary_kmers_are_equal(x,y) ((x).b[0]==(y).b[0] && (x).b[1]==(y).b[1]) - #define binary_kmer_is_zero(x) (((x).b[0] | (x).b[1]) == 0UL) - #define binary_kmer_less_than(x,y) \ - ((x).b[0] < (y).b[0] || ((x).b[0] == (y).b[0] && (x).b[1] < (y).b[1])) + #define binary_kmer_eq(x,y) ((x).b[0]==(y).b[0] && (x).b[1]==(y).b[1]) + #define binary_kmer_le(x,y) ((x).b[0]<(y).b[0] || ((x).b[0]==(y).b[0] && (x).b[1]<=(y).b[1])) + #define binary_kmer_lt(x,y) ((x).b[0]<(y).b[0] || ((x).b[0]==(y).b[0] && (x).b[1]< (y).b[1])) + #define binary_kmer_cmp(x,y) ((x).b[0] != (y).b[0] ? 
cmp((x).b[0],(y).b[0]) : cmp((x).b[1],(y).b[1])) #else /* NUM_BKMER_WORDS > 2 */ - #define binary_kmers_are_equal(x,y) (memcmp((x).b,(y).b,BKMER_BYTES) == 0) - #define binary_kmer_is_zero(x) binary_kmers_are_equal((x), zero_bkmer) - bool binary_kmer_less_than(BinaryKmer left, BinaryKmer right); + #define binary_kmer_eq(x,y) (memcmp((x).b,(y).b,BKMER_BYTES) == 0) + #define binary_kmer_le(x,y) binary_kmer_less_or_eq(x,y) + #define binary_kmer_lt(x,y) binary_kmer_less_than(x,y) + #define binary_kmer_cmp(x,y) binary_kmers_compare(x,y) #endif +#define binary_kmer_gt(a,b) binary_kmer_lt(b,a) +#define binary_kmer_ge(a,b) !binary_kmer_lt(a,b) +#define binary_kmer_ne(a,b) !binary_kmer_eq(a,b) + #define binary_kmer_oversized(bk,k) ((bk).b[0] & (UINT64_MAX << BKMER_TOP_BITS(k))) static inline int binary_kmers_qcmp(const void *aa, const void *bb) { const BinaryKmer *a = (const BinaryKmer*)aa, *b = (const BinaryKmer*)bb; - return binary_kmers_cmp(*a, *b); + return binary_kmer_cmp(*a, *b); } static inline int binary_kmers_qcmp_ptrs(const void *aa, const void *bb) { const BinaryKmer *a = *(const BinaryKmer *const*)aa; const BinaryKmer *b = *(const BinaryKmer *const*)bb; - return binary_kmers_cmp(*a, *b); + return binary_kmer_cmp(*a, *b); } static inline int binary_kmers_qcmp_unaligned_ptrs(const void *aa, const void *bb) @@ -98,7 +118,7 @@ static inline int binary_kmers_qcmp_unaligned_ptrs(const void *aa, const void *b const char *a = *(const char *const*)aa, *b = *(const char *const*)bb; memcpy(b1.b, a, sizeof(BinaryKmer)); memcpy(b2.b, b, sizeof(BinaryKmer)); - return binary_kmers_cmp(b1, b2); + return binary_kmer_cmp(b1, b2); } // diff --git a/src/graph/cmd_mem.c b/src/graph/cmd_mem.c index a88e002a..1d793b45 100644 --- a/src/graph/cmd_mem.c +++ b/src/graph/cmd_mem.c @@ -132,7 +132,7 @@ size_t cmd_get_kmers_in_hash(size_t mem_to_use, bool mem_to_use_set, // Check memory against memory limit and machine memory void cmd_check_mem_limit(size_t mem_to_use, size_t mem_requested) { 
- char memstr[100], ramstr[100]; + char memstr[50], ramstr[50]; bytes_to_str(mem_requested, 1, memstr); if(mem_requested > mem_to_use) diff --git a/src/graph/db_graph.c b/src/graph/db_graph.c index 7085fadb..96d644c3 100644 --- a/src/graph/db_graph.c +++ b/src/graph/db_graph.c @@ -260,7 +260,7 @@ uint8_t db_graph_next_nodes(const dBGraph *db_graph, const BinaryKmer node_bkey, uint8_t db_graph_next_nodes_union(const dBGraph *db_graph, dBNode node, dBNode nodes[4], Nucleotide fw_nucs[4]) { - BinaryKmer bkey = db_node_get_bkmer(db_graph, node.key); + BinaryKmer bkey = db_node_get_bkey(db_graph, node.key); Edges edges = db_node_get_edges_union(db_graph, node.key); return db_graph_next_nodes(db_graph, bkey, node.orient, edges, nodes, fw_nucs); } @@ -295,7 +295,7 @@ uint8_t db_graph_next_nodes_in_col(const dBGraph *db_graph, else edges = db_node_edges(db_graph, node.key, colour); - bkey = db_node_get_bkmer(db_graph, node.key); + bkey = db_node_get_bkey(db_graph, node.key); count = db_graph_next_nodes(db_graph, bkey, node.orient, edges, nodes, fw_nucs); @@ -350,7 +350,7 @@ uint8_t db_graph_prev_nodes_with_mask(const dBGraph *db_graph, dBNode node, ctx_assert(edges & prev_edge); edges &= ~prev_edge; - BinaryKmer bkey = db_node_get_bkmer(db_graph, node.key); + BinaryKmer bkey = db_node_get_bkey(db_graph, node.key); uint8_t i, j, num_prev; @@ -403,7 +403,7 @@ static inline void check_node(hkey_t node, const dBGraph *db_graph, bool *missing_edges_ptr) { Edges edges = db_node_get_edges_union(db_graph, node); - BinaryKmer bkmer = db_node_get_bkmer(db_graph, node); + BinaryKmer bkmer = db_node_get_bkey(db_graph, node); size_t nfw_edges, nrv_edges, i, j; dBNode fwnodes[8], rvnodes[8]; Nucleotide fwnucs[8], rvnucs[8]; @@ -582,7 +582,7 @@ static inline void add_all_edges(hkey_t node, dBGraph *db_graph) { const size_t kmer_size = db_graph->kmer_size, edgencols = db_graph->num_edge_cols; size_t col; - BinaryKmer bkmer, bkey, node_bkey = db_node_get_bkmer(db_graph, node); + BinaryKmer 
bkmer, bkey, node_bkey = db_node_get_bkey(db_graph, node); Orientation orient; Nucleotide nuc; hkey_t next; @@ -677,10 +677,9 @@ void db_graph_intersect_edges(dBGraph *db_graph, size_t nthreads, Edges *edges) // if ntries > 0 and we fail to find a node will return HASH_NOT_FOUND hkey_t db_graph_rand_node(const dBGraph *db_graph, size_t ntries) { - uint64_t capacity = db_graph->ht.capacity; - BinaryKmer *table = db_graph->ht.table; + const HashTable *ht = &db_graph->ht; + size_t i, capacity = hash_table_size(ht); hkey_t hkey; - size_t i; if(capacity == 0) { warn("No entries in hash table - cannot select random"); @@ -690,7 +689,7 @@ hkey_t db_graph_rand_node(const dBGraph *db_graph, size_t ntries) for(i = 0; i < ntries; i++) { hkey = (hkey_t)((rand() / (double)RAND_MAX) * capacity); - if(HASH_ENTRY_ASSIGNED(table[hkey])) return hkey; + if(hash_table_assigned(ht, hkey)) return hkey; } return HASH_NOT_FOUND; @@ -720,7 +719,7 @@ void db_graph_print_kmer2(BinaryKmer bkmer, Covg *covgs, Edges *edges, void db_graph_print_kmer(hkey_t node, dBGraph *db_graph, FILE *fout) { - BinaryKmer bkmer = db_node_get_bkmer(db_graph, node); + BinaryKmer bkmer = db_node_get_bkey(db_graph, node); Covg *covgs = &db_node_covg(db_graph, node, 0); Edges *edges = &db_node_edges(db_graph, node, 0); diff --git a/src/graph/db_graph.h b/src/graph/db_graph.h index 489de132..de61a110 100644 --- a/src/graph/db_graph.h +++ b/src/graph/db_graph.h @@ -56,7 +56,7 @@ typedef struct } dBGraph; #define db_graph_has_path_hash(graph) ((graph)->gphash.table != NULL) -#define db_graph_node_assigned(graph,hkey) HASH_ENTRY_ASSIGNED((graph)->ht.table[hkey]) +#define db_graph_node_assigned(graph,hkey) hash_table_assigned(&(graph)->ht, hkey) // alloc_flags specifies where fields to malloc. 
OR together DBG_ALLOC_* values void db_graph_alloc(dBGraph *db_graph, size_t kmer_size, diff --git a/src/graph/db_node.c b/src/graph/db_node.c index c868d690..077bc6db 100644 --- a/src/graph/db_node.c +++ b/src/graph/db_node.c @@ -214,7 +214,7 @@ void db_nodes_left_shift(dBNode *nlist, size_t n, size_t shift) size_t db_node_to_str(const dBGraph *db_graph, dBNode node, char *str) { const size_t kmer_size = db_graph->kmer_size; - BinaryKmer bkmer = db_node_get_bkmer(db_graph, node.key); + BinaryKmer bkmer = db_node_get_bkey(db_graph, node.key); binary_kmer_to_str(bkmer, kmer_size, str); str[kmer_size] = ':'; str[kmer_size+1] = '0' + node.orient; @@ -230,7 +230,7 @@ size_t db_nodes_to_str(const dBNode *nodes, size_t num, size_t i; size_t kmer_size = db_graph->kmer_size; - BinaryKmer bkmer = db_node_get_bkmer(db_graph, nodes[0].key); + BinaryKmer bkmer = db_node_get_bkey(db_graph, nodes[0].key); Nucleotide nuc; binary_kmer_to_str(bkmer, kmer_size, str); @@ -308,14 +308,14 @@ void db_nodes_print_verbose(const dBNode *nodes, size_t num, BinaryKmer bkmer, bkey; char kmerstr[MAX_KMER_SIZE+1], keystr[MAX_KMER_SIZE+1]; - bkmer = db_node_get_bkmer(db_graph, nodes[0].key); + bkmer = db_node_get_bkey(db_graph, nodes[0].key); bkey = db_node_oriented_bkmer(db_graph, nodes[0]); binary_kmer_to_str(bkmer, kmer_size, kmerstr); binary_kmer_to_str(bkey, kmer_size, keystr); fprintf(out, "%3zu: %s:%i %s\n", (size_t)0, kmerstr, (int)nodes[0].orient, keystr); for(i = 1; i < num; i++) { - bkmer = db_node_get_bkmer(db_graph, nodes[i].key); + bkmer = db_node_get_bkey(db_graph, nodes[i].key); bkey = db_node_oriented_bkmer(db_graph, nodes[i]); binary_kmer_to_str(bkmer, kmer_size, kmerstr); binary_kmer_to_str(bkey, kmer_size, keystr); @@ -363,7 +363,7 @@ bool db_node_check_nodes(const dBNode *nodes, size_t num, bkmer1 = db_node_oriented_bkmer(db_graph, nodes[i+1]); nuc = binary_kmer_last_nuc(bkmer1); tmp = binary_kmer_left_shift_add(bkmer0, kmer_size, nuc); - 
ctx_assert_ret(binary_kmers_are_equal(tmp, bkmer1)); + ctx_assert_ret(binary_kmer_eq(tmp, bkmer1)); bkmer0 = bkmer1; } diff --git a/src/graph/db_node.h b/src/graph/db_node.h index 4d51aad1..1f5e8336 100644 --- a/src/graph/db_node.h +++ b/src/graph/db_node.h @@ -21,13 +21,13 @@ static inline uint64_t db_node_hash(dBNode node) { // // Get Binary kmers // -static inline BinaryKmer db_node_get_bkmer(const dBGraph *db_graph, hkey_t hkey) { - return db_graph->ht.table[hkey]; +static inline BinaryKmer db_node_get_bkey(const dBGraph *db_graph, hkey_t hkey) { + return hash_table_fetch(&db_graph->ht, hkey); } // Get an oriented bkmer #define db_node_oriented_bkmer(graph,node) \ - bkmer_oriented_bkmer(db_node_get_bkmer(graph,(node).key), \ + bkmer_oriented_bkmer(db_node_get_bkey(graph,(node).key), \ (node).orient, (graph)->kmer_size) // @@ -107,7 +107,7 @@ static inline void db_node_set_col_mt(const dBGraph *graph, // #define bkmer_get_orientation(bkmer,bkey) \ - (binary_kmers_are_equal((bkmer), (bkey)) ? FORWARD : REVERSE) + (binary_kmer_eq((bkmer), (bkey)) ? FORWARD : REVERSE) #define bkmer_oriented_bkmer(bkmer,or,ksize) \ (or == FORWARD ? 
bkmer : binary_kmer_reverse_complement(bkmer,ksize)) @@ -135,11 +135,11 @@ static inline void db_node_set_col_mt(const dBGraph *graph, #define opposite_orientation(or) rev_orient(or) #define db_node_get_first_nuc(node,graph) \ - bkmer_get_first_nuc(db_node_get_bkmer(graph,(node).key), (node).orient,\ + bkmer_get_first_nuc(db_node_get_bkey(graph,(node).key), (node).orient,\ (graph)->kmer_size) #define db_node_get_last_nuc(node,graph) \ - bkmer_get_last_nuc(db_node_get_bkmer(graph,(node).key), (node).orient,\ + bkmer_get_last_nuc(db_node_get_bkey(graph,(node).key), (node).orient,\ (graph)->kmer_size) static inline dBNode db_node_reverse(dBNode node) { @@ -202,12 +202,13 @@ bool edges_has_precisely_one_edge(Edges edges, Orientation orientation, // 1=>A, 2=>C, 4=>G, 8=>T // "3b" => [AC] AACTA [ACT] // Null terminates string -static inline void edges_to_char(Edges e, char str[3]) +static inline char* edges_to_char(Edges e, char str[3]) { static const char digits[16] = "0123456789abcdef"; str[0] = digits[edges_as_nibble(e, REVERSE)]; str[1] = digits[edges_as_nibble(e, FORWARD)]; str[2] = '\0'; + return str; } static inline void edges_print(FILE *fout, Edges e) @@ -280,9 +281,6 @@ char* db_node_get_edges_str(Edges edges, char *kmer_col_edge_str); // Coverages // -#define SAFE_ADD_COVG(a,b) ((uint64_t)(a)+(b) > COVG_MAX ? 
COVG_MAX : (a)+(b)) -#define SAFE_SUM_COVG(a,b) ((a) = SAFE_ADD_COVG((a), (b))) - #define db_node_covg(graph,hkey,col) \ ((graph)->col_covgs[(hkey)*(graph)->num_of_cols+(col)]) diff --git a/src/graph/db_unitig.c b/src/graph/db_unitig.c index 07f02e19..f1d2b63c 100644 --- a/src/graph/db_unitig.c +++ b/src/graph/db_unitig.c @@ -26,9 +26,9 @@ static bool db_unitig_is_closed_cycle(dBNode n0, BinaryKmer bkey0, if(!edges_has_edge(edges1, nuc, n1.orient)) return false; shiftkmer = bkmer_shift_add_last_nuc(bkey1, n1.orient, kmer_size, nuc); - if(binary_kmers_are_equal(bkey0, shiftkmer)) return true; + if(binary_kmer_eq(bkey0, shiftkmer)) return true; shiftkmer = binary_kmer_reverse_complement(shiftkmer, kmer_size); - return binary_kmers_are_equal(bkey0, shiftkmer); + return binary_kmer_eq(bkey0, shiftkmer); } // Orient unitig @@ -44,8 +44,8 @@ void db_unitig_normalise(dBNode *nlist, size_t len, const dBGraph *db_graph) return; } - BinaryKmer bkey0 = db_node_get_bkmer(db_graph, nlist[0].key); - BinaryKmer bkey1 = db_node_get_bkmer(db_graph, nlist[len-1].key); + BinaryKmer bkey0 = db_node_get_bkey(db_graph, nlist[0].key); + BinaryKmer bkey1 = db_node_get_bkey(db_graph, nlist[len-1].key); // Check if closed cycle if(db_unitig_is_closed_cycle(nlist[0], bkey0, nlist[len-1], bkey1, db_graph)) @@ -54,8 +54,8 @@ void db_unitig_normalise(dBNode *nlist, size_t len, const dBGraph *db_graph) BinaryKmer lowest = bkey0, tmp; size_t i, lowidx = 0; for(i = 1; i < len; i++) { - tmp = db_node_get_bkmer(db_graph, nlist[i].key); - if(binary_kmer_less_than(tmp, lowest)) { + tmp = db_node_get_bkey(db_graph, nlist[i].key); + if(binary_kmer_lt(tmp, lowest)) { lowest = tmp; lowidx = i; } @@ -77,7 +77,7 @@ void db_unitig_normalise(dBNode *nlist, size_t len, const dBGraph *db_graph) } } } - else if(binary_kmer_less_than(bkey1, bkey0)) { + else if(binary_kmer_lt(bkey1, bkey0)) { db_nodes_reverse_complement(nlist, len); } } @@ -87,7 +87,7 @@ void db_unitig_normalise(dBNode *nlist, size_t len, const 
dBGraph *db_graph) // Returns the number of nodes added, adds no more than `limit` // return false if out of space and limit > 0 bool db_unitig_extend(dBNodeBuffer *nbuf, size_t limit, - const dBGraph *db_graph) + const dBGraph *db_graph) { ctx_assert(nbuf->len > 0); diff --git a/src/graph/graph_cache.c b/src/graph/graph_cache.c index 8d6c6e2a..09bb3952 100644 --- a/src/graph/graph_cache.c +++ b/src/graph/graph_cache.c @@ -81,13 +81,13 @@ static inline void gc_create_unitig(GraphCache *cache, dBNode node, // prev nodes union_edges = db_node_get_edges_union(db_graph, first.key); - bkmer0 = db_node_get_bkmer(db_graph, first.key); + bkmer0 = db_node_get_bkey(db_graph, first.key); num_prev = db_graph_next_nodes(db_graph, bkmer0, first.orient, union_edges, prev_nodes, prev_bases); // next nodes union_edges = db_node_get_edges_union(db_graph, last.key); - bkmer1 = db_node_get_bkmer(db_graph, last.key); + bkmer1 = db_node_get_bkey(db_graph, last.key); num_next = db_graph_next_nodes(db_graph, bkmer1, last.orient, union_edges, next_nodes, next_bases); diff --git a/src/graph/graph_file_reader.c b/src/graph/graph_file_reader.c index 8e3d0aea..85e3dcb9 100644 --- a/src/graph/graph_file_reader.c +++ b/src/graph/graph_file_reader.c @@ -4,6 +4,17 @@ #include "cmd.h" #include "file_util.h" +// Buffer size `bufsize` is in bytes +void graph_file_set_buffered(GraphFileReader *file, size_t bufsize) +{ + if(graph_file_is_buffered(file) == (bufsize>0)) return; + if(bufsize) strm_buf_alloc(&file->strm, bufsize); + else { + fseek(file->fh, (off_t)file->strm.begin - file->strm.end, SEEK_CUR); + strm_buf_dealloc(&file->strm); + } +} + int graph_file_fseek(GraphFileReader *file, off_t offset, int whence) { if(file_filter_isstdin(&file->fltr)) die("Cannot fseek on STDIN"); @@ -21,23 +32,24 @@ off_t graph_file_ftell(GraphFileReader *file) return ftell(file->fh); } -size_t gfr_fread_bytes(GraphFileReader *file, void *ptr, size_t size) +// read `n` bytes from `file` into `ptr` +size_t 
graph_file_fread(GraphFileReader *file, void *ptr, size_t n) { - size_t n; + size_t nread; if(graph_file_is_buffered(file)) - n = fread_buf(file->fh, ptr, size, &file->strm); + nread = fread_buf(file->fh, ptr, n, &file->strm); else - n = fread2(file->fh, ptr, size); + nread = fread2(file->fh, ptr, n); // check for error if(ferror(file->fh)) die("File error: %s [%s]", strerror(errno), file_filter_path(&file->fltr)); - return n; + return nread; } // Read an element from the graph file #define _gfread(gfile,ptr,size,desc) \ do { \ - size_t _n = gfr_fread_bytes(gfile, ptr, size); \ + size_t _n = graph_file_fread(gfile, ptr, size); \ const char *_path = file_filter_path(&(gfile)->fltr); \ if(_n != (size)) { \ die("Couldn't read '%s': expected %zu; recieved: %zu; [file: %s]\n",\ @@ -342,7 +354,7 @@ size_t graph_file_read_raw(GraphFileReader *file, int num_bytes_read; char kstr[MAX_KMER_SIZE+1]; - num_bytes_read = gfr_fread_bytes(file, bkmer->b, sizeof(BinaryKmer)); + num_bytes_read = graph_file_fread(file, bkmer->b, sizeof(BinaryKmer)); if(num_bytes_read == 0) return 0; if(num_bytes_read != (int)(sizeof(uint64_t)*h->num_of_bitfields)) diff --git a/src/graph/graph_file_reader.h b/src/graph/graph_file_reader.h index 64256bdb..dfd68b1c 100644 --- a/src/graph/graph_file_reader.h +++ b/src/graph/graph_file_reader.h @@ -30,11 +30,22 @@ madcrow_buffer(gfile_buf, GraphFileBuffer, GraphFileReader); // Returns 0 if not set instead of -1 #define graph_file_nkmers(rdr) ((uint64_t)MAX2((rdr)->num_of_kmers, 0)) +// Get file offset of a given kmer +static inline off_t graph_file_offset(const GraphFileReader *gfr, size_t i) +{ + size_t s = sizeof(BinaryKmer)+gfr->fltr.srcncols*(sizeof(Covg)+sizeof(Edges)); + return gfr->hdr_size + s*i; +} + #define graph_file_is_buffered(file) ((file)->strm.b != NULL) +// Buffer size `bufsize` is in bytes +void graph_file_set_buffered(GraphFileReader *file, size_t bufsize); + int graph_file_fseek(GraphFileReader *file, off_t offset, int whence); off_t 
graph_file_ftell(GraphFileReader *file); -size_t gfr_fread_bytes(GraphFileReader *file, void *ptr, size_t size); +// read `n` bytes from `file` into `ptr` +size_t graph_file_fread(GraphFileReader *file, void *ptr, size_t n); // Open file // if cannot open file returns 0 @@ -62,8 +73,7 @@ size_t graph_file_read_raw(GraphFileReader *rdr, // Read a kmer from the file // returns true on success, false otherwise // prints warnings if dirty kmers in file -// Beware: this function does not use file.intocol so you may wish to pass: -// graph_file_read(file, &bkmer, covgs+file.intocol, edges+file.intocol); +// be sure to zero covgs, edges before reading in bool graph_file_read(GraphFileReader *file, BinaryKmer *bkmer, Covg *covgs, Edges *edges); diff --git a/src/graph/graph_search.c b/src/graph/graph_search.c new file mode 100644 index 00000000..168ce976 --- /dev/null +++ b/src/graph/graph_search.c @@ -0,0 +1,165 @@ +#include "global.h" +#include "graph_search.h" + +struct GraphFileSearch { + GraphFileReader *file; + size_t nkmers, ncols, entrysize; // nkmers in file, size of kmer entry in file + BinaryKmer *index; + size_t blocksize, nblocks; + void *block; // read file into block to linear search +}; + +// #define INDEX_SIZE 4 /* debugging */ +#define INDEX_SIZE 4*1024*1024 /* 4M */ +#define MAX_LIN_SEARCH 512 + +/* with MAX_LIN_SEARCH of 512, 1 MiB allows 227 colours to be loaded */ + +GraphFileSearch *graph_search_new(GraphFileReader *file) +{ + if(file->num_of_kmers < 0) { + warn("Cannot open GraphFileSearch with file stream"); + return NULL; + } + size_t i; + GraphFileSearch *gs = ctx_calloc(sizeof(GraphFileSearch), 1); + gs->file = file; + gs->nkmers = file->num_of_kmers; + gs->ncols = file->hdr.num_of_cols; + gs->entrysize = sizeof(BinaryKmer) + gs->ncols * (sizeof(Covg)+sizeof(Edges)); + gs->nblocks = MIN2(gs->nkmers, INDEX_SIZE); + gs->blocksize = gs->nkmers / gs->nblocks; + gs->nblocks = (gs->nkmers+gs->blocksize-1) / gs->blocksize; + gs->index = 
ctx_calloc(sizeof(BinaryKmer), gs->nblocks+1); // sentinel + gs->block = ctx_calloc(MAX_LIN_SEARCH * gs->entrysize, 1); + memset(gs->index[gs->nblocks].b,0xff,BKMER_BYTES); // sentinel kmer + status("[graph_search] on-disk-graph %zu cols %zu blocks %zu bsize %zu kmers" + " building...", gs->ncols, gs->nblocks, gs->blocksize, gs->nkmers); + graph_file_set_buffered(file, 0); // Turn OFF buffered input + for(i = 0; i < gs->nblocks; i++) { + graph_file_fseek(file, graph_file_offset(file, i*gs->blocksize), SEEK_SET); + if(graph_file_fread(file, &gs->index[i], sizeof(BinaryKmer)) != sizeof(BinaryKmer)) + die("Cannot index graph: %s", file_filter_path(&gs->file->fltr)); + } + // check file is sorted + for(i = 0; i+1 < gs->nblocks; i++) + if(!binary_kmer_lt(gs->index[i],gs->index[i+1])) + die("File is not sorted: %s", file_filter_path(&file->fltr)); + status("[graph_search] Index built."); + return gs; +} + +// We don't close the file +void graph_search_destroy(GraphFileSearch *gs) +{ + ctx_free(gs->index); + ctx_free(gs->block); + ctx_free(gs); +} + +// bkmers[n] must be a sentinel kmer (i.e. 
MAX_KMER) +static inline int binary_search_index(BinaryKmer bkey, + const BinaryKmer *bkmers, size_t n) +{ + int l = 0, r = n, mid; + while(l < r) { + mid = (l+r)/2; + if(binary_kmer_le(bkmers[mid],bkey)) { + if(binary_kmer_lt(bkey,bkmers[mid+1])) return mid; + else l = mid+1; + } + else r = mid; + } + return -1; +} + +// Return pointer to block of Covgs+Edges +static inline void* search_file_sec(GraphFileSearch *gs, BinaryKmer bkey, + size_t start, size_t end) +{ + const size_t hdrsize = gs->file->hdr_size; + size_t mid; + BinaryKmer bmid; + // Binary search + while(start + MAX_LIN_SEARCH < end) { + mid = (start+end) / 2; + graph_file_fseek(gs->file, hdrsize+gs->entrysize*mid, SEEK_SET); + if(graph_file_fread(gs->file, gs->block, gs->entrysize) != gs->entrysize) + die("Cannot search graph from disk: %s", file_filter_path(&gs->file->fltr)); + memcpy(bmid.b, gs->block, sizeof(BinaryKmer)); // copy binary kmer + if(binary_kmer_eq(bkey,bmid)) return gs->block; + if(binary_kmer_lt(bkey,bmid)) end = mid; + else start = mid + 1; + } + // Linear search + size_t blockmem = gs->entrysize*(end-start); + graph_file_fseek(gs->file, hdrsize+gs->entrysize*start, SEEK_SET); + if(graph_file_fread(gs->file, gs->block, blockmem) != blockmem) + die("Cannot search graph from disk: %s", file_filter_path(&gs->file->fltr)); + char *p, *endp = (char*)gs->block + blockmem; + for(p = gs->block; p < endp; p += gs->entrysize) + { + memcpy(bmid.b, p, sizeof(BinaryKmer)); + if(binary_kmer_eq(bkey,bmid)) return p; + if(binary_kmer_lt(bkey,bmid)) return NULL; + } + return NULL; +} + +// Given an entry from a graph file, load edges and coverage +static inline void filter_covgs_edges(const FileFilter *fltr, + Covg *covgs, Edges *edges, + const void *ptr) +{ + size_t from, into, i; + const char *allcovgs = (const char*)ptr + sizeof(BinaryKmer); + const char *alledges = (const char*)allcovgs + fltr->srcncols*sizeof(Covg); + Covg c; + Edges e; + memset(covgs, 0, file_filter_into_ncols(fltr) * 
sizeof(Covg)); + for(i = 0; i < file_filter_num(fltr); i++) { + from = file_filter_fromcol(fltr, i); + into = file_filter_intocol(fltr, i); + memcpy(&c, allcovgs+sizeof(Covg)*from, sizeof(Covg)); + covgs[into] = SAFE_ADD_COVG(covgs[into], c); + } + memset(edges, 0, file_filter_into_ncols(fltr) * sizeof(Edges)); + for(i = 0; i < file_filter_num(fltr); i++) { + from = file_filter_fromcol(fltr,i); + into = file_filter_intocol(fltr, i); + memcpy(&e, alledges+sizeof(Edges)*from, sizeof(Edges)); + edges[into] |= e; + } +} + +bool graph_search_find(GraphFileSearch *gs, BinaryKmer bkey, + Covg *covgs, Edges *edges) +{ + char *ptr; + // Binary search on the index + long x = binary_search_index(bkey,gs->index,gs->nblocks); + if(x < 0) return false; + size_t blockstart = x*gs->blocksize; + size_t blockend = (size_t)x+1 < gs->nblocks ? blockstart+gs->blocksize : gs->nkmers; + if((ptr = search_file_sec(gs, bkey, blockstart, blockend)) == NULL) return false; + filter_covgs_edges(&gs->file->fltr, covgs, edges, ptr); + return true; +} + +void graph_search_fetch(GraphFileSearch *gs, size_t idx, BinaryKmer *bkey, + Covg *covgs, Edges *edges) +{ + graph_file_fseek(gs->file, gs->file->hdr_size+gs->entrysize*idx, SEEK_SET); + // read one entry + if(graph_file_fread(gs->file, gs->block, gs->entrysize) != gs->entrysize) + die("Cannot search graph from disk: %s", file_filter_path(&gs->file->fltr)); + memcpy(bkey, gs->block, sizeof(BinaryKmer)); // copy binary kmer + filter_covgs_edges(&gs->file->fltr, covgs, edges, gs->block); +} + +void graph_search_rand(GraphFileSearch *gs, + BinaryKmer *bkey, Covg *covgs, Edges *edges) +{ + size_t idx = (rand() / (double)RAND_MAX) * gs->nkmers; + graph_search_fetch(gs, idx, bkey, covgs, edges); +} diff --git a/src/graph/graph_search.h b/src/graph/graph_search.h new file mode 100644 index 00000000..a03295d4 --- /dev/null +++ b/src/graph/graph_search.h @@ -0,0 +1,26 @@ +#ifndef GRAPH_SEARCH_H_ +#define GRAPH_SEARCH_H_ + +#include "cortex_types.h" 
+#include "binary_kmer.h" +#include "graph_file_reader.h" + +// +// Search a sorted graph file on disk +// + +typedef struct GraphFileSearch GraphFileSearch; + +GraphFileSearch *graph_search_new(GraphFileReader *file); +void graph_search_destroy(GraphFileSearch *gs); + +bool graph_search_find(GraphFileSearch *gs, BinaryKmer bkey, + Covg *covgs, Edges *edges); + +void graph_search_fetch(GraphFileSearch *gs, size_t idx, + BinaryKmer *bkey, Covg *covgs, Edges *edges); + +void graph_search_rand(GraphFileSearch *gs, + BinaryKmer *bkey, Covg *covgs, Edges *edges); + +#endif /* GRAPH_SEARCH_H_ */ diff --git a/src/graph/graph_walker.c b/src/graph/graph_walker.c index 7375e9bd..29d4f795 100644 --- a/src/graph/graph_walker.c +++ b/src/graph/graph_walker.c @@ -191,7 +191,7 @@ static inline size_t pickup_paths(GraphWalker *wlk, dBNode node, #ifdef DEBUG_WALKER char bkey_str[MAX_KMER_SIZE+1], node_str[MAX_KMER_SIZE+1]; - BinaryKmer node_bkey = db_node_get_bkmer(wlk->db_graph, node.key); + BinaryKmer node_bkey = db_node_get_bkey(wlk->db_graph, node.key); binary_kmer_to_str(wlk->bkey, wlk->db_graph->kmer_size, bkey_str); binary_kmer_to_str(node_bkey, wlk->db_graph->kmer_size, node_str); status(" pickup_paths(): %s:%i node:%s:%i picked up %zu %s paths cntr_filter_nuc0:%i", @@ -243,7 +243,7 @@ void graph_walker_start(GraphWalker *wlk, dBNode node) wlk->last_step.idx = -1; // Get binary kmer - wlk->bkey = db_node_get_bkmer(wlk->db_graph, node.key); + wlk->bkey = db_node_get_bkey(wlk->db_graph, node.key); #ifdef DEBUG_WALKER char kmer_str[MAX_KMER_SIZE+1]; @@ -548,7 +548,7 @@ static void _graph_walker_force_jump(GraphWalker *wlk, (int)wlk->last_step.status, (int)is_fork); // Update GraphWalker position - wlk->bkey = db_node_get_bkmer(db_graph, node.key); + wlk->bkey = db_node_get_bkey(db_graph, node.key); wlk->node = node; if(is_fork) @@ -784,7 +784,7 @@ bool graph_walker_agrees_contig(GraphWalker *wlk, bkmer0 = binary_kmer_left_shift_one_base(bkmer0, wlk->db_graph->kmer_size); 
binary_kmer_set_last_nuc(&bkmer1, 0); - if(!binary_kmers_are_equal(bkmer0, bkmer1)) + if(!binary_kmer_eq(bkmer0, bkmer1)) { char bstr0[MAX_KMER_SIZE+1], bstr1[MAX_KMER_SIZE+1]; binary_kmer_to_str(bkmer0, wlk->db_graph->kmer_size, bstr0); @@ -793,7 +793,7 @@ bool graph_walker_agrees_contig(GraphWalker *wlk, printf("wlk: %s contig: %s num_nodes %zu\n", bstr0, bstr1, num_nodes); } - ctx_check(binary_kmers_are_equal(bkmer0, bkmer1)); + ctx_check(binary_kmer_eq(bkmer0, bkmer1)); #endif dBNode nodes[4]; diff --git a/src/graph/graph_writer.c b/src/graph/graph_writer.c index 586a214a..5bd72813 100644 --- a/src/graph/graph_writer.c +++ b/src/graph/graph_writer.c @@ -132,7 +132,7 @@ static inline void graph_write_kmer_direct(hkey_t hkey, FILE *fh, const dBGraph *db_graph) { graph_write_kmer(fh, hdr->num_of_cols, - db_graph->ht.table[hkey], + hash_table_fetch(&db_graph->ht, hkey), &db_node_covg(db_graph, hkey, 0), &db_node_edges(db_graph, hkey, 0)); } @@ -145,7 +145,7 @@ static void graph_write_kmer_indirect(hkey_t hkey, const GraphFileHeader *hdr, size_t *num_dumped) { size_t i, into, from; - BinaryKmer bkmer = db_node_get_bkmer(db_graph, hkey); + BinaryKmer bkmer = db_node_get_bkey(db_graph, hkey); Covg covgs[hdr->num_of_cols], merge_covgs = 0; Edges edges[hdr->num_of_cols], merge_edges = 0; @@ -189,7 +189,7 @@ size_t graph_write_all_kmers_direct(FILE *fh, const dBGraph *db_graph, HASH_ITERATE(&db_graph->ht, graph_write_kmer_direct, hdr, fh, db_graph); } - return db_graph->ht.num_kmers; + return hash_table_nkmers(&db_graph->ht); } size_t graph_write_all_kmers_filtered(FILE *fh, const dBGraph *db_graph, @@ -289,7 +289,7 @@ static inline void _dump_empty_bkmer(hkey_t hkey, const dBGraph *db_graph, char *buf, size_t mem, FILE *fh) { size_t written; - const BinaryKmer bkmer = db_node_get_bkmer(db_graph, hkey); + const BinaryKmer bkmer = db_node_get_bkey(db_graph, hkey); written = fwrite(bkmer.b, 1, sizeof(BinaryKmer), fh) + fwrite(buf, 1, mem, fh); if(written != 
mem+sizeof(BinaryKmer)) die("Couldn't write to file"); @@ -311,7 +311,7 @@ static size_t graph_write_empty(const dBGraph *db_graph, FILE *fh, } else { HASH_ITERATE(&db_graph->ht, _dump_empty_bkmer, db_graph, buf, mem, fh); } - return db_graph->ht.num_kmers * (sizeof(BinaryKmer) + mem); + return hash_table_nkmers(&db_graph->ht) * (sizeof(BinaryKmer) + mem); } /*! @@ -336,7 +336,7 @@ static void graph_writer_update_file_kmers(const dBGraph *db_graph, ctx_assert(db_graph->num_of_cols == db_graph->num_edge_cols); ctx_assert(first_graphcol+ngraphcols <= db_graph->num_of_cols); - // db_graph->ht.num_kmers is also the number of kmers in the file + // hash_table_nkmers(&db_graph->ht) is also the number of kmers in the file // We are just overwriting some of the coverages and edges for colours // first_filecol..(first_filecol+ngraphcols) not including last size_t max_block_size = 16 * ONE_MEGABYTE; @@ -356,9 +356,9 @@ static void graph_writer_update_file_kmers(const dBGraph *db_graph, // Read block of kmers from the file // iterate over the hash table figuring which ones they are - while(nkmers_printed < db_graph->ht.num_kmers) + while(nkmers_printed < hash_table_nkmers(&db_graph->ht)) { - nkmers = MIN2(db_graph->ht.num_kmers - nkmers_printed, kmers_per_block); + nkmers = MIN2(hash_table_nkmers(&db_graph->ht) - nkmers_printed, kmers_per_block); nbytes = nkmers*filekmersize; if(fread(mem, 1, nbytes, fh) != nbytes) die("Cannot read: %s", path); memptr = mem; @@ -639,11 +639,11 @@ size_t graph_writer_merge(const char *out_ctx_path, file_filter_close(&origfltr); // Print output status - graph_writer_print_status(db_graph->ht.num_kmers, hdr->num_of_cols, + graph_writer_print_status(hash_table_nkmers(&db_graph->ht), hdr->num_of_cols, out_ctx_path, hdr->version); } - return db_graph->ht.num_kmers; + return hash_table_nkmers(&db_graph->ht); } // if intersect_gname != NULL: only load kmers that are already in the hash table diff --git a/src/graph/graphs_load.c 
b/src/graph/graphs_load.c index 679a1fb9..5dd7e790 100644 --- a/src/graph/graphs_load.c +++ b/src/graph/graphs_load.c @@ -43,8 +43,38 @@ void graph_loading_print_status(const GraphFileReader *file) } } -// if only_load_if_in_colour is >= 0, only kmers with coverage in existing -// colour only_load_if_in_colour will be loaded. +// Load ginfo from file header into the graph and check compatible +void graph_load_ginfo(dBGraph *graph, GraphFileReader *file) +{ + FileFilter *fltr = &file->fltr; + GraphInfo *ginfo = graph->ginfo; + GraphFileHeader *hdr = &file->hdr; + size_t i, ncols = file_filter_into_ncols(fltr), fromcol, intocol; + + // Check we can load this graph file into db_graph (kmer size + num colours) + if(hdr->kmer_size != graph->kmer_size) + { + die("Graph has different kmer size [kmer_size: %u vs %zu; path: %s]", + hdr->kmer_size, graph->kmer_size, fltr->path.b); + } + + if(ncols > graph->num_of_cols) + { + die("Program has not assigned enough colours! " + "[colours in graph: %zu vs file: %zu; path: %s]", + graph->num_of_cols, ncols, fltr->path.b); + } + + for(i = 0; i < file_filter_num(fltr); i++) { + fromcol = file_filter_fromcol(fltr, i); + intocol = file_filter_intocol(fltr, i); + graph_info_merge(ginfo+intocol, hdr->ginfo+fromcol); + } + + // Update number of colours loaded + graph->num_of_cols_used = MAX2(graph->num_of_cols_used, ncols); +} + // We assume only_load_if_in_colour < load_first_colour_into // if all_kmers_are_unique != 0 an error is thrown if a node already exists // If stats != NULL, updates: @@ -58,10 +88,8 @@ size_t graph_load(GraphFileReader *file, const GraphLoadingPrefs prefs, GraphLoadingStats *stats) { dBGraph *graph = prefs.db_graph; - GraphInfo *ginfo = graph->ginfo; - size_t i, ncols = file_filter_into_ncols(&file->fltr), fromcol, intocol; FileFilter *fltr = &file->fltr; - GraphFileHeader *hdr = &file->hdr; + size_t i, ncols = file_filter_into_ncols(fltr); ctx_assert(file_filter_num(fltr) > 0); @@ -75,28 +103,8 @@ size_t 
graph_load(GraphFileReader *file, const GraphLoadingPrefs prefs, die("fseek failed: %s", strerror(errno)); } - // Check we can load this graph file into db_graph (kmer size + num colours) - if(hdr->kmer_size != graph->kmer_size) - { - die("Graph has different kmer size [kmer_size: %u vs %zu; path: %s]", - hdr->kmer_size, graph->kmer_size, fltr->path.b); - } - - if(ncols > graph->num_of_cols) - { - die("Program has not assigned enough colours! " - "[colours in graph: %zu vs file: %zu; path: %s]", - graph->num_of_cols, ncols, fltr->path.b); - } - - for(i = 0; i < file_filter_num(fltr); i++) { - fromcol = file_filter_fromcol(fltr, i); - intocol = file_filter_intocol(fltr, i); - graph_info_merge(ginfo+intocol, hdr->ginfo+fromcol); - } - - // Update number of colours loaded - graph->num_of_cols_used = MAX2(graph->num_of_cols_used, ncols); + // Load ginfo from file header into the graph and check compatible + graph_load_ginfo(graph, file); // Read kmers, align colours to those they are updating // e.g. covgs[i] -> colour i in the graph diff --git a/src/graph/graphs_load.h b/src/graph/graphs_load.h index 3329702f..93f1334f 100644 --- a/src/graph/graphs_load.h +++ b/src/graph/graphs_load.h @@ -46,8 +46,9 @@ static inline GraphLoadingPrefs graph_loading_prefs(dBGraph *graph) // Print loading message void graph_loading_print_status(const GraphFileReader *file); -// if only_load_if_in_colour is >= 0, only kmers with coverage in existing -// colour only_load_if_in_colour will be loaded. 
+// Load ginfo from file header into the graph and check compatible +void graph_load_ginfo(dBGraph *graph, GraphFileReader *file); + // if clean_colours != 0 an error is thrown if a node already exists // returns the number of colours in the binary // If stats != NULL, updates: diff --git a/src/graph/hash_table.c b/src/graph/hash_table.c index bc7111b0..3f37bf64 100644 --- a/src/graph/hash_table.c +++ b/src/graph/hash_table.c @@ -9,11 +9,7 @@ // Hash table prefetching doesn't appear to be faster // #define HASH_PREFETCH 1 -static const BinaryKmer unset_bkmer = {.b = {UNSET_BKMER_WORD}}; - #define ht_bckt_ptr(ht,bckt) ((ht)->table + (size_t)bckt * (ht)->bucket_size) -#define hash_table_bsize(ht,bkt) ((ht)->buckets[bkt][HT_BSIZE]) -#define hash_table_bitems(ht,bkt) ((ht)->buckets[bkt][HT_BITEMS]) #define hash_table_bsize_mt(ht,bkt) (*(volatile uint8_t*)&ht->buckets[bkt][HT_BSIZE]) #define hash_table_bitems_mt(ht,bkt) (*(volatile uint8_t*)&ht->buckets[bkt][HT_BITEMS]) @@ -38,12 +34,9 @@ void hash_table_alloc(HashTable *ht, uint64_t req_capacity) // calloc is required for bucket_data to set the first element of each bucket // to the 0th pos - BinaryKmer *table = ctx_malloc(capacity * sizeof(BinaryKmer)); + BinaryKmer *table = ctx_calloc(capacity, sizeof(BinaryKmer)); uint8_t (*const buckets)[2] = ctx_calloc(num_of_buckets, sizeof(uint8_t[2])); - size_t i; - for(i = 0; i < capacity; i++) table[i] = unset_bkmer; - HashTable data = { .table = table, .num_of_buckets = num_of_buckets, @@ -66,9 +59,7 @@ void hash_table_dealloc(HashTable *hash_table) void hash_table_empty(HashTable *const ht) { - size_t i; - BinaryKmer *table = ht->table; - for(i = 0; i < ht->capacity; i++) table[i] = unset_bkmer; + memset(ht->table, 0, ht->capacity * sizeof(BinaryKmer)); memset(ht->buckets, 0, ht->num_of_buckets * sizeof(uint8_t[2])); HashTable data = { @@ -86,43 +77,30 @@ void hash_table_empty(HashTable *const ht) static inline const BinaryKmer* hash_table_find_in_bucket(const HashTable 
*const ht, uint_fast32_t bucket, - const BinaryKmer bkmer) + BinaryKmer bkmer) { const BinaryKmer *ptr = ht_bckt_ptr(ht, bucket); const BinaryKmer *end = ptr + hash_table_bsize(ht, bucket); + bkmer.b[0] |= BKMER_SET_FLAG; // mark as assigned in the hash table while(ptr < end) { - if(binary_kmers_are_equal(bkmer, *ptr)) return ptr; + if(binary_kmer_eq(bkmer, *ptr)) return ptr; ptr++; } return NULL; // Not found } -// static inline const BinaryKmer* hash_table_find_in_bucket_mt(const HashTable *const ht, -// uint_fast32_t bucket, -// const BinaryKmer bkmer) -// { -// const BinaryKmer *ptr = ht_bckt_ptr(ht, bucket); -// const BinaryKmer *end = ptr + hash_table_bsize_mt(ht, bucket); - -// while(ptr < end) { -// BinaryKmer tgt = *(volatile const BinaryKmer*)ptr; -// if(binary_kmers_are_equal(bkmer, tgt)) return ptr; -// ptr++; -// } -// return NULL; // Not found -// } - // Remember to increment ht->num_kmers static inline BinaryKmer* hash_table_insert_in_bucket(HashTable *ht, uint_fast32_t bucket, - const BinaryKmer bkmer) + BinaryKmer bkmer) { size_t bsize = hash_table_bsize(ht, bucket); size_t bitems = hash_table_bitems(ht, bucket); ctx_assert(bitems < ht->bucket_size); ctx_assert(bitems <= bsize); BinaryKmer *ptr = ht_bckt_ptr(ht, bucket); + bkmer.b[0] |= BKMER_SET_FLAG; // mark as assigned in the hash table if(bitems == bsize) { ptr += bsize; @@ -138,30 +116,6 @@ static inline BinaryKmer* hash_table_insert_in_bucket(HashTable *ht, return ptr; } -// static inline BinaryKmer* hash_table_insert_in_bucket_mt(HashTable *ht, -// uint_fast32_t bucket, -// const BinaryKmer bkmer) -// { -// size_t bsize = hash_table_bsize_mt(ht, bucket); -// size_t bitems = hash_table_bitems_mt(ht, bucket); -// ctx_assert(bitems < ht->bucket_size); -// ctx_assert(bitems <= bsize); -// BinaryKmer *ptr = ht_bckt_ptr(ht, bucket); - -// if(bitems == bsize) { -// ptr += bsize; -// __sync_add_and_fetch((volatile uint8_t*)&ht->buckets[bucket][HT_BSIZE], 1); -// } -// else { -// // Find an entry 
that has been deleted from this bucket previously -// while(HASH_ENTRY_ASSIGNED(*ptr)) ptr++; -// } - -// *ptr = bkmer; -// __sync_add_and_fetch((volatile uint8_t*)&ht->buckets[bucket][HT_BITEMS], 1); -// return ptr; -// } - #define rehash_error_exit(ht) do { \ ctx_msg_out = stderr; \ hash_table_print_stats(ht); \ @@ -335,7 +289,7 @@ void hash_table_delete(HashTable *const ht, hkey_t pos) ctx_assert(pos != HASH_NOT_FOUND); ctx_assert(HASH_ENTRY_ASSIGNED(ht->table[pos])); - ht->table[pos] = unset_bkmer; + memset(ht->table+pos, 0, sizeof(BinaryKmer)); n = __sync_fetch_and_sub((volatile uint64_t *)&ht->num_kmers, 1); m = __sync_fetch_and_sub((volatile uint8_t *)&ht->buckets[bucket][HT_BITEMS], 1); @@ -358,10 +312,10 @@ void hash_table_print_stats_brief(const HashTable *const ht) ulong_to_str(ht->capacity, capacity_str); ulong_to_str(ht->num_kmers, num_entries_str); - status("[hasht] buckets: %s [2^%zu]; bucket size: %zu; " - "memory: %s; occupancy: %s / %s (%.2f%%)\n", - num_buckets_str, nkeybits, (size_t)ht->bucket_size, mem_str, - num_entries_str, capacity_str, occupancy); + status("[hasht] buckets: %s [2^%zu]; bucket size: %zu; ", + num_buckets_str, nkeybits, (size_t)ht->bucket_size); + status("[hasht] memory: %s; filled: %s / %s (%.2f%%)\n", + mem_str, num_entries_str, capacity_str, occupancy); } void hash_table_print_stats(const HashTable *const ht) @@ -413,6 +367,7 @@ hkey_t* hash_table_sorted(const HashTable *htable) nxt = kmers = ctx_malloc(sizeof(BkmerPtrHkeyUnion) * htable->num_kmers); end = kmers + htable->num_kmers; HASH_ITERATE(htable, _fetch_kmer_union, htable, &nxt); + // Can sort ignoring that the top flag bit is set on all kmers qsort(kmers, htable->num_kmers, sizeof(BinaryKmer*), binary_kmers_qcmp_ptrs); for(nxt = kmers; nxt < end; nxt++) nxt->h = nxt->bptr - htable->table; return (hkey_t*)kmers; diff --git a/src/graph/hash_table.h b/src/graph/hash_table.h index 074d3125..be9db04c 100644 --- a/src/graph/hash_table.h +++ b/src/graph/hash_table.h @@ 
-7,18 +7,17 @@ #include "binary_kmer.h" #include "util.h" -#define UNSET_BKMER_WORD (1UL<<63) - #define HT_BSIZE 0 #define HT_BITEMS 1 #define HASH_NOT_FOUND (UINT64_MAX>>1) -#define HASH_ENTRY_ASSIGNED(bkmer) (!((bkmer).b[0] & UNSET_BKMER_WORD)) +#define BKMER_SET_FLAG (1UL<<63) +#define HASH_ENTRY_ASSIGNED(bkmer) (((bkmer).b[0] & BKMER_SET_FLAG)) // Struct is public so ITERATE macros can operate on it typedef struct { - BinaryKmer *const table; + BinaryKmer *const table; // Do not directly access use hash_table_fetch()! const uint64_t num_of_buckets; // needs to store maximum of 1<<32 const uint_fast32_t hash_mask; // this is num_of_buckets - 1 const uint8_t bucket_size; // max value 255 @@ -33,7 +32,23 @@ typedef struct // Returns NULL if not enough memory void hash_table_alloc(HashTable *htable, uint64_t capacity); -void hash_table_dealloc(HashTable *hash_table); +void hash_table_dealloc(HashTable *ht); + +#define hash_table_size(ht) (ht)->capacity +#define hash_table_nkmers(ht) (ht)->num_kmers +#define hash_table_assigned(ht,key) HASH_ENTRY_ASSIGNED((ht)->table[key]) + +static inline BinaryKmer hash_table_fetch(const HashTable *const ht, hkey_t key) +{ + BinaryKmer bk = ht->table[key]; + bk.b[0] &= 0x3fffffffffffffff; // mask off top two bits + return bk; +} + +#define hash_table_nbuckets(ht) ((ht)->num_of_buckets) +#define hash_table_bucket_size(ht) ((ht)->bucket_size) +#define hash_table_bsize(ht,bkt) ((ht)->buckets[bkt][HT_BSIZE]) +#define hash_table_bitems(ht,bkt) ((ht)->buckets[bkt][HT_BITEMS]) hkey_t hash_table_find(const HashTable *const htable, const BinaryKmer bkmer); hkey_t hash_table_insert(HashTable *const htable, const BinaryKmer bkmer); @@ -73,10 +88,10 @@ uint64_t hash_table_count_kmers(const HashTable *const htable); // Iterate over all entries #define HASH_ITERATE1(ht,func, ...) 
do { \ - const BinaryKmer *_table = (ht)->table, *htt_ptr, *htt_end = _table+(ht)->capacity;\ - for(htt_ptr = _table; htt_ptr < htt_end; htt_ptr++) { \ - if(HASH_ENTRY_ASSIGNED(*htt_ptr)) { \ - func((hkey_t)(htt_ptr - _table), ##__VA_ARGS__); \ + hkey_t _hi, _hsize = hash_table_size(ht); \ + for(_hi = 0; _hi < _hsize; _hi++) { \ + if(hash_table_assigned(ht, _hi)) { \ + func(_hi, ##__VA_ARGS__); \ } \ } \ } while(0) @@ -85,13 +100,12 @@ uint64_t hash_table_count_kmers(const HashTable *const htable); // Faster in low density hash tables // Don't use this iterator if your func adds or removes elements #define HASH_ITERATE2(ht,func, ...) do { \ - const BinaryKmer *_table = (ht)->table, *bkt_strt = _table, *htt_ptr; \ - size_t _b, _c; \ - for(_b = 0; _b < (ht)->num_of_buckets; _b++, bkt_strt += (ht)->bucket_size) {\ - for(htt_ptr = bkt_strt, _c = 0; _c < (ht)->buckets[_b][HT_BITEMS]; htt_ptr++){\ - if(HASH_ENTRY_ASSIGNED(*htt_ptr)) { \ - _c++; func((hkey_t)(htt_ptr - _table), ##__VA_ARGS__); \ - } \ + size_t _b, _bstart, _nitems, _nseen, _hi; \ + size_t _nbuck = hash_table_nbuckets(ht), _bsize = hash_table_bucket_size(ht);\ + for(_b = _bstart = 0; _b < _nbuck; _b++, _bstart += _bsize) { \ + _nitems = hash_table_bitems(ht, _b); \ + for(_nseen = 0, _hi = _bstart; _nseen < _nitems; _hi++) { \ + if(hash_table_assigned(ht, _hi)) { _nseen++; func(_hi, ##__VA_ARGS__); } \ } \ } \ } while(0) @@ -100,7 +114,7 @@ uint64_t hash_table_count_kmers(const HashTable *const htable); // Requires sizeof(hkey_t) * ht->num_kmers memory which it allocates and frees #define HASH_ITERATE_SORTED(ht,func, ...) 
do { \ hkey_t *_hkeys = hash_table_sorted(ht); \ - size_t _i, _nkmers = (ht)->num_kmers; \ + size_t _i, _nkmers = hash_table_nkmers(ht); \ for(_i = 0; _i < _nkmers; _i++) { func(_hkeys[_i], ##__VA_ARGS__); } \ ctx_free(_hkeys); \ } while(0) @@ -109,13 +123,12 @@ uint64_t hash_table_count_kmers(const HashTable *const htable); // Stops if func() returns non-zero value #define HASH_ITERATE_PART(ht,job,njobs,func, ...) do { \ ctx_assert((job) < (njobs)); \ - const BinaryKmer *_table = (ht)->table, *_start, *_end, *_bkptr; \ - const size_t _step = (ht)->capacity / (njobs); \ - _start = _table + (job) * _step; \ - _end = ((job)+1 == (njobs) ? _table + (ht)->capacity : _start+_step); \ - for(_bkptr = _start; _bkptr < _end; _bkptr++) { \ - if(HASH_ENTRY_ASSIGNED(*_bkptr)) { \ - if(func((hkey_t)(_bkptr - _table), ##__VA_ARGS__)) break; \ + const size_t _step = hash_table_size(ht) / (njobs); \ + hkey_t _hi = (job) * _step; \ + hkey_t _end = ((job)+1 == (njobs) ? hash_table_size(ht) : _hi+_step); \ + for(; _hi < _end; _hi++) { \ + if(hash_table_assigned(ht,_hi)) { \ + if(func(_hi, ##__VA_ARGS__)) break; \ } \ } \ } while(0) diff --git a/src/graph/json_hdr.c b/src/graph/json_hdr.c index b13b7859..a20e1eb6 100644 --- a/src/graph/json_hdr.c +++ b/src/graph/json_hdr.c @@ -187,7 +187,7 @@ void json_hdr_add_curr_cmd(cJSON *json, const char *path) */ void json_hdr_make_std(cJSON *json, const char *path, cJSON **hdrs, size_t nhdrs, - const dBGraph *db_graph) + const dBGraph *db_graph, size_t nkmers_in_graph) { // Add random id string #define FILE_KEY_LEN 16 @@ -200,7 +200,7 @@ void json_hdr_make_std(cJSON *json, const char *path, cJSON_AddNumberToObject(graph, "num_colours", db_graph->num_of_cols); cJSON_AddNumberToObject(graph, "kmer_size", db_graph->kmer_size); - cJSON_AddNumberToObject(graph, "num_kmers_in_graph", db_graph->ht.num_kmers); + cJSON_AddNumberToObject(graph, "num_kmers_in_graph", nkmers_in_graph); cJSON *colours = cJSON_CreateArray(); cJSON_AddItemToObject(graph, 
"colours", colours); diff --git a/src/graph/json_hdr.h b/src/graph/json_hdr.h index bdd3d24d..3ccc98b3 100644 --- a/src/graph/json_hdr.h +++ b/src/graph/json_hdr.h @@ -17,7 +17,7 @@ cJSON* json_hdr_load(gzFile gzin, const char *path); // @param path is the path of the file we are writing to void json_hdr_make_std(cJSON *json, const char *path, cJSON **hdrs, size_t nhdrs, - const dBGraph *db_graph); + const dBGraph *db_graph, size_t nkmers_in_graph); // Add current command to a header void json_hdr_add_curr_cmd(cJSON *json, const char *path); diff --git a/src/graph/prune_nodes.c b/src/graph/prune_nodes.c index 2d611c71..f48ca243 100644 --- a/src/graph/prune_nodes.c +++ b/src/graph/prune_nodes.c @@ -42,7 +42,7 @@ int prune_edges_to_nodes_lacking_flag(hkey_t hkey, const uint8_t *flags, if(bitset_get(flags, hkey)) { // Check edges - bkmer = db_node_get_bkmer(db_graph, hkey); + bkmer = db_node_get_bkey(db_graph, hkey); keep_edges = db_node_get_edges_union(db_graph, hkey); for(orient = 0; orient < 2; orient++) @@ -126,7 +126,7 @@ void prune_nodes_lacking_flag(size_t nthreads, const uint8_t *flags, static void prune_connecting_edges(dBGraph *db_graph, hkey_t hkey) { Edges uedges = db_node_get_edges_union(db_graph, hkey); - BinaryKmer bkmer = db_node_get_bkmer(db_graph, hkey); + BinaryKmer bkmer = db_node_get_bkey(db_graph, hkey); dBNode next_node; Orientation or; Nucleotide nuc, lost_nuc; diff --git a/src/graph_paths/gpath_checks.c b/src/graph_paths/gpath_checks.c index 447e83d4..edc31dda 100644 --- a/src/graph_paths/gpath_checks.c +++ b/src/graph_paths/gpath_checks.c @@ -258,7 +258,7 @@ bool gpath_checks_path_col(dBNode node, const GPath *gpath, for(klen = 0, plen = 0; plen < gpath->num_juncs; klen++) { - bkmer = db_node_get_bkmer(db_graph, node.key); + bkmer = db_node_get_bkey(db_graph, node.key); edges = db_node_get_edges(db_graph, node.key, edgecol); // Check this node is in this colour @@ -282,7 +282,7 @@ bool gpath_checks_path_col(dBNode node, const GPath *gpath, 
int outdegree = edges_get_outdegree(backedges, rnode.orient); if(outdegree <= 1) { char bkstr[MAX_KMER_SIZE+1]; - binary_kmer_to_str(db_node_get_bkmer(db_graph, node.key), db_graph->kmer_size, bkstr); + binary_kmer_to_str(db_node_get_bkey(db_graph, node.key), db_graph->kmer_size, bkstr); status("outdegree: %i col: %zu kmer: %s", (int)outdegree, ctxcol, bkstr); } ctx_assert_ret(outdegree > 1); @@ -444,8 +444,8 @@ void gpath_checks_counts(const dBGraph *db_graph) HASH_ITERATE(&db_graph->ht, _gpstore_update_counts, db_graph, &nvisited, &nkmers, &npaths); - ctx_assert2(nvisited == db_graph->ht.num_kmers, "%zu vs %zu", - nvisited, (size_t)db_graph->ht.num_kmers); + ctx_assert2(nvisited == hash_table_nkmers(&db_graph->ht), "%zu vs %llu", + nvisited, hash_table_nkmers(&db_graph->ht)); ctx_assert2(nkmers == gpstore->num_kmers_with_paths, "%zu vs %zu", nkmers, (size_t)gpstore->num_kmers_with_paths); ctx_assert2(npaths == gpstore->gpset.entries.len, "%zu vs %zu", diff --git a/src/graph_paths/gpath_reader.c b/src/graph_paths/gpath_reader.c index 089cda3a..8980a1d9 100644 --- a/src/graph_paths/gpath_reader.c +++ b/src/graph_paths/gpath_reader.c @@ -217,6 +217,18 @@ void gpath_reader_close(GPathReader *file) memset(file, 0, sizeof(GPathReader)); } +void gpath_reader_count_kmers(GPathReader *rdrs, size_t nreaders, + size_t *max_kmers_ptr, size_t *sum_kmers_ptr) +{ + size_t i, n, max_kmers = 0, sum_kmers = 0; + for(i = 0; i < nreaders; i++) { + n = gpath_reader_get_num_kmers(&rdrs[i]); + max_kmers = MAX2(n, max_kmers); + sum_kmers += n; + } + *max_kmers_ptr = max_kmers; + *sum_kmers_ptr = sum_kmers; +} void gpath_reader_check(const GPathReader *file, size_t db_kmer_size, size_t db_ncols) @@ -700,10 +712,12 @@ void gpath_reader_max_mem_req(GPathReader *files, size_t nfiles, size_t gpath_reader_sum_mem(GPathReader *files, size_t nfiles, size_t ncols, bool count_nseen, bool use_gphash, - size_t *max_file_mem_ptr) + size_t *max_file_mem_ptr, + size_t *sum_npaths_ptr, size_t 
*max_npaths_ptr) { size_t i, npaths, path_bytes, file_mem; - size_t path_sum_mem = 0, max_file_mem = 0; + size_t sum_file_mem = 0, max_file_mem = 0; + size_t max_npaths = 0, sum_npaths = 0; for(i = 0; i < nfiles; i++) { @@ -711,34 +725,44 @@ size_t gpath_reader_sum_mem(GPathReader *files, size_t nfiles, path_bytes = gpath_reader_get_path_bytes(&files[i]); file_mem = npaths * sizeof(GPath) + // GPath path_bytes + // Sequence - npaths * (ncols+7)/8 + // Colset + npaths * roundup_bits2bytes(ncols) + // Colset npaths * (count_nseen ? ncols*sizeof(uint8_t) + sizeof(uint32_t) : 0) + (npaths/IDEAL_OCCUPANCY) * (use_gphash ? sizeof(GPEntry) : 0); - path_sum_mem += file_mem; + sum_npaths += npaths; + max_npaths = MAX2(max_npaths, npaths); + sum_file_mem += file_mem; max_file_mem = MAX2(max_file_mem, file_mem); } + if(sum_npaths_ptr) *sum_npaths_ptr = sum_npaths; + if(max_npaths_ptr) *max_npaths_ptr = max_npaths; if(max_file_mem_ptr) *max_file_mem_ptr = max_file_mem; - return path_sum_mem; + return sum_file_mem; } size_t gpath_reader_mem_req(GPathReader *files, size_t nfiles, size_t ncols, size_t max_mem, - bool count_nseen) + bool count_nseen, + size_t graph_capacity, bool split_linked_lists) { - size_t max_file_mem = 0, path_sum_mem; - path_sum_mem = gpath_reader_sum_mem(files, nfiles, ncols, count_nseen, false, - &max_file_mem); + size_t max_file_mem = 0, sum_file_mem; + size_t gpstore_mem = gpath_store_mem(graph_capacity, split_linked_lists); + sum_file_mem = gpath_reader_sum_mem(files, nfiles, ncols, count_nseen, false, + &max_file_mem, NULL, NULL); + + size_t min_mem_req = gpstore_mem + max_file_mem; + size_t max_mem_req = gpstore_mem + sum_file_mem; + size_t suggested_mem = min_mem_req + (size_t)((max_mem_req-min_mem_req)*0.2); - if(max_file_mem > max_mem) { + if(min_mem_req > max_mem) { char memstr[50]; - bytes_to_str(max_file_mem, 1, memstr); + bytes_to_str(min_mem_req, 1, memstr); die("Require at least %s memory for paths", memstr); } - return MIN2(max_mem, 
path_sum_mem); + return MIN2(max_mem, suggested_mem); } void gpath_reader_alloc_gpstore(GPathReader *files, size_t nfiles, @@ -747,23 +771,13 @@ void gpath_reader_alloc_gpstore(GPathReader *files, size_t nfiles, { if(nfiles == 0) return; - size_t sum_mem = gpath_reader_sum_mem(files, nfiles, db_graph->num_of_cols, - count_nseen, false, NULL); - - size_t i, npaths, max_npaths = 0, sum_npaths = 0; - size_t path_bytes, sum_path_bytes = 0; - - for(i = 0; i < nfiles; i++) { - npaths = gpath_reader_get_num_paths(&files[i]); - max_npaths = MAX2(max_npaths, npaths); - sum_npaths += npaths; - path_bytes = gpath_reader_get_path_bytes(&files[i]) + - npaths * roundup_bits2bytes(db_graph->num_of_cols); - sum_path_bytes += path_bytes; - } + size_t sum_mem, max_mem = 0, sum_npaths = 0, max_npaths = 0; + sum_mem = gpath_reader_sum_mem(files, nfiles, db_graph->num_of_cols, + count_nseen, false, &max_mem, + &sum_npaths, &max_npaths); - npaths = sum_mem <= mem ? sum_npaths : 0; - status("[GPathReader] need %zu paths %zu bytes", npaths, sum_path_bytes); + size_t npaths = sum_mem <= mem ? sum_npaths : max_npaths; + status("[GPathReader] need %zu sum mem: %zu bytes", npaths, sum_mem); gpath_store_alloc(&db_graph->gpstore, db_graph->num_of_cols, diff --git a/src/graph_paths/gpath_reader.h b/src/graph_paths/gpath_reader.h index 8b08707b..3b8e9986 100644 --- a/src/graph_paths/gpath_reader.h +++ b/src/graph_paths/gpath_reader.h @@ -37,29 +37,6 @@ typedef struct madcrow_buffer(gpfile_buf, GPathFileBuffer, GPathReader); -/** - Parse line with format: - [FR] [njuncs] [nseen0,nseen1,...] [juncs:ACAGT] ([seq=] [juncpos=])? 
- */ -void link_line_parse(const StrBuf *line, int version, const FileFilter *fltr, - bool *fw, size_t *njuncs, - SizeBuffer *counts, StrBuf *juncs, - StrBuf *seq, SizeBuffer *juncpos); - -// Reads line <kmer> <num_links> -// Calls die() on error -// Returns true unless end of file -bool gpath_reader_read_kmer(GPathReader *file, StrBuf *kmer, size_t *num_links); - -// Reads line [FR] <num_links> -// Calls die() on error -// Returns true unless end of link entries -bool gpath_reader_read_link(GPathReader *file, - bool *fw, size_t *njuncs, - SizeBuffer *countbuf, StrBuf *juncs, - StrBuf *seq, SizeBuffer *juncpos); - - // Open file, exits on error // if successful creates a new GPathReader void gpath_reader_open(GPathReader *file, const char *path); @@ -79,10 +56,23 @@ void gpath_reader_check(const GPathReader *file, size_t kmer_size, size_t ncols) void gpath_reader_load(GPathReader *file, int kmer_flags, dBGraph *db_graph); void gpath_reader_close(GPathReader *file); +// Given an array of GPathReaders, find the max and sum of the number of kmers +void gpath_reader_count_kmers(GPathReader *rdrs, size_t nreaders, + size_t *max_kmers_ptr, size_t *sum_kmers_ptr); + // // Reading without loading into a graph // +/** + Parse line with format: + [FR] [njuncs] [nseen0,nseen1,...] [juncs:ACAGT] ([seq=] [juncpos=])? 
+ */ +void link_line_parse(const StrBuf *line, int version, const FileFilter *fltr, + bool *fw, size_t *njuncs, + SizeBuffer *counts, StrBuf *juncs, + StrBuf *seq, SizeBuffer *juncpos); + // Reads line <kmer> <num_links> // Calls die() on error // Returns true unless end of file @@ -127,11 +117,13 @@ void gpath_reader_max_mem_req(GPathReader *files, size_t nfiles, // sets @max_file_mem_ptr to the max for a single file memory size_t gpath_reader_sum_mem(GPathReader *files, size_t nfiles, size_t ncols, bool count_nseen, bool use_gphash, - size_t *max_file_mem_ptr); + size_t *max_file_mem_ptr, + size_t *sum_npaths_ptr, size_t *max_npaths_ptr); size_t gpath_reader_mem_req(GPathReader *files, size_t nfiles, size_t ncols, size_t max_mem, - bool count_nseen); + bool count_nseen, + size_t graph_capacity, bool split_linked_lists); void gpath_reader_alloc_gpstore(GPathReader *files, size_t nfiles, size_t mem, bool count_nseen, diff --git a/src/graph_paths/gpath_save.c b/src/graph_paths/gpath_save.c index c09dd1eb..7390458d 100644 --- a/src/graph_paths/gpath_save.c +++ b/src/graph_paths/gpath_save.c @@ -10,7 +10,7 @@ const char ctp_explanation_comment[] = "# This file was generated with McCortex\n" "# written by Isaac Turner <turner.isaac@gmail.com>\n" -"# url: "CORTEX_URL"\n" +"# url: "MCCORTEX_URL"\n" "# \n" "# Comment lines begin with a # and are ignored, but must come after the header\n" "# Format is:\n" @@ -81,7 +81,7 @@ cJSON* gpath_save_mkhdr(const char *path, cJSON_AddNumberToObject(json, "format_version", CTP_FORMAT_VERSION); // Add standard cortex header info, including the command being run - json_hdr_make_std(json, path, hdrs, nhdrs, db_graph); + json_hdr_make_std(json, path, hdrs, nhdrs, db_graph, hash_table_nkmers(&db_graph->ht)); // Get first command (this one), and command specific extra info if(cmdstr) { @@ -150,7 +150,7 @@ void gpath_save_sbuf(hkey_t hkey, StrBuf *sbuf, GPathSubset *subset, if(subset->list.len == 0) return; // Print "<kmer> <npaths>" - 
BinaryKmer bkmer = db_graph->ht.table[hkey]; + BinaryKmer bkmer = hash_table_fetch(&db_graph->ht, hkey); char bkstr[MAX_KMER_SIZE+1]; binary_kmer_to_str(bkmer, db_graph->kmer_size, bkstr); diff --git a/src/paths/gpath_hash.h b/src/paths/gpath_hash.h index 01dbf1f2..600a9e84 100644 --- a/src/paths/gpath_hash.h +++ b/src/paths/gpath_hash.h @@ -5,7 +5,7 @@ #include "gpath_store.h" // Packed structure is 10 bytes -// Do not use pointes to fields in this struct - they are not aligned +// Do not use pointers to fields in this struct - they are not aligned struct GPEntryStruct { // 5 bytes each diff --git a/src/paths/gpath_store.c b/src/paths/gpath_store.c index 9e0fbd41..25ad6317 100644 --- a/src/paths/gpath_store.c +++ b/src/paths/gpath_store.c @@ -3,6 +3,11 @@ #include "util.h" +size_t gpath_store_mem(size_t graph_capacity, bool split_linked_lists) +{ + return graph_capacity*sizeof(GPath*) * (split_linked_lists ? 2 : 1); +} + // If num_paths != 0, we ensure at least num_paths capacity // @split_linked_lists whether you intend to have traverse linked list and // all linked list separate @@ -12,7 +17,7 @@ void gpath_store_alloc(GPathStore *gpstore, size_t ncols, size_t graph_capacity, { memset(gpstore, 0, sizeof(*gpstore)); - size_t store_mem = graph_capacity*sizeof(GPath*) * (split_linked_lists ? 
2 : 1); + size_t store_mem = gpath_store_mem(graph_capacity, split_linked_lists); if(store_mem > mem) die("Need at least %zu bytes (only got %zu bytes)", store_mem, mem); diff --git a/src/paths/gpath_store.h b/src/paths/gpath_store.h index bf50981d..657729aa 100644 --- a/src/paths/gpath_store.h +++ b/src/paths/gpath_store.h @@ -13,6 +13,8 @@ typedef struct GPath **paths_all, **paths_traverse; } GPathStore; +size_t gpath_store_mem(size_t graph_capacity, bool split_linked_lists); + // If num_paths != 0, we ensure at least num_paths capacity // @split_linked_lists whether you intend to have traverse linked list and // all linked list separate diff --git a/src/tests/bkmer_tests.c b/src/tests/bkmer_tests.c index 41960c1f..8886fab7 100644 --- a/src/tests/bkmer_tests.c +++ b/src/tests/bkmer_tests.c @@ -19,7 +19,7 @@ void test_bkmer_str() TASSERT(!binary_kmer_oversized(bkmer0, k)); TASSERT(!binary_kmer_oversized(bkmer1, k)); - TASSERT(binary_kmers_are_equal(bkmer0,bkmer1)); + TASSERT(binary_kmer_eq(bkmer0,bkmer1)); TASSERT(strcmp(str0,str1) == 0); } } @@ -40,8 +40,8 @@ void test_bkmer_revcmp() TASSERT(!binary_kmer_oversized(bkmer1, k)); TASSERT(!binary_kmer_oversized(bkmer2, k)); // kmer-size is odd, forward != reverse complement - TASSERT(!binary_kmers_are_equal(bkmer0, bkmer1)); - TASSERT(binary_kmers_are_equal(bkmer0, bkmer2)); + TASSERT(!binary_kmer_eq(bkmer0, bkmer1)); + TASSERT(binary_kmer_eq(bkmer0, bkmer2)); } } @@ -63,7 +63,7 @@ void test_bkmer_shifts() bkmer1 = binary_kmer_left_shift_one_base(bkmer1, k); bkmer1 = binary_kmer_right_shift_one_base(bkmer1); binary_kmer_set_first_nuc(&bkmer2, 0, k); - TASSERT2(binary_kmers_are_equal(bkmer1,bkmer2), "k:%zu", k); + TASSERT2(binary_kmer_eq(bkmer1,bkmer2), "k:%zu", k); TASSERT(!binary_kmer_oversized(bkmer1, k)); TASSERT(!binary_kmer_oversized(bkmer2, k)); @@ -71,7 +71,7 @@ void test_bkmer_shifts() bkmer1 = binary_kmer_right_shift_one_base(bkmer1); bkmer1 = binary_kmer_left_shift_one_base(bkmer1, k); 
binary_kmer_set_last_nuc(&bkmer2, 0); - TASSERT(binary_kmers_are_equal(bkmer1,bkmer2)); + TASSERT(binary_kmer_eq(bkmer1,bkmer2)); TASSERT(!binary_kmer_oversized(bkmer1, k)); TASSERT(!binary_kmer_oversized(bkmer2, k)); } @@ -87,8 +87,8 @@ void test_bkmer_shifts() bkmer1 = binary_kmer_left_shift_one_base(bkmer1, k); bkmer2 = binary_kmer_right_shift_one_base(bkmer2); } - TASSERT(binary_kmers_are_equal(bkmer1,zero_bkmer)); - TASSERT(binary_kmers_are_equal(bkmer2,zero_bkmer)); + TASSERT(binary_kmer_eq(bkmer1,zero_bkmer)); + TASSERT(binary_kmer_eq(bkmer2,zero_bkmer)); } // Copy from one bkmer to another by shifting a base at a time @@ -107,8 +107,8 @@ void test_bkmer_shifts() TASSERT(!binary_kmer_oversized(bkmer1, k)); TASSERT(!binary_kmer_oversized(bkmer2, k)); } - TASSERT(binary_kmers_are_equal(bkmer1,zero_bkmer)); - TASSERT(binary_kmers_are_equal(bkmer2,bkmer0)); + TASSERT(binary_kmer_eq(bkmer1,zero_bkmer)); + TASSERT(binary_kmer_eq(bkmer2,bkmer0)); // copy from bkmer1 -> bkmer2, shifting left bkmer1 = bkmer0; @@ -120,8 +120,8 @@ void test_bkmer_shifts() TASSERT(!binary_kmer_oversized(bkmer1, k)); TASSERT(!binary_kmer_oversized(bkmer2, k)); } - TASSERT(binary_kmers_are_equal(bkmer1,zero_bkmer)); - TASSERT(binary_kmers_are_equal(bkmer2,bkmer0)); + TASSERT(binary_kmer_eq(bkmer1,zero_bkmer)); + TASSERT(binary_kmer_eq(bkmer2,bkmer0)); } } diff --git a/src/tests/cleaning_tests.c b/src/tests/cleaning_tests.c index 7c78f8bd..533ed38c 100644 --- a/src/tests/cleaning_tests.c +++ b/src/tests/cleaning_tests.c @@ -58,29 +58,29 @@ void _test_graph_cleaning() "AAAATTCACGATAGTGGCGCTCGGGAGGAGTACGCAACTCAGCACCCCGGTGAGTAGCTCCCTT"; build_graph_from_str_mt(&graph, 0, graphseq, strlen(graphseq), false); - TASSERT2(graph.ht.num_kmers == 1000-19+1, - "%"PRIu64" kmers", graph.ht.num_kmers); + TASSERT2(hash_table_nkmers(&graph.ht) == 1000-19+1, + "%llu kmers", hash_table_nkmers(&graph.ht)); // No change (min_tip_len must be > 1) clean_graph(nthreads, 0, 2, NULL, NULL, visited, keep, &graph); - 
TASSERT(graph.ht.num_kmers == 1000-19+1); - TASSERT(graph.ht.num_kmers == hash_table_count_kmers(&graph.ht)); + TASSERT(hash_table_nkmers(&graph.ht) == 1000-19+1); + TASSERT(hash_table_nkmers(&graph.ht) == hash_table_count_kmers(&graph.ht)); // No change (min_tip_len must be > 1) clean_graph(nthreads, 0, 1000-19+1, NULL, NULL, visited, keep, &graph); - TASSERT(graph.ht.num_kmers == 1000-19+1); - TASSERT(graph.ht.num_kmers == hash_table_count_kmers(&graph.ht)); + TASSERT(hash_table_nkmers(&graph.ht) == 1000-19+1); + TASSERT(hash_table_nkmers(&graph.ht) == hash_table_count_kmers(&graph.ht)); // All removed clean_graph(nthreads, 0, 1000-19+2, NULL, NULL, visited, keep, &graph); - TASSERT2(graph.ht.num_kmers == 0, "%"PRIu64" kmers", graph.ht.num_kmers); - TASSERT(graph.ht.num_kmers == hash_table_count_kmers(&graph.ht)); + TASSERT2(hash_table_nkmers(&graph.ht) == 0, "%llu kmers", hash_table_nkmers(&graph.ht)); + TASSERT(hash_table_nkmers(&graph.ht) == hash_table_count_kmers(&graph.ht)); // Reload first 200 bases of graph 3 times for(i = 0; i < 3; i++) build_graph_from_str_mt(&graph, 0, graphseq, 200, false); - TASSERT2(graph.ht.num_kmers == 200-19+1, - "%"PRIu64" kmers", graph.ht.num_kmers); + TASSERT2(hash_table_nkmers(&graph.ht) == 200-19+1, + "%llu kmers", hash_table_nkmers(&graph.ht)); // First 100 bp with two SNPs char tmp[] = @@ -93,8 +93,8 @@ void _test_graph_cleaning() clean_graph(nthreads, thresh, 0, NULL, NULL, visited, keep, &graph); TASSERT2(thresh > 1, "threshold: %zu", thresh); - TASSERT2(graph.ht.num_kmers == 200-19+1, "%"PRIu64" kmers", graph.ht.num_kmers); - TASSERT(graph.ht.num_kmers == hash_table_count_kmers(&graph.ht)); + TASSERT2(hash_table_nkmers(&graph.ht) == 200-19+1, "%llu kmers", hash_table_nkmers(&graph.ht)); + TASSERT(hash_table_nkmers(&graph.ht) == hash_table_count_kmers(&graph.ht)); // First 78 bp with a single SNP creating a tip 23bp -> 5kmers long char tmp2[] = @@ -102,12 +102,12 @@ void _test_graph_cleaning() // Trim off new tip 
build_graph_from_str_mt(&graph, 0, tmp2, strlen(tmp2), false); - TASSERT2(graph.ht.num_kmers == 200-19+1 + 23-19+1, - "%"PRIu64" kmers", graph.ht.num_kmers); - TASSERT(graph.ht.num_kmers == hash_table_count_kmers(&graph.ht)); + TASSERT2(hash_table_nkmers(&graph.ht) == 200-19+1 + 23-19+1, + "%llu kmers", hash_table_nkmers(&graph.ht)); + TASSERT(hash_table_nkmers(&graph.ht) == hash_table_count_kmers(&graph.ht)); clean_graph(nthreads, 0, 2*19-1, NULL, NULL, visited, keep, &graph); - TASSERT2(graph.ht.num_kmers == 200-19+1, "%"PRIu64" kmers", graph.ht.num_kmers); - TASSERT(graph.ht.num_kmers == hash_table_count_kmers(&graph.ht)); + TASSERT2(hash_table_nkmers(&graph.ht) == 200-19+1, "%llu kmers", hash_table_nkmers(&graph.ht)); + TASSERT(hash_table_nkmers(&graph.ht) == hash_table_count_kmers(&graph.ht)); // clear hash table + graph hash_table_empty(&graph.ht); @@ -117,11 +117,11 @@ void _test_graph_cleaning() // Build a graph with a single kmer and delete it char tmp3[] = "AGATGTGGTTCACGGCTAG"; build_graph_from_str_mt(&graph, 0, tmp3, strlen(tmp3), false); - TASSERT2(graph.ht.num_kmers == 1, "%zu", (size_t)graph.ht.num_kmers); - TASSERT(graph.ht.num_kmers == hash_table_count_kmers(&graph.ht)); + TASSERT2(hash_table_nkmers(&graph.ht) == 1, "%llu", hash_table_nkmers(&graph.ht)); + TASSERT(hash_table_nkmers(&graph.ht) == hash_table_count_kmers(&graph.ht)); clean_graph(nthreads, 0, 2*19-1, NULL, NULL, visited, keep, &graph); - TASSERT(graph.ht.num_kmers == 0, "%"PRIu64" kmers", graph.ht.num_kmers); - TASSERT(graph.ht.num_kmers == hash_table_count_kmers(&graph.ht)); + TASSERT(hash_table_nkmers(&graph.ht) == 0, "%llu kmers", hash_table_nkmers(&graph.ht)); + TASSERT(hash_table_nkmers(&graph.ht) == hash_table_count_kmers(&graph.ht)); ctx_free(visited); ctx_free(keep); diff --git a/src/tests/corrected_aln_tests.c b/src/tests/corrected_aln_tests.c index a3c73d14..292bf808 100644 --- a/src/tests/corrected_aln_tests.c +++ b/src/tests/corrected_aln_tests.c @@ -159,8 +159,8 @@ static 
void test_contig_ends_agree() // Check number of kmers in the graph size_t expnkmers = strlen(seqa)+1-kmer_size + 2+11+11; - TASSERT2(graph.ht.num_kmers == expnkmers, - "%zu vs %zu", (size_t)graph.ht.num_kmers, expnkmers); + TASSERT2(hash_table_nkmers(&graph.ht) == expnkmers, + "%llu vs %zu", hash_table_nkmers(&graph.ht), expnkmers); // Check number of paths TASSERT(graph.gpstore.num_paths == 8); diff --git a/src/tests/graph_crawler_tests.c b/src/tests/graph_crawler_tests.c index a0f6c0cb..5868b8b8 100644 --- a/src/tests/graph_crawler_tests.c +++ b/src/tests/graph_crawler_tests.c @@ -36,7 +36,7 @@ void test_graph_crawler() TASSERT(node.key != HASH_NOT_FOUND); TASSERT(next_node.key != HASH_NOT_FOUND); - BinaryKmer bkey = db_node_get_bkmer(&graph, node.key); + BinaryKmer bkey = db_node_get_bkey(&graph, node.key); Edges edges = db_node_get_edges(&graph, node.key, 0); dBNode next_nodes[4]; diff --git a/src/tests/hash_table_tests.c b/src/tests/hash_table_tests.c index 805124f1..3d176777 100644 --- a/src/tests/hash_table_tests.c +++ b/src/tests/hash_table_tests.c @@ -8,7 +8,7 @@ static void xor_bkmers(hkey_t key, HashTable *ht, BinaryKmer *ptr, size_t *c) { size_t i; - BinaryKmer bkmer = ht->table[key]; + BinaryKmer bkmer = hash_table_fetch(ht, key); for(i = 0; i < NUM_BKMER_WORDS; i++) ptr->b[i] ^= bkmer.b[i]; (*c)++; } @@ -57,14 +57,14 @@ static void test_add_remove() for(i = 0; i < NUM_BKMER_WORDS; i++) bkxor.b[i] ^= bkey0.b[i]; } - TASSERT(kmers_added - kmers_deleted == ht.num_kmers); + TASSERT(kmers_added - kmers_deleted == hash_table_nkmers(&ht)); // Check xor of bkmers size_t kcount = 0; HASH_ITERATE(&ht, xor_bkmers, &ht, &bkresult, &kcount); - TASSERT(kcount == ht.num_kmers); - TASSERT(binary_kmers_are_equal(bkxor, bkresult)); + TASSERT(kcount == hash_table_nkmers(&ht)); + TASSERT(binary_kmer_eq(bkxor, bkresult)); hash_table_dealloc(&ht); } @@ -120,7 +120,7 @@ static void test_hash_table_mt() for(i = 0; i < bset.n; i++) TASSERT2(bset.nadded[i] == 1, "%zu", 
bset.nadded[i]); - TASSERT(bset.ht.num_kmers == nkmers); + TASSERT(hash_table_nkmers(&bset.ht) == nkmers); ctx_free(bset.bktlocks); ctx_free(bset.nadded); diff --git a/src/tests/infer_edges_tests.c b/src/tests/infer_edges_tests.c index b3526dea..38377670 100644 --- a/src/tests/infer_edges_tests.c +++ b/src/tests/infer_edges_tests.c @@ -105,7 +105,7 @@ static void simple_test() build_graph_from_str_mt(&graph, 4, seq0+2, 11, false); build_graph_from_str_mt(&graph, 4, seq1, 12, false); - TASSERT(graph.ht.num_kmers == 5); + TASSERT(hash_table_nkmers(&graph.ht) == 5); // First edge TASSERT(get_edges("TAACAATGACTC", &graph, 0) == ALLEDGES); diff --git a/src/tests/node_tests.c b/src/tests/node_tests.c index 25fa4c50..b0c659a5 100644 --- a/src/tests/node_tests.c +++ b/src/tests/node_tests.c @@ -9,7 +9,7 @@ static void edge_check(hkey_t hkey, const dBGraph *db_graph, size_t col) { - const BinaryKmer bkmer = db_node_get_bkmer(db_graph, hkey); + const BinaryKmer bkmer = db_node_get_bkey(db_graph, hkey); const Edges edges = db_node_get_edges(db_graph, hkey, col); dBNode nodes[4]; diff --git a/src/tests/repeat_walker_tests.c b/src/tests/repeat_walker_tests.c index c4590dd2..6786745c 100644 --- a/src/tests/repeat_walker_tests.c +++ b/src/tests/repeat_walker_tests.c @@ -77,7 +77,8 @@ static void test_repeat_loop() // Construct graph but no paths build_graph_from_str_mt(&graph, 0, seq, strlen(seq), false); - TASSERT2(graph.ht.num_kmers == 15+12+15, "%zu", (size_t)graph.ht.num_kmers); + TASSERT2(hash_table_nkmers(&graph.ht) == 15+12+15, + "%llu kmers", hash_table_nkmers(&graph.ht)); // Find first node in sequence dBNode node0 = db_graph_find_str(&graph, seq); diff --git a/src/tests/subgraph_tests.c b/src/tests/subgraph_tests.c index 7d21cde5..4e3f8290 100644 --- a/src/tests/subgraph_tests.c +++ b/src/tests/subgraph_tests.c @@ -13,12 +13,12 @@ static void run_subgraph(dBGraph *graph, uint8_t *mask, size_t nthreads = 2; subgraph_from_seq(graph, nthreads, dist, invert, grab_unitigs, - 
8*graph->ht.num_kmers, mask, + 8*hash_table_nkmers(&graph->ht), mask, &seq, &len, 1); - TASSERT2(graph->ht.num_kmers == expt_nkmers, - "expected %zu kmers, got %zu; dist %zu invert: %s", - expt_nkmers, (size_t)graph->ht.num_kmers, + TASSERT2(hash_table_nkmers(&graph->ht) == expt_nkmers, + "expected %zu kmers, got %llu; dist %zu invert: %s", + expt_nkmers, hash_table_nkmers(&graph->ht), dist, invert ? "yes" : "no"); } @@ -50,7 +50,8 @@ static void simple_subgraph_test() "AAAATTCACGATAGTGGCGCTCGGGAGGAGTACGCAACTCAGCACCCCGGTGAGTAGCTCCCTT"; _tests_add_to_graph(&graph, graphseq, 0); - TASSERT2(graph.ht.num_kmers == 1000-19+1, "%"PRIu64" kmers", graph.ht.num_kmers); + TASSERT2(hash_table_nkmers(&graph.ht) == 1000-19+1, + "%llu kmers", hash_table_nkmers(&graph.ht)); // Pull out 10, 9, ... 0 bases around 2 kmers: GAGGTGGGTCCGCCTTGCGGt size_t dist; diff --git a/src/tools/assemble_contigs.c b/src/tools/assemble_contigs.c index 68be859f..3e08d4ce 100644 --- a/src/tools/assemble_contigs.c +++ b/src/tools/assemble_contigs.c @@ -166,7 +166,7 @@ static int _dump_contig(Assembler *assem, hkey_t hkey, // Print contig char kmer_str[MAX_KMER_SIZE+1]; const char *left_stat, *rght_stat; - BinaryKmer seed_bkmer = db_node_get_bkmer(db_graph, hkey); + BinaryKmer seed_bkmer = db_node_get_bkey(db_graph, hkey); binary_kmer_to_str(seed_bkmer, db_graph->kmer_size, kmer_str); dna_revcomp_str(kmer_str, kmer_str, db_graph->kmer_size); diff --git a/src/tools/breakpoint_caller.c b/src/tools/breakpoint_caller.c index 1b947376..258f515f 100644 --- a/src/tools/breakpoint_caller.c +++ b/src/tools/breakpoint_caller.c @@ -152,15 +152,13 @@ static void process_contig(BreakpointCaller *caller, // status(" got a call"); - // Find first place we meet the ref - size_t callid = __sync_fetch_and_add((volatile size_t*)caller->callid, 1); - // Swallow up some of the path into the 3p flank size_t i, flank3pidx = flank3p_runs[0].qoffset; size_t extra3pbases = MIN2(kmer_size-1, flank3pidx); size_t num_path_kmers = 
flank3pidx - extra3pbases; size_t kmer3poffset = kmer_size-1-extra3pbases; + size_t callid = __sync_fetch_and_add((volatile size_t*)caller->callid, 1); pthread_mutex_lock(caller->out_lock); // This can be set to anything without a '.' in it @@ -348,7 +346,7 @@ static void traverse_5pflank(BreakpointCaller *caller, GraphCrawler *crawler, dBNode next_nodes[4]; Nucleotide next_nucs[4]; size_t i, num_next; - BinaryKmer bkmer0 = db_node_get_bkmer(db_graph, node0.key); + BinaryKmer bkmer0 = db_node_get_bkey(db_graph, node0.key); num_next = db_graph_next_nodes(db_graph, bkmer0, node0.orient, db_node_edges(db_graph, node0.key, 0), @@ -383,7 +381,7 @@ static void follow_break(BreakpointCaller *caller, dBNode node) size_t nonref_idx[4], num_nonref_next = 0; const dBGraph *db_graph = caller->db_graph; - BinaryKmer bkey = db_node_get_bkmer(db_graph, node.key); + BinaryKmer bkey = db_node_get_bkey(db_graph, node.key); Edges edges = db_node_get_edges(db_graph, node.key, 0); num_next = db_graph_next_nodes(db_graph, bkey, node.orient, edges, @@ -505,7 +503,7 @@ static inline int breakpoint_caller_node(hkey_t hkey, BreakpointCaller *caller) // DEBUG // const dBGraph *db_graph = caller->db_graph; // char kstr[MAX_KMER_SIZE+1]; - // BinaryKmer bkmer = db_node_get_bkmer(db_graph, hkey); + // BinaryKmer bkmer = db_node_get_bkey(db_graph, hkey); // binary_kmer_to_str(bkmer, db_graph->kmer_size, kstr); // if(strcmp(kstr,"GTTGCTCATGA")) return 0; // skip all but given kmer // status("brk %s\n", kstr); @@ -553,7 +551,8 @@ static void breakpoints_print_header(gzFile gzout, const char *out_path, cJSON_AddNumberToObject(json, "format_version", BREAKPOINT_FORMAT_VERSION); // Add standard cortex headers - json_hdr_make_std(json, out_path, hdrs, nhdrs, db_graph); + json_hdr_make_std(json, out_path, hdrs, nhdrs, db_graph, + hash_table_nkmers(&db_graph->ht)); // Update reference colour // json.graph.colours[ref_col] @@ -599,7 +598,7 @@ static void breakpoints_print_header(gzFile gzout, const char 
*out_path, gzputs(gzout, "\n"); gzputs(gzout, "# This file was generated with McCortex\n"); gzputs(gzout, "# written by Isaac Turner <turner.isaac@gmail.com>\n"); - gzputs(gzout, "# url: "CORTEX_URL"\n"); + gzputs(gzout, "# url: "MCCORTEX_URL"\n"); gzputs(gzout, "# \n"); gzputs(gzout, "# Comment lines begin with a # and are ignored, but must come after the header\n"); gzputs(gzout, "# Format is:\n"); diff --git a/src/tools/bubble_caller.c b/src/tools/bubble_caller.c index cb043af4..6382fdf8 100644 --- a/src/tools/bubble_caller.c +++ b/src/tools/bubble_caller.c @@ -107,7 +107,8 @@ static void bubble_caller_print_header(gzFile gzout, const char* out_path, cJSON_AddNumberToObject(json, "format_version", BUBBLE_FORMAT_VERSION); // Add standard cortex headers - json_hdr_make_std(json, out_path, hdrs, nhdrs, db_graph); + json_hdr_make_std(json, out_path, hdrs, nhdrs, db_graph, + hash_table_nkmers(&db_graph->ht)); // Add parameters used in bubble calling to the header json_hdr_augment_cmd(json, "bubbles", "max_flank_kmers", cJSON_CreateInt(prefs->max_flank_len)); @@ -124,7 +125,7 @@ static void bubble_caller_print_header(gzFile gzout, const char* out_path, gzputs(gzout, "\n"); gzputs(gzout, "# This file was generated with McCortex\n"); gzputs(gzout, "# written by Isaac Turner <turner.isaac@gmail.com>\n"); - gzputs(gzout, "# url: "CORTEX_URL"\n"); + gzputs(gzout, "# url: "MCCORTEX_URL"\n"); gzputs(gzout, "# \n"); gzputs(gzout, "# Comment lines begin with a # and are ignored, but must come after the header\n"); gzputs(gzout, "\n"); @@ -265,7 +266,7 @@ void find_bubbles(BubbleCaller *caller, dBNode fork_node) dBNode nodes[4]; Nucleotide bases[4]; size_t i, num_next, num_edges_in_col; - BinaryKmer fork_bkmer = db_node_get_bkmer(db_graph, fork_node.key); + BinaryKmer fork_bkmer = db_node_get_bkey(db_graph, fork_node.key); num_next = db_graph_next_nodes(db_graph, fork_bkmer, fork_node.orient, db_node_edges(db_graph, fork_node.key, 0), diff --git a/src/tools/clean_graph.c 
b/src/tools/clean_graph.c index 342b56c0..22eecfc7 100644 --- a/src/tools/clean_graph.c +++ b/src/tools/clean_graph.c @@ -575,9 +575,9 @@ void clean_graph(size_t num_threads, { ctx_assert(db_graph->num_edge_cols > 0); - size_t i, init_nkmers = db_graph->ht.num_kmers; + size_t i, init_nkmers = hash_table_nkmers(&db_graph->ht); - if(db_graph->ht.num_kmers == 0) return; + if(init_nkmers == 0) return; if(covg_threshold == 0 && min_keep_tip == 0) { warn("[cleaning] No cleaning specified"); return; @@ -633,7 +633,7 @@ void clean_graph(size_t num_threads, // Print status update char remain_nkmers_str[100], removed_nkmers_str[100]; - size_t remain_nkmers = db_graph->ht.num_kmers; + size_t remain_nkmers = hash_table_nkmers(&db_graph->ht); size_t removed_nkmers = init_nkmers - remain_nkmers; ulong_to_str(remain_nkmers, remain_nkmers_str); ulong_to_str(removed_nkmers, removed_nkmers_str); diff --git a/src/tools/correct_reads.c b/src/tools/correct_reads.c index a18e31ce..88fa4834 100644 --- a/src/tools/correct_reads.c +++ b/src/tools/correct_reads.c @@ -139,7 +139,7 @@ static void handle_read2(CorrectReadsWorker *wrkr, // Copy first base from each kmer for(j = 0; j < num_neg; j++) { // printf("%zu: %zu\n", j, node_arr[j].key); - ctx_assert(HASH_ENTRY_ASSIGNED(db_graph->ht.table[node_arr[j].key])); + ctx_assert(hash_table_assigned(&db_graph->ht, node_arr[j].key)); nuc = db_node_get_first_nuc(node_arr[j], db_graph); rbuf->b[rbuf->end++] = dna_nuc_to_char(nuc); qbuf->b[qbuf->end++] = fq_zero; @@ -370,7 +370,7 @@ void correct_reads(CorrectAlnInput *inputs, size_t num_inputs, correct_aln_dump_stats(aln_stats, load_stats, dump_seqgap_hist_path, dump_fraglen_hist_path, - db_graph->ht.num_kmers); + hash_table_nkmers(&db_graph->ht)); for(i = 0; i < num_threads; i++) correct_reads_worker_dealloc(&wrkrs[i]); diff --git a/src/tools/generate_paths.c b/src/tools/generate_paths.c index 406df9e0..503024f8 100644 --- a/src/tools/generate_paths.c +++ b/src/tools/generate_paths.c @@ -251,7 
+251,7 @@ static inline size_t _juncs_to_paths(const size_t *restrict pos_pl, // #ifdef CTXVERBOSE // char kmerstr[MAX_KMER_SIZE+1]; - // BinaryKmer tmpkmer = db_node_get_bkmer(db_graph, node.key); + // BinaryKmer tmpkmer = db_node_get_bkey(db_graph, node.key); // binary_kmer_to_str(tmpkmer, db_graph->kmer_size, kmerstr); // fputs(kmerstr, stdout); // binary_seq_print(newgpath.seq, newgpath.num_juncs, stdout); diff --git a/src/tools/genotyping.h b/src/tools/genotyping.h index 2828925d..5a8223cd 100644 --- a/src/tools/genotyping.h +++ b/src/tools/genotyping.h @@ -10,7 +10,7 @@ // #define DEBUG_VCFCOV static inline int bk2bits_hash(BinaryKmer bkey) { return binary_kmer_hash(bkey, 0); } -static inline int bk2bits_eq(BinaryKmer k1, BinaryKmer k2) { return binary_kmers_are_equal(k1,k2); } +static inline int bk2bits_eq(BinaryKmer k1, BinaryKmer k2) { return binary_kmer_eq(k1,k2); } KHASH_INIT(BkToBits, BinaryKmer, uint64_t, 1, bk2bits_hash, bk2bits_eq); typedef struct { diff --git a/src/tools/infer_edges.c b/src/tools/infer_edges.c index 7ef03e53..c9b9d731 100644 --- a/src/tools/infer_edges.c +++ b/src/tools/infer_edges.c @@ -89,7 +89,7 @@ static inline int infer_edges_node(hkey_t hkey, const dBGraph *db_graph, size_t *num_nodes_modified) { - BinaryKmer bkmer = db_node_get_bkmer(db_graph, hkey); + BinaryKmer bkmer = db_node_get_bkey(db_graph, hkey); Edges *edges = &db_node_edges(db_graph, hkey, 0); size_t col; diff --git a/src/tools/subgraph.c b/src/tools/subgraph.c index 47a43ac4..a31a0b6d 100644 --- a/src/tools/subgraph.c +++ b/src/tools/subgraph.c @@ -120,7 +120,7 @@ static void store_node_neighbours(const hkey_t hkey, dBNodeBuffer *nbuf, uint8_t *kmer_mask, const dBGraph *db_graph) { // Get neighbours - BinaryKmer bkmer = db_node_get_bkmer(db_graph, hkey); + BinaryKmer bkmer = db_node_get_bkey(db_graph, hkey); Edges edges = db_node_get_edges_union(db_graph, hkey); size_t num_next, i; dBNode next_nodes[8]; diff --git a/tests/breakpoint/Makefile 
b/tests/breakpoint/Makefile index e7f3a666..538ea797 100644 --- a/tests/breakpoint/Makefile +++ b/tests/breakpoint/Makefile @@ -1,119 +1,20 @@ -SHELL:=/bin/bash -euo pipefail +SHELL=/bin/bash -euo pipefail -# -# chr1: CCCGTAGGTAAGGGCGTTAGTGCAAGGCCACATTGGGACACGAGTTGATA -# chr2: gCCGTAGGTAAGGGCGTTAGTGC -# chr3: GAACACCCTTTGGTTTAAGCCGGGTTGGAGTTGGCCAAAGAAGTTCAACG -# chr4: ATTCTACAGCAGGTCATGAGCAACCGGCACTCGAGCAGACGTACGGGAAA -# -# >mix -# gCCGTAGGTAAGGGCGTTAGaCGGGTTGGAGTTGGCCAAAGAAGTTCAAgg -# .1111111111111111111.3333333333333333333333333333.. -# -# >repeat of 5bp -# CCCGTAGGTAAGGGCGTTAGTGCAGTGCAAGGCCACATTGGGACACGAGTTGAgg -# 111111111111111111111111xxxxx111111111111111111111111.. -# -# >deletion of 1bp -# cAACACCCTTTGGTTTAAGCCG-GTTGGAGTTGGCCAAAGAAGTTCAAgg -# .333333333333333333333 3333333333333333333333333.. -# -# >SNP -# ATTCTACAGCAGGTCATGAGCAACtGGCACTCGAGCAGACGTACGGGAAA -# 444444444444444444444444 4444444444444444444444444 -# +# Each test case is in a separate sub-directory -K=11 -CTXDIR=../.. 
-MCCORTEX=$(CTXDIR)/bin/mccortex31 -CTX2DOT=$(CTXDIR)/scripts/cortex_to_graphviz.pl -BRKCHCK=$(CTXDIR)/scripts/cortex_brkpnts_check_ref.pl -VCFSORT=$(CTXDIR)/libs/biogrok/vcf-sort -VCFRENAME=$(CTXDIR)/libs/biogrok/vcf-rename +# breakpoint0: empty breakpoint calls (K=31) +# breakpoint1: test we get the correct VCF (K=11) +# breakpoint2: check we don't call any ref bubbles -BGZIP=$(CTXDIR)/libs/htslib/bgzip -BCFTOOLS=$(CTXDIR)/libs/bcftools/bcftools - -SEQS=sample.fa ref.fa -GRAPHS=$(SEQS:.fa=.k$(K).ctx) -TGTS=breakpoints.txt.gz breakpoints.norm.vcf.gz $(GRAPHS) -# join.k$(K).ctx - -all: $(TGTS) cmp_breakpoint cmp_vcf - -ref.fa: - ( echo '>chr1'; \ - echo CCCGTAGGTAAGGGCGTTAGTGCAAGGCCACATTGGGACACGAGTTGATA; \ - echo '>chr2'; \ - echo gCCGTAGGTAAGGGCGTTAGTGC; \ - echo '>chr3'; \ - echo GAACACCCTTTGGTTTAAGCCGGGTTGGAGTTGGCCAAAGAAGTTCAACG; \ - echo '>chr4'; \ - echo ATTCTACAGCAGGTCATGAGCAACCGGCACTCGAGCAGACGTACGGGAAA; ) > $@ - -sample.fa: - ( echo '>contig0'; \ - echo gCCGTAGGTAAGGGCGTTAGaCGGGTTGGAGTTGGCCAAAGAAGTTCAAgg; \ - echo '>contig1'; \ - echo CCCGTAGGTAAGGGCGTTAGTGCAGTGCAAGGCCACATTGGGACACGAGTTGAgg; \ - echo '>contig2'; \ - echo cAACACCCTTTGGTTTAAGCCGGTTGGAGTTGGCCAAAGAAGTTCAAgg; \ - echo '>contig3'; \ - echo ATTCTACAGCAGGTCATGAGCAACtGGCACTCGAGCAGACGTACGGGAAA; ) > $@ - -truth.vcf.gz: - ( printf "##fileformat=VCF4.1\n##fileDate="`date '+%Y%m%d'`"\n"; \ - printf "##reference=ref.fa\n"; \ - printf "##contig=<id=chr1,length=50>\n"; \ - printf "##contig=<id=chr2,length=23>\n"; \ - printf "##contig=<id=chr3,length=50>\n"; \ - printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample\n"; \ - echo "chr1 18 . T TAGTGC . . . GT 1"; \ - echo "chr3 21 . CG C . . . GT 1"; \ - echo "chr4 25 . C T . . . 
GT 1"; ) > truth.vcf - $(BGZIP) truth.vcf - $(BCFTOOLS) index truth.vcf.gz - -%.k$(K).ctx: %.fa - $(MCCORTEX) build -m 10M -k $(K) --sample $* --seq $< $@ >& $@.log - -breakpoints.txt.gz: sample.k$(K).ctx ref.fa - $(MCCORTEX) breakpoints -t 1 -m 10M --minref 5 \ - --seq ref.fa --out $@ sample.k$(K).ctx >& $@.log - -breakpoints.raw.vcf: breakpoints.txt.gz $(SEQS) - $(MCCORTEX) calls2vcf -o $@ breakpoints.txt.gz ref.fa >& $@.log - -breakpoints.sort.vcf: breakpoints.raw.vcf - $(VCFSORT) $< > $@ - -breakpoints.norm.vcf.gz: breakpoints.sort.vcf - $(BCFTOOLS) norm --site-win 5000 --fasta-ref ref.fa $< | \ - $(BCFTOOLS) norm --rm-dup any --do-not-normalize > breakpoints.norm.vcf - $(BGZIP) breakpoints.norm.vcf - $(BCFTOOLS) index breakpoints.norm.vcf.gz - -cmp_breakpoint: breakpoints.txt.gz ref.fa - $(BRKCHCK) <(gzip -fcd breakpoints.txt.gz) ref.fa - -# compare truth.vcf and breakpoints.norm.vcf.gz -# Check no entries private to either truth.vcf.gz or breakpoints.norm.vcf.gz -cmp_vcf: breakpoints.norm.vcf.gz truth.vcf.gz - @$(BCFTOOLS) stats breakpoints.norm.vcf.gz truth.vcf.gz | \ - grep '^SN\s*[01]\s' | grep -v 'number of samples' | \ - awk 'BEGIN{FS="\t"}{ if($$4 != 0){ print "Missing VCF entries!"; exit -1; } }' - @echo 'VCF files match!' - -join.k$(K).ctx: $(GRAPHS) - $(MCCORTEX) join -o $@ $(GRAPHS) - -join.k$(K).pdf: join.k$(K).ctx - $(CTX2DOT) --simplify $< | dot -Tpdf > $@ - -plots: join.k$(K).pdf +all: + cd breakpoint0 && $(MAKE) + cd breakpoint1 && $(MAKE) + cd breakpoint2 && $(MAKE) + @echo "All looks good." 
clean: - rm -rf $(TGTS) $(SEQS) - rm -rf ref.* breakpoints.* truth.* join.* *.log + cd breakpoint0 && $(MAKE) clean + cd breakpoint1 && $(MAKE) clean + cd breakpoint2 && $(MAKE) clean -.PHONY: all clean plots cmp_breakpoint cmp_vcf +.PHONY: all clean diff --git a/tests/breakpoint_empty/Makefile b/tests/breakpoint/breakpoint0/Makefile similarity index 84% rename from tests/breakpoint_empty/Makefile rename to tests/breakpoint/breakpoint0/Makefile index 11d11f06..4749046e 100644 --- a/tests/breakpoint_empty/Makefile +++ b/tests/breakpoint/breakpoint0/Makefile @@ -5,7 +5,7 @@ SHELL:=/bin/bash -euo pipefail -CTXDIR=../.. +CTXDIR=../../.. CTXPIPELINE=$(CTXDIR)/scripts/make-pipeline.pl DNACAT=$(CTXDIR)/libs/seq_file/bin/dnacat READSIM=$(CTXDIR)/libs/readsim/readsim @@ -24,7 +24,7 @@ ref.fa: reads/reads.fa.gz: ref.fa mkdir -p reads - $(READSIM) -r ref.fa -s -d $(SEQDEPTH) reads/reads + $(READSIM) -r ref.fa -l $(READLEN) -s -d $(SEQDEPTH) reads/reads task.k$(K).mk: echo "RefReads reads/reads.fa.gz" | $(CTXPIPELINE) -r ref.fa $(K) proj - > $@ @@ -32,7 +32,7 @@ task.k$(K).mk: run: task.k$(K).mk reads/reads.fa.gz ref.fa $(MAKE) -f $< CTXDIR=$(CTXDIR) breakpoints-vcf @# Check no VCF entries - (( `$(VCFCOUNT) proj/vcfs/breakpoints.joint.k$(K).vcf.gz` == 0 )) || false + (( `$(VCFCOUNT) proj/vcfs/breakpoints.joint.links.k$(K).vcf.gz` == 0 )) || false @# Check no breakpoint call entries (( `grep -c '>brkpnt' proj/k$(K)/breakpoints_links/joint.brk.gz` == 0 )) || false @echo 'Success: no breakpoint calls or VCF entries!' 
diff --git a/tests/breakpoint/breakpoint1/Makefile b/tests/breakpoint/breakpoint1/Makefile new file mode 100644 index 00000000..5feae061 --- /dev/null +++ b/tests/breakpoint/breakpoint1/Makefile @@ -0,0 +1,119 @@ +SHELL:=/bin/bash -euo pipefail + +# +# chr1: CCCGTAGGTAAGGGCGTTAGTGCAAGGCCACATTGGGACACGAGTTGATA +# chr2: gCCGTAGGTAAGGGCGTTAGTGC +# chr3: GAACACCCTTTGGTTTAAGCCGGGTTGGAGTTGGCCAAAGAAGTTCAACG +# chr4: ATTCTACAGCAGGTCATGAGCAACCGGCACTCGAGCAGACGTACGGGAAA +# +# >mix +# gCCGTAGGTAAGGGCGTTAGaCGGGTTGGAGTTGGCCAAAGAAGTTCAAgg +# .1111111111111111111.3333333333333333333333333333.. +# +# >repeat of 5bp +# CCCGTAGGTAAGGGCGTTAGTGCAGTGCAAGGCCACATTGGGACACGAGTTGAgg +# 111111111111111111111111xxxxx111111111111111111111111.. +# +# >deletion of 1bp +# cAACACCCTTTGGTTTAAGCCG-GTTGGAGTTGGCCAAAGAAGTTCAAgg +# .333333333333333333333 3333333333333333333333333.. +# +# >SNP +# ATTCTACAGCAGGTCATGAGCAACtGGCACTCGAGCAGACGTACGGGAAA +# 444444444444444444444444 4444444444444444444444444 +# + +K=11 +CTXDIR=../../.. +MCCORTEX=$(CTXDIR)/bin/mccortex31 +CTX2DOT=$(CTXDIR)/scripts/cortex_to_graphviz.pl +BRKCHCK=$(CTXDIR)/scripts/cortex_brkpnts_check_ref.pl +VCFSORT=$(CTXDIR)/libs/biogrok/vcf-sort +VCFRENAME=$(CTXDIR)/libs/biogrok/vcf-rename + +BGZIP=$(CTXDIR)/libs/htslib/bgzip +BCFTOOLS=$(CTXDIR)/libs/bcftools/bcftools + +SEQS=sample.fa ref.fa +GRAPHS=$(SEQS:.fa=.k$(K).ctx) +TGTS=breakpoints.txt.gz breakpoints.norm.vcf.gz $(GRAPHS) +# join.k$(K).ctx + +all: $(TGTS) cmp_breakpoint cmp_vcf + +ref.fa: + ( echo '>chr1'; \ + echo CCCGTAGGTAAGGGCGTTAGTGCAAGGCCACATTGGGACACGAGTTGATA; \ + echo '>chr2'; \ + echo gCCGTAGGTAAGGGCGTTAGTGC; \ + echo '>chr3'; \ + echo GAACACCCTTTGGTTTAAGCCGGGTTGGAGTTGGCCAAAGAAGTTCAACG; \ + echo '>chr4'; \ + echo ATTCTACAGCAGGTCATGAGCAACCGGCACTCGAGCAGACGTACGGGAAA; ) > $@ + +sample.fa: + ( echo '>contig0'; \ + echo gCCGTAGGTAAGGGCGTTAGaCGGGTTGGAGTTGGCCAAAGAAGTTCAAgg; \ + echo '>contig1'; \ + echo CCCGTAGGTAAGGGCGTTAGTGCAGTGCAAGGCCACATTGGGACACGAGTTGAgg; \ + echo '>contig2'; \ + echo 
cAACACCCTTTGGTTTAAGCCGGTTGGAGTTGGCCAAAGAAGTTCAAgg; \ + echo '>contig3'; \ + echo ATTCTACAGCAGGTCATGAGCAACtGGCACTCGAGCAGACGTACGGGAAA; ) > $@ + +truth.vcf.gz: + ( printf "##fileformat=VCF4.1\n##fileDate="`date '+%Y%m%d'`"\n"; \ + printf "##reference=ref.fa\n"; \ + printf "##contig=<id=chr1,length=50>\n"; \ + printf "##contig=<id=chr2,length=23>\n"; \ + printf "##contig=<id=chr3,length=50>\n"; \ + printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample\n"; \ + echo "chr1 18 . T TAGTGC . . . GT 1"; \ + echo "chr3 21 . CG C . . . GT 1"; \ + echo "chr4 25 . C T . . . GT 1"; ) > truth.vcf + $(BGZIP) truth.vcf + $(BCFTOOLS) index truth.vcf.gz + +%.k$(K).ctx: %.fa + $(MCCORTEX) build -m 10M -k $(K) --sample $* --seq $< $@ >& $@.log + +breakpoints.txt.gz: sample.k$(K).ctx ref.fa + $(MCCORTEX) breakpoints -t 1 -m 10M --minref 5 \ + --seq ref.fa --out $@ sample.k$(K).ctx >& $@.log + +breakpoints.raw.vcf: breakpoints.txt.gz $(SEQS) + $(MCCORTEX) calls2vcf -o $@ breakpoints.txt.gz ref.fa >& $@.log + +breakpoints.sort.vcf: breakpoints.raw.vcf + $(VCFSORT) $< > $@ + +breakpoints.norm.vcf.gz: breakpoints.sort.vcf + $(BCFTOOLS) norm --site-win 5000 --fasta-ref ref.fa $< | \ + $(BCFTOOLS) norm --rm-dup any --do-not-normalize > breakpoints.norm.vcf + $(BGZIP) breakpoints.norm.vcf + $(BCFTOOLS) index breakpoints.norm.vcf.gz + +cmp_breakpoint: breakpoints.txt.gz ref.fa + $(BRKCHCK) <(gzip -fcd breakpoints.txt.gz) ref.fa + +# compare truth.vcf and breakpoints.norm.vcf.gz +# Check no entries private to either truth.vcf.gz or breakpoints.norm.vcf.gz +cmp_vcf: breakpoints.norm.vcf.gz truth.vcf.gz + @$(BCFTOOLS) stats breakpoints.norm.vcf.gz truth.vcf.gz | \ + grep '^SN\s*[01]\s' | grep -v 'number of samples' | \ + awk 'BEGIN{FS="\t"}{ if($$4 != 0){ print "Missing VCF entries!"; exit -1; } }' + @echo 'VCF files match!' 
+ +join.k$(K).ctx: $(GRAPHS) + $(MCCORTEX) join -o $@ $(GRAPHS) + +join.k$(K).pdf: join.k$(K).ctx + $(CTX2DOT) --simplify $< | dot -Tpdf > $@ + +plots: join.k$(K).pdf + +clean: + rm -rf $(TGTS) $(SEQS) + rm -rf ref.* breakpoints.* truth.* join.* *.log + +.PHONY: all clean plots cmp_breakpoint cmp_vcf diff --git a/tests/breakpoint/breakpoint2/Makefile b/tests/breakpoint/breakpoint2/Makefile new file mode 100644 index 00000000..414f3ca0 --- /dev/null +++ b/tests/breakpoint/breakpoint2/Makefile @@ -0,0 +1,44 @@ +# +# Check that we don't call a ref bubble +# + +SHELL:=/bin/bash -euo pipefail + +CTXDIR=../../.. +CTXPIPELINE=$(CTXDIR)/scripts/make-pipeline.pl +DNACAT=$(CTXDIR)/libs/seq_file/bin/dnacat +READSIM=$(CTXDIR)/libs/readsim/readsim +VCFCOUNT=$(CTXDIR)/libs/biogrok/vcf-count + +REFLEN=1000 +K=31 +SEQDEPTH=30 +READLEN=50 +OUTDIR=proj + +all: run + +ref.fa: + echo '>ref' > $@ + echo -n TCTCATATGGGCATTGTCGTCTGCCCGTCACCTTCGGTCGACGCTGTTCAACATTCGGTGTTGTAGTTTATTATACTAGCGCAATCCCCGAGTTTGGGCA >> $@ + echo TCTCATATGGGCATTGTCGTCTGCCCGTCACCTTCGGTCGAgGCTGTTCAACATTCGGTGTTGTAGTTTATTATACTAGCGCAATCCCCGAGTTTGGGCA >> $@ + +reads/reads.fa.gz: ref.fa + mkdir -p reads + $(READSIM) -r ref.fa -l $(READLEN) -s -d $(SEQDEPTH) reads/reads + +task.k$(K).mk: + echo "RefReads reads/reads.fa.gz" | $(CTXPIPELINE) -r ref.fa $(K) proj - > $@ + +run: task.k$(K).mk reads/reads.fa.gz ref.fa + $(MAKE) -f $< CTXDIR=$(CTXDIR) breakpoints-vcf + @# Check no VCF entries + (( `$(VCFCOUNT) proj/vcfs/breakpoints.joint.links.k$(K).vcf.gz` == 0 )) || false + @# Check no breakpoint call entries + (( `grep -c '>brkpnt' proj/k$(K)/breakpoints_links/joint.brk.gz` == 0 )) || false + @echo 'Success: no breakpoint calls or VCF entries!' 
+ +clean: + rm -rf ref.fa* reads proj task.k$(K).mk + +.PHONY: all run clean diff --git a/tests/dist_matrix/Makefile b/tests/dist_matrix/Makefile index 97134b40..14b78f7a 100644 --- a/tests/dist_matrix/Makefile +++ b/tests/dist_matrix/Makefile @@ -31,7 +31,7 @@ truth.tsv: beauty.fa beast.fa 'my ($$N,$$H,$$T) = map {my ($$x) = ($$_ =~ /(\d+)/); $$x} ($$a,$$b,$$c); '\ 'print ".\tcol0\tcol1\n"; '\ 'print "col0\t$$H\t".max(min($$H,$$N)-max($$N-$$T,0),0)."\n"; '\ -'print "col1\t0\t$$T\n";' > $@ +'print "col1\t.\t$$T\n";' > $@ dist.tsv: beauty.ctx beast.ctx $(MCCORTEX) dist -q --out $@ beauty.ctx beast.ctx From 47b32e2b12b5918b3206f5cd8da7e197cd699581 Mon Sep 17 00:00:00 2001 From: Isaac Turner <turner.isaac@gmail.com> Date: Mon, 31 Oct 2016 11:08:13 +0000 Subject: [PATCH 2/3] Squash develop into master - Add bfc to kmer sensitivity experiment - Documentation updates - Update libraries - Update vcfcov usage to indicate VCF should have duplicates removed - pipeline: add NKMERS= to vcfcov Fixes #45. Thanks to @peterdfields. 
- ctx_vcfcov.c: ignore -n,--nkmers if --low-mem is passed - add some comments to tests --- .gitmodules | 3 + README.md | 8 +- libs/Makefile | 6 +- libs/bcftools | 2 +- libs/bfc | 1 + libs/htslib | 2 +- libs/samtools | 2 +- libs/seq-align | 2 +- libs/string_buffer | 2 +- results/assembly/Makefile | 2 +- results/kmer_size_experiment/Makefile | 25 +++-- results/kmer_size_experiment/notes.txt | 12 +++ .../results/20160929thurs/perfect_no_pe.pdf | Bin 0 -> 5524 bytes .../results/20161012wed/bad.edges.csv | 12 +++ .../20161012wed/cleaning.corr.table.csv | 12 +++ .../results/20161012wed/cleaning.table.csv | 12 +++ .../results/20161012wed/perfect.links.csv | 10 ++ .../results/20161012wed/perfect.pdf | Bin 0 -> 6029 bytes .../results/20161012wed/perfect.pe.csv | 10 ++ .../results/20161012wed/perfect.plain.csv | 10 ++ .../results/20161012wed/perfect_nope.pdf | Bin 0 -> 5606 bytes .../results/20161012wed/stoch.links.csv | 10 ++ .../results/20161012wed/stoch.pdf | Bin 0 -> 6053 bytes .../results/20161012wed/stoch.pe.csv | 10 ++ .../results/20161012wed/stoch.plain.csv | 10 ++ .../results/20161012wed/stocherr.links.csv | 10 ++ .../results/20161012wed/stocherr.pdf | Bin 0 -> 6027 bytes .../results/20161012wed/stocherr.pe.csv | 10 ++ .../results/20161012wed/stocherr.plain.csv | 10 ++ .../20161012wed/stocherrcorr.links.csv | 10 ++ .../results/20161012wed/stocherrcorr.pdf | Bin 0 -> 6230 bytes .../results/20161012wed/stocherrcorr.pe.csv | 10 ++ .../20161012wed/stocherrcorr.plain.csv | 10 ++ .../results/generate-results.sh | 34 +++++++ .../results/make-csvs-and-plots.sh | 19 ---- .../kmer_size_experiment/results/plot-bfc.R | 88 ++++++++++++++++++ .../results/plot-n50-and-errs.R | 73 ++++++++------- scripts/make-pipeline.pl | 10 +- scripts/mccortex | 4 +- scripts/python/break-contigs-vs-truth.py | 2 + scripts/python/count-bad-edges.py | 0 scripts/python/mccortex.py | 0 scripts/python/pyRBT.py | 0 src/commands/ctx_contigs.c | 4 +- src/commands/ctx_vcfcov.c | 12 ++- 
tests/pipeline/Makefile | 5 + 46 files changed, 395 insertions(+), 79 deletions(-) create mode 160000 libs/bfc create mode 100644 results/kmer_size_experiment/notes.txt create mode 100644 results/kmer_size_experiment/results/20160929thurs/perfect_no_pe.pdf create mode 100644 results/kmer_size_experiment/results/20161012wed/bad.edges.csv create mode 100644 results/kmer_size_experiment/results/20161012wed/cleaning.corr.table.csv create mode 100644 results/kmer_size_experiment/results/20161012wed/cleaning.table.csv create mode 100644 results/kmer_size_experiment/results/20161012wed/perfect.links.csv create mode 100644 results/kmer_size_experiment/results/20161012wed/perfect.pdf create mode 100644 results/kmer_size_experiment/results/20161012wed/perfect.pe.csv create mode 100644 results/kmer_size_experiment/results/20161012wed/perfect.plain.csv create mode 100644 results/kmer_size_experiment/results/20161012wed/perfect_nope.pdf create mode 100644 results/kmer_size_experiment/results/20161012wed/stoch.links.csv create mode 100644 results/kmer_size_experiment/results/20161012wed/stoch.pdf create mode 100644 results/kmer_size_experiment/results/20161012wed/stoch.pe.csv create mode 100644 results/kmer_size_experiment/results/20161012wed/stoch.plain.csv create mode 100644 results/kmer_size_experiment/results/20161012wed/stocherr.links.csv create mode 100644 results/kmer_size_experiment/results/20161012wed/stocherr.pdf create mode 100644 results/kmer_size_experiment/results/20161012wed/stocherr.pe.csv create mode 100644 results/kmer_size_experiment/results/20161012wed/stocherr.plain.csv create mode 100644 results/kmer_size_experiment/results/20161012wed/stocherrcorr.links.csv create mode 100644 results/kmer_size_experiment/results/20161012wed/stocherrcorr.pdf create mode 100644 results/kmer_size_experiment/results/20161012wed/stocherrcorr.pe.csv create mode 100644 results/kmer_size_experiment/results/20161012wed/stocherrcorr.plain.csv create mode 100755 
results/kmer_size_experiment/results/generate-results.sh delete mode 100755 results/kmer_size_experiment/results/make-csvs-and-plots.sh create mode 100755 results/kmer_size_experiment/results/plot-bfc.R mode change 100644 => 100755 scripts/python/break-contigs-vs-truth.py mode change 100644 => 100755 scripts/python/count-bad-edges.py mode change 100644 => 100755 scripts/python/mccortex.py mode change 100644 => 100755 scripts/python/pyRBT.py diff --git a/.gitmodules b/.gitmodules index bf766840..984d0cad 100644 --- a/.gitmodules +++ b/.gitmodules @@ -69,3 +69,6 @@ [submodule "libs/carrays"] path = libs/carrays url = https://github.com/noporpoise/carrays.git +[submodule "libs/bfc"] + path = libs/bfc + url = https://github.com/lh3/bfc.git diff --git a/README.md b/README.md index 1caab186..24b4ffd1 100644 --- a/README.md +++ b/README.md @@ -103,9 +103,9 @@ Commands inferedges infer graph edges between kmers before calling `thread` join combine graphs, filter graph intersections links clean and plot link files (.ctp) - pjoin merge path files (.ctp) + pjoin merge link files (.ctp) popbubbles pop bubbles in the population graph - pview text view of a cortex path file (.ctp) + pview text view of a cortex link file (.ctp) reads filter reads against a graph rmsubstr reduce set of strings to remove substrings server interactively query the graph @@ -117,7 +117,8 @@ Commands vcfcov coverage of a VCF against cortex graphs vcfgeno genotype a VCF after running vcfcov view text view of a cortex graph file (.ctx) - + + Type a command with no arguments to see help. 
Common Options: @@ -196,6 +197,7 @@ Used in testing: * [bwa](https://github.com/lh3/bwa) (MIT) * [readsim](https://github.com/noporpoise/readsim) (Public Domain) * [samtools](https://github.com/samtools/samtools) (MIT) +* [bfc](https://github.com/lh3/bfc) (MIT) Citing ------ diff --git a/libs/Makefile b/libs/Makefile index 2dd23a81..7154a757 100644 --- a/libs/Makefile +++ b/libs/Makefile @@ -5,7 +5,7 @@ SHELL=/bin/bash # make clean <- clean all libraries CORE=xxHash htslib string_buffer bit_array seq_file seq-align msg-pool sort_r madcrowlib carrays misc -OTHER=bcftools samtools bwa readsim bioinf-perl maximal_substrs vcf-slim +OTHER=bcftools samtools bwa bfc readsim bioinf-perl maximal_substrs vcf-slim ALLTGTS=$(CORE) $(OTHER) @@ -35,6 +35,9 @@ samtools: htslib samtools/Makefile bwa: bwa/Makefile cd bwa && $(MAKE) +bfc: bfc/Makefile + cd bfc && $(MAKE) + string_buffer: string_buffer/Makefile cd string_buffer && $(MAKE) all @@ -96,6 +99,7 @@ clean: cd htslib && $(MAKE) clean cd bcftools && $(MAKE) clean cd samtools && $(MAKE) clean + cd bfc && $(MAKE) clean cd string_buffer && $(MAKE) clean cd bit_array && $(MAKE) clean cd seq_file && $(MAKE) clean diff --git a/libs/bcftools b/libs/bcftools index fd5c871c..54a215f1 160000 --- a/libs/bcftools +++ b/libs/bcftools @@ -1 +1 @@ -Subproject commit fd5c871c191d520422f7392a998e2e973e9cf76f +Subproject commit 54a215f1f8467347b1ec2ac444cb26fa1ed5f8eb diff --git a/libs/bfc b/libs/bfc new file mode 160000 index 00000000..69ab176e --- /dev/null +++ b/libs/bfc @@ -0,0 +1 @@ +Subproject commit 69ab176e7aac4af482d7d8587e45bfe239d02c96 diff --git a/libs/htslib b/libs/htslib index 4295de42..1bc5c562 160000 --- a/libs/htslib +++ b/libs/htslib @@ -1 +1 @@ -Subproject commit 4295de423b015b216587a9043f1c0504d5d48f3f +Subproject commit 1bc5c562ce4146b98e4c01a4d5697bf363734789 diff --git a/libs/samtools b/libs/samtools index 9df63211..ce4a601a 160000 --- a/libs/samtools +++ b/libs/samtools @@ -1 +1 @@ -Subproject commit 
9df632114a537f57928e4b2c17fbb1928679f877 +Subproject commit ce4a601a0859bc9ccfcf000dddf0ac77e7d576b3 diff --git a/libs/seq-align b/libs/seq-align index f4f2c711..a5bd29f6 160000 --- a/libs/seq-align +++ b/libs/seq-align @@ -1 +1 @@ -Subproject commit f4f2c71108d048f52971e702b17805f747f7c4f8 +Subproject commit a5bd29f6cdc0f00b6b95eb8cd0f38e9c103c56e6 diff --git a/libs/string_buffer b/libs/string_buffer index 35dfcd3c..5a4f5ef9 160000 --- a/libs/string_buffer +++ b/libs/string_buffer @@ -1 +1 @@ -Subproject commit 35dfcd3c69de5e23b59ad5e853300e9e321b6363 +Subproject commit 5a4f5ef9855a55b760a24332d238587f5d337a9e diff --git a/results/assembly/Makefile b/results/assembly/Makefile index 63c9ce21..9a623715 100644 --- a/results/assembly/Makefile +++ b/results/assembly/Makefile @@ -65,7 +65,7 @@ samples.txt: "reads/chrom1.$(HAPDEPTH)X.1.fa.gz:reads/chrom1.$(HAPDEPTH)X.2.fa.gz" > $@ task.k$(K).mk: samples.txt - $(CTXPIPELINE) -r $(REF) $(K) $(OUTDIR) $< > $@ + $(CTXPIPELINE) $(K) $(OUTDIR) $< > $@ supernodes.k$(K).fa: task.k$(K).mk $(READS) $(MAKE) -f $< CTXDIR=$(CTXDIR) MEM=1G graphs diff --git a/results/kmer_size_experiment/Makefile b/results/kmer_size_experiment/Makefile index 607c679c..73bf0a14 100644 --- a/results/kmer_size_experiment/Makefile +++ b/results/kmer_size_experiment/Makefile @@ -11,16 +11,21 @@ SHELL=/bin/bash -euo pipefail # # Isaac Turner 2016-10-28 -KMERS=21 31 41 51 61 71 81 91 99 CTXDIR=../../ DNACAT=$(CTXDIR)/libs/seq_file/bin/dnacat +BFC=$(CTXDIR)/libs/bfc/bfc GENREADS=$(CTXDIR)/scripts/python/generate-reads.py COUNT_BAD_EDGES=python $(CTXDIR)/scripts/python/count-bad-edges.py +TIME=/usr/bin/time -l + REF=$(CTXDIR)/results/data/chr22/chr22_17M_18M.fa READS_PERFECT=data/perfect_cov.fa.gz READS_STOCH=data/stoch_cov.fa.gz READS_STOCHERR=data/stocherr_cov.fa.gz +READS_CORR=data/stocherr_corr.fa.gz + MKFILE=runk.mk +KMERS=21 31 41 51 61 71 81 91 99 FRAGLEN=400 READLEN=100 DEPTH=100 @@ -28,7 +33,7 @@ ERRRATE=0.005 SEED=2380999655 # 
{perfect_cov,stoch_cov,stocherr_cov}/k{21,31,41,51,61,71,81,91,99}/stats.links.txt -NAMES=perfect_cov stoch_cov stocherr_cov +NAMES=perfect_cov stoch_cov stocherr_cov stocherr_corr PLAIN_STATS=$(shell for d in $(NAMES); do for k in $(KMERS); do echo $$d/k$$k/stats.plain.txt; done; done) LINKS_STATS=$(PLAIN_STATS:plain.txt=links.txt) DIRS=data $(NAMES) @@ -39,9 +44,10 @@ DIRS=data $(NAMES) PERFECT=$(foreach K,$(KMERS),perfect_k$(K)) STOCH=$(foreach K,$(KMERS),stoch_k$(K)) STOCHERR=$(foreach K,$(KMERS),stocherr_k$(K)) +ERRCORR=$(foreach K,$(KMERS),stocherr_corr_k$(K)) -# all: $(PLAIN_STATS) $(LINKS_STATS) bad.edges.csv -all: $(PERFECT) $(STOCH) $(STOCHERR) bad.edges.csv +# all: $(PLAIN_STATS) $(LINKS_STATS) results/bad.edges.csv +all: $(PERFECT) $(STOCH) $(STOCHERR) $(ERRCORR) results/latest/bad.edges.csv $(REF): cd ../data && ./download.sh @@ -55,6 +61,9 @@ $(READS_STOCH): $(REF) | $(DIRS) $(READS_STOCHERR): $(REF) | $(DIRS) $(DNACAT) -P $(REF) | $(GENREADS) -s $(SEED) -p $(FRAGLEN) -r $(READLEN) -d $(DEPTH) -e $(ERRRATE) | gzip -c > $@ +$(READS_CORR): $(READS_STOCHERR) | $(DIRS) + $(TIME) $(BFC) -s 1m <($(DNACAT) -F $(READS_STOCHERR)) <($(DNACAT) -F $(READS_STOCHERR)) 2> $@.log | gzip -c 1> $@ + # perfect_cov/k%/stats.plain.txt perfect_cov/k%/stats.links.txt: $(READS_PERFECT) perfect_k%: $(READS_PERFECT) $(MAKE) -f $(MKFILE) K=$* REF=$(REF) NAME=perfect_cov INPUT=$(READS_PERFECT) @@ -67,15 +76,19 @@ stoch_k%: $(READS_STOCH) stocherr_k%: $(READS_STOCHERR) $(MAKE) -f $(MKFILE) K=$* REF=$(REF) NAME=stocherr_cov INPUT=$(READS_STOCHERR) CLEAN=1 +stocherr_corr_k%: $(READS_CORR) + $(MAKE) -f $(MKFILE) K=$* REF=$(REF) NAME=stocherr_corr INPUT=$(READS_CORR) CLEAN=1 + # Find the number of sequencing errors that would add a new edges between two # existing kmers -bad.edges.csv: $(REF) +results/latest/bad.edges.csv: $(REF) + mkdir -p results/latest $(DNACAT) -P $(REF) | $(COUNT_BAD_EDGES) $(DEPTH) $(ERRRATE) > $@ $(DIRS): mkdir -p $@ clean: - rm -rf $(DIRS) bad.edges.csv + rm 
-rf $(DIRS) results/latest/bad.edges.csv .PHONY: all clean diff --git a/results/kmer_size_experiment/notes.txt b/results/kmer_size_experiment/notes.txt new file mode 100644 index 00000000..33c58b7d --- /dev/null +++ b/results/kmer_size_experiment/notes.txt @@ -0,0 +1,12 @@ + +Requires python and uses the bundled version of bfc + +Sample reads and assemble with: + + make + +Generate plots and tables with: + + cd results + ./generate-results.sh + diff --git a/results/kmer_size_experiment/results/20160929thurs/perfect_no_pe.pdf b/results/kmer_size_experiment/results/20160929thurs/perfect_no_pe.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2e6069c1118fcf1238ad89024142894bd10602d5 GIT binary patch literal 5524 zcmZ`-2{@GN+qZ;}eM`32PL|h<y^x(O$-XaR3?^fSS&;1ema?Rjoh&($BwMy5B>NJf zEFDD15<>OO)am@s|2yCJUNhIs^SjsQzTWHl-S;heSyx{MBC9|r8o3a;7&#a@>V&0( z0CE7v{R*9$8Xc&EMZoY#3|be4M*tGK=b&;BC5W7y0z`%Qt0GAUnqoZuUzrsWk0M%G z00vl?pEnYY1I!3`KLVa;b{UKDAixn=l2l$+URI8X{fvbnL6USD8gvM>2Pq5a-`N15 z)m1+P0P4W-FciiM058M55I8`ds9*v7$3XdC1|&U_HWq=#|D_J<V9<Et5eKOJGzTpZ zI1B*`C;C(PU;ao6zsq5QKzrf60f?fCA^_?mQF!7FKz$UkIJyWp#sfjh8;2*>%$F`G zXWjli##;GQSX;Or|F!!vebpHs)EHmfyc74djx{5KMgkF4*B)GEdbH`)9q>te`peUT zmQRJtV_W;GsW&nveepM(@R-@nd-)rCi&0D+T|-;e*ypyFs9cxDHxjSy(-zv~S0+~+ z%{aT*xK<jgAEd`@^=~Z4@6UYeH2v(uG@!w>dEuR|z}+LCo!v(Ijd8IC10K6xF-C*1 z$wyzaj+>flO)rkei)?>fx=Ai+`#zn*&PJ1QZls7Ti=lv1pr2h@7!Ygsv|r5bp(;Az zqp3^RRaSa3OQtALpjhiBin&p*O*YAG3W(bD3EWHE;%R+-;Q!2McQ_^*!3O$0vSkC) z843d0?)W`IEl~BQ5&iMB(=943hzJ)%%COP1_G`nb9L3nBr~R^$Tn#!eK4$QWI|UWE zj)<0k9HBMN^l?$}Q<PqjlG0dvMUFA2<QIeq&hj)krkgQazM)4)CCDlUt}>Q|ohlD& zHy5zn=1`L?ATw*fzj~Vy+BGAzQYqmsM@_NKVU-|3#d6T$CIt%PIkkk70BRwN^2JW+ zM^`BT3*IA@i8-HV5d|!I%y3|c_eD!tTBPhep6Wx=d|Sk6VQR=}${K1si+Eq>jXJk- zzsKw#;H}H@Li5CuFskE3MTQ)|>Wvj~!+{1T-4vGS(<zbOe$;q5M&SczsK*sME&kpB za)|yZcR6Nr?amB|XfKA0;ukAE0o-cU>l9J?>{_RE+hJ2s54Va%+DmYL3menUOzW4j zYhBW7<7b!06xvl0(kHb|PClP3$g$wbw49(8E@jX!XBenJM&n*hGRF`<HCn3ol_bjk 
zV9+<=Ns%M|8>9X+v#uin8O?Gf8h7RGTn<#}grL$180zw+3WisbKG7_r)WTygmwTFw zWj&iRV8u$uE_|**e4Or#U)ft#LaK_eQLaXc+pPLka)?Q3)$$y9T}AFjt(cS+R-0Fj zS=HytdQH`YGW^-hU_qwJ!ojr<?)iUgm0k6W31w?)N|aN3+SPyJYi?ANDz?aJb;+9W zrbYN+T52Fp@^FSvhxQ9!PR+(jUGZZ>0g+Bt%0Cp8+dKmCON`Ga_*!PwRL+!EFPE<H z@CFOI+XyLllpO7>MNw?``5th~&g-|naon0;Z12_yt3Phe?7SAjx$R}u=Q^6Kf7YsT zvBZA#^ZHgN0`ms-HKbSOGg8sLZ@JY?CgXf&c+mJB;gi#827G7YuXiR~NBm*+{JhZW zqws4luj7~8qUTzH;Y-+F59P`>S$qFj2lOz<xLDcU6`uEZ{}A{rWfPLkrtIL)hwJCa z@D^{}8E~D4#?n_*@jvT4bv|@wqkgtqa<8$p<CR1zQX{7|XlLVr$vdy6?O}Xuz(_-- zu@f-u*IGt%0nKN{KqJ0fUGG=&9PAEnSloA1c{tmV?>GrBr)04L!_@r;8Qo8|g-~IG zzLK9A=Bf@Dd^)|nu`(=UMztth-!&uRI?Z{TYDB-uy*#Z@>^Q{KKRRdqt!tY1q4nL6 zr<D+qnCt*+vXPcEWPxQAb4%9bBRNpbU|Xu0XST6Tr;<bLsT1GoKa}GCWNBKYl(s-# zpSc>bo5FemE!lsj5s|5gH9$`E@D=}&#$!Zf*x5t&QQVh#@1sCRq?2bM<G6u?37^?x zZ(5%#ctKp^L(~L)?w2^TFJC`cUPT|2<cEw%<@?~~ama@u6ZH8+34tiF|0Utk82^xA zCfYck5w?A?I^<z@?BJgb+xB3{1W~e(VRpaax1=+65b;YQ^`q$>^Q9a;#|tY{#>(7! z+pp^*>$p2T22LYtT~^ZlUoEGwXdqHZhN_EVKYpYfV&qp{4>eY96!hKs#-3Q4%bM0K zsoLL|eOKs<g%DOsQE#<G_k+F>7`OITz0q{?ykJcr?mM_L%eApRH?`1e%5VOLYE!&K zwtoG_#<P>ho!_3D@(}JRyLKH&EnewC<ZINNFgYpuM!n|%yg72Jag*)%(OlkT?6al| zv*N8Ei+fpR<YC;DQ--+E@OYQGw-1$Q&<FYaA<fgdIV~|lyh)p*ZL`rkJNwJc&Fs_l zKc<c;triHCM85lrI{sol60!Xvp5M&(i|{~Wqz4Xg1V}_@0XP#05knvn8w3DN5gtgG zHYNyglq1R%<e{Xm^FO$e#BIO1j7Y(7O9UQp1c^-mF#;k8?@R~k6L}F(Qv>y(0EFb^ zC$oZjL3jg8JgEmzQzN+}hD2eomVPk!PXxqS7-$2Ezt#W4#UzLn&k?++qoWPOAv^#j zl9uytb|;;F-%*Q1r)W|TP!|z^gd<2!!AnHO2LIw+sG_RU|H-?goc`wB+^j$=%PG$I z!4~C~%jwKXnL|F_78HgIErw~bXCbuAM(sDrgp_ox{B+IJN>4&1<|2x9(rSc~V#&>G z)=qHij`{ns^JHE0sN#PA?CNI!z4rq9VLtgSgR@?np%YDHH;<#Ko}1XwMw}|NW1nB{ zO1Pfv<wiq(>N6Q7ifqoWwpN?-K%XL;B4jcJmCbg*)zZWWd_6ildLgBQ>_>iPqrDJ> z?DCzI<?VwhCvL7@6?NMj(R#DqJ5Lo%2jq+fgzTfBHW|2WX~&SgYy^|)4m3uMyAL<8 zfcX4jMtt$QDr;*>nj1IQ^2vinXsZ}J8h<fP^Cjj(4R^>TB{GTh8QCSAr3Xp|o-P56 zl3Bb;HfPxD`gsJjdp*e3$J90O+Ci38q%473%*itDdInqa{NSMJa&p&eIY#B{!V8ee zY;Jqrc`TXd;ITdXRbTGMLgyz{R8{1BWqeyD#=@PC@1^etWVytIZ*c1h9CYs4hGjF5 
zJt{@iO&!MjB00#M*T^q7e~a&T1Idyew-Ojb{rQ_hTr6HJNP9_D&dfL&ljr+Lf-9Sc z`HbN@t|5aKt+Shd!h&`(RMPPf9T#P^XK<<Ri=Kf6zptm3^5YK#XucEpIa!WHS`)xb zJ>9-i6Tl>^&asCrfht=&PV*rD?g3Nh_mZi`ti-nmEJIm#6!!(sXom5VNt|T?Xs>xl z4Q+<fzciry+Mhg}eJA&5FzS1#ZG*}89!`<)RJegIw1L%p;M3h>2MPs%Oiz<?Rg-6h zlCAy3({`2yGNw673N49*6aH?HxhVFV<hlSCdqiEq>CC7qH}P_6LII0Hw36Rxd?b8M zL4`&(vd`_r47r6IV_t;hXS&y#%rUV&%p4Z+ctEFz_i?P<O`Z>wR5~n|c{5Ln>S!BM zaA>X-Sd{ZVh+8f6Tyb2Hz@KR%W6-HDF!yFX<Orq_mZL50aCpRmqN;T>Q#qL$!`0FI z8LA>68&%(a%a1aIc_YTG^Y$Ez9o>b<GL{HtmUBhPax9$B^|jN2L}*zAD{r!vP#`tm z-sCS4XiPFpE9-G&r`FShTsFz1e<i>XyHJ!>Xj7<E{J@;%a&EcCnDv;%6}m4hrusfb z{lfz*WO(Uw+_}lJ`nCo4hWJ)$S6Fsc4mqf!3*jk{)YIW6ykZ=+6tJfo?qvD`0H zV5K#7f1I1(T}xu<!aj0GL7viGW4^|Ejp}@AXteMG-tVc3tCQNL1gRI+a}IvueztyW z4cyOI-!X5dFm>&G?%P5f=!Y<coxG5mFDSv5Li;erQ2Q)QrZ9a-U&Xk=I7mx`^-IE~ z7k)(%4RLlC)pXU&ctu-9%?&+8lq4fX1bqe9#dC#bg<61&UVRTHteU;aJ8iUBzTkWD zHuHkBc4tpWd}g<i8k4}EbutesO|Vt4+O=}}<4(0*VUVuB;Z_B!ORr&K)eCcn`&Pwx ztB-LbPoUOQ`l?o{I#+A4$=g(a8n8=uB`oO7n<q&eNq9-XtzsUSAuJom2xI!I_Nyuk zI`XAdLl(&UPcYumPb4{#l5&ln808sVf9201$ydlTOu6!L7kUUgBpjZ%ma#UoZe0}{ zR~nakP(8j>ezkmS+-U6Wxc=C?3i+|nvC*;miolBZqngM&$QYjwS3V7^RBquJeN+2* zH*(Td<CkY9VifxogcWSPH%57fawbM5wntx$JQz7w{Ke|t!}Y@42Q4~BF?%J9*4?HC z1+_ZMB?Is{y3UNWfpU*sNGV>C@Qona^rb1fNj~^xuz7H7@Otq6F72W8H<53Ahb@Pe z)aR%Lsi~-|sOuTF;-ov-JDuac#Z_NVx$d5#beh1e4z-bIlp9b+E4KP*EBPzidycqa z``O{Ipr-P(zQI$clxmfYA(apwXna54YpSK@{&<-o&P$xy$<)a_Caj|w*_|jvs>6s( zv<$sWvvj^aw_TT`Z|!RRp|>w47d`78+8yW4=56Uc3g4-$%9eTa^yS`Lr){VAjg`v* zgP~|Iba(wUJa9Dcihz4tN9&vK(b2`xJ%#<Sk>Pg8aLvRPb@EF(xUXk!p!WHzs#O8! 
zmUbtmI_y7%On&9!DmA`qoZMG@_2tgN&Y9)wjjN4Vr#0vM=u`E&^(|<NK<B{JYtcuP zOrcD91{vKkWffizmbC`GTqazEsu`=_puF~s_9QVcrU#o}TpNmCVEfZ@9rxaLP<g6; zJn+@vJ520*$?zxPH-9Mo;30P)pNUwCsL<?gzZDs(RZ>8uHLk@J)gPtTp)TlW=+?7$ zvw4SMnSMDiHfkthXuWX%c}iziY^XpgEfak;ZTM*zn>q7vTSSvvZ&VglwNQ&tpU|dm zw(iBE;9`zqwxWB*uH8Z1ncar)DtFtLE=i*aY%d7i@WtZAdkbY;hte~Rerg>Wg(~xh zLW_D=lVznR>~dNU4p_THb3_eThZ7-*GKrh2g*ns%yCV_z>x|19P#dxvY<V;o8ch1; zD{$FNq*3gaxVlk*>AO6&?1emwv~8<)PAmGnvcs~8`>mJgO{eqsB-O=Q)G2SL-SM|F zWGSURg4W6JK;vs)*Cy4L^yS=McTy6iV^u`xOeH^V?oyZ<9E*6N``86~cQMN{V@FK+ z+`eHRHct-Y);P6o+9$ppy3^(o>CoXk>{#IV=&2k2{_?9s>!AntD}!y<s%0J2TVG=O zBkN-7kUrrOPh_RRkVaWgXR|b$;k!jOU&7WLlR1*~x|Ed~Q03DJZzU#G8ju^M4@?D& zgG9NEtBj?L-eZ=~tp~UIZEv}TzD<9Ec=WmuHE8qNW`1IAdZwOF!d7+=6qx3J+=8Ql z8}z2$K7aeuv&%4B^ZMjR#*d91xSgh)5tr-gRznF!UktC?nqI5&TrYk5ruk}f^{1`L zQJXSrTZesgwcBQYaNFSkb#Yu$=$%m2BYq`)<-PF!!=}cI#T8FJc81M|eT+<?4DC<t z@0YnNgQ)VWEFo;WJ@4py9~3@1`w~@&DXYKDPZ2K?&mVs8VEs#{R{XUak5Zq6dL20X z($p1~u&p}1QysrGBD$Ws#;<WE=t+I|{j$YteVh9Nqv|#|r$2+HXrCt*vE^S{%Fofb zMOgj#@xv&*#W;8~Cs7?&1wO-kMh#8vEQbu{e%$|b;p)3D$2If=T&XS|&Yzk-_*LNA z!wME@js(t8^HHCV=Z$Xy&pU5$=P8FQBpuxvwQZ@+Yv%M@AL%(xn_n}3gL!AzAlTB| z{B_0Vuxhy;haQ}<$8m+$f5;nYNjkc_&l9Hbt;Ts3P}f*hKlj<KBl<jirbhe%;>T{- zvlYRwu&$T^-5ud+gKn`<aqxo6LD2EViHz*Bt_xB>oQ__9m<ntQI)=(|oLBtrw0k^S zeb2SLZSu#+24WLGx%<tQyV}#`_A~Qv+?>ps?~mp?fo$@Pn>CwK7w&|ud_BFpwNaL0 zmLEpBAG34xuU1?2?>3a~uO*Wnnz+wFqKR83Qrr5UlPwahg~lO&19YG+($f<`+$@n+ zM~;AkBIt`m6L5g466jCB;1MW<C!P+Xpa6Pch|4e>4heu>SXcnD$A!TOq*fP>#KH-_ zo+v~R0OFA-4<h6XgJUsh0CdM9NQy*D7#vPqR?&eTNMa})5(j{?Xbj#1;R%4)pGm@r zoj(c%`v>k#KzqTk1YZ=4fCoT~7Y2>+Ax4A!^o%2Q?#}<%vi<D!e>dX4)@(l~0nn1* zj{muJBZ0&b;}Xo5m=bY1_*s`<I)(^T00NJM!|1@@bpc2f{pVIL2#fHflLH_SI=R0e zKuJ+iUJ>vF{>Grh-6QD%(0^kPCE^_P-xx$yf$09fF{m7I+WFrYvF`uIl!<fGf7{AI zA^)<KgR1@uQ&3U<7p4e-{v#JW7KTJ2u*B6V09qo05yblf;6=12hB%#(>H%6{FnECI a<X3HQco-J{voeZMs1lv1sGgZV-Twe`0hdnz literal 0 HcmV?d00001 diff --git a/results/kmer_size_experiment/results/20161012wed/bad.edges.csv 
b/results/kmer_size_experiment/results/20161012wed/bad.edges.csv new file mode 100644 index 00000000..ba2a975d --- /dev/null +++ b/results/kmer_size_experiment/results/20161012wed/bad.edges.csv @@ -0,0 +1,12 @@ +# The number of sequencing errors that would add a new edge between two +# existing kmers. Note: there are 3*reflen possible mutations +kmer,reflen,nkmers,nedges,nerror_edges,cov,err_rate,est_bad_edges +21,1000000,927610,933930,7951,100,0.005,1325 +31,1000000,971394,973934,1874,100,0.005,312 +41,1000000,988492,989443,394,100,0.005,65 +51,1000000,994492,994828,85,100,0.005,14 +61,1000000,996793,996939,35,100,0.005,5 +71,1000000,997897,997975,9,100,0.005,1 +81,1000000,998506,998551,4,100,0.005,0 +91,1000000,998891,998921,6,100,0.005,1 +99,1000000,999092,999114,0,100,0.005,0 diff --git a/results/kmer_size_experiment/results/20161012wed/cleaning.corr.table.csv b/results/kmer_size_experiment/results/20161012wed/cleaning.corr.table.csv new file mode 100644 index 00000000..8391d83f --- /dev/null +++ b/results/kmer_size_experiment/results/20161012wed/cleaning.corr.table.csv @@ -0,0 +1,12 @@ +# Number of kmers in the perfect, raw and cleaned graphs +# _nreal is the number of real kmers in the raw/cleaned graph +kmer,nkmers,raw_nkmers,raw_nreal,clean_nkmers,clean_nreal +k21,912362,1115087,912361,912624,912361 +k31,962865,1438774,962864,962835,962835 +k41,984331,1635697,984330,984322,984322 +k51,992751,1697232,992749,992735,992735 +k61,996024,1674397,996022,996008,996008 +k71,997500,1587464,997498,997498,997498 +k81,998290,1441765,998288,998287,998287 +k91,998785,1242238,998704,995357,995352 +k99,999043,907324,854979,576,576 diff --git a/results/kmer_size_experiment/results/20161012wed/cleaning.table.csv b/results/kmer_size_experiment/results/20161012wed/cleaning.table.csv new file mode 100644 index 00000000..28715de6 --- /dev/null +++ b/results/kmer_size_experiment/results/20161012wed/cleaning.table.csv @@ -0,0 +1,12 @@ +# Number of kmers in the perfect, raw and 
cleaned graphs +# _nreal is the number of real kmers in the raw/cleaned graph +kmer,nkmers,raw_nkmers,raw_nreal,clean_nkmers,clean_nreal +k21,912362,8064108,912361,916474,912340 +k31,962865,10437776,962864,963212,962853 +k41,984331,11684532,984330,984311,984310 +k51,992751,11939641,992749,992711,992711 +k61,996024,11301966,996022,996009,996009 +k71,997500,9841194,997498,997436,997436 +k81,998290,7606657,998286,997359,997359 +k91,998785,4640344,996949,406297,406277 +k99,999043,1484663,703944,2269,2266 diff --git a/results/kmer_size_experiment/results/20161012wed/perfect.links.csv b/results/kmer_size_experiment/results/20161012wed/perfect.links.csv new file mode 100644 index 00000000..4c2a267f --- /dev/null +++ b/results/kmer_size_experiment/results/20161012wed/perfect.links.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,52592,1 +31,52592,1 +41,52592,1 +51,52592,0 +61,52592,0 +71,52592,0 +81,52592,0 +91,52592,0 +99,52592,0 diff --git a/results/kmer_size_experiment/results/20161012wed/perfect.pdf b/results/kmer_size_experiment/results/20161012wed/perfect.pdf new file mode 100644 index 0000000000000000000000000000000000000000..a77667fdb9b7185f154637b9d841fab9966fef00 GIT binary patch literal 6029 zcmZ{oXIxX+*2d{bk&eh%2q*|Bge3GPkbp>$-fIj2A|b&PK_K)dO+`S8f^>mFM5Kv; zbd+ADH<2n`ktV*0j^4R5_dUOJKAh*Qz1Q07eEF|O*bt^A0g{wv77m;Z{1Dh3*l&wt z1_7i1*y}f#m6e%+P#h9YKw&X3H~|R|g(-riK(Zhy5Evv4l7~RVn1T9Or~mhxDT;t5 zuQCQ)!NEOTQ3yQ1fJpEl63EL8aaboJ0*Rx1g-AjqrO4WkTv&=E#;mH!jKnxmssR74 z1^_g@?STXUp>P5mjdcM44dE_GJODxtFb4c*f!x0qP~s@jI3$MfS2z%g#SqA^c!2zm z<v?R39!ta_$oonEV?Rp3uXgAmF)jpG09Z;v3INnXp$X&@0JYHM=D?5$tP_$_H=aQ5 znLD%Bqg5+?EK-JRqRwFkDHOQ5P*y`X0Qlm{0Xf5T-jQ1W9Q9)cF1i%|%Vy8SEhY%| ziom54mn`Q#CXlX8yjb77G8na%AGLPj>)6cYy9fCC@agM|yTx-M-*Mu*>&u}B4K>~| zJ(&V;n+Hg1lJ92`9{G$8aZ;+4N9%b4Rd>_R1PQ3fAMAbl(9PBxK7XX_`z3~D?Evaw zz%CUBeH3_j@Mf)|6Z_Wgtj?&f<xYDegduzQ@C47Ba%%L%dpMQTML_F}n+eqzLr+j~ zD)qgXfTV)Esd^1L!}Pf&CLChYQ$a$*<q@2|;tcBL>T`*{E0K<4fS~m(rGq=%tm{H6 
zblV|`i&>8iUl3p3JTH(%ea~IwrIWy?kT`cLTBrWw0cc1@&TQd(CgCwEhL;w!fK${e zjT;2<Hfcy8O`y@rDZ#NW&m});$duFAURUpX*ZEX*uxN>+8cY4oTFji-&T$dGkh3&( z_fLF@f!*^BzaJQ&qc&mtO4S*i)4_DJ+x$Da=ZLBGC@H1;c6l^wz_F6dlz^W0vmhJR z>tGR@M7lqdEvNMj=oVR89bAVbf+nMcL})q1Sf|8r_vVHFY*dzEc~Bjra=rXylan4S zW>Ioz?g|W6gil<V_ZdCs9Hp+;0t>KZftA+}AhMaVAKaW0#dlgap+OOE)nHvCSPrY; z$26q5agX+hGZABK>JpKF7TC*K#@CgTq$a+H8HHH9OlU3CFiI#QFoMR1zSdnWAW_op zPOqnzINkW{&pJ6cb9dA{QEyyyvZ3Y)^3G=Y4ofZ-eRZ!RbDIKPbyP0VRa+kWQlDb2 zYH89E4rmGoB(nF$?{etp-wkyXV96lM?Tl{kb~!@G_r9aPgutI-AD|qwP>%L+R~_Go z+<{h&=XXO51q#Hr1Pe?Af;a>UbaysynaWu+)Fin3<vN=&Mme7xRCk-okNxANe}iYr z4ORZZ9F7I1oMRYJK`LvOSiI(>sflRRz^!99jOakoxf|1|2efYD2_grL8q%WzCxqIM z-lV*fN)Rl}1>5>-jR-!~<ZO~kX>hZaD9C5cGtT$<(KM-_hxn_W?EL8Pijfsup=i{Z zAHU$_&)N}!!>t;Z1S)Jrmn4~NZUmIQXHn(Bq8cv@tR&?v3igFc1~x{^ojL4N*YuFc z_e#l2*~F#J5Cpt>>sj=}w5M^#JzZ7%MJKD>y-t5jTwr3k&wlvgQ&u%NlxfnY41Ucg zv<l5;;?7LI@gt{mM|tL7D0CnNs+&BLLf7wbi42pHq&DAy^^!b1T}MKg_@P*l4vBia zUeZ0C0oO)Wf)Poj1A*$i!uKdhxy~N)Ed71AAasw8VP4YDBc8a(ID&Mh;iONWXwFe$ zSy-FRYkUy3w2zToH<PlG(Z_^_r#0NIt!Im4yuq;tQf{NxI~9Rb^fIL5=L+e|3;z7% z*jCcR>?KFOdIWfg+g^fRntSx?l0aENf#a2y)N?iI-Ks?w!kr=GHf*oo7azzrp5bJV zABU&5ayyzTV{zejY2{{1>BmPe&kXx_1J<sU&ODcRUZ#Fl2ABZ=r@t895vx#MEm!$u z51j<K6)I*QH=gZeOQ!tyyR3WVfHj%9qOF<-;d(o#!hGwp*AfLA*TOX!_>GTOKJB=d z5C6h_*me)gGHVZ$W7AGoJ)ElNPbf?72&(DKI*FW2txjgpS$=10BV?gj;)(<BWj z`LLaK4UP?TaPT|$P>Aqp7VEw!mu9huZ&b-|isU=xHEH3Jo~|TM-(_$6&4cYd$L`jl z=hwrBczM{wh`=N%CcN?C`i#%FH4D!#vt#SUb*h)Yx7aC@rd8dp&sqa47}o!!Zy@FK ziqUICvQ-3CSTN1SJLqWGa$5?YtF$4S;Dt=88SOH2QuR#gvwAG3fngz1CU3X{2DVuP zR){pyxHOt<+g7Uld9mrEK_$!QBRooP2aFm{B<hbu>#eRq_$?=#QWXAxSXtqv^^2;m z9@+->Z{#$In6&ZAL!h?s2mG@%kCFu?pTA|j`AwhN*Z~m`KlO<x%@`9*pV<CR3cjL0 zc>al#E)cGd6Lad4Na*~gvEbL3*M!>?E0*?McJN)cP`!}Qu<*!uZye~mCg-ao%9p-% zwK4CgIgcFYM2>s-J1(waLHT!_5MMe@`)AxoJdK!dmq0@Iw}u73k5gtkv;#v$najf$ znf-$dHQm33Wr|qwHlxi3vI{dq>UcB{b3|u=nw<V(k$c}omy68@Ho^SpICW3Kq_z?( zO?@A+f?^Goiv_C%&jKd<t{Ii8^A2L-Z8Pf4Td8V+u|~n-1rwq%Ob$F{yP2Z%QJL}G 
zkvA$!Y&16wcnhAw3!LdHBDs!?2ioL#G=0qnW(zatBaJmT$xkP5PUPtRl!g&jG;r{C zj<rlG_|Tcz(Q!VTHx+4I&tS0n?8_gZD%;??WuBs;w$6|!8E>Pi!YxfzX~kP&y(Q#Z zHzu}&cS2f55j=HrVl(LBc#ZU{t9^@$d+SoKJZ%>m1ZHRRk&=IIX;x_J4d=fj`eV{~ zk+w8f#clKA1nt)J3vAdWE~$7QIUd_BY`o-Yl=124!<(ryn|VQ+l6j`XIxVYuZ^y(S z$Ev!XhK(ECUeCz1jz4{uZ|7R_6yXE>PMDR)`<2RD8&==FVX^)$3R2zckdH?j`}@rw zJr~$+JuQ9s9dGdLnP1f_KfP<;HtHI`Q*+TOsFFeFFJ|})c_>8llQ@1M&ri|;>Y|+R z02=^>aEt+VWD>#>$wUJJ0QHehD7XgJ3t%Hf{w56pQy#m&K@kONenAYGy$~iy0>B1H zray89q!+=C8K^}DB7m|oPzwwIQ8xL3R6tEH!W9z&1^$$kDcg}#qOmv=4>;n70>m3% z(E!N&3jYs;Q6x$}8{kzaR0EDjIss%UQFgzuj`H^FjOrAQ!cdX`VaQu31d_5T@ERG_ zfWLrCMnU#}0vDyG-@x@a(bLpq<V1LPja-dk97kk)kDIG8jrQ>x?fa7AASMo-H}|N7 zWMQTrFr)jq$H1b~0a?)dB|?#*)J7#Mbo{V^n;tx;60bTH@lU<Jz22ERCGgeHExo3D z!eyN_^or`<VK9Azo+VQNYlbDy%tCWSOq7cwBQ@&=6)l=-+M~2o<HW8OO)`z|a5OrZ zdzY`~6&qmhK)nBQbUoGg^!O?(ArO^eQuM;s?uacv-&=VVi$}s025*zfrS-d{bh?DB zg23hv@LQK`e0P$O><Zi95M};$eAg^;V{ZJyhZqH}+UWa^{Co??cB{a(B8XtZ)iAXd z>|6=I?=@K}(YSHRdAx}eS^^O#3J#L^aGPpc`>n%9AJDR+o4cL8xvRrklIwh41iy%_ zNm$GS?x>mWZvBO*<_;+~xtMdapy6bGtJ5<$D(CJ)E1ui#{4a!*hUFFHrQ9XlYefhA z?G97pzTQf-5Ak2)hY9R9?pXLGAE$bri!2-24|hlLQrWFg8&-b{?{oxGMZTydvXO3{ zedTL!+%$X1<znIZxUDXAx|<kqaeY6X%^&LE+ihGsvHlV6wf#UojsSw%%VC_ob74(w zU9%p0tn=yNy8?_y#Iq+j56{;|0OK`b?YVjYy~Hw`Htq=eWQ{Pj-Sm`Q_Qs>^k*dTy zqr03viIy}uf`6#_ouv{L=L9hMI9=>nC$Y3#q223@>P=31e9#?qM6#&VJ8C;|-ai&` z1qQC<GV1!2a%fE>4WQCgqg_@zwMfhThOYb#XC)Q;G%by~XawC&N6>T-&pm1wfR87j z?8(Xapd!Z$PZ@|$IHiMSJx&q=5!2H0jFN%vj&$SH#!_r)0b(1>9cmmQp=}(z#^D42 zwC(hZP|JI#=4k1moQ9|4j|oFHv}t(NR-PC?JzWsCoZ-A^vnWdV;}z9$XvGsFSFU|t zZ^m;{Ou6;e&pFZbrH%&j$6`bH>fdgF<pm^zD&8b`(E4($g%~tGoaVG-z8sjx8Nk7* zm>DI-d7?o}<G$B<CQiY^dtBKxD7DdhXR`&WBDL@5wb}46XljBC_2OCH3Gjx_W+rBs zXUJw17%>_?eyTcPHXwR~`3tAMmRn|LZ`UFf;gTZ%<0wfjizlf)XBJBrIX}zq^D+cy zAfiFBC;jzKi}04xl=7FJ>W@_j<!_0G=ayLhsW^0cCGvPP?)h~zsGQk{!-va<UMZFo zd~TNDQ7-Rbt9&iuVw2glwZ{by3lHu}{?}ZS9P82S&D$I8o5)=)Uv|G^mt)ffMY*Gy zo`q;@h{Gi^;(D~?bt`qf)X#H$iMZC}kr_}KW_eW^rfhIpxK`Lm+xfh#Sm1d<cfr*Q 
zkA)_LY5))3YB{mvl&$n8H83LSf>RgjjGl;Fiqna1@R(m55G9D)8hPbLSV)_GHnsgT zsn{~Z3wBd`GoQ=;t@fRwCL`+{(=3ANQdr+hu-S-~f~f-Zb}24ui~i#ko=eRUv(Oo% zNYMjP7g2<1NPz*;q-uaTptWqZEPosd$))cxM&-Q3x(2@#<Bg1btn*SQO(*8vO<u7x z8K-(_7neSR_u>1*eI+vqGXt~QWsyPI!HWgOgY!>sKiwSE85kYZ8ko$743Gx;2P*PC z^QZdNP)Vo|x49didgTi@32g4M?Wfls#VLd@j1Ps#bV{F-ws2kRKi%_asBdVi|6N}} zpJLV*)6{3H8IKEUpa&s4*=%O5`d6NmLKm{T5Mj)X5AJt8b@~j-CCCuJ5rtoUc@_K$ z;@#qH<Q?k0>izXI)BdGz=f9oVuh}<YP-GBfpl2vzs5rhEcBzr4(Jt&;SaD2r%=Kv5 zlSF<MusMWHs!I+dQ|qQ7dsEKJxz7>T$%A+Y)`v{EdylZnmdfdZ3PGp9;hkqX=;y0D z!zFr7Tsxr=#SnE$kE{Pdaw8fUYuzUiEWsjCeJS0F-?G`py>z)^-_;%a7&GBYY7M*2 z?P}uMkJv6ON|tzE-m)`lyJb67Rk(1gn}l(}v{sBEJp0pb2wbnLuYG?M9Gn&0meC3C z>wN?2Eg9Nmh-zU*xH~`g)KI#uP;^Ui{`1gCz11h*;XOXST-_AisP?SeE!(@>e=NjQ zEmz@eSL|{ytQD||8jP{0ooB31@BuA5i9PMggVvC|e3ya+^==pYA$y@>w&M3_mmQrQ zF>KRVcXg9bPxvhNN0U|jlts7PNX4M%yY5MB=#-fMOaFV{Wxt=Iwx%8rm=DNTYkiXt zNK(&!La#ok&K}emq*<>b=%MY{wsWs~`}hLOf@f$@Pe9LV#@B}E#>7yPKr9nGOEHuG zNjSF=M{ixgE62A%iS)%nHA3w|>#$_l)lBa!-Yo9S)GUWquh#fhZA8&^ix&IH{s`_S zVk_c9)`!&DJih%)<5eEY^{N^2Gy6gxG;c>qUK+A|RJ*&&)hzr-_zG9=9nc+#JL?J= zj~Kc>_XXsX>E>0U*Cf}t(-^Ug*tqp~zy*WzRU!!&RCI3XPo^m+&!!pQ-!gr3!jvT~ zZ$ED+r}i3){#g2sn2Jb^3hl%DNjFWkIdho~z-5qaaCm7)X=G`3`=f`ewz9&^Trx=L zNYsn!X6cdcfq*913wv<NheVSH+ahv`U$xV4X;N6ns*x@I_6u92?K=BF>w3Fhn<qBU z%N+?h3-9*LdJ1w1y)9OXC9PFzTd<vhWg%rKH-FKWl9zzKRg%tj2KUW-Q!-1w_^sGP z@kVMk%gI)vpN>V0iVn+HqSkT?^aXUig!y!fbT8^mVdpWmy9u2Z2@a&uxR=Q19U16u z^A7Wwp_Q@miZh}XlHEXov9m8~@QjEnZ(|=SJ^b|A5N=^q5%pa6g|0Qf?T8)Hu&ivE zMAZ4B9b=*IQ{=pwJNmx*c6ITm&EbCYJTnXHub5)T^-k}){Vs;Aut-u8N#WqEtd`u4 zf9L+Is;gP~FP9qqX8e`{BWOvTv7Ma~DH6yckHT!?mSaPG`;?b||3nMA5Sv%=@GMRE z`S7#;sk^IR8r8#n?mmxwNpjh>b7w5e%I03Som3c1=o4OzT{)}zhu6!B)||W#KJDvY z1^QLY@wOkmMwlAzWOAopn@@kFnm}A$TAJ%e)aZJzKf0rWF9N>CZlHTcwikT6A1{6V zbout=m%|d4F1}cMC%aFt<~;K8Z~UHoU_1~|WH`g16n;AV6;R1;jXzD!cQ*1Mq2HpW zIIa4G$7)~O;r*EvqxaZJ?MlI#x7B-#=KDnp6?jbdxD}p{R56#<R}*=V^7WLT^tTeb zWq^w6vWnt{V|}m^V!Y(SW#sqIey<k=_u$PTU9jzQV^>;5NEd*Y?RUKnuMRy(&TGDW 
z@w@Fo$J~f#o!22)idRYI$oBJLe{rhA)4JjBeQU^d!tm#B7W~D|_77hh`QxW0R@}c= zCwX#1s@6-^FJ4aaTiiSOd2=l<+92JJ_G`%Y!M{*j;Wvgd|K%=fV#r1w3PZLRDcJg- zHwzR-9fL>x5}1K7l(RFEY$;MaLmPm!4A32gA>siFvcQ`}ECGo|Iun>d($YXDEZGIc z<52*h3l4sZ%(!p_k%GDi6b?ajcSa+<06+o??L?N`;RqZS0{~vfAt{06m2d=t>{2oV zolxXdcoZH0l*C{OPDp0}5ci`<1eyHNX!zfBS0cs*jw8CG;Y0!eh;_kYkZ$B`@E<$l zDdcYVH+%I5>;D4rpZ@BPNdRa<yiWLGw^Bs%h;a?>PA-W&9sKCa&lqha`WBLaLcp1U zzxo28boB3btQQXH%q#@}ftaO!UjSJd8HfzP8Sq;JlTByJ1;G5)K(gdH=)X0vv<!KW z`G+P0A<skq&}5~_W&Bf<CtJz?tx17F|3g#wmqt$UuN*S6kiXSJz`;>yB#!Kv1Arzd wZzTD=0N_=OGnPD^QThQi#$pKoikJPPH+TXZNBGeh894}uSy))pK#TeR03n~5&;S4c literal 0 HcmV?d00001 diff --git a/results/kmer_size_experiment/results/20161012wed/perfect.pe.csv b/results/kmer_size_experiment/results/20161012wed/perfect.pe.csv new file mode 100644 index 00000000..fc66e32d --- /dev/null +++ b/results/kmer_size_experiment/results/20161012wed/perfect.pe.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,42887,5 +31,56913,4 +41,68833,3 +51,102361,3 +61,164617,2 +71,181787,2 +81,181787,1 +91,228304,0 +99,228304,0 diff --git a/results/kmer_size_experiment/results/20161012wed/perfect.plain.csv b/results/kmer_size_experiment/results/20161012wed/perfect.plain.csv new file mode 100644 index 00000000..337f0f4a --- /dev/null +++ b/results/kmer_size_experiment/results/20161012wed/perfect.plain.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,969,0 +31,1729,0 +41,2909,0 +51,5501,0 +61,13480,0 +71,21068,0 +81,38623,0 +91,48050,0 +99,52592,0 diff --git a/results/kmer_size_experiment/results/20161012wed/perfect_nope.pdf b/results/kmer_size_experiment/results/20161012wed/perfect_nope.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f6737ec568712810744b70e1a65a891c3cb21d89 GIT binary patch literal 5606 zcmZ`-2{@E%*cKs!>{}@xktK{7dzS1=k~K@#F$Pm(1~X&dm#j&YC9)PFNs=vFwqzMw zN@Oi0+1Ek-na=5)^Plto-*wIPeeeA~&-*;@b6s=Y&vOeIYw1WpBxOKCVKZT~VT7<D zTQ3j<kODAHH$ckDAh4zv0)|6k&{{AY0ua%<2$h1!L8Ks1hzvwQT1FHEHpDppzkB9L 
z9E#*+3g~*lJlv3QEMSDkdEjv*H)Ai1GaiocBJWB|N=r(SuwSt-WJnaGstQ7&oyl21 z|H=jcoBMen0I((u2SZ_80kAR56@dk$Nd~6C-wyKsaUk1~t-TOv+@I!PO$-`GT4Di( zU+!R21Qvt$f|JgZ`PX@5gWu)2f<U|C+yIE2oIC*5L85S^8-R6Cq~d5H;238FId3eE zR5N#wZ`KcMJB+zJbI_#2bm!CIx}UFB;OgwLB~RQfZU{{s(@4st>qosWIck*7u5gtT zM4~Ad4fsCxjw`w<pCwcUabmK$F%ub+@|QlZ<amxK&f2w>vr`4PC#`;;89s2D2+<H} z?{ErqlapiHa58}8DQ3QL46HobTcg@o^_^WB%o@n)WXeIm#=k$P^4or+u;D-3lfu~J zJR2Zz38fIo=xyDzcRtF=j(dA_F8vrkr%eRKM@}_dk|`&}7|1wz>&A&!EK^jDv+1*Y z&)LlB=0anpk{Pctqi$;590`a5Z1|nFyt~33ZTPd?4Wx-|-#=h8wr2f>Fb!g~V<q-U zju%SooiC^%<x0Jtmn~BFvb<+TU1+8x;M==&)S|3PIGsykiIf%Qq8~@TUZJ+seW(9Q z(lzp=X?({k4M9G4v-pl}^_GXN7G=m5!V8a<(zNgZH!HUt(6>TIs^Q_6k>Ie-dnJm} zokNp+F-@%lE1J)#q54yySAfS(SgOn$&sbiHg)bx|h6Yn}37Bdem*jr^t~TxJxrJCD zCM;&^vO_up_zsoU>Y>VZ`{)c4^QIPa+rkM3T0ADtCznpama^V)kBRv(Rs*M_D}K_f zt;LA|*S8Ym72l;guEKI0!W?#ZqTDfbgwE;?cfb~vOc5bE=BF0_#e#EUOUD?)TiI;W zPCQ)Xd>&AF1|YIUEih;aT6zk{C0V!6=}nBW8b<aQsHpX`sWVftJhvV>aVGupQRJfv zrs7f;$3-!Yw8L}a`$A$n!wk(Tj|CXoPC%H&XtwUP<z^2;xFK*U4JQwyUhZ)-u@1Ea zZmK*dV<UZrMbR@6EYdXR%t>Fd>MD=nQcfL?k!@4i4<I!~<_Tfk5E_m4-0WoTVY7=g z=Uf<c^b!aIcuU?M-1*k{>z)m#f~P&xV@7{STX|ZQ33`WKGBxq`K71zS?rWXTrlj-a zhOu`PsMxnlVz?URP;F%PphlbK{`uuMtFl9z>AeC-;y9`+xiEUCLsp*+R^xt0hHBHt zBiUso7KeE7;dA}jrQJl_iyX7NghM*pyv7KXk);XlI?gWIQ7LU&>D%w;+`Q-K-n#cL zf1Vq77TuU}`gPg!Z;E&9V#g^Wp3SEv(5w&H?XsMlzFeE8v<CBZTuYjiIa2*F&C#u~ zw>h@{VN=@(H14&3wK6h$Y^c)@e}hO2Y!_Yq=C0+CW7AQ81?6$rTJTEZus5nG@oLTV z3RBzU^g<JYReo}M8o06g^HmIZur`eA_#{1a3$SGHuHXN_AuZ{V)(tH%?R+M8aNo#q z+h<o+x=}3FrnhFZVlJU;IAVOi*I`O3imF(W^F`}PrGWjP)rVg>_ibCJdf)EARI?ib z_kY%1wU@T7ld;VwMA{(NdFmKD1H3n9N1Dr~7dB^lIbWIY&VFISuEXfZsuYPs59W0z zY8{%wai2@?0Ll)H6PG5@4(Xp$l$I&Qgi~9@<1qJHL^U%RI2e4KYN&!$S+WWY?Kjyo z@~d;B;*)Kh<LWh{<vmkG(F$K*gRU0V<V6&aUV|MnYofOs=uemp`cm97wV<$gL)ENd zO`&;0+Ht}8*tk7c>ltS1_H>#IsDT1;n&Ljl$Lw0omI~aWCbQ%X{j*ea@;jbUOd*4` zJnLvWjgGTATj}~uJcaL*uQ-K)>oteej)KOgS=WNBdF~KjcyKl|cQA0*S%aHTJ!GJ3 zlcA;?f1zRFAGI_s^mI@lqQ_M0;}Rck`-1AXq%L8e{M|1(f$Q`+vTvs<kF%)m)8-t; 
z84=;@`oSWqd%`&faYmm=3;#bBZ(Kh^cEyz<2|rt0n`Tnx(gTxu<BWcG2{)};V;EGs z8K)~lR!6}HnKDvKFRca9$9&ArXm2l7C%!+^>b%Sis|@Ma?WD_5a#<~gv*#S75YEW& z9r+B0jF}weV>)5Yb)LL5;-{m#N<6a!#ZMLKKk!T!=PA;UBfYr48rSKMB&H-(o)+vw zCmvB(?!URgsaooA!6o)Vik)c@vC6zAKOnW`X0hyS5JR!21c!`%UgXgA#pE?{og0m% za}7J;(;*$soHLvBZ+^HAgecmxE0zuX?1F93*FFViLvB7UdFyUiP@AyQlytb&r1-QL zQzK)Qi==t|q9s!o9u&e9<TDHA7+AZbKuB!-ykbOfa5tnV!*5sn;XK!-okIB3@jD@z zAr#0*{hTBQ{gaIT<We$K{y}BGx%3YT1sfopv49OgrcP79jzqB-Jc%kH0N4=WjD%@m zd;uFN(w>Yol>D>%2iua_@;5t@NF8p5zyUU3Qd>cafbhlHfxtQ>rUjIh!8%X?LO$dd zSA(^Eak^$Wa#x|OOg@ej5{2<H^MJvBAz-Yjt_C3c+x%~KCqv|THeh{CO$`_p;S9); zt?YjH3*^=BJF1fj8chxY)<SqA;Ry1f;H#t_0Q@JDLuD1^{!b<+=kynoKTP#8Hyh=Q zCN#)57$-Bw-Rr;UW=g5Y*r0b;QVc@Je7QM^LO@Q-+(XOcZs7^2$W&;7=G_W`xTs?$ z72l8ZXbpRMu%AxVcP`_ZeC@Z=n>oq16MQqLf$-UNC1|9MBIzK4`pp$9x=@BZEB5L6 z_L$r8u8y?F7*;8$P!v-hm6aNtyE>E^lz|@;P#J8yTn%+6fxUgPp-TxZ6h}GtYOMtz z6vpWZ^Xt2#wme*23M!UaLe)lH>91rA`=l=S30Q|iEmE-S;x>WX83-oD4QQk?PdB!2 z2C@2OTzK}jB5Py9T}K|S`4hXf(8lxd2%P>cwGPae3ZB5Laug!T<C1e&GiQ_pJXr)9 zE-~vzF{Rh#usR5~>LIXoGqv~iT$f}OEQ;YdZ)<kzb_!ekG=X3^AK%_1byEKJxf#gE z3?6IVX)g*F!htorpF7VB0i}-$iV9Nh67G#6!y$GDnaMlesrHc}%RE|qyRF-n!5NGc z&kGS%qr_-;BnO4v_hZKOhta)`V2ZdGjrfy6o@eU<?M>Te#9c3xj*r_K9Lu>W3SL+t z=9~=CbO<DvlKvK7VZIwF3duN#roBAc#lKLi?QP$T#~#C6PV_Dx?N9t!PL>0~#u)HD zZLRLYE5Mc1Dx0@#G1M6vx72oX9_%u;{(LlAn;QFRm!&_|it;i4DYf9U6e40Q09}Cd zh5nTw`VL*Hz25kNjP!^5gz%q1mNi#?zU34QNrdZaL2Fn|`j#FXT&I))D74k67S&EK zP_Z>1f7Q%VL%}pfMX4?lbKKJrG8N9AbW982Vh^p#<-QkQ<|zDx2A|6!6CvlpjSGWM z$tciDhIKn0A3tU)buv3tbQRR2#vB>-mYKse8V6{;<$V!lm2~<G6}2XdG4H(-LYf+S zlpJc`b4{P{mfTv*b6K!i5W$_QqhQpm&NXpkC35)Fo|B?0Y`OlN1w~!yXryo=F_NpL zYZa=%CmCMdeBXmAka;=Ms5Nbh#R_yOtcWF)ndM@Byc7%P8y$_izJhct{G~~(k0_C9 zpOVf#;;W6*yIb_uhMh)R8)AIr9{oE$j;NXZ)I5tkxq=cCTH}XLREMt(i`)Qhu^8&y z%<moOTcE&+U*vfhFR5dho7sP6p>ly`Q-R1q6Oji`fFyE<T;V;>QAt_JQ+awQQ89|A zBLP-eVfFOl2=Dhe#&)mgPAJGLPyllPYXG%UVo=1n8Jx!}1qWN@t1%bau1#I{5caV2 zV5{MI%{sxnlEBoyvD&?c*wqPS3O;cuF^6A-ErIS?q@IQtOd>D2Uq`{9#=uuykaa8O 
zYMV!XXw5AvePu0WBVM6KArn0pK{?SdL4J4sAHoj>J_|GeDP20wOkT>?S0*&j=X3Zc zg_}%r#jM1Ri>$I+To@L?iP@U?7RFf0m~WcfK20yT%JbFo)LScNweQl4Eo(Em{@A<# zXa4Qh;7jPWQ5{8dMNPj-ulRN9ue$8w?J+Z&(<X5u`y#F)aP!C#BZOJ)Fn(BP(RxvV zQB%5*y5AJ}_$9_I;-x4@T-?LUFE3|bzWvUVL-b7E=>e*RZ<|mejEE;HU6Z(Gbggmm z{Cm0g7fQ<C&pq*bviAP+@Td1W!xP2Q!$HGC!_~z;#gjv7$aG}n%`Z2W1{6xya3|dp zyLp$hk`<%p$44S%du7haSh_6_@%Cqp434Z1y&EhUyjZYhp84!Y-ougx&Hc#jM<=g! z8tUd&YR*6EgWm$Rrrhm&;=Bnd#L40h@j`W5brE&a{vG}%{!#uv{C76#h~kHWhi8Zl zL^GO;H2gHwG-Wi^jBB^VTiILfZXMn#znyT~DM5}K&!YmhkUlBZCy$nGys07QDR1pE z=;+nU4u1zVl>Y4QKgu9iDQ^HNg`9>)_nzsYo~!STmgwia%Bc}g6My;&>rhHYD+-Z# zeNZAof?lFtJja^Hs@=xDa<Q7|=8k!Y{_GaidCQ5-&CG2GzEN71A@Si=$Mz@Nb=%3> z(s^$}5ZV>pSv>~#8OpxF=hW2F_~B<nL_x&cyk6MgKr>{ZVq}dbz5@hzcX{Zeq2#An z=6!K)b7Zu|dMWVZ9v4@k!2^T%?gGD#jopn?^S5gkYrSm0+dW1zRBKf?piO=3d=dj9 z_NkbHn6h<KIwOmUT}$TG39j}d_5$T6%Riu8w=ZvtV%o+C^=$$D(KBpc&3<4fEeZ0Y z)$e`Y5hgHElcFInLz0f<j!qxDer!B+F0@#!v-y5lkou!sYW4T(OyRxZ+AS*l9(s;% zx0C8O80YEdeWJqqL;HW^?Yv27O^piTOQd6>FQ*IPhOwD24>X0=Id+AoQkM%f2y_dq zXk}>W=ld6M6tLxI7C3bJcHZmMgO@p3cG$-a#jv&EJK?hhvzaqRTtxBlS`XzG)jWl1 zqQI=SU%aIFh*ehO?k;P)P?nG`>p(0dRw8yqF)xdzZ*ws8ag{+)4Qg3(nJt?ZLyJjX zc?X_160ALcUs&a`x8X##a>h)y>D_hnW=?bZ>>^^($m7PV^oC<O+oCGx8&s&$?xuU1 z>#-El?L(`iH=xm#J(Y2lkGiwce%Q(hfmme`nxpYA>f2>T3B#dnS}*LO4`x%%QZ~-Z zU)<5l_R5yRIM$A?8+Hq?2W>Rjhh1;68?ec>dH%`~_jvvt@mhb$<5GXi@8y!$RT?`m zy<t_6RmhtmA}=My!GX1sE_OzDEe0OsS8N4;w~6P7({7iSt3f>(i}@t-QK1I8Tv%er zXW%QuWl&~t;qoMA4&AtWzt{4<L(r$>mx$**c_@NKkHz%J_p$NnGa{Cf1Tf#&*%u91 zTDWdkVwzIg(raUwrAc-CbAuNK*LiG5?GVOQRf|FR%UgQ4Ee!+8Tz(XO`cUszU%s^V zamb?Rn&tH!bh+b7uYVJ<kEY;OTu^$D;{I7V9r^8$USeIXenIieZ>_=8!QaASsDgSE zdwV4wNFd5QN+03Z9pAKcPx^)oeeOV&Vv4HM&QeATMxPDI-2JiDsvaG1=Xv7GAlF?x zciO6gM{J9>6N>Ne4+{NA{C-yTl<&*x&c{Ww0o^M*d_yW0Slh3@qjYa#^VxE)&gEpO z-p4O~`}SoB-eBOrk`=3hEd#&CtfKlyH|7Hg55Mg!UGkgQI;f!U<4Ux5wp*(E;!%uk z4$hsW-RHYVbB0DKnm4)*tYo*$lPw=O6SseV$g-h4yPnhI$Kcz8yVKuIK42#FYWN$v z>h~5bh-LHDSTteW8p{<_{Uv*_A#VS{&go#8!wS1aKt**?<>IPiON0`9yh8XA;%GDY 
z^#cDMtUa<%YvbIQZs++RVelpUUEc%!k(7+0_DdIzZ1;P<jQTYB9zdlylw^O}ZXOJk zXF5D-`gk<Bj99^a+&r}8DR;3?du<Yeos#(OepH|C!zNw3Qn7O3QhM;h9{1+ja#4a& zPB7I@<i`GgT5ZL@+ECD+vXwTPq!1y|B;AVKw*I}cM55KvSmbX21lB^jxFASc6<Lh3 z0c2#s?npEq3n<EgJ@FVE0)=qFfgmz6U}p?TUc+LM0NB+F=1uBxVQ@UT)rBLy;COcz z6v7t(<B%w4666kpdtuN3*vSh)HY9n%;Bb=s1p+%GNujVvEC7~7V{pz07Xa+_D@i!1 z^GBgz|G?ewXjhmQ-W>(Q;{Y(m6@y0HBt?V$IvGpu-0l9M+x_bGe>dWP2zS3G0k9d~ z3HM9KBZH(7<0{OZloDw=_*IubZ1fN)Zv+krhk?Mq>jID~`cIwD*9+kSk^&$Qkknrf zASWv;Eep5+e_>FPnn`{D^j{c6jx-1TZwx9W{U19h1VS2a{$&S+lIDwlW3r^w{x>EC zg^+a1e`AXO!DJL={}V%&6!>pnoEHp<LU@tHSO9E>^hc2H3xM^}E*R2uMy?0g6obJ5 cWRdh&ZLm0)7w%VOWTmAPKte*=MmnJX0q8r;RR910 literal 0 HcmV?d00001 diff --git a/results/kmer_size_experiment/results/20161012wed/stoch.links.csv b/results/kmer_size_experiment/results/20161012wed/stoch.links.csv new file mode 100644 index 00000000..99fc13a9 --- /dev/null +++ b/results/kmer_size_experiment/results/20161012wed/stoch.links.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,74568,5 +31,74568,4 +41,74568,3 +51,52592,0 +61,52592,0 +71,52592,0 +81,52592,0 +91,17998,0 +99,108,5 diff --git a/results/kmer_size_experiment/results/20161012wed/stoch.pdf b/results/kmer_size_experiment/results/20161012wed/stoch.pdf new file mode 100644 index 0000000000000000000000000000000000000000..6bf8d86aed63ea6dca2efeee795cf43ce2fc8dcd GIT binary patch literal 6053 zcmZ{oc|6qX+s7^0rtEvRFHsc6jD5=*BD?I$7=t;D8D`0necwZp>>@;xrR*djyO3;I zL)o)rt)7|A>HN;~JJ0idz2=Ymb6?ASU-QrV{)m`p>qtN(Wk4dq^T7+j1Hr@gSP%q| z0x(X0fK*gKU@a^HhDT!1+Aur<5Y<+KN<rixQV=La2BIJ>F9re|Vx0fqJqsirMa(h- zbg?iGHzXVf7!&Xw1Uxa#1dDMdz!6x|uC%1Iq!dy6)eA$C#6XuXgAizE(kY;Sp9TP1 z_;?@yuoestLt$J2unEi+fdiz80cOB|66F7tK#C(pV-aZlU*TXa3>r^d;sAwT>0mPi z4nx4giS=aus~#!f_c*R0(5`ql01AP~0AL*?3QxQNSO-NMjy3|0aYm5Njl&aX<__}C zT(_~q@YOH}sI`RL<w%6~^hkDfp7)@rbsMy#U;(;9^J|OEdyEJn9N+h|F1cE!I`ef= zZx7OB<qNiOGepi{U6%^BgbKC{4nEA48y}xGuun})O}P2O`Q&2V{D-~LEqCwZY)u2t zPl?#Ok0UEb(^3<US#DXz58MqrK8oF1JcLE(gIfc$0}uNI+aHJj<Zb>55uaObW4XcW 
zcp*|d;p7lls*yk|+VlM}Jr7-!kHnFG8kv8|rS42K=4&5G`5YCLtYe=+u6;Y`<r@Ny zER{KN`|NO`My*+$CYQ2u(gTi3C)y-cCvk{Xq~o}teoqBKD{76&U@f2KXEb?4l_-ZJ zotq)#FsKsMV0f-Gg*OPnIcPgSPuX2bc_!D;i!9YaeKkfGLyj0Hr;aOUcm<{K4PLhq zwZ<o@hrD{Wu5JD8>oe7arTM<fyIxm44JD_=m9-7o`O;~Vye^nUt?``VjA|Cg3Q3Gd z3+MVpW|0T4gf!NH{1Y)R%+y~WW)_iWTu|sQnsO48fr7|b4q{Lg;Wzea&wA1h0jE@0 zwt5Rd7Yu1<IX7CcBdQ?-?Mwy1QH^n9<G9=<c9qC*h10DcJNPxuF=(6(vt~^<xRr2D z(8)opdzItP^f|XTf_kA$>K{Pa?o7VBGL!_+G|&BPo4dKpkpR7RirGAMwrs7~ppF=Q zdEAJG=cJ(b8>XikQI-p1Z->K{%^XWG)!k-ZwEU1$w}3~l9fFG-bb7gl)6LenDp-cc zenkgIh1b$F_E28Nx$r|vv9@rdd)K)on^X8!onI}{Z79gn(-vihM>Daq!sOD-T|yNX z-9<IsMV%&0j9Qak54&WBoO73Fl9^{Jm{XK>C&toWDW<=YMn5-FEtrXwW6P6cgSd+p zjYMF(QP;x8(rBylo`y!R**w@{nW{+8;a$~RSg1f(Q}~tg_%AJ4M>6(VuStf!%RX`F zq?9xE7!Xq!s`NFwpH`!~>PEcWoZ|J1i|qHzsj{akbqAxg5+fGanh!n)Fc~dQ>;|Mo zb?mokvo#5XnNw-ju`O5x@4Algh3KAXJ)2;jR#^64;reWu#Rt_89o`Tb-Xc)5cZO$4 zHTzwBBx$1Yd_^FHZHE%shEEsi{}6~J)Ci|!8r**S$K3cTJ7>Fiyakj?G;Lhi=reYF zKE?-Rgm+{v5c#Z@sonbN<*P}5$48-bhDfF>dJ-m*h<Kh%i%YvY;UCclO8S%frqly8 zMvf)-W1VSgrn^6#cX^InKHs%e*gfw;ftmONo!*N!vYC>xnX=b?`kg@@W8fs5etlcp zlXdZOuc5w81dbIOX_=W}0@+=fnu0!Em<ls<GMw(~|2FNl^Q>|y_OzIW<X!3cZ$6D- zpv6mL*0l$BW=q;GTR*d)6BhQP<FuL^$8typD5|so!~3iHqg1USci0J5*1K%q*?6Xa zSLj(_rD)NVIyapxIXPI@#{}qCXOu}4Sk&N75l#xuqA`AAZLDB>I|?+Rw#9jutaNtn zb#KzZaz^=wdk#D78xr8g{bB!`a}2csFHL;s&v>d^QodU$Js+jif(<C%xjNiNqvRR* zkRCh<vf>#rwXSS!l{m17UbCu0yqMg0PUFt~?0fS>>TKDdTnmE#xlhPU#o}b9_h9PW z?LrlUn%-@imnY93ZKO)Htw$PCe;leG%(`W_6s$O~rTo$_gOzatyPnChl4|p<ROPVy znnC*qO{7+NmVEilj~%Wb+b2s^d+$<1IB8NSaZik2GBgfSw@?T8?`-c)Hf+S0kCw`h zo>SD|51|J{allOQUUT5>ACnC|fsHCZz8yjBo0UVoVBX(0xdL8!z4pqP^Zc=Ut2SV7 zdwGG;)m1Gbmh*1ojjT&U6e$t2O%m#Q_iW!Y02bwmXZ#web1jZ0=#nj8t+8ozzjr8& zzLLS8sk|x^`^>ZR*+8>{E%dehj1l=n2aXImET&%XO+6@?MOGc{8WbtNKC3A4th4LU zIt%*Qd_6*Z0$+62(WvUPlT5MBLx|8jTlbX~ndx5t^`}VM$6uF(gyUu76L&w_hZU3; zGAF;xAg?YvPqtov$do;V2sL;yVq^U}KEuk&tMX}896|Q+MT4uz326BWbaS^ZJM+li zNXb7t=ST%}WM)v(oRVA!-YEg^80D<KeKLS37+yYnGC*A2BCbUKS`~u#@>dKF-HxE; 
zN2DURiA9wDE+S8HTucx{oJ0Ad0<vkARNB-=G|cl~7bDHNM9zns4nKmHc=G6S->+`T zjnf}`#H1s;PAQgu)`g*?IP77lD;qsmKJAY{o$vfy1TU{M`u$5>R6OI5m``1zuYGl* zbvueHK01zgxI;SiU=}kk_OpICk9fpM`y*1M%(!K_pTx1~T+EM4dR-jB+2ERTq6v#T z`^=`pV?gwMPTbi=>U`^)T(W!drgk$B{d)5bQM8Alj{A47)?!9{vwC@*WuHFx=-NoJ zX#82n>s_Notu?o6<l<w79=8hMN@S5T(n<XBw#_Yb?ZsdCp0?E8{(#qQyvN?F%u(be zobDwaFxb#=Y@dGTiI<aB)9{{Jo>xw7Q+P|uWAg^Pu_*(XfprlMw>noJp8oED*4Z7E zRdVJ!7Bey8i&Fd2^)*yt@Gkw>;{1=Xw*6qW<2mpt^SbOI>r$$6Y4d5LXkbJMPU!xg zg1c$y94l7TaO_j`Mt;Uxd`Irmv@c%{pZ%(_vFQ5dIt9m3&c$^#{=4PR0+ddBXsO=W znb|ryI{LWxRZMno;U~jSN;V_;a3TQx#S(v^5Q%jDB$3}J^d|v<4Uo<_zz!ghj~U=V zBqR)hNIDPzY>041!Zb17fSnX^PevL_`Z@dqmPoMk8*Ye91~*6G0Xr~}3W*&M-gpNP zSceEofQkxO2MRz)MSh_c_=-1P*BnoRLKPKKIbur`25asCga1;%I5S;MK=ya|f8dNH zk^0$zuWD&&!f*&@K#mmU@Ei9?tKWCjAn_HN)C8=JxQ&D(NJYU$M1%wX1~FMhx&ITy zNN4&R#IjPoEX>C_A_tn}n@p0KVp0ZgxtUSu(KqQONnV7|G3mcbBomU;w(!t4O)8{? zip~b*Yb8|(#e|caR;-@l)*kWnVB<->>RiS>)9AC=moX!-A8;$DY2c&lrvGR?S>j13 z)$424bU|nGtl8$4+oR%RT^(u2&uo!VqR3`Fs;V?O4s<9UQuvL<p&qgxa5mL50EfpH zht=X*$$sXf)Y%9@$V}4WmbVYa?YTL-6jZG;MQV(@(q76K_Dkva3)zG~E$`vB#qIpQ zJwz}n?m)v-xO;H@^N6iaQx_KE6<M0&k{r1?muU~`pv~vuq4=v2>K&L*72JMCa%7^( zQ<6(Kb7zzUJXsVPBC+5@Hmlclb884}-8;bA!`R;6Yb(hjTolE9-rhVS{vK=W+`xe0 za%_9A6oY*Hxp~OgLv9=1IV_pWz=;i;k2`mzkn)&<qJosWgnP5-NT9<>M)Lmc)Ei-e z8{FCg2W{W10v^(nJugJO7(a@1N3xSStdg5Fe2?sN1e3*7HWL{9J^AbXZkWB97k9l> zIyGf)K%R3;4E%ZXD2E|X>!#m;S@XxuMVR-_J%waEMC*n;+Qqj}`_0?_d5^<0OF5AT z0yIAe{2a_D!p%|Olq=djh1Y;<sW0r_vPMxo)QnI+$Voq7Z2M6#UY8m@alky7YEAK2 z@T__OKbh!7W`ORF^QFN}|I;11l!tw>?;fUQ9S?;3@VBbH_Tw#wa9{#lR~uT(V%oo! 
zequ`@1CU)&r(99z`Ao_B>eS0u%(Y~Uvy>DXqEV+j9U-$JY>DLB04H0}i(IahkTS;$ z#ngmcW|>eq4=#K#d{#z*Ml!g^@zfN#nG{2Ikk}TeSDh&={4Eo^StK6NddpiGZk@>U ziIPf-*@QQRRzyovkAhu&HP@_|w<KaE&*ih-XHopwdNO*gnp{&imLqmwnsZWgg{`*F znNd_#j>ZbK31OVAU0YBE0m+b>SNA<A{g^hwjN2Z}GFyYxf{U1gn3$Cw#Y!=Ayw=f7 z@)o9J7A#F<DWE{APbBge2-L;sB^AB3W23%u1!8h7<#e|Id-(jL)I7^Px%?7S8k4Nz z%OlrEME?NoF&paKderx>|1%k0T!}j?R#L|*H)D|RbJb_&U4<id>d-uR93+7&@EY%V z_9}`h?kb+)1jTUfjyPChh4oXVQQp-U`gZJdClur*=nm5zmOE6+3I3ty=J6gc6>i$A z7)4!rbA8s<<AR5k2Wu^NBg-_?W*lSt&Q{M?#DR_<V*ssMLXMy)YaHFPFg?wSFp0e6 zK^+ByS_5wlVV1opqc<Lpf@&kIuc~OP81srWi<s)U2+N5D3k$jnu3yL!`Y6-{-0RYD zX2h!4T$|QJpU)ATxzJ*od(ryhDbX!9%S$7o_>1<Y-i1+CG8VfQ_D|Ewt@FIKJ@vjm zVY$(z7hU$o)b_DOKHlO>#85T#`nZmwg`$>E6*hL8YEhR>ygh1OYtA%A^jOqY6mAh# zVvI1a8zGG7tk|q5&}&H-QVp6RA6H}CLaW8tV`8%OtM#+><GVfC#rX1g-cf%3vI{+e z9TAR{uS;AvzTUiY{=MA$OC{y+mx_IgzrNQWnRu@=GW|q)#D8RXq~?j&lbK<4WEwK; z)~7$#-YJxR#WT1k^zd$ECM!lRPmPAj_Q{-+v2xoO<{iu&9U9#p?j9-`Qp(@6$auD% zmsQfFbsYAsfZ=+lp>A%K)^b5VJOb2qFR8!Sc^6WMmnD2Bh}7@Zht^B`cKDk5hWoDj z?(fnaiGLUV&Ue&wWKOL_El5p8T}E9)|20Cqjjhcg;(J7Sd|bR!oE#T{TNP?4%^=k; zkCtt|r77nrZ{srLi0xy8cS8-OKf3#lpOLGQH-MBvc%YGee7#gl4SkUkgB(U2nz7We zJl9x;?>%foArfqdBtj)lOEie**l=67+qqY*)Ev3FW3td6-TXTvoLJq=-G<>irDYE# zKD_MsHetVQKT}t_e0#tj?TYTKnS^@{Xa6DK)Y97g;YVm_e(2l0KG@K^SCDrVqhG0G zJ3w%Emn<($Wgo?|+e%BjqvNeMYkp&goScOQ=?1Yq`92*x2Rmn%<Lg%Hu=cACkI`pp zv}>BsW?l|n33oz|DH;74vvu!vh7~<=Em_tWaJ?~lL#Ujg`~%AMoBlU3%$v!9hBtQx zBj;Hc&DU`=Rs-_mHSfK;2c|LMGh%_&fr&rme)5ppl1~LK1wB#ke04wAU!x$GO5?o- zV@O}fl~z?j4?V}X-x3>k=$B6~dxeJ#1`V#~?Z1v|OAYrINT6dpT}~Is1!Fa3de;(E z@7NWRN>wh@B-A6csr^v<>LcHL_I%by8TmImy*pDn_26YrRvkBDhND>D5IW%t`3o8I zMVv?CQ*|CHt(Wr@=8l9GuK2`CijP`nHXj_Yw2Nel=(4<vhD1w5Zz|?xQupr;1wDRY zP*jWBklbL+roqr)k~h1-%f`ZW=kH%o)xT{xovre4KHDs5+u{|6#p&##qoUEr%|@pU zCv(1ush)3ArF@W-=4qkFTu65eeIdOAjjZaeim58-$$YSGFDC+Gkws{Y$5uA9%Zv|< z1ijI&ya7#LNHxE=b6#F)UoRV*EroHc8{anUxv=fO({dx&w$<UCU9R2pmyY<y%iTxU z2TLB8`dY1)OWLY7cVPO0Uxd9t-U<}0mJ|p3)k(TI7$;f2OMg_c7qDs<%N}#3U0$vh 
zRXiCrAv&f|i`*zIF%&TH7U47~Gq|KbgIPj1AKdS=x_{GuBDoszyf+UuVA*RqH@Z4G zRl_H0B{={VnB=c)!qLEWyAmEKKUixtfmxZ>#6CBuG_d8iA9p~Qym+zVPtf1fi?=eo zQ|7W>IPsyur=fi9>)5bm(RC}^eRR3wW}k1%Q9pHlM2vr$zv3~!oR0jrz`mpUx~ut5 zs=u@a%msW2j-vGMOX%y9NS8pAd6X6qwjE!$_RM$(4u9-Gm12r&9`I8{3P<t>W*n^V zwP{4&x%)h!+TZoS!JXzsegW%>{j}oy`$Hn@39I~<&w5wabUrRxxYM(_FEFfXiL+ny z9;bU9{fITkXelT2@_oX}moJ}&;Y|j<o0-w7xH51fW(zeqzO(E%ko9GMP0eR|@1){% zKWD-XXNR@=PaaQjuL5!xXpRMxsQIXsBY7k1!O9LB+}ZMe^D)Qwhpn2*vl}=()`#An zB+adwe!xuY)e1IsH5`7nJStnR!J!AHY;c_ZHJ`GFnqrRA_jv+jzE?P`0IHW)RF$?I zTSJxMQxzA~5I=VV8b1pj!rH_7wRg@<>UN&@zW`Rdao~M&b@blDqIR`QKkbiuKaG2} zc%ML}*p+2}*zcYUmuK88ZW;SIw1L>fkL`Z9;x2c&@u1N(5H~Ba>i)AK&5KpKZnI+Z zl3H58=R>aDuNy^i#yJ6$`(ZoB|3YoWzcCc_m+N>1O*HtBXrk>%!q)$+tdM98G!FS& z0D-lUE-nb7)kyLZ?Eo2Busae>zyXSKU{3-Dk3b<@@F0kc4A>b%bVqSGBmj2B!fq28 z7X~MgP#2EG!U^s!D1<iv#v@VAM9Ccn$70X`*a?du1rjr1a5&Mu1c9BA#8x;Y4ggD{ zF?eT$3joIcIwYJ({wNgeAG#X>?Fz#Z+)*$B9spxpF=)gsVmH{Y$~Y3aJN(0T{l)sf zLHtkO_18-PY)){(|FU68BJqh~1al`IiTFDBHJ3kQ^bn}q2s{!F1A%|f1t3lIpEj*G z7U2St0w55O)ZZT<Co3y03%CG(Yfz#YP5J=z-x@@Y_zwEt8dOG}=+gd|CM)-^T&SG< zzjNiKh;NSnoht=}5N+rG))fDx$;wFmtB0(d>_5(e$HI^(1eWNf17LHcFM@bq0DKkg nf+4=nNb>-jVK8`r<Z=I+4Gs^(;(tv>R$f*XBqDOfSO@e!3Jk0| literal 0 HcmV?d00001 diff --git a/results/kmer_size_experiment/results/20161012wed/stoch.pe.csv b/results/kmer_size_experiment/results/20161012wed/stoch.pe.csv new file mode 100644 index 00000000..3b82becf --- /dev/null +++ b/results/kmer_size_experiment/results/20161012wed/stoch.pe.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,42887,10 +31,68825,5 +41,93607,5 +51,102363,3 +61,164617,2 +71,181787,2 +81,228304,1 +91,23945,0 +99,108,5 diff --git a/results/kmer_size_experiment/results/20161012wed/stoch.plain.csv b/results/kmer_size_experiment/results/20161012wed/stoch.plain.csv new file mode 100644 index 00000000..13800481 --- /dev/null +++ b/results/kmer_size_experiment/results/20161012wed/stoch.plain.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,969,0 +31,1729,0 +41,2909,0 +51,5501,0 
+61,13480,0 +71,21068,0 +81,38623,0 +91,16634,0 +99,108,5 diff --git a/results/kmer_size_experiment/results/20161012wed/stocherr.links.csv b/results/kmer_size_experiment/results/20161012wed/stocherr.links.csv new file mode 100644 index 00000000..98281166 --- /dev/null +++ b/results/kmer_size_experiment/results/20161012wed/stocherr.links.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,1552,135 +31,2655,86 +41,5732,71 +51,10941,20 +61,31864,5 +71,31864,1 +81,11749,0 +91,138,1 +99,21,0 diff --git a/results/kmer_size_experiment/results/20161012wed/stocherr.pdf b/results/kmer_size_experiment/results/20161012wed/stocherr.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2a9e0db622de76240eb1e0c76f2218c2dbe2f807 GIT binary patch literal 6027 zcmZ{o2{hDg`^POIqwH(8p=?=(!PraoLiT-WrZJcrGt82)?@P8KYece#B4mq_kafrw z35f_<k|j&MGd)kw`@GNd|NqXJbAI>tzLxvG&hO0mT$h-+o`I}_oDx7RY%c6$7%6Pn z84FMV$^$W1uK_eQ0U%v00)|Io(0VXD0w}4c4whGdD99^-6_gZImB3N}kSWIP|J}1j z;!$KTOQ0bZ=Ix1u<A7!ayf*<)b~DFf+z4<4ma?lTrzj^+)_&!}P$Vh9xpM#n+KqAw z@L#6^f~@_#5kQbG3=czLJb)l`m<Iv}R3sZ%0{`ov@-GL99mN`pK;!>32kBzac=8el zRQ=@+vP9r81T35!Pw5}=C<edFVS+$=;5~t0c~w;)$N-7LlWzbrK#_~1hk#?;5R`M{ z@Z_3#0f;#Z4o<hM^;v`St_*?1Q6o>+GYmzH!jnszZ;HR5mEmo3c_H{<bH*v7bwo|0 zxo=e`d^619;Y#`N=;%z@+C%uLE@Jfh_Ts0KcSlwR`~?+KdlJN$kY`MLi*rT$`Ynxa zw<{lAcCf1j^>p2@w0OiT=v=?&ey|nMyga?xx#XCT^1N$T$g2Y4aMa>=XwA2M6+Y5_ zvce7Coh%IbvMKd>e&oA&Lt+1I5~S<oENZOVMEedMkV~+fs!Ef3+E#Gun34n4lOY90 zs@~&Oa@Vr`SqGX0fC*9eyv0{g;t#snEfQ~B0NQgO&$~B(5WOqIYLMZWde}kL?z+b} z%RLCKOQS~kJsPtuspO(c*Vm31H=s{NP?u_Fi&DwNhepRbou}=2LS@Z+i{=HGCN5^7 z-@HmvH0)TI4M`B@(5b=cQWLAv3!a`{_SApoz#R*vRaas)bzO~>cp`CJj`e-!)6)|J zg4NpJX{2ds=n7<I?9(o_Muir5EEAWI&}GIX|CJ{@xdsmzLg}s~n?lWBvR^n&bAdZD z)Q)DFj~`mXNPYNnn<1%5Z===Zm;>--)<dQM<`K=9-9Ziyk%lrkHYHJ3w+peJn?4Zr z6CA*+g_4HBi(=t5FZFKIiG)6))--1I|29HR(EOCBy<VX=7^{7oPW0KXdYFeQUm^EG zP-iO42i*LJE`lB8f(+PwVN4D?)sXC}QLnx6TCa*ulx7|<<;7dgTMez0i;|g?5o|ln z(Z(Gs|4M82Uirp~z?SzDIgmrfScS3i*aFvzr+QmiDb7XIQxEw1YFin1ds|t!6g2EU 
zTNv-bpc>c7%MdlXl4Pf^DTc?wWzLiUw+F9klwb?t+7u7eExW}LqQS>hqJiNz5Bw`) zw3~}_wVcaU7sqP3VE2mh3ECN*2UjxeD&k>nO{{hmOg?Q*ykiyR(qn1H71GjUqsA4d zXBbMZEski9aDm`ge$J*Y=4hy<1q=wr_Nn;-14)e$RyUc4+sbstg>TNXwW&Y3opV;- zsauK{f(V;Uz6p(|E15cbr{f1VU~I@ql@;yQPKqRG$fX8mL@t~<SE*Cv<T#p?m)Klg z9hhGuGMxR?XV1+{+Q7B#@?a`=4ul%fFi6}uCU*@__<C-clju7%-Og=eT!E8hfa)Tq zC*PtOG-|$jbKE-r+1;TbOP=|qX^i}b;zen!Z%yw@(-WUv(Q8ewOjLY?i?fpol&4Mv z%C4fr-%T*7oX&%^_s>Ifi?Z>H2UXd2Y0AE4j*QqQLAJc-@2pDATO*>%?#yu732wzK zXs3nhzYHK?AFXAcCVchB{fra}YVqpTW35<v4S87@^VaEJ4fe&F=j-Xa{905-fWi`n zsn@hDKFm=<v*`J-E`iiNY9<!6Rkcu6-6|_r?UgNIp^L;bszkrD>!bpPP3X8=RP)Cd z&4KSTxX^Wz5(`Wlyk^O#lrz7xOLc3julS$J^baoxHj{V+s`=1&()_copgzN~+a?+_ z2({j%i5m-5E$KPHf{w<*V`4rj#-GB<B=)$Brx#wi8ee9+Uf1ZBe!gn{D6UD+IC|A^ zxAp0nhTv$?j6Paj(uQ>Uv)-F6=>n2`{bDsT9*u;aT5)Lei^a*|fr)aJ5Y*_c-SPD? z!)yREv&W)!9!f}c@T@TH`ao`rKhEZns#|N-UC>6LH`K9VMo)7Q*4pKpH1k~R=xAc+ zsN9OMzjJ)RfUpRvY&}-K7v=Bglb<`gxv^r`P(zw2i_ajrqVfa;wQM9<Fqz0GyGs7I z{t@FhXsaigTKJCZt4}eVa|N;H+UYIEbxbIQU%$-eBEem9YM|{@@m>9)cKEm<6X{fp z{Lssl(CHzL(6Q>3(6Pbn(CMn{q0^-7(4zz2jUNA@H8-C}>?xT%iHh7MP3CIyBQkNk zHkxG~2}U_C^=lz3ZKk^ccPeA5wrBMs9dA_{mex^T;>j<BS@HY6Z%^G<nRPqPf@~<V z{o#|CWw1G3Tlww!q{Uv{SN63N9sSGTuN}6({(j^x85^yY_4Tk@hE!OR$kp|>uzeZI zV<Yy$CDmVu*FQQw1^X7KFSY7$e$@fA+L$A-<7^5D7-I<-Gdr-obmPGG;^l+l^&M04 zUc&0;b({HwE5o~c%0~%Tq(u%6E5mcPRD$xJe_lKsS009bZe<>r?MZ}k_Uanjw`B_O zpUzb9CUzFJLJX9GR;8~cAg@TO!+|G6K;M+)x80R=`Qe(Mo^_X+91-(N$X`f<oArlQ zZ;#X>9eepWdhvow7#TBzq9;)63TcC(eTjm}{htR3O-&Z`&TQ|;mk$!F6neIzQtA@Y z5*Vj`t|dqXbv3stbt(ld^{;nka&G8OLr<L8@93ftB7dtOKAX943$;q#wBc|Iu}8f6 zYPE_zFf(m%9Q~OBN9G^T6_;sE<t0^?0<Nu0&c8Cv7^@A7c$YCzbJLl9JN;x|R?q6& z*_a%8e#p_;H~Y#i?<SSE9$;P8nS=$Rd>tf)zUw?}jPe}`&YtGmVtdxyw2)SMUiDf0 z;fIF-<H8fqR&JrDPx=tM)&xGimsrDpC{YdYEMPhaWaW=vSKMvfso$n^I@$v?On+1# z;RSq32?$v!t}Jru0$HigNAuM8>|9|8_KOrV`RHwQOO|NjF{~=cexa0bjk9y%c906& z1UT`_Xl0A;+=}fTHCOlhaON4ooRtG7>+j=`V_HXfach|RcNvSMTbypC8XiAO4|sSm zpr`d^WtbnBEl{nbdMFul{v?J!frr8~fAGd{;Q51dK*mTn9MB0!;Tub!3z>s31TxPk 
z070e*HzZ64Lj*d>llPPq!IYoN-;jucG`|ssOkQv+1Rm%FBC{Vk1A>Tm0e}q1I0V$x z1Q~#V3X~wffC{8f#2Z@SDd?xENeM?ziNav5ykYQP3J7Our~_2~ZT?>bqezr|PN0jr zx;ii%!VL(aSh@U$I?C$z9WPKQ3Qb7@(nH)p!V#39pvz=X1O3J=Wi`nEiCdI2{e@fi z(tWM1Cb(lrEh;VMDXej+gV#MRX^c*`7$wV{RbXVj^x`&^C`8ZNThAi7_&8W{Hnc!D zxmq+XirS)j<rtsdh>tfXfBHqYO1_zAe(U|&Gs3&U*YjFP?>*Lo#+s;ZA4br<G_hw4 zJ&|wE`C+jmHZk4<%0PW$gNhbKHS1kdqr<&tK$Ari_%;ER#j(fJ(!>n>esFg9d_o)5 z&%D$|2T=to^UQ?BuX_{Dd_3K%T6Q^N^=93fPnApu<Sz|~I)sC5@8G`5I0f!xAz0M5 zz>%7Ky|{rn#K!!T#K%N6_SS@CC?C(_@x4ZHt2jIYe=%CS6Ek1U7kC*$C7CiM_X%g^ zhLVM+NP@#<Kl)M48g*aUcmuNUBXRVybPV)4%CVm*iRBY_wu(-?!x8_1L^55B@92|f zR!KZPr|>q5&q3e=mdc%U=)mdc#rH&1<E^Tis=Sx1SF7Ymh|6Ji%I=MH*T|4HK0V>R z_8q(6tdmq##fZ9z{TMGK7nREjwR!VVOg|Jv75Ai-z#QZw)D-Ay`D#waL%Muw%GsDY z@46IdX?;JBIYjqLAjz`z{rUooxOGQ01+Sp%s)Ba+FV=hYdSK4``-x9^F?+%cKL|qH zY=>uBV?n9<dcDOaK$G-3r`H^@bXhvl+Ix9-_gLD06izgz-<;fI8%(#SDH1uQ9V|p8 zd6o^x7~m#7xE{pRX-NCMKYl1H^WFg|{6~;ogUOHA+-E|P;D&nO26l^q)w_p|G)h1! zeQnxhZT=-%ju*$CzF=#hVwt6-xgZ&P%m=D48_s!~S`Wy>8CrLrH#NKxDp5*LxX-2( z0rBR=hrwr+R2k&LdZEXrs4eB0b3>&z0Dao5kx{Q%xh!MwK;72@PonH^^Uu@L>9Uy% zq#hU3)iI*s(q6f5St{@_dO6>H$!SRvf2xV<q;CCv3s3faE`Ns8@{Gl8j#X?Zx*DjN z>hYvVp0@4{u&S_Jc>RksZ`wfCwMet}j9E5&!1=Hewoq0!^#}3tY}_vmbdrf@7}-S1 zZ?hNDAhjoN3l$1C#u+7-ymsQG*Vk7tH%Vpc5$1}Tdyt-Qn-3{?Xu)89uk_rA&4}bR zz&4wy!Sx6ILjy}xco}uRd+~AxcK5Rf1(#};*uJUmbJ0iS!xI#ectcDC#JOr{YWQmS zhm+Kz_&O6{#ntu|>SF>caVI;lRaa38PXPg}0qg;E8c9JBr|0nAPgSotYhI3(eq}T3 z=q=%G=grZ;_l$j-bv=QlV{4;#6R~Fy$P#?~d{Ul>Bu4_{qevs2voP8GltBYk;|62m zg){8iv6o+YKL~Azw!f&Ur)ef2)+%OU<bDPs6?R6%OXQ2hJ<<1~Ex<e725u}^O$U={ z9kh6!$c)5ui~DEo&mNQ9;Ix$<k;I>MwjdVA+9_Fovv#h?d~Baj)blagEMs@=Ho95) z%EGb8x&UweIr>d4*k-~&&00;@uLc|cm2Sb1Q>G(!PWOXFoaBL|ha}uO@}U{Rs&Rxc zVzBJ6ta?&cv6ybq5?NG>@r<aI;);vAcd7PL?xn;YA1*1ueEuQYrO)5M`>=h&zJ`sg zjhRjBviK-uRQloL(NCp*rJJLdMkYrMMyAUYM}kI%N9xOb%VvhPk(tQI>+{!Ehg8cq z@yuRHy#i}FDQYo`Q)7|J{Ys~m>^#?o1qO4*-i&=6?s@a@je5bhb@rn#`S%{S=pIDw z6f)a%nHt`&(OoPYfJXz`?<5bDx_wh9#w!z!2x3j!O%Y9s{+<37{!#v4{CB@G?#mpV 
zITGA&*|(xsrx&58qpzf|Ke-t#)6UuM5`7f?I58pdY666pz^4VaRb-YQP(drVUe|&6 zs5rR4fnxhP;XPne#rIzR6DJ@wD#i-s3jE-he!)JvPtE-?vV+{0xpm^{<M~b4hwo&y zqYz1sZ)786nPi(~@*Mc=JDj{~mh1ODy)gIC?>&RMqOWpzT6qq`x5_KCWXGR&?o2v= zb)IP~U%WvILVKXQ>fgbAhjXt9Uwz)zI{qUfq9EdRen0HZ&<lm3>ak7w_)Y-a%l)3O zj)tFF<qh>u-^M1|999G0e&^vSHoj{d-&^3<xwW@-YB8~Kxe@EU;!=b@QLk6ug0}Q^ z@l6VdIG|+-V#zhU(-m1#=J9ax0?EU5%vJO;^W$-p$Ihi4Da@;Pq~=!vgE4a)3szrn zGj=4EiTY9B9?~=>YDOxgHstnC$WMN1N9w82PoZVnT`$tYf-V%^r@JtEfhD{@T)$0A z#M=n^dgpfY*2zVtMc=6K!O+1k`MWO@+S8+ggp(Lqm>x5R@WMDOScjg6HbJ|?)9D_I zwuttMuIpv#U3}nQz*WHUAiLm77qKg~%Lrb1)vnVuZa9|X6`>3MvEXC&TnW#<%v7Vd zX4|=Z)erlkANBp><z&X}b6WTI*gM2>#0=SoZYtcAy}7QIpF=<J?M-M=opDJ6YE5p9 zBbNcgfJs^J0WF%HX%tVB(7JTPbUIfvYcAI^`K$E{ZfmC8lKqmgqSnhyrtk80q_o6a zv}iMuGkvU$*oqksz;%jS;Fy}enz)+6-kgjt&JZyGyD~y|BK}EphtdRTB=nWu6Ibxv zkLgx-w!~G`ca3tfx$+oj<HT3fUWu<kThCp?9NS!mobEeSJ%!?n7JK$>1|Jrc``fKN zmUGl<?Zotl)kW4JuZKw1%E^EN8|B<x%#v+~?mnpA4qkDJ=Ze$sP=Pd{O5ep!O1@QX zK&}-(G!-@`it!j%8cScA!F)ou?xpqHrCkY{OsPdw_2r{TwtcoA##Y`<)eB16$&o<9 z?}VPT;27YB-ANf58LQ9CVRjbv@m0o8j2-!$CtMKbb#==@giG5-iFT#|mF{1PC&!!p znjf!jz8$tLv9WX9ML&kF_xnHJAD}OYjtj~RQaccW7^v)o^zSz{UMwi9{oEe>A^3Ax zENxJKQh&efU0Fn>cX=V<EA(Ys?+h_y_<bj;98*%CAw&~%CPpYEd+*D3`-PZ*TUAN5 zK^}W9UJP{wg&fPy(`ut>Z^XVNtq7evMXaswDk}LH(7V1XJgjAlb6y}$FuuI`fFtkn zr@WkVX@up^pXZ0+Eyn)qIXAU%m7r&s4b<So)?y&(-sj!b^M2FYht*62JV~x@E~`!R z-etHK!S_Eh90;q^3({-E2*fmjG+fsBa#aH7;ttY=?OGn^HgkJ_dGq=(`NN9EIA+?Y zL8PU-`TLUXe&u344o#YJ!0`mt&*#2ri95Kv%O9+CRPC}1)H=7UrM>}ei_m~iRZE;l z{QMUDY)RxhtRr$jZ|n3s!!Ge43D9}hJ>ucTu{&8M9p|NgIv@1SPxwA39)jh$G?afh ze>)t0oPDMA`P-jw))4FXx8IKJ_#V5vW<0Y9!OhC9c>Qe7^yN@&T(4f2KA#!9^qu$H z=2}UDSza*hZsgX%zffE4FAN3zIa}06lgD^SG<mp4!Pft-?2zaSXdLpl008MB-Q5x7 zfg)vM=mb<!26-XT1RPKe0`ehX@CX#b9S=}YQUbYQ$TLtJ4haN#U|~1Nj0=MkD5wiZ zV&McYcNBsM1mTe=H?rgfgJUshAm}O<K`|tI!r*Z7j1mBHLy}YBkT@Vn4voRPA>4r= z?5{(@$>fhh!TzRu63`wnEWrx}BjAA`j0XmdxK7Rn`xO~SA$OO*4Of4${%;WfW4ijQ z698EeuHt_UTPY&B#kdUfA|Hv|9sH`xA2vn^)C~k435Nkdzv}{|RP^t|SRxkT4v+^b 
zC;;UDdVmmRWkqG6JMb?JOddH?9w7QJO#wpggZ@_oD=CwQ!T-=Gk^ZScl*m2OKkOi? z<hJI2HF>bYzv9V*)&8X^E2{h}hcblRW&ITokA)#o2rPMG4g^^t{SoB*0znti?ig}+ jMyUtL5`)15$qs+i28V}X@xLmgtO8a8h>7W&836tRH0z)6 literal 0 HcmV?d00001 diff --git a/results/kmer_size_experiment/results/20161012wed/stocherr.pe.csv b/results/kmer_size_experiment/results/20161012wed/stocherr.pe.csv new file mode 100644 index 00000000..0be2a3f6 --- /dev/null +++ b/results/kmer_size_experiment/results/20161012wed/stocherr.pe.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,1343,127 +31,2755,83 +41,5821,53 +51,14476,22 +61,36204,7 +71,48050,1 +81,12589,0 +91,138,1 +99,21,0 diff --git a/results/kmer_size_experiment/results/20161012wed/stocherr.plain.csv b/results/kmer_size_experiment/results/20161012wed/stocherr.plain.csv new file mode 100644 index 00000000..6b0f24bd --- /dev/null +++ b/results/kmer_size_experiment/results/20161012wed/stocherr.plain.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,961,55 +31,1729,19 +41,2909,1 +51,5501,0 +61,13278,0 +71,21068,0 +81,11700,0 +91,138,1 +99,21,0 diff --git a/results/kmer_size_experiment/results/20161012wed/stocherrcorr.links.csv b/results/kmer_size_experiment/results/20161012wed/stocherrcorr.links.csv new file mode 100644 index 00000000..6291487e --- /dev/null +++ b/results/kmer_size_experiment/results/20161012wed/stocherrcorr.links.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,1826,97 +31,3025,85 +41,5580,76 +51,12459,26 +61,31866,13 +71,28039,1 +81,42610,0 +91,11077,5 +99,21,0 diff --git a/results/kmer_size_experiment/results/20161012wed/stocherrcorr.pdf b/results/kmer_size_experiment/results/20161012wed/stocherrcorr.pdf new file mode 100644 index 0000000000000000000000000000000000000000..bdd65a497b16a26638384c03a4b5828339ac0269 GIT binary patch literal 6230 zcmZ{oc{tST-^XQ(k?hGy46+r*jBTvhl9GL224f7S#>|){`@V)0vPG82l2FMWQkG=j zBO#G3TehL(ndzL)?>xWrJl|{Py1w`4{%rT>kMH%m-=ZeEdeUGSd4Oo>Z0P6EzR)2D zJOB)o1!7&U1Ju+2ARRmsPDEibx^N;AD50wYkp(M)Wx)`zJQymkAPE2&VqO2=Jqr{O 
zO^GrCUdF?59w-C>XiOsFNJL7Q2_EZ8LLl+fT{#&!8Ci<$S1l~nBni;a03b20)Gi=@ zcLM}j_~MX2kPe&(M`PWAAQQMdk^q#WIG6$d6QK030IDC=8;`^g|8fWEU@=6>k^qGM z3I~}X30M*yLCGioU-_sGzsGS6iE$@-03i@%Fc74NLK7(`0O_GA!_h?|u&zjI-vlCM zW}X0_ta&>FEKGrYYTD@&tC`3v@}Z5H#obs<P2o^_sjGInc_^V8+RgQhXhDL}y5ahj zFpJiQSHAAI3*5PQN1bqe1c4l>SibXd%=>#$yrQ11`G)nz$zWgJjj<iMoie}uDDCQc z6(nq+NkTBZTCdK9h=YC3KUB(5x5&v_k1ubN*x&k*4Dz<heDZT5@F*X$ywqFVp=#6F zn<V7)bz(zuX>NE#rnaERqxt;<rI1Qnak*n<(ljNDHVh#jxMn;@z{Z+4STq$?#E#7< z(msisC|rvuojN(w{#p;3Oj6_;nBX+MeNziqE6uK<`k+hqMYZN>*mcuy;wq<iUmG7+ zie@Y=BTCdPT=yhR*d3dx#(8W^=ttx2Y0mc-GSSpT>gcqV<(!<arm<MOL;K2;o|~;J zvwWs-gPxP#O5Ho+!{Bt@;{C|xI_bmJvIHw~<_B61v}QQt!zx}?v&!g3rdvVv0U_T~ zGaP5SThJ{_Cbx7fRo{E!ZtTdWBJ0QRR%wkY-yntZw8t1k-`w`uMn}PEt6M$_j2@SN zczZ`<nf?c@&85Jc<nmO-DY$BSEEfldOXbJYYQ^K0<RyN=F_c@ifn3+??uaRFH<bux zId#Dqj2o6ku1tC0n(^kj7Kn7lOZlNcP=yFAs~lZ&btbc$JE%LtCpAe{PC4BjB?kV+ zE!W~i=mU3Gv0HPl6|*wtSwFUz6}4j&DcKS#N{|$XzANjR@Lvv<thmRJ?Q-r|Bxfqq zG6&NjkUl~uCT$L+t%=rJJFely+2W2D)~Dk*IqL8p&tzcD9<JVY0cuks!!azvUZ54@ z;RP*V88lGNRmy()cGFRqte5^P$zz&S(HVNXWgyz;r5tVBja#__p+Ht$AGQGcE*xa$ z&Nq$Eq|l<C_o88b^kMQ$iF5)AE2X;;A^EnPXL%M*F|B1VjcV$K_|tly@sskC0vhkv zR@thA`HO`quzSt%vyIM+t_<EMp*rhdx|}r>hw_lh4d{26=uuM8kpL*vQuYUkozfvf zP^IW$OQ%@T%v`-FsO~aQUk48;KWm3xut?aZze_#xhr<zNg?5iDP6Sa`FkhtobP~y{ zt$v$vWiXbsc^2_8FBAJ*CW>K2+u@l^E5nM5O=WqxjrEi>%=TG%Q;W*;K-}Y-My$H9 zO~~#Jn~`rqUATE&$7JYRe!}tBd+Dmsltm#0sO6&LU8XgTCnBt!lC^73=D3NO&oyy< zx=-7~%rQKjE*B%iNJPN#*S9^$2~W$LpKu=@Gv`aRdNOT0KDiyzNp|LDOYgojv>9@x z?4u)N+qaeK<`b``MdQ-6f+f+x?jvmk_ONtVQ)O(5TQ!p3-~AC_i%HI4M#?>XNtV!X z$NLn1VQR`2NBa<VAH|mGD^(ZhGr&2}b+|$PQP&)ywqtISUQf2&e1rZub2B%fUwEgn zzSfMbJENXV+FJ~ZYO<|qJBPC(7My_(N*`D)JmKF^`(v=?ixr(Fgr<{$6?cI}jjYW5 z$E%O;XzvV2Y|^l>teMrke_DlRPf^gAbDB)vz~d5a;@SiEZ70$9pAcRuEZL7K&&;yu z`Rcn$UVZcP`hx!%kq@^j`Ss;mrADsjI9ouaeQt~tYxSh*ONv|@oF4|y(@#6M-ZmvG z#%ai|=cW55#hJIx1@yMBij=C#cU_S&Vvqhzguhb}UvgPX$A@8MUGn3mn-=M|1Y-BZ z>!2oCnU!AO-!7l*!6(IiI+^E9BnYW3=p@Q~Nf2@;Gm0qIlUL)Kws>3(UkPXRRYh;| 
z_Bs8mwsumt>v-&+36OE|!N!;Ttn!@+HQBd2JZwu|0NrVulHQ6O2?$-NBs&8XXAL;N zeE-?9zha%%tTx^p6jUPP^q}Q~wmR9obWiI-Q&Bd|sZ?;I{&0<May%!<daN&7?c)*d z=5BtyPe50ufz8PEuVu4Ah}&nk7W)mpf<LLQ{5=0v){$e!8PjVfr6v+rJdY>DeuA04 z;U<_{w2yC_o~~&Zy2jnEnmg9XkUP?%nw!!oo}1QkCfB-Cd}jMGl;0MlA&euGbA}c2 z&U;{tyz+E!#MDU!SZvPA1?u+LcCGE!yxWy;PAhL<u}9kb6kC=zMBBkBFFe#2m_M!G z$NFP_w)XWKFC==F{IB0qa9>9ij2KQNEsLF%BpFP8Vzrv={vaX_&^9`2ZiGvGp_6Wm z{PGpY+<Ja~5<GW;msyqQlUPAFc}a4CGU!_U(P)i@W$m1Kot$~?9Il<bKHvI&_kGH5 z7<<KU46A%FjQv(VJxAHwUTaRxF+Ni8-E&GdKB}5~**A^fQVveu+s_Q>4K~l<OkG=< zIZ-~zr03<Y&wSx@G6F#_$0*ZRdvB~gM(j@DrQ6`u1}FCuwL)`|St^B-@el$cz9U_U zuPu`P*GI6{)#6jEzx%yMY|t>%VKASiUuO{Hd<sLp3k)1PdXaBr^k~|Bc$4!S?$U5o z={ojJyF{O{K`O)wBDRqh*_%G-{j$pdaV<Ier)?fsj6v%8iFVB*r+}zWln*R9$y8M4 zbjnbysj~EG%h^~{ZXMNp{!yEA`NcYr$89r5s`)2h7Py*naIzgZ0m=ZWT)fZhI8Ml5 zoZMKY@PJVRXIc02m9$YgZ(mD=>=fyj29r=1f_tLh8CL1?Ljfz#jULY2vLLXm6u(j| zllvm@MAcFI+BVGJhIc>Nxu%?L{K<-r;)))@rsE5`3S%H0UM5q0bcb!Xq^%WwFx+nm z4gf4bnR32eBHbSeq*GiaPt-qo@IuDKGsq@wDLmw9!i5(_U&*YaXQww!z|a#b^@EdM zQ$r2wWYd+O^LF5%(Q12xHXF(4A!l$H;Sre#bK0onV^kk2ZM9Fi(aKU_8H2UYYOFkh zz(O1#UtG49R$f(*pGK7MS6h^Xcd($(7fB05e67~YbEy8>aO-Vq3HlCiM-6$bOpNfd zTa)dKnksZ`!Mk{6(#C9WTPej>V`@DIc1SD)YODKf*7y+v510Ld)6TzJJzD*F>?lpE zl|pQP!OvgpMn$tf(djq4{fTTK1C%QPXb+^Kni&vAK{_mnf@Tz^GDNzf;M!OppuH?* zPhJi}{lor2cT_t2O=A>@M3^IqKzk4cV<{DoK13J*q(>n;pqd&;4*~>Jll)>s&?O(@ zWpg5xV%5~B=_n=9SiCt7j`(E(5zH=Y0~LO||A!{2CTcx<&=nmWZ8!nx3RI+e!G7~8 zb@ls<T2vs!P>X<ckvCBYBsD3>h{Bwp-{h#EtoVPDBekc$$?-v|w}tumsp!5&rACt^ z_Sk#<H$2Q}^;sMB6J^eWnc1(tx=SOhsB3}KHBBsJf=JAS<m)6p6ON5MX8LS}o=<n! 
z3&(vX^@?j5-}Fo0^`6XW!M&gxIgNcE-PZ$0>uK)(2xoYG&4xMTM4k=zr-ind+i~vB z$B&)ZprJ$4%-||3wNH`tXftR7-p8XexX8SX^(TS*-_H+SjBlYi%DGo(D-5PFNsnLn zMjm(I<L!j1TW5*Z7<Z;u%NzE}UhNgO4TD%E6TV5=2kd4b*_5{+5o&y0gx*=?#@wX% z=iADhP4S7&e7p-x<T^-`7$Tf_B}%g$JNJw)z(|osB56|Q3&GqKEsaQ$fP_hZ_NAH8 z?{wN21le@=adok^^>*9IaGooQ;S+N(kGh@A75AyH&u}5Gty}h_(ruAh@cRrtTmDaY z8n?b5w%op+e9whd-$Rw5vYyhOO%lVwupgO8dpA=ZBZAlXbOp(8cCCXlSZSUXBCEy^ zqCHVOG_aLpCJl$tJ<cGS*yl~8lYw3W^#P7%t+P_@7fL559Sn};+>it<t{>!_4AyZ9 z=re2jxIPc}*-C~c5y3i+N*Fi4LfzK4y|cLe6JK(o$%4m!k_1k1{5aPX1G;xfx2y0P z@LFn>{adaWh79c}O>)kCGTWP<1><$8w<gFO{i!yzkA(iv3=*J`IL`rO_IJI|zaGfa zewl8+CvG4k{lWLXu%Cg}wby>WJ#{WP0dZLuQp;)D`}O`0J6d@l%_U8`Wz92-bX>3K zt6y={(y-0Y(P~M=(0e(9XTrGe9@7Q#a)(sqp1v1W<}6;!NXq4q4_CyUCWazr<e|r9 zLc5&lCy$xQp3DxB+yHcIvPVR|W#=)ACIWTd@;{HXxqD`gjzNdRg#R9ssE)QiEsy3( zu30hv<EZ64w?+F!3F061G^{!`xuzbR2RweqMP!)^TkM{4pcyKijiF2l5xgy(8xW|V zOjymU6dYXu`&xwYo3t4Y8^FcTB90Jt4wZ*-vK*&g>uD$YoMYw?D!t2DK#S6xxGPW~ zSQo3GSoGGOoAJ^mu*tQ1Ebjz)B4;0_=2_(_=07$)Zt|d5W7u+7;yPf5!%*+W!=8cO zMH-@%3g3e`89nRV%>J{Bm5UtP&;uUE@H|93IN@~gHGVOkO4>@k$}>X=%8`8S@$kZD zHcwPW`B!3D+wf0a(BNu-Kf6DtKZ9yQV7SOE5mya$a!@mhxzK7kV}}#RS>w2B`Cf8P zv9HIowQX&5Z6e8f0c=4`7ZY-XB)H<4%Omu)&%>qjlKS<a2DJu0TIV=-VvJgG4?}9B zY_6#3su}Z(Hi??*yPZ>%3_U00DYPp7K=`9@BQUvB&y@|YW_xW)8zYt@G%en2ntR^n zJiWvQx7CGV3F3JNQ=h^ZYk7-p3x_A^6*hT3x?cL5C7h0(`nSqjP3<08<P$BHq6S|; zEXVbfEtGY9EAesP80If?OSQ$!>U=VdmG~~<E`hL!cx;R`uNx)}>n+<ZLs@m?3K{y% zP>)_<J;Gl|^2EkIxccI1_SM_(ym%zf=A9X!TU^?P9Ka7q2db9Rmd2J%%VHynBNrZ5 zjC?8fE#4ftIy^C=H#}7$Hyk)TG+a~ST{1nSiAqOB+?c!mbpTqrNj&M9(8a%&m82ZK zFgY5b&?7G*Z|$)*#NVGaIym}m=-uGsL6!U+i_G%Xya$gPb-qXJ7M!%~Fua^wsk2bf zi--ceNlxr7cHIUS5*0{?B+>ev`tW)=zji-UzevATzrAhd1F6Gvhi4BO56l@=7=;)a z7|R%ISU01j-f+KxMIA;}+>XEP60dlg#HS9ik~=BetAtT#x}mM;rDW?i=#1~-M!bU< z%6;_o8$Y2~sbl~y1)qUL_nhr!_|ni5E!}_0=#+LGW89f*oI}YOZ_vmDyFux2X%^`Q zsT^BAn>KsT%H^5^4^Qj^%tw#Fjwlx{4|9(p#8zophV)o<`|gCpH;3uE(uJFSff#p8 zN6iO>_fYnAL6_#1rm>&l;rZcj^LpTe1FygX&qg;H<JtiTPqzo&+N!?FWj9s6Y>$q& 
z*nSOozt78CXmH;kt}EZSeT%&H$HMKp<vP5>3hWW)M2&7uBgV`d=AGal{+*63kS+Ui zaz{i_iTmRPtv+|hQAgp5lNDoV_uZ?zlGxS{eGRSt{n4{r^X99BY3n|v@tP6ucYRaX z$Z5&o7r}Rr6pzjvvpY5!@+G81v*T4tXrNX>E`!#H7F$?P*rgVAA)LPR+ugejTdWH# z3*M1o{UQCUd3&$p-=szc3MMeKu~aYzpN4apvJW(e)H`>Er7~0qHwt$Nuj^*$UU}%3 z&y&yfFf-q&!>8k3hd!dr#k$=wb|{9cmDGXwoc}p<wutvYYO)Te)}oOI{d6Gw`I2v( zjMS)2Ruh@b*(RDLdYN<J7WkI*t##$REXLmL!H`E)21T{#HJLT8?Bm$u*rfG$patV| zbz&*v>Q`?XPGzfQ%x0S<ezSOW%7P`k=%8ryQIipi;fI`ENp-PCb-J{~bT12ijzZ?| zkSe(?NOWa)Wo%_ZS614pgQ6&aQvs<n9{0SVO@6#@IHXngxg+HM=T!6LEiomPJ^gHa zwk+1UZv2~Jm-x59t!BqiyB64heXjk}YG>l3g?9&*{f{4&`dP13$k?ejwPSlit0JmU zH-aTz$Vh<#>SWwt#)(z~_a8pn30kp_<B7f0rleSlF8&ZRA@Lqsi&`suY$#~pBg$(~ zW^m!^H1-RoiJa17o#GTYk@N!jv^x*oXVq=>X>{eoWX)L#YneWf;0J-{jfCTf%bf{n zs%c+en!v41YvP_7JU6i8a~OvqO{%Jv14&nR^lw`m`j@$_7EX*c_%>90-F!b}Rb**x zw}+{4Uhna1KImo4kBSXU4^;jxps1&`8{BhHUw0+H<i*mPpie<dp)quUJqbNM()Xp2 zWw_D;(l_VVEnU+-!9yS0(WTg;nlu60=yTBm!I|XMoi|$1{&$`xya;qB!#s~y<ri=* zJ4`8$qzsC#Caef({NeMWrsGl3XaBDCJ;5P$D}uwk&p7kzTMxN%jK1V#X{3;rmzL&+ z5RC?Y>shzd31y&{*bQ|5_|`%|--D&SuNQr%c78l#>E%subcKDbpTm_9UIpcTKK@-$ zh4Cz-YBYazJxCR{#+R)WFdO?lWyrd*BD>)fZgueOkHk+areoMC{aT^M&W8O(tAnzI z8Um(o(w4v*STmPB*cki${@$4&`NL<hWuUsovbxHKb4$1?V)B{zMdZ<T(91=keRx|$ zukMz}hszyef#RTxj%1%7S4NXFirOw-ICA*jJvZ*%?DGR6%cH9B(_#C^P(`Lwar675 z!8PPM@%{FpHD86BW7<p8V8V>_isw;7x;K|x-TJfj3m4Oa7WYqYZ>|-^8|MVk?L};T z{}*a2|Ba!5zhu%&7>Xi^!ccTlDz^S-WsSmUVF;+-1^`GG<>rQ@Xrol|(;g_V0P;j( zNCcp=BFKw`B_h#CHzEKmFAs9XQsi6$0R;rP<KZ_cj0;DQsHlrT;SnTHH#E`*2qL1; zt`w6e9D&DTfFKt<lIlo_gd-3XxfK9%MNvu-Py`@I27@KKBHe%>{I4bv6!J%-;s3CE zkTC9WJjoLcClP@ltUDHiyg{i3|CO0QC3o0AbmCvE{~N^r6pDY{1VH8_7veA7m};b4 zF^u4zl$I#BgI{y`(?=hPzKJBF5O4tK_gsL~iT+bp_Q50F0J1<Z7$E!i15{K{kW&D< z0spo^DC#-&0b>5PffXtDp#N=yP~!e&Q<9@d&Hv>GRiIp9{@VuqS3X$?n4**aZ$DXx z^1o~f3QGT~p#Y_v;O|%>9*#mI@f7hN2r@_cAt~ntg04`sPs;6#IuDQ;7E1(DMfqQ| XArRqs;;+dlD9b?rqN10K^#K0^v1kR% literal 0 HcmV?d00001 diff --git a/results/kmer_size_experiment/results/20161012wed/stocherrcorr.pe.csv 
b/results/kmer_size_experiment/results/20161012wed/stocherrcorr.pe.csv new file mode 100644 index 00000000..8d9cbaa6 --- /dev/null +++ b/results/kmer_size_experiment/results/20161012wed/stocherrcorr.pe.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,1548,97 +31,2653,68 +41,6189,65 +51,15642,18 +61,34269,12 +71,39181,0 +81,50743,0 +91,11697,5 +99,21,0 diff --git a/results/kmer_size_experiment/results/20161012wed/stocherrcorr.plain.csv b/results/kmer_size_experiment/results/20161012wed/stocherrcorr.plain.csv new file mode 100644 index 00000000..68bc459a --- /dev/null +++ b/results/kmer_size_experiment/results/20161012wed/stocherrcorr.plain.csv @@ -0,0 +1,10 @@ +K,NG50,AssemblyErrors +21,959,3 +31,1763,0 +41,2909,0 +51,5501,0 +61,13278,0 +71,21068,0 +81,38623,0 +91,10786,5 +99,21,0 diff --git a/results/kmer_size_experiment/results/generate-results.sh b/results/kmer_size_experiment/results/generate-results.sh new file mode 100755 index 00000000..4623f6aa --- /dev/null +++ b/results/kmer_size_experiment/results/generate-results.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -eou pipefail + +mkdir -p latest +./make-csv.sh ../perfect_cov/k*/stats.plain.txt > latest/perfect.plain.csv +./make-csv.sh ../perfect_cov/k*/stats.links.txt > latest/perfect.links.csv +./make-csv.sh ../perfect_cov/k*/stats.pe.txt > latest/perfect.pe.csv +./plot-n50-and-errs.R "Perfect cov. (100X, 100bp reads)" latest/perfect.pdf \ + latest/perfect.plain.csv latest/perfect.links.csv latest/perfect.pe.csv + +./plot-n50-and-errs.R "Perfect cov. (100X, 100bp reads)" latest/perfect_no_pe.pdf \ + latest/perfect.plain.csv latest/perfect.links.csv + +./make-csv.sh ../stoch_cov/k*/stats.plain.txt > latest/stoch.plain.csv +./make-csv.sh ../stoch_cov/k*/stats.links.txt > latest/stoch.links.csv +./make-csv.sh ../stoch_cov/k*/stats.pe.txt > latest/stoch.pe.csv +./plot-n50-and-errs.R "Stochastic cov. 
(100X, 100bp reads)" latest/stoch.pdf \ + latest/stoch.plain.csv latest/stoch.links.csv latest/stoch.pe.csv + +./make-csv.sh ../stocherr_cov/k*/stats.plain.txt > latest/stocherr.plain.csv +./make-csv.sh ../stocherr_cov/k*/stats.links.txt > latest/stocherr.links.csv +./make-csv.sh ../stocherr_cov/k*/stats.pe.txt > latest/stocherr.pe.csv +./plot-n50-and-errs.R "Stochastic cov. + 0.5% err (100X, 100bp reads)" latest/stocherr.pdf \ + latest/stocherr.plain.csv latest/stocherr.links.csv latest/stocherr.pe.csv + +./make-csv.sh ../stocherr_corr/k*/stats.plain.txt > latest/stocherrcorr.plain.csv +./make-csv.sh ../stocherr_corr/k*/stats.links.txt > latest/stocherrcorr.links.csv +./make-csv.sh ../stocherr_corr/k*/stats.pe.txt > latest/stocherrcorr.pe.csv +./plot-bfc.R "Stochastic cov. + 0.5% err (100X, 100bp reads)" latest/stocherrcorr.pdf \ + latest/stocherr.plain.csv latest/stocherr.links.csv latest/stocherr.pe.csv \ + latest/stocherrcorr.plain.csv latest/stocherrcorr.links.csv latest/stocherrcorr.pe.csv + +./make-cleaning-table.py ../stocherr_cov/k*/graph.k*.dist.txt > latest/cleaning.table.csv +./make-cleaning-table.py ../stocherr_corr/k*/graph.k*.dist.txt > latest/cleaning.corr.table.csv diff --git a/results/kmer_size_experiment/results/make-csvs-and-plots.sh b/results/kmer_size_experiment/results/make-csvs-and-plots.sh deleted file mode 100755 index b071ec69..00000000 --- a/results/kmer_size_experiment/results/make-csvs-and-plots.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -set -eou pipefail - -./make-csv.sh ../perfect_cov/k*/stats.plain.txt > perfect.plain.csv -./make-csv.sh ../perfect_cov/k*/stats.links.txt > perfect.links.csv -./make-csv.sh ../perfect_cov/k*/stats.pe.txt > perfect.pe.csv -./plot-n50-and-errs.R perfect.plain.csv perfect.links.csv perfect.pe.csv "Perfect coverage (100X, 100bp reads)" perfect.cov.pdf - -./make-csv.sh ../stoch_cov/k*/stats.plain.txt > stoch.plain.csv -./make-csv.sh ../stoch_cov/k*/stats.links.txt > stoch.links.csv -./make-csv.sh 
../stoch_cov/k*/stats.pe.txt > stoch.pe.csv -./plot-n50-and-errs.R stoch.plain.csv stoch.links.csv stoch.pe.csv "Stochastic coverage (100X, 100bp reads)" stoch.cov.pdf - -./make-csv.sh ../stocherr_cov/k*/stats.plain.txt > stocherr.plain.csv -./make-csv.sh ../stocherr_cov/k*/stats.links.txt > stocherr.links.csv -./make-csv.sh ../stocherr_cov/k*/stats.pe.txt > stocherr.pe.csv -./plot-n50-and-errs.R stocherr.plain.csv stocherr.links.csv stocherr.pe.csv "Stochastic coverage + Error (100X, 100bp reads, 0.5% err)" stocherr.cov.pdf - -./make-cleaning-table.py ../stocherr_cov/k*/graph.k*.dist.txt > cleaning.table.csv diff --git a/results/kmer_size_experiment/results/plot-bfc.R b/results/kmer_size_experiment/results/plot-bfc.R new file mode 100755 index 00000000..28acd51b --- /dev/null +++ b/results/kmer_size_experiment/results/plot-bfc.R @@ -0,0 +1,88 @@ +#!/usr/bin/env Rscript --vanilla + +# Isaac Turner 2016-10-12 + +library('ggplot2') +library('reshape') +library('scales') +library('plyr') +library('gridExtra') +library('cowplot') + +args <- commandArgs(trailingOnly=TRUE) +if(length(args) != 8) { + stop("Usage: ./plot-n50-and-errs.R <title> <out.pdf> <plain.csv> <links.csv> <pe.csv> <p.c.csv> <l.c.csv> <pe.c.csv>\n") +} + +# +plain_csv <- "latest/stocherr.plain.csv" +links_csv <- "latest/stocherr.links.csv" +pe_csv <- "latest/stocherr.pe.csv" +plot_title <- "Stochastic coverage + 0.5% error (100X, 100bp reads)" +corr_plain_csv <- "latest/stocherrcorr.plain.csv" +corr_links_csv <- "latest/stocherrcorr.links.csv" +corr_pe_csv <- "latest/stocherrcorr.pe.csv" +output_pdf <- "plot.pdf" +# + +use_pe <- (length(args) >= 5) +use_corr <- (length(args) >= 8) + +plot_title <- args[1] +output_pdf <- args[2] +plain_csv <- args[3] +links_csv <- args[4] +if(use_pe) { + pe_csv <- args[5] +} +if(use_corr) { + corr_plain_csv <- args[6] + corr_links_csv <- args[7] + corr_pe_csv <- args[8] +} + +a <- read.table(plain_csv,sep=',',head=T,comment.char='#',as.is=T) +b <- 
read.table(links_csv,sep=',',head=T,comment.char='#',as.is=T) +c <- read.table(pe_csv,sep=',',head=T,comment.char='#',as.is=T) +a$graph = factor('plain') +b$graph = factor('links') +c$graph <- factor('pe') + +aa <- read.table(corr_plain_csv,sep=',',head=T,comment.char='#',as.is=T) +bb <- read.table(corr_links_csv,sep=',',head=T,comment.char='#',as.is=T) +cc <- read.table(corr_pe_csv,sep=',',head=T,comment.char='#',as.is=T) + +a$corrNG50 <- aa$NG50 +b$corrNG50 <- bb$NG50 +c$corrNG50 <- cc$NG50 +a$corrAsmErrors <- aa$AssemblyErrors +b$corrAsmErrors <- bb$AssemblyErrors +c$corrAsmErrors <- cc$AssemblyErrors + +d = rbind(a,b,c) +dlevels <- c('pe','links','plain') +dlabels <- c('links PE','links','plain') +d$graph <- factor(d$graph, levels=dlevels, labels=dlabels) + +# Approach 1 +# Plot contig N50 +p1 <- ggplot(data=d, aes(x=K, y=NG50, color=graph, shape=graph)) + theme_minimal() + + theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + + geom_line() + geom_point(shape=4) + + geom_line(aes(y=corrNG50),linetype="dotted") + + scale_y_continuous(limits = c(0,250000)) + + ylab("NG50") + ggtitle(plot_title) + + theme(legend.title=element_blank()) + # hide legend title + theme(legend.justification=c(0,1), legend.position=c(0,1)) # legend in plot top left + +# Plot assembly error rate +p2 <- ggplot(data=d, aes(x=K, y=AssemblyErrors, color=graph)) + theme_minimal() + + geom_point(shape=4) + geom_line() + + geom_line(aes(y=corrAsmErrors),linetype="dotted") + + scale_y_continuous(breaks=seq(0,150,50)) + coord_cartesian(ylim=c(0,150)) + + ylab("Assembly Errors") + + theme(legend.position="none") # hide legend + +g <- plot_grid(p1, p2, align="v", nrow=2, rel_heights=c(3, 1)) + +ggsave(g, file=output_pdf, width=6, height=6) diff --git a/results/kmer_size_experiment/results/plot-n50-and-errs.R b/results/kmer_size_experiment/results/plot-n50-and-errs.R index e93c9821..e3b09b78 100755 --- a/results/kmer_size_experiment/results/plot-n50-and-errs.R +++ 
b/results/kmer_size_experiment/results/plot-n50-and-errs.R @@ -10,48 +10,71 @@ library('gridExtra') library('cowplot') args <- commandArgs(trailingOnly=TRUE) -if(length(args) != 5) { - stop("Usage: ./plot-n50-and-errs.R <plain.csv> <links.csv> <pe.csv> <title> <out.pdf>\n") +if(length(args) != 5 && length(args) != 4) { + stop("Usage: ./plot-n50-and-errs.R <title> <out.pdf> <plain.csv> <links.csv> [pe.csv]\n") } plain_csv <- "perfect.plain.csv" links_csv <- "perfect.links.csv" pe_csv <- "perfect.pe.csv" plot_title <- "Perfect coverage (100X, 100bp reads)" +use_pe <- TRUE # plain_csv <- "stoch.plain.csv" # links_csv <- "stoch.links.csv" # pe_csv <- "stoch.pe.csv" # plot_title <- "Stochastic coverage (100X, 100bp reads)" +# use_pe <- TRUE -# plain_csv <- "stocher.plain.csv" -# links_csv <- "stocher.links.csv" +# plain_csv <- "stocherr.plain.csv" +# links_csv <- "stocherr.links.csv" # pe_csv <- "stocherr.pe.csv" -# plot_title <- "Stochastic coverage + Error (100X, 100bp reads, 1% err)" +# plot_title <- "Stochastic coverage + 0.5% err (100X, 100bp reads)" +# use_pe <- TRUE # output_pdf <- "plot.pdf" -plain_csv <- args[1] -links_csv <- args[2] -pe_csv <- args[3] -plot_title <- args[4] -output_pdf <- args[5] +use_pe <- (length(args) == 5) + +plot_title <- args[1] +output_pdf <- args[2] +plain_csv <- args[3] +links_csv <- args[4] +if(use_pe) { + pe_csv <- args[5] +} a <- read.table(plain_csv,sep=',',head=T,comment.char='#',as.is=T) a$graph = factor('plain') b <- read.table(links_csv,sep=',',head=T,comment.char='#',as.is=T) b$graph = factor('links') -c <- read.table(pe_csv,sep=',',head=T,comment.char='#',as.is=T) -c$graph = factor('pe') -d <- rbind(a,b,c) -d$graph <- factor(d$graph, levels=c('pe','links','plain'), labels=c('links PE','links','plain')) +d <- rbind(a,b) + +if(use_pe) { + c <- read.table(pe_csv,sep=',',head=T,comment.char='#',as.is=T) + c$graph <- factor('pe') + d <- rbind(d,c) + dlevels <- c('pe','links','plain') + dlabels <- c('links PE','links','plain') +} 
else { + dlevels <- c('links','plain') + dlabels <- c('links','plain') +} + +d$graph <- factor(d$graph, levels=dlevels, labels=dlabels) + +N50_ylim <- 80000 +asm_ylim <- 5 +if(use_pe) { + N50_ylim <- 250000 + asm_ylim <- 150 +} -# Approach 1 # Plot contig N50 p1 <- ggplot(data=d, aes(x=K, y=NG50, color=graph)) + theme_minimal() + theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + geom_point(shape=4) + geom_line() + - scale_y_continuous(limits = c(0,250000)) + + scale_y_continuous(limits = c(0,N50_ylim)) + ylab("NG50") + ggtitle(plot_title) + theme(legend.title=element_blank()) + # hide legend title theme(legend.justification=c(0,1), legend.position=c(0,1)) # legend in plot top left @@ -59,26 +82,10 @@ p1 <- ggplot(data=d, aes(x=K, y=NG50, color=graph)) + theme_minimal() + # Plot assembly error rate p2 <- ggplot(data=d, aes(x=K, y=AssemblyErrors, color=graph)) + theme_minimal() + geom_point(shape=4) + geom_line() + - scale_y_continuous(breaks=seq(0,150,50)) + coord_cartesian(ylim=c(0,150)) + + scale_y_continuous(limits=c(0,asm_ylim)) + ylab("Assembly Errors") + theme(legend.position="none") # hide legend -# 1a. -# grid.arrange(p1, p2, ncol=1, heights=c(2, 1)) - -# 1b. g <- plot_grid(p1, p2, align="v", nrow=2, rel_heights=c(3, 1)) -# 1c. -# grid.newpage() -# grid.draw(rbind(ggplotGrob(q), ggplotGrob(s), size = "last")) - -# Approach 2 -# m <- melt(d,measure.vars=c('NG50','AssemblyErrors')) - -# ggplot(m, aes(x=K, y=value)) + -# geom_line(aes(color=graph)) + -# facet_grid(variable ~ ., scales="free_y", space="fixed") + -# xlab("kmer size") + ylab("") + ggtitle(plot_title) - ggsave(g, file=output_pdf, width=6, height=6) diff --git a/scripts/make-pipeline.pl b/scripts/make-pipeline.pl index 8b7a40aa..acbf0d10 100755 --- a/scripts/make-pipeline.pl +++ b/scripts/make-pipeline.pl @@ -375,16 +375,16 @@ sub print_usage STAMPY_BASE='.(defined($stampy_base) ? $stampy_base : '').' 
# Set up memory, threads and number of kmers in the graph -CTX_ARGS= ifdef MEM - CTX_ARGS:=$(CTX_ARGS) -m $(MEM) + CTX_ARGS_MEM=-m $(MEM) endif ifdef NKMERS - CTX_ARGS:=$(CTX_ARGS) -n $(NKMERS) + CTX_ARGS_NKMERS=-n $(NKMERS) endif ifdef NTHREADS - CTX_ARGS:=$(CTX_ARGS) -t $(NTHREADS) + CTX_ARGS_THREADS=-t $(NTHREADS) endif +CTX_ARGS=$(CTX_ARGS_MEM) $(CTX_ARGS_NKMERS) $(CTX_ARGS_THREADS) # # Parse USE_LINKS and JOINT_CALLING Makefile options @@ -939,7 +939,7 @@ sub merge_vcf_list for my $assem (qw(links plain)) { my $callroot = "$call.$pop.$assem.$kmerstr"; print "$proj/k$k/vcfcov/$callroot.%.vcf.gz: $proj/vcfs/$callroot.vcf.gz $proj/k$k/graphs/%.raw.ctx | dirs\n"; - print "\t( $mccortex vcfcov -m \$(MEM) \$(VCFCOV_ARGS) --ref $ref_path --out-fmt vcfgz \$^ | \$(BCFTOOLS) view --samples \"\$*\" -o \$@ -O z ) >& \$@.log\n\n"; + print "\t( $mccortex vcfcov \$(CTX_ARGS_MEM) \$(CTX_ARGS_NKMERS) \$(VCFCOV_ARGS) --ref $ref_path --out-fmt vcfgz \$^ | \$(BCFTOOLS) view --samples \"\$*\" -o \$@ -O z ) >& \$@.log\n\n"; } } } diff --git a/scripts/mccortex b/scripts/mccortex index 3182cdac..8e586630 100755 --- a/scripts/mccortex +++ b/scripts/mccortex @@ -4,8 +4,8 @@ set -euo pipefail function usage { - echo "usage mccortex <K> ..." 
>&2 - echo " wrapper to find the correct mccortex binary given kmer size (K)" >&2 + echo "usage: mccortex <K> [<cmd> ...]" >&2 + echo " Wrapper to find the correct mccortex binary given kmer size (K)" >&2 exit -1 } diff --git a/scripts/python/break-contigs-vs-truth.py b/scripts/python/break-contigs-vs-truth.py old mode 100644 new mode 100755 index 03ccfd1c..037aee77 --- a/scripts/python/break-contigs-vs-truth.py +++ b/scripts/python/break-contigs-vs-truth.py @@ -242,6 +242,8 @@ def main(k,path): def usage(err=None): if err is not None: print(err,file=sys.stderr) print("python break-contigs-vs-truth.py <k> [contigs.txt]",file=sys.stderr) + print(" Reads ref genome from STDIN as a single line",file=sys.stderr) + print(" If contigs.txt not passed, reads from STDIN after reading ref",file=sys.stderr) exit(-1) if __name__ == '__main__': diff --git a/scripts/python/count-bad-edges.py b/scripts/python/count-bad-edges.py old mode 100644 new mode 100755 diff --git a/scripts/python/mccortex.py b/scripts/python/mccortex.py old mode 100644 new mode 100755 diff --git a/scripts/python/pyRBT.py b/scripts/python/pyRBT.py old mode 100644 new mode 100755 diff --git a/src/commands/ctx_contigs.c b/src/commands/ctx_contigs.c index 2ecd0cef..b9608dff 100644 --- a/src/commands/ctx_contigs.c +++ b/src/commands/ctx_contigs.c @@ -181,10 +181,10 @@ int ctx_contigs(int argc, char **argv) // pop_colour is colour 1 graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, 1); - if(!genome_size) + if(!genome_size && (conf_table_path || num_gfiles)) { char nk_str[50]; - if(ctx_max_kmers <= 0) die("Please pass --genome <G> if streaming"); + if(ctx_max_kmers == 0) die("Please pass --genome <G> if streaming"); genome_size = ctx_max_kmers; ulong_to_str(genome_size, nk_str); status("Taking number of kmers as genome size: %s", nk_str); diff --git a/src/commands/ctx_vcfcov.c b/src/commands/ctx_vcfcov.c index a6d81d45..0a8352cc 100644 --- a/src/commands/ctx_vcfcov.c +++ 
b/src/commands/ctx_vcfcov.c @@ -17,8 +17,9 @@ const char vcfcov_usage[] = "usage: "CMD" "SUBCMD" [options] <in.vcf> <in.ctx> [in2.ctx ...]\n" "\n" -" Get coverage of a VCF in the cortex graphs. VCF must be sorted by position. \n" -" It is recommended to use uncleaned graphs.\n" +" Add coverage to a VCF using cortex graphs. It is recommended to use\n" +" uncleaned graphs. The VCF must be sorted by position, with duplicates removed\n" +" Indels ought to be left aligned to remove duplicates.\n" "\n" " -h, --help This help message\n" " -q, --quiet Silence status output normally printed to STDERR\n" @@ -103,6 +104,13 @@ int ctx_vcfcov(int argc, char **argv) if(use_lowmem && use_himem) cmd_print_usage("Cannot use --low-mem and --high-mem together!"); + // Override number of kmers to use if --low-mem passed, since we calculate + // number of kmers required anyway + if(use_lowmem && memargs.num_kmers_set) { + memargs.num_kmers_set = 0; + memargs.num_kmers_set = false; + } + if(!max_allele_len) max_allele_len = DEFAULT_MAX_ALLELE_LEN; if(!max_gt_vars) max_gt_vars = DEFAULT_MAX_GT_VARS; diff --git a/tests/pipeline/Makefile b/tests/pipeline/Makefile index cdb321fc..e3643c05 100644 --- a/tests/pipeline/Makefile +++ b/tests/pipeline/Makefile @@ -1,5 +1,10 @@ SHELL=/bin/bash -euo pipefail +# +# Simulate reads from two given samples, with sequencing error of 1%, 50X, 100bp +# Call variants and check that we get exactly the expected variants +# + CTXDIR=../.. 
MCCORTEX=$(CTXDIR)/bin/mccortex31 DNACAT=$(CTXDIR)/libs/seq_file/bin/dnacat From b01f22f746d56e60d14d772d9259887472a7381b Mon Sep 17 00:00:00 2001 From: Isaac Turner <turner.isaac@gmail.com> Date: Mon, 31 Oct 2016 11:19:51 +0000 Subject: [PATCH 3/3] Add missing file for kmer size experiment --- .../results/make-csvs-and-plots.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100755 results/kmer_size_experiment/results/make-csvs-and-plots.sh diff --git a/results/kmer_size_experiment/results/make-csvs-and-plots.sh b/results/kmer_size_experiment/results/make-csvs-and-plots.sh new file mode 100755 index 00000000..b071ec69 --- /dev/null +++ b/results/kmer_size_experiment/results/make-csvs-and-plots.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -eou pipefail + +./make-csv.sh ../perfect_cov/k*/stats.plain.txt > perfect.plain.csv +./make-csv.sh ../perfect_cov/k*/stats.links.txt > perfect.links.csv +./make-csv.sh ../perfect_cov/k*/stats.pe.txt > perfect.pe.csv +./plot-n50-and-errs.R perfect.plain.csv perfect.links.csv perfect.pe.csv "Perfect coverage (100X, 100bp reads)" perfect.cov.pdf + +./make-csv.sh ../stoch_cov/k*/stats.plain.txt > stoch.plain.csv +./make-csv.sh ../stoch_cov/k*/stats.links.txt > stoch.links.csv +./make-csv.sh ../stoch_cov/k*/stats.pe.txt > stoch.pe.csv +./plot-n50-and-errs.R stoch.plain.csv stoch.links.csv stoch.pe.csv "Stochastic coverage (100X, 100bp reads)" stoch.cov.pdf + +./make-csv.sh ../stocherr_cov/k*/stats.plain.txt > stocherr.plain.csv +./make-csv.sh ../stocherr_cov/k*/stats.links.txt > stocherr.links.csv +./make-csv.sh ../stocherr_cov/k*/stats.pe.txt > stocherr.pe.csv +./plot-n50-and-errs.R stocherr.plain.csv stocherr.links.csv stocherr.pe.csv "Stochastic coverage + Error (100X, 100bp reads, 0.5% err)" stocherr.cov.pdf + +./make-cleaning-table.py ../stocherr_cov/k*/graph.k*.dist.txt > cleaning.table.csv