From 88bb16a3eb77f73597b207425b8a9372ec052a42 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 14 Aug 2023 12:08:43 +0100 Subject: [PATCH 01/26] Add view -N/-R ^FILE for FILE negation. (#1896) These options output only the reads with read names (-N) or read groups (-R) listed in FILE. Copying the syntax used in various bcftools commands, now ^FILE negates this so it only outputs reads with their name/group not listed in the file. The third exclusion-by-hash in the code is tag types, but it's less likely we'll be using files of tag names as these tend to be more prescribed than the generated names and read-groups. Hence negation isn't supported there. Fixes #1895 --- doc/samtools-view.1 | 8 ++++++++ sam_view.c | 37 ++++++++++++++++++++++++++++++------- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/doc/samtools-view.1 b/doc/samtools-view.1 index f38d29d23..2e34f912f 100644 --- a/doc/samtools-view.1 +++ b/doc/samtools-view.1 @@ -292,6 +292,10 @@ or .BI "-N " FILE ", --qname-file " FILE Output only alignments with read names listed in .IR FILE . +If \fIFILE\fR starts with \fB^\fR then the operation is negated and +only outputs alignment with read groups not listed in \fIFILE\fR. +It is not permissible to mix both the filter-in and filter-out style +syntax in the same command. .TP .BI "-r " STR ", --read-group " STR Output alignments in read group @@ -306,6 +310,10 @@ This behaviour may change in a future release. Output alignments in read groups listed in .I FILE [null]. +If \fIFILE\fR starts with \fB^\fR then the operation is negated and +only outputs alignment with read names not listed in \fIFILE\fR. +It is not permissible to mix both the filter-in and filter-out style +syntax in the same command. Note that records with no .B RG tag will also be output when using this option. diff --git a/sam_view.c b/sam_view.c index aa5b92310..2a1439acb 100644 --- a/sam_view.c +++ b/sam_view.c @@ -54,6 +54,8 @@ typedef struct samview_settings { strhash_t rnhash; strhash_t tvhash; int min_mapQ; + int rghash_discard; // 0 keep, 1 discard + int rnhash_discard; // 0 keep, 1 discard // Described here in the same terms as the usage statement. // The code however always negates to "reject if" keep if: @@ -176,7 +178,8 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin uint8_t *s = bam_aux_get(b, "RG"); if (s) { khint_t k = kh_get(str, settings->rghash, (char*)(s + 1)); - if (k == kh_end(settings->rghash)) return 1; + if ((k == kh_end(settings->rghash)) != settings->rghash_discard) + return 1; } } if (settings->tag) { @@ -204,9 +207,11 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin } if (settings->rnhash) { const char* rn = bam_get_qname(b); - if (!rn || kh_get(str, settings->rnhash, rn) == kh_end(settings->rnhash)) { + strhash_t h = settings->rnhash; + if (!rn && !settings->rnhash_discard) + return 1; + if ((kh_get(str, h, rn) == kh_end(h)) != settings->rnhash_discard) return 1; - } } if (settings->library) { const char *p = bam_get_library((sam_hdr_t*)h, b); @@ -305,7 +310,12 @@ static int add_read_group_single(const char *subcmd, samview_settings_t *setting if (settings->rghash == NULL) { settings->rghash = kh_init(str); if (settings->rghash == NULL) goto err; + } else if (settings->rghash_discard == 1) { + print_error("view", "cannot mix include and exclude read-group files in the same command line"); + free(d); + return -1; } + settings->rghash_discard = 0; kh_put(str, settings->rghash, d, &ret); if (ret == -1) goto err; @@ -326,8 +336,14 @@ static int add_read_names_file(const char *subcmd, samview_settings_t *settings, perror(NULL); return -1; } + } else if ((settings->rnhash_discard == 0 && *fn == '^') || + (settings->rnhash_discard == 1 && *fn != '^')) { + print_error("view", "cannot mix include and exclude read-name files in the same command line"); + return -1; } - return populate_lookup_from_file(subcmd, settings->rnhash, fn); + settings->rnhash_discard = (*fn == '^'); + return populate_lookup_from_file(subcmd, settings->rnhash, + fn + (*fn == '^')); } static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn) @@ -338,8 +354,14 @@ static int add_read_groups_file(const char *subcmd, samview_settings_t *settings perror(NULL); return -1; } + } else if ((settings->rghash_discard == 0 && *fn == '^') || + (settings->rghash_discard == 1 && *fn != '^')) { + print_error("view", "cannot mix include and exclude read-group files in the same command line"); + return -1; } - return populate_lookup_from_file(subcmd, settings->rghash, fn); + settings->rghash_discard = (*fn == '^'); + return populate_lookup_from_file(subcmd, settings->rghash, + fn + (*fn == '^')); } static int add_tag_value_single(const char *subcmd, samview_settings_t *settings, char *name) @@ -1458,9 +1480,10 @@ static int usage(FILE *fp, int exit_status, int is_long_help) "\n" "Filtering options (Only include in output reads that...):\n" " -L, --target[s]-file FILE ...overlap (BED) regions in FILE\n" +" -N, --qname-file [^]FILE ...whose read name is listed in FILE (\"^\" negates)\n" " -r, --read-group STR ...are in read group STR\n" -" -R, --read-group-file FILE ...are in a read group listed in FILE\n" -" -N, --qname-file FILE ...whose read name is listed in FILE\n" +" -R, --read-group-file [^]FILE\n" +" ...are in a read group listed in FILE\n" " -d, --tag STR1[:STR2] ...have a tag STR1 (with associated value STR2)\n" " -D, --tag-file STR:FILE ...have a tag STR whose value is listed in FILE\n" " -q, --min-MQ INT ...have mapping quality >= INT\n" From 3aa477829a34ea664b5e4e0b544cc5cfe7aa09d0 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 7 Aug 2023 11:50:02 +0100 Subject: [PATCH 02/26] Add samtools sort -N / samtools merge -N option. These handle an alternative form of read-name sort order that is purely ASCII instead of the "natural" alphanumeric sort. Natural sort order will place "a12" after "a3", as it computes 12 > 3 rather than "1" < "3" (ASCII). However it also means hexadecimal values such as "a12b" > "a3bc", and "a09b" > "a7bc". Typically any read name generation system using leading fixed-width columns with leading zeros to pad is going to be safer using traditional ASCII sort orders. Note while writing this my initial implementation was to add QueryNameASCII to the SamOrder typedef, but it turns out the sheer level of code duplication going on here means doing that yielding 81 changed lines, vs 15 for this. I don't like using a global variable, but it's already using one due to the lacklustre ksort API which can't pass in a client state argument. (Being poorly modelled on qsort I assume.) Hence a sibling global is a trivial change and less error prone. Fixes #1500 --- bam_sort.c | 19 ++++++++--- doc/samtools-merge.1 | 22 ++++++++++--- doc/samtools-sort.1 | 52 ++++++++++++++++++++++--------- test/sort/name.sort.expected.sam | 2 +- test/sort/name2.sort.expected.sam | 24 ++++++++++++++ test/test.pl | 1 + 6 files changed, 97 insertions(+), 23 deletions(-) create mode 100644 test/sort/name2.sort.expected.sam diff --git a/bam_sort.c b/bam_sort.c index b44bd665d..d0163394c 100644 --- a/bam_sort.c +++ b/bam_sort.c @@ -164,11 +164,15 @@ static template_coordinate_key_t* template_coordinate_key(bam1_t *b, template_co typedef enum {Coordinate, QueryName, TagCoordinate, TagQueryName, MinHash, TemplateCoordinate} SamOrder; static SamOrder g_sam_order = Coordinate; +static int natural_sort = 1; // not ASCII, but alphanumeric: a12b > a7b static char g_sort_tag[2] = {0,0}; #define is_digit(c) ((c)<='9' && (c)>='0') static int strnum_cmp(const char *_a, const char *_b) { + if (!natural_sort) + return strcmp(_a,_b); + const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b; const unsigned char *pa = a, *pb = b; while (*pa && *pb) { @@ -1535,7 +1539,8 @@ static void merge_usage(FILE *to) " or: samtools merge [options] ... \n" "\n" "Options:\n" -" -n Input files are sorted by read name\n" +" -n Input files are sorted by read name (natural)\n" +" -N Input files are sorted by read name (ASCII)\n" " -t TAG Input files are sorted by TAG value\n" " -r Attach RG tag (inferred from file names)\n" " -u Uncompressed BAM output\n" @@ -1581,11 +1586,12 @@ int bam_merge(int argc, char *argv[]) return 0; } - while ((c = getopt_long(argc, argv, "h:nru1R:o:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "h:nNru1R:o:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) { switch (c) { case 'r': flag |= MERGE_RG; break; case 'f': flag |= MERGE_FORCE; break; case 'h': fn_headers = optarg; break; + case 'N': natural_sort = 0; // fall through case 'n': sam_order = QueryName; break; case 'o': fnout = optarg; break; case 't': sort_tag = optarg; break; @@ -3287,6 +3293,9 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, break; case QueryName: new_so = "queryname"; + new_ss = natural_sort + ? "queryname:natural" + : "queryname:lexicographical"; break; case MinHash: new_so = "coordinate"; @@ -3605,7 +3614,8 @@ static void sort_usage(FILE *fp) " -I FILE Order minimisers by their position in FILE FASTA\n" " -w INT Window size for minimiser indexing via -I ref.fa [100]\n" " -H Squash homopolymers when computing minimiser\n" -" -n Sort by read name (not compatible with samtools index command)\n" +" -n Sort by read name (natural): cannot be used with samtools index\n" +" -N Sort by read name (ASCII): cannot be used with samtools index\n" " -t TAG Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n" " -o FILE Write final output to FILE rather than standard output\n" " -T PREFIX Write temporary files to PREFIX.nnnn.bam\n" @@ -3658,9 +3668,10 @@ int bam_sort(int argc, char *argv[]) { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MI:K:uRw:H", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "l:m:nNo:O:T:@:t:MI:K:uRw:H", lopts, NULL)) >= 0) { switch (c) { case 'o': fnout = optarg; o_seen = 1; break; + case 'N': natural_sort = 0; // fall through case 'n': sam_order = QueryName; break; case 't': by_tag = true; sort_tag = optarg; break; case 'm': { diff --git a/doc/samtools-merge.1 b/doc/samtools-merge.1 index 342313010..2978afc63 100644 --- a/doc/samtools-merge.1 +++ b/doc/samtools-merge.1 @@ -83,8 +83,10 @@ of existing IDs in the output header will have a suffix appended to them to diff records from other files and the read records will be updated to reflect this. The ordering of the records in the input files must match the usage of the -\fB-n\fP and \fB-t\fP command-line options. If they do not, the output -order will be undefined. See +\fB-n\fP, \fB-N\fP and \fB-t\fP command-line options. If they do not, +the output order will be undefined. Note this also extends to +disallowing mixing of "queryname" files with a combination of natural +and lexicographical sort orders. See .B sort for information about record ordering. @@ -122,8 +124,20 @@ is actually in SAM format, though any alignment records it may contain are ignored.) .TP .B -n -The input alignments are sorted by read names rather than by chromosomal -coordinates +The input alignments are sorted by read names using an alpha-numeric +ordering, rather than by chromosomal coordinates. +The alpha-numeric or \*(lqnatural\*(rq sort order detects runs of digits in the +strings and sorts these numerically. Hence "a7b" appears before "a12b". +Note this is not suitable where hexadecimal values are in use. +.TP +.B -N +The input alignments are sorted by read names using a lexicographical +ordering, rather than by chromosomal coordinates. Unlike \fB-n\fR no +detection of numeric components is used, instead relying purely on the +ASCII value of each character. Hence "x12" comes before "x7" as "1" +is before "7" in ASCII. This is a more appropriate name sort order +where all digits in names are already zero-padded and/or hexadecimal +values are being used. .TP .BI -o \ FILE Write merged output to diff --git a/doc/samtools-sort.1 b/doc/samtools-sort.1 index 4e9be1976..7b25b1387 100644 --- a/doc/samtools-sort.1 +++ b/doc/samtools-sort.1 @@ -48,11 +48,12 @@ samtools sort .SH DESCRIPTION .PP -Sort alignments by leftmost coordinates, by read name when \fB-n\fR -is used, by tag contents with \fB-t\fR, or a minimiser-based collation -order with \fB-M\fR. An appropriate -.B @HD-SO -sort order header tag will be added or an existing one updated if necessary. +Sort alignments by leftmost coordinates, by read name when \fB-n\fR or +\fB-N\fR are used, by tag contents with \fB-t\fR, or a minimiser-based +collation order with \fB-M\fR. An appropriate \fB@HD SO\fR +sort order header tag will be added or an existing one updated if +necessary, along with the \fB@HD SS\fR sub-sort header tag where +appropriate. The sorted output is written to standard output by default, or to the specified file @@ -74,7 +75,8 @@ instead if you need name collated data without a full lexicographical sort. Note that if the sorted output file is to be indexed with .BR "samtools index" , the default coordinate sort must be used. -Thus the \fB-n\fR, \fB-t\fR and \fB-M\fR options are incompatible with +Thus the \fB-n\fR, \fB-N\fR, \fB-t\fR and \fB-M\fR options are +incompatible with .BR "samtools index" . When sorting by minimisier (\fB-M\fR), the sort order is defined by @@ -115,11 +117,26 @@ minimum value of 1M for this setting. .B -n Sort by read names (i.e., the .B QNAME -field) rather than by chromosomal coordinates. +field) using an alpha-numeric ordering, rather than by chromosomal coordinates. +The alpha-numeric or \*(lqnatural\*(rq sort order detects runs of digits in the +strings and sorts these numerically. Hence "a7b" appears before "a12b". +Note this is not suitable where hexadecimal values are in use. +Sets the header sub-sort (\fB@HD SS\fR) tag to \fBqueryname:natural\fR. +.TP +.B -N +Sort by read names (i.e., the +.B QNAME +field) using the lexicographical ordering, rather than by chromosomal +coordinates. Unlike \fB-n\fR no detection of numeric components is +used, instead relying purely on the ASCII value of each character. +Hence "x12" comes before "x7" as "1" is before "7" in ASCII. This is +a more appropriate name sort order where all digits in names are +already zero-padded and/or hexadecimal values are being used. +Sets the header sub-sort (\fB@HD SS\fR) tag to \fBqueryname:lexicographical\fR. .TP .BI "-t " TAG Sort first by the value in the alignment tag TAG, then by position or name (if -also using \fB-n\fP). +also using \fB-n\fP or \fB-N\fR). .TP .B "-M " Sort unmapped reads (those in chromosome "*") by their sequence @@ -212,7 +229,8 @@ and the sub-sort (SS) is The following rules are used for ordering records. If option \fB-t\fP is in use, records are first sorted by the value of -the given alignment tag, and then by position or name (if using \fB-n\fP). +the given alignment tag, and then by position or name (if using \fB-n\fP +or \fB-N\fP). For example, \*(lq-t RG\*(rq will make read group the primary sort key. The rules for ordering by tag are: @@ -236,11 +254,17 @@ Character tags (type A) are compared by binary character value. No attempt is made to compare tags of other types \(em notably type B array values will not be compared. .PP -When the \fB-n\fP option is present, records are sorted by name. Names are -compared so as to give a \*(lqnatural\*(rq ordering \(em i.e. sections -consisting of digits are compared numerically while all other sections are -compared based on their binary representation. This means \*(lqa1\*(rq will -come before \*(lqb1\*(rq and \*(lqa9\*(rq will come before \*(lqa10\*(rq. +When the \fB-n\fP or \fB-N\fP option is present, records are sorted by +name. Historically samtools has used a \*(lqnatural\*(rq ordering +\(em i.e. sections consisting of digits are compared numerically while +all other sections are compared based on their binary representation. +This means \*(lqa1\*(rq will come before \*(lqb1\*(rq and \*(lqa9\*(rq +will come before \*(lqa10\*(rq. However this alpha-numeric sort can +be confused by runs of hexadecimal digits. The newer \fB-N\fP +option adds a simpler lexicographical based name collation which does not +attempt any numeric comparisons and may be more appropriate for some +data sets. Note care must be taken when using \fBsamtools merge\fP to +ensure all files are using the same collation order. Records with the same name will be ordered according to the values of the READ1 and READ2 flags (see .BR flags ). diff --git a/test/sort/name.sort.expected.sam b/test/sort/name.sort.expected.sam index 6e10914af..22fca6238 100644 --- a/test/sort/name.sort.expected.sam +++ b/test/sort/name.sort.expected.sam @@ -1,4 +1,4 @@ -@HD VN:1.4 SO:queryname +@HD VN:1.4 SO:queryname SS:queryname:natural @SQ SN:insert LN:599 @SQ SN:ref1 LN:45 @SQ SN:ref2 LN:40 diff --git a/test/sort/name2.sort.expected.sam b/test/sort/name2.sort.expected.sam new file mode 100644 index 000000000..340d10821 --- /dev/null +++ b/test/sort/name2.sort.expected.sam @@ -0,0 +1,24 @@ +@HD VN:1.4 SO:queryname SS:queryname:lexicographical +@SQ SN:insert LN:599 +@SQ SN:ref1 LN:45 +@SQ SN:ref2 LN:40 +@SQ SN:ref3 LN:4 +@PG ID:llama +@RG ID:fish PG:llama +@RG ID:cow PU:13_&^&&*(:332 PG:donkey +@RG PU:*9u8jkjjkjd: ID:colt +@PG ID:bull PP:donkey +@PG ID:donkey +@CO Do you know? +r005 83 ref1 37 30 9M = 7 -39 CAGCGCCAT * RG:Z:colt PG:Z:donkey +r005 163 ref1 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 YY:i:100 RG:Z:colt PG:Z:donkey +r006 0 ref1 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * XA:Z:abc XB:i:-10 RG:Z:colt PG:Z:donkey +r006 16 ref1 29 30 6H5M * 0 0 TAGGC * RG:Z:colt PG:Z:donkey +r007 0 ref1 9 30 5H6M * 0 0 AGCTAA * RG:Z:colt PG:Z:donkey +r007 0 ref1 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * RG:Z:colt PG:Z:donkey +x10 0 ref2 10 30 25M * 0 0 CAAATAATTAAGTCTACAGAGCAAC ????????????????????????? RG:Z:cow PG:Z:bull +x11 0 ref2 12 30 24M * 0 0 AATAATTAAGTCTACAGAGCAACT ???????????????????????? RG:Z:cow PG:Z:bull +x12 0 ref2 14 30 23M * 0 0 TAATTAAGTCTACAGAGCAACTA ??????????????????????? RG:Z:cow PG:Z:bull +x7 0 ref2 1 30 20M * 0 0 AGGTTTTATAAAACAAATAA * RG:Z:cow PG:Z:bull +x8 0 ref2 2 30 21M * 0 0 GGTTTTATAAAACAAATAATT ????????????????????? RG:Z:cow PG:Z:bull +x9 0 ref2 6 30 9M4I13M * 0 0 TTATAAAACAAATAATTAAGTCTACA ?????????????????????????? RG:Z:cow PG:Z:bull diff --git a/test/test.pl b/test/test.pl index 3702994d8..179390a3f 100755 --- a/test/test.pl +++ b/test/test.pl @@ -3213,6 +3213,7 @@ sub test_sort # Name sort test_cmd($opts, out=>"sort/name.sort.expected.sam", ignore_pg_header => 1, cmd=>"$$opts{bin}/samtools sort${threads} -n -m 10M $$opts{path}/dat/test_input_1_a.bam -O SAM -o -"); + test_cmd($opts, out=>"sort/name2.sort.expected.sam", ignore_pg_header => 1, cmd=>"$$opts{bin}/samtools sort${threads} -N -m 10M $$opts{path}/dat/test_input_1_b.bam -O SAM -o -"); # Tag sort (RG) test_cmd($opts, out=>"sort/tag.rg.sort.expected.sam", ignore_pg_header => 1, cmd=>"$$opts{bin}/samtools sort${threads} -t RG -m 10M $$opts{path}/dat/test_input_1_a.bam -O SAM -o -"); From ef784095cd3a38453c68bc0bc67305d81570eb87 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 7 Aug 2023 12:22:15 +0100 Subject: [PATCH 03/26] Bug fix samtools sort --write-index sort mode checks. This previously had a verbatim list of all the sorts to exclude (accidentally omitting MinHash), instead of checking only the one sort order that works with indices. --- bam_sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bam_sort.c b/bam_sort.c index d0163394c..d5533f5cf 100644 --- a/bam_sort.c +++ b/bam_sort.c @@ -3743,7 +3743,7 @@ int bam_sort(int argc, char *argv[]) goto sort_end; } - if (ga.write_index && (sam_order == QueryName || sam_order == TagQueryName || sam_order == TagCoordinate || sam_order == TemplateCoordinate)) { + if (ga.write_index && sam_order != Coordinate) { fprintf(stderr, "[W::bam_sort] Ignoring --write-index as it only works for position sorted files.\n"); ga.write_index = 0; } From ed3b05fbefe6fc79a49e7e2207e6c7610f547fdb Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 9 Aug 2023 12:05:41 +0100 Subject: [PATCH 04/26] Add hclen SAM filter documentation. Fixes the last comment in #813 (previously closed already), along with the associated htslib PR. --- doc/samtools.1 | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/doc/samtools.1 b/doc/samtools.1 index be5b4aa7c..307a7c396 100644 --- a/doc/samtools.1 +++ b/doc/samtools.1 @@ -1165,6 +1165,7 @@ flag.secondary int Single bit, 0 or 256 flag.qcfail int Single bit, 0 or 512 flag.dup int Single bit, 0 or 1024 flag.supplementary int Single bit, 0 or 2048 +hclen int Number of hard-clipped bases library string Library (LB header via RG) mapq int Mapping quality mpos int Synonym for pnext @@ -1193,10 +1194,12 @@ equivalent to \fBflag & 1024\fR. "qlen" and "rlen" are measured using the CIGAR string to count the number of query (sequence) and reference bases consumed. Note "qlen" may not exactly match the length of the "seq" field if the sequence is -"*". "sclen" is the number of soft-clipped bases. When combined in -"qlen-sclen" it can give the number of sequence bases used in the -alignment, distinguishing between global alignment and local alignment -length. +"*". + +"sclen" and "hclen" are the number of soft and hard-clipped +bases respectively. The formula "qlen-sclen" gives the +number of sequence bases used in the alignment, distinguishing between +global alignment and local alignment length. "endpos" is the (1-based inclusive) position of the rightmost mapped base of the read, as measured using the CIGAR string, and for mapped reads From 117549fc746bded7fecefcd746aadebeb45ec637 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Tue, 22 Aug 2023 18:05:21 +0100 Subject: [PATCH 05/26] Add the order of filtering to the man page. (#1907) --- doc/samtools-fasta.1 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/samtools-fasta.1 b/doc/samtools-fasta.1 index 7eb47e84a..fbdef34f5 100644 --- a/doc/samtools-fasta.1 +++ b/doc/samtools-fasta.1 @@ -128,6 +128,8 @@ soft-clipped CIGAR records then the soft-clipped data will be in the output FASTA/FASTQ. Hard-clipped data is, by definition, absent from the SAM record and hence will be absent in any FASTA/FASTQ produced. +The filter options order of precedence is -d, -f, -F, --rf and -G. + .SH OPTIONS .TP 8 .B -n From ae19296215245937dd7ff8fee36c58d2d259f349 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 21 Aug 2023 22:30:33 +1200 Subject: [PATCH 06/26] Check fclose(stdout) at the end of main() Since PR samtools/htslib#1665, hts_open("-", "w") / hts_close() no longer actually closes stdout. Close it at the end of main() so there is an opportunity to detect I/O errors in previously-uncommitted writes. Ignore EBADF as other code may have already closed stdout, e.g., either particular subcommands or when (dynamically) linked against an older version of HTSlib. --- bamtk.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/bamtk.c b/bamtk.c index e05ea1816..221787561 100644 --- a/bamtk.c +++ b/bamtk.c @@ -303,5 +303,14 @@ int main(int argc, char *argv[]) fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; } + + // For subcommands that may have produced substantial output on stdout, + // make a final check for delayed I/O errors. Ignore EBADF as other code + // may have already closed stdout. + if (fclose(stdout) != 0 && errno != EBADF) { + print_error_errno(argv[1], "closing standard output failed"); + return 1; + } + return ret; } From 5d67a96b99fbf4cd270e04975e2edd627a04c261 Mon Sep 17 00:00:00 2001 From: Pierre Lindenbaum Date: Wed, 30 Aug 2023 16:18:59 +0200 Subject: [PATCH 07/26] new option --plot-depth for samtools coverage --- coverage.c | 38 ++++++++++++++++++++++++++++---------- doc/samtools-coverage.1 | 21 +++++++++++++++++++++ 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/coverage.c b/coverage.c index dedaa8e99..f9546b799 100644 --- a/coverage.c +++ b/coverage.c @@ -120,6 +120,7 @@ static int usage() { " effectively removing any depth limit.\n" "Output options:\n" " -m, --histogram show histogram instead of tabular output\n" + " -D, --plot-depth plot depth instead of tabular output\n" " -A, --ascii show only ASCII characters in histogram\n" " -o, --output FILE write output to FILE [stdout]\n" " -H, --no-header don't print a header in tabular mode\n" @@ -209,7 +210,7 @@ void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *s } void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid, const uint32_t *hist, - const int hist_size, const bool full_utf) { + const int hist_size, const bool full_utf, bool plot_coverage) { int i, col; bool show_percentiles = false; const int n_rows = 10; @@ -221,7 +222,7 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in double hist_data[hist_size]; double max_val = 0.0; for (i = 0; i < hist_size; ++i) { - hist_data[i] = 100 * hist[i] / (double) stats[tid].bin_width; + hist_data[i] = (plot_coverage?1:100) * hist[i] / (double) stats[tid].bin_width; if (hist_data[i] > max_val) max_val = hist_data[i]; } @@ -231,7 +232,10 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in double row_bin_size = max_val / (double) n_rows; for (i = n_rows-1; i >= 0; --i) { double current_bin = row_bin_size * i; - if (show_percentiles) { + if (plot_coverage) { + fprintf(file_out, ">%8.1f ",i*row_bin_size); + } + else if (show_percentiles) { fprintf(file_out, ">%3i%% ", i*10); } else { fprintf(file_out, ">%7.2f%% ", current_bin); @@ -265,7 +269,13 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads); break; case 1: fprintf(file_out, "Histo bin width: %sbp", readable_bps(stats[tid].bin_width, buf)); break; - case 0: fprintf(file_out, "Histo max bin: %.5g%%", max_val); break; + case 0: if (plot_coverage) { + fprintf(file_out, "Histo max cov: %.5g", max_val); + } else { + fprintf(file_out, "Histo max bin: %.5g%%", max_val); + } + break; + }; fputc('\n', file_out); } @@ -313,6 +323,7 @@ int main_coverage(int argc, char *argv[]) { bool opt_print_header = true; bool opt_print_tabular = true; bool opt_print_histogram = false; + bool opt_plot_coverage = false; bool opt_full_utf = true; FILE *file_out = stdout; @@ -332,6 +343,7 @@ int main_coverage(int argc, char *argv[]) { {"min-bq", required_argument, NULL, 'Q'}, {"histogram", no_argument, NULL, 'm'}, {"ascii", no_argument, NULL, 'A'}, + {"plot-depth", no_argument, NULL, 'D'}, {"output", required_argument, NULL, 'o'}, {"no-header", no_argument, NULL, 'H'}, {"n-bins", required_argument, NULL, 'w'}, @@ -344,7 +356,7 @@ int main_coverage(int argc, char *argv[]) { // parse the command line int c; opterr = 0; - while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:", lopts, NULL)) != -1) { + while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:D", lopts, NULL)) != -1) { switch (c) { case 1: if ((required_flags = bam_str2flag(optarg)) < 0) { @@ -368,6 +380,10 @@ int main_coverage(int argc, char *argv[]) { case 'A': opt_full_utf = false; opt_print_histogram = true; opt_print_tabular = false; break; + case 'D': opt_print_histogram = true; + opt_print_tabular = false; + opt_plot_coverage = true; + break; case 'H': opt_print_header = false; break; case 'h': return usage(); default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; @@ -560,7 +576,7 @@ int main_coverage(int argc, char *argv[]) { if (tid != old_tid) { // Next target sequence if (old_tid >= 0) { if (opt_print_histogram) { - print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf); + print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf, opt_plot_coverage); fputc('\n', file_out); } else if (opt_print_tabular) { print_tabular_line(file_out, h, stats, old_tid); @@ -604,12 +620,14 @@ int main_coverage(int argc, char *argv[]) { count_base = true; stats[tid].summed_coverage += depth_at_pos; } - // hist[current_bin] += depth_at_pos; // Add counts to the histogram here to have one based on coverage - //fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output + + if(current_bin < n_bins && opt_plot_coverage) { + hist[current_bin] += depth_at_pos; + } } if (count_base) { stats[tid].n_covered_bases++; - if (opt_print_histogram && current_bin < n_bins) + if (opt_print_histogram && current_bin < n_bins && !opt_plot_coverage) ++(hist[current_bin]); // Histogram based on breadth of coverage } } @@ -620,7 +638,7 @@ int main_coverage(int argc, char *argv[]) { if (tid < n_targets && tid >=0) { if (opt_print_histogram) { - print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf); + print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf, opt_plot_coverage); } else if (opt_print_tabular) { print_tabular_line(file_out, h, stats, tid); } diff --git a/doc/samtools-coverage.1 b/doc/samtools-coverage.1 index cf71d50be..304ee30dc 100644 --- a/doc/samtools-coverage.1 +++ b/doc/samtools-coverage.1 @@ -102,6 +102,9 @@ Output options: .TP 8 .BI -m,\ --histogram Show histogram instead of tabular output. +.BI -D,\ --plot-depth +Show histogram instead of tabular output but displays the depth of coverage instead of the percent of coverage. +This option can be used to visualize copy number varaiations in the terminal.. .TP .BI -A,\ --ascii Show only ASCII characters in histogram using colon and fullstop for @@ -154,6 +157,24 @@ chr1 (249.25Mbp) 1.00M 4.44M 7.87M 12.00M .EE +.EX 2 +samtools coverage -m -r 'chr1:24500000-25600000' --plot-depth -w 32 -A input.bam + +chr1 (249.25Mbp) +> 38.8 | .::::::: | Number of reads: 283218 +> 34.5 | :::::::: | (3327 filtered) +> 30.2 | :::::::::. | Covered bases: 1.10Mbp +> 25.9 |.:::::.:.::::::::::::::::::::::.| Percent covered: 99.83% +> 21.6 |::::::::::::::::::::::::::::::::| Mean coverage: 33.2x +> 17.2 |::::::::::::::::::::::::::::::::| Mean baseQ: 37.2 +> 12.9 |::::::::::::::::::::::::::::::::| Mean mapQ: 59.3 +> 8.6 |::::::::::::::::::::::::::::::::| +> 4.3 |::::::::::::::::::::::::::::::::| Histo bin width: 34.5Kbp +> 0.0 |::::::::::::::::::::::::::::::::| Histo max cov: 43.117 + 24.50M 24.84M 25.19M 25.60M + +.EE + .SH AUTHOR .PP From 38e45176de8d8d65c3ba144b131381b55121bf16 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Fri, 8 Sep 2023 14:15:09 +0100 Subject: [PATCH 08/26] Minor layout changes and added const modifier. --- coverage.c | 5 ++--- doc/samtools-coverage.1 | 5 +++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/coverage.c b/coverage.c index f9546b799..68dd6be5e 100644 --- a/coverage.c +++ b/coverage.c @@ -210,7 +210,7 @@ void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *s } void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid, const uint32_t *hist, - const int hist_size, const bool full_utf, bool plot_coverage) { + const int hist_size, const bool full_utf, const bool plot_coverage) { int i, col; bool show_percentiles = false; const int n_rows = 10; @@ -234,8 +234,7 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in double current_bin = row_bin_size * i; if (plot_coverage) { fprintf(file_out, ">%8.1f ",i*row_bin_size); - } - else if (show_percentiles) { + } else if (show_percentiles) { fprintf(file_out, ">%3i%% ", i*10); } else { fprintf(file_out, ">%7.2f%% ", current_bin); diff --git a/doc/samtools-coverage.1 b/doc/samtools-coverage.1 index 304ee30dc..3da80abdd 100644 --- a/doc/samtools-coverage.1 +++ b/doc/samtools-coverage.1 @@ -102,9 +102,10 @@ Output options: .TP 8 .BI -m,\ --histogram Show histogram instead of tabular output. +.TP .BI -D,\ --plot-depth -Show histogram instead of tabular output but displays the depth of coverage instead of the percent of coverage. -This option can be used to visualize copy number varaiations in the terminal.. +As above but displays the depth of coverage instead of the percent of coverage. +This option can be used to visualize copy number variations in the terminal. .TP .BI -A,\ --ascii Show only ASCII characters in histogram using colon and fullstop for From 007d83e8b63737d8a08060723a41eba4a1df3f7a Mon Sep 17 00:00:00 2001 From: vasudeva8 Date: Tue, 12 Sep 2023 09:57:22 +0100 Subject: [PATCH 09/26] mpileup manual updated with samples --- doc/samtools-mpileup.1 | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/doc/samtools-mpileup.1 b/doc/samtools-mpileup.1 index 35b14d4d4..2eb29ee7a 100644 --- a/doc/samtools-mpileup.1 +++ b/doc/samtools-mpileup.1 @@ -76,6 +76,7 @@ could be specified using \fB-r 20 -l chr20.bed\fR, meaning that the index is used to find chromosome 20 and then it is filtered for the regions listed in the bed file. +Unmapped reads are not considered and are always discarded. By default secondary alignments, QC failures and duplicate reads will be omitted, along with low quality bases and some reads in high depth regions. See the \fB--ff\fR, \fB-Q\fR and \fB-d\fR options for @@ -283,8 +284,8 @@ none of the mask bits set. Hence this does not override the .TP .BI --ff,\ --excl-flags \ STR|INT Filter flags: skip reads with any of the mask bits set. This defaults -to UNMAP,SECONDARY,QCFAIL,DUP. The option is not accumulative, so -specifying e.g. \fB--ff UNMAP,QCFAIL\fR will reenable output of +to SECONDARY,QCFAIL,DUP. The option is not accumulative, so +specifying e.g. \fB--ff QCFAIL\fR will reenable output of secondary and duplicate alignments. Note this does not override the \fB--incl-flags\fR option. .TP @@ -482,7 +483,42 @@ The .B -E option can be used to make it ignore the contents of the BQ:Z tag and force it to recalculate the BAQ scores by making a new alignment. - +.PP +.SH EXAMPLES +Using range: +With implicit index files in1.bam. and in2.sam.gz., +.EX 2 +samtools mpileup in1.bam in2.sam.gz -r chr10:100000-200000 +.EE +With explicit index files, +.EX 2 +samtools mpileup in1.bam in2.sam.gz idx/in1.csi idx/in2.csi -X -r chr10:100000-200000 +.EE +With fofn being a file of input file names, and implicit index files present with inputs, +.EX 2 +samtools mpileup -b fofn -r chr10:100000-200000 +.EE +Using flags: +To get reads with flags READ2 or REVERSE and not having any of SECONDARY,QCFAIL,DUP, +.EX 2 +samtools mpileup --rf READ2,REVERSE in.sam +.EE +or +.EX 2 +samtools mpileup --rf 144 in.sam +.EE +To get reads with flag SECONDARY, +.EX 2 +samtools mpileup --rf SECONDARY --ff QCFAIL,DUP in.sam +.EE +Using all possible alignmentes: +To show all possible alignments, either of below two equivalent commands may be used, +.EX 2 +samtools mpileup --count-orphans --no-BAQ --max-depth 0 --fasta-ref ref_file.fa \\ +--min-BQ 0 --excl-flags 0 --disable-overlap-removal in.sam + +samtools mpileup -A -B -d 0 -f ref_file.fa -Q 0 --ff 0 -x in.sam +.EE .SH AUTHOR .PP Written by Heng Li from the Sanger Institute. From 379f1a626167acc0c68c215e5e252daea19a2826 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 20 Sep 2023 18:01:29 +1200 Subject: [PATCH 10/26] Use the spec-defined PL values in man page example Spell these values as ILLUMINA and LS454, as per the SAM specification. The Perl command shown interpolates `@RG` as an undefined array; instead use printf(1) to avoid any such issues. --- doc/samtools-merge.1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/samtools-merge.1 b/doc/samtools-merge.1 index 2978afc63..c92811a91 100644 --- a/doc/samtools-merge.1 +++ b/doc/samtools-merge.1 @@ -201,7 +201,7 @@ Attach the .B RG tag while merging sorted alignments: .EX 2 -perl -e 'print "@RG\\tID:ga\\tSM:hs\\tLB:ga\\tPL:Illumina\\n@RG\\tID:454\\tSM:hs\\tLB:454\\tPL:454\\n"' > rg.txt +printf '@RG\\tID:ga\\tSM:hs\\tLB:ga\\tPL:ILLUMINA\\n@RG\\tID:454\\tSM:hs\\tLB:454\\tPL:LS454\\n' > rg.txt samtools merge -rh rg.txt merged.bam ga.bam 454.bam .EE The value in a From 147ac2f83857d6209e344a77a2ae74324975fc67 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 3 Oct 2023 16:14:05 +0100 Subject: [PATCH 11/26] Remove -5 option from samtools consensus. This option referred to the original name of the "gap5" mode, but it's now just called "-m bayesian" and is the default anyway. The -5 option produces an error now. --- doc/samtools-consensus.1 | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/doc/samtools-consensus.1 b/doc/samtools-consensus.1 index 683b9b80e..2de5774ee 100644 --- a/doc/samtools-consensus.1 +++ b/doc/samtools-consensus.1 @@ -264,13 +264,9 @@ output "N". .TP 0 The following options apply only to Bayesian consensus mode enabled -with the \fB-5\fR option. +(default on). .TP 10 -.B -5 -Enable Bayesian consensus algorithm. - -.TP .BI "-C " C ", --cutoff " C Only used for the Gap5 consensus mode, which produces a Phred style score for the final consensus quality. If this is below From 04d53eba2533eba1fd8090ef448c6628f3d83c81 Mon Sep 17 00:00:00 2001 From: Vasudeva Easwara Sarma Date: Thu, 19 Oct 2023 12:14:56 +0100 Subject: [PATCH 12/26] treats empty barcode as no barcode - avoids issues with realloc --- stats.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/stats.c b/stats.c index 44783a974..1574b3203 100644 --- a/stats.c +++ b/stats.c @@ -766,6 +766,9 @@ static void collect_barcode_stats(bam1_t* bam_line, stats_t* stats) { continue; uint32_t barcode_len = strlen(barcode); + if (!barcode_len) { + continue; //consider 0 size barcode same as no barcode - avoids issues with realloc below + } if (!stats->tags_barcode[tag].nbases) { // tag seen for the first time uint32_t offset = 0; for (i = 0; i < stats->ntags; i++) From 7cce9186eb2a1d453ce2f55a6fba99b10906bfb6 Mon Sep 17 00:00:00 2001 From: erboone Date: Wed, 30 Aug 2023 13:39:15 -0700 Subject: [PATCH 13/26] Add plot of read lengths ("RL") from samtools stats output. Fixes #1824 --- misc/plot-bamstats | 133 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) diff --git a/misc/plot-bamstats b/misc/plot-bamstats index d07c0b1e7..3231ff125 100755 --- a/misc/plot-bamstats +++ b/misc/plot-bamstats @@ -25,6 +25,8 @@ use strict; use warnings; use Carp; +use POSIX qw/floor/; +use List::Util qw(max); my $opts = parse_params(); parse_bamcheck($opts); @@ -39,6 +41,7 @@ plot_coverage($opts); plot_mismatches_per_cycle($opts); plot_indel_dist($opts); plot_indel_cycles($opts); +plot_read_len_hist($opts); create_html($opts); exit; @@ -1204,6 +1207,136 @@ sub plot_indel_cycles plot($$args{gp}); } +sub plot_read_len_hist +{ + my ($opts) = @_; + + my @rl = get_values($opts, 'RL'); + if ( !@rl ) { return; } + + # TODO: Add modified version for paired end reads. + # my $is_paired = get_value($opts,'SN','reads paired:'); + + my $args = get_defaults($opts,"$$opts{prefix}read_length.png"); + my $fh = $$args{fh}; + + # anonymous subroutines for code readibility + my $log_bin = sub { + my $x = $_[0]; + return floor((log($x) / log(10)) * 10); + }; + my $inv_log_bin = sub { + my $x = $_[0]; + return floor(10**($x/10)); + }; + + # Open files to store data: this is more elegant than copying four sets of + # data into a single GNUplot file. + open(my $rd_fh, '>', "$$opts{prefix}read_len_rawdata.txt"); + open(my $hd_fh, '>', "$$opts{prefix}read_len_histdata.txt"); + + # Create gnuplot command file options + my $min_x = $rl[0][0]; + my $max_x = $rl[-1][0]; + + # Gnuplot commands for normalized binned data + # The normalized plot bins reads by read length with bin widths on a + # logarithmic scale (i.e. bins 10-11 bp, 12-14 bp, 15-18 bp, etc.), then + # bin height is normalized by the width of the bins. + # The end goal is to smooth the distribution of reads compared to plotting + # each length without binning. Visualization of unbinned data is included + # in order to make single value spikes more visible, for example in the + # case that spike ins are used as a control. + print $fh qq[ + set ytics nomirror + set y2tic nomirror + set terminal png size 800,1000 truecolor + set output "$$args{img}" + $$args{grid} + set grid noy2tics + set multiplot layout 2,1 columnsfirst scale 1,0.9 + set autoscale y + set ylabel "Read Count" + set autoscale x + set logscale x + set xrange[$min_x: $max_x] + set xtics rotate by -45 out scale 1 nomirror + set xlabel "Read Length" + set boxwidth .8 relative + set title "$$args{title}" noenhanced + set title offset -15,-.5 + set key at graph 1, 1.15 + plot "$$opts{prefix}read_len_rawdata.txt" using 1:2 lt rgb "orange" with boxes fs solid .7 title 'unbinned', "$$opts{prefix}read_len_histdata.txt" using 4:3 lt rgb "blue" with boxes title 'normalized bins(count/bin width)' axis x1y2 + ]; + print $fh qq[]; + + # Gnuplot commands for non-normalized binned data + # The un-normalized plot has reads assigned to bins identically to the + # normalized plot, however, the bin height is not divided by bin width. The + # goal of this visualization is to more accurately reflect the distribution + # of reads by volume, and will generally collect around the mean read + # length. + # Inclusion of unbinned data is to give point of visual comparison to + # normalized histogram. + print $fh qq[ + set boxwidth 1 relative + plot "$$opts{prefix}read_len_rawdata.txt" using 1:2 lt rgb "orange" with boxes fs solid .7 title 'unbinned', "$$opts{prefix}read_len_histdata.txt" using 4:2 lt rgb "blue" with boxes title 'unnormalized bins' axis x1y2 + ]; + + my @logscale_data = (0)x&$log_bin($max_x); + my $log_x = 0; + my $start_i = 1; + for my $line (@rl) + { + my @deref_line = @{$line}; + my $curr_x = $deref_line[0]; + my $curr_y = $deref_line[1]; + + print $rd_fh "$curr_x $curr_y\n"; + $log_x = max(&$log_bin($curr_x), 5); + @logscale_data[floor($log_x)] += $curr_y; + $start_i = $curr_x + 1; + } + + # Copy binned data into its own file + + # This is a special case to deal with the problem that reads of len < 5 do + # not cooperate with the log10(x)*10 transformation and cause small + # fragments to graph badly. + my $start = 1; + my $end = 3; + my $div_binw_val = $logscale_data[5]/($end - $start); + my $start_idx = -1; + for my $i (0..@logscale_data-1) { + if($logscale_data[$i] != 0) + { + $start_idx = $i; + if ($i == 5) + { + print $hd_fh "5 $logscale_data[5] $div_binw_val $start $end\n"; + $start_idx += 1; + } + last; + } + } + + + for my $i ($start_idx..@logscale_data-1) { + $start = &$inv_log_bin($i); + $end = &$inv_log_bin($i+1); + $div_binw_val = $logscale_data[$i]/max(($end - $start), 1); + print $hd_fh "$i $logscale_data[$i] $div_binw_val $start $end\n"; + } + + # Close file buffers and clean up workspace + close($fh); + close($hd_fh); + close($rd_fh); + plot($$args{gp}); + unlink("$$opts{prefix}read_len_rawdata.txt"); + unlink("$$opts{prefix}read_len_histdata.txt"); +} + sub merge_bamcheck { my ($opts) = @_; From 9a594672ff98c903a642da760cd9bbeb0b15285a Mon Sep 17 00:00:00 2001 From: vasudeva8 Date: Mon, 16 Oct 2023 12:46:47 +0100 Subject: [PATCH 14/26] updated cat to support multiple non-seekable streams --- bam_cat.c | 330 +++++++++++++++++++++++++++++------------------------- 1 file changed, 179 insertions(+), 151 deletions(-) diff --git a/bam_cat.c b/bam_cat.c index ed8cf58c5..4bc845dca 100644 --- a/bam_cat.c +++ b/bam_cat.c @@ -1,6 +1,6 @@ /* bam_cat.c -- efficiently concatenates bam files. - Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021 Genome Research Ltd. + Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021, 2023 Genome Research Ltd. Modified SAMtools work copyright (C) 2010 Illumina, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy @@ -49,52 +49,76 @@ Illumina. #include "samtools.h" #include "sam_opts.h" -/* - * Check the files are consistent and capable of being concatenated. - * Also fills out the version numbers and produces a new sam_hdr_t - * structure with merged RG lines. - * Note it is only a simple merge. - * - * Returns updated header on success; - * NULL on failure. - */ -static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t *h, - int *vers_maj_p, int *vers_min_p) { +/// cat_check_merge_hdr - check compatibility and merge RG hearders merges RGon both CRAM and BAM. +/** @param firstfile - pointer to the 1sr file opened in caller + * @param nfn - number of files to be processed, including the firstfile + * @param fn - array of file paths to be processed + * @param h - sam header pointer which contains explicitly given header + * @param vers_maj_p - cram major version set and send out for output creation + * @param vers_min_p - cram min version set and send out for output creation + * @param out_h - pointer to sam header pointer, outputs the merged header + * returns array of opened samFile pointers on success and NULL on failure + * This method has the merged header processing for cram and bam. + * RG lines are merged for both cram and bam. For cram, version match for each + * file and order match of RG lines are compared as well. + * Note: it is a simple merge of RG lines alone. +*/ +static samFile** cat_check_merge_hdr(samFile * const firstfile, int nfn, char * const *fn, const sam_hdr_t *h, + int *vers_maj_p, int *vers_min_p, sam_hdr_t **out_h) { int i, vers_maj = -1, vers_min = -1; sam_hdr_t *new_h = NULL, *old_h = NULL; samFile *in = NULL; kstring_t ks = KS_INITIALIZE; - - if (h) { - new_h = sam_hdr_dup(h); - if (!new_h) { - fprintf(stderr, "[%s] ERROR: header duplication failed.\n", - __func__); - goto fail; + samFile **files = calloc(nfn, sizeof(samFile *)); + if(!files) { + fprintf(stderr, "[%s] ERROR: failed to allocate space for file handles.\n", __func__); + return NULL; + } + if (!out_h || !firstfile) { + fprintf(stderr, "[%s] ERROR: header check failed.\n", __func__); + goto fail; + } + if (*out_h) { //use header if one is already present + new_h = *out_h; + } + else { + if (h) { //use the explicit header given + new_h = sam_hdr_dup(h); + if (!new_h) { + fprintf(stderr, "[%s] ERROR: header duplication failed.\n", + __func__); + goto fail; + } } } for (i = 0; i < nfn; ++i) { - cram_fd *in_c; int ki; - - in = sam_open(fn[i], "rc"); + //1st file is already open and passed, rest open locally + files[i] = in = i ? sam_open(fn[i], "r") : firstfile; if (in == 0) { print_error_errno("cat", "fail to open file '%s'", fn[i]); goto fail; } - in_c = in->fp.cram; - - int vmaj = cram_major_vers(in_c); - int vmin = cram_minor_vers(in_c); - if ((vers_maj != -1 && vers_maj != vmaj) || - (vers_min != -1 && vers_min != vmin)) { - fprintf(stderr, "[%s] ERROR: input files have differing version numbers.\n", - __func__); + if (firstfile->format.format != in->format.format) { + print_error("cat", "File %s is of different format!", fn[i]); goto fail; } - vers_maj = vmaj; - vers_min = vmin; + if (firstfile->format.format == cram) { //version check for cram + cram_fd *in_c; + in_c = in->fp.cram; + + int vmaj = cram_major_vers(in_c); + int vmin = cram_minor_vers(in_c); + if ((vers_maj != -1 && vers_maj != vmaj) || + (vers_min != -1 && vers_min != vmin)) { + fprintf(stderr, "[%s] ERROR: input files have differing version numbers.\n", + __func__); + goto fail; + } + vers_maj = vmaj; + vers_min = vmin; + } old_h = sam_hdr_read(in); if (!old_h) { @@ -111,10 +135,10 @@ static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t goto fail; } sam_hdr_destroy(old_h); - sam_close(in); + old_h = NULL; continue; } - + //merge RG lines int old_count = sam_hdr_count_lines(old_h, "RG"); for (ki = 0; ki < old_count; ki++) { const char *old_name = sam_hdr_line_name(old_h, "RG", ki); @@ -136,7 +160,8 @@ static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t } } - if (old_count > 1 && sam_hdr_count_lines(new_h, "RG") == old_count) { + if (firstfile->format.format == cram && old_count > 1 && sam_hdr_count_lines(new_h, "RG") == old_count) { + //RG order check for cram for (ki = 0; ki < old_count; ki++) { const char *old_name = sam_hdr_line_name(old_h, "RG", ki); const char *new_name = sam_hdr_line_name(new_h, "RG", ki); @@ -148,72 +173,61 @@ static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t } } - sam_hdr_destroy(old_h); - sam_close(in); + sam_hdr_destroy(old_h); old_h = NULL; } ks_free(&ks); - *vers_maj_p = vers_maj; - *vers_min_p = vers_min; - - return new_h; + if (vers_maj_p) { + *vers_maj_p = vers_maj; + } + if (vers_min_p) { + *vers_min_p = vers_min; + } + *out_h = new_h; + return files; fail: ks_free(&ks); if (old_h) sam_hdr_destroy(old_h); if (new_h) sam_hdr_destroy(new_h); - if (in) sam_close(in); + *out_h = NULL; + for (i = 1; i < nfn; ++i) { //close files other than the firstfile + if (files[i]) { + sam_close(files[i]); + } + } + free(files); return NULL; } - -/* - * CRAM files don't store the RG:Z:ID per read in the aux field. - * Instead they have a numerical data series (RG) to point each read - * back to the Nth @RG line in the file. This means that we may need - * to edit the RG data series (if the files were produced from - * "samtools split" for example). - * - * The encoding method is stored in the compression header. Typical - * examples: - * - * RG => EXTERNAL {18} # Block content-id 18 holds RG values - * # as a series of ITF8 encoded values - * - * RG => HUFFMAN {1, 255, 255, 255, 255, 255, 1, 0} - * # One RG value #-1. (No RG) - * - * RG => HUFFMAN {1, 0, 1, 0} # One RG value #0 (always first RG) - * - * RG => HUFFMAN {2, 0, 1, 2, 1, 1} - * # Two RG values, #0 and #1, written - * # to the CORE block and possibly - * # mixed with other data series. - * - * A single value can (but may not be) implemented as a zero bit - * huffman code. In this situation we can change the meta-data in the - * compression header to renumber an RG value.. - */ -int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, sam_global_args *ga, char *arg_list, int no_pg) +int cram_cat(samFile * const firstfile, int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, sam_global_args *ga, char *arg_list, int no_pg) { - samFile *out; + samFile *out = NULL; cram_fd *out_c; - int i, vers_maj, vers_min; + int i, vers_maj, vers_min, ret = -1; sam_hdr_t *new_h = NULL; + samFile **files = NULL; - /* Check consistent versioning and compatible headers */ - if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &vers_maj, &vers_min))) + /* Check consistent versioning and compatible headers; + merges RG lines, opens all files and returns them that multiple non-seekable + stream inputs can be handled */ + if (!(files = cat_check_merge_hdr(firstfile, nfn, fn, h, &vers_maj, &vers_min, &new_h))) return -1; + if (!new_h) { + print_error_errno("cat", "failed to make output header"); + goto closefiles; + } + /* Open the file with cram_vers */ char vers[100]; - sprintf(vers, "%d.%d", vers_maj, vers_min); + snprintf(vers, sizeof(vers), "%d.%d", vers_maj, vers_min); out = sam_open_format(outcram, "wc", &ga->out); if (out == 0) { print_error_errno("cat", "fail to open output file '%s'", outcram); - return -1; + goto closefiles; } out_c = out->fp.cram; cram_set_option(out_c, CRAM_OPT_VERSION, vers); @@ -224,12 +238,13 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, arg_list ? "CL": NULL, arg_list ? arg_list : NULL, NULL)) - return -1; + goto closefiles; if (sam_hdr_write(out, new_h) < 0) { print_error_errno("cat", "Couldn't write header"); - return -1; + goto closefiles; } + out_c = out->fp.cram; for (i = 0; i < nfn; ++i) { samFile *in; @@ -238,17 +253,17 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, sam_hdr_t *old_h; int new_rg = -1; - in = sam_open(fn[i], "rc"); + in = files[i]; if (in == 0) { print_error_errno("cat", "fail to open file '%s'", fn[i]); - return -1; + goto closefiles; } in_c = in->fp.cram; old_h = sam_hdr_read(in); if (!old_h) { print_error("cat", "fail to read the header of file '%s'", fn[i]); - return -1; + goto closefiles; } // Compute RG mapping if suitable for changing. @@ -258,11 +273,11 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, new_rg = sam_hdr_line_index(new_h, "RG", old_name); if (new_rg < 0) { print_error("cat", "fail to find @RG line '%s' in the new header", old_name); - return -1; + goto closefiles; } } else { print_error("cat", "fail to find @RG line in file '%s'", fn[i]); - return -1; + goto closefiles; } } else { new_rg = 0; @@ -274,7 +289,7 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, cram_block *blk; // Container compression header if (!(blk = cram_read_block(in_c))) - return -1; + goto closefiles; cram_free_block(blk); cram_free_container(c); continue; @@ -292,34 +307,42 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, // Not switching rg so do the usual read/write loop if (cram_write_container(out_c, c) != 0) - return -1; + goto closefiles; // Container compression header if (!(blk = cram_read_block(in_c))) - return -1; + goto closefiles; if (cram_write_block(out_c, blk) != 0) { cram_free_block(blk); - return -1; + goto closefiles; } cram_free_block(blk); - // Container num_blocks can be invalid, due to a bug. // Instead we iterate in slice context instead. (void)cram_container_get_landmarks(c, &num_slices); cram_copy_slice(in_c, out_c, num_slices); } - cram_free_container(c); } - sam_hdr_destroy(old_h); - sam_close(in); } - sam_close(out); - sam_hdr_destroy(new_h); + ret = 0; - return 0; +closefiles: + if (out) { + sam_close(out); + } + if (new_h) { + sam_hdr_destroy(new_h); + } + for (i = 1; i < nfn; ++i) { //skip firstfile and close rest + if (files[i]) { + sam_close(files[i]); + } + } + free(files); + return ret; } @@ -330,31 +353,40 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, #define BGZF_EMPTY_BLOCK_SIZE 28 -int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *arg_list, int no_pg) +int bam_cat(samFile * const firstfile, int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *arg_list, int no_pg) { - BGZF *fp, *in = NULL; + BGZF *fp = NULL, *in = NULL; uint8_t *buf = NULL; uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; const int es=BGZF_EMPTY_BLOCK_SIZE; int i; + samFile **files = NULL; + sam_hdr_t *new_h = NULL; + /* merges RG lines, opens all files and returns them that multiple non-seekable + stream inputs can be handled */ + if (!(files = cat_check_merge_hdr(firstfile, nfn, fn, h, NULL, NULL, &new_h))) + return -1; + if (!new_h) { + print_error_errno("cat", "failed to make output header"); + goto fail; + } fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w"); if (fp == 0) { print_error_errno("cat", "fail to open output file '%s'", outbam); - return -1; + goto fail; } - if (h) { - if (!no_pg && sam_hdr_add_pg(h, "samtools", - "VN", samtools_version(), - arg_list ? "CL": NULL, - arg_list ? arg_list : NULL, - NULL)) - goto fail; - if (bam_hdr_write(fp, h) < 0) { - print_error_errno("cat", "Couldn't write header"); - goto fail; - } + if (!no_pg && sam_hdr_add_pg(new_h, "samtools", + "VN", samtools_version(), + arg_list ? "CL": NULL, + arg_list ? arg_list : NULL, + NULL)) + goto fail; + + if (bam_hdr_write(fp, new_h) < 0) { + print_error_errno("cat", "Couldn't write header"); + goto fail; } buf = (uint8_t*) malloc(BUF_SIZE); @@ -363,35 +395,13 @@ int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *a goto fail; } for(i = 0; i < nfn; ++i){ - sam_hdr_t *old; int len,j; - - in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r"); + in = files[i]->fp.bgzf; if (in == 0) { print_error_errno("cat", "fail to open file '%s'", fn[i]); goto fail; } - if (in->is_write) return -1; - - old = bam_hdr_read(in); - if (old == NULL) { - fprintf(stderr, "[%s] ERROR: couldn't read header for '%s'.\n", - __func__, fn[i]); - goto fail; - } - if (h == 0 && i == 0) { - if (!no_pg && sam_hdr_add_pg(old, "samtools", - "VN", samtools_version(), - arg_list ? "CL": NULL, - arg_list ? arg_list : NULL, - NULL)) - goto fail; - - if (bam_hdr_write(fp, old) < 0) { - print_error_errno("cat", "Couldn't write header"); - goto fail; - } - } + if (in->is_write) goto fail; if (in->block_offset < in->block_length) { if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail; @@ -432,27 +442,42 @@ int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *a if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail; } } - sam_hdr_destroy(old); - bgzf_close(in); in = NULL; } free(buf); if (bgzf_close(fp) < 0) { fprintf(stderr, "[%s] Error on closing '%s'.\n", __func__, outbam); - return -1; + goto fail; + } + for (i = 1; i < nfn; ++i) { //skip firstfile and close rest + if (files[i]) { + sam_close(files[i]); + } } + free(files); + sam_hdr_destroy(new_h); return 0; write_fail: fprintf(stderr, "[%s] Error writing to '%s'.\n", __func__, outbam); fail: - if (in) bgzf_close(in); + if (new_h) { + sam_hdr_destroy(new_h); + } if (fp) bgzf_close(fp); free(buf); + + if (files) { + for(i = 1; i < nfn; ++i) { //except the firstfile + if(files[i]) { + sam_close(files[i]); + } + } + free(files); + } return -1; } - int main_cat(int argc, char *argv[]) { sam_hdr_t *h = 0; @@ -472,21 +497,22 @@ int main_cat(int argc, char *argv[]) char *arg_list = NULL; sam_global_args_init(&ga); - while ((c = getopt_long(argc, argv, "h:o:b:@:", lopts, NULL)) >= 0) { switch (c) { case 'h': { samFile *fph = sam_open(optarg, "r"); if (fph == 0) { fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, optarg); - return 1; + ret = 1; + goto end; } h = sam_hdr_read(fph); if (h == NULL) { fprintf(stderr, "[%s] ERROR: failed to read the header from '%s'.\n", __func__, optarg); - return 1; + ret = 1; + goto end; } sam_close(fph); break; @@ -521,7 +547,8 @@ int main_cat(int argc, char *argv[]) if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { print_error("cat", "failed to create arg_list"); - return 1; + ret = 1; + goto end; } // Append files specified in argv to the list. @@ -541,34 +568,34 @@ int main_cat(int argc, char *argv[]) fprintf(stderr, " -h FILE copy the header from FILE [default is 1st input file]\n"); fprintf(stderr, " -o FILE output BAM/CRAM\n"); fprintf(stderr, " --no-PG do not add a PG line\n"); - sam_global_opt_help(stderr, "--..-@-."); - return 1; + sam_global_opt_help(stderr, "---.-@-."); + ret = 1; + goto end; } in = sam_open(infns[0], "r"); if (!in) { print_error_errno("cat", "failed to open file '%s'", infns[0]); - return 1; + ret = 1; + goto end; } switch (hts_get_format(in)->format) { case bam: - sam_close(in); - if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", arg_list, no_pg) < 0) + if (bam_cat(in, infns_size+nargv_fns, infns, h, outfn? outfn : "-", arg_list, no_pg) < 0) ret = 1; break; case cram: - sam_close(in); - if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", &ga, arg_list, no_pg) < 0) + if (cram_cat(in, infns_size+nargv_fns, infns, h, outfn? outfn : "-", &ga, arg_list, no_pg) < 0) ret = 1; break; default: - sam_close(in); fprintf(stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__); - return 1; + ret = 1; } + sam_close(in); end: if (infns_size > 0) { @@ -582,6 +609,7 @@ int main_cat(int argc, char *argv[]) free(arg_list); if (h) sam_hdr_destroy(h); + sam_global_args_free(&ga); return ret; } From 57de9c2fb201887f72aba4097233e173fd046d5b Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Tue, 17 Mar 2020 14:53:06 +0000 Subject: [PATCH 15/26] Add -d TAG option, which allows read split by custom aux tags. Reworked by rmd to use the existing rg arrays for both split by read group, and split by alternate tags. The original used a separate hash table for the `-d` option. Co-authored-by: Rob Davies --- bam_split.c | 260 +++++++++++++++++++++++++++++++++++-------- doc/samtools-split.1 | 8 +- 2 files changed, 218 insertions(+), 50 deletions(-) diff --git a/bam_split.c b/bam_split.c index e9f0fb591..67ef4735c 100644 --- a/bam_split.c +++ b/bam_split.c @@ -48,6 +48,7 @@ struct parsed_opts { const char *unaccounted_header_name; const char *unaccounted_name; const char *output_format_string; + const char *tag; bool verbose; int no_pg; sam_global_args ga; @@ -61,11 +62,11 @@ struct state { samFile* unaccounted_file; sam_hdr_t* unaccounted_header; size_t output_count; - char** rg_id; + char **rg_id; char **rg_index_file_name; char **rg_output_file_name; - samFile** rg_output_file; - sam_hdr_t** rg_output_header; + samFile **rg_output_file; + sam_hdr_t **rg_output_header; kh_c2i_t* rg_hash; htsThreadPool p; int write_index; @@ -85,6 +86,7 @@ static void usage(FILE *write_to) " -f STRING output filename format string [\"%%*_%%#.%%.\"]\n" " -u FILE1 put reads with no RG tag or an unrecognised RG tag in FILE1\n" " -h FILE2 ... and override the header with FILE2 (-u file only)\n" +" -d TAG split by TAG value. TAG value must be a string.\n" " -v verbose output\n" " --no-PG do not add a PG line\n"); sam_global_opt_help(write_to, "-....@.."); @@ -104,7 +106,7 @@ static parsed_opts_t* parse_args(int argc, char** argv) { if (argc == 1) { usage(stdout); return NULL; } - const char *optstring = "vf:h:u:@:"; + const char *optstring = "vf:h:u:d:@:"; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'), @@ -132,6 +134,10 @@ static parsed_opts_t* parse_args(int argc, char** argv) case 'u': retval->unaccounted_name = optarg; break; + case 'd': + retval->tag = optarg; + retval->output_format_string = "%*_%!.%."; + break; case 1: retval->no_pg = 1; break; @@ -275,6 +281,133 @@ static int header_compatible(sam_hdr_t *hdr1, sam_hdr_t *hdr2) return 0; } +static int grow_output_lists(state_t *state, size_t count) { + char **new_list = realloc(state->rg_id, count * sizeof(char *)); + if (!new_list) + return -1; + state->rg_id = new_list; + new_list = realloc(state->rg_index_file_name, count * sizeof(char *)); + if (!new_list) + return -1; + state->rg_index_file_name = new_list; + new_list = realloc(state->rg_output_file_name, count * sizeof(char *)); + if (!new_list) + return -1; + state->rg_output_file_name = new_list; + samFile **new_file = realloc(state->rg_output_file, + count * sizeof(samFile *)); + if (!new_file) + return -1; + state->rg_output_file = new_file; + sam_hdr_t **new_hdr = realloc(state->rg_output_header, + count * sizeof(sam_hdr_t *)); + if (!new_hdr) + return -1; + state->rg_output_header = new_hdr; + return 0; +} + +static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state, + const char *tag, const char *arg_list) { + char *input_base_name = NULL, *new_file_name = NULL, *tag_key = NULL; + char *new_idx_fn = NULL; + sam_hdr_t *new_hdr = NULL; + samFile *new_sam_file = NULL; + + khiter_t i = kh_get_c2i(state->rg_hash, tag); + if (i != kh_end(state->rg_hash)) { + return i; + } + // create new file + if (grow_output_lists(state, state->output_count + 1) != 0) { + print_error_errno("split", "Couldn't grow output lists"); + return kh_end(state->rg_hash); + } + tag_key = strdup(tag); + if (!tag_key) { + print_error_errno("split", "Couldn't copy tag value"); + return kh_end(state->rg_hash); + } + char *dirsep = strrchr(opts->merged_input_name, '/'); + input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name); + if (!input_base_name) { + print_error_errno("split", "Filename parsing failed"); + goto fail; + } + + char* extension = strrchr(input_base_name, '.'); + if (extension) *extension = '\0'; + + new_file_name = expand_format_string(opts->output_format_string, input_base_name, tag, 0, &opts->ga.out); + if (!new_file_name) { + print_error_errno("split", "Filename creation failed"); + goto fail; + } + + new_hdr = sam_hdr_dup(state->merged_input_header); + if (!new_hdr) { + print_error_errno("split", "Duplicating header for file \"%s\" failed", new_file_name); + goto fail; + } + if (!opts->no_pg && sam_hdr_add_pg(new_hdr, "samtools", + "VN", samtools_version(), + arg_list ? "CL": NULL, + arg_list ? arg_list : NULL, + NULL)) { + print_error_errno("split", "Adding PG line to file \"%s\" failed", new_file_name); + goto fail; + } + + char outmode[4] = "w"; + sam_open_mode(outmode + 1, new_file_name, NULL); + new_sam_file = sam_open_format(new_file_name, outmode, &opts->ga.out); + if (!new_sam_file) { + print_error_errno("split", "Opening filename for writing \"%s\" failed", new_file_name); + goto fail; + } + if (state->p.pool) + hts_set_opt(new_sam_file, HTS_OPT_THREAD_POOL, &state->p); + + if (state->write_index) { + new_idx_fn = auto_index(new_sam_file, new_file_name, new_hdr); + if (!new_idx_fn) { + print_error_errno("split", "Creating index file for file \"%s\" failed", new_file_name); + goto fail; + } + } + + if (sam_hdr_write(new_sam_file, new_hdr) != 0) { + print_error_errno("split", "Couldn't write header to \"%s\"", + new_file_name); + goto fail; + } + int ret = -1; + i = kh_put_c2i(state->rg_hash, tag_key, &ret); + if (ret < 0) { + print_error_errno("split", "Adding file \"%s\" failed", new_file_name); + goto fail; + } + + kh_val(state->rg_hash, i) = state->output_count; + state->rg_id[state->output_count] = tag_key; + state->rg_index_file_name[state->output_count] = new_idx_fn; + state->rg_output_file_name[state->output_count] = new_file_name; + state->rg_output_file[state->output_count] = new_sam_file; + state->rg_output_header[state->output_count] = new_hdr; + state->output_count++; + free(input_base_name); + return i; + + fail: + free(input_base_name); + free(new_file_name); + free(tag_key); + free(new_idx_fn); + sam_hdr_destroy(new_hdr); + sam_close(new_sam_file); + return kh_end(state->rg_hash); +} + // Set the initial state static state_t* init(parsed_opts_t* opts, const char *arg_list) { @@ -306,6 +439,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) cleanup_state(retval, false); return NULL; } + retval->write_index = opts->ga.write_index; if (opts->unaccounted_name) { if (opts->unaccounted_header_name) { @@ -331,10 +465,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) } else { retval->unaccounted_header = sam_hdr_dup(retval->merged_input_header); if (!opts->no_pg && sam_hdr_add_pg(retval->unaccounted_header, "samtools", - "VN", samtools_version(), - arg_list ? "CL": NULL, - arg_list ? arg_list : NULL, - NULL)) { + "VN", samtools_version(), + arg_list ? "CL": NULL, + arg_list ? arg_list : NULL, + NULL)) { print_error("split", "Could not rewrite header for \"%s\"", opts->unaccounted_name); cleanup_state(retval, false); return NULL; @@ -354,9 +488,19 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) hts_set_opt(retval->unaccounted_file, HTS_OPT_THREAD_POOL, &retval->p); } - // Open output files for RGs - if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL; - if (opts->verbose) fprintf(stderr, "@RG's found %zu\n",retval->output_count); + int is_rg = !opts->tag || strcmp(opts->tag, "RG") == 0; + if (is_rg) { + if (!count_RG(retval->merged_input_header, + &retval->output_count, &retval->rg_id)) { + cleanup_state(retval, false); + return NULL; + } + if (opts->verbose) + fprintf(stderr, "@RG's found %zu\n",retval->output_count); + } else { + retval->output_count = 0; + } + // Prevent calloc(0, size); size_t num = retval->output_count ? retval->output_count : 1; retval->rg_index_file_name = (char **)calloc(num, sizeof(char *)); @@ -370,7 +514,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) cleanup_state(retval, false); return NULL; } + if (!is_rg) + return retval; // Done for this case - outputs will be opened later + // Open output files for RGs char* dirsep = strrchr(opts->merged_input_name, '/'); char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name); if (!input_base_name) { @@ -398,7 +545,6 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) } retval->rg_output_file_name[i] = output_filename; - sam_open_mode(outmode + 1, output_filename, NULL); retval->rg_output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out); @@ -425,12 +571,12 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) // Set and edit header retval->rg_output_header[i] = sam_hdr_dup(retval->merged_input_header); if (sam_hdr_remove_except(retval->rg_output_header[i], "RG", "ID", retval->rg_id[i]) || - (!opts->no_pg && - sam_hdr_add_pg(retval->rg_output_header[i], "samtools", - "VN", samtools_version(), - arg_list ? "CL": NULL, - arg_list ? arg_list : NULL, - NULL))) { + (!opts->no_pg && + sam_hdr_add_pg(retval->rg_output_header[i], "samtools", + "VN", samtools_version(), + arg_list ? "CL": NULL, + arg_list ? arg_list : NULL, + NULL))) { print_error("split", "Could not rewrite header for \"%s\"", output_filename); cleanup_state(retval, false); free(input_base_name); @@ -439,34 +585,17 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) } free(input_base_name); - retval->write_index = opts->ga.write_index; return retval; } -static bool split(state_t* state) +static bool split(state_t* state, parsed_opts_t *opts, char *arg_list) { + int is_rg = !opts->tag || strcmp(opts->tag, "RG") == 0; if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) { print_error_errno("split", "Could not write output file header"); return false; } - size_t i; - for (i = 0; i < state->output_count; i++) { - if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) { - print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]); - return false; - } - if (state->write_index) { - state->rg_index_file_name[i] = auto_index(state->rg_output_file[i], - state->rg_output_file_name[i], - state->rg_output_header[i]); - if (!state->rg_index_file_name[i]) { - print_error_errno("split", "Could not create index for file \"%s\"", state->rg_output_file_name[i]); - return false; - } - } - } - bam1_t* file_read = bam_init1(); // Read the first record int r; @@ -480,25 +609,54 @@ static bool split(state_t* state) } } + if (is_rg) { + size_t i; + for (i = 0; i < state->output_count; i++) { + if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) { + print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]); + goto error; + } + if (state->write_index) { + state->rg_index_file_name[i] = auto_index(state->rg_output_file[i], + state->rg_output_file_name[i], + state->rg_output_header[i]); + if (!state->rg_index_file_name[i]) { + print_error_errno("split", "Could not create index for file \"%s\"", state->rg_output_file_name[i]); + goto error; + } + } + } + } while (file_read != NULL) { // Get RG tag from read and look it up in hash to find file to output it to - uint8_t* tag = bam_aux_get(file_read, "RG"); + uint8_t* tag = bam_aux_get(file_read, is_rg ? "RG" : opts->tag); + char *val = tag ? bam_aux2Z(tag) : NULL; khiter_t iter; - if ( tag != NULL ) { - char* rg = bam_aux2Z(tag); - iter = kh_get_c2i(state->rg_hash, rg); + if ( val != NULL ) { + iter = kh_get_c2i(state->rg_hash, val); } else { iter = kh_end(state->rg_hash); } + if (!is_rg && val && iter == kh_end(state->rg_hash)) { + // Need to open a new output file + iter = prep_sam_file(opts, state, val, arg_list); + if (iter == kh_end(state->rg_hash)) { // Open failed + print_error("split", + "Could not create output file for tag \"%s:%s\"", + opts->tag, bam_aux2Z(tag)); + goto error; + + } + } + // Write the read out to correct file if (iter != kh_end(state->rg_hash)) { // if found write to the appropriate untangled bam int i = kh_val(state->rg_hash,iter); if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) { print_error_errno("split", "Could not write to \"%s\"", state->rg_output_file_name[i]); - bam_destroy1(file_read); - return false; + goto error; } } else { // otherwise write to the unaccounted bam if there is one or fail @@ -506,15 +664,14 @@ static bool split(state_t* state) if (tag) { fprintf(stderr, "Read \"%s\" with unaccounted for tag \"%s\".\n", bam_get_qname(file_read), bam_aux2Z(tag)); } else { - fprintf(stderr, "Read \"%s\" has no RG tag.\n", bam_get_qname(file_read)); + fprintf(stderr, "Read \"%s\" has no %s tag.\n", + bam_get_qname(file_read), is_rg ? "RG" : opts->tag); } - bam_destroy1(file_read); - return false; + goto error; } else { if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) { print_error_errno("split", "Could not write to unaccounted output file"); - bam_destroy1(file_read); - return false; + goto error; } } } @@ -532,6 +689,7 @@ static bool split(state_t* state) } if (state->write_index) { + size_t i; for (i = 0; i < state->output_count; i++) { if (sam_idx_save(state->rg_output_file[i]) < 0) { print_error_errno("split", "writing index failed"); @@ -542,6 +700,9 @@ static bool split(state_t* state) } return true; +error: + bam_destroy1(file_read); + return false; } static int cleanup_state(state_t* status, bool check_close) @@ -577,6 +738,7 @@ static int cleanup_state(state_t* status, bool check_close) free(status->rg_output_file_name); free(status->rg_index_file_name); kh_destroy_c2i(status->rg_hash); + free(status->rg_id); if (status->p.pool) hts_tpool_destroy(status->p.pool); @@ -603,7 +765,7 @@ int main_split(int argc, char** argv) state_t* status = init(opts, arg_list); if (!status) goto cleanup_opts; - if (!split(status)) { + if (!split(status, opts, arg_list)) { cleanup_state(status, false); goto cleanup_opts; } diff --git a/doc/samtools-split.1 b/doc/samtools-split.1 index 9ae71ce30..0adea99f0 100644 --- a/doc/samtools-split.1 +++ b/doc/samtools-split.1 @@ -70,7 +70,7 @@ Output format defaults to BAM. For SAM or CRAM then either set the format with .SH OPTIONS .TP 14 .BI "-u " FILE1 -.RI "Put reads with no RG tag or an unrecognised RG tag into " FILE1 +.RI "Put reads with no tag or an unrecognised tag into " FILE1 .TP .BI "-h " FILE2 .RI "Use the header from " FILE2 " when writing the file given in the " -u @@ -84,6 +84,12 @@ references must be in the same order and have the same lengths. Output filename format string (see below) ["%*_%#.%."] .TP +.BI "-d " TAG +Split reads by TAG value into distinct files. Only the TAG key must be +supplied with the option. The value of the TAG has to be a string (i.e. +.B key:Z:value +) +.TP .B -v Verbose output .TP From 41e1543cea551c474622c467c7784e4d32e8dfc9 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 3 Nov 2023 10:35:08 +0000 Subject: [PATCH 16/26] Refactor some names as we now split on tags other than RG --- bam_split.c | 170 ++++++++++++++++++++++++++-------------------------- 1 file changed, 85 insertions(+), 85 deletions(-) diff --git a/bam_split.c b/bam_split.c index 67ef4735c..3d36f590c 100644 --- a/bam_split.c +++ b/bam_split.c @@ -62,12 +62,12 @@ struct state { samFile* unaccounted_file; sam_hdr_t* unaccounted_header; size_t output_count; - char **rg_id; - char **rg_index_file_name; - char **rg_output_file_name; - samFile **rg_output_file; - sam_hdr_t **rg_output_header; - kh_c2i_t* rg_hash; + char **tag_vals; + char **index_file_name; + char **output_file_name; + samFile **output_file; + sam_hdr_t **output_header; + kh_c2i_t* tag_val_hash; htsThreadPool p; int write_index; }; @@ -95,8 +95,8 @@ static void usage(FILE *write_to) "Format string expansions:\n" " %%%% %%\n" " %%* basename\n" -" %%# @RG index\n" -" %%! @RG ID\n" +" %%# index (of @RG in the header, or count of TAG values seen so far)\n" +" %%! @RG ID or TAG value\n" " %%. filename extension for output format\n" ); } @@ -169,7 +169,7 @@ static parsed_opts_t* parse_args(int argc, char** argv) } // Expands a output filename format string -static char* expand_format_string(const char* format_string, const char* basename, const char* rg_id, const int rg_idx, const htsFormat *format) +static char* expand_format_string(const char* format_string, const char* basename, const char* tag_val, const int file_idx, const htsFormat *format) { kstring_t str = { 0, 0, NULL }; const char* pointer = format_string; @@ -185,10 +185,10 @@ static char* expand_format_string(const char* format_string, const char* basenam if (kputs(basename, &str) < 0) goto memfail; break; case '#': - if (kputl(rg_idx, &str) < 0) goto memfail; + if (kputl(file_idx, &str) < 0) goto memfail; break; case '!': - if (kputs(rg_id, &str) < 0) goto memfail; + if (kputs(tag_val, &str) < 0) goto memfail; break; case '.': // Only really need to cope with sam, bam, cram @@ -282,51 +282,51 @@ static int header_compatible(sam_hdr_t *hdr1, sam_hdr_t *hdr2) } static int grow_output_lists(state_t *state, size_t count) { - char **new_list = realloc(state->rg_id, count * sizeof(char *)); + char **new_list = realloc(state->tag_vals, count * sizeof(char *)); if (!new_list) return -1; - state->rg_id = new_list; - new_list = realloc(state->rg_index_file_name, count * sizeof(char *)); + state->tag_vals = new_list; + new_list = realloc(state->index_file_name, count * sizeof(char *)); if (!new_list) return -1; - state->rg_index_file_name = new_list; - new_list = realloc(state->rg_output_file_name, count * sizeof(char *)); + state->index_file_name = new_list; + new_list = realloc(state->output_file_name, count * sizeof(char *)); if (!new_list) return -1; - state->rg_output_file_name = new_list; - samFile **new_file = realloc(state->rg_output_file, + state->output_file_name = new_list; + samFile **new_file = realloc(state->output_file, count * sizeof(samFile *)); if (!new_file) return -1; - state->rg_output_file = new_file; - sam_hdr_t **new_hdr = realloc(state->rg_output_header, + state->output_file = new_file; + sam_hdr_t **new_hdr = realloc(state->output_header, count * sizeof(sam_hdr_t *)); if (!new_hdr) return -1; - state->rg_output_header = new_hdr; + state->output_header = new_hdr; return 0; } static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state, const char *tag, const char *arg_list) { - char *input_base_name = NULL, *new_file_name = NULL, *tag_key = NULL; + char *input_base_name = NULL, *new_file_name = NULL, *tag_val = NULL; char *new_idx_fn = NULL; sam_hdr_t *new_hdr = NULL; samFile *new_sam_file = NULL; - khiter_t i = kh_get_c2i(state->rg_hash, tag); - if (i != kh_end(state->rg_hash)) { + khiter_t i = kh_get_c2i(state->tag_val_hash, tag); + if (i != kh_end(state->tag_val_hash)) { return i; } // create new file if (grow_output_lists(state, state->output_count + 1) != 0) { print_error_errno("split", "Couldn't grow output lists"); - return kh_end(state->rg_hash); + return kh_end(state->tag_val_hash); } - tag_key = strdup(tag); - if (!tag_key) { + tag_val = strdup(tag); + if (!tag_val) { print_error_errno("split", "Couldn't copy tag value"); - return kh_end(state->rg_hash); + return kh_end(state->tag_val_hash); } char *dirsep = strrchr(opts->merged_input_name, '/'); input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name); @@ -382,18 +382,18 @@ static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state, goto fail; } int ret = -1; - i = kh_put_c2i(state->rg_hash, tag_key, &ret); + i = kh_put_c2i(state->tag_val_hash, tag_val, &ret); if (ret < 0) { print_error_errno("split", "Adding file \"%s\" failed", new_file_name); goto fail; } - kh_val(state->rg_hash, i) = state->output_count; - state->rg_id[state->output_count] = tag_key; - state->rg_index_file_name[state->output_count] = new_idx_fn; - state->rg_output_file_name[state->output_count] = new_file_name; - state->rg_output_file[state->output_count] = new_sam_file; - state->rg_output_header[state->output_count] = new_hdr; + kh_val(state->tag_val_hash, i) = state->output_count; + state->tag_vals[state->output_count] = tag_val; + state->index_file_name[state->output_count] = new_idx_fn; + state->output_file_name[state->output_count] = new_file_name; + state->output_file[state->output_count] = new_sam_file; + state->output_header[state->output_count] = new_hdr; state->output_count++; free(input_base_name); return i; @@ -401,11 +401,11 @@ static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state, fail: free(input_base_name); free(new_file_name); - free(tag_key); + free(tag_val); free(new_idx_fn); sam_hdr_destroy(new_hdr); sam_close(new_sam_file); - return kh_end(state->rg_hash); + return kh_end(state->tag_val_hash); } // Set the initial state @@ -491,7 +491,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) int is_rg = !opts->tag || strcmp(opts->tag, "RG") == 0; if (is_rg) { if (!count_RG(retval->merged_input_header, - &retval->output_count, &retval->rg_id)) { + &retval->output_count, &retval->tag_vals)) { cleanup_state(retval, false); return NULL; } @@ -503,13 +503,13 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) // Prevent calloc(0, size); size_t num = retval->output_count ? retval->output_count : 1; - retval->rg_index_file_name = (char **)calloc(num, sizeof(char *)); - retval->rg_output_file_name = (char **)calloc(num, sizeof(char *)); - retval->rg_output_file = (samFile**)calloc(num, sizeof(samFile*)); - retval->rg_output_header = (sam_hdr_t**)calloc(num, sizeof(sam_hdr_t*)); - retval->rg_hash = kh_init_c2i(); - if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || - !retval->rg_hash || !retval->rg_index_file_name) { + retval->index_file_name = (char **)calloc(num, sizeof(char *)); + retval->output_file_name = (char **)calloc(num, sizeof(char *)); + retval->output_file = (samFile**)calloc(num, sizeof(samFile*)); + retval->output_header = (sam_hdr_t**)calloc(num, sizeof(sam_hdr_t*)); + retval->tag_val_hash = kh_init_c2i(); + if (!retval->output_file_name || !retval->output_file || !retval->output_header || + !retval->tag_val_hash || !retval->index_file_name) { print_error_errno("split", "Could not initialise output file array"); cleanup_state(retval, false); return NULL; @@ -535,7 +535,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) output_filename = expand_format_string(opts->output_format_string, input_base_name, - retval->rg_id[i], i, + retval->tag_vals[i], i, &opts->ga.out); if ( output_filename == NULL ) { @@ -544,35 +544,35 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) return NULL; } - retval->rg_output_file_name[i] = output_filename; + retval->output_file_name[i] = output_filename; sam_open_mode(outmode + 1, output_filename, NULL); - retval->rg_output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out); + retval->output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out); - if (retval->rg_output_file[i] == NULL) { + if (retval->output_file[i] == NULL) { print_error_errno("split", "Could not open \"%s\"", output_filename); cleanup_state(retval, false); free(input_base_name); return NULL; } if (retval->p.pool) - hts_set_opt(retval->rg_output_file[i], HTS_OPT_THREAD_POOL, &retval->p); + hts_set_opt(retval->output_file[i], HTS_OPT_THREAD_POOL, &retval->p); // Record index in hash int ret; - khiter_t iter = kh_put_c2i(retval->rg_hash, retval->rg_id[i], &ret); + khiter_t iter = kh_put_c2i(retval->tag_val_hash, retval->tag_vals[i], &ret); if (ret < 0) { print_error_errno("split", "Couldn't add @RG ID to look-up table"); cleanup_state(retval, false); free(input_base_name); return NULL; } - kh_val(retval->rg_hash,iter) = i; + kh_val(retval->tag_val_hash,iter) = i; // Set and edit header - retval->rg_output_header[i] = sam_hdr_dup(retval->merged_input_header); - if (sam_hdr_remove_except(retval->rg_output_header[i], "RG", "ID", retval->rg_id[i]) || + retval->output_header[i] = sam_hdr_dup(retval->merged_input_header); + if (sam_hdr_remove_except(retval->output_header[i], "RG", "ID", retval->tag_vals[i]) || (!opts->no_pg && - sam_hdr_add_pg(retval->rg_output_header[i], "samtools", + sam_hdr_add_pg(retval->output_header[i], "samtools", "VN", samtools_version(), arg_list ? "CL": NULL, arg_list ? arg_list : NULL, @@ -612,16 +612,16 @@ static bool split(state_t* state, parsed_opts_t *opts, char *arg_list) if (is_rg) { size_t i; for (i = 0; i < state->output_count; i++) { - if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) { - print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]); + if (sam_hdr_write(state->output_file[i], state->output_header[i]) != 0) { + print_error_errno("split", "Could not write file header to \"%s\"", state->output_file_name[i]); goto error; } if (state->write_index) { - state->rg_index_file_name[i] = auto_index(state->rg_output_file[i], - state->rg_output_file_name[i], - state->rg_output_header[i]); - if (!state->rg_index_file_name[i]) { - print_error_errno("split", "Could not create index for file \"%s\"", state->rg_output_file_name[i]); + state->index_file_name[i] = auto_index(state->output_file[i], + state->output_file_name[i], + state->output_header[i]); + if (!state->index_file_name[i]) { + print_error_errno("split", "Could not create index for file \"%s\"", state->output_file_name[i]); goto error; } } @@ -633,15 +633,15 @@ static bool split(state_t* state, parsed_opts_t *opts, char *arg_list) char *val = tag ? bam_aux2Z(tag) : NULL; khiter_t iter; if ( val != NULL ) { - iter = kh_get_c2i(state->rg_hash, val); + iter = kh_get_c2i(state->tag_val_hash, val); } else { - iter = kh_end(state->rg_hash); + iter = kh_end(state->tag_val_hash); } - if (!is_rg && val && iter == kh_end(state->rg_hash)) { + if (!is_rg && val && iter == kh_end(state->tag_val_hash)) { // Need to open a new output file iter = prep_sam_file(opts, state, val, arg_list); - if (iter == kh_end(state->rg_hash)) { // Open failed + if (iter == kh_end(state->tag_val_hash)) { // Open failed print_error("split", "Could not create output file for tag \"%s:%s\"", opts->tag, bam_aux2Z(tag)); @@ -651,11 +651,11 @@ static bool split(state_t* state, parsed_opts_t *opts, char *arg_list) } // Write the read out to correct file - if (iter != kh_end(state->rg_hash)) { + if (iter != kh_end(state->tag_val_hash)) { // if found write to the appropriate untangled bam - int i = kh_val(state->rg_hash,iter); - if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) { - print_error_errno("split", "Could not write to \"%s\"", state->rg_output_file_name[i]); + int i = kh_val(state->tag_val_hash,iter); + if (sam_write1(state->output_file[i], state->output_header[i], file_read) < 0) { + print_error_errno("split", "Could not write to \"%s\"", state->output_file_name[i]); goto error; } } else { @@ -691,11 +691,11 @@ static bool split(state_t* state, parsed_opts_t *opts, char *arg_list) if (state->write_index) { size_t i; for (i = 0; i < state->output_count; i++) { - if (sam_idx_save(state->rg_output_file[i]) < 0) { + if (sam_idx_save(state->output_file[i]) < 0) { print_error_errno("split", "writing index failed"); return false; } - free(state->rg_index_file_name[i]); + free(state->index_file_name[i]); } } @@ -720,26 +720,26 @@ static int cleanup_state(state_t* status, bool check_close) sam_close(status->merged_input_file); size_t i; for (i = 0; i < status->output_count; i++) { - if (status->rg_output_header && status->rg_output_header[i]) - sam_hdr_destroy(status->rg_output_header[i]); - if (status->rg_output_file && status->rg_output_file[i]) { - if (sam_close(status->rg_output_file[i]) < 0 && check_close) { - print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]); + if (status->output_header && status->output_header[i]) + sam_hdr_destroy(status->output_header[i]); + if (status->output_file && status->output_file[i]) { + if (sam_close(status->output_file[i]) < 0 && check_close) { + print_error("split", "Error on closing output file \"%s\"", status->output_file_name[i]); ret = -1; } } - if (status->rg_id) free(status->rg_id[i]); - if (status->rg_output_file_name) free(status->rg_output_file_name[i]); + if (status->tag_vals) free(status->tag_vals[i]); + if (status->output_file_name) free(status->output_file_name[i]); } if (status->merged_input_header) sam_hdr_destroy(status->merged_input_header); - free(status->rg_output_header); - free(status->rg_output_file); - free(status->rg_output_file_name); - free(status->rg_index_file_name); - kh_destroy_c2i(status->rg_hash); + free(status->output_header); + free(status->output_file); + free(status->output_file_name); + free(status->index_file_name); + kh_destroy_c2i(status->tag_val_hash); - free(status->rg_id); + free(status->tag_vals); if (status->p.pool) hts_tpool_destroy(status->p.pool); free(status); From d3cec4bb45e30dcca187db3c7b5fad92c6ccba62 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 3 Nov 2023 14:43:07 +0000 Subject: [PATCH 17/26] Allow `-d RG` to make new files from alignment RG tag values Removes the previous restriction that all read groups must be listed in the header. This is only enabled when using `-d RG` though, so previous behaviour (when `-d` was not in use) remains unchanged. --- bam_split.c | 28 +++++++++++++++++++++++++--- doc/samtools-split.1 | 30 ++++++++++++++++++++++++------ 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/bam_split.c b/bam_split.c index 3d36f590c..ecfd88f6e 100644 --- a/bam_split.c +++ b/bam_split.c @@ -308,7 +308,8 @@ static int grow_output_lists(state_t *state, size_t count) { } static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state, - const char *tag, const char *arg_list) { + const char *tag, const char *arg_list, + int is_rg) { char *input_base_name = NULL, *new_file_name = NULL, *tag_val = NULL; char *new_idx_fn = NULL; sam_hdr_t *new_hdr = NULL; @@ -358,6 +359,23 @@ static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state, goto fail; } + if (is_rg) { + // If here, we've found an RG:Z: tag without a corresponding @RG + // line in the header. + if (sam_hdr_remove_lines(new_hdr, "RG", "ID", NULL) != 0) { + print_error_errno("split", + "Failed to remove @RG lines from file \"%s\"", + new_file_name); + goto fail; + } + if (sam_hdr_add_line(new_hdr, "RG", "ID", tag_val, NULL) != 0) { + print_error_errno("split", + "Failed to add @RG line to file \"%s\"", + new_file_name); + goto fail; + } + } + char outmode[4] = "w"; sam_open_mode(outmode + 1, new_file_name, NULL); new_sam_file = sam_open_format(new_file_name, outmode, &opts->ga.out); @@ -638,9 +656,13 @@ static bool split(state_t* state, parsed_opts_t *opts, char *arg_list) iter = kh_end(state->tag_val_hash); } - if (!is_rg && val && iter == kh_end(state->tag_val_hash)) { + // Check for opts->tag here instead of !is_rg so we open new + // files if the user specified '-d RG' and we find a RG:Z: value + // that wasn't listed in the header. If the '-d' option is + // not used, we don't open a file to preserve existing behaviour. + if (opts->tag && val && iter == kh_end(state->tag_val_hash)) { // Need to open a new output file - iter = prep_sam_file(opts, state, val, arg_list); + iter = prep_sam_file(opts, state, val, arg_list, is_rg); if (iter == kh_end(state->tag_val_hash)) { // Open failed print_error("split", "Could not create output file for tag \"%s:%s\"", diff --git a/doc/samtools-split.1 b/doc/samtools-split.1 index 0adea99f0..90a076e43 100644 --- a/doc/samtools-split.1 +++ b/doc/samtools-split.1 @@ -49,18 +49,36 @@ samtools split .SH DESCRIPTION .PP -Splits a file by read group, producing one or more output files -matching a common prefix (by default based on the input filename) -each containing one read-group. +Splits a file by read group, or a specified tag, +producing one or more output files +matching a common prefix (by default based on the input filename). +Unless the \fB-d\fR option is used, the file will be split according to the +.B @RG +tags listed in the header. Records without an RG tag or with an RG tag undefined in the header will cause the program to exit with an error unless the \fB-u\fR option is used. RG values defined in the header but with no records will produce an output file only containing a header. +If the +.BI "-d " TAG +option is used, the file will be split on the value in the given aux tag. +Note that only string tags (type \fBZ\fR) are currently supported. +Unless the \fB-u\fR option is used, the program will exit with an error if +it finds a record without the given tag. + +Using +.B -d RG +behaves in a similar way to the default (without \fB-d\fR), +opening an output file for each \fB@RG\fR line in the header. +However, unlike the default, +new output files will be opened for any RG tags found in the alignment records +irrespective of if they have a matching header \fB@RG\fR line. + The \fB-u\fR option may be used to specify the output filename for any -records with a missing or unrecognised RG tag. This option will always write +records with a missing or unrecognised tag. This option will always write out a file even if there are no records. Output format defaults to BAM. For SAM or CRAM then either set the format with @@ -102,8 +120,8 @@ center; lb l . %% % %* basename -%# @RG index -%! @RG ID +%# index (of @RG in the header, or count of TAG values seen so far) +%! @RG ID or TAG value %. output format filename extension .TE .TP From 7f829ec7ad0d22f979598ca04313388047fd1b84 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 3 Nov 2023 17:03:03 +0000 Subject: [PATCH 18/26] Add a -M option to limit the number of split files made To prevent "too many open files" and/or too many files in a directory problems. --- bam_split.c | 40 ++++++++++++++++++++++++++++++++-------- doc/samtools-split.1 | 21 +++++++++++++++++++++ 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/bam_split.c b/bam_split.c index ecfd88f6e..c6eaaff61 100644 --- a/bam_split.c +++ b/bam_split.c @@ -49,11 +49,14 @@ struct parsed_opts { const char *unaccounted_name; const char *output_format_string; const char *tag; + long max_split; bool verbose; int no_pg; sam_global_args ga; }; +#define DEFAULT_MAX_SPLIT 100 + typedef struct parsed_opts parsed_opts_t; struct state { @@ -83,12 +86,14 @@ static void usage(FILE *write_to) "Usage: samtools split [-u ] [-h ]\n" " [-f ] [-v] \n" "Options:\n" -" -f STRING output filename format string [\"%%*_%%#.%%.\"]\n" -" -u FILE1 put reads with no RG tag or an unrecognised RG tag in FILE1\n" -" -h FILE2 ... and override the header with FILE2 (-u file only)\n" -" -d TAG split by TAG value. TAG value must be a string.\n" -" -v verbose output\n" -" --no-PG do not add a PG line\n"); +" -f STRING output filename format string [\"%%*_%%#.%%.\"]\n" +" -u FILE1 put left-over reads in FILE1\n" +" -h FILE2 ... and override the header with FILE2 (-u file only)\n" +" -d TAG split by TAG value. TAG value must be a string.\n" +" -M,--max-split NUM limit number of output files from -d to NUM [%d]\n" +" -v verbose output\n" +" --no-PG do not add a PG line\n", + DEFAULT_MAX_SPLIT); sam_global_opt_help(write_to, "-....@.."); fprintf(write_to, "\n" @@ -106,17 +111,19 @@ static parsed_opts_t* parse_args(int argc, char** argv) { if (argc == 1) { usage(stdout); return NULL; } - const char *optstring = "vf:h:u:d:@:"; + const char *optstring = "vf:h:u:d:M:@:"; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'), {"no-PG", no_argument, NULL, 1}, + {"max-split", required_argument, NULL, 'M'}, { NULL, 0, NULL, 0 } }; parsed_opts_t* retval = calloc(sizeof(parsed_opts_t), 1); if (! retval ) { perror("cannot allocate option parsing memory"); return NULL; } + retval->max_split = DEFAULT_MAX_SPLIT; sam_global_args_init(&retval->ga); int opt; @@ -138,6 +145,18 @@ static parsed_opts_t* parse_args(int argc, char** argv) retval->tag = optarg; retval->output_format_string = "%*_%!.%."; break; + case 'M': { + char *end = optarg; + retval->max_split = strtol(optarg, &end, 10); + if (*optarg == '\0' || *end != '\0' || retval->max_split == 0) { + print_error("split", "Invalid -M argument: \"%s\"", optarg); + free(retval); + return NULL; + } + if (retval->max_split < 0) // No limit requested + retval->max_split = LONG_MAX; + break; + } case 1: retval->no_pg = 1; break; @@ -535,6 +554,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) if (!is_rg) return retval; // Done for this case - outputs will be opened later + // Adjust max_split if too small for the read-groups listed in the header + if (opts->max_split < retval->output_count) + opts->max_split = retval->output_count; + // Open output files for RGs char* dirsep = strrchr(opts->merged_input_name, '/'); char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name); @@ -660,7 +683,8 @@ static bool split(state_t* state, parsed_opts_t *opts, char *arg_list) // files if the user specified '-d RG' and we find a RG:Z: value // that wasn't listed in the header. If the '-d' option is // not used, we don't open a file to preserve existing behaviour. - if (opts->tag && val && iter == kh_end(state->tag_val_hash)) { + if (opts->tag && val && iter == kh_end(state->tag_val_hash) + && state->output_count < opts->max_split) { // Need to open a new output file iter = prep_sam_file(opts, state, val, arg_list, is_rg); if (iter == kh_end(state->tag_val_hash)) { // Open failed diff --git a/doc/samtools-split.1 b/doc/samtools-split.1 index 90a076e43..460aa6404 100644 --- a/doc/samtools-split.1 +++ b/doc/samtools-split.1 @@ -69,6 +69,11 @@ Note that only string tags (type \fBZ\fR) are currently supported. Unless the \fB-u\fR option is used, the program will exit with an error if it finds a record without the given tag. +Note that attempting to split on a tag with high cardinality may result +in the creation of a large number of output files. +To prevent this, the \fB-M\fR option can be used to set a limit on the +number of splits made. + Using .B -d RG behaves in a similar way to the default (without \fB-d\fR), @@ -108,6 +113,22 @@ supplied with the option. The value of the TAG has to be a string (i.e. .B key:Z:value ) .TP +.BI "-M,--max-split " NUM +Limit the number of files created by the \fB-d\fR option to \fINUM\fR (default +100). +This prevents accidents where trying to split on a tag with high cardinality +could result in the creation of a very large number of output files. +Once the file limit is reached, +any tag values not already seen will be treated as unmatched and the program +will exit with an error unless the \fB-u\fR option is in use. + +If desired, the limit can be removed using \fB-M -1\fR, +although in practice the number of outputs will still be restricted by +system limits on the number of files that can be open at once. + +If splitting by read group, and the read group count in the header +is higher than the requested limit then the limit will be raised to match. +.TP .B -v Verbose output .TP From 050362f142820e4efcb70185c345806ea4a24dfb Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 6 Nov 2023 13:57:29 +0000 Subject: [PATCH 19/26] Add tests for split by string aux tag functionality --- test/split/split.expected.grp1.sam | 24 ++++----- test/split/split.expected.grp2.sam | 24 ++++----- test/split/split.expected.unk.sam | 12 ++--- test/split/split.expected_d_RG.grp3.sam | 29 ++++++++++ test/split/split.expected_d_RG.unk.sam | 32 +++++++++++ test/split/split.expected_d_an.aardvark.sam | 32 +++++++++++ test/split/split.expected_d_an.badger.sam | 38 +++++++++++++ test/split/split.expected_d_an.cat.sam | 38 +++++++++++++ test/split/split.expected_d_an.dog.sam | 34 ++++++++++++ test/split/split.expected_d_an.unk.sam | 32 +++++++++++ test/split/split.expected_d_an_M_3.unk.sam | 36 +++++++++++++ test/split/split.sam | 60 ++++++++++----------- test/test.pl | 37 +++++++++++++ 13 files changed, 368 insertions(+), 60 deletions(-) create mode 100644 test/split/split.expected_d_RG.grp3.sam create mode 100644 test/split/split.expected_d_RG.unk.sam create mode 100644 test/split/split.expected_d_an.aardvark.sam create mode 100644 test/split/split.expected_d_an.badger.sam create mode 100644 test/split/split.expected_d_an.cat.sam create mode 100644 test/split/split.expected_d_an.dog.sam create mode 100644 test/split/split.expected_d_an.unk.sam create mode 100644 test/split/split.expected_d_an_M_3.unk.sam diff --git a/test/split/split.expected.grp1.sam b/test/split/split.expected.grp1.sam index 23e15367d..4fbd1775d 100644 --- a/test/split/split.expected.grp1.sam +++ b/test/split/split.expected.grp1.sam @@ -24,17 +24,17 @@ @SQ SN:ref2 LN:60 M5:7c35feac7036c1cdef3bee0cc4b21437 @SQ SN:ref3_unused LN:70 M5:5fdd18c2c6ecac4838996d029bf395b5 @RG ID:grp1 DS:Group 1 LB:Library 1 SM:Sample -ref1_grp1_p001 99 ref1 1 0 10M = 25 34 CGAGCTCGGT !!!!!!!!!! RG:Z:grp1 BC:Z:ACGT H0:i:1 aa:A:! ab:A:~ fa:f:3.14159 za:Z:Hello world! ha:H:DEADBEEF ba:B:c,-128,0,127 bb:B:C,0,127,255 bc:B:s,-32768,0,32767 bd:B:S,0,32768,65535 be:B:i,-2147483648,0,2147483647 bf:B:I,0,2147483648,4294967295 bg:B:f,2.71828,6.626e-34,2.9979e+09 NM:i:0 MD:Z:10 -ref1_grp1_p002 99 ref1 5 2 10M = 29 34 CTCGGTACCC ########## RG:Z:grp1 BC:Z:AATTCCGG H0:i:1 aa:A:a ab:A:z fa:f:4.3597e-18 za:Z:Another string ha:H:2000AD NM:i:0 MD:Z:10 -ref1_grp1_p003 99 ref1 9 4 10M = 33 34 GTACCCGGGG %%%%%%%%%% fa:f:1.66e-27 RG:Z:grp1 NM:i:0 MD:Z:10 -ref1_grp1_p004 99 ref1 13 6 10M = 37 34 CCGGGGATCC '''''''''' fa:f:1.38e-23 za:Z:xRG:Z:grp2 RG:Z:grp1 NM:i:0 MD:Z:10 -ref1_grp1_p005 99 ref1 17 8 10M = 41 34 GGATCCTCTA )))))))))) RG:Z:grp1 ia:i:40000 NM:i:0 MD:Z:10 +ref1_grp1_p001 99 ref1 1 0 10M = 25 34 CGAGCTCGGT !!!!!!!!!! RG:Z:grp1 BC:Z:ACGT H0:i:1 aa:A:! ab:A:~ fa:f:3.14159 za:Z:Hello world! ha:H:DEADBEEF ba:B:c,-128,0,127 bb:B:C,0,127,255 bc:B:s,-32768,0,32767 bd:B:S,0,32768,65535 be:B:i,-2147483648,0,2147483647 bf:B:I,0,2147483648,4294967295 bg:B:f,2.71828,6.626e-34,2.9979e+09 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp1_p002 99 ref1 5 2 10M = 29 34 CTCGGTACCC ########## RG:Z:grp1 BC:Z:AATTCCGG H0:i:1 aa:A:a ab:A:z fa:f:4.3597e-18 za:Z:Another string ha:H:2000AD NM:i:0 MD:Z:10 an:Z:cat +ref1_grp1_p003 99 ref1 9 4 10M = 33 34 GTACCCGGGG %%%%%%%%%% fa:f:1.66e-27 RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:cat +ref1_grp1_p004 99 ref1 13 6 10M = 37 34 CCGGGGATCC '''''''''' fa:f:1.38e-23 za:Z:xRG:Z:grp2 RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp1_p005 99 ref1 17 8 10M = 41 34 GGATCCTCTA )))))))))) RG:Z:grp1 ia:i:40000 NM:i:0 MD:Z:10 an:Z:badger ref1_grp1_p006 99 ref1 21 10 10M = 45 34 CCTCTAGAGT ++++++++++ RG:Z:grp1 ia:i:255 NM:i:0 MD:Z:10 -ref1_grp1_p001 147 ref1 25 12 10M = 1 -34 TAGAGTCGAC ---------- RG:Z:grp1 NM:i:0 MD:Z:10 -ref1_grp1_p002 147 ref1 29 14 10M = 5 -34 GTCGACCTGC ////////// RG:Z:grp1 NM:i:0 MD:Z:10 -ref1_grp1_p003 147 ref1 33 16 10M = 9 -34 ACCTGCAGGC 1111111111 RG:Z:grp1 NM:i:0 MD:Z:10 -ref12_grp1_p001 97 ref1 36 50 10M ref2 2 0 TGCAGGCATG AAAAAAAAAA RG:Z:grp1 NM:i:0 MD:Z:10 -ref1_grp1_p004 147 ref1 37 18 10M = 13 -34 GCAGGCATGC 3333333333 RG:Z:grp1 NM:i:0 MD:Z:10 -ref1_grp1_p005 147 ref1 41 20 10M = 17 -34 GCATGCAAGC 5555555555 RG:Z:grp1 NM:i:0 MD:Z:10 +ref1_grp1_p001 147 ref1 25 12 10M = 1 -34 TAGAGTCGAC ---------- RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp1_p002 147 ref1 29 14 10M = 5 -34 GTCGACCTGC ////////// RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:cat +ref1_grp1_p003 147 ref1 33 16 10M = 9 -34 ACCTGCAGGC 1111111111 RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:cat +ref12_grp1_p001 97 ref1 36 50 10M ref2 2 0 TGCAGGCATG AAAAAAAAAA RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:cat +ref1_grp1_p004 147 ref1 37 18 10M = 13 -34 GCAGGCATGC 3333333333 RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp1_p005 147 ref1 41 20 10M = 17 -34 GCATGCAAGC 5555555555 RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:badger ref1_grp1_p006 147 ref1 45 22 10M = 21 -34 GCAAGCTTGA 7777777777 RG:Z:grp1 NM:i:0 MD:Z:10 -ref12_grp1_p001 145 ref2 2 50 10M ref1 36 0 TTCTATAGTG BBBBBBBBBB RG:Z:grp1 NM:i:0 MD:Z:10 +ref12_grp1_p001 145 ref2 2 50 10M ref1 36 0 TTCTATAGTG BBBBBBBBBB RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:cat diff --git a/test/split/split.expected.grp2.sam b/test/split/split.expected.grp2.sam index 07798dd8c..6a3854c40 100644 --- a/test/split/split.expected.grp2.sam +++ b/test/split/split.expected.grp2.sam @@ -24,17 +24,17 @@ @SQ SN:ref2 LN:60 M5:7c35feac7036c1cdef3bee0cc4b21437 @SQ SN:ref3_unused LN:70 M5:5fdd18c2c6ecac4838996d029bf395b5 @RG ID:grp2 DS:Group 2 LB:Library 2 SM:Sample -ref1_grp2_p001 99 ref1 3 1 10M = 27 34 AGCTCGGTAC """""""""" RG:Z:grp2 BC:Z:TGCA H0:i:1 aa:A:A ab:A:Z fa:f:6.67e-11 za:Z:!"$%^&*() ha:H:CAFE NM:i:0 MD:Z:10 -ref1_grp2_p002 99 ref1 7 3 10M = 31 34 CGGTACCCGG $$$$$$$$$$ fa:f:6.022e+23 RG:Z:grp2 NM:i:0 MD:Z:10 -ref1_grp2_p003 99 ref1 11 5 10M = 35 34 ACCCGGGGAT &&&&&&&&&& RG:Z:grp2 ia:i:4294967295 NM:i:0 MD:Z:10 -ref1_grp2_p004 99 ref1 15 7 10M = 39 34 GGGGATCCTC (((((((((( RG:Z:grp2 ia:i:-2147483648 NM:i:0 MD:Z:10 -ref1_grp2_p005 99 ref1 19 9 10M = 43 34 ATCCTCTAGA ********** RG:Z:grp2 ia:i:-1000 NM:i:0 MD:Z:10 +ref1_grp2_p001 99 ref1 3 1 10M = 27 34 AGCTCGGTAC """""""""" RG:Z:grp2 BC:Z:TGCA H0:i:1 aa:A:A ab:A:Z fa:f:6.67e-11 za:Z:!"$%^&*() ha:H:CAFE NM:i:0 MD:Z:10 an:Z:badger +ref1_grp2_p002 99 ref1 7 3 10M = 31 34 CGGTACCCGG $$$$$$$$$$ fa:f:6.022e+23 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp2_p003 99 ref1 11 5 10M = 35 34 ACCCGGGGAT &&&&&&&&&& RG:Z:grp2 ia:i:4294967295 NM:i:0 MD:Z:10 an:Z:dog +ref1_grp2_p004 99 ref1 15 7 10M = 39 34 GGGGATCCTC (((((((((( RG:Z:grp2 ia:i:-2147483648 NM:i:0 MD:Z:10 an:Z:aardvark +ref1_grp2_p005 99 ref1 19 9 10M = 43 34 ATCCTCTAGA ********** RG:Z:grp2 ia:i:-1000 NM:i:0 MD:Z:10 an:Z:aardvark ref1_grp2_p006 99 ref1 23 11 10M = 47 34 TCTAGAGTCG ,,,,,,,,,, RG:Z:grp2 ia:i:-1 NM:i:0 MD:Z:10 -ref1_grp2_p001 147 ref1 27 13 10M = 3 -34 GAGTCGACCT .......... RG:Z:grp2 NM:i:0 MD:Z:10 -ref1_grp2_p002 147 ref1 31 15 10M = 7 -34 CGACCTGCAG 0000000000 RG:Z:grp2 NM:i:0 MD:Z:10 -ref1_grp2_p003 147 ref1 35 17 10M = 11 -34 CTGCAGGCAT 2222222222 RG:Z:grp2 NM:i:0 MD:Z:10 -ref1_grp2_p004 147 ref1 39 19 10M = 15 -34 AGGCATGCAA 4444444444 RG:Z:grp2 NM:i:0 MD:Z:10 -ref1_grp2_p005 147 ref1 43 21 10M = 19 -34 ATGCAAGCTT 6666666666 RG:Z:grp2 NM:i:0 MD:Z:10 -ref12_grp2_p001 97 ref1 46 50 10M ref2 12 0 CAAGCTTGAG AAAAAAAAAA RG:Z:grp2 NM:i:0 MD:Z:10 +ref1_grp2_p001 147 ref1 27 13 10M = 3 -34 GAGTCGACCT .......... RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp2_p002 147 ref1 31 15 10M = 7 -34 CGACCTGCAG 0000000000 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp2_p003 147 ref1 35 17 10M = 11 -34 CTGCAGGCAT 2222222222 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:dog +ref1_grp2_p004 147 ref1 39 19 10M = 15 -34 AGGCATGCAA 4444444444 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:aardvark +ref1_grp2_p005 147 ref1 43 21 10M = 19 -34 ATGCAAGCTT 6666666666 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:aardvark +ref12_grp2_p001 97 ref1 46 50 10M ref2 12 0 CAAGCTTGAG AAAAAAAAAA RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:cat ref1_grp2_p006 147 ref1 47 23 10M = 23 -34 AAGCTTGAGT 8888888888 RG:Z:grp2 NM:i:0 MD:Z:10 -ref12_grp2_p001 145 ref2 12 50 10M ref1 46 0 TCACCTAAAT BBBBBBBBBB RG:Z:grp2 NM:i:0 MD:Z:10 +ref12_grp2_p001 145 ref2 12 50 10M ref1 46 0 TCACCTAAAT BBBBBBBBBB RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:cat diff --git a/test/split/split.expected.unk.sam b/test/split/split.expected.unk.sam index 59f6ff569..8417d13e5 100644 --- a/test/split/split.expected.unk.sam +++ b/test/split/split.expected.unk.sam @@ -26,9 +26,9 @@ @SQ SN:ref1 LN:56 M5:08c04d512d4797d9ba2a156c1daba468 @SQ SN:ref2 LN:60 M5:7c35feac7036c1cdef3bee0cc4b21437 @SQ SN:ref3_unused LN:70 M5:5fdd18c2c6ecac4838996d029bf395b5 -ref2_grp3_p001 83 ref2 1 99 15M = 31 45 ATTCTATAGTGTCAC ~~~~~~~~~~~~~~~ NM:i:0 MD:Z:15 -ref2_grp3_p002 147 ref2 16 99 15M = 46 45 CTAAATAGCTTGGCG }}}}}}}}}}}}}}} NM:i:0 MD:Z:15 -ref2_grp3_p001 163 ref2 31 99 15M = 1 -45 CTGTTTCCTGTGTGA ||||||||||||||| NM:i:13 MD:Z:0T0A0A1C0A0T0G0G0T0C0A1A0G0 -ref2_grp3_p002 99 ref2 46 99 15M = 16 -45 CTGTTTCCTGTGTGA {{{{{{{{{{{{{{{ NM:i:0 MD:Z:15 -unaligned_grp3_p001 77 * 0 0 * * 0 0 CACTCGTTCATGACG 0123456789abcde -unaligned_grp3_p001 141 * 0 0 * * 0 0 GAAAGTGAGGAGGTG edcba9876543210 +ref2_grp3_p001 83 ref2 1 99 15M = 31 45 ATTCTATAGTGTCAC ~~~~~~~~~~~~~~~ NM:i:0 MD:Z:15 an:Z:cat +ref2_grp3_p002 147 ref2 16 99 15M = 46 45 CTAAATAGCTTGGCG }}}}}}}}}}}}}}} NM:i:0 MD:Z:15 RG:Z:grp3 an:Z:dog +ref2_grp3_p001 163 ref2 31 99 15M = 1 -45 CTGTTTCCTGTGTGA ||||||||||||||| NM:i:13 MD:Z:0T0A0A1C0A0T0G0G0T0C0A1A0G0 an:Z:cat +ref2_grp3_p002 99 ref2 46 99 15M = 16 -45 CTGTTTCCTGTGTGA {{{{{{{{{{{{{{{ NM:i:0 MD:Z:15 RG:Z:grp3 an:Z:dog +unaligned_grp3_p001 77 * 0 0 * * 0 0 CACTCGTTCATGACG 0123456789abcde an:Z:dog +unaligned_grp3_p001 141 * 0 0 * * 0 0 GAAAGTGAGGAGGTG edcba9876543210 an:Z:dog diff --git a/test/split/split.expected_d_RG.grp3.sam b/test/split/split.expected_d_RG.grp3.sam new file mode 100644 index 000000000..14e7997da --- /dev/null +++ b/test/split/split.expected_d_RG.grp3.sam @@ -0,0 +1,29 @@ +@HD VN:1.4 SO:coordinate +@RG ID:grp3 +@PG ID:prog1 PN:emacs CL:emacs VN:23.1.1 +@CO The MIT License +@CO +@CO Copyright (c) 2014 Genome Research Ltd. +@CO +@CO Permission is hereby granted, free of charge, to any person obtaining a copy +@CO of this software and associated documentation files (the "Software"), to deal +@CO in the Software without restriction, including without limitation the rights +@CO to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +@CO copies of the Software, and to permit persons to whom the Software is +@CO furnished to do so, subject to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included in +@CO all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +@CO IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +@CO FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +@CO AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +@CO LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +@CO OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +@CO THE SOFTWARE. +@SQ SN:ref1 LN:56 M5:08c04d512d4797d9ba2a156c1daba468 +@SQ SN:ref2 LN:60 M5:7c35feac7036c1cdef3bee0cc4b21437 +@SQ SN:ref3_unused LN:70 M5:5fdd18c2c6ecac4838996d029bf395b5 +ref2_grp3_p002 147 ref2 16 99 15M = 46 45 CTAAATAGCTTGGCG }}}}}}}}}}}}}}} NM:i:0 MD:Z:15 RG:Z:grp3 an:Z:dog +ref2_grp3_p002 99 ref2 46 99 15M = 16 -45 CTGTTTCCTGTGTGA {{{{{{{{{{{{{{{ NM:i:0 MD:Z:15 RG:Z:grp3 an:Z:dog diff --git a/test/split/split.expected_d_RG.unk.sam b/test/split/split.expected_d_RG.unk.sam new file mode 100644 index 000000000..2676ba6d3 --- /dev/null +++ b/test/split/split.expected_d_RG.unk.sam @@ -0,0 +1,32 @@ +@HD VN:1.4 SO:coordinate +@RG ID:grp1 DS:Group 1 LB:Library 1 SM:Sample +@RG ID:grp2 DS:Group 2 LB:Library 2 SM:Sample +@PG ID:prog1 PN:emacs CL:emacs VN:23.1.1 +@CO The MIT License +@CO +@CO Copyright (c) 2014 Genome Research Ltd. +@CO +@CO Permission is hereby granted, free of charge, to any person obtaining a copy +@CO of this software and associated documentation files (the "Software"), to deal +@CO in the Software without restriction, including without limitation the rights +@CO to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +@CO copies of the Software, and to permit persons to whom the Software is +@CO furnished to do so, subject to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included in +@CO all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +@CO IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +@CO FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +@CO AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +@CO LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +@CO OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +@CO THE SOFTWARE. +@SQ SN:ref1 LN:56 M5:08c04d512d4797d9ba2a156c1daba468 +@SQ SN:ref2 LN:60 M5:7c35feac7036c1cdef3bee0cc4b21437 +@SQ SN:ref3_unused LN:70 M5:5fdd18c2c6ecac4838996d029bf395b5 +ref2_grp3_p001 83 ref2 1 99 15M = 31 45 ATTCTATAGTGTCAC ~~~~~~~~~~~~~~~ NM:i:0 MD:Z:15 an:Z:cat +ref2_grp3_p001 163 ref2 31 99 15M = 1 -45 CTGTTTCCTGTGTGA ||||||||||||||| NM:i:13 MD:Z:0T0A0A1C0A0T0G0G0T0C0A1A0G0 an:Z:cat +unaligned_grp3_p001 77 * 0 0 * * 0 0 CACTCGTTCATGACG 0123456789abcde an:Z:dog +unaligned_grp3_p001 141 * 0 0 * * 0 0 GAAAGTGAGGAGGTG edcba9876543210 an:Z:dog diff --git a/test/split/split.expected_d_an.aardvark.sam b/test/split/split.expected_d_an.aardvark.sam new file mode 100644 index 000000000..b5ce3fb0b --- /dev/null +++ b/test/split/split.expected_d_an.aardvark.sam @@ -0,0 +1,32 @@ +@HD VN:1.4 SO:coordinate +@RG ID:grp1 DS:Group 1 LB:Library 1 SM:Sample +@RG ID:grp2 DS:Group 2 LB:Library 2 SM:Sample +@PG ID:prog1 PN:emacs CL:emacs VN:23.1.1 +@CO The MIT License +@CO +@CO Copyright (c) 2014 Genome Research Ltd. +@CO +@CO Permission is hereby granted, free of charge, to any person obtaining a copy +@CO of this software and associated documentation files (the "Software"), to deal +@CO in the Software without restriction, including without limitation the rights +@CO to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +@CO copies of the Software, and to permit persons to whom the Software is +@CO furnished to do so, subject to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included in +@CO all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +@CO IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +@CO FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +@CO AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +@CO LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +@CO OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +@CO THE SOFTWARE. +@SQ SN:ref1 LN:56 M5:08c04d512d4797d9ba2a156c1daba468 +@SQ SN:ref2 LN:60 M5:7c35feac7036c1cdef3bee0cc4b21437 +@SQ SN:ref3_unused LN:70 M5:5fdd18c2c6ecac4838996d029bf395b5 +ref1_grp2_p004 99 ref1 15 7 10M = 39 34 GGGGATCCTC (((((((((( RG:Z:grp2 ia:i:-2147483648 NM:i:0 MD:Z:10 an:Z:aardvark +ref1_grp2_p005 99 ref1 19 9 10M = 43 34 ATCCTCTAGA ********** RG:Z:grp2 ia:i:-1000 NM:i:0 MD:Z:10 an:Z:aardvark +ref1_grp2_p004 147 ref1 39 19 10M = 15 -34 AGGCATGCAA 4444444444 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:aardvark +ref1_grp2_p005 147 ref1 43 21 10M = 19 -34 ATGCAAGCTT 6666666666 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:aardvark diff --git a/test/split/split.expected_d_an.badger.sam b/test/split/split.expected_d_an.badger.sam new file mode 100644 index 000000000..a78886b5a --- /dev/null +++ b/test/split/split.expected_d_an.badger.sam @@ -0,0 +1,38 @@ +@HD VN:1.4 SO:coordinate +@RG ID:grp1 DS:Group 1 LB:Library 1 SM:Sample +@RG ID:grp2 DS:Group 2 LB:Library 2 SM:Sample +@PG ID:prog1 PN:emacs CL:emacs VN:23.1.1 +@CO The MIT License +@CO +@CO Copyright (c) 2014 Genome Research Ltd. +@CO +@CO Permission is hereby granted, free of charge, to any person obtaining a copy +@CO of this software and associated documentation files (the "Software"), to deal +@CO in the Software without restriction, including without limitation the rights +@CO to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +@CO copies of the Software, and to permit persons to whom the Software is +@CO furnished to do so, subject to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included in +@CO all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +@CO IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +@CO FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +@CO AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +@CO LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +@CO OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +@CO THE SOFTWARE. +@SQ SN:ref1 LN:56 M5:08c04d512d4797d9ba2a156c1daba468 +@SQ SN:ref2 LN:60 M5:7c35feac7036c1cdef3bee0cc4b21437 +@SQ SN:ref3_unused LN:70 M5:5fdd18c2c6ecac4838996d029bf395b5 +ref1_grp1_p001 99 ref1 1 0 10M = 25 34 CGAGCTCGGT !!!!!!!!!! RG:Z:grp1 BC:Z:ACGT H0:i:1 aa:A:! ab:A:~ fa:f:3.14159 za:Z:Hello world! ha:H:DEADBEEF ba:B:c,-128,0,127 bb:B:C,0,127,255 bc:B:s,-32768,0,32767 bd:B:S,0,32768,65535 be:B:i,-2147483648,0,2147483647 bf:B:I,0,2147483648,4294967295 bg:B:f,2.71828,6.626e-34,2.9979e+09 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp2_p001 99 ref1 3 1 10M = 27 34 AGCTCGGTAC """""""""" RG:Z:grp2 BC:Z:TGCA H0:i:1 aa:A:A ab:A:Z fa:f:6.67e-11 za:Z:!"$%^&*() ha:H:CAFE NM:i:0 MD:Z:10 an:Z:badger +ref1_grp2_p002 99 ref1 7 3 10M = 31 34 CGGTACCCGG $$$$$$$$$$ fa:f:6.022e+23 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp1_p004 99 ref1 13 6 10M = 37 34 CCGGGGATCC '''''''''' fa:f:1.38e-23 za:Z:xRG:Z:grp2 RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp1_p005 99 ref1 17 8 10M = 41 34 GGATCCTCTA )))))))))) RG:Z:grp1 ia:i:40000 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp1_p001 147 ref1 25 12 10M = 1 -34 TAGAGTCGAC ---------- RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp2_p001 147 ref1 27 13 10M = 3 -34 GAGTCGACCT .......... RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp2_p002 147 ref1 31 15 10M = 7 -34 CGACCTGCAG 0000000000 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp1_p004 147 ref1 37 18 10M = 13 -34 GCAGGCATGC 3333333333 RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp1_p005 147 ref1 41 20 10M = 17 -34 GCATGCAAGC 5555555555 RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:badger diff --git a/test/split/split.expected_d_an.cat.sam b/test/split/split.expected_d_an.cat.sam new file mode 100644 index 000000000..c2bdd3af6 --- /dev/null +++ b/test/split/split.expected_d_an.cat.sam @@ -0,0 +1,38 @@ +@HD VN:1.4 SO:coordinate +@RG ID:grp1 DS:Group 1 LB:Library 1 SM:Sample +@RG ID:grp2 DS:Group 2 LB:Library 2 SM:Sample +@PG ID:prog1 PN:emacs CL:emacs VN:23.1.1 +@CO The MIT License +@CO +@CO Copyright (c) 2014 Genome Research Ltd. +@CO +@CO Permission is hereby granted, free of charge, to any person obtaining a copy +@CO of this software and associated documentation files (the "Software"), to deal +@CO in the Software without restriction, including without limitation the rights +@CO to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +@CO copies of the Software, and to permit persons to whom the Software is +@CO furnished to do so, subject to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included in +@CO all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +@CO IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +@CO FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +@CO AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +@CO LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +@CO OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +@CO THE SOFTWARE. +@SQ SN:ref1 LN:56 M5:08c04d512d4797d9ba2a156c1daba468 +@SQ SN:ref2 LN:60 M5:7c35feac7036c1cdef3bee0cc4b21437 +@SQ SN:ref3_unused LN:70 M5:5fdd18c2c6ecac4838996d029bf395b5 +ref1_grp1_p002 99 ref1 5 2 10M = 29 34 CTCGGTACCC ########## RG:Z:grp1 BC:Z:AATTCCGG H0:i:1 aa:A:a ab:A:z fa:f:4.3597e-18 za:Z:Another string ha:H:2000AD NM:i:0 MD:Z:10 an:Z:cat +ref1_grp1_p003 99 ref1 9 4 10M = 33 34 GTACCCGGGG %%%%%%%%%% fa:f:1.66e-27 RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:cat +ref1_grp1_p002 147 ref1 29 14 10M = 5 -34 GTCGACCTGC ////////// RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:cat +ref1_grp1_p003 147 ref1 33 16 10M = 9 -34 ACCTGCAGGC 1111111111 RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:cat +ref12_grp1_p001 97 ref1 36 50 10M ref2 2 0 TGCAGGCATG AAAAAAAAAA RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:cat +ref12_grp2_p001 97 ref1 46 50 10M ref2 12 0 CAAGCTTGAG AAAAAAAAAA RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:cat +ref2_grp3_p001 83 ref2 1 99 15M = 31 45 ATTCTATAGTGTCAC ~~~~~~~~~~~~~~~ NM:i:0 MD:Z:15 an:Z:cat +ref12_grp1_p001 145 ref2 2 50 10M ref1 36 0 TTCTATAGTG BBBBBBBBBB RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:cat +ref12_grp2_p001 145 ref2 12 50 10M ref1 46 0 TCACCTAAAT BBBBBBBBBB RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:cat +ref2_grp3_p001 163 ref2 31 99 15M = 1 -45 CTGTTTCCTGTGTGA ||||||||||||||| NM:i:13 MD:Z:0T0A0A1C0A0T0G0G0T0C0A1A0G0 an:Z:cat diff --git a/test/split/split.expected_d_an.dog.sam b/test/split/split.expected_d_an.dog.sam new file mode 100644 index 000000000..d7d699302 --- /dev/null +++ b/test/split/split.expected_d_an.dog.sam @@ -0,0 +1,34 @@ +@HD VN:1.4 SO:coordinate +@RG ID:grp1 DS:Group 1 LB:Library 1 SM:Sample +@RG ID:grp2 DS:Group 2 LB:Library 2 SM:Sample +@PG ID:prog1 PN:emacs CL:emacs VN:23.1.1 +@CO The MIT License +@CO +@CO Copyright (c) 2014 Genome Research Ltd. +@CO +@CO Permission is hereby granted, free of charge, to any person obtaining a copy +@CO of this software and associated documentation files (the "Software"), to deal +@CO in the Software without restriction, including without limitation the rights +@CO to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +@CO copies of the Software, and to permit persons to whom the Software is +@CO furnished to do so, subject to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included in +@CO all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +@CO IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +@CO FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +@CO AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +@CO LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +@CO OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +@CO THE SOFTWARE. +@SQ SN:ref1 LN:56 M5:08c04d512d4797d9ba2a156c1daba468 +@SQ SN:ref2 LN:60 M5:7c35feac7036c1cdef3bee0cc4b21437 +@SQ SN:ref3_unused LN:70 M5:5fdd18c2c6ecac4838996d029bf395b5 +ref1_grp2_p003 99 ref1 11 5 10M = 35 34 ACCCGGGGAT &&&&&&&&&& RG:Z:grp2 ia:i:4294967295 NM:i:0 MD:Z:10 an:Z:dog +ref1_grp2_p003 147 ref1 35 17 10M = 11 -34 CTGCAGGCAT 2222222222 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:dog +ref2_grp3_p002 147 ref2 16 99 15M = 46 45 CTAAATAGCTTGGCG }}}}}}}}}}}}}}} NM:i:0 MD:Z:15 RG:Z:grp3 an:Z:dog +ref2_grp3_p002 99 ref2 46 99 15M = 16 -45 CTGTTTCCTGTGTGA {{{{{{{{{{{{{{{ NM:i:0 MD:Z:15 RG:Z:grp3 an:Z:dog +unaligned_grp3_p001 77 * 0 0 * * 0 0 CACTCGTTCATGACG 0123456789abcde an:Z:dog +unaligned_grp3_p001 141 * 0 0 * * 0 0 GAAAGTGAGGAGGTG edcba9876543210 an:Z:dog diff --git a/test/split/split.expected_d_an.unk.sam b/test/split/split.expected_d_an.unk.sam new file mode 100644 index 000000000..5092c89a6 --- /dev/null +++ b/test/split/split.expected_d_an.unk.sam @@ -0,0 +1,32 @@ +@HD VN:1.4 SO:coordinate +@RG ID:grp1 DS:Group 1 LB:Library 1 SM:Sample +@RG ID:grp2 DS:Group 2 LB:Library 2 SM:Sample +@PG ID:prog1 PN:emacs CL:emacs VN:23.1.1 +@CO The MIT License +@CO +@CO Copyright (c) 2014 Genome Research Ltd. +@CO +@CO Permission is hereby granted, free of charge, to any person obtaining a copy +@CO of this software and associated documentation files (the "Software"), to deal +@CO in the Software without restriction, including without limitation the rights +@CO to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +@CO copies of the Software, and to permit persons to whom the Software is +@CO furnished to do so, subject to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included in +@CO all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +@CO IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +@CO FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +@CO AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +@CO LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +@CO OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +@CO THE SOFTWARE. +@SQ SN:ref1 LN:56 M5:08c04d512d4797d9ba2a156c1daba468 +@SQ SN:ref2 LN:60 M5:7c35feac7036c1cdef3bee0cc4b21437 +@SQ SN:ref3_unused LN:70 M5:5fdd18c2c6ecac4838996d029bf395b5 +ref1_grp1_p006 99 ref1 21 10 10M = 45 34 CCTCTAGAGT ++++++++++ RG:Z:grp1 ia:i:255 NM:i:0 MD:Z:10 +ref1_grp2_p006 99 ref1 23 11 10M = 47 34 TCTAGAGTCG ,,,,,,,,,, RG:Z:grp2 ia:i:-1 NM:i:0 MD:Z:10 +ref1_grp1_p006 147 ref1 45 22 10M = 21 -34 GCAAGCTTGA 7777777777 RG:Z:grp1 NM:i:0 MD:Z:10 +ref1_grp2_p006 147 ref1 47 23 10M = 23 -34 AAGCTTGAGT 8888888888 RG:Z:grp2 NM:i:0 MD:Z:10 diff --git a/test/split/split.expected_d_an_M_3.unk.sam b/test/split/split.expected_d_an_M_3.unk.sam new file mode 100644 index 000000000..dbe9c6936 --- /dev/null +++ b/test/split/split.expected_d_an_M_3.unk.sam @@ -0,0 +1,36 @@ +@HD VN:1.4 SO:coordinate +@RG ID:grp1 DS:Group 1 LB:Library 1 SM:Sample +@RG ID:grp2 DS:Group 2 LB:Library 2 SM:Sample +@PG ID:prog1 PN:emacs CL:emacs VN:23.1.1 +@CO The MIT License +@CO +@CO Copyright (c) 2014 Genome Research Ltd. +@CO +@CO Permission is hereby granted, free of charge, to any person obtaining a copy +@CO of this software and associated documentation files (the "Software"), to deal +@CO in the Software without restriction, including without limitation the rights +@CO to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +@CO copies of the Software, and to permit persons to whom the Software is +@CO furnished to do so, subject to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included in +@CO all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +@CO IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +@CO FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +@CO AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +@CO LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +@CO OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +@CO THE SOFTWARE. +@SQ SN:ref1 LN:56 M5:08c04d512d4797d9ba2a156c1daba468 +@SQ SN:ref2 LN:60 M5:7c35feac7036c1cdef3bee0cc4b21437 +@SQ SN:ref3_unused LN:70 M5:5fdd18c2c6ecac4838996d029bf395b5 +ref1_grp2_p004 99 ref1 15 7 10M = 39 34 GGGGATCCTC (((((((((( RG:Z:grp2 ia:i:-2147483648 NM:i:0 MD:Z:10 an:Z:aardvark +ref1_grp2_p005 99 ref1 19 9 10M = 43 34 ATCCTCTAGA ********** RG:Z:grp2 ia:i:-1000 NM:i:0 MD:Z:10 an:Z:aardvark +ref1_grp1_p006 99 ref1 21 10 10M = 45 34 CCTCTAGAGT ++++++++++ RG:Z:grp1 ia:i:255 NM:i:0 MD:Z:10 +ref1_grp2_p006 99 ref1 23 11 10M = 47 34 TCTAGAGTCG ,,,,,,,,,, RG:Z:grp2 ia:i:-1 NM:i:0 MD:Z:10 +ref1_grp2_p004 147 ref1 39 19 10M = 15 -34 AGGCATGCAA 4444444444 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:aardvark +ref1_grp2_p005 147 ref1 43 21 10M = 19 -34 ATGCAAGCTT 6666666666 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:aardvark +ref1_grp1_p006 147 ref1 45 22 10M = 21 -34 GCAAGCTTGA 7777777777 RG:Z:grp1 NM:i:0 MD:Z:10 +ref1_grp2_p006 147 ref1 47 23 10M = 23 -34 AAGCTTGAGT 8888888888 RG:Z:grp2 NM:i:0 MD:Z:10 diff --git a/test/split/split.sam b/test/split/split.sam index c874469d1..c07002ee2 100644 --- a/test/split/split.sam +++ b/test/split/split.sam @@ -26,37 +26,37 @@ @SQ SN:ref1 LN:56 M5:08c04d512d4797d9ba2a156c1daba468 @SQ SN:ref2 LN:60 M5:7c35feac7036c1cdef3bee0cc4b21437 @SQ SN:ref3_unused LN:70 M5:5fdd18c2c6ecac4838996d029bf395b5 -ref1_grp1_p001 99 ref1 1 0 10M = 25 34 CGAGCTCGGT !!!!!!!!!! RG:Z:grp1 BC:Z:ACGT H0:i:1 aa:A:! ab:A:~ fa:f:3.14159 za:Z:Hello world! ha:H:DEADBEEF ba:B:c,-128,0,127 bb:B:C,0,127,255 bc:B:s,-32768,0,32767 bd:B:S,0,32768,65535 be:B:i,-2147483648,0,2147483647 bf:B:I,0,2147483648,4294967295 bg:B:f,2.71828,6.626e-34,2.9979e+09 NM:i:0 MD:Z:10 -ref1_grp2_p001 99 ref1 3 1 10M = 27 34 AGCTCGGTAC """""""""" RG:Z:grp2 BC:Z:TGCA H0:i:1 aa:A:A ab:A:Z fa:f:6.67e-11 za:Z:!"$%^&*() ha:H:CAFE NM:i:0 MD:Z:10 -ref1_grp1_p002 99 ref1 5 2 10M = 29 34 CTCGGTACCC ########## RG:Z:grp1 BC:Z:AATTCCGG H0:i:1 aa:A:a ab:A:z fa:f:4.3597e-18 za:Z:Another string ha:H:2000AD NM:i:0 MD:Z:10 -ref1_grp2_p002 99 ref1 7 3 10M = 31 34 CGGTACCCGG $$$$$$$$$$ fa:f:6.022e+23 RG:Z:grp2 NM:i:0 MD:Z:10 -ref1_grp1_p003 99 ref1 9 4 10M = 33 34 GTACCCGGGG %%%%%%%%%% fa:f:1.66e-27 RG:Z:grp1 NM:i:0 MD:Z:10 -ref1_grp2_p003 99 ref1 11 5 10M = 35 34 ACCCGGGGAT &&&&&&&&&& RG:Z:grp2 ia:i:4294967295 NM:i:0 MD:Z:10 -ref1_grp1_p004 99 ref1 13 6 10M = 37 34 CCGGGGATCC '''''''''' fa:f:1.38e-23 za:Z:xRG:Z:grp2 RG:Z:grp1 NM:i:0 MD:Z:10 -ref1_grp2_p004 99 ref1 15 7 10M = 39 34 GGGGATCCTC (((((((((( RG:Z:grp2 ia:i:-2147483648 NM:i:0 MD:Z:10 -ref1_grp1_p005 99 ref1 17 8 10M = 41 34 GGATCCTCTA )))))))))) RG:Z:grp1 ia:i:40000 NM:i:0 MD:Z:10 -ref1_grp2_p005 99 ref1 19 9 10M = 43 34 ATCCTCTAGA ********** RG:Z:grp2 ia:i:-1000 NM:i:0 MD:Z:10 +ref1_grp1_p001 99 ref1 1 0 10M = 25 34 CGAGCTCGGT !!!!!!!!!! RG:Z:grp1 BC:Z:ACGT H0:i:1 aa:A:! ab:A:~ fa:f:3.14159 za:Z:Hello world! ha:H:DEADBEEF ba:B:c,-128,0,127 bb:B:C,0,127,255 bc:B:s,-32768,0,32767 bd:B:S,0,32768,65535 be:B:i,-2147483648,0,2147483647 bf:B:I,0,2147483648,4294967295 bg:B:f,2.71828,6.626e-34,2.9979e+09 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp2_p001 99 ref1 3 1 10M = 27 34 AGCTCGGTAC """""""""" RG:Z:grp2 BC:Z:TGCA H0:i:1 aa:A:A ab:A:Z fa:f:6.67e-11 za:Z:!"$%^&*() ha:H:CAFE NM:i:0 MD:Z:10 an:Z:badger +ref1_grp1_p002 99 ref1 5 2 10M = 29 34 CTCGGTACCC ########## RG:Z:grp1 BC:Z:AATTCCGG H0:i:1 aa:A:a ab:A:z fa:f:4.3597e-18 za:Z:Another string ha:H:2000AD NM:i:0 MD:Z:10 an:Z:cat +ref1_grp2_p002 99 ref1 7 3 10M = 31 34 CGGTACCCGG $$$$$$$$$$ fa:f:6.022e+23 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp1_p003 99 ref1 9 4 10M = 33 34 GTACCCGGGG %%%%%%%%%% fa:f:1.66e-27 RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:cat +ref1_grp2_p003 99 ref1 11 5 10M = 35 34 ACCCGGGGAT &&&&&&&&&& RG:Z:grp2 ia:i:4294967295 NM:i:0 MD:Z:10 an:Z:dog +ref1_grp1_p004 99 ref1 13 6 10M = 37 34 CCGGGGATCC '''''''''' fa:f:1.38e-23 za:Z:xRG:Z:grp2 RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp2_p004 99 ref1 15 7 10M = 39 34 GGGGATCCTC (((((((((( RG:Z:grp2 ia:i:-2147483648 NM:i:0 MD:Z:10 an:Z:aardvark +ref1_grp1_p005 99 ref1 17 8 10M = 41 34 GGATCCTCTA )))))))))) RG:Z:grp1 ia:i:40000 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp2_p005 99 ref1 19 9 10M = 43 34 ATCCTCTAGA ********** RG:Z:grp2 ia:i:-1000 NM:i:0 MD:Z:10 an:Z:aardvark ref1_grp1_p006 99 ref1 21 10 10M = 45 34 CCTCTAGAGT ++++++++++ RG:Z:grp1 ia:i:255 NM:i:0 MD:Z:10 ref1_grp2_p006 99 ref1 23 11 10M = 47 34 TCTAGAGTCG ,,,,,,,,,, RG:Z:grp2 ia:i:-1 NM:i:0 MD:Z:10 -ref1_grp1_p001 147 ref1 25 12 10M = 1 -34 TAGAGTCGAC ---------- RG:Z:grp1 NM:i:0 MD:Z:10 -ref1_grp2_p001 147 ref1 27 13 10M = 3 -34 GAGTCGACCT .......... RG:Z:grp2 NM:i:0 MD:Z:10 -ref1_grp1_p002 147 ref1 29 14 10M = 5 -34 GTCGACCTGC ////////// RG:Z:grp1 NM:i:0 MD:Z:10 -ref1_grp2_p002 147 ref1 31 15 10M = 7 -34 CGACCTGCAG 0000000000 RG:Z:grp2 NM:i:0 MD:Z:10 -ref1_grp1_p003 147 ref1 33 16 10M = 9 -34 ACCTGCAGGC 1111111111 RG:Z:grp1 NM:i:0 MD:Z:10 -ref1_grp2_p003 147 ref1 35 17 10M = 11 -34 CTGCAGGCAT 2222222222 RG:Z:grp2 NM:i:0 MD:Z:10 -ref12_grp1_p001 97 ref1 36 50 10M ref2 2 0 TGCAGGCATG AAAAAAAAAA RG:Z:grp1 NM:i:0 MD:Z:10 -ref1_grp1_p004 147 ref1 37 18 10M = 13 -34 GCAGGCATGC 3333333333 RG:Z:grp1 NM:i:0 MD:Z:10 -ref1_grp2_p004 147 ref1 39 19 10M = 15 -34 AGGCATGCAA 4444444444 RG:Z:grp2 NM:i:0 MD:Z:10 -ref1_grp1_p005 147 ref1 41 20 10M = 17 -34 GCATGCAAGC 5555555555 RG:Z:grp1 NM:i:0 MD:Z:10 -ref1_grp2_p005 147 ref1 43 21 10M = 19 -34 ATGCAAGCTT 6666666666 RG:Z:grp2 NM:i:0 MD:Z:10 +ref1_grp1_p001 147 ref1 25 12 10M = 1 -34 TAGAGTCGAC ---------- RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp2_p001 147 ref1 27 13 10M = 3 -34 GAGTCGACCT .......... RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp1_p002 147 ref1 29 14 10M = 5 -34 GTCGACCTGC ////////// RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:cat +ref1_grp2_p002 147 ref1 31 15 10M = 7 -34 CGACCTGCAG 0000000000 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp1_p003 147 ref1 33 16 10M = 9 -34 ACCTGCAGGC 1111111111 RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:cat +ref1_grp2_p003 147 ref1 35 17 10M = 11 -34 CTGCAGGCAT 2222222222 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:dog +ref12_grp1_p001 97 ref1 36 50 10M ref2 2 0 TGCAGGCATG AAAAAAAAAA RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:cat +ref1_grp1_p004 147 ref1 37 18 10M = 13 -34 GCAGGCATGC 3333333333 RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp2_p004 147 ref1 39 19 10M = 15 -34 AGGCATGCAA 4444444444 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:aardvark +ref1_grp1_p005 147 ref1 41 20 10M = 17 -34 GCATGCAAGC 5555555555 RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:badger +ref1_grp2_p005 147 ref1 43 21 10M = 19 -34 ATGCAAGCTT 6666666666 RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:aardvark ref1_grp1_p006 147 ref1 45 22 10M = 21 -34 GCAAGCTTGA 7777777777 RG:Z:grp1 NM:i:0 MD:Z:10 -ref12_grp2_p001 97 ref1 46 50 10M ref2 12 0 CAAGCTTGAG AAAAAAAAAA RG:Z:grp2 NM:i:0 MD:Z:10 +ref12_grp2_p001 97 ref1 46 50 10M ref2 12 0 CAAGCTTGAG AAAAAAAAAA RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:cat ref1_grp2_p006 147 ref1 47 23 10M = 23 -34 AAGCTTGAGT 8888888888 RG:Z:grp2 NM:i:0 MD:Z:10 -ref2_grp3_p001 83 ref2 1 99 15M = 31 45 ATTCTATAGTGTCAC ~~~~~~~~~~~~~~~ NM:i:0 MD:Z:15 -ref12_grp1_p001 145 ref2 2 50 10M ref1 36 0 TTCTATAGTG BBBBBBBBBB RG:Z:grp1 NM:i:0 MD:Z:10 -ref12_grp2_p001 145 ref2 12 50 10M ref1 46 0 TCACCTAAAT BBBBBBBBBB RG:Z:grp2 NM:i:0 MD:Z:10 -ref2_grp3_p002 147 ref2 16 99 15M = 46 45 CTAAATAGCTTGGCG }}}}}}}}}}}}}}} NM:i:0 MD:Z:15 -ref2_grp3_p001 163 ref2 31 99 15M = 1 -45 CTGTTTCCTGTGTGA ||||||||||||||| NM:i:13 MD:Z:0T0A0A1C0A0T0G0G0T0C0A1A0G0 -ref2_grp3_p002 99 ref2 46 99 15M = 16 -45 CTGTTTCCTGTGTGA {{{{{{{{{{{{{{{ NM:i:0 MD:Z:15 -unaligned_grp3_p001 77 * 0 0 * * 0 0 CACTCGTTCATGACG 0123456789abcde -unaligned_grp3_p001 141 * 0 0 * * 0 0 GAAAGTGAGGAGGTG edcba9876543210 +ref2_grp3_p001 83 ref2 1 99 15M = 31 45 ATTCTATAGTGTCAC ~~~~~~~~~~~~~~~ NM:i:0 MD:Z:15 an:Z:cat +ref12_grp1_p001 145 ref2 2 50 10M ref1 36 0 TTCTATAGTG BBBBBBBBBB RG:Z:grp1 NM:i:0 MD:Z:10 an:Z:cat +ref12_grp2_p001 145 ref2 12 50 10M ref1 46 0 TCACCTAAAT BBBBBBBBBB RG:Z:grp2 NM:i:0 MD:Z:10 an:Z:cat +ref2_grp3_p002 147 ref2 16 99 15M = 46 45 CTAAATAGCTTGGCG }}}}}}}}}}}}}}} NM:i:0 MD:Z:15 RG:Z:grp3 an:Z:dog +ref2_grp3_p001 163 ref2 31 99 15M = 1 -45 CTGTTTCCTGTGTGA ||||||||||||||| NM:i:13 MD:Z:0T0A0A1C0A0T0G0G0T0C0A1A0G0 an:Z:cat +ref2_grp3_p002 99 ref2 46 99 15M = 16 -45 CTGTTTCCTGTGTGA {{{{{{{{{{{{{{{ NM:i:0 MD:Z:15 RG:Z:grp3 an:Z:dog +unaligned_grp3_p001 77 * 0 0 * * 0 0 CACTCGTTCATGACG 0123456789abcde an:Z:dog +unaligned_grp3_p001 141 * 0 0 * * 0 0 GAAAGTGAGGAGGTG edcba9876543210 an:Z:dog diff --git a/test/test.pl b/test/test.pl index 179390a3f..68b765015 100755 --- a/test/test.pl +++ b/test/test.pl @@ -3510,6 +3510,43 @@ sub test_split ignore_pg_header => 1, reorder_header => 1, cmd => "$$opts{bin}/samtools split $threads --output-fmt sam -u $$opts{path}/split/split.tmp.unk.sam -f $$opts{path}/split/split.tmp.\%!.\%. $$opts{path}/split/split.sam"); + + test_cmd($opts, + out=>"dat/empty.expected", + out_map => { + 'split/split.tmp.grp1.sam' => 'split/split.expected.grp1.sam', + 'split/split.tmp.grp2.sam' => 'split/split.expected.grp2.sam', + 'split/split.tmp.grp3.sam' => 'split/split.expected_d_RG.grp3.sam', + 'split/split.tmp.unk.sam' => 'split/split.expected_d_RG.unk.sam', + }, + ignore_pg_header => 1, + reorder_header => 1, + cmd => "$$opts{bin}/samtools split $threads --output-fmt sam -d RG -u $$opts{path}/split/split.tmp.unk.sam -f $$opts{path}/split/split.tmp.\%!.\%. $$opts{path}/split/split.sam"); + + test_cmd($opts, + out=>"dat/empty.expected", + out_map => { + 'split/split.tmp.aardvark.sam' => 'split/split.expected_d_an.aardvark.sam', + 'split/split.tmp.badger.sam' => 'split/split.expected_d_an.badger.sam', + 'split/split.tmp.cat.sam' => 'split/split.expected_d_an.cat.sam', + 'split/split.tmp.dog.sam' => 'split/split.expected_d_an.dog.sam', + 'split/split.tmp.unk.sam' => 'split/split.expected_d_an.unk.sam', + }, + ignore_pg_header => 1, + reorder_header => 1, + cmd => "$$opts{bin}/samtools split $threads --output-fmt sam -d an -u $$opts{path}/split/split.tmp.unk.sam -f $$opts{path}/split/split.tmp.\%!.\%. $$opts{path}/split/split.sam"); + + test_cmd($opts, + out=>"dat/empty.expected", + out_map => { + 'split/split.tmp.badger.sam' => 'split/split.expected_d_an.badger.sam', + 'split/split.tmp.cat.sam' => 'split/split.expected_d_an.cat.sam', + 'split/split.tmp.dog.sam' => 'split/split.expected_d_an.dog.sam', + 'split/split.tmp.unk.sam' => 'split/split.expected_d_an_M_3.unk.sam', + }, + ignore_pg_header => 1, + reorder_header => 1, + cmd => "$$opts{bin}/samtools split $threads --output-fmt sam -d an -M 3 -u $$opts{path}/split/split.tmp.unk.sam -f $$opts{path}/split/split.tmp.\%!.\%. $$opts{path}/split/split.sam"); } sub test_ampliconclip From 528e1b243c59493c7341328e2851583c49c3b7af Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 14 Nov 2023 10:11:53 +0000 Subject: [PATCH 20/26] Fix various --write-index issues Call auto_index() after writing the file header on files made by prep_sam_file(). Fixes failure to make index files when using the -d option on non-RG tags. Write an index on the unaccounted file, which was previously missed out. Ensure index file names are freed on error. Fix crash on error in prep_sam_file() due to passing a NULL pointer to sam_close(). --- bam_split.c | 50 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/bam_split.c b/bam_split.c index c6eaaff61..e0f730a33 100644 --- a/bam_split.c +++ b/bam_split.c @@ -64,6 +64,7 @@ struct state { sam_hdr_t* merged_input_header; samFile* unaccounted_file; sam_hdr_t* unaccounted_header; + char *unaccounted_idx_fn; size_t output_count; char **tag_vals; char **index_file_name; @@ -405,6 +406,12 @@ static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state, if (state->p.pool) hts_set_opt(new_sam_file, HTS_OPT_THREAD_POOL, &state->p); + if (sam_hdr_write(new_sam_file, new_hdr) != 0) { + print_error_errno("split", "Couldn't write header to \"%s\"", + new_file_name); + goto fail; + } + if (state->write_index) { new_idx_fn = auto_index(new_sam_file, new_file_name, new_hdr); if (!new_idx_fn) { @@ -413,11 +420,6 @@ static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state, } } - if (sam_hdr_write(new_sam_file, new_hdr) != 0) { - print_error_errno("split", "Couldn't write header to \"%s\"", - new_file_name); - goto fail; - } int ret = -1; i = kh_put_c2i(state->tag_val_hash, tag_val, &ret); if (ret < 0) { @@ -440,8 +442,10 @@ static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state, free(new_file_name); free(tag_val); free(new_idx_fn); - sam_hdr_destroy(new_hdr); - sam_close(new_sam_file); + if (new_hdr) + sam_hdr_destroy(new_hdr); + if (new_sam_file) + sam_close(new_sam_file); return kh_end(state->tag_val_hash); } @@ -633,9 +637,22 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) static bool split(state_t* state, parsed_opts_t *opts, char *arg_list) { int is_rg = !opts->tag || strcmp(opts->tag, "RG") == 0; - if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) { - print_error_errno("split", "Could not write output file header"); - return false; + if (state->unaccounted_file) { + if (sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) { + print_error_errno("split", "Could not write output file header"); + return false; + } + if (opts->ga.write_index) { + state->unaccounted_idx_fn = auto_index(state->unaccounted_file, + opts->unaccounted_name, + state->unaccounted_header); + if (!state->unaccounted_idx_fn) { + print_error_errno("split", + "Creating index file for file \"%s\" failed", + opts->unaccounted_name); + return false; + } + } } bam1_t* file_read = bam_init1(); // Read the first record @@ -738,10 +755,17 @@ static bool split(state_t* state, parsed_opts_t *opts, char *arg_list) size_t i; for (i = 0; i < state->output_count; i++) { if (sam_idx_save(state->output_file[i]) < 0) { - print_error_errno("split", "writing index failed"); + print_error_errno("split", "writing index \"%s\" failed", + state->index_file_name[i]); + return false; + } + } + if (state->unaccounted_file) { + if (sam_idx_save(state->unaccounted_file) < 0) { + print_error_errno("split", "writing index \"%s\" failed", + state->unaccounted_idx_fn); return false; } - free(state->index_file_name[i]); } } @@ -776,6 +800,7 @@ static int cleanup_state(state_t* status, bool check_close) } if (status->tag_vals) free(status->tag_vals[i]); if (status->output_file_name) free(status->output_file_name[i]); + if (status->index_file_name[i]) free(status->index_file_name[i]); } if (status->merged_input_header) sam_hdr_destroy(status->merged_input_header); @@ -783,6 +808,7 @@ static int cleanup_state(state_t* status, bool check_close) free(status->output_file); free(status->output_file_name); free(status->index_file_name); + free(status->unaccounted_idx_fn); kh_destroy_c2i(status->tag_val_hash); free(status->tag_vals); From 4c5285fa0a1109c32fd1e70c97782f2511ba09c1 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 24 Nov 2023 09:51:25 +0000 Subject: [PATCH 21/26] Fix a couple minor samtools view memory problems. If we fail to decode a BAM file due to file truncation, we attempted to access iter->curr_tid in the diagnostic error just after it was freed, potentially giving incorrect errors. We also didn't free the index on exit, which is harmless but makes finding real problems harder. --- sam_view.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sam_view.c b/sam_view.c index 2a1439acb..877f84a13 100644 --- a/sam_view.c +++ b/sam_view.c @@ -777,13 +777,13 @@ static int multi_region_view(samview_settings_t *conf, hts_itr_multi_t *iter) while ((result = sam_itr_multi_next(conf->in, iter, b)) >= 0) { if (process_one_record(conf, b, &write_error) < 0) break; } - hts_itr_multi_destroy(iter); bam_destroy1(b); if (result < -1) { print_error("view", "retrieval of region #%d failed", iter->curr_tid); - return 1; + write_error = 1; } + hts_itr_multi_destroy(iter); return write_error; } @@ -1376,8 +1376,6 @@ int main_samview(int argc, char *argv[]) } } - if ( settings.hts_idx ) hts_idx_destroy(settings.hts_idx); - if (ga.write_index) { if (sam_idx_save(settings.out) < 0) { print_error_errno("view", "writing index failed"); @@ -1390,6 +1388,8 @@ int main_samview(int argc, char *argv[]) } view_end: + if ( settings.hts_idx ) hts_idx_destroy(settings.hts_idx); + if (settings.is_count && ret == 0) { if (fprintf(settings.fn_out? fp_out : stdout, "%" PRId64 "\n", settings.count) < 0) { if (settings.fn_out) print_error_errno("view", "writing to \"%s\" failed", settings.fn_out); From 5e0ae52bb28601d10a4f93c388c69b96e81365db Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 14 Nov 2023 15:58:56 +0000 Subject: [PATCH 22/26] News updates for Winter 2023 (1.19) --- NEWS | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/NEWS b/NEWS index d7d6f20f3..c961fd9be 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,69 @@ Release a.b ----------- +New work and changes: + +* Samtools coverage: add a new --plot-depth option to draw depth (of + coverage) rather than the percentage of bases covered. + (PR#1910. Thanks to Pierre Lindenbaum) + +* Samtools merge / sort: add a lexicographical name-sort option via the -N + option. The "natural" alpha-numeric sort is still available via -n. + (PR#1900, fixes #1500. Reported by Steve Huang) + +* Samtools view: add -N ^NAME_FILE and -R ^RG_FILE options. The standard -N + and -R options only output reads matching a specified list of names or + read-groups. With a caret (^) prefix these may be negated to only output + read not matching the specified files. + (PR#1896, fixes #1895. Suggested by Feng Tian) + +* Cope with Htslib's change to no longer close stdout on is hts_close is + used with the output file being stdout. + Htslib companion PR is samtools/htslib#1665. + (PR#1909. Thanks to John Marshall) + +* Plot-bamstats: add a new plot of the read lengths ("RL") from samtools + stats output. + (PR#1922, fixes #1824. Thanks to @erboone, suggested by Martin Pollard) + +* Samtools split: support splitting files based on the contents of + auxiliary tags. Also adds a -M option to limit the number of files split + can make, to avoid accidental resource over-allocation, and fixes some + issues with --write-index. + (PR#1222, PR#1933, fixes #1758. Thanks to Valeriu Ohan, suggested by + Scott Norton) + +Bug Fixes + +* Samtools stats: empty barcode tags are now treated as having no barcode. + (PR#1929, fixes #1926. Reported by Jukka Matilainen) + +* Samtools cat: add support for non-seekable streams. The file format + detection blocked pipes from working before, but now files may be + non-seekable such as stdin or a pipe. + (PR#1930, fixes #1731. Reported by Julian Hess) + + +Documentation + +* Samtools mpileup: add more usage examples to the man page. + (PR#1913, fixes #1801) + +* Samtools fastq: explicitly document the order that filters apply. + (PR#1907. fixes ?) + +* Samtools merge: fix example output to use an uppercase RG PL field. + (PR#1917. Thanks to John Marshall. Reported by Michael Macias) + +* Add hclen SAM filter documentation. + (PR#1902. See also samtools/htslib#1660) + +* Samtools consensus: remove the -5 option from documentation. This option + was renamed before the consensus subcommand was merged, but accidentally + left in the man page. + (PR#1924) + + Release 1.18 (25th July 2023) ----------------------------- From a55b8a9a4bf97b4f28dc57788ca7738627b7a825 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 15 Nov 2023 09:52:32 +0000 Subject: [PATCH 23/26] Renamed NEWS to NEWS.md so it's formatted nicely on github. It's pretty much already in Markdown anyway, but I edited a few quoting issues in pre-1.0 releases and changed PR-hash to PR-space-hash to get auto-formatting working for release notes. The format is still essentially plain text and readable without needing to have multiple copies for different media. --- NEWS => NEWS.md | 360 ++++++++++++++++++++++++------------------------ 1 file changed, 180 insertions(+), 180 deletions(-) rename NEWS => NEWS.md (90%) diff --git a/NEWS b/NEWS.md similarity index 90% rename from NEWS rename to NEWS.md index c961fd9be..1527fbb00 100644 --- a/NEWS +++ b/NEWS.md @@ -5,63 +5,63 @@ New work and changes: * Samtools coverage: add a new --plot-depth option to draw depth (of coverage) rather than the percentage of bases covered. - (PR#1910. Thanks to Pierre Lindenbaum) + (PR #1910. Thanks to Pierre Lindenbaum) * Samtools merge / sort: add a lexicographical name-sort option via the -N option. The "natural" alpha-numeric sort is still available via -n. - (PR#1900, fixes #1500. Reported by Steve Huang) + (PR #1900, fixes #1500. Reported by Steve Huang) * Samtools view: add -N ^NAME_FILE and -R ^RG_FILE options. The standard -N and -R options only output reads matching a specified list of names or read-groups. With a caret (^) prefix these may be negated to only output read not matching the specified files. - (PR#1896, fixes #1895. Suggested by Feng Tian) + (PR #1896, fixes #1895. Suggested by Feng Tian) * Cope with Htslib's change to no longer close stdout on is hts_close is used with the output file being stdout. Htslib companion PR is samtools/htslib#1665. - (PR#1909. Thanks to John Marshall) + (PR #1909. Thanks to John Marshall) * Plot-bamstats: add a new plot of the read lengths ("RL") from samtools stats output. - (PR#1922, fixes #1824. Thanks to @erboone, suggested by Martin Pollard) + (PR #1922, fixes #1824. Thanks to @erboone, suggested by Martin Pollard) * Samtools split: support splitting files based on the contents of auxiliary tags. Also adds a -M option to limit the number of files split can make, to avoid accidental resource over-allocation, and fixes some issues with --write-index. - (PR#1222, PR#1933, fixes #1758. Thanks to Valeriu Ohan, suggested by + (PR #1222, PR #1933, fixes #1758. Thanks to Valeriu Ohan, suggested by Scott Norton) Bug Fixes * Samtools stats: empty barcode tags are now treated as having no barcode. - (PR#1929, fixes #1926. Reported by Jukka Matilainen) + (PR #1929, fixes #1926. Reported by Jukka Matilainen) * Samtools cat: add support for non-seekable streams. The file format detection blocked pipes from working before, but now files may be non-seekable such as stdin or a pipe. - (PR#1930, fixes #1731. Reported by Julian Hess) + (PR #1930, fixes #1731. Reported by Julian Hess) Documentation * Samtools mpileup: add more usage examples to the man page. - (PR#1913, fixes #1801) + (PR #1913, fixes #1801) * Samtools fastq: explicitly document the order that filters apply. - (PR#1907. fixes ?) + (PR #1907) * Samtools merge: fix example output to use an uppercase RG PL field. - (PR#1917. Thanks to John Marshall. Reported by Michael Macias) + (PR #1917. Thanks to John Marshall. Reported by Michael Macias) * Add hclen SAM filter documentation. - (PR#1902. See also samtools/htslib#1660) + (PR #1902. See also samtools/htslib#1660) * Samtools consensus: remove the -5 option from documentation. This option was renamed before the consensus subcommand was merged, but accidentally left in the man page. - (PR#1924) + (PR #1924) Release 1.18 (25th July 2023) @@ -73,51 +73,51 @@ New work and changes: minimiser sort to arrange the minimiser values in the same order as they occur in the reference genome. This is acts as an extremely crude and simplistic read aligner that can be used to boost read compression. - (PR#1818) + (PR #1818) * Add a --duplicate-count option to markdup. Adds the number of duplicates (including itself) to the original read in a 'dc' tag. - (PR#1816. Thanks to wulj2) + (PR #1816. Thanks to wulj2) * Make calmd handle unaligned data or empty files without throwing an error. This is to make pipelines work more smoothly. A warning will still be issued. - (PR#1841, fixes #1839. Reported by Filipe G. Vieira) + (PR #1841, fixes #1839. Reported by Filipe G. Vieira) * Consistent, more comprehensive flag filtering for fasta/fastq. Added --rf/--incl[ude]-flags and long options for -F (--excl[ude]-flags and -f (--require-flags). - (PR#1842. Thanks to Devang Thakkar) + (PR #1842. Thanks to Devang Thakkar) * Apply fastq --input-fmt-option settings. Previously any options specified were not being applied to the input file. - (PR#1855. Thanks to John Marshall) + (PR #1855. Thanks to John Marshall) * Add fastq -d TAG[:VAL] check. This mirrors view -d and will only output alignments that match TAG (and VAL if specified). - (PR#1863, fixes #1854. Requested by Rasmus Kirkegaard) + (PR #1863, fixes #1854. Requested by Rasmus Kirkegaard) * Extend import --order TAG to --order TAG:length. If length is specified, the tag format goes from integer to a 0-padded string format. This is a workaround for BAM and CRAM that cannot encode an order tag of over 4 billion records. - (PR#1850, fixes #1847. Reported by Feng Tian) + (PR #1850, fixes #1847. Reported by Feng Tian) * New -aa mode for consensus. This works like the -aa option in depth and mpileup. The single 'a' reports all bases in contigs covered by alignments. Double 'aa' (or '-a -a') reports Ns even for the references with no alignments against them. - (PR#1851, fixes #1849. Requested by Tim Fennell) + (PR #1851, fixes #1849. Requested by Tim Fennell) * Add long option support to samtools index. - (PR#1872, fixes #1869. Reported by Jason Bacon) + (PR #1872, fixes #1869. Reported by Jason Bacon) * Be consistent with rounding of "average length" in samtools stats. - (PR#1876, fixes #1867. Reported by Jelinek-J) + (PR #1876, fixes #1867. Reported by Jelinek-J) * Add option to ampliconclip that marks reads as unmapped when they do not have enough aligned bases left after clipping. Default is to unmap reads with zero aligned bases. - (PR#1865, fixes #1856. Requested by ces) + (PR #1865, fixes #1856. Requested by ces) Bug Fixes: @@ -128,45 +128,45 @@ Bug Fixes: although the change in 1.11 was bug-fixing multi-threaded index queries. This bug did not affect index building. There is no need to reindex your CRAM files. - (PR#samtools/htslib#1574, PR#samtools/htslib#1640. Fixes + (PR #samtools/htslib#1574, PR #samtools/htslib#1640. Fixes #samtools/htslib#1569, #samtools/htslib#1639, #1808, #1819. Reported by xuxif, Jens Reeder and Jared Simpson) * Fix a sort -M bug (regression) when merging sub-blocks. Data was valid but in a poor order for compression. - (PR#1812) + (PR #1812) * Fix bug in split output format. Now SAM and CRAM format can chosen as well as BAM. Also a documentation change, see below. - (PR#1821) + (PR #1821) * Add error checking to view -e filter expression code. Invalid expressions were not returning an error code. - (PR#1833, fixes #1829. Reported by Steve Huang) + (PR #1833, fixes #1829. Reported by Steve Huang) * Fix reheader CRAM output version. Sets the correct CRAM output version for non-3.0 CRAMs. - (PR#1868, fixes #1866. Reported by John Marshall) + (PR #1868, fixes #1866. Reported by John Marshall) Documentation: * Expand the default filtering information on the mpileup man page. - (PR#1802, fixes #1801. Reported by gevro) + (PR #1802, fixes #1801. Reported by gevro) * Add an explanation of the default behaviour of split files on generating a file for reads with missing or unrecognised RG tags. Also a small bug fix, see above. - (PR#1821, fixes #1817. Reported by Steve Huang) + (PR #1821, fixes #1817. Reported by Steve Huang) * In the INSTALL instructions, switched back to openssl for Alpine. This matches the current Alpine Linux practice. - (PR#1837, see htslib#1591. Reported by John Marshall) + (PR #1837, see htslib#1591. Reported by John Marshall) * Fix various typos caught by lintian parsers. - (PR#1877. Thanks to Étienne Mollier) + (PR #1877. Thanks to Étienne Mollier) * Document consensus --qual-calibration option. - (PR#1880, fixes #1879. Reported by John Marshall) + (PR #1880, fixes #1879. Reported by John Marshall) * Updated the page about samtools duplicate marking with more detail at www.htslib.org/algorithms/duplicate.html @@ -174,7 +174,7 @@ Documentation: Non user-visible changes and build improvements: * Removed a redundant line that caused a warning in gcc-13. - (PR#1838) + (PR #1838) Release 1.17 (21st February 2023) @@ -187,115 +187,115 @@ New work and changes: reverse direction, sequence and its quality values are reversed and complemented and the reverse flag is reset. Supplementary and secondary alignment data are discarded. - (PR#1767, implements #1682. Requested by dkj) + (PR #1767, implements #1682. Requested by dkj) * New samtools cram-size subcommand. It writes out metrics about a CRAM file reporting aggregate sizes per block "Content ID" fields, the data-series contained within them, and the compression methods used. - (PR#1777) + (PR #1777) * Added a --sanitize option to fixmate and view. This performs some sanity checks on the state of SAM record fields, fixing up common mistakes made by aligners. - (PR#1698) + (PR #1698) * Permit 1 thread with samtools view. All other subcommands already allow this and it does provide a modest speed increase. - (PR#1755, fixes #1743. Reported by Goran Vinterhalter) + (PR #1755, fixes #1743. Reported by Goran Vinterhalter) * Add CRAM_OPT_REQUIRED_FIELDS option for view -c. This is a big speed up for CRAM (maybe 5-fold), but it depends on which filtering options are being used. - (PR#1776, fixes #1775. Reported by Chang Y) + (PR #1776, fixes #1775. Reported by Chang Y) * New filtering options in samtools depth. The new --excl-flags option is a synonym for -G, with --incl-flags and --require-flags added to match view logic. - (PR#1718, fixes #1702. Reported by Dario Beraldi) + (PR #1718, fixes #1702. Reported by Dario Beraldi) * Speed up calmd's slow handling of non-position-sorted data by adding caching. This uses more memory but is only activated when needed. - (PR#1723, fixes #1595. Reported by lxwgcool) + (PR #1723, fixes #1595. Reported by lxwgcool) * Improve samtools consensus for platforms with instrument specific profiles, considerably helping for data with very different indel error models and providing base quality recalibration tables. On PacBio HiFi, ONT and Ultima Genomics consensus qualities are also redistributed within homopolymers and the likelihood of nearby indel errors is raised. - (PR#1721, PR#1733) + (PR #1721, PR #1733) * Consensus --mark-ins option. This permits he consensus output to include a markup indicating the next base is an insertion. This is necessary as we need a way of outputting both consensus and also how that consensus marries up with the reference coordinates. - (PR#1746) + (PR #1746) * Make faidx/fqidx output line length default to the input line length. - (PR#1738, fixes #1734. Reported by John Marshall) + (PR #1738, fixes #1734. Reported by John Marshall) * Speed up optical duplicate checking where data has a lot of duplicates compared to non-duplicates. - (PR#1779, fixes #1771. Reported by Poshi) + (PR #1779, fixes #1771. Reported by Poshi) * For collate use TMPDIR environment variable, when looking for a temporary folder. - (PR#1782, based on PR#1178 and fixes #1172. Reported by Martin Pollard) + (PR #1782, based on PR #1178 and fixes #1172. Reported by Martin Pollard) Bug Fixes: * Fix stats breakage on long deletions when given a reference. - (PR#1712, fixes #1707. Reported by John Didion) + (PR #1712, fixes #1707. Reported by John Didion) * In ampliconclip, stop hard clipping from wrongly removing entire reads. - (PR#1722, fixes #1717. Reported by Kevin Xu) + (PR #1722, fixes #1717. Reported by Kevin Xu) * Fix bug in ampliconstats where references mentioned in the input file headers but not in the bed file would cause it to complain that the SAM headers were inconsistent. - (PR#1727, fixes #1650. Reported by jPontix) + (PR #1727, fixes #1650. Reported by jPontix) * Fixed SEGV in samtools collate when no filename given. - (PR#1724) + (PR #1724) * Changed the default UMI barcode regex in markdup. The old regex was too restrictive. This version will at least allow the default read name UMI as given in the Illumina example documentation. - (PR#1737, fixes #1730. Reported by yloemie) + (PR #1737, fixes #1730. Reported by yloemie) * Fix samtools consensus buffer overrun with MD:Z handling. - (PR#1745, fixes #1744. Reported by trilisser) + (PR #1745, fixes #1744. Reported by trilisser) * Fix a buffer read-overflow in mpileup and tview on sequences with seq "*". - (PR#1747) + (PR #1747) * Fix view -X command line parsing that was broken in 1.15. - (PR#1772, fixes #1720. Reported by Francisco Rodríguez-Algarra + (PR #1772, fixes #1720. Reported by Francisco Rodríguez-Algarra and Miguel Machado) * Stop samtools view -d from reporting meaningless system errors when tag validation fails. - (PR#1796) + (PR #1796) Documentation: * Add a description of the samtools tview display layout to the man page. Documents . vs , and upper vs lowercase. Adds a -s sample example, and documents the -w option. - (PR#1765, fixes #1759. Reported by Lucas Ferreira da Silva) + (PR #1765, fixes #1759. Reported by Lucas Ferreira da Silva) * Clarify intention of samtools fasta/q in man page and soft vs hard clipping. - (PR#1794, fixes #1792. Reported by Ryan Lorig-Roach) + (PR #1794, fixes #1792. Reported by Ryan Lorig-Roach) * Minor fix to wording of mpileup --rf usage and man page. - (PR#1795, fixes #1791. Reported by Luka Pavageau) + (PR #1795, fixes #1791. Reported by Luka Pavageau) Non user-visible changes and build improvements: * Use POSIX grep in testing as egrep and fgrep are considered obsolete. - (PR#1726, thanks to David Seifert) + (PR #1726, thanks to David Seifert) * Switch MacOS CI tests to an ARM-based image. - (PR#1770) + (PR #1770) Release 1.16.1 (2nd September 2022) @@ -306,11 +306,11 @@ Bug fixes: * Fixed a bug with the template-coordinate sort which caused incorrect ordering when using threads, or processing large files that don't fit completely in memory. - (PR#1703, thanks to Nils Homer) + (PR #1703, thanks to Nils Homer) * Fixed a crash that occurred when trying to use `samtools merge` in template-coordinate mode. - (PR#1705, thanks to Nils Homer) + (PR #1705, thanks to Nils Homer) Release 1.16 (18th August 2022) ------------------------------- @@ -319,90 +319,90 @@ New work and changes: * samtools reference command added. This subcommand extracts the embedded reference out of a CRAM file. - (PR#1649, addresses #723. Requested by Torsten Seemann) + (PR #1649, addresses #723. Requested by Torsten Seemann) * samtools import now adds grouped by query-name to the header. - (PR#1633, thanks to Nils Homer) + (PR #1633, thanks to Nils Homer) * Made samtools view read error messages more generic. Former error message would claim that there was a "truncated file or corrupt BAM index file" with no real justification. Also reset errno in stream_view which could lead to confusing error messages. - (PR#1645, addresses some of the issues in #1640. Reported by Jian-Guo Zhou) + (PR #1645, addresses some of the issues in #1640. Reported by Jian-Guo Zhou) * Make samtools view -p also clear mqual, tlen and cigar. - (PR#1647, fixes #1606. Reported by eboyden) + (PR #1647, fixes #1606. Reported by eboyden) * Add bedcov option -c to report read count. - (PR#1644, fixes #1629. Reported by Natchaphon Rajudom) + (PR #1644, fixes #1629. Reported by Natchaphon Rajudom) * Add UMI/barcode handling to samtools markdup. - (PR#1630, fixes #1358 and #1514. Reported by Gert Hulselmans and Poshi) + (PR #1630, fixes #1358 and #1514. Reported by Gert Hulselmans and Poshi) * Add a new template coordinate sort order to samtools sort and samtools merge. This is useful when working with unique molecular identifiers (UMIs). - (PR#1605, fixes #1591. Thanks to Nils Homer) + (PR #1605, fixes #1591. Thanks to Nils Homer) * Rename mpileup --ignore-overlaps to --ignore-overlaps-removal or --disable-overlap-removal. The previous name was ambiguous and was often read as an option to enable removal of overlapping bases, while in reality this is on by default and the option turns off the ability to remove overlapping bases. - (PR#1666, fixes #1663. Reported by yangdingyangding) + (PR #1666, fixes #1663. Reported by yangdingyangding) * The dict command can now read BWA's .alt file and add AH:* tags indicating reference sequences that represent alternate loci. - (PR#1676. Thanks to John Marshall) + (PR #1676. Thanks to John Marshall) * The "samtools index" command can now accept multiple alignment filenames with the new -M option, and will index each of them separately. (Specifying the output index filename via out.index or the new -o option is currently only applicable when there is only one alignment file to be indexed.) - (PR#1674. Reported by Abigail Ramsøe and Nicola Romanò. + (PR #1674. Reported by Abigail Ramsøe and Nicola Romanò. Thanks to John Marshall) * Allow samtools fastq -T "*". This allows all tags from SAM records to be written to fastq headers. This is a counterpart to samtools import -T "*". - (PR#1679. Thanks to cjw85) + (PR #1679. Thanks to cjw85) Bug Fixes: * Re-enable --reference option for samtools depth. The reference is not used but this makes the command line usage compatible with older releases. - (PR#1646, fixes #1643. Reported by Randy Harr) + (PR #1646, fixes #1643. Reported by Randy Harr) * Fix regex coordinate bug in samtools markdup. - (PR#1657, fixes #1642. Reported by Randy Harr) + (PR #1657, fixes #1642. Reported by Randy Harr) * Fix divide by zero in plot-bamstats -m, on unmapped data. - (PR#1678, fixes #1675. Thanks to Shane McCarthy) + (PR #1678, fixes #1675. Thanks to Shane McCarthy) * Fix missing RG headers when using samtools merge -r. - (PR#1683, addresses htslib#1479. Reported by Alex Leonard) + (PR #1683, addresses htslib#1479. Reported by Alex Leonard) * Fix a possible unaligned access in samtools reference. - (PR#1696) + (PR #1696) Documentation: * Add documentation on CRAM compression profiles and some of the newer options that appear in CRAM 3.1 and above. - (PR#1659, fixes #1656. Reported by Matthias De Smet) + (PR #1659, fixes #1656. Reported by Matthias De Smet) * Add "sclen" filter expression keyword documentation. - (PR#1661, see also htslib#1441) + (PR #1661, see also htslib#1441) * Extend FILTER EXPRESSION man page section to match the changes made in HTSlib. - (PR#1687, samtools/htslib#1478) + (PR #1687, samtools/htslib#1478) Non user-visible changes and build improvements: * Ensure generated test files are ignored (by git) and cleaned (by make testclean) - (PR#1692, Thanks to John Marshall) + (PR #1692, Thanks to John Marshall) Release 1.15.1 (7th April 2022) ------------------------------- @@ -767,7 +767,7 @@ Release 1.12 (17th March 2021) * samtools markdup now benefits from an increase in performance in the situation when a single read has tens or hundreds of thousands of duplicates. - Thanks to `@denriquez` for reporting the issue. (#1345; fixed #1325) + Thanks to @denriquez for reporting the issue. (#1345; fixed #1325) * The documentation for samtools ampliconstats has been added to the samtools man page. (#1351) @@ -793,7 +793,7 @@ Release 1.12 (17th March 2021) (#1343) * The documentation for `samtools depth -s` has been improved. - Thanks to `@wulj2`. (#1355) + Thanks to @wulj2. (#1355) * Fixed a `samtools merge` segmentation fault when it failed to merge header `@PG` records. Thanks to John Marshall. (#1394; reported by @@ -969,10 +969,10 @@ Changes affecting the whole of samtools, or multiple sub-commands: for invalid headers, it is possible that some illegal files will now be rejected when they would have been allowed by earlier versions. (#998) - Examples of problems that will now be rejected include @SQ lines with - no SN: tag, and @RG or @PG lines with no ID: tag. + Examples of problems that will now be rejected include `@SQ` lines with + no SN: tag, and `@RG` or `@PG` lines with no ID: tag. - * samtools sub-commands will now add '@PG' header lines to output sam/bam/cram + * samtools sub-commands will now add `@PG` header lines to output sam/bam/cram files. To disable this, use the '--no-PG' option. (#1087; #1097) * samtools now supports alignment records with reference positions greater @@ -1152,7 +1152,7 @@ Changes affecting specific sub-commands: * samtools quickcheck: - - Add unmapped (-u) option, which disables the check for @SQ lines in + - Add unmapped (-u) option, which disables the check for `@SQ` lines in the header. (#920, thanks to Shane McCarthy) * samtools reheader: @@ -1176,7 +1176,7 @@ Changes affecting specific sub-commands: header file to be specified in its own option. (#961) - Fixed bug where samtools split would crash if it read a SAM header that - contained an @RG line with no ID tag. (#954, reported by @blue-bird1) + contained an `@RG` line with no ID tag. (#954, reported by @blue-bird1) * samtools stats: @@ -1399,7 +1399,7 @@ Release 1.7 (26th January 2018) * samtools bedcov will now ignore BED comment and header lines (#571; thanks to Daniel Baker). -* samtools collate now updates the @HD SO: and GO: tags, and sort will +* samtools collate now updates the `@HD` SO: and GO: tags, and sort will remove a GO: tag if present. (#757; reported by Imran Haque). * Bug-fixes: @@ -1466,7 +1466,7 @@ Release 1.5 [Solstice Release] (21st June 2017) (#675; thanks to Patrick Marks of 10xgenomics). Release 1.4.1 (8th May 2017) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +----------------------------- * Added options to fastq to create fastq files from BC (or other) tags. @@ -1491,13 +1491,13 @@ Release 1.4.1 (8th May 2017) Release 1.4 (13 March 2017) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +---------------------------- Noteworthy changes in samtools: * Fixed Issue #345 - out-by-one error in insert-size in samtools stats -* bam_split now add a @PG header to the bam file +* bam_split now add a `@PG` header to the bam file * Added mate cigar tag support to fixmate @@ -1529,7 +1529,7 @@ Noteworthy changes in samtools: Beta Release 1.3.1 (22 April 2016) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +----------------------------------- Noteworthy changes in samtools: @@ -1554,20 +1554,20 @@ Noteworthy changes in samtools: field (#517). * The merge and sort commands now handle (unusual) BAM files that have no - textual @SQ headers (#548, #550). + textual `@SQ` headers (#548, #550). * Sorting files containing @CO headers no longer duplicates the comment headers, which previously happened on large sorts for which temporary files were needed (#563). -* The rmdup and view -l commands no longer crash on @RG headers that do not +* The rmdup and view -l commands no longer crash on `@RG` headers that do not have a LB field (#538). * Fixed miscellaneous issues #128, #130, #131, #489, and #514. Beta Release 1.3 (15 December 2015) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------ Noteworthy changes in samtools: @@ -1602,7 +1602,7 @@ Noteworthy changes in samtools: Previously it listened to $(LDLIBS) instead; if you were overriding that, you should now override LIBS rather than LDLIBS. -* A new addreplacerg command that adds or alters @RG headers and RG:Z +* A new addreplacerg command that adds or alters `@RG` headers and RG:Z record tags has been added. * The rmdup command no longer immediately aborts (previously it always @@ -1663,7 +1663,7 @@ Noteworthy changes in samtools: Beta Release 1.2 (2 February 2015) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +----------------------------------- Noteworthy changes in samtools: @@ -1679,7 +1679,7 @@ Noteworthy changes in samtools: Beta Release 1.1 (19 September, 2014) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------- Notable changes in samtools: @@ -1694,7 +1694,7 @@ Notable changes in samtools: Beta Release 1.0 (15 August, 2014) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +---------------------------------- First release of HTSlib-based samtools. @@ -1707,13 +1707,13 @@ superseded by BGZF/bgzip and have been removed from samtools. Beta Release 0.1.20 (15 August, 2014) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------- Final release of standalone samtools. Beta Release 0.1.19 (15 March, 2013) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------ Notable changes in samtools and bcftools: @@ -1743,7 +1743,7 @@ http://github.com/samtools/samtools/commits/master Beta Release 0.1.18 (2 September, 2011) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +--------------------------------------- Notable changes in samtools: @@ -1770,14 +1770,14 @@ Notable changes in bcftools: Beta Release 0.1.17 (6 July, 2011) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +---------------------------------- -With the maturity of `mpileup' and the lack of update in the `pileup' command, -the `pileup' command is now formally dropped. Most of the pileup functionality, +With the maturity of `mpileup` and the lack of update in the `pileup` command, +the `pileup` command is now formally dropped. Most of the pileup functionality, such as outputting mapping quality and read positions, have been added -`mpileup'. +`mpileup`. -Since this release, `bcftools view' is able to perform contrast SNP calling +Since this release, `bcftools view` is able to perform contrast SNP calling (option -T) for discovering de novo and/or somatic mutations between a pair of samples or in a family trio. Potential mutations are scored by a log likelihood ratio, which is very simple in math, but should be comparable to more @@ -1795,7 +1795,7 @@ Other notable changes in samtools: * Fixed an issue where mpileup does not apply BAQ for the first few reads when a region is specified. - * Fixed an issue where `faidx' does not work with FASTA files with long lines. + * Fixed an issue where `faidx` does not work with FASTA files with long lines. * Bugfix: wrong SP genotype information in the BCF output. @@ -1814,14 +1814,14 @@ Other notable changes in bcftools: Beta Release 0.1.16 (21 April, 2011) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------ Notable changes in samtools: - * Support the new SAM/BAM type `B' in the latest SAM spec v1.4. + * Support the new SAM/BAM type `B` in the latest SAM spec v1.4. - * When the output file of `samtools merge' exists, do not overwrite it unless - a new command-line option `-f' is applied. + * When the output file of `samtools merge` exists, do not overwrite it unless + a new command-line option `-f` is applied. * Bugfix: BED support is not working when the input BED is not sorted. @@ -1839,11 +1839,11 @@ Notable changes in bcftools: * Use Brent's method to estimate the site allele frequency when EM converges slowly. The resulting ML estimate of allele frequnecy is more accurate. - * Added the `ldpair' command, which computes r^2 between SNP pairs given in + * Added the `ldpair` command, which computes r^2 between SNP pairs given in an input file. -Also, the `pileup' command, which has been deprecated by `mpileup' since -version 0.1.10, will be dropped in the next release. The old `pileup' command +Also, the `pileup` command, which has been deprecated by `mpileup` since +version 0.1.10, will be dropped in the next release. The old `pileup` command is substandard and causing a lot of confusion. (0.1.16: 21 April 2011, r963:234) @@ -1851,27 +1851,27 @@ is substandard and causing a lot of confusion. Beta Release 0.1.15 (10 April, 2011) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------ Notable changes: * Allow to perform variant calling or to extract information in multiple - regions specified by a BED file (`samtools mpileup -l', `samtools view -L' - and `bcftools view -l'). + regions specified by a BED file (`samtools mpileup -l`, `samtools view -L` + and `bcftools view -l`). - * Added the `depth' command to samtools to compute the per-base depth with a - simpler interface. File `bam2depth.c', which implements this command, is the + * Added the `depth` command to samtools to compute the per-base depth with a + simpler interface. File `bam2depth.c`, which implements this command, is the recommended example on how to use the mpileup APIs. * Estimate genotype frequencies with ML; perform chi^2 based Hardy-Weinberg test using this estimate. - * For `samtools view', when `-R' is specified, drop read groups in the header + * For `samtools view`, when `-R` is specified, drop read groups in the header that are not contained in the specified file. - * For `samtools flagstat', separate QC-pass and QC-fail reads. + * For `samtools flagstat`, separate QC-pass and QC-fail reads. - * Improved the command line help of `samtools mpileup' and `bcftools view'. + * Improved the command line help of `samtools mpileup` and `bcftools view`. * Use a global variable to control the verbose level of samtools stderr output. Nonetheless, it has not been full utilized. @@ -1884,7 +1884,7 @@ Notable changes: Beta release 0.1.14 (21 March, 2011) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------ This release implements a method for testing associations for case-control data. The method does not call genotypes but instead sums over all genotype @@ -1898,48 +1898,48 @@ ploidy of each sample (see also manual bcftools/bcftools.1). Other notable changes: - * Added `bcftools view -F' to parse BCF files generated by samtools r921 or + * Added `bcftools view -F` to parse BCF files generated by samtools r921 or older which encodes PL in a different way. - * Changed the behavior of `bcftools view -s'. Now when a list of samples is + * Changed the behavior of `bcftools view -s`. Now when a list of samples is provided, the samples in the output will be reordered to match the ordering in the sample list. This change is mainly designed for association test. - * Sped up `bcftools view -v' for target sequencing given thousands of samples. - Also added a new option `view -d' to skip loci where only a few samples are + * Sped up `bcftools view -v` for target sequencing given thousands of samples. + Also added a new option `view -d` to skip loci where only a few samples are covered by reads. * Dropped HWE test. This feature has never been implemented properly. An EM should be much better. To be implemented in future. - * Added the `cat' command to samtools. This command concatenate BAMs with + * Added the `cat` command to samtools. This command concatenate BAMs with identical sequence dictionaries in an efficient way. Modified from bam_cat.c written by Chris Saunders. - * Added `samtools view -1' to write BAMs at a low compression level but twice - faster to create. The `sort' command generates temporary files at a low + * Added `samtools view -1` to write BAMs at a low compression level but twice + faster to create. The `sort` command generates temporary files at a low compression level as well. - * Added `samtools mpileup -6' to accept "BAM" with Illumina 1.3+ quality + * Added `samtools mpileup -6` to accept "BAM" with Illumina 1.3+ quality strings (strictly speaking, such a file is not BAM). - * Added `samtools mpileup -L' to skip INDEL calling in regions with + * Added `samtools mpileup -L` to skip INDEL calling in regions with excessively high coverage. Such regions dramatically slow down mpileup. - * Updated `misc/export2sam.pl', provided by Chris Saunders from Illumina Inc. + * Updated `misc/export2sam.pl`, provided by Chris Saunders from Illumina Inc. (0.1.14: 21 March 2011, r933:170) Beta release 0.1.13 (1 March, 2011) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +----------------------------------- The most important though largely invisible modification is the change of the order of genotypes in the PL VCF/BCF tag. This is to conform the upcoming VCF spec v4.1. The change means that 0.1.13 is not backward compatible with VCF/BCF generated by samtools older than r921 inclusive. VCF/BCF generated by the new -samtools will contain a line `##fileformat=VCFv4.1' as well as the samtools +samtools will contain a line `##fileformat=VCFv4.1` as well as the samtools version number. Single Individual Haplotyping (SIH) is added as an experimental feature. It @@ -1957,17 +1957,17 @@ Other notable changes in samtools: * Improved sorting order checking in indexing. Now indexing is the preferred way to check if a BAM is sorted. - * Added a switch `-E' to mpileup and calmd. This option uses an alternative way + * Added a switch `-E` to mpileup and calmd. This option uses an alternative way to apply BAQ, which increases sensistivity, especially to MNPs, at the cost of a little loss in specificity. - * Added `mpileup -A' to allow to use reads in anomalous pairs in SNP calling. + * Added `mpileup -A` to allow to use reads in anomalous pairs in SNP calling. - * Added `mpileup -m' to allow fine control of the collection of INDEL candidates. + * Added `mpileup -m` to allow fine control of the collection of INDEL candidates. - * Added `mpileup -S' to compute per-sample strand bias P-value. + * Added `mpileup -S` to compute per-sample strand bias P-value. - * Added `mpileup -G' to exclude read groups in variant calling. + * Added `mpileup -G` to exclude read groups in variant calling. * Fixed segfault in indel calling related to unmapped and refskip reads. @@ -1978,7 +1978,7 @@ Other notable changes in samtools: * Fixed a very rare memory issue in bam_md.c - * Fixed an out-of-boundary bug in mpileup when the read base is `N'. + * Fixed an out-of-boundary bug in mpileup when the read base is `N`. * Fixed a compiling error when the knetfile library is not used. Fixed a library compiling error due to the lack of bam_nt16_nt4_table[] table. @@ -1989,9 +1989,9 @@ Other notable changes in bcftools: * Updated the BCF spec. - * Added the `FQ' VCF INFO field, which gives the phred-scaled probability + * Added the `FQ` VCF INFO field, which gives the phred-scaled probability of all samples being the same (identical to the reference or all homozygous - variants). Option `view -f' has been dropped. + variants). Option `view -f` has been dropped. * Implemented "vcfutils.pl vcf2fq" to generate a consensus sequence similar to "samtools.pl pileup2fq". @@ -2001,7 +2001,7 @@ Other notable changes in bcftools: * Output bcftools specific INFO and FORMAT in the VCF header. - * Added `view -s' to call variants from a subset of samples. + * Added `view -s` to call variants from a subset of samples. * Properly convert VCF to BCF with a user provided sequence dictionary. Nonetheless, custom fields are still unparsed and will be stored as a missing value. @@ -2014,7 +2014,7 @@ Other notable changes in bcftools: Beta release 0.1.12a (2 December, 2010) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +--------------------------------------- This is another bug fix release: @@ -2040,7 +2040,7 @@ This is another bug fix release: Beta release 0.1.11 (21 November, 2010) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +--------------------------------------- This is mainly a bug fix release: @@ -2061,8 +2061,8 @@ One minor feature has been implemented in bcftools: result in the new mode, although the reference sequence has to be used in realignment when SAMtools computes genotype likelihoods. -In addition, since 0.1.10, the `pileup' command has been deprecated by -`mpileup' which is more powerful and more accurate. The `pileup' command +In addition, since 0.1.10, the `pileup` command has been deprecated by +`mpileup` which is more powerful and more accurate. The `pileup` command will not be removed in the next few releases, but new features will not be added. @@ -2071,7 +2071,7 @@ be added. Beta Release 0.1.10 (16 November, 2010) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +--------------------------------------- This release is featured as the first major improvement to the indel caller. The method is similar to the old one implemented in the pileup @@ -2106,7 +2106,7 @@ Other notable changes: Beta Release 0.1.9 (27 October, 2010) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------- This release is featured as the first major improvement to the samtools' SNP caller. It comes with a revised MAQ error model, the support of @@ -2155,36 +2155,36 @@ In addition to the three major improvements, other notable changes are: Beta Release 0.1.8 (11 July, 2010) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +---------------------------------- Notable functional changes: - * Added the `reheader' command which replaces a BAM header with a new + * Added the `reheader` command which replaces a BAM header with a new header. This command is much faster than replacing header by BAM->SAM->BAM conversions. - * Added the `mpileup' command which computes the pileup of multiple + * Added the `mpileup` command which computes the pileup of multiple alignments. - * The `index' command now stores the number of mapped and unmapped + * The `index` command now stores the number of mapped and unmapped reads in the index file. This information can be retrieved quickly by - the new `idxstats' command. + the new `idxstats` command. * By default, pileup used the SOAPsnp model for SNP calling. This avoids the floating overflow in the MAQ model which leads to spurious calls in repetitive regions, although these calls will be immediately filtered by varFilter. - * The `tview' command now correctly handles CIGARs like 7I10M and + * The `tview` command now correctly handles CIGARs like 7I10M and 10M1P1I10M which cause assertion failure in earlier versions. - * Tview accepts a region like `=10,000' where `=' stands for the + * Tview accepts a region like `=10,000` where `=` stands for the current sequence name. This saves typing for long sequence names. - * Added the `-d' option to `pileup' which avoids slow indel calling + * Added the `-d` option to `pileup` which avoids slow indel calling in ultradeep regions by subsampling reads locally. - * Added the `-R' option to `view' which retrieves alignments in read + * Added the `-R` option to `view` which retrieves alignments in read groups listed in the specified file. Performance improvements: @@ -2207,9 +2207,9 @@ Bug fixes: * Fixed another issue in the indel caller which may lead to incorrect genotype. - * Fixed a bug in `sort' when option `-o' is applied. + * Fixed a bug in `sort` when option `-o` is applied. - * Fixed a bug in `view -r'. + * Fixed a bug in `view -r`. APIs and other changes: @@ -2234,7 +2234,7 @@ Changes in other utilities: Beta Release 0.1.7 (10 November, 2009) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-------------------------------------- Notable changes: @@ -2274,7 +2274,7 @@ Changes in other utilities: Beta Release 0.1.6 (2 September, 2009) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-------------------------------------- Notable changes: @@ -2294,17 +2294,17 @@ Notable changes: * The view command now recognizes and optionally prints FLAG in HEXs or strings to make a SAM file more friendly to human eyes. This is a samtools-C extension, not implemented in Picard for the time - being. Please type `samtools view -?' for more information. + being. Please type `samtools view -?` for more information. * BAM files now have an end-of-file (EOF) marker to facilitate truncation detection. A warning will be given if an on-disk BAM file does not have this marker. The warning will be seen on BAM files generated by an older version of samtools. It does NO harm. - * New key bindings in tview: `r' to show read names and `s' to show + * New key bindings in tview: 'r' to show read names and 's' to show reference skip (N operation) as deletions. - * Fixed a bug in `samtools merge -n'. + * Fixed a bug in `samtools merge -n`. * Samtools merge now optionally copies the header of a user specified SAM file to the resultant BAM output. @@ -2325,7 +2325,7 @@ Changes in other utilities: Beta Release 0.1.5 (7 July, 2009) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +--------------------------------- Notable changes: @@ -2361,7 +2361,7 @@ Notable changes: option is preferred when the output is piped to other commands. * In view, added '-l' and '-r' to get the alignments for one library or - read group. The "@RG" header lines are now partially parsed. + read group. The `@RG` header lines are now partially parsed. * Do not include command line utilities to libbam.a. @@ -2386,7 +2386,7 @@ bugs. Beta Release 0.1.4 (21 May, 2009) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +--------------------------------- Notable changes: @@ -2400,7 +2400,7 @@ Notable changes: * Unified the interface to BAM and SAM I/O. This is done by implementing a wrapper on top of the old APIs and therefore old APIs - are still valid. The new I/O APIs also recognize the @SQ header + are still valid. The new I/O APIs also recognize the `@SQ` header lines. * Generate the MD tag. @@ -2412,7 +2412,7 @@ Notable changes: * Implemented the GNU building system. However, currently the building system does not generate libbam.a. We will improve this later. For - the time being, `make -f Makefile.generic' is preferred. + the time being, `make -f Makefile.generic` is preferred. * Fixed a minor bug in pileup: the first read in a chromosome may be skipped. @@ -2427,7 +2427,7 @@ Notable changes: Beta Release 0.1.3 (15 April, 2009) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +----------------------------------- Notable changes in SAMtools: @@ -2452,7 +2452,7 @@ Notable changes in SAMtools: * Fixed a minor bug in indexing: the bin number of an unmapped read is wrongly calculated. - * Added `flagstat' command to show statistics on the FLAG field. + * Added `flagstat` command to show statistics on the FLAG field. * Improved indel caller by setting the maximum window size in local realignment. @@ -2474,7 +2474,7 @@ Changes in other utilities: Beta Release 0.1.2 (28 January, 2008) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------- Notable changes in SAMtools: @@ -2543,8 +2543,8 @@ Changes in other utilities: Beta Release 0.1.1 (22 December, 2008) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-------------------------------------- The is the first public release of samtools. For more information, -please check the manual page `samtools.1' and the samtools website +please check the manual page `samtools.1` and the samtools website http://samtools.sourceforge.net From 1f96fa8396b5ee88b1ece39826e2432f2e0be177 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 7 Dec 2023 11:21:25 +0000 Subject: [PATCH 24/26] Make samtools mpileup -aa work on empty files. (#1939) --- bam_plcmd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bam_plcmd.c b/bam_plcmd.c index 264a7f5a0..c61d4b681 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -570,9 +570,11 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) int ret; int last_tid = -1; hts_pos_t last_pos = -1; + int one_seq = 0; // begin pileup while ( (ret=bam_mplp64_auto(iter, &tid, &pos, n_plp, plp)) > 0) { + one_seq = 1; // at least 1 output if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested mplp_get_ref(data[0], tid, &ref, &ref_len); //printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref); @@ -811,8 +813,11 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) last_tid = tid0; last_pos = beg0-1; mplp_get_ref(data[0], tid0, &ref, &ref_len); + } else if (last_tid < 0 && !one_seq && conf->all > 1) { + last_tid = 0; // --aa on a blank file } - while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) { + while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) { + mplp_get_ref(data[0], last_tid, &ref, &ref_len); while (++last_pos < sam_hdr_tid2len(h, last_tid)) { if (last_pos >= end0) break; if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) From 413d8fce1e5e6e3f010d9c0ef4521a0cb36665f6 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Mon, 11 Dec 2023 17:15:00 +0000 Subject: [PATCH 25/26] Speed up optical duplicate marking. (PR #1952) Checking optical duplicates in very deep data was very slow. By adding more sort criteria we can reduce the total amount of work that needs to be done and so speed up the processing. --- bam_markdup.c | 173 ++++++++++++++++++++++++++++---------------------- 1 file changed, 97 insertions(+), 76 deletions(-) diff --git a/bam_markdup.c b/bam_markdup.c index 677a47f82..17fedd58d 100644 --- a/bam_markdup.c +++ b/bam_markdup.c @@ -122,6 +122,7 @@ typedef struct { int opt; int beg; int end; + int len; } check_t; typedef struct { @@ -875,7 +876,10 @@ static int is_optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, lon return ret; } - if (strncmp(original + o_beg, duplicate + d_beg, o_end - o_beg) == 0) { + int o_len = o_end - o_beg; + int d_len = d_end - d_beg; + + if ((o_len == d_len) && memcmp(original + o_beg, duplicate + d_beg, o_len) == 0) { long xdiff, ydiff; if (ox > dx) { @@ -918,7 +922,10 @@ static int optical_duplicate_partial(md_param_t *param, const char *name, const return ret; } - if (strncmp(name + o_beg, duplicate + d_beg, o_end - o_beg) == 0) { + int o_len = o_end - o_beg; + int d_len = d_end - d_beg; + + if ((o_len == d_len) && memcmp(name + o_beg, duplicate + d_beg, o_len) == 0) { // the initial parts match, look at the numbers long xdiff, ydiff; @@ -945,6 +952,7 @@ static int optical_duplicate_partial(md_param_t *param, const char *name, const c->y = dy; c->beg = d_beg; c->end = d_end; + c->len = d_end - d_beg; return ret; } @@ -1156,9 +1164,15 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * } -static int xcoord_sort(const void *a, const void *b) { +static int chain_sort(const void *a, const void *b) { check_t *ac = (check_t *) a; check_t *bc = (check_t *) b; + int ret; + + if ((ret = ac->len - bc->len)) + return ret; + else if ((ret = memcmp(bam_get_qname(ac->b) + ac->beg, bam_get_qname(bc->b) + bc->beg, ac->len))) + return ret; return (ac->x - bc->x); } @@ -1170,106 +1184,113 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has int ret = 0; size_t curr = 0; - qsort(list->c, list->length, sizeof(list->c[0]), xcoord_sort); + qsort(list->c, list->length, sizeof(list->c[0]), chain_sort); while (curr < list->length - 1) { - check_t *current = &list->c[curr]; - size_t count = curr; - char *cur_name = bam_get_qname(current->b); - int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); + check_t *base = &list->c[curr]; + char *base_name = bam_get_qname(base->b); + int end_name_match = curr; - while (++count < list->length && (list->c[count].x - current->x <= param->opt_dist)) { - // while close enough along the x coordinate - check_t *chk = &list->c[count]; + // find the end of the matching name parts + while (++end_name_match < list->length) { + check_t *chk = &list->c[end_name_match]; - if (current->opt && chk->opt) - continue; + if ((base->len == chk->len) && memcmp(base_name + base->beg, bam_get_qname(chk->b) + chk->beg, base->len) != 0) + break; + } - // if both are already optical duplicates there is no need to check again, otherwise... + while (curr < end_name_match) { + size_t count = curr; + check_t *current = &list->c[curr]; + int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); - long ydiff; + while (++count < end_name_match && (list->c[count].x - current->x <= param->opt_dist)) { + // while close enough along the x coordinate + check_t *chk = &list->c[count]; - if (current->y > chk->y) { - ydiff = current->y - chk->y; - } else { - ydiff = chk->y - current->y; - } + if (current->opt && chk->opt) + continue; - if (ydiff > param->opt_dist) - continue; + long ydiff; - // the number are right, check the names - if (strncmp(cur_name + current->beg, bam_get_qname(chk->b) + chk->beg, current->end - current->beg) != 0) - continue; + if (current->y > chk->y) { + ydiff = current->y - chk->y; + } else { + ydiff = chk->y - current->y; + } - // optical duplicates - int chk_dup = 0; - int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP); + if (ydiff > param->opt_dist) + continue; - if (current_paired != chk_paired) { - if (!chk_paired) { - // chk is single vs pair, this is a dup. - chk_dup = 1; - } - } else { - // do it by scores - int64_t cur_score, chk_score; + // optical duplicates + int chk_dup = 0; + int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP); - if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) { - if (current->b->core.flag & BAM_FQCFAIL) { - cur_score = 0; - chk_score = 1; - } else { - cur_score = 1; - chk_score = 0; + if (current_paired != chk_paired) { + if (!chk_paired) { + // chk is single vs pair, this is a dup. + chk_dup = 1; } } else { - cur_score = current->score; - chk_score = chk->score; + // do it by scores + int64_t cur_score, chk_score; - if (current_paired) { - // they are pairs so add mate scores. - chk_score += chk->mate_score; - cur_score += current->mate_score; + if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) { + if (current->b->core.flag & BAM_FQCFAIL) { + cur_score = 0; + chk_score = 1; + } else { + cur_score = 1; + chk_score = 0; + } + } else { + cur_score = current->score; + chk_score = chk->score; + + if (current_paired) { + // they are pairs so add mate scores. + chk_score += chk->mate_score; + cur_score += current->mate_score; + } } - } - if (cur_score == chk_score) { - if (strcmp(bam_get_qname(chk->b), cur_name) < 0) { - chk_score++; - } else { - chk_score--; + if (cur_score == chk_score) { + if (strcmp(bam_get_qname(chk->b), bam_get_qname(current->b)) < 0) { + chk_score++; + } else { + chk_score--; + } } - } - if (cur_score > chk_score) { - chk_dup = 1; + if (cur_score > chk_score) { + chk_dup = 1; + } } - } - if (chk_dup) { - // the duplicate is the optical duplicate - if (!chk->opt) { // only change if not already an optical duplicate - if (optical_retag(param, dup_hash, chk->b, chk_paired, stats)) { - ret = -1; - goto fail; - } + if (chk_dup) { + // the duplicate is the optical duplicate + if (!chk->opt) { // only change if not already an optical duplicate + if (optical_retag(param, dup_hash, chk->b, chk_paired, stats)) { + ret = -1; + goto fail; + } - chk->opt = 1; - } - } else { - if (!current->opt) { - if (optical_retag(param, dup_hash, current->b, current_paired, stats)) { - ret = -1; - goto fail; + chk->opt = 1; } + } else { + if (!current->opt) { + if (optical_retag(param, dup_hash, current->b, current_paired, stats)) { + ret = -1; + goto fail; + } - current->opt = 1; + current->opt = 1; + } } } - } - curr++; + curr++; + } } fail: From d3e2c29ffd8ceea5f4937cb6ffe4efc1d4e8a0c2 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 11 Dec 2023 17:53:54 +0000 Subject: [PATCH 26/26] Add latest NEWS for 1.19 --- NEWS.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1527fbb00..8ba86b113 100644 --- a/NEWS.md +++ b/NEWS.md @@ -17,8 +17,7 @@ New work and changes: read not matching the specified files. (PR #1896, fixes #1895. Suggested by Feng Tian) -* Cope with Htslib's change to no longer close stdout on is hts_close is - used with the output file being stdout. +* Cope with Htslib's change to no longer close stdout on hts_close. Htslib companion PR is samtools/htslib#1665. (PR #1909. Thanks to John Marshall) @@ -33,7 +32,7 @@ New work and changes: (PR #1222, PR #1933, fixes #1758. Thanks to Valeriu Ohan, suggested by Scott Norton) -Bug Fixes +Bug Fixes: * Samtools stats: empty barcode tags are now treated as having no barcode. (PR #1929, fixes #1926. Reported by Jukka Matilainen) @@ -43,8 +42,15 @@ Bug Fixes non-seekable such as stdin or a pipe. (PR #1930, fixes #1731. Reported by Julian Hess) +* Samtools mpileup -aa (absolutely all positions) now produces an output + even when given an empty input file. + (PR #1939. Reported by Chang Y) + +* Samtools markdup: speed up optical duplicate tagging on regions with very + deep data. + (PR #1952) -Documentation +Documentation: * Samtools mpileup: add more usage examples to the man page. (PR #1913, fixes #1801)