From 88bb16a3eb77f73597b207425b8a9372ec052a42 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Mon, 14 Aug 2023 12:08:43 +0100
Subject: [PATCH 01/26] Add view -N/-R ^FILE for FILE negation. (#1896)

These options output only the reads with read names (-N) or read
groups (-R) listed in FILE.  Copying the syntax used in various
bcftools commands, now ^FILE negates this so it only outputs reads
with their name/group not listed in the file.

The third exclusion-by-hash in the code is tag types, but it's less
likely we'll be using files of tag names as these tend to be more
prescribed than the generated names and read-groups.  Hence negation
isn't supported there.

Fixes #1895
---
 doc/samtools-view.1 |  8 ++++++++
 sam_view.c          | 37 ++++++++++++++++++++++++++++++-------
 2 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/doc/samtools-view.1 b/doc/samtools-view.1
index f38d29d23..2e34f912f 100644
--- a/doc/samtools-view.1
+++ b/doc/samtools-view.1
@@ -292,6 +292,10 @@ or
 .BI "-N " FILE ", --qname-file " FILE
 Output only alignments with read names listed in
 .IR FILE .
+If \fIFILE\fR starts with \fB^\fR then the operation is negated and
+only outputs alignment with read groups not listed in \fIFILE\fR.
+It is not permissible to mix both the filter-in and filter-out style
+syntax in the same command.
 .TP
 .BI "-r " STR ", --read-group " STR
 Output alignments in read group
@@ -306,6 +310,10 @@ This behaviour may change in a future release.
 Output alignments in read groups listed in
 .I FILE
 [null].
+If \fIFILE\fR starts with \fB^\fR then the operation is negated and
+only outputs alignment with read names not listed in \fIFILE\fR.
+It is not permissible to mix both the filter-in and filter-out style
+syntax in the same command.
 Note that records with no
 .B RG
 tag will also be output when using this option.
diff --git a/sam_view.c b/sam_view.c
index aa5b92310..2a1439acb 100644
--- a/sam_view.c
+++ b/sam_view.c
@@ -54,6 +54,8 @@ typedef struct samview_settings {
     strhash_t rnhash;
     strhash_t tvhash;
     int min_mapQ;
+    int rghash_discard; // 0 keep, 1 discard
+    int rnhash_discard; // 0 keep, 1 discard
 
     // Described here in the same terms as the usage statement.
     // The code however always negates to "reject if"         keep if:
@@ -176,7 +178,8 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin
         uint8_t *s = bam_aux_get(b, "RG");
         if (s) {
             khint_t k = kh_get(str, settings->rghash, (char*)(s + 1));
-            if (k == kh_end(settings->rghash)) return 1;
+            if ((k == kh_end(settings->rghash)) != settings->rghash_discard)
+                return 1;
         }
     }
     if (settings->tag) {
@@ -204,9 +207,11 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin
     }
     if (settings->rnhash) {
         const char* rn = bam_get_qname(b);
-        if (!rn || kh_get(str, settings->rnhash, rn) == kh_end(settings->rnhash)) {
+        strhash_t h = settings->rnhash;
+        if (!rn && !settings->rnhash_discard)
+            return 1;
+        if ((kh_get(str, h, rn) == kh_end(h)) != settings->rnhash_discard)
             return 1;
-        }
     }
     if (settings->library) {
         const char *p = bam_get_library((sam_hdr_t*)h, b);
@@ -305,7 +310,12 @@ static int add_read_group_single(const char *subcmd, samview_settings_t *setting
     if (settings->rghash == NULL) {
         settings->rghash = kh_init(str);
         if (settings->rghash == NULL) goto err;
+    } else if (settings->rghash_discard == 1) {
+        print_error("view", "cannot mix include and exclude read-group files in the same command line");
+        free(d);
+        return -1;
     }
+    settings->rghash_discard = 0;
 
     kh_put(str, settings->rghash, d, &ret);
     if (ret == -1) goto err;
@@ -326,8 +336,14 @@ static int add_read_names_file(const char *subcmd, samview_settings_t *settings,
             perror(NULL);
             return -1;
         }
+    } else if ((settings->rnhash_discard == 0 && *fn == '^') ||
+        (settings->rnhash_discard == 1 && *fn != '^')) {
+        print_error("view", "cannot mix include and exclude read-name files in the same command line");
+        return -1;
     }
-    return populate_lookup_from_file(subcmd, settings->rnhash, fn);
+    settings->rnhash_discard = (*fn == '^');
+    return populate_lookup_from_file(subcmd, settings->rnhash,
+                                     fn + (*fn == '^'));
 }
 
 static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn)
@@ -338,8 +354,14 @@ static int add_read_groups_file(const char *subcmd, samview_settings_t *settings
             perror(NULL);
             return -1;
         }
+    } else if ((settings->rghash_discard == 0 && *fn == '^') ||
+        (settings->rghash_discard == 1 && *fn != '^')) {
+        print_error("view", "cannot mix include and exclude read-group files in the same command line");
+        return -1;
     }
-    return populate_lookup_from_file(subcmd, settings->rghash, fn);
+    settings->rghash_discard = (*fn == '^');
+    return populate_lookup_from_file(subcmd, settings->rghash,
+                                     fn + (*fn == '^'));
 }
 
 static int add_tag_value_single(const char *subcmd, samview_settings_t *settings, char *name)
@@ -1458,9 +1480,10 @@ static int usage(FILE *fp, int exit_status, int is_long_help)
 "\n"
 "Filtering options (Only include in output reads that...):\n"
 "  -L, --target[s]-file FILE  ...overlap (BED) regions in FILE\n"
+"  -N, --qname-file [^]FILE   ...whose read name is listed in FILE (\"^\" negates)\n"
 "  -r, --read-group STR       ...are in read group STR\n"
-"  -R, --read-group-file FILE ...are in a read group listed in FILE\n"
-"  -N, --qname-file FILE      ...whose read name is listed in FILE\n"
+"  -R, --read-group-file [^]FILE\n"
+"                             ...are in a read group listed in FILE\n"
 "  -d, --tag STR1[:STR2]      ...have a tag STR1 (with associated value STR2)\n"
 "  -D, --tag-file STR:FILE    ...have a tag STR whose value is listed in FILE\n"
 "  -q, --min-MQ INT           ...have mapping quality >= INT\n"

From 3aa477829a34ea664b5e4e0b544cc5cfe7aa09d0 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Mon, 7 Aug 2023 11:50:02 +0100
Subject: [PATCH 02/26] Add samtools sort -N / samtools merge -N option.

These handle an alternative form of read-name sort order that is
purely ASCII instead of the "natural" alphanumeric sort.

Natural sort order will place "a12" after "a3", as it computes 12 > 3
rather than "1" < "3" (ASCII).  However it also means hexadecimal
values such as "a12b" > "a3bc", and "a09b" > "a7bc".  Typically any
read name generation system using leading fixed-width columns with
leading zeros to pad is going to be safer using traditional ASCII sort
orders.

Note while writing this my initial implementation was to add
QueryNameASCII to the SamOrder typedef, but it turns out the sheer
level of code duplication going on here means doing that yielding 81
changed lines, vs 15 for this.  I don't like using a global variable,
but it's already using one due to the lacklustre ksort API which can't
pass in a client state argument. (Being poorly modelled on qsort I
assume.)  Hence a sibling global is a trivial change and less error
prone.

Fixes #1500
---
 bam_sort.c                        | 19 ++++++++---
 doc/samtools-merge.1              | 22 ++++++++++---
 doc/samtools-sort.1               | 52 ++++++++++++++++++++++---------
 test/sort/name.sort.expected.sam  |  2 +-
 test/sort/name2.sort.expected.sam | 24 ++++++++++++++
 test/test.pl                      |  1 +
 6 files changed, 97 insertions(+), 23 deletions(-)
 create mode 100644 test/sort/name2.sort.expected.sam

diff --git a/bam_sort.c b/bam_sort.c
index b44bd665d..d0163394c 100644
--- a/bam_sort.c
+++ b/bam_sort.c
@@ -164,11 +164,15 @@ static template_coordinate_key_t* template_coordinate_key(bam1_t *b, template_co
 
 typedef enum {Coordinate, QueryName, TagCoordinate, TagQueryName, MinHash, TemplateCoordinate} SamOrder;
 static SamOrder g_sam_order = Coordinate;
+static int natural_sort = 1; // not ASCII, but alphanumeric: a12b > a7b
 static char g_sort_tag[2] = {0,0};
 
 #define is_digit(c) ((c)<='9' && (c)>='0')
 static int strnum_cmp(const char *_a, const char *_b)
 {
+    if (!natural_sort)
+        return strcmp(_a,_b);
+
     const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b;
     const unsigned char *pa = a, *pb = b;
     while (*pa && *pb) {
@@ -1535,7 +1539,8 @@ static void merge_usage(FILE *to)
 "   or: samtools merge [options] <out.bam> <in1.bam> ... <inN.bam>\n"
 "\n"
 "Options:\n"
-"  -n         Input files are sorted by read name\n"
+"  -n         Input files are sorted by read name (natural)\n"
+"  -N         Input files are sorted by read name (ASCII)\n"
 "  -t TAG     Input files are sorted by TAG value\n"
 "  -r         Attach RG tag (inferred from file names)\n"
 "  -u         Uncompressed BAM output\n"
@@ -1581,11 +1586,12 @@ int bam_merge(int argc, char *argv[])
         return 0;
     }
 
-    while ((c = getopt_long(argc, argv, "h:nru1R:o:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "h:nNru1R:o:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) {
         switch (c) {
         case 'r': flag |= MERGE_RG; break;
         case 'f': flag |= MERGE_FORCE; break;
         case 'h': fn_headers = optarg; break;
+        case 'N': natural_sort = 0; // fall through
         case 'n': sam_order = QueryName; break;
         case 'o': fnout = optarg; break;
         case 't': sort_tag = optarg; break;
@@ -3287,6 +3293,9 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
             break;
         case QueryName:
             new_so = "queryname";
+            new_ss = natural_sort
+                ? "queryname:natural"
+                : "queryname:lexicographical";
             break;
         case MinHash:
             new_so = "coordinate";
@@ -3605,7 +3614,8 @@ static void sort_usage(FILE *fp)
 "  -I FILE    Order minimisers by their position in FILE FASTA\n"
 "  -w INT     Window size for minimiser indexing via -I ref.fa [100]\n"
 "  -H         Squash homopolymers when computing minimiser\n"
-"  -n         Sort by read name (not compatible with samtools index command)\n"
+"  -n         Sort by read name (natural): cannot be used with samtools index\n"
+"  -N         Sort by read name (ASCII): cannot be used with samtools index\n"
 "  -t TAG     Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n"
 "  -o FILE    Write final output to FILE rather than standard output\n"
 "  -T PREFIX  Write temporary files to PREFIX.nnnn.bam\n"
@@ -3658,9 +3668,10 @@ int bam_sort(int argc, char *argv[])
         { NULL, 0, NULL, 0 }
     };
 
-    while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MI:K:uRw:H", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "l:m:nNo:O:T:@:t:MI:K:uRw:H", lopts, NULL)) >= 0) {
         switch (c) {
         case 'o': fnout = optarg; o_seen = 1; break;
+        case 'N': natural_sort = 0; // fall through
         case 'n': sam_order = QueryName; break;
         case 't': by_tag = true; sort_tag = optarg; break;
         case 'm': {
diff --git a/doc/samtools-merge.1 b/doc/samtools-merge.1
index 342313010..2978afc63 100644
--- a/doc/samtools-merge.1
+++ b/doc/samtools-merge.1
@@ -83,8 +83,10 @@ of existing IDs in the output header will have a suffix appended to them to diff
 records from other files and the read records will be updated to reflect this.
 
 The ordering of the records in the input files must match the usage of the
-\fB-n\fP and \fB-t\fP command-line options.  If they do not, the output
-order will be undefined.  See
+\fB-n\fP, \fB-N\fP and \fB-t\fP command-line options.  If they do not,
+the output order will be undefined.  Note this also extends to
+disallowing mixing of "queryname" files with a combination of natural
+and lexicographical sort orders.  See
 .B sort
 for information about record ordering.
 
@@ -122,8 +124,20 @@ is actually in SAM format, though any alignment records it may contain
 are ignored.)
 .TP
 .B -n
-The input alignments are sorted by read names rather than by chromosomal
-coordinates
+The input alignments are sorted by read names using an alpha-numeric
+ordering, rather than by chromosomal coordinates.
+The alpha-numeric or \*(lqnatural\*(rq sort order detects runs of digits in the
+strings and sorts these numerically.  Hence "a7b" appears before "a12b".
+Note this is not suitable where hexadecimal values are in use.
+.TP
+.B -N
+The input alignments are sorted by read names using a lexicographical
+ordering, rather than by chromosomal coordinates.  Unlike \fB-n\fR no
+detection of numeric components is used, instead relying purely on the
+ASCII value of each character.  Hence "x12" comes before "x7" as "1"
+is before "7" in ASCII.  This is a more appropriate name sort order
+where all digits in names are already zero-padded and/or hexadecimal
+values are being used.
 .TP
 .BI -o \ FILE
 Write merged output to
diff --git a/doc/samtools-sort.1 b/doc/samtools-sort.1
index 4e9be1976..7b25b1387 100644
--- a/doc/samtools-sort.1
+++ b/doc/samtools-sort.1
@@ -48,11 +48,12 @@ samtools sort
 
 .SH DESCRIPTION
 .PP
-Sort alignments by leftmost coordinates, by read name when \fB-n\fR
-is used, by tag contents with \fB-t\fR, or a minimiser-based collation
-order with \fB-M\fR.  An appropriate
-.B @HD-SO
-sort order header tag will be added or an existing one updated if necessary.
+Sort alignments by leftmost coordinates, by read name when \fB-n\fR or
+\fB-N\fR are used, by tag contents with \fB-t\fR, or a minimiser-based
+collation order with \fB-M\fR.  An appropriate \fB@HD SO\fR
+sort order header tag will be added or an existing one updated if
+necessary, along with the \fB@HD SS\fR sub-sort header tag where
+appropriate.
 
 The sorted output is written to standard output by default, or to the
 specified file
@@ -74,7 +75,8 @@ instead if you need name collated data without a full lexicographical sort.
 Note that if the sorted output file is to be indexed with
 .BR "samtools index" ,
 the default coordinate sort must be used.
-Thus the \fB-n\fR, \fB-t\fR and \fB-M\fR options are incompatible with
+Thus the \fB-n\fR, \fB-N\fR, \fB-t\fR and \fB-M\fR options are
+incompatible with
 .BR "samtools index" .
 
 When sorting by minimisier (\fB-M\fR), the sort order is defined by
@@ -115,11 +117,26 @@ minimum value of 1M for this setting.
 .B -n
 Sort by read names (i.e., the
 .B QNAME
-field) rather than by chromosomal coordinates.
+field) using an alpha-numeric ordering, rather than by chromosomal coordinates.
+The alpha-numeric or \*(lqnatural\*(rq sort order detects runs of digits in the
+strings and sorts these numerically.  Hence "a7b" appears before "a12b".
+Note this is not suitable where hexadecimal values are in use.
+Sets the header sub-sort (\fB@HD SS\fR) tag to \fBqueryname:natural\fR.
+.TP
+.B -N
+Sort by read names (i.e., the
+.B QNAME
+field) using the lexicographical ordering, rather than by chromosomal
+coordinates.  Unlike \fB-n\fR no detection of numeric components is
+used, instead relying purely on the ASCII value of each character.
+Hence "x12" comes before "x7" as "1" is before "7" in ASCII.  This is
+a more appropriate name sort order where all digits in names are
+already zero-padded and/or hexadecimal values are being used.
+Sets the header sub-sort (\fB@HD SS\fR) tag to \fBqueryname:lexicographical\fR.
 .TP
 .BI "-t " TAG
 Sort first by the value in the alignment tag TAG, then by position or name (if
-also using \fB-n\fP).
+also using \fB-n\fP or \fB-N\fR).
 .TP
 .B "-M "
 Sort unmapped reads (those in chromosome "*") by their sequence
@@ -212,7 +229,8 @@ and the sub-sort (SS) is
 The following rules are used for ordering records.
 
 If option \fB-t\fP is in use, records are first sorted by the value of
-the given alignment tag, and then by position or name (if using \fB-n\fP).
+the given alignment tag, and then by position or name (if using \fB-n\fP
+or \fB-N\fP).
 For example, \*(lq-t RG\*(rq will make read group the primary sort key.  The
 rules for ordering by tag are:
 
@@ -236,11 +254,17 @@ Character tags (type A) are compared by binary character value.
 No attempt is made to compare tags of other types \(em notably type B
 array values will not be compared.
 .PP
-When the \fB-n\fP option is present, records are sorted by name.  Names are
-compared so as to give a \*(lqnatural\*(rq ordering \(em i.e. sections
-consisting of digits are compared numerically while all other sections are
-compared based on their binary representation.  This means \*(lqa1\*(rq will
-come before \*(lqb1\*(rq and \*(lqa9\*(rq will come before \*(lqa10\*(rq.
+When the \fB-n\fP or \fB-N\fP option is present, records are sorted by
+name.  Historically samtools has used a \*(lqnatural\*(rq ordering
+\(em i.e. sections consisting of digits are compared numerically while
+all other sections are compared based on their binary representation.
+This means \*(lqa1\*(rq will come before \*(lqb1\*(rq and \*(lqa9\*(rq
+will come before \*(lqa10\*(rq.  However this alpha-numeric sort can
+be confused by runs of hexadecimal digits.  The newer \fB-N\fP
+option adds a simpler lexicographical based name collation which does not
+attempt any numeric comparisons and may be more appropriate for some
+data sets.  Note care must be taken when using \fBsamtools merge\fP to
+ensure all files are using the same collation order.
 Records with the same name will be ordered according to the values of
 the READ1 and READ2 flags (see
 .BR flags ).
diff --git a/test/sort/name.sort.expected.sam b/test/sort/name.sort.expected.sam
index 6e10914af..22fca6238 100644
--- a/test/sort/name.sort.expected.sam
+++ b/test/sort/name.sort.expected.sam
@@ -1,4 +1,4 @@
-@HD	VN:1.4	SO:queryname
+@HD	VN:1.4	SO:queryname	SS:queryname:natural
 @SQ	SN:insert	LN:599
 @SQ	SN:ref1	LN:45
 @SQ	SN:ref2	LN:40
diff --git a/test/sort/name2.sort.expected.sam b/test/sort/name2.sort.expected.sam
new file mode 100644
index 000000000..340d10821
--- /dev/null
+++ b/test/sort/name2.sort.expected.sam
@@ -0,0 +1,24 @@
+@HD	VN:1.4	SO:queryname	SS:queryname:lexicographical
+@SQ	SN:insert	LN:599
+@SQ	SN:ref1	LN:45
+@SQ	SN:ref2	LN:40
+@SQ	SN:ref3	LN:4
+@PG	ID:llama
+@RG	ID:fish	PG:llama
+@RG	ID:cow	PU:13_&^&&*(:332	PG:donkey
+@RG	PU:*9u8jkjjkjd:	ID:colt
+@PG	ID:bull	PP:donkey
+@PG	ID:donkey
+@CO	Do you know?
+r005	83	ref1	37	30	9M	=	7	-39	CAGCGCCAT	*	RG:Z:colt	PG:Z:donkey
+r005	163	ref1	7	30	8M4I4M1D3M	=	37	39	TTAGATAAAGAGGATACTG	*	XX:B:S,12561,2,20,112	YY:i:100	RG:Z:colt	PG:Z:donkey
+r006	0	ref1	9	30	1S2I6M1P1I1P1I4M2I	*	0	0	AAAAGATAAGGGATAAA	*	XA:Z:abc	XB:i:-10	RG:Z:colt	PG:Z:donkey
+r006	16	ref1	29	30	6H5M	*	0	0	TAGGC	*	RG:Z:colt	PG:Z:donkey
+r007	0	ref1	9	30	5H6M	*	0	0	AGCTAA	*	RG:Z:colt	PG:Z:donkey
+r007	0	ref1	16	30	6M14N1I5M	*	0	0	ATAGCTCTCAGC	*	RG:Z:colt	PG:Z:donkey
+x10	0	ref2	10	30	25M	*	0	0	CAAATAATTAAGTCTACAGAGCAAC	?????????????????????????	RG:Z:cow	PG:Z:bull
+x11	0	ref2	12	30	24M	*	0	0	AATAATTAAGTCTACAGAGCAACT	????????????????????????	RG:Z:cow	PG:Z:bull
+x12	0	ref2	14	30	23M	*	0	0	TAATTAAGTCTACAGAGCAACTA	???????????????????????	RG:Z:cow	PG:Z:bull
+x7	0	ref2	1	30	20M	*	0	0	AGGTTTTATAAAACAAATAA	*	RG:Z:cow	PG:Z:bull
+x8	0	ref2	2	30	21M	*	0	0	GGTTTTATAAAACAAATAATT	?????????????????????	RG:Z:cow	PG:Z:bull
+x9	0	ref2	6	30	9M4I13M	*	0	0	TTATAAAACAAATAATTAAGTCTACA	??????????????????????????	RG:Z:cow	PG:Z:bull
diff --git a/test/test.pl b/test/test.pl
index 3702994d8..179390a3f 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -3213,6 +3213,7 @@ sub test_sort
 
     # Name sort
     test_cmd($opts, out=>"sort/name.sort.expected.sam", ignore_pg_header => 1, cmd=>"$$opts{bin}/samtools sort${threads} -n -m 10M $$opts{path}/dat/test_input_1_a.bam -O SAM -o -");
+    test_cmd($opts, out=>"sort/name2.sort.expected.sam", ignore_pg_header => 1, cmd=>"$$opts{bin}/samtools sort${threads} -N -m 10M $$opts{path}/dat/test_input_1_b.bam -O SAM -o -");
 
     # Tag sort (RG)
     test_cmd($opts, out=>"sort/tag.rg.sort.expected.sam", ignore_pg_header => 1, cmd=>"$$opts{bin}/samtools sort${threads} -t RG -m 10M $$opts{path}/dat/test_input_1_a.bam -O SAM -o -");

From ef784095cd3a38453c68bc0bc67305d81570eb87 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Mon, 7 Aug 2023 12:22:15 +0100
Subject: [PATCH 03/26] Bug fix samtools sort --write-index sort mode checks.

This previously had a verbatim list of all the sorts to exclude
(accidentally omitting MinHash), instead of checking only the one
sort order that works with indices.
---
 bam_sort.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bam_sort.c b/bam_sort.c
index d0163394c..d5533f5cf 100644
--- a/bam_sort.c
+++ b/bam_sort.c
@@ -3743,7 +3743,7 @@ int bam_sort(int argc, char *argv[])
         goto sort_end;
     }
 
-    if (ga.write_index && (sam_order == QueryName || sam_order == TagQueryName || sam_order == TagCoordinate || sam_order == TemplateCoordinate)) {
+    if (ga.write_index && sam_order != Coordinate) {
         fprintf(stderr, "[W::bam_sort] Ignoring --write-index as it only works for position sorted files.\n");
         ga.write_index = 0;
     }

From ed3b05fbefe6fc79a49e7e2207e6c7610f547fdb Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Wed, 9 Aug 2023 12:05:41 +0100
Subject: [PATCH 04/26] Add hclen SAM filter documentation.

Fixes the last comment in #813 (previously closed already), along with
the associated htslib PR.
---
 doc/samtools.1 | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/doc/samtools.1 b/doc/samtools.1
index be5b4aa7c..307a7c396 100644
--- a/doc/samtools.1
+++ b/doc/samtools.1
@@ -1165,6 +1165,7 @@ flag.secondary	int	Single bit, 0 or 256
 flag.qcfail	int	Single bit, 0 or 512
 flag.dup	int	Single bit, 0 or 1024
 flag.supplementary	int	Single bit, 0 or 2048
+hclen	int	Number of hard-clipped bases
 library	string	Library (LB header via RG)
 mapq	int	Mapping quality
 mpos	int	Synonym for pnext
@@ -1193,10 +1194,12 @@ equivalent to \fBflag & 1024\fR.
 "qlen" and "rlen" are measured using the CIGAR string to count the
 number of query (sequence) and reference bases consumed.  Note "qlen"
 may not exactly match the length of the "seq" field if the sequence is
-"*".  "sclen" is the number of soft-clipped bases.  When combined in
-"qlen-sclen" it can give the number of sequence bases used in the
-alignment, distinguishing between global alignment and local alignment
-length.
+"*".
+
+"sclen" and "hclen" are the number of soft and hard-clipped
+bases respectively.  The formula "qlen-sclen" gives the
+number of sequence bases used in the alignment, distinguishing between
+global alignment and local alignment length.
 
 "endpos" is the (1-based inclusive) position of the rightmost mapped base
 of the read, as measured using the CIGAR string, and for mapped reads

From 117549fc746bded7fecefcd746aadebeb45ec637 Mon Sep 17 00:00:00 2001
From: Andrew Whitwham <whitwham@users.noreply.github.com>
Date: Tue, 22 Aug 2023 18:05:21 +0100
Subject: [PATCH 05/26] Add the order of filtering to the man page. (#1907)

---
 doc/samtools-fasta.1 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/samtools-fasta.1 b/doc/samtools-fasta.1
index 7eb47e84a..fbdef34f5 100644
--- a/doc/samtools-fasta.1
+++ b/doc/samtools-fasta.1
@@ -128,6 +128,8 @@ soft-clipped CIGAR records then the soft-clipped data will be in the
 output FASTA/FASTQ.  Hard-clipped data is, by definition, absent from
 the SAM record and hence will be absent in any FASTA/FASTQ produced.
 
+The filter options order of precedence is -d, -f, -F, --rf and -G.
+
 .SH OPTIONS
 .TP 8
 .B -n

From ae19296215245937dd7ff8fee36c58d2d259f349 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Mon, 21 Aug 2023 22:30:33 +1200
Subject: [PATCH 06/26] Check fclose(stdout) at the end of main()

Since PR samtools/htslib#1665, hts_open("-", "w") / hts_close() no longer
actually closes stdout. Close it at the end of main() so there is an
opportunity to detect I/O errors in previously-uncommitted writes.

Ignore EBADF as other code may have already closed stdout, e.g., either
particular subcommands or when (dynamically) linked against an older
version of HTSlib.
---
 bamtk.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/bamtk.c b/bamtk.c
index e05ea1816..221787561 100644
--- a/bamtk.c
+++ b/bamtk.c
@@ -303,5 +303,14 @@ int main(int argc, char *argv[])
         fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
         return 1;
     }
+
+    // For subcommands that may have produced substantial output on stdout,
+    // make a final check for delayed I/O errors. Ignore EBADF as other code
+    // may have already closed stdout.
+    if (fclose(stdout) != 0 && errno != EBADF) {
+        print_error_errno(argv[1], "closing standard output failed");
+        return 1;
+    }
+
     return ret;
 }

From 5d67a96b99fbf4cd270e04975e2edd627a04c261 Mon Sep 17 00:00:00 2001
From: Pierre Lindenbaum <plindenbaum@yahoo.fr>
Date: Wed, 30 Aug 2023 16:18:59 +0200
Subject: [PATCH 07/26] new option --plot-depth for samtools coverage

---
 coverage.c              | 38 ++++++++++++++++++++++++++++----------
 doc/samtools-coverage.1 | 21 +++++++++++++++++++++
 2 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/coverage.c b/coverage.c
index dedaa8e99..f9546b799 100644
--- a/coverage.c
+++ b/coverage.c
@@ -120,6 +120,7 @@ static int usage() {
             "                          effectively removing any depth limit.\n"
             "Output options:\n"
             "  -m, --histogram         show histogram instead of tabular output\n"
+            "  -D, --plot-depth        plot depth instead of tabular output\n"
             "  -A, --ascii             show only ASCII characters in histogram\n"
             "  -o, --output FILE       write output to FILE [stdout]\n"
             "  -H, --no-header         don't print a header in tabular mode\n"
@@ -209,7 +210,7 @@ void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *s
 }
 
 void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid, const uint32_t *hist,
-        const int hist_size, const bool full_utf) {
+        const int hist_size, const bool full_utf, bool plot_coverage) {
     int i, col;
     bool show_percentiles = false;
     const int n_rows = 10;
@@ -221,7 +222,7 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in
     double hist_data[hist_size];
     double max_val = 0.0;
     for (i = 0; i < hist_size; ++i) {
-        hist_data[i] = 100 * hist[i] / (double) stats[tid].bin_width;
+        hist_data[i] = (plot_coverage?1:100) * hist[i] / (double) stats[tid].bin_width;
         if (hist_data[i] > max_val) max_val = hist_data[i];
     }
 
@@ -231,7 +232,10 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in
     double row_bin_size = max_val / (double) n_rows;
     for (i = n_rows-1; i >= 0; --i) {
         double current_bin = row_bin_size * i;
-        if (show_percentiles) {
+        if (plot_coverage) {
+            fprintf(file_out, ">%8.1f ",i*row_bin_size);
+        }
+        else if (show_percentiles) {
             fprintf(file_out, ">%3i%% ", i*10);
         } else {
             fprintf(file_out, ">%7.2f%% ", current_bin);
@@ -265,7 +269,13 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in
                             stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads); break;
             case 1: fprintf(file_out, "Histo bin width: %sbp",
                             readable_bps(stats[tid].bin_width, buf)); break;
-            case 0: fprintf(file_out, "Histo max bin:   %.5g%%", max_val); break;
+            case 0: if (plot_coverage) {
+                        fprintf(file_out, "Histo max cov:   %.5g",  max_val);
+                    } else {
+                        fprintf(file_out, "Histo max bin:   %.5g%%", max_val);
+                    }
+                    break;
+
         };
         fputc('\n', file_out);
     }
@@ -313,6 +323,7 @@ int main_coverage(int argc, char *argv[]) {
     bool opt_print_header = true;
     bool opt_print_tabular = true;
     bool opt_print_histogram = false;
+    bool opt_plot_coverage = false;
     bool opt_full_utf = true;
 
     FILE *file_out = stdout;
@@ -332,6 +343,7 @@ int main_coverage(int argc, char *argv[]) {
         {"min-bq", required_argument, NULL, 'Q'},
         {"histogram", no_argument, NULL, 'm'},
         {"ascii", no_argument, NULL, 'A'},
+        {"plot-depth", no_argument, NULL, 'D'},
         {"output", required_argument, NULL, 'o'},
         {"no-header", no_argument, NULL, 'H'},
         {"n-bins", required_argument, NULL, 'w'},
@@ -344,7 +356,7 @@ int main_coverage(int argc, char *argv[]) {
     // parse the command line
     int c;
     opterr = 0;
-    while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:", lopts, NULL)) != -1) {
+    while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:D", lopts, NULL)) != -1) {
         switch (c) {
             case 1:
                 if ((required_flags = bam_str2flag(optarg)) < 0) {
@@ -368,6 +380,10 @@ int main_coverage(int argc, char *argv[]) {
             case 'A': opt_full_utf = false;
                       opt_print_histogram = true; opt_print_tabular = false;
                       break;
+            case 'D': opt_print_histogram = true;
+                     opt_print_tabular = false;
+                     opt_plot_coverage = true;
+                     break;
             case 'H': opt_print_header = false; break;
             case 'h': return usage();
             default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
@@ -560,7 +576,7 @@ int main_coverage(int argc, char *argv[]) {
         if (tid != old_tid) { // Next target sequence
             if (old_tid >= 0) {
                 if (opt_print_histogram) {
-                    print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf);
+                    print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf, opt_plot_coverage);
                     fputc('\n', file_out);
                 } else if (opt_print_tabular) {
                     print_tabular_line(file_out, h, stats, old_tid);
@@ -604,12 +620,14 @@ int main_coverage(int argc, char *argv[]) {
                 count_base = true;
                 stats[tid].summed_coverage += depth_at_pos;
             }
-            // hist[current_bin] += depth_at_pos;  // Add counts to the histogram here to have one based on coverage
-            //fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output
+
+            if(current_bin < n_bins && opt_plot_coverage) {
+                hist[current_bin] += depth_at_pos;
+                }
         }
         if (count_base) {
             stats[tid].n_covered_bases++;
-            if (opt_print_histogram && current_bin < n_bins)
+            if (opt_print_histogram && current_bin < n_bins && !opt_plot_coverage)
                 ++(hist[current_bin]); // Histogram based on breadth of coverage
         }
     }
@@ -620,7 +638,7 @@ int main_coverage(int argc, char *argv[]) {
 
     if (tid < n_targets && tid >=0) {
         if (opt_print_histogram) {
-            print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf);
+            print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf, opt_plot_coverage);
         } else if (opt_print_tabular) {
             print_tabular_line(file_out, h, stats, tid);
         }
diff --git a/doc/samtools-coverage.1 b/doc/samtools-coverage.1
index cf71d50be..304ee30dc 100644
--- a/doc/samtools-coverage.1
+++ b/doc/samtools-coverage.1
@@ -102,6 +102,9 @@ Output options:
 .TP 8
 .BI -m,\ --histogram
 Show histogram instead of tabular output.
+.BI -D,\ --plot-depth
+Show histogram instead of tabular output but displays the depth of coverage instead of the percent of coverage.
+This option can be used to visualize copy number varaiations in the terminal..
 .TP
 .BI -A,\ --ascii
 Show only ASCII characters in histogram using colon and fullstop for
@@ -154,6 +157,24 @@ chr1 (249.25Mbp)
         1.00M     4.44M     7.87M       12.00M 
 .EE
 
+.EX 2
+samtools coverage  -m -r 'chr1:24500000-25600000' --plot-depth -w 32 -A input.bam
+
+chr1 (249.25Mbp)
+>    38.8 |            .:::::::            | Number of reads: 283218
+>    34.5 |            ::::::::            |     (3327 filtered)
+>    30.2 |           :::::::::.           | Covered bases:   1.10Mbp
+>    25.9 |.:::::.:.::::::::::::::::::::::.| Percent covered: 99.83%
+>    21.6 |::::::::::::::::::::::::::::::::| Mean coverage:   33.2x
+>    17.2 |::::::::::::::::::::::::::::::::| Mean baseQ:      37.2
+>    12.9 |::::::::::::::::::::::::::::::::| Mean mapQ:       59.3
+>     8.6 |::::::::::::::::::::::::::::::::|
+>     4.3 |::::::::::::::::::::::::::::::::| Histo bin width: 34.5Kbp
+>     0.0 |::::::::::::::::::::::::::::::::| Histo max cov:   43.117
+        24.50M    24.84M    25.19M      25.60M
+
+.EE
+
 
 .SH AUTHOR
 .PP

From 38e45176de8d8d65c3ba144b131381b55121bf16 Mon Sep 17 00:00:00 2001
From: Andrew Whitwham <aw7@sanger.ac.uk>
Date: Fri, 8 Sep 2023 14:15:09 +0100
Subject: [PATCH 08/26] Minor layout changes and added const modifier.

---
 coverage.c              | 5 ++---
 doc/samtools-coverage.1 | 5 +++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/coverage.c b/coverage.c
index f9546b799..68dd6be5e 100644
--- a/coverage.c
+++ b/coverage.c
@@ -210,7 +210,7 @@ void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *s
 }
 
 void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid, const uint32_t *hist,
-        const int hist_size, const bool full_utf, bool plot_coverage) {
+        const int hist_size, const bool full_utf, const bool plot_coverage) {
     int i, col;
     bool show_percentiles = false;
     const int n_rows = 10;
@@ -234,8 +234,7 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in
         double current_bin = row_bin_size * i;
         if (plot_coverage) {
             fprintf(file_out, ">%8.1f ",i*row_bin_size);
-        }
-        else if (show_percentiles) {
+        } else if (show_percentiles) {
             fprintf(file_out, ">%3i%% ", i*10);
         } else {
             fprintf(file_out, ">%7.2f%% ", current_bin);
diff --git a/doc/samtools-coverage.1 b/doc/samtools-coverage.1
index 304ee30dc..3da80abdd 100644
--- a/doc/samtools-coverage.1
+++ b/doc/samtools-coverage.1
@@ -102,9 +102,10 @@ Output options:
 .TP 8
 .BI -m,\ --histogram
 Show histogram instead of tabular output.
+.TP
 .BI -D,\ --plot-depth
-Show histogram instead of tabular output but displays the depth of coverage instead of the percent of coverage.
-This option can be used to visualize copy number varaiations in the terminal..
+As above but displays the depth of coverage instead of the percent of coverage.
+This option can be used to visualize copy number variations in the terminal.
 .TP
 .BI -A,\ --ascii
 Show only ASCII characters in histogram using colon and fullstop for

From 007d83e8b63737d8a08060723a41eba4a1df3f7a Mon Sep 17 00:00:00 2001
From: vasudeva8 <vasudeva.sarma@sanger.ac.uk>
Date: Tue, 12 Sep 2023 09:57:22 +0100
Subject: [PATCH 09/26] mpileup manual updated with samples

---
 doc/samtools-mpileup.1 | 42 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/doc/samtools-mpileup.1 b/doc/samtools-mpileup.1
index 35b14d4d4..2eb29ee7a 100644
--- a/doc/samtools-mpileup.1
+++ b/doc/samtools-mpileup.1
@@ -76,6 +76,7 @@ could be specified using \fB-r 20 -l chr20.bed\fR, meaning that the
 index is used to find chromosome 20 and then it is filtered for the
 regions listed in the bed file.
 
+Unmapped reads are not considered and are always discarded.
 By default secondary alignments, QC failures and duplicate reads will
 be omitted, along with low quality bases and some reads in high depth
 regions.  See the \fB--ff\fR, \fB-Q\fR and \fB-d\fR options for
@@ -283,8 +284,8 @@ none of the mask bits set.  Hence this does not override the
 .TP
 .BI --ff,\ --excl-flags \ STR|INT
 Filter flags: skip reads with any of the mask bits set.  This defaults
-to UNMAP,SECONDARY,QCFAIL,DUP.  The option is not accumulative, so
-specifying e.g. \fB--ff UNMAP,QCFAIL\fR will reenable output of
+to SECONDARY,QCFAIL,DUP.  The option is not accumulative, so
+specifying e.g. \fB--ff QCFAIL\fR will reenable output of
 secondary and duplicate alignments.  Note this does not override the
 \fB--incl-flags\fR option.
 .TP
@@ -482,7 +483,42 @@ The
 .B -E
 option can be used to make it ignore the contents of the BQ:Z tag and
 force it to recalculate the BAQ scores by making a new alignment.
-
+.PP
+.SH EXAMPLES
+Using range:
+With implicit index files in1.bam.<ext> and in2.sam.gz.<ext>,
+.EX 2
+samtools mpileup in1.bam in2.sam.gz -r chr10:100000-200000
+.EE
+With explicit index files,
+.EX 2
+samtools mpileup in1.bam in2.sam.gz idx/in1.csi idx/in2.csi -X -r chr10:100000-200000
+.EE
+With fofn being a file of input file names, and implicit index files present with inputs,
+.EX 2
+samtools mpileup -b fofn -r chr10:100000-200000
+.EE
+Using flags:
+To get reads with flags READ2 or REVERSE and not having any of SECONDARY,QCFAIL,DUP,
+.EX 2
+samtools mpileup --rf READ2,REVERSE in.sam
+.EE
+or
+.EX 2
+samtools mpileup --rf 144 in.sam
+.EE
+To get reads with flag SECONDARY,
+.EX 2
+samtools mpileup --rf SECONDARY --ff QCFAIL,DUP in.sam
+.EE
+Using all possible alignmentes:
+To show all possible alignments, either of below two equivalent commands may be used,
+.EX 2
+samtools mpileup --count-orphans --no-BAQ --max-depth 0 --fasta-ref ref_file.fa \\
+--min-BQ 0 --excl-flags 0 --disable-overlap-removal in.sam
+
+samtools mpileup -A -B -d 0 -f ref_file.fa -Q 0 --ff 0 -x in.sam
+.EE
 .SH AUTHOR
 .PP
 Written by Heng Li from the Sanger Institute.

From 379f1a626167acc0c68c215e5e252daea19a2826 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Wed, 20 Sep 2023 18:01:29 +1200
Subject: [PATCH 10/26] Use the spec-defined PL values in man page example

Spell these values as ILLUMINA and LS454, as per the SAM specification.
The Perl command shown interpolates `@RG` as an undefined array; instead
use printf(1) to avoid any such issues.
---
 doc/samtools-merge.1 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/samtools-merge.1 b/doc/samtools-merge.1
index 2978afc63..c92811a91 100644
--- a/doc/samtools-merge.1
+++ b/doc/samtools-merge.1
@@ -201,7 +201,7 @@ Attach the
 .B RG
 tag while merging sorted alignments:
 .EX 2
-perl -e 'print "@RG\\tID:ga\\tSM:hs\\tLB:ga\\tPL:Illumina\\n@RG\\tID:454\\tSM:hs\\tLB:454\\tPL:454\\n"' > rg.txt
+printf '@RG\\tID:ga\\tSM:hs\\tLB:ga\\tPL:ILLUMINA\\n@RG\\tID:454\\tSM:hs\\tLB:454\\tPL:LS454\\n' > rg.txt
 samtools merge -rh rg.txt merged.bam ga.bam 454.bam
 .EE
 The value in a

From 147ac2f83857d6209e344a77a2ae74324975fc67 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Tue, 3 Oct 2023 16:14:05 +0100
Subject: [PATCH 11/26] Remove -5 option from samtools consensus.

This option referred to the original name of the "gap5" mode, but it's
now just called "-m bayesian" and is the default anyway.  The -5
option produces an error now.
---
 doc/samtools-consensus.1 | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/doc/samtools-consensus.1 b/doc/samtools-consensus.1
index 683b9b80e..2de5774ee 100644
--- a/doc/samtools-consensus.1
+++ b/doc/samtools-consensus.1
@@ -264,13 +264,9 @@ output "N".
 
 .TP 0
 The following options apply only to Bayesian consensus mode enabled
-with the \fB-5\fR option.
+(default on).
 
 .TP 10
-.B -5
-Enable Bayesian consensus algorithm.
-
-.TP
 .BI "-C " C ", --cutoff " C
 Only used for the Gap5 consensus mode, which produces a Phred style
 score for the final consensus quality.  If this is below

From 04d53eba2533eba1fd8090ef448c6628f3d83c81 Mon Sep 17 00:00:00 2001
From: Vasudeva Easwara Sarma <vs8@sanger.ac.uk>
Date: Thu, 19 Oct 2023 12:14:56 +0100
Subject: [PATCH 12/26] treats empty barcode as no barcode - avoids issues with
 realloc

---
 stats.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/stats.c b/stats.c
index 44783a974..1574b3203 100644
--- a/stats.c
+++ b/stats.c
@@ -766,6 +766,9 @@ static void collect_barcode_stats(bam1_t* bam_line, stats_t* stats) {
             continue;
 
         uint32_t barcode_len = strlen(barcode);
+        if (!barcode_len) {
+            continue;        //consider 0 size barcode same as no barcode - avoids issues with realloc below
+        }
         if (!stats->tags_barcode[tag].nbases) { // tag seen for the first time
             uint32_t offset = 0;
             for (i = 0; i < stats->ntags; i++)

From 7cce9186eb2a1d453ce2f55a6fba99b10906bfb6 Mon Sep 17 00:00:00 2001
From: erboone <eboone1001@gmail.com>
Date: Wed, 30 Aug 2023 13:39:15 -0700
Subject: [PATCH 13/26] Add plot of read lengths ("RL") from samtools stats
 output.

Fixes #1824
---
 misc/plot-bamstats | 133 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)

diff --git a/misc/plot-bamstats b/misc/plot-bamstats
index d07c0b1e7..3231ff125 100755
--- a/misc/plot-bamstats
+++ b/misc/plot-bamstats
@@ -25,6 +25,8 @@
 use strict;
 use warnings;
 use Carp;
+use POSIX qw/floor/;
+use List::Util qw(max);
 
 my $opts = parse_params();
 parse_bamcheck($opts);
@@ -39,6 +41,7 @@ plot_coverage($opts);
 plot_mismatches_per_cycle($opts);
 plot_indel_dist($opts);
 plot_indel_cycles($opts);
+plot_read_len_hist($opts);
 create_html($opts);
 
 exit;
@@ -1204,6 +1207,136 @@ sub plot_indel_cycles
     plot($$args{gp});
 }
 
+sub plot_read_len_hist
+{
+    my ($opts) = @_;
+
+    my @rl = get_values($opts, 'RL');
+    if ( !@rl ) { return; }
+
+    # TODO: Add modified version for paired end reads.
+    # my $is_paired = get_value($opts,'SN','reads paired:');
+
+    my $args = get_defaults($opts,"$$opts{prefix}read_length.png");
+    my $fh = $$args{fh};
+
+    # anonymous subroutines for code readibility
+    my $log_bin = sub {
+        my $x = $_[0];
+        return floor((log($x) / log(10)) * 10);
+    };
+    my $inv_log_bin = sub {
+        my $x = $_[0];
+        return floor(10**($x/10));
+    };
+
+    # Open files to store data: this is more elegant than copying four sets of
+    # data into a single GNUplot file.
+    open(my $rd_fh, '>', "$$opts{prefix}read_len_rawdata.txt");
+    open(my $hd_fh, '>', "$$opts{prefix}read_len_histdata.txt");
+
+    # Create gnuplot command file options
+    my $min_x = $rl[0][0];
+    my $max_x = $rl[-1][0];
+
+    # Gnuplot commands for normalized binned data
+    # The normalized plot bins reads by read length with bin widths on a
+    # logarithmic scale (i.e. bins 10-11 bp, 12-14 bp, 15-18 bp, etc.), then
+    # bin height is normalized by the width of the bins.
+    # The end goal is to smooth the distribution of reads compared to plotting
+    # each length without binning.  Visualization of unbinned data is included
+    # in order to make single value spikes more visible, for example in the
+    # case that spike ins are used as a control.
+    print $fh qq[
+        set ytics nomirror
+        set y2tic nomirror
+        set terminal png size 800,1000 truecolor
+        set output "$$args{img}"
+        $$args{grid}
+        set grid noy2tics
+        set multiplot layout 2,1 columnsfirst scale 1,0.9
+        set autoscale y
+        set ylabel "Read Count"
+        set autoscale x
+        set logscale x
+        set xrange[$min_x: $max_x]
+        set xtics rotate by -45 out scale 1 nomirror
+        set xlabel "Read Length"
+        set boxwidth .8 relative
+        set title "$$args{title}" noenhanced
+        set title offset -15,-.5
+        set key at graph 1, 1.15
+        plot "$$opts{prefix}read_len_rawdata.txt" using 1:2 lt rgb "orange" with boxes fs solid .7 title 'unbinned', "$$opts{prefix}read_len_histdata.txt" using 4:3 lt rgb "blue" with boxes title 'normalized bins(count/bin width)' axis x1y2
+    ];
+    print $fh qq[];
+
+    # Gnuplot commands for non-normalized binned data
+    # The un-normalized plot has reads assigned to bins identically to the
+    # normalized plot, however, the bin height is not divided by bin width. The
+    # goal of this visualization is to more accurately reflect the distribution
+    # of reads by volume, and will generally collect around the mean read
+    # length.
+    # Inclusion of unbinned data is to give point of visual comparison to
+    # normalized histogram.
+    print $fh qq[
+        set boxwidth 1 relative
+        plot "$$opts{prefix}read_len_rawdata.txt" using 1:2 lt rgb "orange" with boxes fs solid .7 title 'unbinned', "$$opts{prefix}read_len_histdata.txt" using 4:2 lt rgb "blue" with boxes title 'unnormalized bins' axis x1y2
+    ];
+
+    my @logscale_data = (0)x&$log_bin($max_x);
+    my $log_x = 0;
+    my $start_i = 1;
+    for my $line (@rl)
+    {
+        my @deref_line = @{$line};
+        my $curr_x = $deref_line[0];
+        my $curr_y = $deref_line[1];
+
+        print $rd_fh "$curr_x $curr_y\n";
+        $log_x = max(&$log_bin($curr_x), 5);
+        @logscale_data[floor($log_x)] += $curr_y;
+        $start_i = $curr_x + 1;
+    }
+
+    # Copy binned data into its own file
+
+    # This is a special case to deal with the problem that reads of len < 5 do
+    # not cooperate with the log10(x)*10 transformation and cause small
+    # fragments to graph badly.
+    my $start = 1;
+    my $end = 3;
+    my $div_binw_val = $logscale_data[5]/($end - $start);
+    my $start_idx = -1;
+    for my $i (0..@logscale_data-1) {
+        if($logscale_data[$i] != 0)
+        {
+            $start_idx = $i;
+            if ($i == 5)
+            {
+                print $hd_fh "5 $logscale_data[5] $div_binw_val $start $end\n";
+                $start_idx += 1;
+            }
+            last;
+        }
+    }
+
+
+    for my $i ($start_idx..@logscale_data-1) {
+        $start = &$inv_log_bin($i);
+        $end = &$inv_log_bin($i+1);
+        $div_binw_val = $logscale_data[$i]/max(($end - $start), 1);
+        print $hd_fh "$i $logscale_data[$i] $div_binw_val $start $end\n";
+    }
+
+    # Close file buffers and clean up workspace
+    close($fh);
+    close($hd_fh);
+    close($rd_fh);
+    plot($$args{gp});
+    unlink("$$opts{prefix}read_len_rawdata.txt");
+    unlink("$$opts{prefix}read_len_histdata.txt");
+}
+
 sub merge_bamcheck
 {
     my ($opts) = @_;

From 9a594672ff98c903a642da760cd9bbeb0b15285a Mon Sep 17 00:00:00 2001
From: vasudeva8 <vasudeva.sarma@sanger.ac.uk>
Date: Mon, 16 Oct 2023 12:46:47 +0100
Subject: [PATCH 14/26] updated cat to support multiple non-seekable streams

---
 bam_cat.c | 330 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 179 insertions(+), 151 deletions(-)

diff --git a/bam_cat.c b/bam_cat.c
index ed8cf58c5..4bc845dca 100644
--- a/bam_cat.c
+++ b/bam_cat.c
@@ -1,6 +1,6 @@
 /*  bam_cat.c -- efficiently concatenates bam files.
 
-    Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021 Genome Research Ltd.
+    Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021, 2023 Genome Research Ltd.
     Modified SAMtools work copyright (C) 2010 Illumina, Inc.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -49,52 +49,76 @@ Illumina.
 #include "samtools.h"
 #include "sam_opts.h"
 
-/*
- * Check the files are consistent and capable of being concatenated.
- * Also fills out the version numbers and produces a new sam_hdr_t
- * structure with merged RG lines.
- * Note it is only a simple merge.
- *
- * Returns updated header on success;
- *        NULL on failure.
- */
-static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t *h,
-                                     int *vers_maj_p, int *vers_min_p) {
+/// cat_check_merge_hdr - check compatibility and merge RG hearders merges RGon both CRAM and BAM.
+/** @param firstfile - pointer to the 1sr file opened in caller
+ *  @param nfn - number of files to be processed, including the firstfile
+ *  @param fn - array of file paths to be processed
+ *  @param h - sam header pointer which contains explicitly given header
+ *  @param vers_maj_p - cram major version set and send out for output creation
+ *  @param vers_min_p - cram min version set and send out for output creation
+ *  @param out_h - pointer to sam header pointer, outputs the merged header
+ * returns array of opened samFile pointers on success and NULL on failure
+ * This method has the merged header processing for cram and bam.
+ * RG lines are merged for both cram and bam. For cram, version match for each
+ * file and order match of RG lines are compared as well.
+ * Note: it is a simple merge of RG lines alone.
+*/
+static samFile** cat_check_merge_hdr(samFile * const firstfile, int nfn, char * const *fn, const sam_hdr_t *h,
+                                     int *vers_maj_p, int *vers_min_p, sam_hdr_t **out_h) {
     int i, vers_maj = -1, vers_min = -1;
     sam_hdr_t *new_h = NULL, *old_h = NULL;
     samFile *in = NULL;
     kstring_t ks = KS_INITIALIZE;
-
-    if (h) {
-        new_h = sam_hdr_dup(h);
-        if (!new_h) {
-            fprintf(stderr, "[%s] ERROR: header duplication failed.\n",
-                    __func__);
-            goto fail;
+    samFile **files = calloc(nfn, sizeof(samFile *));
+    if(!files) {
+        fprintf(stderr, "[%s] ERROR: failed to allocate space for file handles.\n", __func__);
+        return NULL;
+    }
+    if (!out_h || !firstfile) {
+        fprintf(stderr, "[%s] ERROR: header check failed.\n", __func__);
+        goto fail;
+    }
+    if (*out_h) {           //use header if one is already present
+        new_h = *out_h;
+    }
+    else {
+        if (h) {            //use the explicit header given
+            new_h = sam_hdr_dup(h);
+            if (!new_h) {
+                fprintf(stderr, "[%s] ERROR: header duplication failed.\n",
+                        __func__);
+                goto fail;
+            }
         }
     }
 
     for (i = 0; i < nfn; ++i) {
-        cram_fd *in_c;
         int ki;
-
-        in = sam_open(fn[i], "rc");
+        //1st file is already open and passed, rest open locally
+        files[i] = in = i ? sam_open(fn[i], "r") : firstfile;
         if (in == 0) {
             print_error_errno("cat", "fail to open file '%s'", fn[i]);
             goto fail;
         }
-        in_c = in->fp.cram;
-
-        int vmaj = cram_major_vers(in_c);
-        int vmin = cram_minor_vers(in_c);
-        if ((vers_maj != -1 && vers_maj != vmaj) ||
-            (vers_min != -1 && vers_min != vmin)) {
-            fprintf(stderr, "[%s] ERROR: input files have differing version numbers.\n",
-                    __func__);
+        if (firstfile->format.format != in->format.format) {
+            print_error("cat", "File %s is of different format!", fn[i]);
             goto fail;
         }
-        vers_maj = vmaj;
-        vers_min = vmin;
+        if (firstfile->format.format == cram) {     //version check for cram
+            cram_fd *in_c;
+            in_c = in->fp.cram;
+
+            int vmaj = cram_major_vers(in_c);
+            int vmin = cram_minor_vers(in_c);
+            if ((vers_maj != -1 && vers_maj != vmaj) ||
+                (vers_min != -1 && vers_min != vmin)) {
+                fprintf(stderr, "[%s] ERROR: input files have differing version numbers.\n",
+                        __func__);
+                goto fail;
+            }
+            vers_maj = vmaj;
+            vers_min = vmin;
+        }
 
         old_h = sam_hdr_read(in);
         if (!old_h) {
@@ -111,10 +135,10 @@ static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t
                 goto fail;
             }
             sam_hdr_destroy(old_h);
-            sam_close(in);
+            old_h = NULL;
             continue;
         }
-
+        //merge RG lines
         int old_count = sam_hdr_count_lines(old_h, "RG");
         for (ki = 0; ki < old_count; ki++) {
             const char *old_name = sam_hdr_line_name(old_h, "RG", ki);
@@ -136,7 +160,8 @@ static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t
             }
         }
 
-        if (old_count > 1 && sam_hdr_count_lines(new_h, "RG") == old_count) {
+        if (firstfile->format.format == cram && old_count > 1 && sam_hdr_count_lines(new_h, "RG") == old_count) {
+            //RG order check for cram
             for (ki = 0; ki < old_count; ki++) {
                 const char *old_name = sam_hdr_line_name(old_h, "RG", ki);
                 const char *new_name = sam_hdr_line_name(new_h, "RG", ki);
@@ -148,72 +173,61 @@ static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t
             }
         }
 
-        sam_hdr_destroy(old_h);
-        sam_close(in);
+        sam_hdr_destroy(old_h); old_h = NULL;
     }
 
     ks_free(&ks);
 
-    *vers_maj_p = vers_maj;
-    *vers_min_p = vers_min;
-
-    return new_h;
+    if (vers_maj_p) {
+        *vers_maj_p = vers_maj;
+    }
+    if (vers_min_p) {
+        *vers_min_p = vers_min;
+    }
+    *out_h = new_h;
+    return files;
 
 fail:
     ks_free(&ks);
     if (old_h) sam_hdr_destroy(old_h);
     if (new_h) sam_hdr_destroy(new_h);
-    if (in) sam_close(in);
+    *out_h = NULL;
+    for (i = 1; i < nfn; ++i) {         //close files other than the firstfile
+        if (files[i]) {
+            sam_close(files[i]);
+        }
+    }
+    free(files);
 
     return NULL;
 }
 
-
-/*
- * CRAM files don't store the RG:Z:ID per read in the aux field.
- * Instead they have a numerical data series (RG) to point each read
- * back to the Nth @RG line in the file.  This means that we may need
- * to edit the RG data series (if the files were produced from
- * "samtools split" for example).
- *
- * The encoding method is stored in the compression header. Typical
- * examples:
- *
- * RG => EXTERNAL {18}           # Block content-id 18 holds RG values
- *                               # as a series of ITF8 encoded values
- *
- * RG => HUFFMAN {1, 255, 255, 255, 255, 255, 1, 0}
- *                               # One RG value #-1.  (No RG)
- *
- * RG => HUFFMAN {1, 0, 1, 0}    # One RG value #0 (always first RG)
- *
- * RG => HUFFMAN {2, 0, 1, 2, 1, 1}
- *                               # Two RG values, #0 and #1, written
- *                               # to the CORE block and possibly
- *                               # mixed with other data series.
- *
- * A single value can (but may not be) implemented as a zero bit
- * huffman code.  In this situation we can change the meta-data in the
- * compression header to renumber an RG value..
- */
-int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, sam_global_args *ga, char *arg_list, int no_pg)
+int cram_cat(samFile * const firstfile, int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, sam_global_args *ga, char *arg_list, int no_pg)
 {
-    samFile *out;
+    samFile *out = NULL;
     cram_fd *out_c;
-    int i, vers_maj, vers_min;
+    int i, vers_maj, vers_min, ret = -1;
     sam_hdr_t *new_h = NULL;
+    samFile **files = NULL;
 
-    /* Check consistent versioning and compatible headers */
-    if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &vers_maj, &vers_min)))
+    /* Check consistent versioning and compatible headers;
+    merges RG lines, opens all files and returns them that multiple non-seekable
+    stream inputs can be handled */
+    if (!(files = cat_check_merge_hdr(firstfile, nfn, fn, h, &vers_maj, &vers_min, &new_h)))
         return -1;
 
+    if (!new_h) {
+        print_error_errno("cat", "failed to make output header");
+        goto closefiles;
+    }
+
     /* Open the file with cram_vers */
     char vers[100];
-    sprintf(vers, "%d.%d", vers_maj, vers_min);
+    snprintf(vers, sizeof(vers), "%d.%d", vers_maj, vers_min);
     out = sam_open_format(outcram, "wc", &ga->out);
     if (out == 0) {
         print_error_errno("cat", "fail to open output file '%s'", outcram);
-        return -1;
+        goto closefiles;
     }
     out_c = out->fp.cram;
     cram_set_option(out_c, CRAM_OPT_VERSION, vers);
@@ -224,12 +238,13 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram,
                                  arg_list ? "CL": NULL,
                                  arg_list ? arg_list : NULL,
                                  NULL))
-        return -1;
+        goto closefiles;
 
     if (sam_hdr_write(out, new_h) < 0) {
         print_error_errno("cat", "Couldn't write header");
-        return -1;
+        goto closefiles;
     }
+    out_c = out->fp.cram;
 
     for (i = 0; i < nfn; ++i) {
         samFile *in;
@@ -238,17 +253,17 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram,
         sam_hdr_t *old_h;
         int new_rg = -1;
 
-        in = sam_open(fn[i], "rc");
+        in = files[i];
         if (in == 0) {
             print_error_errno("cat", "fail to open file '%s'", fn[i]);
-            return -1;
+            goto closefiles;
         }
         in_c = in->fp.cram;
 
         old_h = sam_hdr_read(in);
         if (!old_h) {
             print_error("cat", "fail to read the header of file '%s'", fn[i]);
-            return -1;
+            goto closefiles;
         }
 
         // Compute RG mapping if suitable for changing.
@@ -258,11 +273,11 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram,
                 new_rg = sam_hdr_line_index(new_h, "RG", old_name);
                 if (new_rg < 0) {
                     print_error("cat", "fail to find @RG line '%s' in the new header", old_name);
-                    return -1;
+                    goto closefiles;
                 }
             } else {
                 print_error("cat", "fail to find @RG line in file '%s'", fn[i]);
-                return -1;
+                goto closefiles;
             }
         } else {
             new_rg = 0;
@@ -274,7 +289,7 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram,
                 cram_block *blk;
                 // Container compression header
                 if (!(blk = cram_read_block(in_c)))
-                    return -1;
+                    goto closefiles;
                 cram_free_block(blk);
                 cram_free_container(c);
                 continue;
@@ -292,34 +307,42 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram,
 
                 // Not switching rg so do the usual read/write loop
                 if (cram_write_container(out_c, c) != 0)
-                    return -1;
+                    goto closefiles;
 
                 // Container compression header
                 if (!(blk = cram_read_block(in_c)))
-                    return -1;
+                    goto closefiles;
                 if (cram_write_block(out_c, blk) != 0) {
                     cram_free_block(blk);
-                    return -1;
+                    goto closefiles;
                 }
                 cram_free_block(blk);
 
-
                 // Container num_blocks can be invalid, due to a bug.
                 // Instead we iterate in slice context instead.
                 (void)cram_container_get_landmarks(c, &num_slices);
                 cram_copy_slice(in_c, out_c, num_slices);
             }
-
             cram_free_container(c);
         }
-
         sam_hdr_destroy(old_h);
-        sam_close(in);
     }
-    sam_close(out);
-    sam_hdr_destroy(new_h);
+    ret = 0;
 
-    return 0;
+closefiles:
+    if (out) {
+        sam_close(out);
+    }
+    if (new_h) {
+        sam_hdr_destroy(new_h);
+    }
+    for (i = 1; i < nfn; ++i) {     //skip firstfile and close rest
+        if (files[i]) {
+            sam_close(files[i]);
+        }
+    }
+    free(files);
+    return ret;
 }
 
 
@@ -330,31 +353,40 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram,
 
 #define BGZF_EMPTY_BLOCK_SIZE 28
 
-int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *arg_list, int no_pg)
+int bam_cat(samFile * const firstfile, int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *arg_list, int no_pg)
 {
-    BGZF *fp, *in = NULL;
+    BGZF *fp = NULL, *in = NULL;
     uint8_t *buf = NULL;
     uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];
     const int es=BGZF_EMPTY_BLOCK_SIZE;
     int i;
+    samFile **files = NULL;
+    sam_hdr_t *new_h = NULL;
 
+    /* merges RG lines, opens all files and returns them that multiple non-seekable
+    stream inputs can be handled */
+    if (!(files = cat_check_merge_hdr(firstfile, nfn, fn, h, NULL, NULL, &new_h)))
+        return -1;
+    if (!new_h) {
+        print_error_errno("cat", "failed to make output header");
+        goto fail;
+    }
     fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w");
     if (fp == 0) {
         print_error_errno("cat", "fail to open output file '%s'", outbam);
-        return -1;
+        goto fail;
     }
-    if (h) {
-        if (!no_pg && sam_hdr_add_pg(h, "samtools",
-                                     "VN", samtools_version(),
-                                     arg_list ? "CL": NULL,
-                                     arg_list ? arg_list : NULL,
-                                     NULL))
-            goto fail;
 
-        if (bam_hdr_write(fp, h) < 0) {
-            print_error_errno("cat", "Couldn't write header");
-            goto fail;
-        }
+    if (!no_pg && sam_hdr_add_pg(new_h, "samtools",
+                                    "VN", samtools_version(),
+                                    arg_list ? "CL": NULL,
+                                    arg_list ? arg_list : NULL,
+                                    NULL))
+        goto fail;
+
+    if (bam_hdr_write(fp, new_h) < 0) {
+        print_error_errno("cat", "Couldn't write header");
+        goto fail;
     }
 
     buf = (uint8_t*) malloc(BUF_SIZE);
@@ -363,35 +395,13 @@ int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *a
         goto fail;
     }
     for(i = 0; i < nfn; ++i){
-        sam_hdr_t *old;
         int len,j;
-
-        in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r");
+        in = files[i]->fp.bgzf;
         if (in == 0) {
             print_error_errno("cat", "fail to open file '%s'", fn[i]);
             goto fail;
         }
-        if (in->is_write) return -1;
-
-        old = bam_hdr_read(in);
-        if (old == NULL) {
-            fprintf(stderr, "[%s] ERROR: couldn't read header for '%s'.\n",
-                    __func__, fn[i]);
-            goto fail;
-        }
-        if (h == 0 && i == 0) {
-            if (!no_pg && sam_hdr_add_pg(old, "samtools",
-                                         "VN", samtools_version(),
-                                         arg_list ? "CL": NULL,
-                                         arg_list ? arg_list : NULL,
-                                         NULL))
-                goto fail;
-
-            if (bam_hdr_write(fp, old) < 0) {
-                print_error_errno("cat", "Couldn't write header");
-                goto fail;
-            }
-        }
+        if (in->is_write) goto fail;
 
         if (in->block_offset < in->block_length) {
             if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
@@ -432,27 +442,42 @@ int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *a
                 if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail;
             }
         }
-        sam_hdr_destroy(old);
-        bgzf_close(in);
         in = NULL;
     }
     free(buf);
     if (bgzf_close(fp) < 0) {
         fprintf(stderr, "[%s] Error on closing '%s'.\n", __func__, outbam);
-        return -1;
+        goto fail;
+    }
+    for (i = 1; i < nfn; ++i) {     //skip firstfile and close rest
+        if (files[i]) {
+            sam_close(files[i]);
+        }
     }
+    free(files);
+    sam_hdr_destroy(new_h);
     return 0;
 
  write_fail:
     fprintf(stderr, "[%s] Error writing to '%s'.\n", __func__, outbam);
  fail:
-    if (in) bgzf_close(in);
+    if (new_h) {
+        sam_hdr_destroy(new_h);
+    }
     if (fp) bgzf_close(fp);
     free(buf);
+
+    if (files) {
+        for(i = 1; i < nfn; ++i) {  //except the firstfile
+            if(files[i]) {
+                sam_close(files[i]);
+            }
+        }
+        free(files);
+    }
     return -1;
 }
 
-
 int main_cat(int argc, char *argv[])
 {
     sam_hdr_t *h = 0;
@@ -472,21 +497,22 @@ int main_cat(int argc, char *argv[])
     char *arg_list = NULL;
 
     sam_global_args_init(&ga);
-
     while ((c = getopt_long(argc, argv, "h:o:b:@:", lopts, NULL)) >= 0) {
         switch (c) {
             case 'h': {
                 samFile *fph = sam_open(optarg, "r");
                 if (fph == 0) {
                     fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, optarg);
-                    return 1;
+                    ret = 1;
+                    goto end;
                 }
                 h = sam_hdr_read(fph);
                 if (h == NULL) {
                     fprintf(stderr,
                             "[%s] ERROR: failed to read the header from '%s'.\n",
                             __func__, optarg);
-                    return 1;
+                    ret = 1;
+                    goto end;
                 }
                 sam_close(fph);
                 break;
@@ -521,7 +547,8 @@ int main_cat(int argc, char *argv[])
 
     if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) {
         print_error("cat", "failed to create arg_list");
-        return 1;
+        ret = 1;
+        goto end;
     }
 
     // Append files specified in argv to the list.
@@ -541,34 +568,34 @@ int main_cat(int argc, char *argv[])
         fprintf(stderr, "         -h FILE  copy the header from FILE [default is 1st input file]\n");
         fprintf(stderr, "         -o FILE  output BAM/CRAM\n");
         fprintf(stderr, "         --no-PG  do not add a PG line\n");
-        sam_global_opt_help(stderr, "--..-@-.");
-        return 1;
+        sam_global_opt_help(stderr, "---.-@-.");
+        ret = 1;
+        goto end;
     }
 
     in = sam_open(infns[0], "r");
     if (!in) {
         print_error_errno("cat", "failed to open file '%s'", infns[0]);
-        return 1;
+        ret = 1;
+        goto end;
     }
 
     switch (hts_get_format(in)->format) {
     case bam:
-        sam_close(in);
-        if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", arg_list, no_pg) < 0)
+        if (bam_cat(in, infns_size+nargv_fns, infns, h, outfn? outfn : "-", arg_list, no_pg) < 0)
             ret = 1;
         break;
 
     case cram:
-        sam_close(in);
-        if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", &ga, arg_list, no_pg) < 0)
+        if (cram_cat(in, infns_size+nargv_fns, infns, h, outfn? outfn : "-", &ga, arg_list, no_pg) < 0)
             ret = 1;
         break;
 
     default:
-        sam_close(in);
         fprintf(stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__);
-        return 1;
+        ret = 1;
     }
+    sam_close(in);
 
  end:
     if (infns_size > 0) {
@@ -582,6 +609,7 @@ int main_cat(int argc, char *argv[])
     free(arg_list);
     if (h)
         sam_hdr_destroy(h);
+    sam_global_args_free(&ga);
 
     return ret;
 }

From 57de9c2fb201887f72aba4097233e173fd046d5b Mon Sep 17 00:00:00 2001
From: Valeriu Ohan <vo2@sanger.ac.uk>
Date: Tue, 17 Mar 2020 14:53:06 +0000
Subject: [PATCH 15/26] Add -d TAG option, which allows read split by custom
 aux tags.

Reworked by rmd to use the existing rg arrays for both split
by read group, and split by alternate tags.  The original
used a separate hash table for the `-d` option.

Co-authored-by: Rob Davies <rmd+git@sanger.ac.uk>
---
 bam_split.c          | 260 +++++++++++++++++++++++++++++++++++--------
 doc/samtools-split.1 |   8 +-
 2 files changed, 218 insertions(+), 50 deletions(-)

diff --git a/bam_split.c b/bam_split.c
index e9f0fb591..67ef4735c 100644
--- a/bam_split.c
+++ b/bam_split.c
@@ -48,6 +48,7 @@ struct parsed_opts {
     const char *unaccounted_header_name;
     const char *unaccounted_name;
     const char *output_format_string;
+    const char *tag;
     bool verbose;
     int no_pg;
     sam_global_args ga;
@@ -61,11 +62,11 @@ struct state {
     samFile* unaccounted_file;
     sam_hdr_t* unaccounted_header;
     size_t output_count;
-    char** rg_id;
+    char **rg_id;
     char **rg_index_file_name;
     char **rg_output_file_name;
-    samFile** rg_output_file;
-    sam_hdr_t** rg_output_header;
+    samFile **rg_output_file;
+    sam_hdr_t **rg_output_header;
     kh_c2i_t* rg_hash;
     htsThreadPool p;
     int write_index;
@@ -85,6 +86,7 @@ static void usage(FILE *write_to)
 "  -f STRING       output filename format string [\"%%*_%%#.%%.\"]\n"
 "  -u FILE1        put reads with no RG tag or an unrecognised RG tag in FILE1\n"
 "  -h FILE2        ... and override the header with FILE2 (-u file only)\n"
+"  -d TAG          split by TAG value. TAG value must be a string.\n"
 "  -v              verbose output\n"
 "  --no-PG         do not add a PG line\n");
     sam_global_opt_help(write_to, "-....@..");
@@ -104,7 +106,7 @@ static parsed_opts_t* parse_args(int argc, char** argv)
 {
     if (argc == 1) { usage(stdout); return NULL; }
 
-    const char *optstring = "vf:h:u:@:";
+    const char *optstring = "vf:h:u:d:@:";
 
     static const struct option lopts[] = {
         SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
@@ -132,6 +134,10 @@ static parsed_opts_t* parse_args(int argc, char** argv)
         case 'u':
             retval->unaccounted_name = optarg;
             break;
+        case 'd':
+            retval->tag = optarg;
+            retval->output_format_string = "%*_%!.%.";
+            break;
         case 1:
             retval->no_pg = 1;
             break;
@@ -275,6 +281,133 @@ static int header_compatible(sam_hdr_t *hdr1, sam_hdr_t *hdr2)
     return 0;
 }
 
+static int grow_output_lists(state_t *state, size_t count) {
+    char **new_list = realloc(state->rg_id, count * sizeof(char *));
+    if (!new_list)
+        return -1;
+    state->rg_id = new_list;
+    new_list = realloc(state->rg_index_file_name, count * sizeof(char *));
+    if (!new_list)
+        return -1;
+    state->rg_index_file_name = new_list;
+    new_list = realloc(state->rg_output_file_name, count * sizeof(char *));
+    if (!new_list)
+        return -1;
+    state->rg_output_file_name = new_list;
+    samFile **new_file = realloc(state->rg_output_file,
+                                 count * sizeof(samFile *));
+    if (!new_file)
+        return -1;
+    state->rg_output_file = new_file;
+    sam_hdr_t **new_hdr = realloc(state->rg_output_header,
+                                  count * sizeof(sam_hdr_t *));
+    if (!new_hdr)
+        return -1;
+    state->rg_output_header = new_hdr;
+    return 0;
+}
+
+static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state,
+                              const char *tag, const char *arg_list) {
+    char *input_base_name = NULL, *new_file_name = NULL, *tag_key = NULL;
+    char *new_idx_fn = NULL;
+    sam_hdr_t *new_hdr = NULL;
+    samFile *new_sam_file = NULL;
+
+    khiter_t i = kh_get_c2i(state->rg_hash, tag);
+    if (i != kh_end(state->rg_hash)) {
+        return i;
+    }
+    // create new file
+    if (grow_output_lists(state, state->output_count + 1) != 0) {
+        print_error_errno("split", "Couldn't grow output lists");
+        return kh_end(state->rg_hash);
+    }
+    tag_key = strdup(tag);
+    if (!tag_key) {
+        print_error_errno("split", "Couldn't copy tag value");
+        return kh_end(state->rg_hash);
+    }
+    char *dirsep = strrchr(opts->merged_input_name, '/');
+    input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
+    if (!input_base_name) {
+        print_error_errno("split", "Filename parsing failed");
+        goto fail;
+    }
+
+    char* extension = strrchr(input_base_name, '.');
+    if (extension) *extension = '\0';
+
+    new_file_name = expand_format_string(opts->output_format_string, input_base_name, tag, 0, &opts->ga.out);
+    if (!new_file_name) {
+        print_error_errno("split", "Filename creation failed");
+        goto fail;
+    }
+
+    new_hdr = sam_hdr_dup(state->merged_input_header);
+    if (!new_hdr) {
+        print_error_errno("split", "Duplicating header for file \"%s\" failed", new_file_name);
+        goto fail;
+    }
+    if (!opts->no_pg && sam_hdr_add_pg(new_hdr, "samtools",
+                                       "VN", samtools_version(),
+                                       arg_list ? "CL": NULL,
+                                       arg_list ? arg_list : NULL,
+                                       NULL)) {
+        print_error_errno("split", "Adding PG line to file \"%s\" failed", new_file_name);
+        goto fail;
+    }
+
+    char outmode[4] = "w";
+    sam_open_mode(outmode + 1, new_file_name, NULL);
+    new_sam_file = sam_open_format(new_file_name, outmode, &opts->ga.out);
+    if (!new_sam_file) {
+        print_error_errno("split", "Opening filename for writing \"%s\" failed", new_file_name);
+        goto fail;
+    }
+    if (state->p.pool)
+        hts_set_opt(new_sam_file, HTS_OPT_THREAD_POOL, &state->p);
+
+    if (state->write_index) {
+        new_idx_fn = auto_index(new_sam_file, new_file_name, new_hdr);
+        if (!new_idx_fn) {
+            print_error_errno("split", "Creating index file for file \"%s\" failed", new_file_name);
+            goto fail;
+        }
+    }
+
+    if (sam_hdr_write(new_sam_file, new_hdr) != 0) {
+        print_error_errno("split", "Couldn't write header to \"%s\"",
+                          new_file_name);
+        goto fail;
+    }
+    int ret = -1;
+    i = kh_put_c2i(state->rg_hash, tag_key, &ret);
+    if (ret < 0) {
+        print_error_errno("split", "Adding file \"%s\" failed", new_file_name);
+        goto fail;
+    }
+
+    kh_val(state->rg_hash, i) = state->output_count;
+    state->rg_id[state->output_count] = tag_key;
+    state->rg_index_file_name[state->output_count] = new_idx_fn;
+    state->rg_output_file_name[state->output_count] = new_file_name;
+    state->rg_output_file[state->output_count] = new_sam_file;
+    state->rg_output_header[state->output_count] = new_hdr;
+    state->output_count++;
+    free(input_base_name);
+    return i;
+
+ fail:
+    free(input_base_name);
+    free(new_file_name);
+    free(tag_key);
+    free(new_idx_fn);
+    sam_hdr_destroy(new_hdr);
+    sam_close(new_sam_file);
+    return kh_end(state->rg_hash);
+}
+
 // Set the initial state
 static state_t* init(parsed_opts_t* opts, const char *arg_list)
 {
@@ -306,6 +439,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
         cleanup_state(retval, false);
         return NULL;
     }
+    retval->write_index = opts->ga.write_index;
 
     if (opts->unaccounted_name) {
         if (opts->unaccounted_header_name) {
@@ -331,10 +465,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
         } else {
             retval->unaccounted_header = sam_hdr_dup(retval->merged_input_header);
             if (!opts->no_pg && sam_hdr_add_pg(retval->unaccounted_header, "samtools",
-                                    "VN", samtools_version(),
-                                    arg_list ? "CL": NULL,
-                                    arg_list ? arg_list : NULL,
-                                    NULL)) {
+                                               "VN", samtools_version(),
+                                               arg_list ? "CL": NULL,
+                                               arg_list ? arg_list : NULL,
+                                               NULL)) {
                 print_error("split", "Could not rewrite header for \"%s\"", opts->unaccounted_name);
                 cleanup_state(retval, false);
                 return NULL;
@@ -354,9 +488,19 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
             hts_set_opt(retval->unaccounted_file, HTS_OPT_THREAD_POOL, &retval->p);
     }
 
-    // Open output files for RGs
-    if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL;
-    if (opts->verbose) fprintf(stderr, "@RG's found %zu\n",retval->output_count);
+    int is_rg = !opts->tag || strcmp(opts->tag, "RG") == 0;
+    if (is_rg) {
+        if (!count_RG(retval->merged_input_header,
+                      &retval->output_count, &retval->rg_id)) {
+            cleanup_state(retval, false);
+            return NULL;
+        }
+        if (opts->verbose)
+            fprintf(stderr, "@RG's found %zu\n",retval->output_count);
+    } else {
+        retval->output_count = 0;
+    }
+
     // Prevent calloc(0, size);
     size_t num = retval->output_count ? retval->output_count : 1;
     retval->rg_index_file_name = (char **)calloc(num, sizeof(char *));
@@ -370,7 +514,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
         cleanup_state(retval, false);
         return NULL;
     }
+    if (!is_rg)
+        return retval;  // Done for this case - outputs will be opened later
 
+    // Open output files for RGs
     char* dirsep = strrchr(opts->merged_input_name, '/');
     char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
     if (!input_base_name) {
@@ -398,7 +545,6 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
         }
 
         retval->rg_output_file_name[i] = output_filename;
-
         sam_open_mode(outmode + 1, output_filename, NULL);
         retval->rg_output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out);
 
@@ -425,12 +571,12 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
         // Set and edit header
         retval->rg_output_header[i] = sam_hdr_dup(retval->merged_input_header);
         if (sam_hdr_remove_except(retval->rg_output_header[i], "RG", "ID", retval->rg_id[i]) ||
-           (!opts->no_pg &&
-            sam_hdr_add_pg(retval->rg_output_header[i], "samtools",
-                        "VN", samtools_version(),
-                        arg_list ? "CL": NULL,
-                        arg_list ? arg_list : NULL,
-                        NULL))) {
+            (!opts->no_pg &&
+             sam_hdr_add_pg(retval->rg_output_header[i], "samtools",
+                            "VN", samtools_version(),
+                            arg_list ? "CL": NULL,
+                            arg_list ? arg_list : NULL,
+                            NULL))) {
             print_error("split", "Could not rewrite header for \"%s\"", output_filename);
             cleanup_state(retval, false);
             free(input_base_name);
@@ -439,34 +585,17 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
     }
 
     free(input_base_name);
-    retval->write_index = opts->ga.write_index;
 
     return retval;
 }
 
-static bool split(state_t* state)
+static bool split(state_t* state, parsed_opts_t *opts, char *arg_list)
 {
+    int is_rg = !opts->tag || strcmp(opts->tag, "RG") == 0;
     if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
         print_error_errno("split", "Could not write output file header");
         return false;
     }
-    size_t i;
-    for (i = 0; i < state->output_count; i++) {
-        if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) {
-            print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]);
-            return false;
-        }
-        if (state->write_index) {
-            state->rg_index_file_name[i] = auto_index(state->rg_output_file[i],
-                                                      state->rg_output_file_name[i],
-                                                      state->rg_output_header[i]);
-            if (!state->rg_index_file_name[i]) {
-                print_error_errno("split", "Could not create index for file \"%s\"", state->rg_output_file_name[i]);
-                return false;
-            }
-        }
-    }
-
     bam1_t* file_read = bam_init1();
     // Read the first record
     int r;
@@ -480,25 +609,54 @@ static bool split(state_t* state)
         }
     }
 
+    if (is_rg) {
+        size_t i;
+        for (i = 0; i < state->output_count; i++) {
+            if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) {
+                print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]);
+                goto error;
+            }
+            if (state->write_index) {
+                state->rg_index_file_name[i] = auto_index(state->rg_output_file[i],
+                        state->rg_output_file_name[i],
+                        state->rg_output_header[i]);
+                if (!state->rg_index_file_name[i]) {
+                    print_error_errno("split", "Could not create index for file \"%s\"", state->rg_output_file_name[i]);
+                    goto error;
+                }
+            }
+        }
+    }
     while (file_read != NULL) {
         // Get RG tag from read and look it up in hash to find file to output it to
-        uint8_t* tag = bam_aux_get(file_read, "RG");
+        uint8_t* tag = bam_aux_get(file_read, is_rg ? "RG" : opts->tag);
+        char *val = tag ? bam_aux2Z(tag) : NULL;
         khiter_t iter;
-        if ( tag != NULL ) {
-            char* rg = bam_aux2Z(tag);
-            iter = kh_get_c2i(state->rg_hash, rg);
+        if ( val != NULL ) {
+            iter = kh_get_c2i(state->rg_hash, val);
         } else {
             iter = kh_end(state->rg_hash);
         }
 
+        if (!is_rg && val && iter == kh_end(state->rg_hash)) {
+            // Need to open a new output file
+            iter = prep_sam_file(opts, state, val, arg_list);
+            if (iter == kh_end(state->rg_hash)) { // Open failed
+                print_error("split",
+                            "Could not create output file for tag \"%s:%s\"",
+                            opts->tag, bam_aux2Z(tag));
+                goto error;
+
+            }
+        }
+
         // Write the read out to correct file
         if (iter != kh_end(state->rg_hash)) {
             // if found write to the appropriate untangled bam
             int i = kh_val(state->rg_hash,iter);
             if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) {
                 print_error_errno("split", "Could not write to \"%s\"", state->rg_output_file_name[i]);
-                bam_destroy1(file_read);
-                return false;
+                goto error;
             }
         } else {
             // otherwise write to the unaccounted bam if there is one or fail
@@ -506,15 +664,14 @@ static bool split(state_t* state)
                 if (tag) {
                     fprintf(stderr, "Read \"%s\" with unaccounted for tag \"%s\".\n", bam_get_qname(file_read), bam_aux2Z(tag));
                 } else {
-                    fprintf(stderr, "Read \"%s\" has no RG tag.\n", bam_get_qname(file_read));
+                    fprintf(stderr, "Read \"%s\" has no %s tag.\n",
+                            bam_get_qname(file_read), is_rg ? "RG" : opts->tag);
                 }
-                bam_destroy1(file_read);
-                return false;
+                goto error;
             } else {
                 if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) {
                     print_error_errno("split", "Could not write to unaccounted output file");
-                    bam_destroy1(file_read);
-                    return false;
+                    goto error;
                 }
             }
         }
@@ -532,6 +689,7 @@ static bool split(state_t* state)
     }
 
     if (state->write_index) {
+        size_t i;
         for (i = 0; i < state->output_count; i++) {
             if (sam_idx_save(state->rg_output_file[i]) < 0) {
                 print_error_errno("split", "writing index failed");
@@ -542,6 +700,9 @@ static bool split(state_t* state)
     }
 
     return true;
+error:
+    bam_destroy1(file_read);
+    return false;
 }
 
 static int cleanup_state(state_t* status, bool check_close)
@@ -577,6 +738,7 @@ static int cleanup_state(state_t* status, bool check_close)
     free(status->rg_output_file_name);
     free(status->rg_index_file_name);
     kh_destroy_c2i(status->rg_hash);
+
     free(status->rg_id);
     if (status->p.pool)
         hts_tpool_destroy(status->p.pool);
@@ -603,7 +765,7 @@ int main_split(int argc, char** argv)
     state_t* status = init(opts, arg_list);
     if (!status) goto cleanup_opts;
 
-    if (!split(status)) {
+    if (!split(status, opts, arg_list)) {
         cleanup_state(status, false);
         goto cleanup_opts;
     }
diff --git a/doc/samtools-split.1 b/doc/samtools-split.1
index 9ae71ce30..0adea99f0 100644
--- a/doc/samtools-split.1
+++ b/doc/samtools-split.1
@@ -70,7 +70,7 @@ Output format defaults to BAM.  For SAM or CRAM then either set the format with
 .SH OPTIONS
 .TP 14
 .BI "-u " FILE1
-.RI "Put reads with no RG tag or an unrecognised RG tag into " FILE1
+.RI "Put reads with no tag or an unrecognised tag into " FILE1
 .TP
 .BI "-h " FILE2
 .RI "Use the header from " FILE2 " when writing the file given in the " -u
@@ -84,6 +84,12 @@ references must be in the same order and have the same lengths.
 Output filename format string (see below)
 ["%*_%#.%."]
 .TP
+.BI "-d " TAG
+Split reads by TAG value into distinct files. Only the TAG key must be 
+supplied with the option. The value of the TAG has to be a string (i.e.
+.B key:Z:value
+)
+.TP
 .B -v
 Verbose output
 .TP

From 41e1543cea551c474622c467c7784e4d32e8dfc9 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Fri, 3 Nov 2023 10:35:08 +0000
Subject: [PATCH 16/26] Refactor some names as we now split on tags other than
 RG

---
 bam_split.c | 170 ++++++++++++++++++++++++++--------------------------
 1 file changed, 85 insertions(+), 85 deletions(-)

diff --git a/bam_split.c b/bam_split.c
index 67ef4735c..3d36f590c 100644
--- a/bam_split.c
+++ b/bam_split.c
@@ -62,12 +62,12 @@ struct state {
     samFile* unaccounted_file;
     sam_hdr_t* unaccounted_header;
     size_t output_count;
-    char **rg_id;
-    char **rg_index_file_name;
-    char **rg_output_file_name;
-    samFile **rg_output_file;
-    sam_hdr_t **rg_output_header;
-    kh_c2i_t* rg_hash;
+    char **tag_vals;
+    char **index_file_name;
+    char **output_file_name;
+    samFile **output_file;
+    sam_hdr_t **output_header;
+    kh_c2i_t* tag_val_hash;
     htsThreadPool p;
     int write_index;
 };
@@ -95,8 +95,8 @@ static void usage(FILE *write_to)
 "Format string expansions:\n"
 "  %%%%     %%\n"
 "  %%*     basename\n"
-"  %%#     @RG index\n"
-"  %%!     @RG ID\n"
+"  %%#     index (of @RG in the header, or count of TAG values seen so far)\n"
+"  %%!     @RG ID or TAG value\n"
 "  %%.     filename extension for output format\n"
       );
 }
@@ -169,7 +169,7 @@ static parsed_opts_t* parse_args(int argc, char** argv)
 }
 
 // Expands a output filename format string
-static char* expand_format_string(const char* format_string, const char* basename, const char* rg_id, const int rg_idx, const htsFormat *format)
+static char* expand_format_string(const char* format_string, const char* basename, const char* tag_val, const int file_idx, const htsFormat *format)
 {
     kstring_t str = { 0, 0, NULL };
     const char* pointer = format_string;
@@ -185,10 +185,10 @@ static char* expand_format_string(const char* format_string, const char* basenam
                 if (kputs(basename, &str) < 0) goto memfail;
                 break;
             case '#':
-                if (kputl(rg_idx, &str) < 0) goto memfail;
+                if (kputl(file_idx, &str) < 0) goto memfail;
                 break;
             case '!':
-                if (kputs(rg_id, &str) < 0) goto memfail;
+                if (kputs(tag_val, &str) < 0) goto memfail;
                 break;
             case '.':
                 // Only really need to cope with sam, bam, cram
@@ -282,51 +282,51 @@ static int header_compatible(sam_hdr_t *hdr1, sam_hdr_t *hdr2)
 }
 
 static int grow_output_lists(state_t *state, size_t count) {
-    char **new_list = realloc(state->rg_id, count * sizeof(char *));
+    char **new_list = realloc(state->tag_vals, count * sizeof(char *));
     if (!new_list)
         return -1;
-    state->rg_id = new_list;
-    new_list = realloc(state->rg_index_file_name, count * sizeof(char *));
+    state->tag_vals = new_list;
+    new_list = realloc(state->index_file_name, count * sizeof(char *));
     if (!new_list)
         return -1;
-    state->rg_index_file_name = new_list;
-    new_list = realloc(state->rg_output_file_name, count * sizeof(char *));
+    state->index_file_name = new_list;
+    new_list = realloc(state->output_file_name, count * sizeof(char *));
     if (!new_list)
         return -1;
-    state->rg_output_file_name = new_list;
-    samFile **new_file = realloc(state->rg_output_file,
+    state->output_file_name = new_list;
+    samFile **new_file = realloc(state->output_file,
                                  count * sizeof(samFile *));
     if (!new_file)
         return -1;
-    state->rg_output_file = new_file;
-    sam_hdr_t **new_hdr = realloc(state->rg_output_header,
+    state->output_file = new_file;
+    sam_hdr_t **new_hdr = realloc(state->output_header,
                                   count * sizeof(sam_hdr_t *));
     if (!new_hdr)
         return -1;
-    state->rg_output_header = new_hdr;
+    state->output_header = new_hdr;
     return 0;
 }
 
 static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state,
                               const char *tag, const char *arg_list) {
-    char *input_base_name = NULL, *new_file_name = NULL, *tag_key = NULL;
+    char *input_base_name = NULL, *new_file_name = NULL, *tag_val = NULL;
     char *new_idx_fn = NULL;
     sam_hdr_t *new_hdr = NULL;
     samFile *new_sam_file = NULL;
 
-    khiter_t i = kh_get_c2i(state->rg_hash, tag);
-    if (i != kh_end(state->rg_hash)) {
+    khiter_t i = kh_get_c2i(state->tag_val_hash, tag);
+    if (i != kh_end(state->tag_val_hash)) {
         return i;
     }
     // create new file
     if (grow_output_lists(state, state->output_count + 1) != 0) {
         print_error_errno("split", "Couldn't grow output lists");
-        return kh_end(state->rg_hash);
+        return kh_end(state->tag_val_hash);
     }
-    tag_key = strdup(tag);
-    if (!tag_key) {
+    tag_val = strdup(tag);
+    if (!tag_val) {
         print_error_errno("split", "Couldn't copy tag value");
-        return kh_end(state->rg_hash);
+        return kh_end(state->tag_val_hash);
     }
     char *dirsep = strrchr(opts->merged_input_name, '/');
     input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
@@ -382,18 +382,18 @@ static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state,
         goto fail;
     }
     int ret = -1;
-    i = kh_put_c2i(state->rg_hash, tag_key, &ret);
+    i = kh_put_c2i(state->tag_val_hash, tag_val, &ret);
     if (ret < 0) {
         print_error_errno("split", "Adding file \"%s\" failed", new_file_name);
         goto fail;
     }
 
-    kh_val(state->rg_hash, i) = state->output_count;
-    state->rg_id[state->output_count] = tag_key;
-    state->rg_index_file_name[state->output_count] = new_idx_fn;
-    state->rg_output_file_name[state->output_count] = new_file_name;
-    state->rg_output_file[state->output_count] = new_sam_file;
-    state->rg_output_header[state->output_count] = new_hdr;
+    kh_val(state->tag_val_hash, i) = state->output_count;
+    state->tag_vals[state->output_count] = tag_val;
+    state->index_file_name[state->output_count] = new_idx_fn;
+    state->output_file_name[state->output_count] = new_file_name;
+    state->output_file[state->output_count] = new_sam_file;
+    state->output_header[state->output_count] = new_hdr;
     state->output_count++;
     free(input_base_name);
     return i;
@@ -401,11 +401,11 @@ static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state,
  fail:
     free(input_base_name);
     free(new_file_name);
-    free(tag_key);
+    free(tag_val);
     free(new_idx_fn);
     sam_hdr_destroy(new_hdr);
     sam_close(new_sam_file);
-    return kh_end(state->rg_hash);
+    return kh_end(state->tag_val_hash);
 }
 
 // Set the initial state
@@ -491,7 +491,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
     int is_rg = !opts->tag || strcmp(opts->tag, "RG") == 0;
     if (is_rg) {
         if (!count_RG(retval->merged_input_header,
-                      &retval->output_count, &retval->rg_id)) {
+                      &retval->output_count, &retval->tag_vals)) {
             cleanup_state(retval, false);
             return NULL;
         }
@@ -503,13 +503,13 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
 
     // Prevent calloc(0, size);
     size_t num = retval->output_count ? retval->output_count : 1;
-    retval->rg_index_file_name = (char **)calloc(num, sizeof(char *));
-    retval->rg_output_file_name = (char **)calloc(num, sizeof(char *));
-    retval->rg_output_file = (samFile**)calloc(num, sizeof(samFile*));
-    retval->rg_output_header = (sam_hdr_t**)calloc(num, sizeof(sam_hdr_t*));
-    retval->rg_hash = kh_init_c2i();
-    if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header ||
-        !retval->rg_hash || !retval->rg_index_file_name) {
+    retval->index_file_name = (char **)calloc(num, sizeof(char *));
+    retval->output_file_name = (char **)calloc(num, sizeof(char *));
+    retval->output_file = (samFile**)calloc(num, sizeof(samFile*));
+    retval->output_header = (sam_hdr_t**)calloc(num, sizeof(sam_hdr_t*));
+    retval->tag_val_hash = kh_init_c2i();
+    if (!retval->output_file_name || !retval->output_file || !retval->output_header ||
+        !retval->tag_val_hash || !retval->index_file_name) {
         print_error_errno("split", "Could not initialise output file array");
         cleanup_state(retval, false);
         return NULL;
@@ -535,7 +535,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
 
         output_filename = expand_format_string(opts->output_format_string,
                                                input_base_name,
-                                               retval->rg_id[i], i,
+                                               retval->tag_vals[i], i,
                                                &opts->ga.out);
 
         if ( output_filename == NULL ) {
@@ -544,35 +544,35 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
             return NULL;
         }
 
-        retval->rg_output_file_name[i] = output_filename;
+        retval->output_file_name[i] = output_filename;
         sam_open_mode(outmode + 1, output_filename, NULL);
-        retval->rg_output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out);
+        retval->output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out);
 
-        if (retval->rg_output_file[i] == NULL) {
+        if (retval->output_file[i] == NULL) {
             print_error_errno("split", "Could not open \"%s\"", output_filename);
             cleanup_state(retval, false);
             free(input_base_name);
             return NULL;
         }
         if (retval->p.pool)
-            hts_set_opt(retval->rg_output_file[i], HTS_OPT_THREAD_POOL, &retval->p);
+            hts_set_opt(retval->output_file[i], HTS_OPT_THREAD_POOL, &retval->p);
 
         // Record index in hash
         int ret;
-        khiter_t iter = kh_put_c2i(retval->rg_hash, retval->rg_id[i], &ret);
+        khiter_t iter = kh_put_c2i(retval->tag_val_hash, retval->tag_vals[i], &ret);
         if (ret < 0) {
             print_error_errno("split", "Couldn't add @RG ID to look-up table");
             cleanup_state(retval, false);
             free(input_base_name);
             return NULL;
         }
-        kh_val(retval->rg_hash,iter) = i;
+        kh_val(retval->tag_val_hash,iter) = i;
 
         // Set and edit header
-        retval->rg_output_header[i] = sam_hdr_dup(retval->merged_input_header);
-        if (sam_hdr_remove_except(retval->rg_output_header[i], "RG", "ID", retval->rg_id[i]) ||
+        retval->output_header[i] = sam_hdr_dup(retval->merged_input_header);
+        if (sam_hdr_remove_except(retval->output_header[i], "RG", "ID", retval->tag_vals[i]) ||
             (!opts->no_pg &&
-             sam_hdr_add_pg(retval->rg_output_header[i], "samtools",
+             sam_hdr_add_pg(retval->output_header[i], "samtools",
                             "VN", samtools_version(),
                             arg_list ? "CL": NULL,
                             arg_list ? arg_list : NULL,
@@ -612,16 +612,16 @@ static bool split(state_t* state, parsed_opts_t *opts, char *arg_list)
     if (is_rg) {
         size_t i;
         for (i = 0; i < state->output_count; i++) {
-            if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) {
-                print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]);
+            if (sam_hdr_write(state->output_file[i], state->output_header[i]) != 0) {
+                print_error_errno("split", "Could not write file header to \"%s\"", state->output_file_name[i]);
                 goto error;
             }
             if (state->write_index) {
-                state->rg_index_file_name[i] = auto_index(state->rg_output_file[i],
-                        state->rg_output_file_name[i],
-                        state->rg_output_header[i]);
-                if (!state->rg_index_file_name[i]) {
-                    print_error_errno("split", "Could not create index for file \"%s\"", state->rg_output_file_name[i]);
+                state->index_file_name[i] = auto_index(state->output_file[i],
+                        state->output_file_name[i],
+                        state->output_header[i]);
+                if (!state->index_file_name[i]) {
+                    print_error_errno("split", "Could not create index for file \"%s\"", state->output_file_name[i]);
                     goto error;
                 }
             }
@@ -633,15 +633,15 @@ static bool split(state_t* state, parsed_opts_t *opts, char *arg_list)
         char *val = tag ? bam_aux2Z(tag) : NULL;
         khiter_t iter;
         if ( val != NULL ) {
-            iter = kh_get_c2i(state->rg_hash, val);
+            iter = kh_get_c2i(state->tag_val_hash, val);
         } else {
-            iter = kh_end(state->rg_hash);
+            iter = kh_end(state->tag_val_hash);
         }
 
-        if (!is_rg && val && iter == kh_end(state->rg_hash)) {
+        if (!is_rg && val && iter == kh_end(state->tag_val_hash)) {
             // Need to open a new output file
             iter = prep_sam_file(opts, state, val, arg_list);
-            if (iter == kh_end(state->rg_hash)) { // Open failed
+            if (iter == kh_end(state->tag_val_hash)) { // Open failed
                 print_error("split",
                             "Could not create output file for tag \"%s:%s\"",
                             opts->tag, bam_aux2Z(tag));
@@ -651,11 +651,11 @@ static bool split(state_t* state, parsed_opts_t *opts, char *arg_list)
         }
 
         // Write the read out to correct file
-        if (iter != kh_end(state->rg_hash)) {
+        if (iter != kh_end(state->tag_val_hash)) {
             // if found write to the appropriate untangled bam
-            int i = kh_val(state->rg_hash,iter);
-            if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) {
-                print_error_errno("split", "Could not write to \"%s\"", state->rg_output_file_name[i]);
+            int i = kh_val(state->tag_val_hash,iter);
+            if (sam_write1(state->output_file[i], state->output_header[i], file_read) < 0) {
+                print_error_errno("split", "Could not write to \"%s\"", state->output_file_name[i]);
                 goto error;
             }
         } else {
@@ -691,11 +691,11 @@ static bool split(state_t* state, parsed_opts_t *opts, char *arg_list)
     if (state->write_index) {
         size_t i;
         for (i = 0; i < state->output_count; i++) {
-            if (sam_idx_save(state->rg_output_file[i]) < 0) {
+            if (sam_idx_save(state->output_file[i]) < 0) {
                 print_error_errno("split", "writing index failed");
                 return false;
             }
-            free(state->rg_index_file_name[i]);
+            free(state->index_file_name[i]);
         }
     }
 
@@ -720,26 +720,26 @@ static int cleanup_state(state_t* status, bool check_close)
     sam_close(status->merged_input_file);
     size_t i;
     for (i = 0; i < status->output_count; i++) {
-        if (status->rg_output_header && status->rg_output_header[i])
-            sam_hdr_destroy(status->rg_output_header[i]);
-        if (status->rg_output_file && status->rg_output_file[i]) {
-            if (sam_close(status->rg_output_file[i]) < 0 && check_close) {
-                print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]);
+        if (status->output_header && status->output_header[i])
+            sam_hdr_destroy(status->output_header[i]);
+        if (status->output_file && status->output_file[i]) {
+            if (sam_close(status->output_file[i]) < 0 && check_close) {
+                print_error("split", "Error on closing output file \"%s\"", status->output_file_name[i]);
                 ret = -1;
             }
         }
-        if (status->rg_id) free(status->rg_id[i]);
-        if (status->rg_output_file_name) free(status->rg_output_file_name[i]);
+        if (status->tag_vals) free(status->tag_vals[i]);
+        if (status->output_file_name) free(status->output_file_name[i]);
     }
     if (status->merged_input_header)
         sam_hdr_destroy(status->merged_input_header);
-    free(status->rg_output_header);
-    free(status->rg_output_file);
-    free(status->rg_output_file_name);
-    free(status->rg_index_file_name);
-    kh_destroy_c2i(status->rg_hash);
+    free(status->output_header);
+    free(status->output_file);
+    free(status->output_file_name);
+    free(status->index_file_name);
+    kh_destroy_c2i(status->tag_val_hash);
 
-    free(status->rg_id);
+    free(status->tag_vals);
     if (status->p.pool)
         hts_tpool_destroy(status->p.pool);
     free(status);

From d3cec4bb45e30dcca187db3c7b5fad92c6ccba62 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Fri, 3 Nov 2023 14:43:07 +0000
Subject: [PATCH 17/26] Allow `-d RG` to make new files from alignment RG tag
 values

Removes the previous restriction that all read groups must be
listed in the header.  This is only enabled when using `-d RG`
though, so previous behaviour (when `-d` was not in use) remains
unchanged.
---
 bam_split.c          | 28 +++++++++++++++++++++++++---
 doc/samtools-split.1 | 30 ++++++++++++++++++++++++------
 2 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/bam_split.c b/bam_split.c
index 3d36f590c..ecfd88f6e 100644
--- a/bam_split.c
+++ b/bam_split.c
@@ -308,7 +308,8 @@ static int grow_output_lists(state_t *state, size_t count) {
 }
 
 static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state,
-                              const char *tag, const char *arg_list) {
+                              const char *tag, const char *arg_list,
+                              int is_rg) {
     char *input_base_name = NULL, *new_file_name = NULL, *tag_val = NULL;
     char *new_idx_fn = NULL;
     sam_hdr_t *new_hdr = NULL;
@@ -358,6 +359,23 @@ static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state,
         goto fail;
     }
 
+    if (is_rg) {
+        // If here, we've found an RG:Z: tag without a corresponding @RG
+        // line in the header.
+        if (sam_hdr_remove_lines(new_hdr, "RG", "ID", NULL) != 0) {
+            print_error_errno("split",
+                              "Failed to remove @RG lines from file \"%s\"",
+                              new_file_name);
+            goto fail;
+        }
+        if (sam_hdr_add_line(new_hdr, "RG", "ID", tag_val, NULL) != 0) {
+            print_error_errno("split",
+                              "Failed to add @RG line to file \"%s\"",
+                              new_file_name);
+            goto fail;
+        }
+    }
+
     char outmode[4] = "w";
     sam_open_mode(outmode + 1, new_file_name, NULL);
     new_sam_file = sam_open_format(new_file_name, outmode, &opts->ga.out);
@@ -638,9 +656,13 @@ static bool split(state_t* state, parsed_opts_t *opts, char *arg_list)
             iter = kh_end(state->tag_val_hash);
         }
 
-        if (!is_rg && val && iter == kh_end(state->tag_val_hash)) {
+        // Check for opts->tag here instead of !is_rg so we open new
+        // files if the user specified '-d RG' and we find a RG:Z: value
+        // that wasn't listed in the header.  If the '-d' option is
+        // not used, we don't open a file to preserve existing behaviour.
+        if (opts->tag && val && iter == kh_end(state->tag_val_hash)) {
             // Need to open a new output file
-            iter = prep_sam_file(opts, state, val, arg_list);
+            iter = prep_sam_file(opts, state, val, arg_list, is_rg);
             if (iter == kh_end(state->tag_val_hash)) { // Open failed
                 print_error("split",
                             "Could not create output file for tag \"%s:%s\"",
diff --git a/doc/samtools-split.1 b/doc/samtools-split.1
index 0adea99f0..90a076e43 100644
--- a/doc/samtools-split.1
+++ b/doc/samtools-split.1
@@ -49,18 +49,36 @@ samtools split
 
 .SH DESCRIPTION
 .PP
-Splits a file by read group, producing one or more output files
-matching a common prefix (by default based on the input filename)
-each containing one read-group.
+Splits a file by read group, or a specified tag,
+producing one or more output files
+matching a common prefix (by default based on the input filename).
 
+Unless the \fB-d\fR option is used, the file will be split according to the
+.B @RG
+tags listed in the header.
 Records without an RG tag or with an RG tag undefined in the header will cause
 the program to exit with an error unless the \fB-u\fR option is used.
 
 RG values defined in the header but with no records will produce an output file
 only containing a header.
 
+If the
+.BI "-d " TAG
+option is used, the file will be split on the value in the given aux tag.
+Note that only string tags (type \fBZ\fR) are currently supported.
+Unless the \fB-u\fR option is used, the program will exit with an error if
+it finds a record without the given tag.
+
+Using
+.B -d RG
+behaves in a similar way to the default (without \fB-d\fR),
+opening an output file for each \fB@RG\fR line in the header.
+However, unlike the default,
+new output files will be opened for any RG tags found in the alignment records
+irrespective of if they have a matching header \fB@RG\fR line.
+
 The \fB-u\fR option may be used to specify the output filename for any
-records with a missing or unrecognised RG tag.  This option will always write
+records with a missing or unrecognised tag.  This option will always write
 out a file even if there are no records.
 
 Output format defaults to BAM.  For SAM or CRAM then either set the format with
@@ -102,8 +120,8 @@ center;
 lb l .
 %%	%
 %*	basename
-%#	@RG index
-%!	@RG ID
+%#	index (of @RG in the header, or count of TAG values seen so far)
+%!	@RG ID or TAG value
 %.	output format filename extension
 .TE
 .TP

From 7f829ec7ad0d22f979598ca04313388047fd1b84 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Fri, 3 Nov 2023 17:03:03 +0000
Subject: [PATCH 18/26] Add a -M option to limit the number of split files made

To prevent "too many open files" and/or too many files in a
directory problems.
---
 bam_split.c          | 40 ++++++++++++++++++++++++++++++++--------
 doc/samtools-split.1 | 21 +++++++++++++++++++++
 2 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/bam_split.c b/bam_split.c
index ecfd88f6e..c6eaaff61 100644
--- a/bam_split.c
+++ b/bam_split.c
@@ -49,11 +49,14 @@ struct parsed_opts {
     const char *unaccounted_name;
     const char *output_format_string;
     const char *tag;
+    long max_split;
     bool verbose;
     int no_pg;
     sam_global_args ga;
 };
 
+#define DEFAULT_MAX_SPLIT 100
+
 typedef struct parsed_opts parsed_opts_t;
 
 struct state {
@@ -83,12 +86,14 @@ static void usage(FILE *write_to)
 "Usage: samtools split [-u <unaccounted.bam>] [-h <unaccounted_header.sam>]\n"
 "                      [-f <format_string>] [-v] <merged.bam>\n"
 "Options:\n"
-"  -f STRING       output filename format string [\"%%*_%%#.%%.\"]\n"
-"  -u FILE1        put reads with no RG tag or an unrecognised RG tag in FILE1\n"
-"  -h FILE2        ... and override the header with FILE2 (-u file only)\n"
-"  -d TAG          split by TAG value. TAG value must be a string.\n"
-"  -v              verbose output\n"
-"  --no-PG         do not add a PG line\n");
+"  -f STRING           output filename format string [\"%%*_%%#.%%.\"]\n"
+"  -u FILE1            put left-over reads in FILE1\n"
+"  -h FILE2            ... and override the header with FILE2 (-u file only)\n"
+"  -d TAG              split by TAG value. TAG value must be a string.\n"
+"  -M,--max-split NUM  limit number of output files from -d to NUM [%d]\n"
+"  -v                  verbose output\n"
+"  --no-PG             do not add a PG line\n",
+            DEFAULT_MAX_SPLIT);
     sam_global_opt_help(write_to, "-....@..");
     fprintf(write_to,
 "\n"
@@ -106,17 +111,19 @@ static parsed_opts_t* parse_args(int argc, char** argv)
 {
     if (argc == 1) { usage(stdout); return NULL; }
 
-    const char *optstring = "vf:h:u:d:@:";
+    const char *optstring = "vf:h:u:d:M:@:";
 
     static const struct option lopts[] = {
         SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
         {"no-PG", no_argument, NULL, 1},
+        {"max-split", required_argument, NULL, 'M'},
         { NULL, 0, NULL, 0 }
     };
 
     parsed_opts_t* retval = calloc(sizeof(parsed_opts_t), 1);
     if (! retval ) { perror("cannot allocate option parsing memory"); return NULL; }
 
+    retval->max_split = DEFAULT_MAX_SPLIT;
     sam_global_args_init(&retval->ga);
 
     int opt;
@@ -138,6 +145,18 @@ static parsed_opts_t* parse_args(int argc, char** argv)
             retval->tag = optarg;
             retval->output_format_string = "%*_%!.%.";
             break;
+        case 'M': {
+            char *end = optarg;
+            retval->max_split = strtol(optarg, &end, 10);
+            if (*optarg == '\0' || *end != '\0' || retval->max_split == 0) {
+                print_error("split", "Invalid -M argument: \"%s\"", optarg);
+                free(retval);
+                return NULL;
+            }
+            if (retval->max_split < 0) // No limit requested
+                retval->max_split = LONG_MAX;
+            break;
+        }
         case 1:
             retval->no_pg = 1;
             break;
@@ -535,6 +554,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
     if (!is_rg)
         return retval;  // Done for this case - outputs will be opened later
 
+    // Adjust max_split if too small for the read-groups listed in the header
+    if (opts->max_split < retval->output_count)
+        opts->max_split = retval->output_count;
+
     // Open output files for RGs
     char* dirsep = strrchr(opts->merged_input_name, '/');
     char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
@@ -660,7 +683,8 @@ static bool split(state_t* state, parsed_opts_t *opts, char *arg_list)
         // files if the user specified '-d RG' and we find a RG:Z: value
         // that wasn't listed in the header.  If the '-d' option is
         // not used, we don't open a file to preserve existing behaviour.
-        if (opts->tag && val && iter == kh_end(state->tag_val_hash)) {
+        if (opts->tag && val && iter == kh_end(state->tag_val_hash)
+            && state->output_count < opts->max_split) {
             // Need to open a new output file
             iter = prep_sam_file(opts, state, val, arg_list, is_rg);
             if (iter == kh_end(state->tag_val_hash)) { // Open failed
diff --git a/doc/samtools-split.1 b/doc/samtools-split.1
index 90a076e43..460aa6404 100644
--- a/doc/samtools-split.1
+++ b/doc/samtools-split.1
@@ -69,6 +69,11 @@ Note that only string tags (type \fBZ\fR) are currently supported.
 Unless the \fB-u\fR option is used, the program will exit with an error if
 it finds a record without the given tag.
 
+Note that attempting to split on a tag with high cardinality may result
+in the creation of a large number of output files.
+To prevent this, the \fB-M\fR option can be used to set a limit on the
+number of splits made.
+
 Using
 .B -d RG
 behaves in a similar way to the default (without \fB-d\fR),
@@ -108,6 +113,22 @@ supplied with the option. The value of the TAG has to be a string (i.e.
 .B key:Z:value
 )
 .TP
+.BI "-M,--max-split " NUM
+Limit the number of files created by the \fB-d\fR option to \fINUM\fR (default
+100).
+This prevents accidents where trying to split on a tag with high cardinality
+could result in the creation of a very large number of output files.
+Once the file limit is reached,
+any tag values not already seen will be treated as unmatched and the program
+will exit with an error unless the \fB-u\fR option is in use.
+
+If desired, the limit can be removed using \fB-M -1\fR,
+although in practice the number of outputs will still be restricted by
+system limits on the number of files that can be open at once.
+
+If splitting by read group, and the read group count in the header
+is higher than the requested limit then the limit will be raised to match.
+.TP
 .B -v
 Verbose output
 .TP

From 050362f142820e4efcb70185c345806ea4a24dfb Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Mon, 6 Nov 2023 13:57:29 +0000
Subject: [PATCH 19/26] Add tests for split by string aux tag functionality

---
 test/split/split.expected.grp1.sam          | 24 ++++-----
 test/split/split.expected.grp2.sam          | 24 ++++-----
 test/split/split.expected.unk.sam           | 12 ++---
 test/split/split.expected_d_RG.grp3.sam     | 29 ++++++++++
 test/split/split.expected_d_RG.unk.sam      | 32 +++++++++++
 test/split/split.expected_d_an.aardvark.sam | 32 +++++++++++
 test/split/split.expected_d_an.badger.sam   | 38 +++++++++++++
 test/split/split.expected_d_an.cat.sam      | 38 +++++++++++++
 test/split/split.expected_d_an.dog.sam      | 34 ++++++++++++
 test/split/split.expected_d_an.unk.sam      | 32 +++++++++++
 test/split/split.expected_d_an_M_3.unk.sam  | 36 +++++++++++++
 test/split/split.sam                        | 60 ++++++++++-----------
 test/test.pl                                | 37 +++++++++++++
 13 files changed, 368 insertions(+), 60 deletions(-)
 create mode 100644 test/split/split.expected_d_RG.grp3.sam
 create mode 100644 test/split/split.expected_d_RG.unk.sam
 create mode 100644 test/split/split.expected_d_an.aardvark.sam
 create mode 100644 test/split/split.expected_d_an.badger.sam
 create mode 100644 test/split/split.expected_d_an.cat.sam
 create mode 100644 test/split/split.expected_d_an.dog.sam
 create mode 100644 test/split/split.expected_d_an.unk.sam
 create mode 100644 test/split/split.expected_d_an_M_3.unk.sam

diff --git a/test/split/split.expected.grp1.sam b/test/split/split.expected.grp1.sam
index 23e15367d..4fbd1775d 100644
--- a/test/split/split.expected.grp1.sam
+++ b/test/split/split.expected.grp1.sam
@@ -24,17 +24,17 @@
 @SQ	SN:ref2	LN:60	M5:7c35feac7036c1cdef3bee0cc4b21437
 @SQ	SN:ref3_unused	LN:70	M5:5fdd18c2c6ecac4838996d029bf395b5
 @RG	ID:grp1	DS:Group 1	LB:Library 1	SM:Sample
-ref1_grp1_p001	99	ref1	1	0	10M	=	25	34	CGAGCTCGGT	!!!!!!!!!!	RG:Z:grp1	BC:Z:ACGT	H0:i:1	aa:A:!	ab:A:~	fa:f:3.14159	za:Z:Hello world!	ha:H:DEADBEEF	ba:B:c,-128,0,127	bb:B:C,0,127,255	bc:B:s,-32768,0,32767	bd:B:S,0,32768,65535	be:B:i,-2147483648,0,2147483647	bf:B:I,0,2147483648,4294967295	bg:B:f,2.71828,6.626e-34,2.9979e+09	NM:i:0	MD:Z:10
-ref1_grp1_p002	99	ref1	5	2	10M	=	29	34	CTCGGTACCC	##########	RG:Z:grp1	BC:Z:AATTCCGG	H0:i:1	aa:A:a	ab:A:z	fa:f:4.3597e-18	za:Z:Another string	ha:H:2000AD	NM:i:0	MD:Z:10
-ref1_grp1_p003	99	ref1	9	4	10M	=	33	34	GTACCCGGGG	%%%%%%%%%%	fa:f:1.66e-27	RG:Z:grp1	NM:i:0	MD:Z:10
-ref1_grp1_p004	99	ref1	13	6	10M	=	37	34	CCGGGGATCC	''''''''''	fa:f:1.38e-23	za:Z:xRG:Z:grp2	RG:Z:grp1	NM:i:0	MD:Z:10
-ref1_grp1_p005	99	ref1	17	8	10M	=	41	34	GGATCCTCTA	))))))))))	RG:Z:grp1	ia:i:40000	NM:i:0	MD:Z:10
+ref1_grp1_p001	99	ref1	1	0	10M	=	25	34	CGAGCTCGGT	!!!!!!!!!!	RG:Z:grp1	BC:Z:ACGT	H0:i:1	aa:A:!	ab:A:~	fa:f:3.14159	za:Z:Hello world!	ha:H:DEADBEEF	ba:B:c,-128,0,127	bb:B:C,0,127,255	bc:B:s,-32768,0,32767	bd:B:S,0,32768,65535	be:B:i,-2147483648,0,2147483647	bf:B:I,0,2147483648,4294967295	bg:B:f,2.71828,6.626e-34,2.9979e+09	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp1_p002	99	ref1	5	2	10M	=	29	34	CTCGGTACCC	##########	RG:Z:grp1	BC:Z:AATTCCGG	H0:i:1	aa:A:a	ab:A:z	fa:f:4.3597e-18	za:Z:Another string	ha:H:2000AD	NM:i:0	MD:Z:10	an:Z:cat
+ref1_grp1_p003	99	ref1	9	4	10M	=	33	34	GTACCCGGGG	%%%%%%%%%%	fa:f:1.66e-27	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:cat
+ref1_grp1_p004	99	ref1	13	6	10M	=	37	34	CCGGGGATCC	''''''''''	fa:f:1.38e-23	za:Z:xRG:Z:grp2	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp1_p005	99	ref1	17	8	10M	=	41	34	GGATCCTCTA	))))))))))	RG:Z:grp1	ia:i:40000	NM:i:0	MD:Z:10	an:Z:badger
 ref1_grp1_p006	99	ref1	21	10	10M	=	45	34	CCTCTAGAGT	++++++++++	RG:Z:grp1	ia:i:255	NM:i:0	MD:Z:10
-ref1_grp1_p001	147	ref1	25	12	10M	=	1	-34	TAGAGTCGAC	----------	RG:Z:grp1	NM:i:0	MD:Z:10
-ref1_grp1_p002	147	ref1	29	14	10M	=	5	-34	GTCGACCTGC	//////////	RG:Z:grp1	NM:i:0	MD:Z:10
-ref1_grp1_p003	147	ref1	33	16	10M	=	9	-34	ACCTGCAGGC	1111111111	RG:Z:grp1	NM:i:0	MD:Z:10
-ref12_grp1_p001	97	ref1	36	50	10M	ref2	2	0	TGCAGGCATG	AAAAAAAAAA	RG:Z:grp1	NM:i:0	MD:Z:10
-ref1_grp1_p004	147	ref1	37	18	10M	=	13	-34	GCAGGCATGC	3333333333	RG:Z:grp1	NM:i:0	MD:Z:10
-ref1_grp1_p005	147	ref1	41	20	10M	=	17	-34	GCATGCAAGC	5555555555	RG:Z:grp1	NM:i:0	MD:Z:10
+ref1_grp1_p001	147	ref1	25	12	10M	=	1	-34	TAGAGTCGAC	----------	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp1_p002	147	ref1	29	14	10M	=	5	-34	GTCGACCTGC	//////////	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:cat
+ref1_grp1_p003	147	ref1	33	16	10M	=	9	-34	ACCTGCAGGC	1111111111	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:cat
+ref12_grp1_p001	97	ref1	36	50	10M	ref2	2	0	TGCAGGCATG	AAAAAAAAAA	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:cat
+ref1_grp1_p004	147	ref1	37	18	10M	=	13	-34	GCAGGCATGC	3333333333	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp1_p005	147	ref1	41	20	10M	=	17	-34	GCATGCAAGC	5555555555	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:badger
 ref1_grp1_p006	147	ref1	45	22	10M	=	21	-34	GCAAGCTTGA	7777777777	RG:Z:grp1	NM:i:0	MD:Z:10
-ref12_grp1_p001	145	ref2	2	50	10M	ref1	36	0	TTCTATAGTG	BBBBBBBBBB	RG:Z:grp1	NM:i:0	MD:Z:10
+ref12_grp1_p001	145	ref2	2	50	10M	ref1	36	0	TTCTATAGTG	BBBBBBBBBB	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:cat
diff --git a/test/split/split.expected.grp2.sam b/test/split/split.expected.grp2.sam
index 07798dd8c..6a3854c40 100644
--- a/test/split/split.expected.grp2.sam
+++ b/test/split/split.expected.grp2.sam
@@ -24,17 +24,17 @@
 @SQ	SN:ref2	LN:60	M5:7c35feac7036c1cdef3bee0cc4b21437
 @SQ	SN:ref3_unused	LN:70	M5:5fdd18c2c6ecac4838996d029bf395b5
 @RG	ID:grp2	DS:Group 2	LB:Library 2	SM:Sample
-ref1_grp2_p001	99	ref1	3	1	10M	=	27	34	AGCTCGGTAC	""""""""""	RG:Z:grp2	BC:Z:TGCA	H0:i:1	aa:A:A	ab:A:Z	fa:f:6.67e-11	za:Z:!"$%^&*()	ha:H:CAFE	NM:i:0	MD:Z:10
-ref1_grp2_p002	99	ref1	7	3	10M	=	31	34	CGGTACCCGG	$$$$$$$$$$	fa:f:6.022e+23	RG:Z:grp2	NM:i:0	MD:Z:10
-ref1_grp2_p003	99	ref1	11	5	10M	=	35	34	ACCCGGGGAT	&&&&&&&&&&	RG:Z:grp2	ia:i:4294967295	NM:i:0	MD:Z:10
-ref1_grp2_p004	99	ref1	15	7	10M	=	39	34	GGGGATCCTC	((((((((((	RG:Z:grp2	ia:i:-2147483648	NM:i:0	MD:Z:10
-ref1_grp2_p005	99	ref1	19	9	10M	=	43	34	ATCCTCTAGA	**********	RG:Z:grp2	ia:i:-1000	NM:i:0	MD:Z:10
+ref1_grp2_p001	99	ref1	3	1	10M	=	27	34	AGCTCGGTAC	""""""""""	RG:Z:grp2	BC:Z:TGCA	H0:i:1	aa:A:A	ab:A:Z	fa:f:6.67e-11	za:Z:!"$%^&*()	ha:H:CAFE	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp2_p002	99	ref1	7	3	10M	=	31	34	CGGTACCCGG	$$$$$$$$$$	fa:f:6.022e+23	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp2_p003	99	ref1	11	5	10M	=	35	34	ACCCGGGGAT	&&&&&&&&&&	RG:Z:grp2	ia:i:4294967295	NM:i:0	MD:Z:10	an:Z:dog
+ref1_grp2_p004	99	ref1	15	7	10M	=	39	34	GGGGATCCTC	((((((((((	RG:Z:grp2	ia:i:-2147483648	NM:i:0	MD:Z:10	an:Z:aardvark
+ref1_grp2_p005	99	ref1	19	9	10M	=	43	34	ATCCTCTAGA	**********	RG:Z:grp2	ia:i:-1000	NM:i:0	MD:Z:10	an:Z:aardvark
 ref1_grp2_p006	99	ref1	23	11	10M	=	47	34	TCTAGAGTCG	,,,,,,,,,,	RG:Z:grp2	ia:i:-1	NM:i:0	MD:Z:10
-ref1_grp2_p001	147	ref1	27	13	10M	=	3	-34	GAGTCGACCT	..........	RG:Z:grp2	NM:i:0	MD:Z:10
-ref1_grp2_p002	147	ref1	31	15	10M	=	7	-34	CGACCTGCAG	0000000000	RG:Z:grp2	NM:i:0	MD:Z:10
-ref1_grp2_p003	147	ref1	35	17	10M	=	11	-34	CTGCAGGCAT	2222222222	RG:Z:grp2	NM:i:0	MD:Z:10
-ref1_grp2_p004	147	ref1	39	19	10M	=	15	-34	AGGCATGCAA	4444444444	RG:Z:grp2	NM:i:0	MD:Z:10
-ref1_grp2_p005	147	ref1	43	21	10M	=	19	-34	ATGCAAGCTT	6666666666	RG:Z:grp2	NM:i:0	MD:Z:10
-ref12_grp2_p001	97	ref1	46	50	10M	ref2	12	0	CAAGCTTGAG	AAAAAAAAAA	RG:Z:grp2	NM:i:0	MD:Z:10
+ref1_grp2_p001	147	ref1	27	13	10M	=	3	-34	GAGTCGACCT	..........	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp2_p002	147	ref1	31	15	10M	=	7	-34	CGACCTGCAG	0000000000	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp2_p003	147	ref1	35	17	10M	=	11	-34	CTGCAGGCAT	2222222222	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:dog
+ref1_grp2_p004	147	ref1	39	19	10M	=	15	-34	AGGCATGCAA	4444444444	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:aardvark
+ref1_grp2_p005	147	ref1	43	21	10M	=	19	-34	ATGCAAGCTT	6666666666	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:aardvark
+ref12_grp2_p001	97	ref1	46	50	10M	ref2	12	0	CAAGCTTGAG	AAAAAAAAAA	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:cat
 ref1_grp2_p006	147	ref1	47	23	10M	=	23	-34	AAGCTTGAGT	8888888888	RG:Z:grp2	NM:i:0	MD:Z:10
-ref12_grp2_p001	145	ref2	12	50	10M	ref1	46	0	TCACCTAAAT	BBBBBBBBBB	RG:Z:grp2	NM:i:0	MD:Z:10
+ref12_grp2_p001	145	ref2	12	50	10M	ref1	46	0	TCACCTAAAT	BBBBBBBBBB	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:cat
diff --git a/test/split/split.expected.unk.sam b/test/split/split.expected.unk.sam
index 59f6ff569..8417d13e5 100644
--- a/test/split/split.expected.unk.sam
+++ b/test/split/split.expected.unk.sam
@@ -26,9 +26,9 @@
 @SQ	SN:ref1	LN:56	M5:08c04d512d4797d9ba2a156c1daba468
 @SQ	SN:ref2	LN:60	M5:7c35feac7036c1cdef3bee0cc4b21437
 @SQ	SN:ref3_unused	LN:70	M5:5fdd18c2c6ecac4838996d029bf395b5
-ref2_grp3_p001	83	ref2	1	99	15M	=	31	45	ATTCTATAGTGTCAC	~~~~~~~~~~~~~~~	NM:i:0	MD:Z:15
-ref2_grp3_p002	147	ref2	16	99	15M	=	46	45	CTAAATAGCTTGGCG	}}}}}}}}}}}}}}}	NM:i:0	MD:Z:15
-ref2_grp3_p001	163	ref2	31	99	15M	=	1	-45	CTGTTTCCTGTGTGA	|||||||||||||||	NM:i:13	MD:Z:0T0A0A1C0A0T0G0G0T0C0A1A0G0
-ref2_grp3_p002	99	ref2	46	99	15M	=	16	-45	CTGTTTCCTGTGTGA	{{{{{{{{{{{{{{{	NM:i:0	MD:Z:15
-unaligned_grp3_p001	77	*	0	0	*	*	0	0	CACTCGTTCATGACG	0123456789abcde
-unaligned_grp3_p001	141	*	0	0	*	*	0	0	GAAAGTGAGGAGGTG	edcba9876543210
+ref2_grp3_p001	83	ref2	1	99	15M	=	31	45	ATTCTATAGTGTCAC	~~~~~~~~~~~~~~~	NM:i:0	MD:Z:15	an:Z:cat
+ref2_grp3_p002	147	ref2	16	99	15M	=	46	45	CTAAATAGCTTGGCG	}}}}}}}}}}}}}}}	NM:i:0	MD:Z:15	RG:Z:grp3	an:Z:dog
+ref2_grp3_p001	163	ref2	31	99	15M	=	1	-45	CTGTTTCCTGTGTGA	|||||||||||||||	NM:i:13	MD:Z:0T0A0A1C0A0T0G0G0T0C0A1A0G0	an:Z:cat
+ref2_grp3_p002	99	ref2	46	99	15M	=	16	-45	CTGTTTCCTGTGTGA	{{{{{{{{{{{{{{{	NM:i:0	MD:Z:15	RG:Z:grp3	an:Z:dog
+unaligned_grp3_p001	77	*	0	0	*	*	0	0	CACTCGTTCATGACG	0123456789abcde	an:Z:dog
+unaligned_grp3_p001	141	*	0	0	*	*	0	0	GAAAGTGAGGAGGTG	edcba9876543210	an:Z:dog
diff --git a/test/split/split.expected_d_RG.grp3.sam b/test/split/split.expected_d_RG.grp3.sam
new file mode 100644
index 000000000..14e7997da
--- /dev/null
+++ b/test/split/split.expected_d_RG.grp3.sam
@@ -0,0 +1,29 @@
+@HD	VN:1.4	SO:coordinate
+@RG	ID:grp3
+@PG	ID:prog1	PN:emacs	CL:emacs	VN:23.1.1
+@CO	The MIT License
+@CO	
+@CO	Copyright (c) 2014 Genome Research Ltd.
+@CO	
+@CO	Permission is hereby granted, free of charge, to any person obtaining a copy
+@CO	of this software and associated documentation files (the "Software"), to deal
+@CO	in the Software without restriction, including without limitation the rights
+@CO	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+@CO	copies of the Software, and to permit persons to whom the Software is
+@CO	furnished to do so, subject to the following conditions:
+@CO	
+@CO	The above copyright notice and this permission notice shall be included in
+@CO	all copies or substantial portions of the Software.
+@CO	
+@CO	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+@CO	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+@CO	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+@CO	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+@CO	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+@CO	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+@CO	THE SOFTWARE.
+@SQ	SN:ref1	LN:56	M5:08c04d512d4797d9ba2a156c1daba468
+@SQ	SN:ref2	LN:60	M5:7c35feac7036c1cdef3bee0cc4b21437
+@SQ	SN:ref3_unused	LN:70	M5:5fdd18c2c6ecac4838996d029bf395b5
+ref2_grp3_p002	147	ref2	16	99	15M	=	46	45	CTAAATAGCTTGGCG	}}}}}}}}}}}}}}}	NM:i:0	MD:Z:15	RG:Z:grp3	an:Z:dog
+ref2_grp3_p002	99	ref2	46	99	15M	=	16	-45	CTGTTTCCTGTGTGA	{{{{{{{{{{{{{{{	NM:i:0	MD:Z:15	RG:Z:grp3	an:Z:dog
diff --git a/test/split/split.expected_d_RG.unk.sam b/test/split/split.expected_d_RG.unk.sam
new file mode 100644
index 000000000..2676ba6d3
--- /dev/null
+++ b/test/split/split.expected_d_RG.unk.sam
@@ -0,0 +1,32 @@
+@HD	VN:1.4	SO:coordinate
+@RG	ID:grp1	DS:Group 1	LB:Library 1	SM:Sample
+@RG	ID:grp2	DS:Group 2	LB:Library 2	SM:Sample
+@PG	ID:prog1	PN:emacs	CL:emacs	VN:23.1.1
+@CO	The MIT License
+@CO	
+@CO	Copyright (c) 2014 Genome Research Ltd.
+@CO	
+@CO	Permission is hereby granted, free of charge, to any person obtaining a copy
+@CO	of this software and associated documentation files (the "Software"), to deal
+@CO	in the Software without restriction, including without limitation the rights
+@CO	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+@CO	copies of the Software, and to permit persons to whom the Software is
+@CO	furnished to do so, subject to the following conditions:
+@CO	
+@CO	The above copyright notice and this permission notice shall be included in
+@CO	all copies or substantial portions of the Software.
+@CO	
+@CO	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+@CO	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+@CO	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+@CO	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+@CO	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+@CO	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+@CO	THE SOFTWARE.
+@SQ	SN:ref1	LN:56	M5:08c04d512d4797d9ba2a156c1daba468
+@SQ	SN:ref2	LN:60	M5:7c35feac7036c1cdef3bee0cc4b21437
+@SQ	SN:ref3_unused	LN:70	M5:5fdd18c2c6ecac4838996d029bf395b5
+ref2_grp3_p001	83	ref2	1	99	15M	=	31	45	ATTCTATAGTGTCAC	~~~~~~~~~~~~~~~	NM:i:0	MD:Z:15	an:Z:cat
+ref2_grp3_p001	163	ref2	31	99	15M	=	1	-45	CTGTTTCCTGTGTGA	|||||||||||||||	NM:i:13	MD:Z:0T0A0A1C0A0T0G0G0T0C0A1A0G0	an:Z:cat
+unaligned_grp3_p001	77	*	0	0	*	*	0	0	CACTCGTTCATGACG	0123456789abcde	an:Z:dog
+unaligned_grp3_p001	141	*	0	0	*	*	0	0	GAAAGTGAGGAGGTG	edcba9876543210	an:Z:dog
diff --git a/test/split/split.expected_d_an.aardvark.sam b/test/split/split.expected_d_an.aardvark.sam
new file mode 100644
index 000000000..b5ce3fb0b
--- /dev/null
+++ b/test/split/split.expected_d_an.aardvark.sam
@@ -0,0 +1,32 @@
+@HD	VN:1.4	SO:coordinate
+@RG	ID:grp1	DS:Group 1	LB:Library 1	SM:Sample
+@RG	ID:grp2	DS:Group 2	LB:Library 2	SM:Sample
+@PG	ID:prog1	PN:emacs	CL:emacs	VN:23.1.1
+@CO	The MIT License
+@CO	
+@CO	Copyright (c) 2014 Genome Research Ltd.
+@CO	
+@CO	Permission is hereby granted, free of charge, to any person obtaining a copy
+@CO	of this software and associated documentation files (the "Software"), to deal
+@CO	in the Software without restriction, including without limitation the rights
+@CO	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+@CO	copies of the Software, and to permit persons to whom the Software is
+@CO	furnished to do so, subject to the following conditions:
+@CO	
+@CO	The above copyright notice and this permission notice shall be included in
+@CO	all copies or substantial portions of the Software.
+@CO	
+@CO	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+@CO	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+@CO	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+@CO	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+@CO	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+@CO	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+@CO	THE SOFTWARE.
+@SQ	SN:ref1	LN:56	M5:08c04d512d4797d9ba2a156c1daba468
+@SQ	SN:ref2	LN:60	M5:7c35feac7036c1cdef3bee0cc4b21437
+@SQ	SN:ref3_unused	LN:70	M5:5fdd18c2c6ecac4838996d029bf395b5
+ref1_grp2_p004	99	ref1	15	7	10M	=	39	34	GGGGATCCTC	((((((((((	RG:Z:grp2	ia:i:-2147483648	NM:i:0	MD:Z:10	an:Z:aardvark
+ref1_grp2_p005	99	ref1	19	9	10M	=	43	34	ATCCTCTAGA	**********	RG:Z:grp2	ia:i:-1000	NM:i:0	MD:Z:10	an:Z:aardvark
+ref1_grp2_p004	147	ref1	39	19	10M	=	15	-34	AGGCATGCAA	4444444444	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:aardvark
+ref1_grp2_p005	147	ref1	43	21	10M	=	19	-34	ATGCAAGCTT	6666666666	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:aardvark
diff --git a/test/split/split.expected_d_an.badger.sam b/test/split/split.expected_d_an.badger.sam
new file mode 100644
index 000000000..a78886b5a
--- /dev/null
+++ b/test/split/split.expected_d_an.badger.sam
@@ -0,0 +1,38 @@
+@HD	VN:1.4	SO:coordinate
+@RG	ID:grp1	DS:Group 1	LB:Library 1	SM:Sample
+@RG	ID:grp2	DS:Group 2	LB:Library 2	SM:Sample
+@PG	ID:prog1	PN:emacs	CL:emacs	VN:23.1.1
+@CO	The MIT License
+@CO	
+@CO	Copyright (c) 2014 Genome Research Ltd.
+@CO	
+@CO	Permission is hereby granted, free of charge, to any person obtaining a copy
+@CO	of this software and associated documentation files (the "Software"), to deal
+@CO	in the Software without restriction, including without limitation the rights
+@CO	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+@CO	copies of the Software, and to permit persons to whom the Software is
+@CO	furnished to do so, subject to the following conditions:
+@CO	
+@CO	The above copyright notice and this permission notice shall be included in
+@CO	all copies or substantial portions of the Software.
+@CO	
+@CO	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+@CO	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+@CO	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+@CO	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+@CO	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+@CO	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+@CO	THE SOFTWARE.
+@SQ	SN:ref1	LN:56	M5:08c04d512d4797d9ba2a156c1daba468
+@SQ	SN:ref2	LN:60	M5:7c35feac7036c1cdef3bee0cc4b21437
+@SQ	SN:ref3_unused	LN:70	M5:5fdd18c2c6ecac4838996d029bf395b5
+ref1_grp1_p001	99	ref1	1	0	10M	=	25	34	CGAGCTCGGT	!!!!!!!!!!	RG:Z:grp1	BC:Z:ACGT	H0:i:1	aa:A:!	ab:A:~	fa:f:3.14159	za:Z:Hello world!	ha:H:DEADBEEF	ba:B:c,-128,0,127	bb:B:C,0,127,255	bc:B:s,-32768,0,32767	bd:B:S,0,32768,65535	be:B:i,-2147483648,0,2147483647	bf:B:I,0,2147483648,4294967295	bg:B:f,2.71828,6.626e-34,2.9979e+09	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp2_p001	99	ref1	3	1	10M	=	27	34	AGCTCGGTAC	""""""""""	RG:Z:grp2	BC:Z:TGCA	H0:i:1	aa:A:A	ab:A:Z	fa:f:6.67e-11	za:Z:!"$%^&*()	ha:H:CAFE	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp2_p002	99	ref1	7	3	10M	=	31	34	CGGTACCCGG	$$$$$$$$$$	fa:f:6.022e+23	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp1_p004	99	ref1	13	6	10M	=	37	34	CCGGGGATCC	''''''''''	fa:f:1.38e-23	za:Z:xRG:Z:grp2	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp1_p005	99	ref1	17	8	10M	=	41	34	GGATCCTCTA	))))))))))	RG:Z:grp1	ia:i:40000	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp1_p001	147	ref1	25	12	10M	=	1	-34	TAGAGTCGAC	----------	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp2_p001	147	ref1	27	13	10M	=	3	-34	GAGTCGACCT	..........	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp2_p002	147	ref1	31	15	10M	=	7	-34	CGACCTGCAG	0000000000	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp1_p004	147	ref1	37	18	10M	=	13	-34	GCAGGCATGC	3333333333	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp1_p005	147	ref1	41	20	10M	=	17	-34	GCATGCAAGC	5555555555	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:badger
diff --git a/test/split/split.expected_d_an.cat.sam b/test/split/split.expected_d_an.cat.sam
new file mode 100644
index 000000000..c2bdd3af6
--- /dev/null
+++ b/test/split/split.expected_d_an.cat.sam
@@ -0,0 +1,38 @@
+@HD	VN:1.4	SO:coordinate
+@RG	ID:grp1	DS:Group 1	LB:Library 1	SM:Sample
+@RG	ID:grp2	DS:Group 2	LB:Library 2	SM:Sample
+@PG	ID:prog1	PN:emacs	CL:emacs	VN:23.1.1
+@CO	The MIT License
+@CO	
+@CO	Copyright (c) 2014 Genome Research Ltd.
+@CO	
+@CO	Permission is hereby granted, free of charge, to any person obtaining a copy
+@CO	of this software and associated documentation files (the "Software"), to deal
+@CO	in the Software without restriction, including without limitation the rights
+@CO	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+@CO	copies of the Software, and to permit persons to whom the Software is
+@CO	furnished to do so, subject to the following conditions:
+@CO	
+@CO	The above copyright notice and this permission notice shall be included in
+@CO	all copies or substantial portions of the Software.
+@CO	
+@CO	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+@CO	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+@CO	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+@CO	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+@CO	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+@CO	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+@CO	THE SOFTWARE.
+@SQ	SN:ref1	LN:56	M5:08c04d512d4797d9ba2a156c1daba468
+@SQ	SN:ref2	LN:60	M5:7c35feac7036c1cdef3bee0cc4b21437
+@SQ	SN:ref3_unused	LN:70	M5:5fdd18c2c6ecac4838996d029bf395b5
+ref1_grp1_p002	99	ref1	5	2	10M	=	29	34	CTCGGTACCC	##########	RG:Z:grp1	BC:Z:AATTCCGG	H0:i:1	aa:A:a	ab:A:z	fa:f:4.3597e-18	za:Z:Another string	ha:H:2000AD	NM:i:0	MD:Z:10	an:Z:cat
+ref1_grp1_p003	99	ref1	9	4	10M	=	33	34	GTACCCGGGG	%%%%%%%%%%	fa:f:1.66e-27	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:cat
+ref1_grp1_p002	147	ref1	29	14	10M	=	5	-34	GTCGACCTGC	//////////	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:cat
+ref1_grp1_p003	147	ref1	33	16	10M	=	9	-34	ACCTGCAGGC	1111111111	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:cat
+ref12_grp1_p001	97	ref1	36	50	10M	ref2	2	0	TGCAGGCATG	AAAAAAAAAA	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:cat
+ref12_grp2_p001	97	ref1	46	50	10M	ref2	12	0	CAAGCTTGAG	AAAAAAAAAA	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:cat
+ref2_grp3_p001	83	ref2	1	99	15M	=	31	45	ATTCTATAGTGTCAC	~~~~~~~~~~~~~~~	NM:i:0	MD:Z:15	an:Z:cat
+ref12_grp1_p001	145	ref2	2	50	10M	ref1	36	0	TTCTATAGTG	BBBBBBBBBB	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:cat
+ref12_grp2_p001	145	ref2	12	50	10M	ref1	46	0	TCACCTAAAT	BBBBBBBBBB	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:cat
+ref2_grp3_p001	163	ref2	31	99	15M	=	1	-45	CTGTTTCCTGTGTGA	|||||||||||||||	NM:i:13	MD:Z:0T0A0A1C0A0T0G0G0T0C0A1A0G0	an:Z:cat
diff --git a/test/split/split.expected_d_an.dog.sam b/test/split/split.expected_d_an.dog.sam
new file mode 100644
index 000000000..d7d699302
--- /dev/null
+++ b/test/split/split.expected_d_an.dog.sam
@@ -0,0 +1,34 @@
+@HD	VN:1.4	SO:coordinate
+@RG	ID:grp1	DS:Group 1	LB:Library 1	SM:Sample
+@RG	ID:grp2	DS:Group 2	LB:Library 2	SM:Sample
+@PG	ID:prog1	PN:emacs	CL:emacs	VN:23.1.1
+@CO	The MIT License
+@CO	
+@CO	Copyright (c) 2014 Genome Research Ltd.
+@CO	
+@CO	Permission is hereby granted, free of charge, to any person obtaining a copy
+@CO	of this software and associated documentation files (the "Software"), to deal
+@CO	in the Software without restriction, including without limitation the rights
+@CO	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+@CO	copies of the Software, and to permit persons to whom the Software is
+@CO	furnished to do so, subject to the following conditions:
+@CO	
+@CO	The above copyright notice and this permission notice shall be included in
+@CO	all copies or substantial portions of the Software.
+@CO	
+@CO	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+@CO	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+@CO	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+@CO	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+@CO	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+@CO	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+@CO	THE SOFTWARE.
+@SQ	SN:ref1	LN:56	M5:08c04d512d4797d9ba2a156c1daba468
+@SQ	SN:ref2	LN:60	M5:7c35feac7036c1cdef3bee0cc4b21437
+@SQ	SN:ref3_unused	LN:70	M5:5fdd18c2c6ecac4838996d029bf395b5
+ref1_grp2_p003	99	ref1	11	5	10M	=	35	34	ACCCGGGGAT	&&&&&&&&&&	RG:Z:grp2	ia:i:4294967295	NM:i:0	MD:Z:10	an:Z:dog
+ref1_grp2_p003	147	ref1	35	17	10M	=	11	-34	CTGCAGGCAT	2222222222	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:dog
+ref2_grp3_p002	147	ref2	16	99	15M	=	46	45	CTAAATAGCTTGGCG	}}}}}}}}}}}}}}}	NM:i:0	MD:Z:15	RG:Z:grp3	an:Z:dog
+ref2_grp3_p002	99	ref2	46	99	15M	=	16	-45	CTGTTTCCTGTGTGA	{{{{{{{{{{{{{{{	NM:i:0	MD:Z:15	RG:Z:grp3	an:Z:dog
+unaligned_grp3_p001	77	*	0	0	*	*	0	0	CACTCGTTCATGACG	0123456789abcde	an:Z:dog
+unaligned_grp3_p001	141	*	0	0	*	*	0	0	GAAAGTGAGGAGGTG	edcba9876543210	an:Z:dog
diff --git a/test/split/split.expected_d_an.unk.sam b/test/split/split.expected_d_an.unk.sam
new file mode 100644
index 000000000..5092c89a6
--- /dev/null
+++ b/test/split/split.expected_d_an.unk.sam
@@ -0,0 +1,32 @@
+@HD	VN:1.4	SO:coordinate
+@RG	ID:grp1	DS:Group 1	LB:Library 1	SM:Sample
+@RG	ID:grp2	DS:Group 2	LB:Library 2	SM:Sample
+@PG	ID:prog1	PN:emacs	CL:emacs	VN:23.1.1
+@CO	The MIT License
+@CO	
+@CO	Copyright (c) 2014 Genome Research Ltd.
+@CO	
+@CO	Permission is hereby granted, free of charge, to any person obtaining a copy
+@CO	of this software and associated documentation files (the "Software"), to deal
+@CO	in the Software without restriction, including without limitation the rights
+@CO	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+@CO	copies of the Software, and to permit persons to whom the Software is
+@CO	furnished to do so, subject to the following conditions:
+@CO	
+@CO	The above copyright notice and this permission notice shall be included in
+@CO	all copies or substantial portions of the Software.
+@CO	
+@CO	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+@CO	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+@CO	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+@CO	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+@CO	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+@CO	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+@CO	THE SOFTWARE.
+@SQ	SN:ref1	LN:56	M5:08c04d512d4797d9ba2a156c1daba468
+@SQ	SN:ref2	LN:60	M5:7c35feac7036c1cdef3bee0cc4b21437
+@SQ	SN:ref3_unused	LN:70	M5:5fdd18c2c6ecac4838996d029bf395b5
+ref1_grp1_p006	99	ref1	21	10	10M	=	45	34	CCTCTAGAGT	++++++++++	RG:Z:grp1	ia:i:255	NM:i:0	MD:Z:10
+ref1_grp2_p006	99	ref1	23	11	10M	=	47	34	TCTAGAGTCG	,,,,,,,,,,	RG:Z:grp2	ia:i:-1	NM:i:0	MD:Z:10
+ref1_grp1_p006	147	ref1	45	22	10M	=	21	-34	GCAAGCTTGA	7777777777	RG:Z:grp1	NM:i:0	MD:Z:10
+ref1_grp2_p006	147	ref1	47	23	10M	=	23	-34	AAGCTTGAGT	8888888888	RG:Z:grp2	NM:i:0	MD:Z:10
diff --git a/test/split/split.expected_d_an_M_3.unk.sam b/test/split/split.expected_d_an_M_3.unk.sam
new file mode 100644
index 000000000..dbe9c6936
--- /dev/null
+++ b/test/split/split.expected_d_an_M_3.unk.sam
@@ -0,0 +1,36 @@
+@HD	VN:1.4	SO:coordinate
+@RG	ID:grp1	DS:Group 1	LB:Library 1	SM:Sample
+@RG	ID:grp2	DS:Group 2	LB:Library 2	SM:Sample
+@PG	ID:prog1	PN:emacs	CL:emacs	VN:23.1.1
+@CO	The MIT License
+@CO	
+@CO	Copyright (c) 2014 Genome Research Ltd.
+@CO	
+@CO	Permission is hereby granted, free of charge, to any person obtaining a copy
+@CO	of this software and associated documentation files (the "Software"), to deal
+@CO	in the Software without restriction, including without limitation the rights
+@CO	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+@CO	copies of the Software, and to permit persons to whom the Software is
+@CO	furnished to do so, subject to the following conditions:
+@CO	
+@CO	The above copyright notice and this permission notice shall be included in
+@CO	all copies or substantial portions of the Software.
+@CO	
+@CO	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+@CO	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+@CO	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+@CO	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+@CO	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+@CO	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+@CO	THE SOFTWARE.
+@SQ	SN:ref1	LN:56	M5:08c04d512d4797d9ba2a156c1daba468
+@SQ	SN:ref2	LN:60	M5:7c35feac7036c1cdef3bee0cc4b21437
+@SQ	SN:ref3_unused	LN:70	M5:5fdd18c2c6ecac4838996d029bf395b5
+ref1_grp2_p004	99	ref1	15	7	10M	=	39	34	GGGGATCCTC	((((((((((	RG:Z:grp2	ia:i:-2147483648	NM:i:0	MD:Z:10	an:Z:aardvark
+ref1_grp2_p005	99	ref1	19	9	10M	=	43	34	ATCCTCTAGA	**********	RG:Z:grp2	ia:i:-1000	NM:i:0	MD:Z:10	an:Z:aardvark
+ref1_grp1_p006	99	ref1	21	10	10M	=	45	34	CCTCTAGAGT	++++++++++	RG:Z:grp1	ia:i:255	NM:i:0	MD:Z:10
+ref1_grp2_p006	99	ref1	23	11	10M	=	47	34	TCTAGAGTCG	,,,,,,,,,,	RG:Z:grp2	ia:i:-1	NM:i:0	MD:Z:10
+ref1_grp2_p004	147	ref1	39	19	10M	=	15	-34	AGGCATGCAA	4444444444	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:aardvark
+ref1_grp2_p005	147	ref1	43	21	10M	=	19	-34	ATGCAAGCTT	6666666666	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:aardvark
+ref1_grp1_p006	147	ref1	45	22	10M	=	21	-34	GCAAGCTTGA	7777777777	RG:Z:grp1	NM:i:0	MD:Z:10
+ref1_grp2_p006	147	ref1	47	23	10M	=	23	-34	AAGCTTGAGT	8888888888	RG:Z:grp2	NM:i:0	MD:Z:10
diff --git a/test/split/split.sam b/test/split/split.sam
index c874469d1..c07002ee2 100644
--- a/test/split/split.sam
+++ b/test/split/split.sam
@@ -26,37 +26,37 @@
 @SQ	SN:ref1	LN:56	M5:08c04d512d4797d9ba2a156c1daba468
 @SQ	SN:ref2	LN:60	M5:7c35feac7036c1cdef3bee0cc4b21437
 @SQ	SN:ref3_unused	LN:70	M5:5fdd18c2c6ecac4838996d029bf395b5
-ref1_grp1_p001	99	ref1	1	0	10M	=	25	34	CGAGCTCGGT	!!!!!!!!!!	RG:Z:grp1	BC:Z:ACGT	H0:i:1	aa:A:!	ab:A:~	fa:f:3.14159	za:Z:Hello world!	ha:H:DEADBEEF	ba:B:c,-128,0,127	bb:B:C,0,127,255	bc:B:s,-32768,0,32767	bd:B:S,0,32768,65535	be:B:i,-2147483648,0,2147483647	bf:B:I,0,2147483648,4294967295	bg:B:f,2.71828,6.626e-34,2.9979e+09	NM:i:0	MD:Z:10
-ref1_grp2_p001	99	ref1	3	1	10M	=	27	34	AGCTCGGTAC	""""""""""	RG:Z:grp2	BC:Z:TGCA	H0:i:1	aa:A:A	ab:A:Z	fa:f:6.67e-11	za:Z:!"$%^&*()	ha:H:CAFE	NM:i:0	MD:Z:10
-ref1_grp1_p002	99	ref1	5	2	10M	=	29	34	CTCGGTACCC	##########	RG:Z:grp1	BC:Z:AATTCCGG	H0:i:1	aa:A:a	ab:A:z	fa:f:4.3597e-18	za:Z:Another string	ha:H:2000AD	NM:i:0	MD:Z:10
-ref1_grp2_p002	99	ref1	7	3	10M	=	31	34	CGGTACCCGG	$$$$$$$$$$	fa:f:6.022e+23	RG:Z:grp2	NM:i:0	MD:Z:10
-ref1_grp1_p003	99	ref1	9	4	10M	=	33	34	GTACCCGGGG	%%%%%%%%%%	fa:f:1.66e-27	RG:Z:grp1	NM:i:0	MD:Z:10
-ref1_grp2_p003	99	ref1	11	5	10M	=	35	34	ACCCGGGGAT	&&&&&&&&&&	RG:Z:grp2	ia:i:4294967295	NM:i:0	MD:Z:10
-ref1_grp1_p004	99	ref1	13	6	10M	=	37	34	CCGGGGATCC	''''''''''	fa:f:1.38e-23	za:Z:xRG:Z:grp2	RG:Z:grp1	NM:i:0	MD:Z:10
-ref1_grp2_p004	99	ref1	15	7	10M	=	39	34	GGGGATCCTC	((((((((((	RG:Z:grp2	ia:i:-2147483648	NM:i:0	MD:Z:10
-ref1_grp1_p005	99	ref1	17	8	10M	=	41	34	GGATCCTCTA	))))))))))	RG:Z:grp1	ia:i:40000	NM:i:0	MD:Z:10
-ref1_grp2_p005	99	ref1	19	9	10M	=	43	34	ATCCTCTAGA	**********	RG:Z:grp2	ia:i:-1000	NM:i:0	MD:Z:10
+ref1_grp1_p001	99	ref1	1	0	10M	=	25	34	CGAGCTCGGT	!!!!!!!!!!	RG:Z:grp1	BC:Z:ACGT	H0:i:1	aa:A:!	ab:A:~	fa:f:3.14159	za:Z:Hello world!	ha:H:DEADBEEF	ba:B:c,-128,0,127	bb:B:C,0,127,255	bc:B:s,-32768,0,32767	bd:B:S,0,32768,65535	be:B:i,-2147483648,0,2147483647	bf:B:I,0,2147483648,4294967295	bg:B:f,2.71828,6.626e-34,2.9979e+09	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp2_p001	99	ref1	3	1	10M	=	27	34	AGCTCGGTAC	""""""""""	RG:Z:grp2	BC:Z:TGCA	H0:i:1	aa:A:A	ab:A:Z	fa:f:6.67e-11	za:Z:!"$%^&*()	ha:H:CAFE	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp1_p002	99	ref1	5	2	10M	=	29	34	CTCGGTACCC	##########	RG:Z:grp1	BC:Z:AATTCCGG	H0:i:1	aa:A:a	ab:A:z	fa:f:4.3597e-18	za:Z:Another string	ha:H:2000AD	NM:i:0	MD:Z:10	an:Z:cat
+ref1_grp2_p002	99	ref1	7	3	10M	=	31	34	CGGTACCCGG	$$$$$$$$$$	fa:f:6.022e+23	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp1_p003	99	ref1	9	4	10M	=	33	34	GTACCCGGGG	%%%%%%%%%%	fa:f:1.66e-27	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:cat
+ref1_grp2_p003	99	ref1	11	5	10M	=	35	34	ACCCGGGGAT	&&&&&&&&&&	RG:Z:grp2	ia:i:4294967295	NM:i:0	MD:Z:10	an:Z:dog
+ref1_grp1_p004	99	ref1	13	6	10M	=	37	34	CCGGGGATCC	''''''''''	fa:f:1.38e-23	za:Z:xRG:Z:grp2	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp2_p004	99	ref1	15	7	10M	=	39	34	GGGGATCCTC	((((((((((	RG:Z:grp2	ia:i:-2147483648	NM:i:0	MD:Z:10	an:Z:aardvark
+ref1_grp1_p005	99	ref1	17	8	10M	=	41	34	GGATCCTCTA	))))))))))	RG:Z:grp1	ia:i:40000	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp2_p005	99	ref1	19	9	10M	=	43	34	ATCCTCTAGA	**********	RG:Z:grp2	ia:i:-1000	NM:i:0	MD:Z:10	an:Z:aardvark
 ref1_grp1_p006	99	ref1	21	10	10M	=	45	34	CCTCTAGAGT	++++++++++	RG:Z:grp1	ia:i:255	NM:i:0	MD:Z:10
 ref1_grp2_p006	99	ref1	23	11	10M	=	47	34	TCTAGAGTCG	,,,,,,,,,,	RG:Z:grp2	ia:i:-1	NM:i:0	MD:Z:10
-ref1_grp1_p001	147	ref1	25	12	10M	=	1	-34	TAGAGTCGAC	----------	RG:Z:grp1	NM:i:0	MD:Z:10
-ref1_grp2_p001	147	ref1	27	13	10M	=	3	-34	GAGTCGACCT	..........	RG:Z:grp2	NM:i:0	MD:Z:10
-ref1_grp1_p002	147	ref1	29	14	10M	=	5	-34	GTCGACCTGC	//////////	RG:Z:grp1	NM:i:0	MD:Z:10
-ref1_grp2_p002	147	ref1	31	15	10M	=	7	-34	CGACCTGCAG	0000000000	RG:Z:grp2	NM:i:0	MD:Z:10
-ref1_grp1_p003	147	ref1	33	16	10M	=	9	-34	ACCTGCAGGC	1111111111	RG:Z:grp1	NM:i:0	MD:Z:10
-ref1_grp2_p003	147	ref1	35	17	10M	=	11	-34	CTGCAGGCAT	2222222222	RG:Z:grp2	NM:i:0	MD:Z:10
-ref12_grp1_p001	97	ref1	36	50	10M	ref2	2	0	TGCAGGCATG	AAAAAAAAAA	RG:Z:grp1	NM:i:0	MD:Z:10
-ref1_grp1_p004	147	ref1	37	18	10M	=	13	-34	GCAGGCATGC	3333333333	RG:Z:grp1	NM:i:0	MD:Z:10
-ref1_grp2_p004	147	ref1	39	19	10M	=	15	-34	AGGCATGCAA	4444444444	RG:Z:grp2	NM:i:0	MD:Z:10
-ref1_grp1_p005	147	ref1	41	20	10M	=	17	-34	GCATGCAAGC	5555555555	RG:Z:grp1	NM:i:0	MD:Z:10
-ref1_grp2_p005	147	ref1	43	21	10M	=	19	-34	ATGCAAGCTT	6666666666	RG:Z:grp2	NM:i:0	MD:Z:10
+ref1_grp1_p001	147	ref1	25	12	10M	=	1	-34	TAGAGTCGAC	----------	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp2_p001	147	ref1	27	13	10M	=	3	-34	GAGTCGACCT	..........	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp1_p002	147	ref1	29	14	10M	=	5	-34	GTCGACCTGC	//////////	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:cat
+ref1_grp2_p002	147	ref1	31	15	10M	=	7	-34	CGACCTGCAG	0000000000	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp1_p003	147	ref1	33	16	10M	=	9	-34	ACCTGCAGGC	1111111111	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:cat
+ref1_grp2_p003	147	ref1	35	17	10M	=	11	-34	CTGCAGGCAT	2222222222	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:dog
+ref12_grp1_p001	97	ref1	36	50	10M	ref2	2	0	TGCAGGCATG	AAAAAAAAAA	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:cat
+ref1_grp1_p004	147	ref1	37	18	10M	=	13	-34	GCAGGCATGC	3333333333	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp2_p004	147	ref1	39	19	10M	=	15	-34	AGGCATGCAA	4444444444	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:aardvark
+ref1_grp1_p005	147	ref1	41	20	10M	=	17	-34	GCATGCAAGC	5555555555	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:badger
+ref1_grp2_p005	147	ref1	43	21	10M	=	19	-34	ATGCAAGCTT	6666666666	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:aardvark
 ref1_grp1_p006	147	ref1	45	22	10M	=	21	-34	GCAAGCTTGA	7777777777	RG:Z:grp1	NM:i:0	MD:Z:10
-ref12_grp2_p001	97	ref1	46	50	10M	ref2	12	0	CAAGCTTGAG	AAAAAAAAAA	RG:Z:grp2	NM:i:0	MD:Z:10
+ref12_grp2_p001	97	ref1	46	50	10M	ref2	12	0	CAAGCTTGAG	AAAAAAAAAA	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:cat
 ref1_grp2_p006	147	ref1	47	23	10M	=	23	-34	AAGCTTGAGT	8888888888	RG:Z:grp2	NM:i:0	MD:Z:10
-ref2_grp3_p001	83	ref2	1	99	15M	=	31	45	ATTCTATAGTGTCAC	~~~~~~~~~~~~~~~	NM:i:0	MD:Z:15
-ref12_grp1_p001	145	ref2	2	50	10M	ref1	36	0	TTCTATAGTG	BBBBBBBBBB	RG:Z:grp1	NM:i:0	MD:Z:10
-ref12_grp2_p001	145	ref2	12	50	10M	ref1	46	0	TCACCTAAAT	BBBBBBBBBB	RG:Z:grp2	NM:i:0	MD:Z:10
-ref2_grp3_p002	147	ref2	16	99	15M	=	46	45	CTAAATAGCTTGGCG	}}}}}}}}}}}}}}}	NM:i:0	MD:Z:15
-ref2_grp3_p001	163	ref2	31	99	15M	=	1	-45	CTGTTTCCTGTGTGA	|||||||||||||||	NM:i:13	MD:Z:0T0A0A1C0A0T0G0G0T0C0A1A0G0
-ref2_grp3_p002	99	ref2	46	99	15M	=	16	-45	CTGTTTCCTGTGTGA	{{{{{{{{{{{{{{{	NM:i:0	MD:Z:15
-unaligned_grp3_p001	77	*	0	0	*	*	0	0	CACTCGTTCATGACG	0123456789abcde
-unaligned_grp3_p001	141	*	0	0	*	*	0	0	GAAAGTGAGGAGGTG	edcba9876543210
+ref2_grp3_p001	83	ref2	1	99	15M	=	31	45	ATTCTATAGTGTCAC	~~~~~~~~~~~~~~~	NM:i:0	MD:Z:15	an:Z:cat
+ref12_grp1_p001	145	ref2	2	50	10M	ref1	36	0	TTCTATAGTG	BBBBBBBBBB	RG:Z:grp1	NM:i:0	MD:Z:10	an:Z:cat
+ref12_grp2_p001	145	ref2	12	50	10M	ref1	46	0	TCACCTAAAT	BBBBBBBBBB	RG:Z:grp2	NM:i:0	MD:Z:10	an:Z:cat
+ref2_grp3_p002	147	ref2	16	99	15M	=	46	45	CTAAATAGCTTGGCG	}}}}}}}}}}}}}}}	NM:i:0	MD:Z:15	RG:Z:grp3	an:Z:dog
+ref2_grp3_p001	163	ref2	31	99	15M	=	1	-45	CTGTTTCCTGTGTGA	|||||||||||||||	NM:i:13	MD:Z:0T0A0A1C0A0T0G0G0T0C0A1A0G0	an:Z:cat
+ref2_grp3_p002	99	ref2	46	99	15M	=	16	-45	CTGTTTCCTGTGTGA	{{{{{{{{{{{{{{{	NM:i:0	MD:Z:15	RG:Z:grp3	an:Z:dog
+unaligned_grp3_p001	77	*	0	0	*	*	0	0	CACTCGTTCATGACG	0123456789abcde	an:Z:dog
+unaligned_grp3_p001	141	*	0	0	*	*	0	0	GAAAGTGAGGAGGTG	edcba9876543210	an:Z:dog
diff --git a/test/test.pl b/test/test.pl
index 179390a3f..68b765015 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -3510,6 +3510,43 @@ sub test_split
              ignore_pg_header => 1,
              reorder_header => 1,
              cmd => "$$opts{bin}/samtools split $threads --output-fmt sam -u $$opts{path}/split/split.tmp.unk.sam -f $$opts{path}/split/split.tmp.\%!.\%. $$opts{path}/split/split.sam");
+
+    test_cmd($opts,
+             out=>"dat/empty.expected",
+             out_map => {
+                 'split/split.tmp.grp1.sam' => 'split/split.expected.grp1.sam',
+                 'split/split.tmp.grp2.sam' => 'split/split.expected.grp2.sam',
+                 'split/split.tmp.grp3.sam' => 'split/split.expected_d_RG.grp3.sam',
+                 'split/split.tmp.unk.sam' => 'split/split.expected_d_RG.unk.sam',
+             },
+             ignore_pg_header => 1,
+             reorder_header => 1,
+             cmd => "$$opts{bin}/samtools split $threads --output-fmt sam -d RG -u $$opts{path}/split/split.tmp.unk.sam -f $$opts{path}/split/split.tmp.\%!.\%. $$opts{path}/split/split.sam");
+
+    test_cmd($opts,
+             out=>"dat/empty.expected",
+             out_map => {
+                 'split/split.tmp.aardvark.sam' => 'split/split.expected_d_an.aardvark.sam',
+                 'split/split.tmp.badger.sam' => 'split/split.expected_d_an.badger.sam',
+                 'split/split.tmp.cat.sam' => 'split/split.expected_d_an.cat.sam',
+                 'split/split.tmp.dog.sam' => 'split/split.expected_d_an.dog.sam',
+                 'split/split.tmp.unk.sam' => 'split/split.expected_d_an.unk.sam',
+             },
+             ignore_pg_header => 1,
+             reorder_header => 1,
+             cmd => "$$opts{bin}/samtools split $threads --output-fmt sam -d an -u $$opts{path}/split/split.tmp.unk.sam -f $$opts{path}/split/split.tmp.\%!.\%. $$opts{path}/split/split.sam");
+
+    test_cmd($opts,
+             out=>"dat/empty.expected",
+             out_map => {
+                 'split/split.tmp.badger.sam' => 'split/split.expected_d_an.badger.sam',
+                 'split/split.tmp.cat.sam' => 'split/split.expected_d_an.cat.sam',
+                 'split/split.tmp.dog.sam' => 'split/split.expected_d_an.dog.sam',
+                 'split/split.tmp.unk.sam' => 'split/split.expected_d_an_M_3.unk.sam',
+             },
+             ignore_pg_header => 1,
+             reorder_header => 1,
+             cmd => "$$opts{bin}/samtools split $threads --output-fmt sam -d an -M 3 -u $$opts{path}/split/split.tmp.unk.sam -f $$opts{path}/split/split.tmp.\%!.\%. $$opts{path}/split/split.sam");
 }
 
 sub test_ampliconclip

From 528e1b243c59493c7341328e2851583c49c3b7af Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Tue, 14 Nov 2023 10:11:53 +0000
Subject: [PATCH 20/26] Fix various --write-index issues

Call auto_index() after writing the file header on files made
by prep_sam_file().  Fixes failure to make index files when
using the -d option on non-RG tags.

Write an index on the unaccounted file, which was previously missed
out.

Ensure index file names are freed on error.

Fix crash on error in prep_sam_file() due to passing a NULL
pointer to sam_close().
---
 bam_split.c | 50 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 38 insertions(+), 12 deletions(-)

diff --git a/bam_split.c b/bam_split.c
index c6eaaff61..e0f730a33 100644
--- a/bam_split.c
+++ b/bam_split.c
@@ -64,6 +64,7 @@ struct state {
     sam_hdr_t* merged_input_header;
     samFile* unaccounted_file;
     sam_hdr_t* unaccounted_header;
+    char *unaccounted_idx_fn;
     size_t output_count;
     char **tag_vals;
     char **index_file_name;
@@ -405,6 +406,12 @@ static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state,
     if (state->p.pool)
         hts_set_opt(new_sam_file, HTS_OPT_THREAD_POOL, &state->p);
 
+    if (sam_hdr_write(new_sam_file, new_hdr) != 0) {
+        print_error_errno("split", "Couldn't write header to \"%s\"",
+                          new_file_name);
+        goto fail;
+    }
+
     if (state->write_index) {
         new_idx_fn = auto_index(new_sam_file, new_file_name, new_hdr);
         if (!new_idx_fn) {
@@ -413,11 +420,6 @@ static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state,
         }
     }
 
-    if (sam_hdr_write(new_sam_file, new_hdr) != 0) {
-        print_error_errno("split", "Couldn't write header to \"%s\"",
-                          new_file_name);
-        goto fail;
-    }
     int ret = -1;
     i = kh_put_c2i(state->tag_val_hash, tag_val, &ret);
     if (ret < 0) {
@@ -440,8 +442,10 @@ static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state,
     free(new_file_name);
     free(tag_val);
     free(new_idx_fn);
-    sam_hdr_destroy(new_hdr);
-    sam_close(new_sam_file);
+    if (new_hdr)
+        sam_hdr_destroy(new_hdr);
+    if (new_sam_file)
+        sam_close(new_sam_file);
     return kh_end(state->tag_val_hash);
 }
 
@@ -633,9 +637,22 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
 static bool split(state_t* state, parsed_opts_t *opts, char *arg_list)
 {
     int is_rg = !opts->tag || strcmp(opts->tag, "RG") == 0;
-    if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
-        print_error_errno("split", "Could not write output file header");
-        return false;
+    if (state->unaccounted_file) {
+        if (sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
+            print_error_errno("split", "Could not write output file header");
+            return false;
+        }
+        if (opts->ga.write_index) {
+            state->unaccounted_idx_fn = auto_index(state->unaccounted_file,
+                                                   opts->unaccounted_name,
+                                                   state->unaccounted_header);
+            if (!state->unaccounted_idx_fn) {
+                print_error_errno("split",
+                                  "Creating index file for file \"%s\" failed",
+                                  opts->unaccounted_name);
+                return false;
+            }
+        }
     }
     bam1_t* file_read = bam_init1();
     // Read the first record
@@ -738,10 +755,17 @@ static bool split(state_t* state, parsed_opts_t *opts, char *arg_list)
         size_t i;
         for (i = 0; i < state->output_count; i++) {
             if (sam_idx_save(state->output_file[i]) < 0) {
-                print_error_errno("split", "writing index failed");
+                print_error_errno("split", "writing index \"%s\" failed",
+                                  state->index_file_name[i]);
+                return false;
+            }
+        }
+        if (state->unaccounted_file) {
+            if (sam_idx_save(state->unaccounted_file) < 0) {
+                print_error_errno("split", "writing index \"%s\" failed",
+                                  state->unaccounted_idx_fn);
                 return false;
             }
-            free(state->index_file_name[i]);
         }
     }
 
@@ -776,6 +800,7 @@ static int cleanup_state(state_t* status, bool check_close)
         }
         if (status->tag_vals) free(status->tag_vals[i]);
         if (status->output_file_name) free(status->output_file_name[i]);
+        if (status->index_file_name[i]) free(status->index_file_name[i]);
     }
     if (status->merged_input_header)
         sam_hdr_destroy(status->merged_input_header);
@@ -783,6 +808,7 @@ static int cleanup_state(state_t* status, bool check_close)
     free(status->output_file);
     free(status->output_file_name);
     free(status->index_file_name);
+    free(status->unaccounted_idx_fn);
     kh_destroy_c2i(status->tag_val_hash);
 
     free(status->tag_vals);

From 4c5285fa0a1109c32fd1e70c97782f2511ba09c1 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Fri, 24 Nov 2023 09:51:25 +0000
Subject: [PATCH 21/26] Fix a couple minor samtools view memory problems.

If we fail to decode a BAM file due to file truncation, we attempted
to access iter->curr_tid in the diagnostic error just after it was
freed, potentially giving incorrect errors.  We also didn't free the
index on exit, which is harmless but makes finding real problems
harder.
---
 sam_view.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sam_view.c b/sam_view.c
index 2a1439acb..877f84a13 100644
--- a/sam_view.c
+++ b/sam_view.c
@@ -777,13 +777,13 @@ static int multi_region_view(samview_settings_t *conf, hts_itr_multi_t *iter)
     while ((result = sam_itr_multi_next(conf->in, iter, b)) >= 0) {
         if (process_one_record(conf, b, &write_error) < 0) break;
     }
-    hts_itr_multi_destroy(iter);
     bam_destroy1(b);
 
     if (result < -1) {
         print_error("view", "retrieval of region #%d failed", iter->curr_tid);
-        return 1;
+        write_error = 1;
     }
+    hts_itr_multi_destroy(iter);
     return write_error;
 }
 
@@ -1376,8 +1376,6 @@ int main_samview(int argc, char *argv[])
         }
     }
 
-    if ( settings.hts_idx ) hts_idx_destroy(settings.hts_idx);
-
     if (ga.write_index) {
         if (sam_idx_save(settings.out) < 0) {
             print_error_errno("view", "writing index failed");
@@ -1390,6 +1388,8 @@ int main_samview(int argc, char *argv[])
     }
 
 view_end:
+    if ( settings.hts_idx ) hts_idx_destroy(settings.hts_idx);
+
     if (settings.is_count && ret == 0) {
         if (fprintf(settings.fn_out? fp_out : stdout, "%" PRId64 "\n", settings.count) < 0) {
             if (settings.fn_out) print_error_errno("view", "writing to \"%s\" failed", settings.fn_out);

From 5e0ae52bb28601d10a4f93c388c69b96e81365db Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Tue, 14 Nov 2023 15:58:56 +0000
Subject: [PATCH 22/26] News updates for Winter 2023 (1.19)

---
 NEWS | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/NEWS b/NEWS
index d7d6f20f3..c961fd9be 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,69 @@
 Release a.b
 -----------
 
+New work and changes:
+
+* Samtools coverage: add a new --plot-depth option to draw depth (of
+  coverage) rather than the percentage of bases covered.
+  (PR#1910.  Thanks to Pierre Lindenbaum)
+
+* Samtools merge / sort: add a lexicographical name-sort option via the -N
+  option.  The "natural" alpha-numeric sort is still available via -n.
+  (PR#1900, fixes #1500.  Reported by Steve Huang)
+
+* Samtools view: add -N ^NAME_FILE and -R ^RG_FILE options.  The standard -N
+  and -R options only output reads matching a specified list of names or
+  read-groups.  With a caret (^) prefix these may be negated to only output
+  read not matching the specified files.
+  (PR#1896, fixes #1895.  Suggested by Feng Tian)
+
+* Cope with Htslib's change to no longer close stdout on is hts_close is
+  used with the output file being stdout.
+  Htslib companion PR is samtools/htslib#1665.
+  (PR#1909.  Thanks to John Marshall)
+
+* Plot-bamstats: add a new plot of the read lengths ("RL") from samtools
+  stats output.
+  (PR#1922, fixes #1824.  Thanks to @erboone, suggested by Martin Pollard)
+
+* Samtools split: support splitting files based on the contents of
+  auxiliary tags.  Also adds a -M option to limit the number of files split
+  can make, to avoid accidental resource over-allocation, and fixes some
+  issues with --write-index.
+  (PR#1222, PR#1933, fixes #1758.  Thanks to Valeriu Ohan, suggested by
+  Scott Norton)
+
+Bug Fixes
+
+* Samtools stats: empty barcode tags are now treated as having no barcode.
+  (PR#1929, fixes #1926.  Reported by Jukka Matilainen)
+
+* Samtools cat: add support for non-seekable streams.  The file format
+  detection blocked pipes from working before, but now files may be
+  non-seekable such as stdin or a pipe.
+  (PR#1930, fixes #1731.  Reported by Julian Hess)
+
+
+Documentation
+
+* Samtools mpileup: add more usage examples to the man page.
+  (PR#1913, fixes #1801)
+
+* Samtools fastq: explicitly document the order that filters apply.
+  (PR#1907.  fixes ?)
+
+* Samtools merge: fix example output to use an uppercase RG PL field.
+  (PR#1917.  Thanks to John Marshall.  Reported by Michael Macias)
+
+* Add hclen SAM filter documentation.
+  (PR#1902.  See also samtools/htslib#1660)
+
+* Samtools consensus: remove the -5 option from documentation.  This option
+  was renamed before the consensus subcommand was merged, but accidentally
+  left in the man page.
+  (PR#1924)
+
+
 Release 1.18 (25th July 2023)
 -----------------------------
 

From a55b8a9a4bf97b4f28dc57788ca7738627b7a825 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Wed, 15 Nov 2023 09:52:32 +0000
Subject: [PATCH 23/26] Renamed NEWS to NEWS.md so it's formatted nicely on
 github.

It's pretty much already in Markdown anyway, but I edited a few
quoting issues in pre-1.0 releases and changed PR-hash to
PR-space-hash to get auto-formatting working for release notes.

The format is still essentially plain text and readable without
needing to have multiple copies for different media.
---
 NEWS => NEWS.md | 360 ++++++++++++++++++++++++------------------------
 1 file changed, 180 insertions(+), 180 deletions(-)
 rename NEWS => NEWS.md (90%)

diff --git a/NEWS b/NEWS.md
similarity index 90%
rename from NEWS
rename to NEWS.md
index c961fd9be..1527fbb00 100644
--- a/NEWS
+++ b/NEWS.md
@@ -5,63 +5,63 @@ New work and changes:
 
 * Samtools coverage: add a new --plot-depth option to draw depth (of
   coverage) rather than the percentage of bases covered.
-  (PR#1910.  Thanks to Pierre Lindenbaum)
+  (PR #1910.  Thanks to Pierre Lindenbaum)
 
 * Samtools merge / sort: add a lexicographical name-sort option via the -N
   option.  The "natural" alpha-numeric sort is still available via -n.
-  (PR#1900, fixes #1500.  Reported by Steve Huang)
+  (PR #1900, fixes #1500.  Reported by Steve Huang)
 
 * Samtools view: add -N ^NAME_FILE and -R ^RG_FILE options.  The standard -N
   and -R options only output reads matching a specified list of names or
   read-groups.  With a caret (^) prefix these may be negated to only output
   read not matching the specified files.
-  (PR#1896, fixes #1895.  Suggested by Feng Tian)
+  (PR #1896, fixes #1895.  Suggested by Feng Tian)
 
 * Cope with Htslib's change to no longer close stdout on is hts_close is
   used with the output file being stdout.
   Htslib companion PR is samtools/htslib#1665.
-  (PR#1909.  Thanks to John Marshall)
+  (PR #1909.  Thanks to John Marshall)
 
 * Plot-bamstats: add a new plot of the read lengths ("RL") from samtools
   stats output.
-  (PR#1922, fixes #1824.  Thanks to @erboone, suggested by Martin Pollard)
+  (PR #1922, fixes #1824.  Thanks to @erboone, suggested by Martin Pollard)
 
 * Samtools split: support splitting files based on the contents of
   auxiliary tags.  Also adds a -M option to limit the number of files split
   can make, to avoid accidental resource over-allocation, and fixes some
   issues with --write-index.
-  (PR#1222, PR#1933, fixes #1758.  Thanks to Valeriu Ohan, suggested by
+  (PR #1222, PR #1933, fixes #1758.  Thanks to Valeriu Ohan, suggested by
   Scott Norton)
 
 Bug Fixes
 
 * Samtools stats: empty barcode tags are now treated as having no barcode.
-  (PR#1929, fixes #1926.  Reported by Jukka Matilainen)
+  (PR #1929, fixes #1926.  Reported by Jukka Matilainen)
 
 * Samtools cat: add support for non-seekable streams.  The file format
   detection blocked pipes from working before, but now files may be
   non-seekable such as stdin or a pipe.
-  (PR#1930, fixes #1731.  Reported by Julian Hess)
+  (PR #1930, fixes #1731.  Reported by Julian Hess)
 
 
 Documentation
 
 * Samtools mpileup: add more usage examples to the man page.
-  (PR#1913, fixes #1801)
+  (PR #1913, fixes #1801)
 
 * Samtools fastq: explicitly document the order that filters apply.
-  (PR#1907.  fixes ?)
+  (PR #1907)
 
 * Samtools merge: fix example output to use an uppercase RG PL field.
-  (PR#1917.  Thanks to John Marshall.  Reported by Michael Macias)
+  (PR #1917.  Thanks to John Marshall.  Reported by Michael Macias)
 
 * Add hclen SAM filter documentation.
-  (PR#1902.  See also samtools/htslib#1660)
+  (PR #1902.  See also samtools/htslib#1660)
 
 * Samtools consensus: remove the -5 option from documentation.  This option
   was renamed before the consensus subcommand was merged, but accidentally
   left in the man page.
-  (PR#1924)
+  (PR #1924)
 
 
 Release 1.18 (25th July 2023)
@@ -73,51 +73,51 @@ New work and changes:
   minimiser sort to arrange the minimiser values in the same order as they
   occur in the reference genome. This is acts as an extremely crude and
   simplistic read aligner that can be used to boost read compression.
-  (PR#1818)
+  (PR #1818)
 
 * Add a --duplicate-count option to markdup.  Adds the number of duplicates
   (including itself) to the original read in a 'dc' tag.
-  (PR#1816. Thanks to wulj2)
+  (PR #1816. Thanks to wulj2)
 
 * Make calmd handle unaligned data or empty files without throwing an error.
   This is to make pipelines work more smoothly.  A warning will still be issued.
-  (PR#1841, fixes #1839.  Reported by Filipe G. Vieira)
+  (PR #1841, fixes #1839.  Reported by Filipe G. Vieira)
 
 * Consistent, more comprehensive flag filtering for fasta/fastq.  Added
   --rf/--incl[ude]-flags and long options for -F (--excl[ude]-flags and
   -f (--require-flags).
-  (PR#1842.  Thanks to Devang Thakkar)
+  (PR #1842.  Thanks to Devang Thakkar)
 
 * Apply fastq --input-fmt-option settings.  Previously any options specified
   were not being applied to the input file.
-  (PR#1855.  Thanks to John Marshall)
+  (PR #1855.  Thanks to John Marshall)
 
 * Add fastq -d TAG[:VAL] check.  This mirrors view -d and will only output
   alignments that match TAG (and VAL if specified).
-  (PR#1863, fixes #1854.  Requested by Rasmus Kirkegaard)
+  (PR #1863, fixes #1854.  Requested by Rasmus Kirkegaard)
 
 * Extend import --order TAG to --order TAG:length.  If length is specified, the
   tag format goes from integer to a 0-padded string format.  This is a
   workaround for BAM and CRAM that cannot encode an order tag of over 4 billion
   records.
-  (PR#1850, fixes #1847.  Reported by Feng Tian)
+  (PR #1850, fixes #1847.  Reported by Feng Tian)
 
 * New -aa mode for consensus.  This works like the -aa option in depth and
   mpileup. The single 'a' reports all bases in contigs covered by alignments.
   Double 'aa' (or '-a -a') reports Ns even for the references with no alignments
   against them.
-  (PR#1851, fixes #1849.  Requested by Tim Fennell)
+  (PR #1851, fixes #1849.  Requested by Tim Fennell)
 
 * Add long option support to samtools index.
-  (PR#1872, fixes #1869.  Reported by Jason Bacon)
+  (PR #1872, fixes #1869.  Reported by Jason Bacon)
 
 * Be consistent with rounding of "average length" in samtools stats.
-  (PR#1876, fixes #1867.  Reported by Jelinek-J)
+  (PR #1876, fixes #1867.  Reported by Jelinek-J)
 
 * Add option to ampliconclip that marks reads as unmapped when they do not have
   enough aligned bases left after clipping.  Default is to unmap reads with zero
   aligned bases.
-  (PR#1865, fixes #1856.  Requested by ces)
+  (PR #1865, fixes #1856.  Requested by ces)
 
 Bug Fixes:
 
@@ -128,45 +128,45 @@ Bug Fixes:
   although the change in 1.11 was bug-fixing multi-threaded index queries.
   This bug did not affect index building.  There is no need to reindex your
   CRAM files.
-  (PR#samtools/htslib#1574, PR#samtools/htslib#1640. Fixes
+  (PR #samtools/htslib#1574, PR #samtools/htslib#1640. Fixes
   #samtools/htslib#1569, #samtools/htslib#1639, #1808, #1819.  Reported by
   xuxif, Jens Reeder and Jared Simpson)
 
 * Fix a sort -M bug (regression) when merging sub-blocks.  Data was valid but
   in a poor order for compression.
-  (PR#1812)
+  (PR #1812)
 
 * Fix bug in split output format.  Now SAM and CRAM format can chosen as well
   as BAM.  Also a documentation change, see below.
-  (PR#1821)
+  (PR #1821)
 
 * Add error checking to view -e filter expression code.  Invalid expressions
   were not returning an error code.
-  (PR#1833, fixes #1829.  Reported by Steve Huang)
+  (PR #1833, fixes #1829.  Reported by Steve Huang)
 
 * Fix reheader CRAM output version.  Sets the correct CRAM output version for
   non-3.0 CRAMs.
-  (PR#1868, fixes #1866.  Reported by John Marshall)
+  (PR #1868, fixes #1866.  Reported by John Marshall)
 
 Documentation:
 
 * Expand the default filtering information on the mpileup man page.
-  (PR#1802, fixes #1801.  Reported by gevro)
+  (PR #1802, fixes #1801.  Reported by gevro)
 
 * Add an explanation of the default behaviour of split files on generating
   a file for reads with missing or unrecognised RG tags.  Also a small bug fix,
   see above.
-  (PR#1821, fixes #1817.  Reported by Steve Huang)
+  (PR #1821, fixes #1817.  Reported by Steve Huang)
 
 * In the INSTALL instructions, switched back to openssl for Alpine.  This
   matches the current Alpine Linux practice.
-  (PR#1837, see htslib#1591.  Reported by John Marshall)
+  (PR #1837, see htslib#1591.  Reported by John Marshall)
 
 * Fix various typos caught by lintian parsers.
-  (PR#1877.  Thanks to Étienne Mollier)
+  (PR #1877.  Thanks to Étienne Mollier)
 
 * Document consensus --qual-calibration option.
-  (PR#1880, fixes #1879.  Reported by John Marshall)
+  (PR #1880, fixes #1879.  Reported by John Marshall)
 
 * Updated the page about samtools duplicate marking with more detail at
   www.htslib.org/algorithms/duplicate.html
@@ -174,7 +174,7 @@ Documentation:
 Non user-visible changes and build improvements:
 
 * Removed a redundant line that caused a warning in gcc-13.
-  (PR#1838)
+  (PR #1838)
 
 
 Release 1.17 (21st February 2023)
@@ -187,115 +187,115 @@ New work and changes:
   reverse direction, sequence and its quality values are reversed and
   complemented and the reverse flag is reset.  Supplementary and secondary
   alignment data are discarded.
-  (PR#1767, implements #1682. Requested by dkj)
+  (PR #1767, implements #1682. Requested by dkj)
 
 * New samtools cram-size subcommand.  It writes out metrics about a CRAM file
   reporting aggregate sizes per block "Content ID" fields, the data-series
   contained within them, and the compression methods used.
-  (PR#1777)
+  (PR #1777)
 
 * Added a --sanitize option to fixmate and view.  This performs some sanity
   checks on the state of SAM record fields, fixing up common mistakes made by
   aligners.
-  (PR#1698)
+  (PR #1698)
 
 * Permit 1 thread with samtools view.  All other subcommands already allow this
   and it does provide a modest speed increase.
-  (PR#1755, fixes #1743. Reported by Goran Vinterhalter)
+  (PR #1755, fixes #1743. Reported by Goran Vinterhalter)
 
 * Add CRAM_OPT_REQUIRED_FIELDS option for view -c.  This is a big speed up for
   CRAM (maybe 5-fold), but it depends on which filtering options are being used.
-  (PR#1776, fixes #1775. Reported by Chang Y)
+  (PR #1776, fixes #1775. Reported by Chang Y)
 
 * New filtering options in samtools depth.  The new --excl-flags option is a
   synonym for -G, with --incl-flags and --require-flags added to match view
   logic.
-  (PR#1718, fixes #1702. Reported by Dario Beraldi)
+  (PR #1718, fixes #1702. Reported by Dario Beraldi)
 
 * Speed up calmd's slow handling of non-position-sorted data by adding caching.
   This uses more memory but is only activated when needed.
-  (PR#1723, fixes #1595. Reported by lxwgcool)
+  (PR #1723, fixes #1595. Reported by lxwgcool)
 
 * Improve samtools consensus for platforms with instrument specific profiles,
   considerably helping for data with very different indel error models and
   providing base quality recalibration tables. On PacBio HiFi, ONT and
   Ultima Genomics consensus qualities are also redistributed within homopolymers
   and the likelihood of nearby indel errors is raised.
-  (PR#1721, PR#1733)
+  (PR #1721, PR #1733)
 
 * Consensus --mark-ins option.  This permits he consensus output to include a
   markup indicating the next base is an insertion. This is necessary as we need
   a way of outputting both consensus and also how that consensus marries up with
   the reference coordinates.
-  (PR#1746)
+  (PR #1746)
 
 * Make faidx/fqidx output line length default to the input line length.
-  (PR#1738, fixes #1734. Reported by John Marshall)
+  (PR #1738, fixes #1734. Reported by John Marshall)
 
 * Speed up optical duplicate checking where data has a lot of duplicates
   compared to non-duplicates.
-  (PR#1779, fixes #1771. Reported by Poshi)
+  (PR #1779, fixes #1771. Reported by Poshi)
 
 * For collate use TMPDIR environment variable, when looking for a temporary
   folder.
-  (PR#1782, based on PR#1178 and fixes #1172.  Reported by Martin Pollard)
+  (PR #1782, based on PR #1178 and fixes #1172.  Reported by Martin Pollard)
 
 Bug Fixes:
 
 * Fix stats breakage on long deletions when given a reference.
-  (PR#1712, fixes #1707. Reported by John Didion)
+  (PR #1712, fixes #1707. Reported by John Didion)
 
 * In ampliconclip, stop hard clipping from wrongly removing entire reads.
-  (PR#1722, fixes #1717. Reported by Kevin Xu)
+  (PR #1722, fixes #1717. Reported by Kevin Xu)
 
 * Fix bug in ampliconstats where references mentioned in the input file headers
   but not in the bed file would cause it to complain that the SAM headers were
   inconsistent.
-  (PR#1727, fixes #1650. Reported by jPontix)
+  (PR #1727, fixes #1650. Reported by jPontix)
 
 * Fixed SEGV in samtools collate when no filename given.
-  (PR#1724)
+  (PR #1724)
 
 * Changed the default UMI barcode regex in markdup.  The old regex was too
   restrictive.  This version will at least allow the default read name UMI as
   given in the Illumina example documentation.
-  (PR#1737, fixes #1730. Reported by yloemie)
+  (PR #1737, fixes #1730. Reported by yloemie)
 
 * Fix samtools consensus buffer overrun with MD:Z handling.
-  (PR#1745, fixes #1744. Reported by trilisser)
+  (PR #1745, fixes #1744. Reported by trilisser)
 
 * Fix a buffer read-overflow in mpileup and tview on sequences with seq "*".
-  (PR#1747)
+  (PR #1747)
 
 * Fix view -X command line parsing that was broken in 1.15.
-  (PR#1772, fixes #1720.  Reported by Francisco Rodríguez-Algarra
+  (PR #1772, fixes #1720.  Reported by Francisco Rodríguez-Algarra
   and Miguel Machado)
 
 * Stop samtools view -d from reporting meaningless system errors when
   tag validation fails.
-  (PR#1796)
+  (PR #1796)
 
 Documentation:
 
 * Add a description of the samtools tview display layout to the man page.
   Documents . vs , and upper vs lowercase. Adds a -s sample example, and
   documents the -w option.
-  (PR#1765, fixes #1759. Reported by Lucas Ferreira da Silva)
+  (PR #1765, fixes #1759. Reported by Lucas Ferreira da Silva)
 
 * Clarify intention of samtools fasta/q in man page and soft vs hard
   clipping.
-  (PR#1794, fixes #1792. Reported by Ryan Lorig-Roach)
+  (PR #1794, fixes #1792. Reported by Ryan Lorig-Roach)
 
 * Minor fix to wording of mpileup --rf usage and man page.
-  (PR#1795, fixes #1791. Reported by Luka Pavageau)
+  (PR #1795, fixes #1791. Reported by Luka Pavageau)
 
 Non user-visible changes and build improvements:
 
 * Use POSIX grep in testing as egrep and fgrep are considered obsolete.
-  (PR#1726, thanks to David Seifert)
+  (PR #1726, thanks to David Seifert)
 
 * Switch MacOS CI tests to an ARM-based image.
-  (PR#1770)
+  (PR #1770)
 
 
 Release 1.16.1 (2nd September 2022)
@@ -306,11 +306,11 @@ Bug fixes:
  * Fixed a bug with the template-coordinate sort which caused incorrect
    ordering when using threads, or processing large files that don't
    fit completely in memory.
-   (PR#1703, thanks to Nils Homer)
+   (PR #1703, thanks to Nils Homer)
 
  * Fixed a crash that occurred when trying to use `samtools merge` in
    template-coordinate mode.
-   (PR#1705, thanks to Nils Homer)
+   (PR #1705, thanks to Nils Homer)
 
 Release 1.16 (18th August 2022)
 -------------------------------
@@ -319,90 +319,90 @@ New work and changes:
 
  * samtools reference command added.  This subcommand extracts the embedded
    reference out of a CRAM file.
-   (PR#1649, addresses #723.  Requested by Torsten Seemann)
+   (PR #1649, addresses #723.  Requested by Torsten Seemann)
 
  * samtools import now adds grouped by query-name to the header.
-   (PR#1633, thanks to Nils Homer)
+   (PR #1633, thanks to Nils Homer)
 
  * Made samtools view read error messages more generic.  Former error message
    would claim that there was a "truncated file or corrupt BAM index file" with
    no real justification.  Also reset errno in stream_view which could lead to
    confusing error messages.
-   (PR#1645, addresses some of the issues in #1640.  Reported by Jian-Guo Zhou)
+   (PR #1645, addresses some of the issues in #1640.  Reported by Jian-Guo Zhou)
 
  * Make samtools view -p also clear mqual, tlen and cigar.
-   (PR#1647, fixes #1606.  Reported by eboyden)
+   (PR #1647, fixes #1606.  Reported by eboyden)
 
  * Add bedcov option -c to report read count.
-   (PR#1644, fixes #1629.  Reported by Natchaphon Rajudom)
+   (PR #1644, fixes #1629.  Reported by Natchaphon Rajudom)
 
  * Add UMI/barcode handling to samtools markdup.
-   (PR#1630, fixes #1358 and #1514.  Reported by Gert Hulselmans and Poshi)
+   (PR #1630, fixes #1358 and #1514.  Reported by Gert Hulselmans and Poshi)
 
  * Add a new template coordinate sort order to samtools sort and
    samtools merge.  This is useful when working with unique molecular
    identifiers (UMIs).
-   (PR#1605, fixes #1591.  Thanks to Nils Homer)
+   (PR #1605, fixes #1591.  Thanks to Nils Homer)
 
  * Rename mpileup --ignore-overlaps to --ignore-overlaps-removal
    or --disable-overlap-removal.  The previous name was ambiguous and was often
    read as an option to enable removal of overlapping bases, while in reality
    this is on by default and the option turns off the ability to remove
    overlapping bases.
-   (PR#1666, fixes #1663.  Reported by yangdingyangding)
+   (PR #1666, fixes #1663.  Reported by yangdingyangding)
 
  * The dict command can now read BWA's .alt file and add AH:* tags
    indicating reference sequences that represent alternate loci.
-   (PR#1676.  Thanks to John Marshall)
+   (PR #1676.  Thanks to John Marshall)
 
  * The "samtools index" command can now accept multiple alignment filenames
    with the new -M option, and will index each of them separately. (Specifying
    the output index filename via out.index or the new -o option is currently
    only applicable when there is only one alignment file to be indexed.)
-   (PR#1674.  Reported by Abigail Ramsøe and Nicola Romanò.
+   (PR #1674.  Reported by Abigail Ramsøe and Nicola Romanò.
     Thanks to John Marshall)
 
  * Allow samtools fastq -T "*".
    This allows all tags from SAM records to be written to fastq headers. This is
    a counterpart to samtools import -T "*".
-   (PR#1679.  Thanks to cjw85)
+   (PR #1679.  Thanks to cjw85)
 
 Bug Fixes:
 
  * Re-enable --reference option for samtools depth.  The reference is not used
    but this makes the command line usage compatible with older releases.
-   (PR#1646, fixes #1643.  Reported by Randy Harr)
+   (PR #1646, fixes #1643.  Reported by Randy Harr)
 
  * Fix regex coordinate bug in samtools markdup.
-   (PR#1657, fixes #1642.  Reported by Randy Harr)
+   (PR #1657, fixes #1642.  Reported by Randy Harr)
 
  * Fix divide by zero in plot-bamstats -m, on unmapped data.
-   (PR#1678, fixes #1675.  Thanks to Shane McCarthy)
+   (PR #1678, fixes #1675.  Thanks to Shane McCarthy)
 
  * Fix missing RG headers when using samtools merge -r.
-   (PR#1683, addresses htslib#1479.  Reported by Alex Leonard)
+   (PR #1683, addresses htslib#1479.  Reported by Alex Leonard)
 
  * Fix a possible unaligned access in samtools reference.
-   (PR#1696)
+   (PR #1696)
 
 Documentation:
 
  * Add documentation on CRAM compression profiles and some of the newer options
    that appear in CRAM 3.1 and above.
-   (PR#1659, fixes #1656.  Reported by Matthias De Smet)
+   (PR #1659, fixes #1656.  Reported by Matthias De Smet)
 
  * Add "sclen" filter expression keyword documentation.
-   (PR#1661, see also htslib#1441)
+   (PR #1661, see also htslib#1441)
 
  * Extend FILTER EXPRESSION man page section to match the changes made in
    HTSlib.
-   (PR#1687, samtools/htslib#1478)
+   (PR #1687, samtools/htslib#1478)
 
 Non user-visible changes and build improvements:
 
  * Ensure generated test files are ignored (by git) and cleaned (by make
    testclean)
-   (PR#1692, Thanks to John Marshall)
+   (PR #1692, Thanks to John Marshall)
 
 Release 1.15.1 (7th April 2022)
 -------------------------------
@@ -767,7 +767,7 @@ Release 1.12 (17th March 2021)
 
  * samtools markdup now benefits from an increase in performance in the
    situation when a single read has tens or hundreds of thousands of duplicates.
-   Thanks to `@denriquez` for reporting the issue. (#1345; fixed #1325)
+   Thanks to @denriquez for reporting the issue. (#1345; fixed #1325)
 
  * The documentation for samtools ampliconstats has been added to the samtools
    man page. (#1351)
@@ -793,7 +793,7 @@ Release 1.12 (17th March 2021)
    (#1343)
 
  * The documentation for `samtools depth -s` has been improved.
-   Thanks to `@wulj2`. (#1355)
+   Thanks to @wulj2. (#1355)
 
  * Fixed a `samtools merge` segmentation fault when it failed to merge
    header `@PG` records. Thanks to John Marshall.  (#1394; reported by
@@ -969,10 +969,10 @@ Changes affecting the whole of samtools, or multiple sub-commands:
    for invalid headers, it is possible that some illegal files will now
    be rejected when they would have been allowed by earlier versions. (#998)
 
-   Examples of problems that will now be rejected include @SQ lines with
-   no SN: tag, and @RG or @PG lines with no ID: tag.
+   Examples of problems that will now be rejected include `@SQ` lines with
+   no SN: tag, and `@RG` or `@PG` lines with no ID: tag.
 
- * samtools sub-commands will now add '@PG' header lines to output sam/bam/cram
+ * samtools sub-commands will now add `@PG` header lines to output sam/bam/cram
    files.  To disable this, use the '--no-PG' option. (#1087; #1097)
 
  * samtools now supports alignment records with reference positions greater
@@ -1152,7 +1152,7 @@ Changes affecting specific sub-commands:
 
  * samtools quickcheck:
 
-   - Add unmapped (-u) option, which disables the check for @SQ lines in
+   - Add unmapped (-u) option, which disables the check for `@SQ` lines in
      the header. (#920, thanks to Shane McCarthy)
 
  * samtools reheader:
@@ -1176,7 +1176,7 @@ Changes affecting specific sub-commands:
      header file to be specified in its own option. (#961)
 
    - Fixed bug where samtools split would crash if it read a SAM header that
-     contained an @RG line with no ID tag. (#954, reported by @blue-bird1)
+     contained an `@RG` line with no ID tag. (#954, reported by @blue-bird1)
 
  * samtools stats:
 
@@ -1399,7 +1399,7 @@ Release 1.7 (26th January 2018)
 * samtools bedcov will now ignore BED comment and header lines (#571; thanks
   to Daniel Baker).
 
-* samtools collate now updates the @HD SO: and GO: tags, and sort will
+* samtools collate now updates the `@HD` SO: and GO: tags, and sort will
   remove a GO: tag if present.  (#757; reported by Imran Haque).
 
 * Bug-fixes:
@@ -1466,7 +1466,7 @@ Release 1.5 [Solstice Release] (21st June 2017)
   (#675; thanks to Patrick Marks of 10xgenomics).
 
 Release 1.4.1  (8th May 2017)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-----------------------------
 
 * Added options to fastq to create fastq files from BC (or other)
   tags.
@@ -1491,13 +1491,13 @@ Release 1.4.1  (8th May 2017)
 
 
 Release 1.4  (13 March 2017)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+----------------------------
 
 Noteworthy changes in samtools:
 
 * Fixed Issue #345 - out-by-one error in insert-size in samtools stats
 
-* bam_split now add a @PG header to the bam file
+* bam_split now add a `@PG` header to the bam file
 
 * Added mate cigar tag support to fixmate
 
@@ -1529,7 +1529,7 @@ Noteworthy changes in samtools:
 
 
 Beta Release 1.3.1  (22 April 2016)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-----------------------------------
 
 Noteworthy changes in samtools:
 
@@ -1554,20 +1554,20 @@ Noteworthy changes in samtools:
   field (#517).
 
 * The merge and sort commands now handle (unusual) BAM files that have no
-  textual @SQ headers (#548, #550).
+  textual `@SQ` headers (#548, #550).
 
 * Sorting files containing @CO headers no longer duplicates the comment
   headers, which previously happened on large sorts for which temporary
   files were needed (#563).
 
-* The rmdup and view -l commands no longer crash on @RG headers that do not
+* The rmdup and view -l commands no longer crash on `@RG` headers that do not
   have a LB field (#538).
 
 * Fixed miscellaneous issues #128, #130, #131, #489, and #514.
 
 
 Beta Release 1.3  (15 December 2015)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------------
 
 Noteworthy changes in samtools:
 
@@ -1602,7 +1602,7 @@ Noteworthy changes in samtools:
   Previously it listened to $(LDLIBS) instead; if you were overriding
   that, you should now override LIBS rather than LDLIBS.
 
-* A new addreplacerg command that adds or alters @RG headers and RG:Z
+* A new addreplacerg command that adds or alters `@RG` headers and RG:Z
   record tags has been added.
 
 * The rmdup command no longer immediately aborts (previously it always
@@ -1663,7 +1663,7 @@ Noteworthy changes in samtools:
 
 
 Beta Release 1.2  (2 February 2015)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-----------------------------------
 
 Noteworthy changes in samtools:
 
@@ -1679,7 +1679,7 @@ Noteworthy changes in samtools:
 
 
 Beta Release 1.1 (19 September, 2014)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-------------------------------------
 
 Notable changes in samtools:
 
@@ -1694,7 +1694,7 @@ Notable changes in samtools:
 
 
 Beta Release 1.0 (15 August, 2014)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+----------------------------------
 
 First release of HTSlib-based samtools.
 
@@ -1707,13 +1707,13 @@ superseded by BGZF/bgzip and have been removed from samtools.
 
 
 Beta Release 0.1.20 (15 August, 2014)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-------------------------------------
 
 Final release of standalone samtools.
 
 
 Beta Release 0.1.19 (15 March, 2013)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------------
 
 Notable changes in samtools and bcftools:
 
@@ -1743,7 +1743,7 @@ http://github.com/samtools/samtools/commits/master
 
 
 Beta Release 0.1.18 (2 September, 2011)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------------------
 
 Notable changes in samtools:
 
@@ -1770,14 +1770,14 @@ Notable changes in bcftools:
 
 
 Beta Release 0.1.17 (6 July, 2011)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+----------------------------------
 
-With the maturity of `mpileup' and the lack of update in the `pileup' command,
-the `pileup' command is now formally dropped. Most of the pileup functionality,
+With the maturity of `mpileup` and the lack of update in the `pileup` command,
+the `pileup` command is now formally dropped. Most of the pileup functionality,
 such as outputting mapping quality and read positions, have been added
-`mpileup'.
+`mpileup`.
 
-Since this release, `bcftools view' is able to perform contrast SNP calling
+Since this release, `bcftools view` is able to perform contrast SNP calling
 (option -T) for discovering de novo and/or somatic mutations between a pair of
 samples or in a family trio. Potential mutations are scored by a log likelihood
 ratio, which is very simple in math, but should be comparable to more
@@ -1795,7 +1795,7 @@ Other notable changes in samtools:
  * Fixed an issue where mpileup does not apply BAQ for the first few reads when
    a region is specified.
 
- * Fixed an issue where `faidx' does not work with FASTA files with long lines.
+ * Fixed an issue where `faidx` does not work with FASTA files with long lines.
 
  * Bugfix: wrong SP genotype information in the BCF output.
 
@@ -1814,14 +1814,14 @@ Other notable changes in bcftools:
 
 
 Beta Release 0.1.16 (21 April, 2011)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------------
 
 Notable changes in samtools:
 
- * Support the new SAM/BAM type `B' in the latest SAM spec v1.4.
+ * Support the new SAM/BAM type `B` in the latest SAM spec v1.4.
 
- * When the output file of `samtools merge' exists, do not overwrite it unless
-   a new command-line option `-f' is applied.
+ * When the output file of `samtools merge` exists, do not overwrite it unless
+   a new command-line option `-f` is applied.
 
  * Bugfix: BED support is not working when the input BED is not sorted.
 
@@ -1839,11 +1839,11 @@ Notable changes in bcftools:
  * Use Brent's method to estimate the site allele frequency when EM converges
    slowly. The resulting ML estimate of allele frequnecy is more accurate.
 
- * Added the `ldpair' command, which computes r^2 between SNP pairs given in
+ * Added the `ldpair` command, which computes r^2 between SNP pairs given in
    an input file.
 
-Also, the `pileup' command, which has been deprecated by `mpileup' since
-version 0.1.10, will be dropped in the next release. The old `pileup' command
+Also, the `pileup` command, which has been deprecated by `mpileup` since
+version 0.1.10, will be dropped in the next release. The old `pileup` command
 is substandard and causing a lot of confusion.
 
 (0.1.16: 21 April 2011, r963:234)
@@ -1851,27 +1851,27 @@ is substandard and causing a lot of confusion.
 
 
 Beta Release 0.1.15 (10 April, 2011)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------------
 
 Notable changes:
 
  * Allow to perform variant calling or to extract information in multiple
-   regions specified by a BED file (`samtools mpileup -l', `samtools view -L'
-   and `bcftools view -l').
+   regions specified by a BED file (`samtools mpileup -l`, `samtools view -L`
+   and `bcftools view -l`).
 
- * Added the `depth' command to samtools to compute the per-base depth with a
-   simpler interface. File `bam2depth.c', which implements this command, is the
+ * Added the `depth` command to samtools to compute the per-base depth with a
+   simpler interface. File `bam2depth.c`, which implements this command, is the
    recommended example on how to use the mpileup APIs.
 
  * Estimate genotype frequencies with ML; perform chi^2 based Hardy-Weinberg
    test using this estimate.
 
- * For `samtools view', when `-R' is specified, drop read groups in the header
+ * For `samtools view`, when `-R` is specified, drop read groups in the header
    that are not contained in the specified file.
 
- * For `samtools flagstat', separate QC-pass and QC-fail reads.
+ * For `samtools flagstat`, separate QC-pass and QC-fail reads.
 
- * Improved the command line help of `samtools mpileup' and `bcftools view'.
+ * Improved the command line help of `samtools mpileup` and `bcftools view`.
 
  * Use a global variable to control the verbose level of samtools stderr
    output. Nonetheless, it has not been full utilized.
@@ -1884,7 +1884,7 @@ Notable changes:
 
 
 Beta release 0.1.14 (21 March, 2011)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------------
 
 This release implements a method for testing associations for case-control
 data. The method does not call genotypes but instead sums over all genotype
@@ -1898,48 +1898,48 @@ ploidy of each sample (see also manual bcftools/bcftools.1).
 
 Other notable changes:
 
- * Added `bcftools view -F' to parse BCF files generated by samtools r921 or
+ * Added `bcftools view -F` to parse BCF files generated by samtools r921 or
    older which encodes PL in a different way.
 
- * Changed the behavior of `bcftools view -s'. Now when a list of samples is
+ * Changed the behavior of `bcftools view -s`. Now when a list of samples is
    provided, the samples in the output will be reordered to match the ordering
    in the sample list. This change is mainly designed for association test.
 
- * Sped up `bcftools view -v' for target sequencing given thousands of samples.
-   Also added a new option `view -d' to skip loci where only a few samples are
+ * Sped up `bcftools view -v` for target sequencing given thousands of samples.
+   Also added a new option `view -d` to skip loci where only a few samples are
    covered by reads.
 
  * Dropped HWE test. This feature has never been implemented properly. An EM
    should be much better. To be implemented in future.
 
- * Added the `cat' command to samtools. This command concatenate BAMs with
+ * Added the `cat` command to samtools. This command concatenate BAMs with
    identical sequence dictionaries in an efficient way. Modified from bam_cat.c
    written by Chris Saunders.
 
- * Added `samtools view -1' to write BAMs at a low compression level but twice
-   faster to create. The `sort' command generates temporary files at a low
+ * Added `samtools view -1` to write BAMs at a low compression level but twice
+   faster to create. The `sort` command generates temporary files at a low
    compression level as well.
 
- * Added `samtools mpileup -6' to accept "BAM" with Illumina 1.3+ quality
+ * Added `samtools mpileup -6` to accept "BAM" with Illumina 1.3+ quality
    strings (strictly speaking, such a file is not BAM).
 
- * Added `samtools mpileup -L' to skip INDEL calling in regions with
+ * Added `samtools mpileup -L` to skip INDEL calling in regions with
    excessively high coverage. Such regions dramatically slow down mpileup.
 
- * Updated `misc/export2sam.pl', provided by Chris Saunders from Illumina Inc.
+ * Updated `misc/export2sam.pl`, provided by Chris Saunders from Illumina Inc.
 
 (0.1.14: 21 March 2011, r933:170)
 
 
 
 Beta release 0.1.13 (1 March, 2011)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-----------------------------------
 
 The most important though largely invisible modification is the change of the
 order of genotypes in the PL VCF/BCF tag. This is to conform the upcoming VCF
 spec v4.1. The change means that 0.1.13 is not backward compatible with VCF/BCF
 generated by samtools older than r921 inclusive.  VCF/BCF generated by the new
-samtools will contain a line `##fileformat=VCFv4.1' as well as the samtools
+samtools will contain a line `##fileformat=VCFv4.1` as well as the samtools
 version number.
 
 Single Individual Haplotyping (SIH) is added as an experimental feature. It
@@ -1957,17 +1957,17 @@ Other notable changes in samtools:
  * Improved sorting order checking in indexing. Now indexing is the preferred way
    to check if a BAM is sorted.
 
- * Added a switch `-E' to mpileup and calmd. This option uses an alternative way
+ * Added a switch `-E` to mpileup and calmd. This option uses an alternative way
    to apply BAQ, which increases sensistivity, especially to MNPs, at the cost of
    a little loss in specificity.
 
- * Added `mpileup -A' to allow to use reads in anomalous pairs in SNP calling.
+ * Added `mpileup -A` to allow to use reads in anomalous pairs in SNP calling.
 
- * Added `mpileup -m' to allow fine control of the collection of INDEL candidates.
+ * Added `mpileup -m` to allow fine control of the collection of INDEL candidates.
 
- * Added `mpileup -S' to compute per-sample strand bias P-value.
+ * Added `mpileup -S` to compute per-sample strand bias P-value.
 
- * Added `mpileup -G' to exclude read groups in variant calling.
+ * Added `mpileup -G` to exclude read groups in variant calling.
 
  * Fixed segfault in indel calling related to unmapped and refskip reads.
 
@@ -1978,7 +1978,7 @@ Other notable changes in samtools:
 
  * Fixed a very rare memory issue in bam_md.c
 
- * Fixed an out-of-boundary bug in mpileup when the read base is `N'.
+ * Fixed an out-of-boundary bug in mpileup when the read base is `N`.
 
  * Fixed a compiling error when the knetfile library is not used. Fixed a
    library compiling error due to the lack of bam_nt16_nt4_table[] table.
@@ -1989,9 +1989,9 @@ Other notable changes in bcftools:
 
  * Updated the BCF spec.
 
- * Added the `FQ' VCF INFO field, which gives the phred-scaled probability
+ * Added the `FQ` VCF INFO field, which gives the phred-scaled probability
    of all samples being the same (identical to the reference or all homozygous
-   variants). Option `view -f' has been dropped.
+   variants). Option `view -f` has been dropped.
 
  * Implemented "vcfutils.pl vcf2fq" to generate a consensus sequence
    similar to "samtools.pl pileup2fq".
@@ -2001,7 +2001,7 @@ Other notable changes in bcftools:
 
  * Output bcftools specific INFO and FORMAT in the VCF header.
 
- * Added `view -s' to call variants from a subset of samples.
+ * Added `view -s` to call variants from a subset of samples.
 
  * Properly convert VCF to BCF with a user provided sequence dictionary. Nonetheless,
    custom fields are still unparsed and will be stored as a missing value.
@@ -2014,7 +2014,7 @@ Other notable changes in bcftools:
 
 
 Beta release 0.1.12a (2 December, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------------------
 
 This is another bug fix release:
 
@@ -2040,7 +2040,7 @@ This is another bug fix release:
 
 
 Beta release 0.1.11 (21 November, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------------------
 
 This is mainly a bug fix release:
 
@@ -2061,8 +2061,8 @@ One minor feature has been implemented in bcftools:
    result in the new mode, although the reference sequence has to be
    used in realignment when SAMtools computes genotype likelihoods.
 
-In addition, since 0.1.10, the `pileup' command has been deprecated by
-`mpileup' which is more powerful and more accurate. The `pileup' command
+In addition, since 0.1.10, the `pileup` command has been deprecated by
+`mpileup` which is more powerful and more accurate. The `pileup` command
 will not be removed in the next few releases, but new features will not
 be added.
 
@@ -2071,7 +2071,7 @@ be added.
 
 
 Beta Release 0.1.10 (16 November, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------------------
 
 This release is featured as the first major improvement to the indel
 caller. The method is similar to the old one implemented in the pileup
@@ -2106,7 +2106,7 @@ Other notable changes:
 
 
 Beta Release 0.1.9 (27 October, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-------------------------------------
 
 This release is featured as the first major improvement to the samtools'
 SNP caller.  It comes with a revised MAQ error model, the support of
@@ -2155,36 +2155,36 @@ In addition to the three major improvements, other notable changes are:
 
 
 Beta Release 0.1.8 (11 July, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+----------------------------------
 
 Notable functional changes:
 
- * Added the `reheader' command which replaces a BAM header with a new
+ * Added the `reheader` command which replaces a BAM header with a new
    header. This command is much faster than replacing header by
    BAM->SAM->BAM conversions.
 
- * Added the `mpileup' command which computes the pileup of multiple
+ * Added the `mpileup` command which computes the pileup of multiple
    alignments.
 
- * The `index' command now stores the number of mapped and unmapped
+ * The `index` command now stores the number of mapped and unmapped
    reads in the index file. This information can be retrieved quickly by
-   the new `idxstats' command.
+   the new `idxstats` command.
 
  * By default, pileup used the SOAPsnp model for SNP calling. This
    avoids the floating overflow in the MAQ model which leads to spurious
    calls in repetitive regions, although these calls will be immediately
    filtered by varFilter.
 
- * The `tview' command now correctly handles CIGARs like 7I10M and
+ * The `tview` command now correctly handles CIGARs like 7I10M and
    10M1P1I10M which cause assertion failure in earlier versions.
 
- * Tview accepts a region like `=10,000' where `=' stands for the
+ * Tview accepts a region like `=10,000` where `=` stands for the
    current sequence name. This saves typing for long sequence names.
 
- * Added the `-d' option to `pileup' which avoids slow indel calling
+ * Added the `-d` option to `pileup` which avoids slow indel calling
    in ultradeep regions by subsampling reads locally.
 
- * Added the `-R' option to `view' which retrieves alignments in read
+ * Added the `-R` option to `view` which retrieves alignments in read
    groups listed in the specified file.
 
 Performance improvements:
@@ -2207,9 +2207,9 @@ Bug fixes:
  * Fixed another issue in the indel caller which may lead to incorrect
    genotype.
 
- * Fixed a bug in `sort' when option `-o' is applied.
+ * Fixed a bug in `sort` when option `-o` is applied.
 
- * Fixed a bug in `view -r'.
+ * Fixed a bug in `view -r`.
 
 APIs and other changes:
 
@@ -2234,7 +2234,7 @@ Changes in other utilities:
 
 
 Beta Release 0.1.7 (10 November, 2009)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--------------------------------------
 
 Notable changes:
 
@@ -2274,7 +2274,7 @@ Changes in other utilities:
 
 
 Beta Release 0.1.6 (2 September, 2009)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--------------------------------------
 
 Notable changes:
 
@@ -2294,17 +2294,17 @@ Notable changes:
  * The view command now recognizes and optionally prints FLAG in HEXs or
    strings to make a SAM file more friendly to human eyes. This is a
    samtools-C extension, not implemented in Picard for the time
-   being. Please type `samtools view -?' for more information.
+   being. Please type `samtools view -?` for more information.
 
  * BAM files now have an end-of-file (EOF) marker to facilitate
    truncation detection. A warning will be given if an on-disk BAM file
    does not have this marker. The warning will be seen on BAM files
    generated by an older version of samtools. It does NO harm.
 
- * New key bindings in tview: `r' to show read names and `s' to show
+ * New key bindings in tview: 'r' to show read names and 's' to show
    reference skip (N operation) as deletions.
 
- * Fixed a bug in `samtools merge -n'.
+ * Fixed a bug in `samtools merge -n`.
 
  * Samtools merge now optionally copies the header of a user specified
    SAM file to the resultant BAM output.
@@ -2325,7 +2325,7 @@ Changes in other utilities:
 
 
 Beta Release 0.1.5 (7 July, 2009)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------------
 
 Notable changes:
 
@@ -2361,7 +2361,7 @@ Notable changes:
    option is preferred when the output is piped to other commands.
 
  * In view, added '-l' and '-r' to get the alignments for one library or
-   read group. The "@RG" header lines are now partially parsed.
+   read group. The `@RG` header lines are now partially parsed.
 
  * Do not include command line utilities to libbam.a.
 
@@ -2386,7 +2386,7 @@ bugs.
 
 
 Beta Release 0.1.4 (21 May, 2009)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------------
 
 Notable changes:
 
@@ -2400,7 +2400,7 @@ Notable changes:
 
  * Unified the interface to BAM and SAM I/O. This is done by
    implementing a wrapper on top of the old APIs and therefore old APIs
-   are still valid. The new I/O APIs also recognize the @SQ header
+   are still valid. The new I/O APIs also recognize the `@SQ` header
    lines.
 
  * Generate the MD tag.
@@ -2412,7 +2412,7 @@ Notable changes:
 
  * Implemented the GNU building system. However, currently the building
    system does not generate libbam.a. We will improve this later. For
-   the time being, `make -f Makefile.generic' is preferred.
+   the time being, `make -f Makefile.generic` is preferred.
 
  * Fixed a minor bug in pileup: the first read in a chromosome may be
    skipped.
@@ -2427,7 +2427,7 @@ Notable changes:
 
 
 Beta Release 0.1.3 (15 April, 2009)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-----------------------------------
 
 Notable changes in SAMtools:
 
@@ -2452,7 +2452,7 @@ Notable changes in SAMtools:
  * Fixed a minor bug in indexing: the bin number of an unmapped read is
    wrongly calculated.
 
- * Added `flagstat' command to show statistics on the FLAG field.
+ * Added `flagstat` command to show statistics on the FLAG field.
 
  * Improved indel caller by setting the maximum window size in local
    realignment.
@@ -2474,7 +2474,7 @@ Changes in other utilities:
 
 
 Beta Release 0.1.2 (28 January, 2008)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-------------------------------------
 
 Notable changes in SAMtools:
 
@@ -2543,8 +2543,8 @@ Changes in other utilities:
 
 
 Beta Release 0.1.1 (22 December, 2008)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--------------------------------------
 
 The is the first public release of samtools. For more information,
-please check the manual page `samtools.1' and the samtools website
+please check the manual page `samtools.1` and the samtools website
 http://samtools.sourceforge.net

From 1f96fa8396b5ee88b1ece39826e2432f2e0be177 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 7 Dec 2023 11:21:25 +0000
Subject: [PATCH 24/26] Make samtools mpileup -aa work on empty files. (#1939)

---
 bam_plcmd.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/bam_plcmd.c b/bam_plcmd.c
index 264a7f5a0..c61d4b681 100644
--- a/bam_plcmd.c
+++ b/bam_plcmd.c
@@ -570,9 +570,11 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
     int ret;
     int last_tid = -1;
     hts_pos_t last_pos = -1;
+    int one_seq = 0;
 
     // begin pileup
     while ( (ret=bam_mplp64_auto(iter, &tid, &pos, n_plp, plp)) > 0) {
+        one_seq = 1; // at least 1 output
         if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
         mplp_get_ref(data[0], tid, &ref, &ref_len);
         //printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref);
@@ -811,8 +813,11 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
             last_tid = tid0;
             last_pos = beg0-1;
             mplp_get_ref(data[0], tid0, &ref, &ref_len);
+        } else if (last_tid < 0 && !one_seq && conf->all > 1) {
+            last_tid = 0; // --aa on a blank file
         }
-       while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) {
+        while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) {
+            mplp_get_ref(data[0], last_tid, &ref, &ref_len);
             while (++last_pos < sam_hdr_tid2len(h, last_tid)) {
                 if (last_pos >= end0) break;
                 if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0)

From 413d8fce1e5e6e3f010d9c0ef4521a0cb36665f6 Mon Sep 17 00:00:00 2001
From: Andrew Whitwham <whitwham@users.noreply.github.com>
Date: Mon, 11 Dec 2023 17:15:00 +0000
Subject: [PATCH 25/26] Speed up optical duplicate marking. (PR #1952)

Checking optical duplicates in very deep data was very slow.  By adding more sort criteria we can reduce the total amount of work that needs to be done and so speed up the processing.
---
 bam_markdup.c | 173 ++++++++++++++++++++++++++++----------------------
 1 file changed, 97 insertions(+), 76 deletions(-)

diff --git a/bam_markdup.c b/bam_markdup.c
index 677a47f82..17fedd58d 100644
--- a/bam_markdup.c
+++ b/bam_markdup.c
@@ -122,6 +122,7 @@ typedef struct {
     int opt;
     int beg;
     int end;
+    int len;
 } check_t;
 
 typedef struct {
@@ -875,7 +876,10 @@ static int is_optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, lon
         return ret;
     }
 
-    if (strncmp(original + o_beg, duplicate + d_beg, o_end - o_beg) == 0) {
+    int o_len = o_end - o_beg;
+    int d_len = d_end - d_beg;
+
+    if ((o_len == d_len) && memcmp(original + o_beg, duplicate + d_beg, o_len) == 0) {
         long xdiff, ydiff;
 
         if (ox > dx) {
@@ -918,7 +922,10 @@ static int optical_duplicate_partial(md_param_t *param, const char *name, const
         return ret;
     }
 
-    if (strncmp(name + o_beg, duplicate + d_beg, o_end - o_beg) == 0) {
+    int o_len = o_end - o_beg;
+    int d_len = d_end - d_beg;
+
+    if ((o_len == d_len) && memcmp(name + o_beg, duplicate + d_beg, o_len) == 0) {
         // the initial parts match, look at the numbers
         long xdiff, ydiff;
 
@@ -945,6 +952,7 @@ static int optical_duplicate_partial(md_param_t *param, const char *name, const
     c->y = dy;
     c->beg = d_beg;
     c->end = d_end;
+    c->len = d_end - d_beg;
 
     return ret;
 }
@@ -1156,9 +1164,15 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *
 }
 
 
-static int xcoord_sort(const void *a, const void *b) {
+static int chain_sort(const void *a, const void *b) {
     check_t *ac = (check_t *) a;
     check_t *bc = (check_t *) b;
+    int ret;
+
+    if ((ret = ac->len - bc->len))
+        return ret;
+    else if ((ret = memcmp(bam_get_qname(ac->b) + ac->beg, bam_get_qname(bc->b) + bc->beg, ac->len)))
+        return ret;
 
     return (ac->x - bc->x);
 }
@@ -1170,106 +1184,113 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has
     int ret = 0;
     size_t curr = 0;
 
-    qsort(list->c, list->length, sizeof(list->c[0]), xcoord_sort);
+    qsort(list->c, list->length, sizeof(list->c[0]), chain_sort);
 
     while (curr < list->length - 1) {
-        check_t *current = &list->c[curr];
-        size_t count = curr;
-        char *cur_name = bam_get_qname(current->b);
-        int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
+        check_t *base = &list->c[curr];
+        char *base_name = bam_get_qname(base->b);
+        int end_name_match = curr;
 
-        while (++count < list->length && (list->c[count].x - current->x <= param->opt_dist)) {
-            // while close enough along the x coordinate
-            check_t *chk = &list->c[count];
+        // find the end of the matching name parts
+        while (++end_name_match < list->length) {
+            check_t *chk = &list->c[end_name_match];
 
-            if (current->opt && chk->opt)
-                continue;
+            if ((base->len == chk->len) && memcmp(base_name + base->beg, bam_get_qname(chk->b) + chk->beg, base->len) != 0)
+                break;
+        }
 
-            // if both are already optical duplicates there is no need to check again, otherwise...
+        while (curr < end_name_match) {
+            size_t count = curr;
+            check_t *current = &list->c[curr];
+            int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
 
-            long ydiff;
+            while (++count < end_name_match && (list->c[count].x - current->x <= param->opt_dist)) {
+                // while close enough along the x coordinate
+                check_t *chk = &list->c[count];
 
-            if (current->y > chk->y) {
-                ydiff = current->y - chk->y;
-            } else {
-                ydiff = chk->y - current->y;
-            }
+                if (current->opt && chk->opt)
+                    continue;
 
-            if (ydiff > param->opt_dist)
-                continue;
+                long ydiff;
 
-            // the number are right, check the names
-            if (strncmp(cur_name + current->beg, bam_get_qname(chk->b) + chk->beg, current->end - current->beg) != 0)
-                continue;
+                if (current->y > chk->y) {
+                    ydiff = current->y - chk->y;
+                } else {
+                    ydiff = chk->y - current->y;
+                }
 
-            // optical duplicates
-            int chk_dup = 0;
-            int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP);
+                if (ydiff > param->opt_dist)
+                    continue;
 
-            if (current_paired != chk_paired) {
-                if (!chk_paired) {
-                    // chk is single vs pair, this is a dup.
-                    chk_dup = 1;
-                }
-            } else {
-                // do it by scores
-                int64_t cur_score, chk_score;
+                // optical duplicates
+                int chk_dup = 0;
+                int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP);
 
-                if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) {
-                    if (current->b->core.flag & BAM_FQCFAIL) {
-                        cur_score = 0;
-                        chk_score = 1;
-                    } else {
-                        cur_score = 1;
-                        chk_score = 0;
+                if (current_paired != chk_paired) {
+                    if (!chk_paired) {
+                        // chk is single vs pair, this is a dup.
+                        chk_dup = 1;
                     }
                 } else {
-                    cur_score = current->score;
-                    chk_score = chk->score;
+                    // do it by scores
+                    int64_t cur_score, chk_score;
 
-                    if (current_paired) {
-                        // they are pairs so add mate scores.
-                        chk_score += chk->mate_score;
-                        cur_score += current->mate_score;
+                    if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) {
+                        if (current->b->core.flag & BAM_FQCFAIL) {
+                            cur_score = 0;
+                            chk_score = 1;
+                        } else {
+                            cur_score = 1;
+                            chk_score = 0;
+                        }
+                    } else {
+                        cur_score = current->score;
+                        chk_score = chk->score;
+
+                        if (current_paired) {
+                            // they are pairs so add mate scores.
+                            chk_score += chk->mate_score;
+                            cur_score += current->mate_score;
+                        }
                     }
-                }
 
-                if (cur_score == chk_score) {
-                    if (strcmp(bam_get_qname(chk->b), cur_name) < 0) {
-                        chk_score++;
-                    } else {
-                        chk_score--;
+                    if (cur_score == chk_score) {
+                        if (strcmp(bam_get_qname(chk->b), bam_get_qname(current->b)) < 0) {
+                            chk_score++;
+                        } else {
+                            chk_score--;
+                        }
                     }
-                }
 
-                if (cur_score > chk_score) {
-                    chk_dup = 1;
+                    if (cur_score > chk_score) {
+                        chk_dup = 1;
+                    }
                 }
-            }
 
-            if (chk_dup) {
-                // the duplicate is the optical duplicate
-                if (!chk->opt) { // only change if not already an optical duplicate
-                    if (optical_retag(param, dup_hash, chk->b, chk_paired, stats)) {
-                        ret = -1;
-                        goto fail;
-                    }
+                if (chk_dup) {
+                    // the duplicate is the optical duplicate
+                    if (!chk->opt) { // only change if not already an optical duplicate
+                        if (optical_retag(param, dup_hash, chk->b, chk_paired, stats)) {
+                            ret = -1;
+                            goto fail;
+                        }
 
-                    chk->opt = 1;
-                }
-            } else {
-                if (!current->opt) {
-                    if (optical_retag(param, dup_hash, current->b, current_paired, stats)) {
-                        ret = -1;
-                        goto fail;
+                        chk->opt = 1;
                     }
+                } else {
+                    if (!current->opt) {
+                        if (optical_retag(param, dup_hash, current->b, current_paired, stats)) {
+                            ret = -1;
+                            goto fail;
+                        }
 
-                    current->opt = 1;
+                        current->opt = 1;
+                    }
                 }
             }
-        }
 
-        curr++;
+            curr++;
+        }
     }
 
  fail:

From d3e2c29ffd8ceea5f4937cb6ffe4efc1d4e8a0c2 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Mon, 11 Dec 2023 17:53:54 +0000
Subject: [PATCH 26/26] Add latest NEWS for 1.19

---
 NEWS.md | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 1527fbb00..8ba86b113 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -17,8 +17,7 @@ New work and changes:
   read not matching the specified files.
   (PR #1896, fixes #1895.  Suggested by Feng Tian)
 
-* Cope with Htslib's change to no longer close stdout on is hts_close is
-  used with the output file being stdout.
+* Cope with Htslib's change to no longer close stdout on hts_close.
   Htslib companion PR is samtools/htslib#1665.
   (PR #1909.  Thanks to John Marshall)
 
@@ -33,7 +32,7 @@ New work and changes:
   (PR #1222, PR #1933, fixes #1758.  Thanks to Valeriu Ohan, suggested by
   Scott Norton)
 
-Bug Fixes
+Bug Fixes:
 
 * Samtools stats: empty barcode tags are now treated as having no barcode.
   (PR #1929, fixes #1926.  Reported by Jukka Matilainen)
@@ -43,8 +42,15 @@ Bug Fixes
   non-seekable such as stdin or a pipe.
   (PR #1930, fixes #1731.  Reported by Julian Hess)
 
+* Samtools mpileup -aa (absolutely all positions) now produces an output
+  even when given an empty input file.
+  (PR #1939.  Reported by Chang Y)
+
+* Samtools markdup: speed up optical duplicate tagging on regions with very
+  deep data.
+  (PR #1952)
 
-Documentation
+Documentation:
 
 * Samtools mpileup: add more usage examples to the man page.
   (PR #1913, fixes #1801)