feat: export structural variants (#13)

bihealth · May 3, 2023 · db44d87 · db44d87
1 parent 83e8f7e
commit db44d87
Show file tree

Hide file tree

Showing 9 changed files with 388 additions and 59 deletions.
diff --git a/clinvar_tsv/Snakefile b/clinvar_tsv/Snakefile
@@ -10,7 +10,8 @@ REF = {
 rule default:
     input:
         expand(
-            "output/clinvar.{build}.tsv{ext}",
+            "output/clinvar_{size}.{build}.tsv{ext}",
+            size=("small", "sv"),
             build=("b37", "b38"),
             ext=(".gz", ".gz.tbi", ".gz.md5", ".gz.tbi.md5"),
         )
@@ -39,18 +40,22 @@ rule parse_clinvar:
         release_xml="downloads/ClinVarFullRelease_00-latest.xml.gz",
         summary="downloads/variant_summary.txt.gz",
     output:
-        b37=temp("parsed/clinvar_table_raw.b37.tsv"),
-        b38=temp("parsed/clinvar_table_raw.b38.tsv"),
+        b37_small="parsed/clinvar_table_raw.b37.tsv",
+        b37_sv="parsed/clinvar_sv.b37.tsv",
+        b38_small="parsed/clinvar_table_raw.b38.tsv",
+        b38_sv="parsed/clinvar_sv.b38.tsv",
     shell:
         r"""
         set -euo pipefail
 
         clinvar_tsv parse_xml \
             --clinvar-xml {input.release_xml} \
-            --output-b37 {output.b37} \
-            --output-b38 {output.b38} \
+            --output-b37-small {output.b37_small} \
+            --output-b37-sv {output.b37_sv} \
+            --output-b38-small {output.b38_small} \
+            --output-b38-sv {output.b38_sv} \
             $(if [[ {config[debug]} == "True" ]]; then
-                echo "--max-rcvs 100"
+                echo "--max-rcvs 1000"
             fi)
         """
 
@@ -59,7 +64,7 @@ rule normalize_clinvar:
     input:
         tsv="parsed/clinvar_table_raw.{genome_build}.tsv",
         reference=lambda wildcards: REF[wildcards.genome_build],
-    output: temp("normalized/clinvar_table_normalized.{genome_build}.tsv.gz"),
+    output: "normalized/clinvar_table_normalized.{genome_build}.tsv.gz",
     shell:
         r"""
         set -euo pipefail
@@ -77,10 +82,10 @@ rule normalize_clinvar:
 rule sort_tabix_clinvar:
     input: "normalized/clinvar_table_normalized.{genome_build}.tsv.gz",
     output:
-        tsv="unmerged/clinvar.{genome_build}.tsv.gz",
-        tbi="unmerged/clinvar.{genome_build}.tsv.gz.tbi",
-        tsv_md5="unmerged/clinvar.{genome_build}.tsv.gz.md5",
-        tbi_md5="unmerged/clinvar.{genome_build}.tsv.gz.tbi.md5",
+        tsv="unmerged/clinvar_small.{genome_build}.tsv.gz",
+        tbi="unmerged/clinvar_small.{genome_build}.tsv.gz.tbi",
+        tsv_md5="unmerged/clinvar_small.{genome_build}.tsv.gz.md5",
+        tbi_md5="unmerged/clinvar_small.{genome_build}.tsv.gz.tbi.md5",
     shell:
         r"""
         set -euo pipefail
@@ -97,13 +102,38 @@ rule sort_tabix_clinvar:
         md5sum $(basename {output.tbi}) >$(basename {output.tbi}).md5
         """
 
+rule sort_svs:
+    input: "parsed/clinvar_sv.{genome_build}.tsv",
+    output:
+        tsv="unmerged/clinvar_sv.{genome_build}.tsv.gz",
+        tbi="unmerged/clinvar_sv.{genome_build}.tsv.gz.tbi",
+        tsv_md5="unmerged/clinvar_sv.{genome_build}.tsv.gz.md5",
+        tbi_md5="unmerged/clinvar_sv.{genome_build}.tsv.gz.tbi.md5",
+    params:
+        clinvar_version=config.get("clinvar_version", ".")
+    shell:
+        r"""
+        set -euo pipefail
+
+        cat \
+            <(cat {input} | head -n 1) \
+            <(cat {input} | tail -n +2 | sort -k2,2V -k3,3n -k4,4n -k11,11) \
+        | bgzip -c \
+        > {output.tsv}
+        tabix -S 1 -s 2 -b 3 -e 4 -f {output.tsv}
+
+        cd $(dirname {output.tsv})
+        md5sum $(basename {output.tsv}) >$(basename {output.tsv}).md5
+        md5sum $(basename {output.tbi}) >$(basename {output.tbi}).md5
+        """
+
 rule merge_clinvar:
-    input: "unmerged/clinvar.{genome_build}.tsv.gz",
+    input: "unmerged/clinvar_{size}.{genome_build}.tsv.gz",
     output:
-        tsv="output/clinvar.{genome_build}.tsv.gz",
-        tbi="output/clinvar.{genome_build}.tsv.gz.tbi",
-        tsv_md5="output/clinvar.{genome_build}.tsv.gz.md5",
-        tbi_md5="output/clinvar.{genome_build}.tsv.gz.tbi.md5",
+        tsv="output/clinvar_{size}.{genome_build}.tsv.gz",
+        tbi="output/clinvar_{size}.{genome_build}.tsv.gz.tbi",
+        tsv_md5="output/clinvar_{size}.{genome_build}.tsv.gz.md5",
+        tbi_md5="output/clinvar_{size}.{genome_build}.tsv.gz.tbi.md5",
     params:
         clinvar_version=config.get("clinvar_version", ".")
     shell:

diff --git a/clinvar_tsv/__main__.py b/clinvar_tsv/__main__.py
@@ -21,6 +21,7 @@ def run_inspect(args):
         "summary": True,
         "verbose": True,
     }
+    print(f"Running snakemake {kwargs}")
     snakemake.snakemake(**kwargs)
 
 
@@ -40,6 +41,7 @@ def run_main(args):
         "force_incomplete": True,
         "cores": 8,
     }
+    print(f"Running snakemake {kwargs}")
     return not snakemake.snakemake(**kwargs)
 
 
@@ -52,15 +54,21 @@ def open_maybe_gzip(path, mode):
 
 def run_parse_xml(args):
     """Parse XML file."""
-    with open(args.output_b37, "wt") as output_b37:
-        with open(args.output_b38, "wt") as output_b38:
-            parser = parse_clinvar_xml.ClinvarParser(
-                input_file=open_maybe_gzip(args.clinvar_xml, "rb"),
-                out_b37=output_b37,
-                out_b38=output_b38,
-                max_rcvs=args.max_rcvs,
-            )
-            parser.run()
+    with (
+        open(args.output_b37_small, "wt") as output_b37_small,
+        open(args.output_b37_sv, "wt") as output_b37_sv,
+        open(args.output_b38_small, "wt") as output_b38_small,
+        open(args.output_b38_sv, "wt") as output_b38_sv,
+    ):
+        parser = parse_clinvar_xml.ClinvarParser(
+            input_file=open_maybe_gzip(args.clinvar_xml, "rb"),
+            out_b37_small=output_b37_small,
+            out_b37_sv=output_b37_sv,
+            out_b38_small=output_b38_small,
+            out_b38_sv=output_b38_sv,
+            max_rcvs=args.max_rcvs,
+        )
+        parser.run()
 
 
 def run_normalize_tsv(args):
@@ -129,10 +137,16 @@ def main(argv=None):
     parser_parse_xml = subparsers.add_parser("parse_xml", help="Parse the Clinvar XML")
     parser_parse_xml.add_argument("--clinvar-xml", required=True, help="Path to Clinvar XML file.")
     parser_parse_xml.add_argument(
-        "--output-b37", required=True, help="Output path for GRCh37 file."
+        "--output-b37-small", required=True, help="Output path for small vars GRCh37 file."
     )
     parser_parse_xml.add_argument(
-        "--output-b38", required=True, help="Output path for GRCh38 file."
+        "--output-b37-sv", required=True, help="Output path for SV GRCh37 file."
+    )
+    parser_parse_xml.add_argument(
+        "--output-b38-small", required=True, help="Output path for small vars GRCh38 file."
+    )
+    parser_parse_xml.add_argument(
+        "--output-b38-sv", required=True, help="Output path for SV GRCh38 file."
     )
     parser_parse_xml.add_argument(
         "--max-rcvs", required=False, type=int, help="Maximal number of RCV records to process."

diff --git a/clinvar_tsv/common.py b/clinvar_tsv/common.py
@@ -206,13 +206,15 @@ def from_element(cls, element: ET.Element):
             elem.text for elem in element.findall('.//Symbol/ElementValue[@Type="Preferred"]')
         ]
         hgnc_ids = [elem.attrib.get("ID") for elem in element.findall('.//XRef[@DB="HGNC"]')]
+        n = element.findall("./SequenceLocation")
+        m = element.findall(".//SequenceLocation")
         return Measure(
             measure_type=element.attrib.get("Type"),
             symbols=tuple(sorted(set(symbols))),
             hgnc_ids=tuple(sorted(set(hgnc_ids))),
             sequence_locations={
                 elem.attrib.get("Assembly"): SequenceLocation.from_element(elem)
-                for elem in element.findall("./SequenceLocation")
+                for elem in (n or m)
             },
             comments=tuple(
                 elem.text
@@ -539,13 +541,11 @@ def from_element(element: ET.Element):
     0: (
         "uncertain significance",
         # below: other
-        "affects",
         "association",
         "association not found",
         "cancer",
         "confers sensitivity",
         "drug response",
-        "drug response",
         "drug-response",
         "histocompatibility",
         "not provided",
@@ -557,6 +557,7 @@ def from_element(element: ET.Element):
         "untested",
         "variant of unknown significance",
         "associated with leiomyomas",
+        "conflicting data from submitters",
     ),
     1: (
         "likely pathogenic",
@@ -613,7 +614,11 @@ def label(self):
 
 
 _REVIEW_STATUS_LABELS = {
-    1: ("conflicting interpretations", "conflicting interpretations of pathogenicity"),
+    1: (
+        "conflicting interpretations",
+        "conflicting interpretations of pathogenicity",
+        "conflicting data from submitters",
+    ),
     2: ("criteria provided",),
     3: ("multiple submitters",),
     4: ("no assertion criteria provided",),

diff --git a/clinvar_tsv/normalize.py b/clinvar_tsv/normalize.py
@@ -153,6 +153,7 @@ def normalize_tab_delimited_file(infile, outfile, reference_fasta, verbose=True)
     ref_equals_alt = 0
     wrong_ref = 0
     invalid_nucleotide = 0
+    unknown_contig = 0
     for line in infile.readlines():
         data = dict(zip(columns, line.strip("\n").split("\t")))
         # fill the data with blanks for any missing data
@@ -176,6 +177,10 @@ def normalize_tab_delimited_file(infile, outfile, reference_fasta, verbose=True)
             )
             if data["chromosome"].startswith("chr"):
                 data["chromosome"] = data["chromosome"][3:]
+        except KeyError as e:
+            sys.stderr.write("\n" + str(e) + "\n")
+            unknown_contig += 1
+            continue
         except RefEqualsAltError as e:
             sys.stderr.write("\n" + str(e) + "\n")
             ref_equals_alt += 1
@@ -200,6 +205,6 @@ def normalize_tab_delimited_file(infile, outfile, reference_fasta, verbose=True)
     outfile.write("\n\n")
     if verbose:
         sys.stderr.write(
-            "Final counts of variants discarded:\nREF == ALT: %s\nWrong REF: %s\nInvalid nucleotide: %s\n"
-            % (ref_equals_alt, wrong_ref, invalid_nucleotide)
+            "Final counts of variants discarded:\nREF == ALT: %s\nWrong REF: %s\nInvalid nucleotide: %s\nUnknown contig: %s\n"
+            % (ref_equals_alt, wrong_ref, invalid_nucleotide, unknown_contig)
         )
diff --git a/clinvar_tsv/parse_clinvar_xml.py b/clinvar_tsv/parse_clinvar_xml.py
@@ -41,26 +41,46 @@
 )
 
 
+def try_map(values, converter, catch_exc):
+    """Try to convert the given iterable of converters ignoring exceptions up to the last one"""
+    for i, value in enumerate(values):
+        try:
+            return converter(value)
+        except catch_exc:
+            if i + 1 == len(values):
+                raise  # re-raise
+
+
 class ClinvarParser:
     """Helper class for parsing Clinvar XML"""
 
-    def __init__(self, input_file, out_b37, out_b38, max_rcvs=None):
+    def __init__(
+        self, input_file, out_b37_small, out_b37_sv, out_b38_small, out_b38_sv, max_rcvs=None
+    ):
         #: ``file``-like object to load the XML from
         self.input = input_file
-        #: ``file``-like object to write GRCh37 variants to
-        self.out_b37 = out_b37
-        #: ``file``-like object to write GRCh38 variants to
-        self.out_b38 = out_b38
+        #: ``file``-like object to write GRCh37 small variants to
+        self.out_b37_small = out_b37_small
+        #: ``file``-like object to write GRCh37 SVs to
+        self.out_b37_sv = out_b37_sv
+        #: ``file``-like object to write GRCh38 small variants to
+        self.out_b38_small = out_b38_small
+        #: ``file``-like object to write GRCh38 SVs to
+        self.out_b38_sv = out_b38_sv
         #: Number of processed RCVs, used for progress disable.
         self.rcvs = 0
         #: Largest number of rcvs to process out (for testing only)
         self.max_rcvs = max_rcvs
 
-    def run(self):
+    def run(self):  # noqa: C901
         logger.info("Parsing elements...")
-        out_files = {"GRCh37": self.out_b37, "GRCh38": self.out_b38}
-        for out_file in out_files.values():
-            print(TSV_HEADER, file=out_file)
+        out_files = {
+            "GRCh37": {"small": self.out_b37_small, "sv": self.out_b37_sv},
+            "GRCh38": {"small": self.out_b38_small, "sv": self.out_b38_sv},
+        }
+        for d in out_files.values():
+            for out_file in d.values():
+                print(TSV_HEADER, file=out_file)
         with tqdm.tqdm(unit="rcvs") as progress:
             for event, elem in ET.iterparse(self.input):
                 if elem.tag == "ClinVarSet" and event == "end":
@@ -76,6 +96,60 @@ def run(self):
                                 for build, location in measure.sequence_locations.items():
                                     if build not in out_files:
                                         continue
+                                    if location.ref is None or location.alt is None:
+                                        if measure.measure_type in (
+                                            "single nucleotide variant",
+                                            "Variation",
+                                        ):
+                                            continue  # skip, just a small variant without a coordinate
+                                        try:
+                                            start = try_map(
+                                                (
+                                                    location.start,
+                                                    location.outer_start,
+                                                    location.inner_start,
+                                                ),
+                                                int,
+                                                TypeError,
+                                            )
+                                            stop = try_map(
+                                                (
+                                                    location.stop,
+                                                    location.outer_stop,
+                                                    location.inner_stop,
+                                                ),
+                                                int,
+                                                TypeError,
+                                            )
+                                        except TypeError:
+                                            logger.info(
+                                                "Cannot determine location from %s", measure
+                                            )
+                                            continue
+                                        row = [
+                                            build,
+                                            location.chrom,
+                                            start,
+                                            stop,
+                                            binning.assign_bin(start - 1, stop),
+                                            location.ref or ".",
+                                            location.alt or ".",
+                                            measure.measure_type,
+                                            as_pg_list(measure.symbols),
+                                            as_pg_list(measure.hgnc_ids),
+                                            measure_set.accession,
+                                            clinvar_set.ref_cv_assertion.clinvar_accession,
+                                            clinvar_set.ref_cv_assertion.review_status,
+                                            clinvar_set.ref_cv_assertion.gold_stars,
+                                            clinvar_set.ref_cv_assertion.pathogenicity,
+                                            origin,
+                                            json.dumps(
+                                                cattr.unstructure(clinvar_set), cls=DateTimeEncoder
+                                            )
+                                            .replace(r"\"", "'")
+                                            .replace('"', '"""'),
+                                        ]
+                                        print("\t".join(map(str, row)), file=out_files[build]["sv"])
                                     elif location.ref is not None and location.alt is not None:
                                         if len(location.ref) == 1 and len(location.alt) == 1:
                                             variation_type = "snv"
@@ -106,7 +180,9 @@ def run(self):
                                             .replace(r"\"", "'")
                                             .replace('"', '"""'),
                                         ]
-                                        print("\t".join(map(str, row)), file=out_files[build])
+                                        print(
+                                            "\t".join(map(str, row)), file=out_files[build]["small"]
+                                        )
                         progress.update()
                     elem.clear()
                 if self.max_rcvs and self.rcvs >= self.max_rcvs:

diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -4,7 +4,7 @@
 -r test.txt
 
 # Prettier CLI for py.test
-pytest-sugar==0.9.5
+pytest-sugar
 
 # Sphinx theme
 sphinx_rtd_theme==0.1.9