Skip to content

Commit

Permalink
feat: export structural variants (#13)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed May 3, 2023
1 parent 83e8f7e commit db44d87
Show file tree
Hide file tree
Showing 9 changed files with 388 additions and 59 deletions.
62 changes: 46 additions & 16 deletions clinvar_tsv/Snakefile
Expand Up @@ -10,7 +10,8 @@ REF = {
rule default:
input:
expand(
"output/clinvar.{build}.tsv{ext}",
"output/clinvar_{size}.{build}.tsv{ext}",
size=("small", "sv"),
build=("b37", "b38"),
ext=(".gz", ".gz.tbi", ".gz.md5", ".gz.tbi.md5"),
)
Expand Down Expand Up @@ -39,18 +40,22 @@ rule parse_clinvar:
release_xml="downloads/ClinVarFullRelease_00-latest.xml.gz",
summary="downloads/variant_summary.txt.gz",
output:
b37=temp("parsed/clinvar_table_raw.b37.tsv"),
b38=temp("parsed/clinvar_table_raw.b38.tsv"),
b37_small="parsed/clinvar_table_raw.b37.tsv",
b37_sv="parsed/clinvar_sv.b37.tsv",
b38_small="parsed/clinvar_table_raw.b38.tsv",
b38_sv="parsed/clinvar_sv.b38.tsv",
shell:
r"""
set -euo pipefail
clinvar_tsv parse_xml \
--clinvar-xml {input.release_xml} \
--output-b37 {output.b37} \
--output-b38 {output.b38} \
--output-b37-small {output.b37_small} \
--output-b37-sv {output.b37_sv} \
--output-b38-small {output.b38_small} \
--output-b38-sv {output.b38_sv} \
$(if [[ {config[debug]} == "True" ]]; then
echo "--max-rcvs 100"
echo "--max-rcvs 1000"
fi)
"""

Expand All @@ -59,7 +64,7 @@ rule normalize_clinvar:
input:
tsv="parsed/clinvar_table_raw.{genome_build}.tsv",
reference=lambda wildcards: REF[wildcards.genome_build],
output: temp("normalized/clinvar_table_normalized.{genome_build}.tsv.gz"),
output: "normalized/clinvar_table_normalized.{genome_build}.tsv.gz",
shell:
r"""
set -euo pipefail
Expand All @@ -77,10 +82,10 @@ rule normalize_clinvar:
rule sort_tabix_clinvar:
input: "normalized/clinvar_table_normalized.{genome_build}.tsv.gz",
output:
tsv="unmerged/clinvar.{genome_build}.tsv.gz",
tbi="unmerged/clinvar.{genome_build}.tsv.gz.tbi",
tsv_md5="unmerged/clinvar.{genome_build}.tsv.gz.md5",
tbi_md5="unmerged/clinvar.{genome_build}.tsv.gz.tbi.md5",
tsv="unmerged/clinvar_small.{genome_build}.tsv.gz",
tbi="unmerged/clinvar_small.{genome_build}.tsv.gz.tbi",
tsv_md5="unmerged/clinvar_small.{genome_build}.tsv.gz.md5",
tbi_md5="unmerged/clinvar_small.{genome_build}.tsv.gz.tbi.md5",
shell:
r"""
set -euo pipefail
Expand All @@ -97,13 +102,38 @@ rule sort_tabix_clinvar:
md5sum $(basename {output.tbi}) >$(basename {output.tbi}).md5
"""

rule sort_svs:
input: "parsed/clinvar_sv.{genome_build}.tsv",
output:
tsv="unmerged/clinvar_sv.{genome_build}.tsv.gz",
tbi="unmerged/clinvar_sv.{genome_build}.tsv.gz.tbi",
tsv_md5="unmerged/clinvar_sv.{genome_build}.tsv.gz.md5",
tbi_md5="unmerged/clinvar_sv.{genome_build}.tsv.gz.tbi.md5",
params:
clinvar_version=config.get("clinvar_version", ".")
shell:
r"""
set -euo pipefail
cat \
<(cat {input} | head -n 1) \
<(cat {input} | tail -n +2 | sort -k2,2V -k3,3n -k4,4n -k11,11) \
| bgzip -c \
> {output.tsv}
tabix -S 1 -s 2 -b 3 -e 4 -f {output.tsv}
cd $(dirname {output.tsv})
md5sum $(basename {output.tsv}) >$(basename {output.tsv}).md5
md5sum $(basename {output.tbi}) >$(basename {output.tbi}).md5
"""

rule merge_clinvar:
input: "unmerged/clinvar.{genome_build}.tsv.gz",
input: "unmerged/clinvar_{size}.{genome_build}.tsv.gz",
output:
tsv="output/clinvar.{genome_build}.tsv.gz",
tbi="output/clinvar.{genome_build}.tsv.gz.tbi",
tsv_md5="output/clinvar.{genome_build}.tsv.gz.md5",
tbi_md5="output/clinvar.{genome_build}.tsv.gz.tbi.md5",
tsv="output/clinvar_{size}.{genome_build}.tsv.gz",
tbi="output/clinvar_{size}.{genome_build}.tsv.gz.tbi",
tsv_md5="output/clinvar_{size}.{genome_build}.tsv.gz.md5",
tbi_md5="output/clinvar_{size}.{genome_build}.tsv.gz.tbi.md5",
params:
clinvar_version=config.get("clinvar_version", ".")
shell:
Expand Down
36 changes: 25 additions & 11 deletions clinvar_tsv/__main__.py
Expand Up @@ -21,6 +21,7 @@ def run_inspect(args):
"summary": True,
"verbose": True,
}
print(f"Running snakemake {kwargs}")
snakemake.snakemake(**kwargs)


Expand All @@ -40,6 +41,7 @@ def run_main(args):
"force_incomplete": True,
"cores": 8,
}
print(f"Running snakemake {kwargs}")
return not snakemake.snakemake(**kwargs)


Expand All @@ -52,15 +54,21 @@ def open_maybe_gzip(path, mode):

def run_parse_xml(args):
"""Parse XML file."""
with open(args.output_b37, "wt") as output_b37:
with open(args.output_b38, "wt") as output_b38:
parser = parse_clinvar_xml.ClinvarParser(
input_file=open_maybe_gzip(args.clinvar_xml, "rb"),
out_b37=output_b37,
out_b38=output_b38,
max_rcvs=args.max_rcvs,
)
parser.run()
with (
open(args.output_b37_small, "wt") as output_b37_small,
open(args.output_b37_sv, "wt") as output_b37_sv,
open(args.output_b38_small, "wt") as output_b38_small,
open(args.output_b38_sv, "wt") as output_b38_sv,
):
parser = parse_clinvar_xml.ClinvarParser(
input_file=open_maybe_gzip(args.clinvar_xml, "rb"),
out_b37_small=output_b37_small,
out_b37_sv=output_b37_sv,
out_b38_small=output_b38_small,
out_b38_sv=output_b38_sv,
max_rcvs=args.max_rcvs,
)
parser.run()


def run_normalize_tsv(args):
Expand Down Expand Up @@ -129,10 +137,16 @@ def main(argv=None):
parser_parse_xml = subparsers.add_parser("parse_xml", help="Parse the Clinvar XML")
parser_parse_xml.add_argument("--clinvar-xml", required=True, help="Path to Clinvar XML file.")
parser_parse_xml.add_argument(
"--output-b37", required=True, help="Output path for GRCh37 file."
"--output-b37-small", required=True, help="Output path for small vars GRCh37 file."
)
parser_parse_xml.add_argument(
"--output-b38", required=True, help="Output path for GRCh38 file."
"--output-b37-sv", required=True, help="Output path for SV GRCh37 file."
)
parser_parse_xml.add_argument(
"--output-b38-small", required=True, help="Output path for small vars GRCh38 file."
)
parser_parse_xml.add_argument(
"--output-b38-sv", required=True, help="Output path for SV GRCh38 file."
)
parser_parse_xml.add_argument(
"--max-rcvs", required=False, type=int, help="Maximal number of RCV records to process."
Expand Down
13 changes: 9 additions & 4 deletions clinvar_tsv/common.py
Expand Up @@ -206,13 +206,15 @@ def from_element(cls, element: ET.Element):
elem.text for elem in element.findall('.//Symbol/ElementValue[@Type="Preferred"]')
]
hgnc_ids = [elem.attrib.get("ID") for elem in element.findall('.//XRef[@DB="HGNC"]')]
n = element.findall("./SequenceLocation")
m = element.findall(".//SequenceLocation")
return Measure(
measure_type=element.attrib.get("Type"),
symbols=tuple(sorted(set(symbols))),
hgnc_ids=tuple(sorted(set(hgnc_ids))),
sequence_locations={
elem.attrib.get("Assembly"): SequenceLocation.from_element(elem)
for elem in element.findall("./SequenceLocation")
for elem in (n or m)
},
comments=tuple(
elem.text
Expand Down Expand Up @@ -539,13 +541,11 @@ def from_element(element: ET.Element):
0: (
"uncertain significance",
# below: other
"affects",
"association",
"association not found",
"cancer",
"confers sensitivity",
"drug response",
"drug response",
"drug-response",
"histocompatibility",
"not provided",
Expand All @@ -557,6 +557,7 @@ def from_element(element: ET.Element):
"untested",
"variant of unknown significance",
"associated with leiomyomas",
"conflicting data from submitters",
),
1: (
"likely pathogenic",
Expand Down Expand Up @@ -613,7 +614,11 @@ def label(self):


_REVIEW_STATUS_LABELS = {
1: ("conflicting interpretations", "conflicting interpretations of pathogenicity"),
1: (
"conflicting interpretations",
"conflicting interpretations of pathogenicity",
"conflicting data from submitters",
),
2: ("criteria provided",),
3: ("multiple submitters",),
4: ("no assertion criteria provided",),
Expand Down
9 changes: 7 additions & 2 deletions clinvar_tsv/normalize.py
Expand Up @@ -153,6 +153,7 @@ def normalize_tab_delimited_file(infile, outfile, reference_fasta, verbose=True)
ref_equals_alt = 0
wrong_ref = 0
invalid_nucleotide = 0
unknown_contig = 0
for line in infile.readlines():
data = dict(zip(columns, line.strip("\n").split("\t")))
# fill the data with blanks for any missing data
Expand All @@ -176,6 +177,10 @@ def normalize_tab_delimited_file(infile, outfile, reference_fasta, verbose=True)
)
if data["chromosome"].startswith("chr"):
data["chromosome"] = data["chromosome"][3:]
except KeyError as e:
sys.stderr.write("\n" + str(e) + "\n")
unknown_contig += 1
continue
except RefEqualsAltError as e:
sys.stderr.write("\n" + str(e) + "\n")
ref_equals_alt += 1
Expand All @@ -200,6 +205,6 @@ def normalize_tab_delimited_file(infile, outfile, reference_fasta, verbose=True)
outfile.write("\n\n")
if verbose:
sys.stderr.write(
"Final counts of variants discarded:\nREF == ALT: %s\nWrong REF: %s\nInvalid nucleotide: %s\n"
% (ref_equals_alt, wrong_ref, invalid_nucleotide)
"Final counts of variants discarded:\nREF == ALT: %s\nWrong REF: %s\nInvalid nucleotide: %s\nUnknown contig: %s\n"
% (ref_equals_alt, wrong_ref, invalid_nucleotide, unknown_contig)
)
96 changes: 86 additions & 10 deletions clinvar_tsv/parse_clinvar_xml.py
Expand Up @@ -41,26 +41,46 @@
)


def try_map(values, converter, catch_exc):
"""Try to convert the given iterable of converters ignoring exceptions up to the last one"""
for i, value in enumerate(values):
try:
return converter(value)
except catch_exc:
if i + 1 == len(values):
raise # re-raise


class ClinvarParser:
"""Helper class for parsing Clinvar XML"""

def __init__(self, input_file, out_b37, out_b38, max_rcvs=None):
def __init__(
self, input_file, out_b37_small, out_b37_sv, out_b38_small, out_b38_sv, max_rcvs=None
):
#: ``file``-like object to load the XML from
self.input = input_file
#: ``file``-like object to write GRCh37 variants to
self.out_b37 = out_b37
#: ``file``-like object to write GRCh38 variants to
self.out_b38 = out_b38
#: ``file``-like object to write GRCh37 small variants to
self.out_b37_small = out_b37_small
#: ``file``-like object to write GRCh37 SVs to
self.out_b37_sv = out_b37_sv
#: ``file``-like object to write GRCh38 small variants to
self.out_b38_small = out_b38_small
#: ``file``-like object to write GRCh38 SVs to
self.out_b38_sv = out_b38_sv
#: Number of processed RCVs, used for progress disable.
self.rcvs = 0
#: Largest number of rcvs to process out (for testing only)
self.max_rcvs = max_rcvs

def run(self):
def run(self): # noqa: C901
logger.info("Parsing elements...")
out_files = {"GRCh37": self.out_b37, "GRCh38": self.out_b38}
for out_file in out_files.values():
print(TSV_HEADER, file=out_file)
out_files = {
"GRCh37": {"small": self.out_b37_small, "sv": self.out_b37_sv},
"GRCh38": {"small": self.out_b38_small, "sv": self.out_b38_sv},
}
for d in out_files.values():
for out_file in d.values():
print(TSV_HEADER, file=out_file)
with tqdm.tqdm(unit="rcvs") as progress:
for event, elem in ET.iterparse(self.input):
if elem.tag == "ClinVarSet" and event == "end":
Expand All @@ -76,6 +96,60 @@ def run(self):
for build, location in measure.sequence_locations.items():
if build not in out_files:
continue
if location.ref is None or location.alt is None:
if measure.measure_type in (
"single nucleotide variant",
"Variation",
):
continue # skip, just a small variant without a coordinate
try:
start = try_map(
(
location.start,
location.outer_start,
location.inner_start,
),
int,
TypeError,
)
stop = try_map(
(
location.stop,
location.outer_stop,
location.inner_stop,
),
int,
TypeError,
)
except TypeError:
logger.info(
"Cannot determine location from %s", measure
)
continue
row = [
build,
location.chrom,
start,
stop,
binning.assign_bin(start - 1, stop),
location.ref or ".",
location.alt or ".",
measure.measure_type,
as_pg_list(measure.symbols),
as_pg_list(measure.hgnc_ids),
measure_set.accession,
clinvar_set.ref_cv_assertion.clinvar_accession,
clinvar_set.ref_cv_assertion.review_status,
clinvar_set.ref_cv_assertion.gold_stars,
clinvar_set.ref_cv_assertion.pathogenicity,
origin,
json.dumps(
cattr.unstructure(clinvar_set), cls=DateTimeEncoder
)
.replace(r"\"", "'")
.replace('"', '"""'),
]
print("\t".join(map(str, row)), file=out_files[build]["sv"])
elif location.ref is not None and location.alt is not None:
if len(location.ref) == 1 and len(location.alt) == 1:
variation_type = "snv"
Expand Down Expand Up @@ -106,7 +180,9 @@ def run(self):
.replace(r"\"", "'")
.replace('"', '"""'),
]
print("\t".join(map(str, row)), file=out_files[build])
print(
"\t".join(map(str, row)), file=out_files[build]["small"]
)
progress.update()
elem.clear()
if self.max_rcvs and self.rcvs >= self.max_rcvs:
Expand Down
2 changes: 1 addition & 1 deletion requirements/dev.txt
Expand Up @@ -4,7 +4,7 @@
-r test.txt

# Prettier CLI for py.test
pytest-sugar==0.9.5
pytest-sugar

# Sphinx theme
sphinx_rtd_theme==0.1.9

0 comments on commit db44d87

Please sign in to comment.