Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix/sap 11534 handle clnhgvs using measure set name #58

Open
wants to merge 23 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
4056282
Added absolute paths for scripts so that pipeline can be run in a doc…
jonkerry May 10, 2018
35f692a
Changes to column names and introduction of dbsnp by parsing of rsID …
jonkerry May 24, 2018
a29a74b
Added "type" in merge and to the final list of headers. SAP-4037
jonkerry May 29, 2018
e031fe3
Added function for translating clinical_significance terms to those r…
jonkerry May 29, 2018
15e0d3a
Altered VCF output to not include a bunch of columns that we are not …
jonkerry May 31, 2018
1108bdb
SAP-7990 : Modify XML parser to include variants with non-standard VC…
jonkerry Feb 4, 2019
764f817
Merge pull request #12 from Congenica/feature/SAP-7990_include_skippe…
jonkerry Feb 14, 2019
5d8f875
SAP-6265 : Use most pathogenic submission for conflicted variants
jonkerry Feb 14, 2019
9753f0f
Merge pull request #13 from Congenica/feature/SAP-6265_most_severe_pa…
jonkerry Feb 15, 2019
679b9f2
SAP-8224 : Output expected variant numbers in ClinVar VCF header
jonkerry Feb 20, 2019
82bbbf2
Merge pull request #14 from Congenica/feature/SAP-8224_expected_varia…
jonkerry Feb 21, 2019
ca4bc16
SAP-8219 : Add "rs" the front of the rsID
jonkerry Feb 25, 2019
48f7129
Merge pull request #15 from Congenica/feature/SAP-8219_add_rs_to_vari…
jonkerry Feb 25, 2019
7eb8576
SAP-8222 : Add "Haplotype;Phase unknown" to expected measureset types
jonkerry Feb 26, 2019
680bbd9
Merge pull request #16 from Congenica/feature/SAP-8222_compatibility_…
jonkerry Feb 26, 2019
ac819fc
SAP-8433 : Add pathogenicity mapping for old terms
jonkerry Mar 5, 2019
58e6615
Merge pull request #17 from Congenica/feature/SAP-8433_use_refClinVar…
jonkerry Mar 6, 2019
c895f74
SAP-8478 : Add human-readable descriptions for INFO fields in VCF header
jonkerry Mar 6, 2019
47a7acd
Merge pull request #18 from Congenica/feature/SAP-8478_human_readable…
jonkerry Mar 7, 2019
85b9b1c
SAP-8360 : Pass ClinVar version to VCF writer to add version and date…
jonkerry Mar 7, 2019
ca287a8
Merge pull request #19 from Congenica/feature/SAP-8360_cvl_descriptio…
jonkerry Mar 11, 2019
89d3d28
SAP-11534 : set CLNGHVS field to var_name as fallback
kevin-dunnicliffe Oct 28, 2019
60b0e7f
SAP-11534 : improve clnghvs handling by fallback to MeasureSet.name
kevin-dunnicliffe Nov 1, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
12 changes: 7 additions & 5 deletions src/check_allele_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,15 @@
assert int(record['pos']) > 0 and int(record['pos']) < 3*10**8, 'Unexpected "pos" column value: ' + record['pos']
assert all(b in 'ACGTN' for b in record['ref']), 'Unexpected "ref" column value: ' + record['ref']
assert all(b in 'ACGTN' for b in record['alt']), 'Unexpected "alt" column value: ' + record['alt'] # there's one clinvar record with ALT = "NTGT". Not sure how to handle it.
assert record['variation_type'] in ["Variant", "Haplotype", "CompoundHeterozygote", "Phase unknown", "Distinct chromosomes", "CompoundHeterozygote;Haplotype", "Variant;gene-variant"], \
'Unexpected "variation_type" column value: ' + record['variation_type'] # there's one clinvar record with ALT = "NTGT". Not sure how to handle it.
assert record['measureset_type'] in [
"Variant", "Haplotype", "CompoundHeterozygote", "Phase unknown", "Distinct chromosomes",
"CompoundHeterozygote;Haplotype", "Variant;gene-variant", "Haplotype;Phase unknown"
], 'Unexpected "measureset_type" column value: ' + record['measureset_type']

assert len(map(int, record['variation_id'].split(';'))) > 0, 'Unexpected "variation_id" column value: ' + record['variation_id']
assert len(map(lambda rcv: int(rcv.strip('RCV')), record['rcv'].split(';'))) > 0, 'Unexpected "rcv" column value: ' + record['rcv']
assert len(map(int, record['measureset_id'].split(';'))) > 0, 'Unexpected "measureset_id" column value: ' + record['measureset_id']
assert len(map(lambda rcv: int(rcv.strip('RCV')), record['clnacc'].split(';'))) > 0, 'Unexpected "clnacc" column value: ' + record['clnacc']
assert int(record['allele_id']) > 0, 'Unexpected "rcv" column value: ' + record['allele_id']
assert len(record['hgvs_c']) == 0 or "c." in record['hgvs_c'], 'Unexpected "hgvs_c" column value: ' + record['hgvs_c']
assert len(record['clnhgvs']) == 0 or "c." in record['clnhgvs'], 'Unexpected "hgvs_c" column value: ' + record['clnhgvs']
assert len(record['hgvs_p']) == 0 or "p." in record['hgvs_p'], 'Unexpected "hgvs_p" column value: ' + record['hgvs_p']
#assert record['molecular_consequence'], 'Unexpected "molecular_consequence" column value: ' + record['molecular_consequence']

Expand Down
6 changes: 3 additions & 3 deletions src/clinvar_alleles_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
alleles_name = sys.argv[1]

columns_to_summarize = [
'variation_type', 'clinical_significance',
'review_status', 'gold_stars', 'all_submitters',
'measureset_type', 'original_clnsig',
'clnrevstat', 'gold_stars', 'all_submitters',
'inheritance_modes', 'age_of_onset', 'prevalence', 'disease_mechanism',
'origin']
'clnorigin']

df = pd.read_csv(alleles_name, sep="\t", compression='gzip', index_col=False)

Expand Down
73 changes: 61 additions & 12 deletions src/clinvar_table_to_vcf.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import argparse
import calendar
import collections
import gzip
import os
import re
import pandas as pd
import sys

from parse_clinvar_xml import HEADER
from join_variant_summary_with_clinvar_alleles import FINAL_HEADER


def gzopen(path, mode='r', verbose=True):
Expand All @@ -16,7 +17,7 @@ def gzopen(path, mode='r', verbose=True):
return open(path, mode)


def table_to_vcf(input_table_path, input_reference_genome):
def table_to_vcf(input_table_path, input_reference_genome, version):
# validate args
input_reference_genome_fai = input_reference_genome + ".fai"
if not os.path.isfile(input_table_path):
Expand All @@ -35,11 +36,59 @@ def table_to_vcf(input_table_path, input_reference_genome):
print("""##fileformat=VCFv4.1\n##source=clinvar""")

descriptions = {
'gold_stars': "Number of gold stars as shown on clinvar web pages to summarize review status. Lookup table described at http://www.ncbi.nlm.nih.gov/clinvar/docs/details/ was used to map the REVIEW_STATUS value to this number.",
'gold_stars': "Number of Gold Stars (numeric representation of review status)",
'clnsig': "Most Severe ClinVar Pathogenicity",
'original_clnsig': "Clinical Significance",
'pathogenic': "Number of 'Pathogenic' Submissions",
'likely_pathogenic': "Number of 'Likely pathogenic' Submissions",
'uncertain_significance': "Number of 'Uncertain significance' Submissions",
'likely_benign': "Number of 'Likely benign' Submissions",
'benign': "Number of 'Benign' Submissions",
'conflicted': "Conflicting Pathogenicities (0 = False; 1 = True)",
'clnhgvs': "HGVSc",
'clnrevstat': "Review Status",
'clndbn': "Phenotypes",
'clnorigin': "Allele Origin",
'rs': "rsID",
'all_pmids': "Pubmed IDs Documenting Evidence of Phenotypes",
'clnacc': "RCV Accession Number",
'measureset_type': "Measureset Type",
'measureset_id': "Measureset ID",
'allele_id': "Allele ID",
'symbol': "Gene Symbol",
'molecular_consequence': "Molecular Consequence",
'hgvs_p': "HGVSp",
'all_submitters': "Submitters of Variant Phenotype",
'inheritance_modes': "Modes of Inheritance",
'xrefs': "Cross-References to other Data Sources"
}
for key in HEADER:
print("""##INFO=<ID={},Number=1,Type=String,Description="{}">"""
.format(key.upper(), descriptions.get(key, key.upper())))
not_required_in_header_or_info = ['chrom', 'pos', 'ref', 'alt', 'start', 'stop', 'strand',
'clinical_significance_ordered', 'review_status_ordered',
'dates_ordered', 'last_evaluated', 'submitters_ordered', 'scv', 'type',
'inheritance_modes', 'age_of_onset', 'prevalence', 'disease_mechanism']
single_value_fields = ['symbol', 'pathogenic', 'likely_pathogenic', 'uncertain_significance', 'likely_benign',
'benign', 'clnsig', 'gold_stars', 'conflicted']
for key in FINAL_HEADER:
if key not in not_required_in_header_or_info:
number = '1' if key in single_value_fields else '.'
print("""##INFO=<ID={},Number={},Type=String,Description="{}">""".format(
key.upper(), number, descriptions.get(key, key.upper())))

version_date = re.search(r'(?P<year>\d{4})-(?P<month>\d{2})', version)
try:
month_string = calendar.month_name[int(version_date.group('month'))]
except IndexError:
raise ValueError('Cannot convert version month value "{}" to a known month (must be between 01-12)'.format(
version_date.group('month')
))

print('##CONGENICA_VCF_VALIDATION=<ID=CLINVAR_MASTER_NUM_TOTAL_VARIANTS,VALUE={},DESCRIPTION="Number of variants '
'in VCF"'.format(t.shape[0]))
print('##CURATED_VARIANT_LIST_INFO=<ID=VERSION,VALUE="{}",DESCRIPTION="Version of source data">'.format(version))
print('##CURATED_VARIANT_LIST_INFO=<ID=DESCRIPTION,VALUE="ClinVar variants (SNVs and indels) in the {}-{} '
'version of the ClinVar dataset",DESCRIPTION="Description for the Curated Variant List">'.format(
version_date.group('year'), month_string))

with open(input_reference_genome_fai) as in_fai:
for line in in_fai:
chrom, length, _ = line.split("\t", 2)
Expand All @@ -52,7 +101,7 @@ def table_to_vcf(input_table_path, input_reference_genome):
vcf_row = []
vcf_row.append(table_row["chrom"])
vcf_row.append(table_row["pos"])
vcf_row.append('.') # ID
vcf_row.append(table_row["rs"]) # ID = rsID
vcf_row.append(table_row["ref"])
vcf_row.append(table_row["alt"])
vcf_row.append('.') # QUAL
Expand All @@ -64,14 +113,13 @@ def table_to_vcf(input_table_path, input_reference_genome):
# INFO - additional information: (String, no white-space, semi-colons, or equals-signs permitted; commas are
# permitted only as delimiters for lists of values) INFO fields are encoded as a semicolon-separated series of short
# keys with optional values in the format: <key>=<data>[,data].
loc_column = ['chrom', 'pos', 'ref', 'alt']
for key in HEADER:
if key not in loc_column:
for key in FINAL_HEADER:
if key not in not_required_in_header_or_info:
if pd.isnull(table_row[key]):
continue
value = str(table_row[key])
value = re.sub('\s*[,]\s*', '..', value) # replace , with ..
value = re.sub('\s*[;]\s*', '|', value) # replace ; with |
value = re.sub('\s*[;]\s*', ',', value) # replace ; with |
value = value.replace("=", " eq ").replace(" ", "_")

info_field[key.upper()] = value
Expand All @@ -86,6 +134,7 @@ def table_to_vcf(input_table_path, input_reference_genome):
parser = argparse.ArgumentParser()
parser.add_argument('input_table_path', help="Tab-delimited input table")
parser.add_argument('input_reference_genome', help="Reference FASTA used. The associated .fai file, e.g. b38.fa.fai, is necessary for the VCF header generation")
parser.add_argument('clinvar_version', help="Version of ClinVar e.g. 2019-02")
args = parser.parse_args()

table_to_vcf(args.input_table_path, args.input_reference_genome)
table_to_vcf(args.input_table_path, args.input_reference_genome, args.clinvar_version)
60 changes: 53 additions & 7 deletions src/join_variant_summary_with_clinvar_alleles.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,50 @@
#!/usr/bin/env python
from collections import OrderedDict
import sys
import pandas as pd

from parse_clinvar_xml import HEADER

FINAL_HEADER = HEADER + ['gold_stars', 'conflicted']
FINAL_HEADER = HEADER + ['clnsig', 'type', 'gold_stars', 'conflicted']


def convert_terms_to_clnsig(terms):
clnsig_list = [term.lower() for term in terms.split(',')]
classifications = OrderedDict([
('Pathogenic', ('pathogenic/likely pathogenic', 'pathogenic')),
('Likely pathogenic', ('likely pathogenic',)),
('Uncertain significance', ('uncertain significance',)),
('Likely benign', ('benign/likely benign', 'likely benign')),
('Benign', ('benign',)),
('', ('association not found', 'affects', 'drug response', 'confers sensitivity',
'risk factor', 'other', 'association', 'protective', 'not provided',
'conflicting data from submitters', 'conflicting interpretations of '
'pathogenicity', 'no interpretation for the single variant'))
])
for clnsig, original in classifications.items():
if not set(clnsig_list).isdisjoint(original):
return clnsig

raise ValueError('Unrecognised clinical significance term in {}'.format(clnsig_list))


def determine_most_pathogenic_submission(row):
if row['pathogenic'] > 0:
return 'Pathogenic'
elif row['likely_pathogenic'] > 0:
return 'Likely pathogenic'
elif row['uncertain_significance'] > 0:
return 'Uncertain significance'
elif row['likely_benign'] > 0:
return 'Likely benign'
elif row['benign'] > 0:
return 'Benign'
else:
# may need to revisit this code to determine what to with these instances, really they shouldn't be classified
# as conflicted but the earlier join can be ambiguous due to a variant in variant_summary.txt.gz appearing in
# both the multi and single files with different numbers of submissions. For now, retain the current
# functionality of leaving the field blank for the conflicted variant
return ''


def join_variant_summary_with_clinvar_alleles(
Expand All @@ -31,10 +71,10 @@ def join_variant_summary_with_clinvar_alleles(
variant_summary = variant_summary[
variant_summary['assembly'] == genome_build_id]
variant_summary = variant_summary[
['allele_id', 'clinicalsignificance', 'reviewstatus','lastevaluated']]
['allele_id', 'clinicalsignificance', 'reviewstatus', 'lastevaluated', 'type']]
variant_summary = variant_summary.rename(
columns={'clinicalsignificance': 'clinical_significance',
'reviewstatus': 'review_status',
columns={'clinicalsignificance': 'original_clnsig',
'reviewstatus': 'clnrevstat',
'lastevaluated':'last_evaluated'})

# remove the duplicated records in variant summary due to alternative loci such as PAR but would be problematic for rare cases like translocation
Expand All @@ -43,7 +83,7 @@ def join_variant_summary_with_clinvar_alleles(

# remove clinical_significance and review_status from clinvar_alleles:
clinvar_alleles = clinvar_alleles.drop(
['clinical_significance', 'review_status','last_evaluated'], axis=1)
['original_clnsig', 'clnrevstat', 'last_evaluated'], axis=1)

# pandas is sensitive to some rows having allele_id joined on ;, causing
# an object dtype, with some entries being ints and others strs
Expand All @@ -57,6 +97,9 @@ def join_variant_summary_with_clinvar_alleles(
on='allele_id', how='inner')
print "merged raw", df.shape

# map clinical_significance to clnsig
df['clnsig'] = df.original_clnsig.map(convert_terms_to_clnsig)

# map review_status to gold starts:
gold_star_map = {
'no assertion provided': 0,
Expand All @@ -69,13 +112,16 @@ def join_variant_summary_with_clinvar_alleles(
'practice guideline': 4,
'-':'-'
}
df['gold_stars'] = df.review_status.map(gold_star_map)
df['gold_stars'] = df.clnrevstat.map(gold_star_map)

# The use of expressions on clinical significance on ClinVar aggregate records (RCV) https://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/#conflicts
# conflicted = 1 if using "conflicting"
df['conflicted'] = df['clinical_significance'].str.contains(
df['conflicted'] = df['original_clnsig'].str.contains(
"onflicting", case=False).astype(int)

conflicted_variants = df['conflicted'] == 1
df.loc[conflicted_variants, 'clnsig'] = df[conflicted_variants].apply(determine_most_pathogenic_submission, axis=1)

# reorder columns just in case
df = df.ix[:, FINAL_HEADER]
print "merged final", df.shape
Expand Down