macarthur-lab · kevin-dunnicliffe · May 10, 2018 · May 24, 2018 · May 29, 2018 · May 29, 2018
diff --git a/src/check_allele_table.py b/src/check_allele_table.py
@@ -34,13 +34,15 @@
         assert int(record['pos']) > 0 and int(record['pos']) < 3*10**8, 'Unexpected "pos" column value: ' + record['pos']
         assert all(b in 'ACGTN' for b in record['ref']), 'Unexpected "ref" column value: ' + record['ref']
         assert all(b in 'ACGTN' for b in record['alt']), 'Unexpected "alt" column value: ' + record['alt']  # there's one clinvar record with ALT = "NTGT". Not sure how to handle it.
-        assert record['variation_type'] in ["Variant", "Haplotype", "CompoundHeterozygote", "Phase unknown", "Distinct chromosomes", "CompoundHeterozygote;Haplotype", "Variant;gene-variant"], \
-            'Unexpected "variation_type" column value: ' + record['variation_type']  # there's one clinvar record with ALT = "NTGT". Not sure how to handle it.
+        assert record['measureset_type'] in [
+            "Variant", "Haplotype", "CompoundHeterozygote", "Phase unknown", "Distinct chromosomes",
+            "CompoundHeterozygote;Haplotype", "Variant;gene-variant", "Haplotype;Phase unknown"
+        ], 'Unexpected "measureset_type" column value: ' + record['measureset_type']
 
-        assert len(map(int, record['variation_id'].split(';'))) > 0, 'Unexpected "variation_id" column value: ' + record['variation_id']
-        assert len(map(lambda rcv: int(rcv.strip('RCV')), record['rcv'].split(';'))) > 0, 'Unexpected "rcv" column value: ' + record['rcv']
+        assert len(map(int, record['measureset_id'].split(';'))) > 0, 'Unexpected "measureset_id" column value: ' + record['measureset_id']
+        assert len(map(lambda rcv: int(rcv.strip('RCV')), record['clnacc'].split(';'))) > 0, 'Unexpected "clnacc" column value: ' + record['clnacc']
         assert int(record['allele_id']) > 0, 'Unexpected "rcv" column value: ' + record['allele_id']
-        assert len(record['hgvs_c']) == 0 or "c." in record['hgvs_c'], 'Unexpected "hgvs_c" column value: ' + record['hgvs_c']
+        assert len(record['clnhgvs']) == 0 or "c." in record['clnhgvs'], 'Unexpected "hgvs_c" column value: ' + record['clnhgvs']
         assert len(record['hgvs_p']) == 0 or "p." in record['hgvs_p'], 'Unexpected "hgvs_p" column value: ' + record['hgvs_p']
         #assert record['molecular_consequence'], 'Unexpected "molecular_consequence" column value: ' + record['molecular_consequence']
 

diff --git a/src/clinvar_alleles_stats.py b/src/clinvar_alleles_stats.py
@@ -10,10 +10,10 @@
 alleles_name = sys.argv[1]
 
 columns_to_summarize = [
-    'variation_type', 'clinical_significance',
-    'review_status', 'gold_stars', 'all_submitters',
+    'measureset_type', 'original_clnsig',
+    'clnrevstat', 'gold_stars', 'all_submitters',
     'inheritance_modes', 'age_of_onset', 'prevalence', 'disease_mechanism',
-    'origin']
+    'clnorigin']
 
 df = pd.read_csv(alleles_name, sep="\t", compression='gzip', index_col=False)
 

diff --git a/src/clinvar_table_to_vcf.py b/src/clinvar_table_to_vcf.py
@@ -1,12 +1,13 @@
 import argparse
+import calendar
 import collections
 import gzip
 import os
 import re
 import pandas as pd
 import sys
 
-from parse_clinvar_xml import HEADER
+from join_variant_summary_with_clinvar_alleles import FINAL_HEADER
 
 
 def gzopen(path, mode='r', verbose=True):
@@ -16,7 +17,7 @@ def gzopen(path, mode='r', verbose=True):
         return open(path, mode)
 
 
-def table_to_vcf(input_table_path, input_reference_genome):
+def table_to_vcf(input_table_path, input_reference_genome, version):
     # validate args
     input_reference_genome_fai = input_reference_genome + ".fai"
     if not os.path.isfile(input_table_path):
@@ -35,11 +36,59 @@ def table_to_vcf(input_table_path, input_reference_genome):
     print("""##fileformat=VCFv4.1\n##source=clinvar""")
 
     descriptions = {
-        'gold_stars': "Number of gold stars as shown on clinvar web pages to summarize review status. Lookup table described at http://www.ncbi.nlm.nih.gov/clinvar/docs/details/ was used to map the REVIEW_STATUS value to this number.",
+        'gold_stars': "Number of Gold Stars (numeric representation of review status)",
+        'clnsig': "Most Severe ClinVar Pathogenicity",
+        'original_clnsig': "Clinical Significance",
+        'pathogenic': "Number of 'Pathogenic' Submissions",
+        'likely_pathogenic': "Number of 'Likely pathogenic' Submissions",
+        'uncertain_significance': "Number of 'Uncertain significance' Submissions",
+        'likely_benign': "Number of 'Likely benign' Submissions",
+        'benign': "Number of 'Benign' Submissions",
+        'conflicted': "Conflicting Pathogenicities (0 = False; 1 = True)",
+        'clnhgvs': "HGVSc",
+        'clnrevstat': "Review Status",
+        'clndbn': "Phenotypes",
+        'clnorigin': "Allele Origin",
+        'rs': "rsID",
+        'all_pmids': "Pubmed IDs Documenting Evidence of Phenotypes",
+        'clnacc': "RCV Accession Number",
+        'measureset_type': "Measureset Type",
+        'measureset_id': "Measureset ID",
+        'allele_id': "Allele ID",
+        'symbol': "Gene Symbol",
+        'molecular_consequence': "Molecular Consequence",
+        'hgvs_p': "HGVSp",
+        'all_submitters': "Submitters of Variant Phenotype",
+        'inheritance_modes': "Modes of Inheritance",
+        'xrefs': "Cross-References to other Data Sources"
     }
-    for key in HEADER:
-        print("""##INFO=<ID={},Number=1,Type=String,Description="{}">"""
-              .format(key.upper(), descriptions.get(key, key.upper())))
+    not_required_in_header_or_info = ['chrom', 'pos', 'ref', 'alt', 'start', 'stop', 'strand',
+                                      'clinical_significance_ordered', 'review_status_ordered',
+                                      'dates_ordered', 'last_evaluated', 'submitters_ordered', 'scv', 'type',
+                                      'inheritance_modes', 'age_of_onset', 'prevalence', 'disease_mechanism']
+    single_value_fields = ['symbol', 'pathogenic', 'likely_pathogenic', 'uncertain_significance', 'likely_benign',
+                           'benign', 'clnsig', 'gold_stars', 'conflicted']
+    for key in FINAL_HEADER:
+        if key not in not_required_in_header_or_info:
+            number = '1' if key in single_value_fields else '.'
+            print("""##INFO=<ID={},Number={},Type=String,Description="{}">""".format(
+                key.upper(), number, descriptions.get(key, key.upper())))
+
+    version_date = re.search(r'(?P<year>\d{4})-(?P<month>\d{2})', version)
+    try:
+        month_string = calendar.month_name[int(version_date.group('month'))]
+    except IndexError:
+        raise ValueError('Cannot convert version month value "{}" to a known month (must be between 01-12)'.format(
+            version_date.group('month')
+        ))
+
+    print('##CONGENICA_VCF_VALIDATION=<ID=CLINVAR_MASTER_NUM_TOTAL_VARIANTS,VALUE={},DESCRIPTION="Number of variants '
+          'in VCF"'.format(t.shape[0]))
+    print('##CURATED_VARIANT_LIST_INFO=<ID=VERSION,VALUE="{}",DESCRIPTION="Version of source data">'.format(version))
+    print('##CURATED_VARIANT_LIST_INFO=<ID=DESCRIPTION,VALUE="ClinVar variants (SNVs and indels) in the {}-{} '
+          'version of the ClinVar dataset",DESCRIPTION="Description for the Curated Variant List">'.format(
+              version_date.group('year'), month_string))
+
     with open(input_reference_genome_fai) as in_fai:
         for line in in_fai:
             chrom, length, _ = line.split("\t", 2)
@@ -52,7 +101,7 @@ def table_to_vcf(input_table_path, input_reference_genome):
         vcf_row = []
         vcf_row.append(table_row["chrom"])
         vcf_row.append(table_row["pos"])
-        vcf_row.append('.')  # ID
+        vcf_row.append(table_row["rs"])  # ID = rsID
         vcf_row.append(table_row["ref"])
         vcf_row.append(table_row["alt"])
         vcf_row.append('.')  # QUAL
@@ -64,14 +113,13 @@ def table_to_vcf(input_table_path, input_reference_genome):
         #    INFO - additional information: (String, no white-space, semi-colons, or equals-signs permitted; commas are
         #    permitted only as delimiters for lists of values) INFO fields are encoded as a semicolon-separated series of short
         #    keys with optional values in the format: <key>=<data>[,data].
-        loc_column = ['chrom', 'pos', 'ref', 'alt']
-        for key in HEADER:
-            if key not in loc_column:
+        for key in FINAL_HEADER:
+            if key not in not_required_in_header_or_info:
                 if pd.isnull(table_row[key]):
                     continue
                 value = str(table_row[key])
                 value = re.sub('\s*[,]\s*', '..', value)  # replace , with ..
-                value = re.sub('\s*[;]\s*', '|', value)  # replace ; with |
+                value = re.sub('\s*[;]\s*', ',', value)  # replace ; with |
                 value = value.replace("=", " eq ").replace(" ", "_")
 
                 info_field[key.upper()] = value
@@ -86,6 +134,7 @@ def table_to_vcf(input_table_path, input_reference_genome):
     parser = argparse.ArgumentParser()
     parser.add_argument('input_table_path', help="Tab-delimited input table")
     parser.add_argument('input_reference_genome', help="Reference FASTA used. The associated .fai file, e.g. b38.fa.fai, is necessary for the VCF header generation")
+    parser.add_argument('clinvar_version', help="Version of ClinVar e.g. 2019-02")
     args = parser.parse_args()
 
-    table_to_vcf(args.input_table_path, args.input_reference_genome)
+    table_to_vcf(args.input_table_path, args.input_reference_genome, args.clinvar_version)
diff --git a/src/join_variant_summary_with_clinvar_alleles.py b/src/join_variant_summary_with_clinvar_alleles.py
@@ -1,10 +1,50 @@
 #!/usr/bin/env python
+from collections import OrderedDict
 import sys
 import pandas as pd
 
 from parse_clinvar_xml import HEADER
 
-FINAL_HEADER = HEADER + ['gold_stars', 'conflicted']
+FINAL_HEADER = HEADER + ['clnsig', 'type', 'gold_stars', 'conflicted']
+
+
+def convert_terms_to_clnsig(terms):
+    clnsig_list = [term.lower() for term in terms.split(',')]
+    classifications = OrderedDict([
+        ('Pathogenic', ('pathogenic/likely pathogenic', 'pathogenic')),
+        ('Likely pathogenic', ('likely pathogenic',)),
+        ('Uncertain significance', ('uncertain significance',)),
+        ('Likely benign', ('benign/likely benign', 'likely benign')),
+        ('Benign', ('benign',)),
+        ('', ('association not found', 'affects', 'drug response', 'confers sensitivity',
+              'risk factor', 'other', 'association', 'protective', 'not provided',
+              'conflicting data from submitters', 'conflicting interpretations of '
+              'pathogenicity', 'no interpretation for the single variant'))
+    ])
+    for clnsig, original in classifications.items():
+        if not set(clnsig_list).isdisjoint(original):
+            return clnsig
+
+    raise ValueError('Unrecognised clinical significance term in {}'.format(clnsig_list))
+
+
+def determine_most_pathogenic_submission(row):
+    if row['pathogenic'] > 0:
+        return 'Pathogenic'
+    elif row['likely_pathogenic'] > 0:
+        return 'Likely pathogenic'
+    elif row['uncertain_significance'] > 0:
+        return 'Uncertain significance'
+    elif row['likely_benign'] > 0:
+        return 'Likely benign'
+    elif row['benign'] > 0:
+        return 'Benign'
+    else:
+        # may need to revisit this code to determine what to with these instances, really they shouldn't be classified
+        # as conflicted but the earlier join can be ambiguous due to a variant in variant_summary.txt.gz appearing in
+        # both the multi and single files with different numbers of submissions. For now, retain the current
+        # functionality of leaving the field blank for the conflicted variant
+        return ''
 
 
 def join_variant_summary_with_clinvar_alleles(
@@ -31,10 +71,10 @@ def join_variant_summary_with_clinvar_alleles(
     variant_summary = variant_summary[
         variant_summary['assembly'] == genome_build_id]
     variant_summary = variant_summary[
-        ['allele_id', 'clinicalsignificance', 'reviewstatus','lastevaluated']]
+        ['allele_id', 'clinicalsignificance', 'reviewstatus', 'lastevaluated', 'type']]
     variant_summary = variant_summary.rename(
-        columns={'clinicalsignificance': 'clinical_significance',
-                 'reviewstatus': 'review_status',
+        columns={'clinicalsignificance': 'original_clnsig',
+                 'reviewstatus': 'clnrevstat',
                  'lastevaluated':'last_evaluated'})
 
     # remove the duplicated records in variant summary due to alternative loci such as PAR but would be problematic for rare cases like translocation
@@ -43,7 +83,7 @@ def join_variant_summary_with_clinvar_alleles(
 
     # remove clinical_significance and review_status from clinvar_alleles:
     clinvar_alleles = clinvar_alleles.drop(
-        ['clinical_significance', 'review_status','last_evaluated'], axis=1)
+        ['original_clnsig', 'clnrevstat', 'last_evaluated'], axis=1)
 
     # pandas is sensitive to some rows having allele_id joined on ;, causing
     # an object dtype, with some entries being ints and others strs
@@ -57,6 +97,9 @@ def join_variant_summary_with_clinvar_alleles(
                   on='allele_id', how='inner')
     print "merged raw", df.shape
 
+    # map clinical_significance to clnsig
+    df['clnsig'] = df.original_clnsig.map(convert_terms_to_clnsig)
+
     # map review_status to gold starts:
     gold_star_map = {
         'no assertion provided': 0,
@@ -69,13 +112,16 @@ def join_variant_summary_with_clinvar_alleles(
         'practice guideline': 4,
         '-':'-'
     }
-    df['gold_stars'] = df.review_status.map(gold_star_map)
+    df['gold_stars'] = df.clnrevstat.map(gold_star_map)
 
     # The use of expressions on clinical significance on ClinVar aggregate records (RCV) https://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/#conflicts
     # conflicted = 1 if using "conflicting"
-    df['conflicted'] = df['clinical_significance'].str.contains(
+    df['conflicted'] = df['original_clnsig'].str.contains(
         "onflicting", case=False).astype(int)
 
+    conflicted_variants = df['conflicted'] == 1
+    df.loc[conflicted_variants, 'clnsig'] = df[conflicted_variants].apply(determine_most_pathogenic_submission, axis=1)
+
     # reorder columns just in case
     df = df.ix[:, FINAL_HEADER]
     print "merged final", df.shape