From 9fd9ce11cc745df39b8cfc23279122ec6b9cdeb2 Mon Sep 17 00:00:00 2001 From: Hongxin <5400599+zhx828@users.noreply.github.com> Date: Mon, 23 Aug 2021 15:14:36 -0400 Subject: [PATCH] Split citations column to multiple based on data fields Update test_Annotation.py --- AnnotatorCore.py | 34 +++++++++++++++++++-------- README.md | 29 ++++++++++++----------- test_Annotation.py | 57 +++++++++++++++++++++++----------------------- 3 files changed, 69 insertions(+), 51 deletions(-) diff --git a/AnnotatorCore.py b/AnnotatorCore.py index 4d3ebbf..da23e26 100644 --- a/AnnotatorCore.py +++ b/AnnotatorCore.py @@ -367,16 +367,17 @@ def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerTy outf.write("\t" + VARIANT_IN_ONCOKB_HEADER) outf.write("\tMUTATION_EFFECT") + outf.write("\tMUTATION_EFFECT_CITATIONS") outf.write("\tONCOGENIC") - newncols += 4 + newncols += 5 for l in levels: outf.write('\t' + l) newncols += len(levels) outf.write("\tHIGHEST_LEVEL") - outf.write("\tCITATIONS") + outf.write("\tTX_CITATIONS") newncols += 2 for l in dxLevels: @@ -384,14 +385,16 @@ def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerTy newncols += len(dxLevels) outf.write("\tHIGHEST_DX_LEVEL") - newncols += 1 + outf.write("\tDX_CITATIONS") + newncols += 2 for l in pxLevels: outf.write('\t' + l) newncols += len(pxLevels) outf.write("\tHIGHEST_PX_LEVEL") - newncols += 1 + outf.write("\tPX_CITATIONS") + newncols += 2 outf.write("\n") @@ -1417,8 +1420,11 @@ def gettumortypename(tumortype): return tumortype['mainType']['name'] -def getimplications(oncokbdata, levels, implications): +def getimplications(oncokbdata, implication_type, levels, implications): + citation_column_key = implication_type + '_citations' for implication in implications: + oncokbdata[citation_column_key] = appendoncokbcitations(oncokbdata[citation_column_key], implication['pmids'], + implication['abstracts']) level = implication['levelOfEvidence'] if level is not None: @@ -1629,8 +1635,12 @@ def process_oncokb_annotation(annotation, annotate_hotspot): oncokbdata[GENE_IN_ONCOKB_HEADER] = GENE_IN_ONCOKB_DEFAULT oncokbdata[VARIANT_IN_ONCOKB_HEADER] = VARIANT_IN_ONCOKB_DEFAULT oncokbdata['mutation_effect'] = "" + oncokbdata['mutation_effect_citations'] = [] oncokbdata['citations'] = [] oncokbdata['oncogenic'] = "" + oncokbdata['tx_citations'] = [] + oncokbdata['dx_citations'] = [] + oncokbdata['px_citations'] = [] try: # oncogenic @@ -1646,7 +1656,7 @@ def process_oncokb_annotation(annotation, annotate_hotspot): # mutation effect if (annotation['mutationEffect'] is not None): oncokbdata['mutation_effect'] = annotation['mutationEffect']['knownEffect'] - oncokbdata['citations'] = appendoncokbcitations(oncokbdata['citations'], + oncokbdata['mutation_effect_citations'] = appendoncokbcitations(oncokbdata['mutation_effect_citations'], annotation['mutationEffect']['citations']['pmids'], annotation['mutationEffect']['citations']['abstracts']) @@ -1663,7 +1673,7 @@ def process_oncokb_annotation(annotation, annotate_hotspot): else: drugs = treatment['drugs'] - oncokbdata['citations'] = appendoncokbcitations(oncokbdata['citations'], treatment['pmids'], + oncokbdata['tx_citations'] = appendoncokbcitations(oncokbdata['tx_citations'], treatment['pmids'], treatment['abstracts']) if len(drugs) == 0: @@ -1676,10 +1686,10 @@ def process_oncokb_annotation(annotation, annotate_hotspot): if treatmentname not in oncokbdata[level]: oncokbdata[level].append('+'.join(drugnames)) if annotation['diagnosticImplications'] is not None: - getimplications(oncokbdata, dxLevels, annotation['diagnosticImplications']) + getimplications(oncokbdata, 'dx', dxLevels, annotation['diagnosticImplications']) if annotation['prognosticImplications'] is not None: - getimplications(oncokbdata, pxLevels, annotation['prognosticImplications']) + getimplications(oncokbdata, 'px', pxLevels, annotation['prognosticImplications']) oncokbdata['highestDiagnosticImplicationLevel'] = annotation['highestDiagnosticImplicationLevel'] oncokbdata['highestPrognosticImplicationLevel'] = annotation['highestPrognosticImplicationLevel'] @@ -1701,18 +1711,22 @@ def process_oncokb_annotation(annotation, annotate_hotspot): ret.append(oncokbdata[GENE_IN_ONCOKB_HEADER]) ret.append(oncokbdata[VARIANT_IN_ONCOKB_HEADER]) ret.append(oncokbdata['mutation_effect']) + ret.append(';'.join(oncokbdata['mutation_effect_citations'])) ret.append(oncokbdata['oncogenic']) for l in levels: ret.append(','.join(oncokbdata[l])) ret.append(gethighestsensitivitylevel(oncokbdata)) - ret.append(';'.join(oncokbdata['citations'])) + ret.append(';'.join(oncokbdata['tx_citations'])) + for l in dxLevels: ret.append(','.join(oncokbdata[l])) ret.append(gethighestDxPxlevel(dxLevels, [oncokbdata['highestDiagnosticImplicationLevel']])) + ret.append(';'.join(oncokbdata['dx_citations'])) for l in pxLevels: ret.append(','.join(oncokbdata[l])) ret.append(gethighestDxPxlevel(pxLevels, [oncokbdata['highestPrognosticImplicationLevel']])) + ret.append(';'.join(oncokbdata['px_citations'])) return ret diff --git a/README.md b/README.md index 106d12a..feb68da 100644 --- a/README.md +++ b/README.md @@ -93,19 +93,22 @@ python ${FILE_NAME.py} -i ${INPUT_FILE} -o ${OUTPUT_FILE} -b ${ONCOKB_API_TOKEN} ## Columns added in the annotation files -| Column | Possible Values | Description | -|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| GENE_IN_ONCOKB | TRUE, FALSE | Whether the gene has been curated by the OncoKB Team | -| VARIANT_IN_ONCOKB | TRUE, FALSE | Whether the variant has been curated by the OncoKB Team. Note: when a variant does not exist, it may still have annotations. | -| MUTATION_EFFECT | Gain-of-function, Likely Gain-of-function, Loss-of-function, Likely Loss-of-function, Switch-of-function, Likely Switch-of-function, Neutral, Likely Neutral, Inconclusive, Unknown | The biological effect of a mutation/alteration on the protein function that gives rise to changes in the biological properties of cells expressing the mutant/altered protein compared to cells expressing the wildtype protein. | -| ONCOGENIC | Oncogenic, Likely Oncogenic, Likely Neutral, Inconclusive, Unknown, Resistance | In OncoKB, “oncogenic” is defined as “referring to the ability to induce or cause cancer” as described in the second edition of The Biology of Cancer by Robert Weinberg (2014). | -| LEVEL_* | Therapeutic implications | The leveled therapeutic implications | -| HIGHEST_LEVEL | LEVEL_1, LEVEL_2, LEVEL_3A, LEVEL_3B, LEVEL_4, LEVEL_R1, LEVEL_R2 | The highest level of evidence for therapeutic implications | -| CITATIONS | PMID, Abstract, Website Link | All citations related to a mutation/alteration | -| LEVEL_Dx* | Tumor type the level of evidence is assigned to | The leveled diagnostic implications | -| HIGHEST_DX_LEVEL | LEVEL_Dx1, LEVEL_Dx2, LEVEL_Dx3 | The highest level of evidence for diagnostic implications | -| LEVEL_Px* | Tumor type the level of evidence is assigned to | The leveled prognostic implications | -| HIGHEST_PX_LEVEL | LEVEL_Px1, LEVEL_Px2, LEVEL_Px3 | The highest level of evidence for prognostic implications | +| Column | Possible Values | Description | +|---------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| GENE_IN_ONCOKB | TRUE, FALSE | Whether the gene has been curated by the OncoKB Team | +| VARIANT_IN_ONCOKB | TRUE, FALSE | Whether the variant has been curated by the OncoKB Team. Note: when a variant does not exist, it may still have annotations. | +| MUTATION_EFFECT | Gain-of-function, Likely Gain-of-function, Loss-of-function, Likely Loss-of-function, Switch-of-function, Likely Switch-of-function, Neutral, Likely Neutral, Inconclusive, Unknown | The biological effect of a mutation/alteration on the protein function that gives rise to changes in the biological properties of cells expressing the mutant/altered protein compared to cells expressing the wildtype protein. | +| MUTATION_EFFECT_CITATIONS | PMID, Abstract, Website Link | All citations related to the biological effect | +| ONCOGENIC | Oncogenic, Likely Oncogenic, Likely Neutral, Inconclusive, Unknown, Resistance | In OncoKB, “oncogenic” is defined as “referring to the ability to induce or cause cancer” as described in the second edition of The Biology of Cancer by Robert Weinberg (2014). | +| LEVEL_* | Therapeutic implications | The leveled therapeutic implications | +| HIGHEST_LEVEL | LEVEL_1, LEVEL_2, LEVEL_3A, LEVEL_3B, LEVEL_4, LEVEL_R1, LEVEL_R2 | The highest level of evidence for therapeutic implications | +| TX_CITATIONS | PMID, Abstract, Website Link | All citations related to therapeutic implications | +| LEVEL_Dx* | Tumor type the level of evidence is assigned to | The leveled diagnostic implications | +| HIGHEST_DX_LEVEL | LEVEL_Dx1, LEVEL_Dx2, LEVEL_Dx3 | The highest level of evidence for diagnostic implications | +| DX_CITATIONS | PMID, Abstract, Website Link | All citations related to diagnostic implications | +| LEVEL_Px* | Tumor type the level of evidence is assigned to | The leveled prognostic implications | +| HIGHEST_PX_LEVEL | LEVEL_Px1, LEVEL_Px2, LEVEL_Px3 | The highest level of evidence for prognostic implications | +| PX_CITATIONS | PMID, Abstract, Website Link | All citations related to prognostic implications | ## Questions? The best way is to email contact@oncokb.org so all our team members can help. diff --git a/test_Annotation.py b/test_Annotation.py index add4d9e..6124ef3 100644 --- a/test_Annotation.py +++ b/test_Annotation.py @@ -11,20 +11,21 @@ VARIANT_EXISTS_INDEX = 1 MUTATION_EFFECT_INDEX = 2 -ONCOGENIC_INDEX = 3 -LEVEL_1_INDEX =4 -LEVEL_2_INDEX = 5 -LEVEL_3A_INDEX = 6 -HIGHEST_LEVEL_INDEX = 12 -HIGHEST_DX_LEVEL_INDEX = 17 -HIGHEST_PX_LEVEL_INDEX = 21 +ONCOGENIC_INDEX = 4 +LEVEL_1_INDEX =5 +LEVEL_2_INDEX = 6 +LEVEL_3A_INDEX = 7 +HIGHEST_LEVEL_INDEX = 13 +HIGHEST_DX_LEVEL_INDEX = 18 +HIGHEST_PX_LEVEL_INDEX = 23 UNKNOWN = 'Unknown' +NUMBER_OF_ANNOTATION_COLUMNS = 25 def fake_gene_one_query_suite(annotations): assert len(annotations) == 1 annotation = annotations[0] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == UNKNOWN assert annotation[ONCOGENIC_INDEX] == '' assert annotation[HIGHEST_LEVEL_INDEX] == '' @@ -41,13 +42,13 @@ def test_check_protein_change(): assert len(annotations) == 2 annotation = annotations[0] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' annotation = annotations[1] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == '' @@ -105,25 +106,25 @@ def test_check_atypical_alts(): assert len(annotations) == 4 annotation = annotations[0] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == UNKNOWN assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' annotation = annotations[1] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == UNKNOWN assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == '' annotation = annotations[2] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == '' annotation_dup = annotations[3] - assert len(annotation_dup) == 22 + assert len(annotation_dup) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation == annotation_dup @@ -139,19 +140,19 @@ def test_check_hgvsg(): assert len(annotations) == 3 annotation = annotations[0] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' annotation = annotations[1] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' annotation = annotations[2] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == '' @@ -168,19 +169,19 @@ def test_check_genomic_change(): assert len(annotations) == 3 annotation = annotations[0] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' annotation = annotations[1] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' annotation = annotations[2] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == '' @@ -197,19 +198,19 @@ def test_check_fusions(): assert len(annotations) == 3 annotation = annotations[0] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' annotation = annotations[1] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_3B' annotation = annotations[2] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == '' @@ -240,25 +241,25 @@ def test_cna(): assert len(annotations) == 4 annotation = annotations[0] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Loss-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' annotation = annotations[1] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' annotation = annotations[2] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_2' annotation = annotations[3] - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Loss-of-function' assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == '' @@ -276,7 +277,7 @@ def test_fake_cna(): fake_gene_one_query_suite(annotations) def check_brca2_n3214i_without_cancertype(annotation): - assert len(annotation) == 22 + assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Likely Loss-of-function' assert annotation[ONCOGENIC_INDEX] == 'Likely Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1'