Skip to content

Commit 0b9336a

Browse files
authored
Merge pull request #681 from biolink/fix_isoforms
swaps subject with gene when subject is PR isoform
2 parents df8f688 + b76b665 commit 0b9336a

File tree

6 files changed

+4845
-4728
lines changed

6 files changed

+4845
-4728
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ poetry.lock
3838
pip-log.txt
3939
pip-delete-this-directory.txt
4040

41+
groups/
42+
4143
# Unit test / coverage reports
4244
htmlcov/
4345
.tox/

Makefile

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,17 @@ travis_test:
3232
tests/test_goassociation_model.py tests/test_relations.py \
3333
tests/unit/test_golr_search_query.py tests/unit/test_owlsim2_api.py \
3434
tests/test_collections.py \
35-
tests/test_gocamgen.py; \
35+
tests/test_gocamgen.py \
36+
tests/test_gpi_isoform_replacement.py; \
3637
else \
3738
pytest tests/test_*local*.py tests/test_*parse*.py tests/test*writer*.py tests/test_qc.py \
3839
tests/test_rdfgen.py tests/test_phenosim_engine.py tests/test_ontol.py \
3940
tests/test_validation_rules.py tests/unit/test_annotation_scorer.py \
4041
tests/test_goassociation_model.py tests/test_relations.py \
4142
tests/unit/test_golr_search_query.py tests/unit/test_owlsim2_api.py \
4243
tests/test_collections.py \
43-
tests/test_gocamgen.py; \
44+
tests/test_gocamgen.py \
45+
tests/test_gpi_isoform_replacement.py; \
4446
fi
4547

4648
cleandist:

bin/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ To test validate.py "validate" command, the command that produces the final GPAD
55

66
```bash
77
poetry install
8-
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset mgi MGI
9-
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset goa_chicken goa
8+
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset mgi MGI --gpad-gpi-output-version 2.0
9+
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset goa_chicken goa --gpad-gpi-output-version 2.0
1010
```
1111

1212

bin/validate.py

Lines changed: 109 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,14 @@
33
import click
44
import json
55
import os
6-
import yaml
76
import requests
87
import gzip
98
import urllib
109
import shutil
11-
import re
12-
import glob
1310
import logging
14-
import sys
1511
import traceback
16-
from typing import Dict, List
17-
import yamldown
18-
19-
from functools import wraps
20-
21-
# from ontobio.util.user_agent import get_user_agent
12+
from ontobio.model.association import Curie, ExtensionUnit
13+
from ontobio.io.entityparser import GpiParser
2214
from ontobio.ontol_factory import OntologyFactory
2315
from ontobio.io.gafparser import GafParser
2416
from ontobio.io.gpadparser import GpadParser
@@ -34,7 +26,6 @@
3426
from ontobio.validation import tools
3527
from ontobio.validation import rules
3628

37-
3829
from typing import Dict, Set
3930

4031
# logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.WARNING)
@@ -224,7 +215,8 @@ def create_parser(config, group, dataset, format="gaf"):
224215

225216
@tools.gzips
226217
def produce_gaf(dataset, source_gaf, ontology_graph, gpipaths=None, paint=False, group="unknown", rule_metadata=None,
227-
goref_metadata=None, ref_species_metadata=None, db_type_name_regex_id_syntax=None, retracted_pub_set=None, db_entities=None, group_idspace=None,
218+
goref_metadata=None, ref_species_metadata=None, db_type_name_regex_id_syntax=None,
219+
retracted_pub_set=None, db_entities=None, group_idspace=None,
228220
format="gaf", suppress_rule_reporting_tags=[], annotation_inferences=None, group_metadata=None,
229221
extensions_constraints=None, rule_contexts=[], gaf_output_version="2.2",
230222
rule_set=assocparser.RuleSet.ALL) -> list[str]:
@@ -613,7 +605,8 @@ def cli(ctx, verbose):
613605
@click.option("--only-dataset", default=None)
614606
@click.option("--gaf-output-version", default="2.2", type=click.Choice(["2.1", "2.2"]))
615607
@click.option("--rule-set", "-l", "rule_set", default=[assocparser.RuleSet.ALL], multiple=True)
616-
@click.option("--retracted_pub_set", type=click.Path(exists=True), default=None, required=False, help="Path to retracted publications file")
608+
@click.option("--retracted_pub_set", type=click.Path(exists=True), default=None, required=False,
609+
help="Path to retracted publications file")
617610
def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target, ontology, exclude, base_download_url,
618611
suppress_rule_reporting_tag, skip_existing_files, gaferencer_file, only_dataset, gaf_output_version,
619612
rule_set, retracted_pub_set):
@@ -676,7 +669,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
676669

677670
db_entities = metadata.database_entities(absolute_metadata)
678671
group_ids = metadata.groups(absolute_metadata)
679-
extensions_constraints = metadata.extensions_constraints_file(absolute_metadata)
672+
extensions_constraints = metadata.extensions_constraints_file(absolute_metadata)
680673

681674
gaferences = None
682675
if gaferencer_file:
@@ -685,21 +678,20 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
685678
# Default comes through as single-element tuple
686679
if rule_set == (assocparser.RuleSet.ALL,):
687680
rule_set = assocparser.RuleSet.ALL
688-
681+
689682
db_type_name_regex_id_syntax = metadata.database_type_name_regex_id_syntax(absolute_metadata)
690683

691-
retracted_pubs = None
692684
if retracted_pub_set:
693685
retracted_pubs = metadata.retracted_pub_set(retracted_pub_set)
694686
else:
695-
retracted_pubs = metadata.retracted_pub_set_from_meta(absolute_metadata)
687+
retracted_pubs = metadata.retracted_pub_set_from_meta(absolute_metadata)
696688

697689
for dataset_metadata, source_gaf in downloaded_gaf_sources:
698690
dataset = dataset_metadata["dataset"]
699691
# Set paint to True when the group is "paint".
700692
# This will prevent filtering of IBA (GO_RULE:26) when paint is being treated as a top level group,
701693
# like for paint_other.
702-
click.echo("source_gaf: {}".format(source_gaf))
694+
click.echo("Producing GAF by passing through validation rules... {}".format(dataset))
703695
valid_gaf = produce_gaf(dataset, source_gaf, ontology_graph,
704696
paint=(group == "paint"),
705697
group=group,
@@ -719,10 +711,14 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
719711
rule_set=rule_set
720712
)[0]
721713

714+
click.echo("Producing GPI from GAF files...")
722715
gpi = produce_gpi(dataset, absolute_target, valid_gaf, ontology_graph, gpad_gpi_output_version)
723716

724717
gpi_list = [gpi]
725-
# Try to find other GPIs in metadata and merge
718+
719+
matching_gpi_path = None
720+
click.echo("Try to find other GPIs in metadata and merge...")
721+
726722
for ds in group_metadata["datasets"]:
727723
# Where type=GPI for the same dataset (e.g. "zfin", "goa_cow")
728724
if ds["type"] == "gpi" and ds["dataset"] == dataset and ds.get("source"):
@@ -732,6 +728,9 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
732728
matching_gpi_path = unzip_simple(matching_gpi_path)
733729
gpi_list.append(matching_gpi_path)
734730

731+
click.echo("Found the matching gpi path...{}".format(matching_gpi_path))
732+
733+
click.echo("Downloading the noctua and paint GPAD files...")
735734
noctua_gpad_src = check_and_download_mixin_source(noctua_metadata, group_metadata["id"], dataset, target,
736735
base_download_url=base_download_url,
737736
replace_existing_files=not skip_existing_files)
@@ -740,6 +739,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
740739
replace_existing_files=not skip_existing_files)
741740
if paint_metadata else None)
742741

742+
click.echo("Executing 'make_gpads' in validate.produce with all the assembled GAF files...")
743743
make_gpads(dataset, valid_gaf, products,
744744
ontology_graph, noctua_gpad_src, paint_gaf_src,
745745
gpi, gpad_gpi_output_version)
@@ -750,9 +750,92 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
750750
rule_metadata=rule_metadata, replace_existing_files=not skip_existing_files,
751751
gaf_output_version=gaf_output_version)
752752

753-
click.echo(end_gaf)
753+
click.echo("Executing the isoform fixing step in validate.produce...")
754+
# run the resulting gaf through one last parse and replace, to handle the isoforms
755+
# see: https://github.com/geneontology/go-site/issues/2291
756+
output_gaf_path = os.path.join(os.path.split(end_gaf)[0], "{}.gaf".format(dataset))
757+
isoform_fixed_gaf = fix_pro_isoforms_in_gaf(end_gaf, matching_gpi_path, ontology_graph, output_gaf_path)
758+
click.echo(isoform_fixed_gaf)
754759

755-
make_ttls(dataset, end_gaf, products, ontology_graph)
760+
click.echo("Creating ttl files...")
761+
make_ttls(dataset, isoform_fixed_gaf, products, ontology_graph)
762+
763+
764+
def fix_pro_isoforms_in_gaf(gaf_file_to_fix: str, gpi_file: str, ontology_graph, output_file_path: str) -> str:
    """
    Rewrite a GAF file so that annotations made to PR isoforms are made to the encoding gene instead.

    Every association whose subject namespace starts with "PR" gets its subject id replaced by the
    first ``encoded_by`` value recorded for that isoform in the GPI file, and its full_name, label,
    synonyms and type replaced by the values of the gene's own GPI entry. The original isoform id is
    appended to the association's ``subject_extensions`` collection. All other associations are
    passed through unchanged.

    See: https://github.com/geneontology/go-site/issues/2291

    :param gaf_file_to_fix: The path to the GAF file to fix
    :param gpi_file: The path to the GPI file
    :param ontology_graph: The ontology graph to use for parsing the associations
    :param output_file_path: The path to write the fixed GAF file to
    :return: The path to the fixed GAF file (``output_file_path``)
    :raises KeyError: if a PR subject, or the gene it is encoded by, has no entry in the GPI file
    """
    gpiparser = GpiParser(config=assocparser.AssocParserConfig(ontology=ontology_graph))
    # Parse the GPI file, creating a map of identifiers to GPI entries
    gpi_map = {}
    for gpi_entry in gpiparser.parse(gpi_file, None):
        gpi_map[gpi_entry.get('id')] = {"encoded_by": gpi_entry.get('encoded_by'),
                                        "full_name": gpi_entry.get('full_name'),
                                        "label": gpi_entry.get('label'),
                                        "synonyms": gpi_entry.get('synonyms'),
                                        # GPI spec says this is single valued, but GpiParser returns this as a list.
                                        "type": gpi_entry.get('type')[0],
                                        "id": gpi_entry.get('id')}

    gafparser = GafParser(config=assocparser.AssocParserConfig(ontology=ontology_graph))
    # Read the parser's version before any line is parsed, so the writer is configured with the same
    # value it received when it was constructed ahead of the parse loop.
    writer_version = gafparser.version

    fixed_associations = []
    # these are statistic parameters that record when a substitution is made.
    substitution_count = 0
    no_substitution_count = 0

    with open(gaf_file_to_fix, "r") as gaf_in:
        for line in gaf_in:
            for source_assoc in gafparser.parse_line(line).associations:
                if isinstance(source_assoc, dict):
                    continue  # skip the header

                subject = source_assoc.subject
                if subject.id.namespace.startswith("PR"):
                    old_namespace = subject.id.namespace
                    old_identity = subject.id.identity
                    full_old_identifier = old_namespace + ":" + old_identity

                    # TODO: right now we get the FIRST encoded_by result -- this is what the original
                    # script from Chris did??
                    encoded_by_parts = gpi_map[full_old_identifier].get("encoded_by")[0].split(":")
                    subject.id.namespace = encoded_by_parts[0]
                    if encoded_by_parts[0] == "MGI":
                        # MGI values split into three parts; the local id keeps its own "MGI:" prefix.
                        subject.id.identity = "MGI:" + encoded_by_parts[2]
                    else:
                        subject.id.identity = encoded_by_parts[1]

                    # Descriptive fields come from the GPI entry of the gene, not of the isoform.
                    gene_entry = gpi_map[subject.id.namespace + ":" + subject.id.identity]
                    subject.full_name = gene_entry.get("full_name")
                    subject.label = gene_entry.get("label")
                    subject.synonyms = gene_entry.get("synonyms")
                    subject.type = gene_entry.get("type")

                    # we need to put the isoform currently being swapped, back into "Column 17" which is a
                    # subject_extension member.
                    isoform_term = Curie(namespace=old_namespace, identity=old_identity)
                    isoform_relation = Curie(namespace="RO", identity="0002327")
                    source_assoc.subject_extensions.append(
                        ExtensionUnit(relation=isoform_relation, term=isoform_term))

                    # count the substitution here for reporting later
                    substitution_count += 1
                else:
                    no_substitution_count += 1

                fixed_associations.append(source_assoc)

    # The output is opened only after the input has been fully read and closed, so the input is not
    # truncated if both paths point at the same file; the context manager flushes and closes it.
    with open(output_file_path, "w") as gaf_out:
        # NOTE(review): source="test" looks like a leftover placeholder -- confirm the intended value.
        gafwriter = GafWriter(file=gaf_out, source="test", version=writer_version)
        gafwriter.write(fixed_associations)

    click.echo(f"Substituted {substitution_count} entries in {gaf_file_to_fix} "
               f"and left {no_substitution_count} entries unchanged.")
    return output_file_path
756839

757840

758841
@cli.command()
@@ -808,14 +891,14 @@ def paint(group, dataset, metadata, target, ontology):
808891
absolute_target = os.path.abspath(target)
809892
os.makedirs(os.path.join(absolute_target, "groups"), exist_ok=True)
810893
paint_metadata = metadata.dataset_metadata_file(absolute_metadata, "paint")
811-
894+
812895
paint_src_gaf = check_and_download_mixin_source(paint_metadata, dataset, absolute_target)
813896

814897
click.echo("Loading ontology: {}...".format(ontology))
815898
ontology_graph = OntologyFactory().create(ontology)
816899

817900
gpi_path = os.path.join(absolute_target, "groups", dataset, "{}.gpi".format(dataset))
818-
click.echo("Using GPI at {}".format(gpi_path))
901+
click.echo("Using GPI at {}".format(gpi_path))
819902
paint_gaf = produce_gaf("paint_{}".format(dataset), paint_src_gaf, ontology_graph, gpipath=gpi_path)
820903

821904

@@ -825,7 +908,8 @@ def paint(group, dataset, metadata, target, ontology):
825908
@click.option("--ontology", type=click.Path(), required=True)
826909
@click.option("--gaferencer-file", "-I", type=click.Path(exists=True), default=None, required=False,
827910
help="Path to Gaferencer output to be used for inferences")
828-
@click.option("--retracted_pub_set", type=click.Path(exists=True), default=None, required=False, help="Path to retracted publications file")
911+
@click.option("--retracted_pub_set", type=click.Path(exists=True), default=None, required=False,
912+
help="Path to retracted publications file")
829913
def rule(metadata_dir, out, ontology, gaferencer_file, retracted_pub_set):
830914
absolute_metadata = os.path.abspath(metadata_dir)
831915

@@ -840,8 +924,7 @@ def rule(metadata_dir, out, ontology, gaferencer_file, retracted_pub_set):
840924
if retracted_pub_set:
841925
retracted_pubs = metadata.retracted_pub_set(retracted_pub_set)
842926
else:
843-
retracted_pubs = metadata.retracted_pub_set_from_meta(absolute_metadata)
844-
927+
retracted_pubs = metadata.retracted_pub_set_from_meta(absolute_metadata)
845928

846929
click.echo("Found {} GO Rules".format(len(gorule_metadata.keys())))
847930

0 commit comments

Comments
 (0)