3
3
import click
4
4
import json
5
5
import os
6
- import yaml
7
6
import requests
8
7
import gzip
9
8
import urllib
10
9
import shutil
11
- import re
12
- import glob
13
10
import logging
14
- import sys
15
11
import traceback
16
- from typing import Dict , List
17
- import yamldown
18
-
19
- from functools import wraps
20
-
21
- # from ontobio.util.user_agent import get_user_agent
12
+ from ontobio .model .association import Curie , ExtensionUnit
13
+ from ontobio .io .entityparser import GpiParser
22
14
from ontobio .ontol_factory import OntologyFactory
23
15
from ontobio .io .gafparser import GafParser
24
16
from ontobio .io .gpadparser import GpadParser
34
26
from ontobio .validation import tools
35
27
from ontobio .validation import rules
36
28
37
-
38
29
from typing import Dict , Set
39
30
40
31
# logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.WARNING)
@@ -224,7 +215,8 @@ def create_parser(config, group, dataset, format="gaf"):
224
215
225
216
@tools .gzips
226
217
def produce_gaf (dataset , source_gaf , ontology_graph , gpipaths = None , paint = False , group = "unknown" , rule_metadata = None ,
227
- goref_metadata = None , ref_species_metadata = None , db_type_name_regex_id_syntax = None , retracted_pub_set = None , db_entities = None , group_idspace = None ,
218
+ goref_metadata = None , ref_species_metadata = None , db_type_name_regex_id_syntax = None ,
219
+ retracted_pub_set = None , db_entities = None , group_idspace = None ,
228
220
format = "gaf" , suppress_rule_reporting_tags = [], annotation_inferences = None , group_metadata = None ,
229
221
extensions_constraints = None , rule_contexts = [], gaf_output_version = "2.2" ,
230
222
rule_set = assocparser .RuleSet .ALL ) -> list [str ]:
@@ -613,7 +605,8 @@ def cli(ctx, verbose):
613
605
@click .option ("--only-dataset" , default = None )
614
606
@click .option ("--gaf-output-version" , default = "2.2" , type = click .Choice (["2.1" , "2.2" ]))
615
607
@click .option ("--rule-set" , "-l" , "rule_set" , default = [assocparser .RuleSet .ALL ], multiple = True )
616
- @click .option ("--retracted_pub_set" , type = click .Path (exists = True ), default = None , required = False , help = "Path to retracted publications file" )
608
+ @click .option ("--retracted_pub_set" , type = click .Path (exists = True ), default = None , required = False ,
609
+ help = "Path to retracted publications file" )
617
610
def produce (ctx , group , metadata_dir , gpad , gpad_gpi_output_version , ttl , target , ontology , exclude , base_download_url ,
618
611
suppress_rule_reporting_tag , skip_existing_files , gaferencer_file , only_dataset , gaf_output_version ,
619
612
rule_set , retracted_pub_set ):
@@ -676,7 +669,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
676
669
677
670
db_entities = metadata .database_entities (absolute_metadata )
678
671
group_ids = metadata .groups (absolute_metadata )
679
- extensions_constraints = metadata .extensions_constraints_file (absolute_metadata )
672
+ extensions_constraints = metadata .extensions_constraints_file (absolute_metadata )
680
673
681
674
gaferences = None
682
675
if gaferencer_file :
@@ -685,21 +678,20 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
685
678
# Default comes through as single-element tuple
686
679
if rule_set == (assocparser .RuleSet .ALL ,):
687
680
rule_set = assocparser .RuleSet .ALL
688
-
681
+
689
682
db_type_name_regex_id_syntax = metadata .database_type_name_regex_id_syntax (absolute_metadata )
690
683
691
- retracted_pubs = None
692
684
if retracted_pub_set :
693
685
retracted_pubs = metadata .retracted_pub_set (retracted_pub_set )
694
686
else :
695
- retracted_pubs = metadata .retracted_pub_set_from_meta (absolute_metadata )
687
+ retracted_pubs = metadata .retracted_pub_set_from_meta (absolute_metadata )
696
688
697
689
for dataset_metadata , source_gaf in downloaded_gaf_sources :
698
690
dataset = dataset_metadata ["dataset" ]
699
691
# Set paint to True when the group is "paint".
700
692
# This will prevent filtering of IBA (GO_RULE:26) when paint is being treated as a top level group,
701
693
# like for paint_other.
702
- click .echo ("source_gaf: {}" .format (source_gaf ))
694
+ click .echo ("Producing GAF by passing through validation rules... {}" .format (dataset ))
703
695
valid_gaf = produce_gaf (dataset , source_gaf , ontology_graph ,
704
696
paint = (group == "paint" ),
705
697
group = group ,
@@ -719,10 +711,14 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
719
711
rule_set = rule_set
720
712
)[0 ]
721
713
714
+ click .echo ("Producing GPI from GAF files..." )
722
715
gpi = produce_gpi (dataset , absolute_target , valid_gaf , ontology_graph , gpad_gpi_output_version )
723
716
724
717
gpi_list = [gpi ]
725
- # Try to find other GPIs in metadata and merge
718
+
719
+ matching_gpi_path = None
720
+ click .echo ("Try to find other GPIs in metadata and merge..." )
721
+
726
722
for ds in group_metadata ["datasets" ]:
727
723
# Where type=GPI for the same dataset (e.g. "zfin", "goa_cow")
728
724
if ds ["type" ] == "gpi" and ds ["dataset" ] == dataset and ds .get ("source" ):
@@ -732,6 +728,9 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
732
728
matching_gpi_path = unzip_simple (matching_gpi_path )
733
729
gpi_list .append (matching_gpi_path )
734
730
731
+ click .echo ("Found the matching gpi path...{}" .format (matching_gpi_path ))
732
+
733
+ click .echo ("Downloading the noctua and paint GPAD files..." )
735
734
noctua_gpad_src = check_and_download_mixin_source (noctua_metadata , group_metadata ["id" ], dataset , target ,
736
735
base_download_url = base_download_url ,
737
736
replace_existing_files = not skip_existing_files )
@@ -740,6 +739,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
740
739
replace_existing_files = not skip_existing_files )
741
740
if paint_metadata else None )
742
741
742
+ click .echo ("Executing 'make_gpads' in validate.produce with all the assembled GAF files..." )
743
743
make_gpads (dataset , valid_gaf , products ,
744
744
ontology_graph , noctua_gpad_src , paint_gaf_src ,
745
745
gpi , gpad_gpi_output_version )
@@ -750,9 +750,92 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
750
750
rule_metadata = rule_metadata , replace_existing_files = not skip_existing_files ,
751
751
gaf_output_version = gaf_output_version )
752
752
753
- click .echo (end_gaf )
753
+ click .echo ("Executing the isoform fixing step in validate.produce..." )
754
+ # run the resulting gaf through one last parse and replace, to handle the isoforms
755
+ # see: https://github.com/geneontology/go-site/issues/2291
756
+ output_gaf_path = os .path .join (os .path .split (end_gaf )[0 ], "{}.gaf" .format (dataset ))
757
+ isoform_fixed_gaf = fix_pro_isoforms_in_gaf (end_gaf , matching_gpi_path , ontology_graph , output_gaf_path )
758
+ click .echo (isoform_fixed_gaf )
754
759
755
- make_ttls (dataset , end_gaf , products , ontology_graph )
760
+ click .echo ("Creating ttl files..." )
761
+ make_ttls (dataset , isoform_fixed_gaf , products , ontology_graph )
762
+
763
+
764
def fix_pro_isoforms_in_gaf(gaf_file_to_fix: str, gpi_file: str, ontology_graph, output_file_path: str) -> str:
    """
    Given a GAF file and a GPI file, fix the GAF file by converting isoform annotations to gene annotations.

    Every association whose subject namespace starts with "PR" is re-pointed at the gene named by the first
    ``encoded_by`` value of its GPI entry: the subject id, full_name, label, synonyms and type are replaced with
    the gene's values from the GPI file, and the original isoform id is appended to the association's
    subject_extensions so it is not lost. All other associations are passed through unchanged.

    An isoform that cannot be resolved (no GPI file given, no GPI entry for the isoform, no usable ``encoded_by``
    value, or no GPI entry for the encoding gene) is left untouched, logged as a warning, and counted as unchanged.

    :param gaf_file_to_fix: The path to the GAF file to fix
    :param gpi_file: The path to the GPI file; if None or empty, no substitution is performed
    :param ontology_graph: The ontology graph to use for parsing the associations
    :param output_file_path: The path to write the fixed GAF file to
    :return: The path to the fixed GAF file
    """
    # Parse the GPI file, creating a map of identifiers to GPI entries
    gpi_map = {}
    if gpi_file:
        gpiparser = GpiParser(config=assocparser.AssocParserConfig(ontology=ontology_graph))
        for gpi_entry in gpiparser.parse(gpi_file, None):
            entity_types = gpi_entry.get('type')
            gpi_map[gpi_entry.get('id')] = {"encoded_by": gpi_entry.get('encoded_by'),
                                            "full_name": gpi_entry.get('full_name'),
                                            "label": gpi_entry.get('label'),
                                            "synonyms": gpi_entry.get('synonyms'),
                                            # GPI spec says this is single valued, but GpiParser returns this
                                            # as a list.
                                            "type": entity_types[0] if entity_types else None,
                                            "id": gpi_entry.get('id')}

    gafparser = GafParser(config=assocparser.AssocParserConfig(ontology=ontology_graph))
    # Read the version before any line is parsed: this is the value the writer has always been given.
    # NOTE(review): parsing the header may update gafparser.version -- confirm which value is intended.
    writer_version = gafparser.version

    # these are statistic parameters that record when a substitution is made.
    substitution_count = 0
    no_substitution_count = 0
    fixed_associations = []

    # The whole input is consumed before the output is opened, so the input is not truncated when
    # output_file_path and gaf_file_to_fix are the same file.
    with open(gaf_file_to_fix, "r") as gaf_in:
        for line in gaf_in:
            parsed = gafparser.parse_line(line)
            for source_assoc in parsed.associations:
                if isinstance(source_assoc, dict):
                    continue  # skip the header

                subject = source_assoc.subject
                if not subject.id.namespace.startswith("PR"):
                    no_substitution_count += 1
                    fixed_associations.append(source_assoc)
                    continue

                old_namespace = subject.id.namespace
                old_identity = subject.id.identity
                full_old_identifier = old_namespace + ":" + old_identity

                # Resolve everything before touching the association, so an unresolvable isoform is
                # never left half-rewritten.
                isoform_entry = gpi_map.get(full_old_identifier)
                gene_id = _first_encoding_gene(isoform_entry.get("encoded_by")) if isoform_entry else None
                gene_entry = gpi_map.get(gene_id[0] + ":" + gene_id[1]) if gene_id else None
                if gene_entry is None:
                    logging.getLogger(__name__).warning(
                        "Could not resolve the gene for isoform %s from the GPI; leaving it unchanged",
                        full_old_identifier)
                    no_substitution_count += 1
                    fixed_associations.append(source_assoc)
                    continue

                subject.id.namespace, subject.id.identity = gene_id
                subject.full_name = gene_entry.get("full_name")
                subject.label = gene_entry.get("label")
                subject.synonyms = gene_entry.get("synonyms")
                subject.type = gene_entry.get("type")

                # we need to put the isoform currently being swapped, back into "Column 17" which is a
                # subject_extension member.
                isoform_term = Curie(namespace=old_namespace, identity=old_identity)
                isoform_relation = Curie(namespace="RO", identity="0002327")
                source_assoc.subject_extensions.append(
                    ExtensionUnit(relation=isoform_relation, term=isoform_term))

                # count the substitution here for reporting later
                substitution_count += 1
                fixed_associations.append(source_assoc)

    # The context manager closes (and therefore flushes) the output file once everything is written.
    with open(output_file_path, "w") as gaf_out:
        gafwriter = GafWriter(file=gaf_out, source="test", version=writer_version)
        gafwriter.write(fixed_associations)

    click.echo(f"Substituted {substitution_count} entries in {gaf_file_to_fix} "
               f"and left {no_substitution_count} entries unchanged.")
    return output_file_path


def _first_encoding_gene(encoded_by):
    """
    Split the FIRST ``encoded_by`` value of a GPI entry into a ``(namespace, identity)`` pair.

    :param encoded_by: The ``encoded_by`` list of a GPI entry (may be None or empty)
    :return: The ``(namespace, identity)`` tuple, or None when there is no value or it has too few parts
    """
    # TODO: right now we get the FIRST encoded_by result -- this is what the original script from Chris did??
    if not encoded_by:
        return None
    parts = encoded_by[0].split(":")
    if parts[0] == "MGI":
        # MGI values are read as three colon-separated parts; the identity keeps its own "MGI:" prefix.
        return ("MGI", "MGI:" + parts[2]) if len(parts) > 2 else None
    return (parts[0], parts[1]) if len(parts) > 1 else None
756
839
757
840
758
841
@cli .command ()
@@ -808,14 +891,14 @@ def paint(group, dataset, metadata, target, ontology):
808
891
absolute_target = os .path .abspath (target )
809
892
os .makedirs (os .path .join (absolute_target , "groups" ), exist_ok = True )
810
893
paint_metadata = metadata .dataset_metadata_file (absolute_metadata , "paint" )
811
-
894
+
812
895
paint_src_gaf = check_and_download_mixin_source (paint_metadata , dataset , absolute_target )
813
896
814
897
click .echo ("Loading ontology: {}..." .format (ontology ))
815
898
ontology_graph = OntologyFactory ().create (ontology )
816
899
817
900
gpi_path = os .path .join (absolute_target , "groups" , dataset , "{}.gpi" .format (dataset ))
818
- click .echo ("Using GPI at {}" .format (gpi_path ))
901
+ click .echo ("Using GPI at {}" .format (gpi_path ))
819
902
paint_gaf = produce_gaf ("paint_{}" .format (dataset ), paint_src_gaf , ontology_graph , gpipath = gpi_path )
820
903
821
904
@@ -825,7 +908,8 @@ def paint(group, dataset, metadata, target, ontology):
825
908
@click .option ("--ontology" , type = click .Path (), required = True )
826
909
@click .option ("--gaferencer-file" , "-I" , type = click .Path (exists = True ), default = None , required = False ,
827
910
help = "Path to Gaferencer output to be used for inferences" )
828
- @click .option ("--retracted_pub_set" , type = click .Path (exists = True ), default = None , required = False , help = "Path to retracted publications file" )
911
+ @click .option ("--retracted_pub_set" , type = click .Path (exists = True ), default = None , required = False ,
912
+ help = "Path to retracted publications file" )
829
913
def rule (metadata_dir , out , ontology , gaferencer_file , retracted_pub_set ):
830
914
absolute_metadata = os .path .abspath (metadata_dir )
831
915
@@ -840,8 +924,7 @@ def rule(metadata_dir, out, ontology, gaferencer_file, retracted_pub_set):
840
924
if retracted_pub_set :
841
925
retracted_pubs = metadata .retracted_pub_set (retracted_pub_set )
842
926
else :
843
- retracted_pubs = metadata .retracted_pub_set_from_meta (absolute_metadata )
844
-
927
+ retracted_pubs = metadata .retracted_pub_set_from_meta (absolute_metadata )
845
928
846
929
click .echo ("Found {} GO Rules" .format (len (gorule_metadata .keys ())))
847
930
0 commit comments