/
GoTermsMap.py
52 lines (42 loc) · 3.71 KB
/
GoTermsMap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# ~~~~ Cera Fisher (2018)
# Takes a file of gene IDs and GoTerms from EnTap and makes a many-to-many tab delimited mapping file appropriate for use with GoSeq in R.
# Create the infile in bash using:
# cut -f 1,28,29,30 entap.final_annotations.tsv > entap_goTerms.tsv
#### TODO: Fix this so that the output is exactly what GoSeq needs - three columns with third column == GO id
import sys
infile=sys.argv[1]
outfile=sys.argv[2]
# This function takes as input the gene id, one of the three GOTerm fields from EnTAP, and an output file
# (specified as an argument when the script is run)
# It splits the list of GoTerms on the characters "GO:"
# (rather than the comma delimiter, because some GOTerms have commas in them)
# and writes one line with the gene id for each goterm
def writeGoLine(id, field, outfile):
if field != '': # We don't want to bother with blank fields.
Terms = field.split("GO:") # Splitting the field into an array, at the character "GO:" bc some GOterms have commas
for term in Terms: # Going to loop through this newly created array
term = term.strip() # strip spaces off both sides
term = term.rstrip(",") # strip the trailing comma
if term != '': # use this to skip the incidentally created blank first item
with open(outfile, "a") as mapping: # writing with with open bc it's better for memory
mapping.write(id+'\tGO:'+term+'\n') # Sneakily adding back in the "GO:" that we lost when we split the field on it
with open(infile, "r") as goterms: # Open the infile to read
for line in goterms: # This is where we read the file created from EnTAP output
Cols = line.split("\t") # Split apart the different fields
id = Cols[0].strip() # Get the transcript ID
goBio = Cols[1].strip() # Get the list of Biological Process terms
goCell = Cols[2].strip() # And the list of Cellular Component Terms
goMol = Cols[3].strip() # And the list of Molecular function terms
# Below, if you want ONLY one kind of annotation (bio process, cell. component)
# comment out the lines that write the kind of annotation you DON'T want.
writeGoLine(id, goBio, outfile) # Here we use the function defined above to write the one-to-one mapping file.
writeGoLine(id, goCell, outfile) # Note, you could modify this to specify three different output files, and
writeGoLine(id, goMol, outfile) # write each type of GO term to a separate file, but it's very unnecessary.
print("I think I'm done") # I like when my scripts talk to me.
# IMPORTANT NOTE: the output file may still need to be modified in order to use it with goseq. GoSeq wants only the GO identifier, e.g.
# GO:0032991 but our EnTAP annotation is a bit more verbose, listing the GO identifier and the term, a la
# GO:0003008-system process(L=2)
# For my own purposes, I decided to take the output file made by this script and, in bash, do
# cut -f 2 outputfile.txt (cutting the GOterm column) > fileofjustgoterms.txt
# sed -i "s/-.*$//g" (using regular expressions to find and replace the text that occurs right after the GO id, which starts with a hyphen
# paste outputfile.txt fileofjustgorterms.txt > newthreecolumnfile.txt