/
clusterDomStrs.py
79 lines (65 loc) · 3.29 KB
/
clusterDomStrs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Copyright (C) 2016 Emmanuel LC. de los Santos
# University of Warwick
# Warwick Integrative Synthetic Biology Centre
#
# License: GNU Affero General Public License v3 or later
# A copy of GNU AGPL v3 should have been included in this software package in LICENSE.txt.
'''
This file is part of clusterTools.
clusterTools is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
clusterTools is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with clusterTools. If not, see <http://www.gnu.org/licenses/>.
'''
import numpy as np
import align,aligntools,hungarian
from scipy.optimize import linear_sum_assignment
from itertools import chain,count
from copy import copy
# I had to switch to the linear_sum_assignment module in scipy for the hungarian algorithm. hungarian wasn't
# doing the matching correctly. This is slower though since it's a python implementation.
# May need to port over a different c implementation if speed becomes an issue
def compareDomStrs(cluster1,cluster2,match,mismatch,gap,scale):
#first index each cluster
clus1Strs = dict(enumerate(cluster1))
clus2Strs = dict(enumerate(cluster2))
clus1Size = len(cluster1)
clus2Size = len(cluster2)
scoreMatrix,word2num,num2word = aligntools.buildScoringDictScaled(chain(*(cluster1+cluster2)),match,mismatch,scale)
alignScores = np.ndarray((clus1Size,clus2Size))
# score each pairwise alignment of domain strings to populate a alignment scores matrix
for i,domStr1 in enumerate(cluster1):
for j,domStr2 in enumerate(cluster2):
num1 = [word2num[x] for x in domStr1]
num2 = [word2num[x] for x in domStr2]
alignScore,a1,a2 = align.align(num1,num2,gap,gap,scoreMatrix,local=True)
alignScores[i,j] = alignScore
#prepare scoring matrix for hungarian algorithm: negate scores and pad matrix
if clus1Size < clus2Size:
costMatrix = -np.vstack((copy(alignScores),np.zeros((clus2Size-clus1Size,clus2Size))))
elif clus2Size < clus1Size:
costMatrix = -np.hstack((copy(alignScores),np.zeros((clus1Size,clus1Size-clus2Size))))
else:
costMatrix = -copy(alignScores)
# apply hungarian algorithm for matching
pairings = [(x,y) for x,y in zip(*linear_sum_assignment(costMatrix)) if (x<clus1Size) and (y<clus2Size)]
clusterScore = sum(alignScores[pairing] for pairing in pairings)
pairStrings = [(alignScores[(x,y)],clus1Strs[x],clus2Strs[y]) for x,y in pairings]
pairStrings.sort(reverse=True)
return clusterScore,pairStrings
def parseDomStrsFile(domStrFile):
domStrDict = {}
with open(domStrFile,'rb') as domStrHandle:
for line in domStrHandle:
if line[0] == '>':
clusterName = line[1:-1]
domStrDict[clusterName] = []
else:
domStrDict[clusterName].append(line.split())
return domStrDict