-
Notifications
You must be signed in to change notification settings - Fork 4
/
median_string.py
57 lines (47 loc) · 1.79 KB
/
median_string.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from hamming_distance import hamming_distance
from kmer import all_kmers
def median_string(k, dna):
"""
Find the a k-mer x that minimizes the HammingDistance(x, dna)
:param k: the size of the k-mer to find
:param dna: a list of DNA sequences
:return: a k-mer that minimizes the distance between itself and the list of sequences. If multiple k-mers are found,
return only a single one.
Efficiency: O(4^k * kns)
"""
min_distance = (float("inf"), None)
for pattern in all_kmers(k):
distance = hamming_distance(pattern, dna)
if distance < min_distance[0]:
min_distance = (distance, pattern)
return min_distance[1]
def median_strings(k, dna):
"""
Find all k-mers x that minimizes the HammingDistance(x, dna)
:param k: the size of the k-mer to find
:param dna: a list of DNA sequences
:return: all k-mer that minimizes the distance between themselves and the list of sequences
Efficiency: O(4^k * kns)
"""
result = {}
for pattern in all_kmers(k):
distance = hamming_distance(pattern, dna)
result[pattern] = distance
min_value = min(result.values())
mins = [sequence for sequence, value in result.items() if value == min_value]
return mins
def __main__():
dna = """AGGTACTATCAGTAAAAAGGTTGAACATTTGGTGCGATAGCG
TCAACTCGGTACGCGCATAGCAGGGCAGTTGTACGGGGTGCC
CGTTTAATGACGTTAGCAGGACACCGGTACAGGCTAAAACTA
TGCCAAGTCAACGTTTCCTGGTACGGCTACCGTGTTGGACTC
CGGCCCTGGTACACATGCTTGTGATATCTGGCAATTCTTCAC
CATGGGGGGTACGACAGAAGACATCGATATCCAACTGCGATC
TAACTCAGGTACGTACAGTATTACCGCGGTCTAGTACAAGCT
TTGCCGTGATTTACCCGACACGTGAGGTACTGCTTGCGGAAT
AAACGCGACTGGCCCCACGCTTCAAACAACCTAGGAAGGTAC
TGTTAGGAGAAAACCCATACGCTGAATCGATCGGCCCGGTAC"""
k = 6
print(median_string(k, dna.split('\n')))
if __name__ == '__main__':
__main__()