/
backwards_full_2d.py
290 lines (232 loc) · 12 KB
/
backwards_full_2d.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
#!/usr/bin/env python3
'''
Coalescent simulation with recombination.
Takes the output of the forward time simulations as input.
Read the frequency of beneficial mutation and go backwards in time until n individuals all coalesce.
As time goes back, record the number of leaves for each node.
The neutral mutations arise as a Poisson process with rate Un = 1 under our assumptions.
The expected SFS is the histogram of the leaf counts
Pre-sweep coalescent process can account for more than one merger in a single generation
when there are many individuals are left (large compared to sqrt(2 * N))
Code can handle 2-D populations by converting the 2-D array into a 1-D array with
migration patterns calculated using a single index.
'''
import numpy as np
from multiprocessing import Pool
from multiprocessing import Process
import sys
from numpy import random
import random as rand
from functions_combined import *
import time
import os.path
from os import path
start = time.time()
L = int(sys.argv[1]) # number of demes
rho = int(sys.argv[2]) # deme capacity
s = float(sys.argv[3]) # selection coefficient
m = float(sys.argv[4]) # migration rate
l0 = int(sys.argv[5]) # location of the origin of the sweep
nbase = int(sys.argv[6]) # sample size for the coalescent simulation
N_SFS = int(sys.argv[7]) # number of coalescent simulation we run for ensemble average.
T_after_fix = int(sys.argv[8]) # number of generations between the end of the sweep and sampling
Nforw = int(sys.argv[9]) # number of forward simulations, used for parallel jobs
dimension = int(sys.argv[10]) # To set spatial dimension of population (1D or 2D)
r = float(sys.argv[11]) # recombination rate
mutationSite = int(sys.argv[12])
r_per_base = 1e-8
mutationRate = 1e-11
step = 1
#Old code: sys.argv[12] if len(sys.argv) >= 13 else
extra_gen_cutoff = int(L ** 2 / m / rho)
## python backwards_combined.py L N s m l0 n_base N_SFS T_after_fix Nforw dimensions
## python backwards_combined.py 500 20000 0.05 0.250 1 10000 4 3000 1 1
if (dimension == 1):
print ('1_D')
fname = 'L={}_N={}_s={:.6f}_m={:.6f}_{}.txt'.format(L, rho, s, m, Nforw)
print(fname)
lines = np.loadtxt(fname, dtype = np.int64)
elif(dimension == 2):
print('2-D')
##Read input file and flatten lines[i] into 1-D array here for use within existing framework
fname = 'L={}_N={}_s={:.3f}_m={:.2f}_l0={}_Nforw={}.txt'.format(L, rho, s, m, l0, Nforw)
#print(fname)
lines = np.loadtxt(fname, dtype = np.int64)
tfinal = int(len(lines) / L)
lines = lines.reshape(tfinal, L, L) ###lines = Data from forward simulation
lines = [lines[i].flatten() for i in range(len(lines))]
print(tfinal)
else:
print("Dimension not recognised")
quit()
def runner(idx):
'''
idx: argumemt to spawn multiple jobs. Not relevant for the body of the simulation.
Reads a generation, starting from the last one. Tracks the sampled indviduals backwards in time
till all mergers have happened and then calculates the branch length to get the SFS.
'''
rho_e = lines[-1] ##Mutants in a deme. We pick the absolute last generation we simulated forward in time till to start simulating backwards.
n = nbase
SFS = np.zeros(n)
rho_e = (rho_e).astype(np.int64)
'''Creating the primary data structure by sampling'''
# data structure('individuals') format will be [mut_type, deme index, individual index(inside the deme)] in a row and as many rows as the individuals we sample
individuals = sample_data(rho_e, n, rho)
unique, leaf_counts = np.unique(individuals, axis = 0, return_counts = True) #To avoid overcounting of repeated locations and ensure we actually sample what we want
# The SFS is the histogram of these leaf counts counted over all generations
hist, bin_edges = np.histogram(leaf_counts, bins = np.arange(1, n + 2))
'''Starting simulations backward in time now'''
t_after_fix = 0
'''Going backwards in time for user given generations after sweep has fixed'''
#Since the population doesnt change once sweep has fixed, we don't need to change the input data for this
while (t_after_fix < T_after_fix) and (len(individuals) > 1):
t_after_fix += 1
individuals_post_recombination = recombination(rho_e, individuals, rho, r)
individuals_post_migration = migration(rho_e, individuals_post_recombination, rho, m, L, dimension)
individuals, leaf_counts = coalescent(individuals_post_migration, leaf_counts)
hist, bin_edges = np.histogram(leaf_counts, bins = np.arange(1, n + 2))
SFS += hist
'''Going back in time during the sweep, all the way back to the time when mutation is first seeded'''
line_num = -1
while (len(individuals) > 1) and (line_num > -len(lines)): ##The maximum we can go backward in time is till T2
#print('sweep')
line_num -= 1
rho_e_parent = (lines[line_num]).astype(np.int64) ##Getting the parent generation from each time step of our forward simulation
individuals_post_migration = migration(rho_e_parent, individuals, rho, m, L, dimension)
individuals_post_recombination = recombination(rho_e_parent, individuals_post_migration, rho, r)
individuals, leaf_counts = coalescent(individuals_post_recombination, leaf_counts)
hist, bin_edges = np.histogram(leaf_counts,bins = np.arange(1, n + 2))
SFS += hist
rho_e = rho_e_parent
'''
The first round of coalescence simulation ends when we get to the time
when the beneficial mmutation arose, and all the left over individuals
are wild type (WT).
If there is no recombination, the simulation should end here since the MRCA
will be the benficial mutation that gave rise to the sweep.
From this point till we reach the MRCA, the coalescence will be extremely slow. The MRCA can be different
from the beneficial mutation because of recombination.
We run the same kind of simulation until the individuals
disperse for L^2 / m / N. We speed up the simulation by recording the number
of generations between the coalescence events.
'''
left_individuals = len(individuals) ##Individuals left to still coalesce in time before the mutation arose
# Make sure that all individuals are WT (mutation type = 0)
mut_types, deme_arr, ind_in_deme_arr = individuals.T
mut_types = np.zeros_like(mut_types)
mut_types = (mut_types).astype(np.int64)
individuals = np.vstack((mut_types, deme_arr, ind_in_deme_arr)).T
branch_len = 0 # number of generations until first merging event
extra_gen = 0 # extra run time before stopping the coalescent simulation.
while left_individuals > 1 and extra_gen < extra_gen_cutoff: ###Will it be same for 2-D
#print('extra gens')
branch_len += 1
extra_gen += 1
rho_e_parent_pre_sweep = np.zeros_like(rho_e)
rho_e_parent_pre_sweep = (rho_e_parent_pre_sweep).astype(np.int64)
# Before the beneficial mutation appeared, everyone is WT. Thus, rho_e_parent is just an array of zeros.
# Also, we can skip the recombination step, since the genotype will always be zero for everyone.
individuals_post_migration = migration(rho_e_parent_pre_sweep, individuals, rho, m, L, dimension)
individuals, leaf_counts = coalescent(individuals_post_migration, leaf_counts)
current_individuals_counts = len(individuals)
if current_individuals_counts < left_individuals:
hist, bin_edges = np.histogram(leaf_counts * branch_len, bins = np.arange(1, n + 2))
SFS += hist
branch_len = 0
left_individuals = current_individuals_counts
'''
After the individuals disperse, we may ignore the spatial structure.
If there are more than sqrt(2 * N * L) individuals left, it is likely
to have more than 1 merging event in a single generation. Thus, we manually
draw a parent for every individual and find if some of them are the same.
Once there are much smaller number of individuals left, it will take a long
time to coalesce. Thus, we draw T2 (time until the first coalescent) from
geometric probablity distribution. (This is found from probablity of choosing a different
parent for every individual left.)
'''
while left_individuals > 1:
T2 = random.exponential(rho * L / ((left_individuals - 1) * left_individuals / 2)) ###Drawing the T2 from geometric distribution
hist, bin_edges = np.histogram(leaf_counts, bins = np.arange(1, n + 2))
SFS += hist * T2 ##That many branches will exist
# Now choose two random branches that will coalesce in T2 generations.
coal_inds = random.choice(range(left_individuals), size = 2, replace = False)
# Like in the coalescent function, as two branches merge, two leaf counts are summed up and the number of remaining individuals decreases by 1.
leaf_counts_copy = leaf_counts
leaf_counts_copy[coal_inds[0]] += leaf_counts[coal_inds[1]]
leaf_counts_copy[coal_inds[1]] = 0
leaf_counts = leaf_counts_copy[leaf_counts_copy > 0]
left_individuals -= 1
f = np.arange(1, n + 1) / n ##Calculating frquency of mutant
H = np.sum(2 * f * (1 - f) * SFS) / np.sum(SFS) ##Calculating Heterezygosity
return SFS, H
#############################################################
'''Main body to create multiple backward trees and average'''
#############################################################
def recalcRecombination(position):
relDistance = abs(mutationSite - position)
genDistance = float(relDistance * r_per_base)
newRecomb = 1 - (np.exp(-2 * genDistance))
newRecomb *= 0.5
return newRecomb
def updateStep(position):
relDistance = abs(mutationSite - position)
if relDistance < 10000:
return 1000
elif relDistance <= 100000:
return 10000
elif relDistance <= 1000000:
return 100000
else:
return 1000000
def writeOutput(position, mutationCount, outputFileName):
if path.exists(outputFileName) == False:
output = open(outputFileName, "w")
output.write("position\t" + "x\t" + "n\t" + "folded\n")
output.close()
output = open(outputFileName, "a")
output.write(str(position) + "\t" + str(mutationCount) + "\t" + str(nbase) + "\t" + "0\n")
output.close()
def explicit_sim(left, right, outputFile):
p = Pool(20) # 20 is the nunmber of cores we used to spawn jobs parallely on, adjust as needed
ret = p.map(runner, range(N_SFS)) ###Number of simulations you want to average over
SFS_items = [r[0] for r in ret]
H_items = [r[1] for r in ret]
# Averaging all SFSs
SFS = np.sum(SFS_items, axis=0)
SFS /= N_SFS
SFS *= mutationRate
SFS_List = SFS.tolist()
zeroProbability = 1 - sum(SFS_List)
SFS_List.append(zeroProbability)
mutations = []
for x in range(1, nbase + 1):
mutations.append(x)
mutations.append(0)
resultListLeft = rand.choices(mutations, SFS_List, k = step)
resultListRight = rand.choices(mutations, SFS_List, k = step)
targetLociLeft = rand.sample(range(left, left + step), step)
targetLociRight = rand.sample(range(right - step, right), step)
for i in range(step):
if resultListLeft[i] > 0:
writeOutput(targetLociLeft[i], resultListLeft[i], outputFile)
if resultListRight[i] > 0:
writeOutput(targetLociRight[i], resultListRight[i], outputFile)
if __name__ == '__main__':
lo = int(mutationSite - ((10 * s) / r_per_base))
hi = int(mutationSite + ((10 * s) / r_per_base))
left = mutationSite
right = mutationSite
while right < hi:
step = updateStep(right)
right += step
left -= step
processCount = 1
r = recalcRecombination(right)
processes = []
for x in range(processCount):
p = Process(target=explicit_sim, args=(left, right, "output_explicit_" + str(x) + ".txt",))
processes.append(p)
for process in processes:
process.start()
for process in processes:
process.join()