-
Notifications
You must be signed in to change notification settings - Fork 4
/
diffhash.jl
45 lines (37 loc) · 1.12 KB
/
diffhash.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
using CSV
using DataFrames
using BioSequences
using JLD
function update_kmercount!(filename, kmers, pos)
# modifies kmers
reader = FASTA.Reader(open(filename, "r"))
for record in reader
# Do something
for (_, kmer) in each(DNAKmer{13}, sequence(record))
cank = convert(String, canonical(kmer)) # store kmers as strings
oldcount = get!(kmers, cank, zeros(Int64, nrow(df)))
kmers[cank][pos] = oldcount[pos] + 1
end
end
close(reader)
end
function count_kmers(df)
kmers = Dict()
for rowi in 1:nrow(df)
sample_name = df[rowi,:rep_id]
println(sample_name)
forward_name = "simulated_reads/" * sample_name * "_1.fasta"
reverse_name = "simulated_reads/" * sample_name * "_2.fasta"
update_kmercount!(forward_name, kmers, rowi)
update_kmercount!(reverse_name, kmers, rowi)
end
return kmers
end
##update_kmercount("reads.fasta", kmers, 1)
### main
file = "simulated_reads/sim_rep_info.txt"
df = CSV.File(file, delim = "\t") |> DataFrame
@show df
kmers = count_kmers(df)
@save "kmers.jld" kmers
#@show kmers