/
read_Naica.R
101 lines (91 loc) · 5.73 KB
/
read_Naica.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
## Function to read the raw fluorescence output files from the Stilla Naica
## digital PCR platform. Files can be exported by choosing options>export amplitude and cluster data
## The program will create a folder (droplet-level-data/RawData) with several .csv files per reaction (well).
## To read data from multiple runs, all files should be placed into one folder. The latter's path (eg. "D:/Documents/dPCR Run")
## is submitted as a string to the function below.
## The ddPCR .csv files should be the ONLY .csv files in the directory.
## Otherwise errors may occur!
# Updates will appear at: github.com/Gromgorgel
## Input:
#‘directory' is a character vector of length 1 indicating the location of the CSV files, the algorithm will attempt to analyze
# any csv files present in the directory (unless 'file' is specified) and expects all csv files to contain droplet amplitudes.
# defaults to "C:\\Quantalife\\Data"
#‘files' is a character vector. if specified, the algorithm will analyse only the filenames in the vector and will look for them
# in the current working directory. The file extension should be present in the file name.
#‘nr.r' is an optional number indicating the number of rows in the matrix (missing values will be padded with NA)
# (if nr.r = NA, the largest number of droplets found will be used)
# autoname logical. If TRUE the well names are collected from the csv files
# and used as column names. Only works if there are 96 or less csv files
# If there are more, a warning message will be reported. Problems
# may occur if additional underscores ("_") are present in the name of the run/experiment.
## Output:
# The function creates a list containing matrices with fluorescence readings
# one for each channel: $Ch1, $Ch2, etc.. The matrix has as many columns
# as there are samples. The number of rows can be either set (max: approx 23500)
# otherwise, the number of rows equals the largest number of readings found.
# Missing values are reported as NA.
## TO DO
## select chip type
## check data structure with less channels
read.Naica <- function(directory = "C:\\droplet-level-data\\RawData", files = NA, nr.r = NA, autoname = TRUE){
##Prepping to read
bup <- getwd() #save current work directory
if(is.na(files)){ # files not specified
setwd(directory) # go to data directory
files <- Sys.glob("*.csv") #save all filenames
files <- files[!seq_along(files) %in% grep('Channel', files)] # select files with all channel data (these do not contain 'Channel' in the filename)
}else{ # files is specified, we look in the current working directory
directory <- bup
}
# check rownumber required #note 0.25 nl is an estimate I made, it is not the actual droplet volume, I will add this once I find it.
d.max <- round(5/0.00025, digits = -1)*1.275 #maximum number of droplet reads possible, add 27.5% extra as some reactions have been yielding more droplest than possible
if(isTRUE(nr.r > d.max)){ #in case someone wants more padding than droplets are possible
message("this is not a good idea")
message(paste("nr.r reduced to ",d.max))
nr.r <- d.max
}
r.max <- 0 #highest number of droplets found in this run
##step one: prep the list to store all data
list.out <- list()
list.out$Ch1 <- matrix(nrow = d.max, ncol = length(files)) #generate output file for channel 1
list.out$Ch2 <- matrix(nrow = d.max, ncol = length(files)) #generate output file for channel 2
list.out$Ch3 <- matrix(nrow = d.max, ncol = length(files)) #generate output file for channel 3
list.out$Ch4 <- matrix(nrow = d.max, ncol = length(files)) #generate output file for channel 4
list.out$Ch5 <- matrix(nrow = d.max, ncol = length(files)) #generate output file for channel 5
list.out$Ch6 <- matrix(nrow = d.max, ncol = length(files)) #generate output file for channel 6
for(i in seq_along(files)){ #read each file, add variable zero padding to max size and store in matrix
temp <- read.csv(files[i], skip = 1, header = F) #read csv file
n.dr <- dim(temp)[1] # check how many droplets were generated in this sample
n.ch <- dim(temp)[2] - 3 # check number of channels there are in this sample (there are three non-channel columns: x & y coordinates, & index)
#save data into output matrices
for(j in 1:n.ch){
list.out[[j]][1:n.dr, i] <- temp[,j + 2]
#check if current number of droplets is higher the run max
if(n.dr > r.max){
r.max <- n.dr #we have a new winner
}}} # close all statments
##step two (optional): extract colnames from files
if(isTRUE(autoname)){
if(length(files) > 96){
message("more than 96 files found, switching off autoname")
}else{
wells <-vector(mode="character", length(files))
for(i in seq_along(files)){ #extract colnames from each file
wells[i] <- sub("?.*_(.*?)_RawData.csv","\\1",files[i])
}
#check for errors and report:
if(length(wells) > length(files)){
message("more names than columns found, switching off autoname")
}else{
for(j in 1:n.ch){ # I looked for an alternative to a loop here, but I could not get it to work with matrices.
colnames(list.out[[j]]) <- wells
}}}}# close all statements
##step three: trimming
r.trim <- if(isTRUE(nr.r > r.max)){nr.r}else{r.max} # if an exact number of rows was specificied (and it is larger than r.max, we keep it)
for(j in 1:n.ch){
list.out[[j]] <- list.out[[j]][1:r.trim, ]
}
##step four: output
setwd(bup) #return to previous wd
return(list.out)
}