-
Notifications
You must be signed in to change notification settings - Fork 0
/
reconstruct.Rd
158 lines (139 loc) · 6.97 KB
/
reconstruct.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tzara.R
\name{reconstruct}
\alias{reconstruct}
\title{Reconstruct a longer region out of ASVs or consensus sequences of individual
domains.}
\usage{
reconstruct(
seqtabs,
regions = names(seqtabs),
regions_regex = NULL,
regions_replace = NULL,
output = "concat",
use_output = c("first", "second", "no"),
order = setdiff(regions, output),
read_column = "seq.id",
asv_column = "dada.seq",
rawtabs = seqtabs,
raw_column = NULL,
raw_regions = names(rawtabs),
sample_column = NULL,
sample_regex = NULL,
sample_replace = NULL,
chimera_offset = 0,
allow_map = TRUE,
allow_consensus = TRUE,
allow_raw = FALSE,
...
)
}
\arguments{
\item{seqtabs}{(\code{list} of \code{data.frame}) with columns
\code{read_column}, \code{asv_column}, and optionally \code{sample_column}.
Any additional columns are ignored. \code{read_column} should give a unique
ID for each sequencing read, and \code{asv_column} should give the denoised
sequence for the read.}
\item{regions}{(\code{character} vector with the same length as
\code{seqtabs}) The names of the regions/domains represented by each of the
tables in \code{seqtabs}. If not supplied, then \code{seqtabs} should be
named by the regions.}
\item{regions_regex}{(\code{character} scalar, or \code{NULL})
A \link[stringi:stringi-search-regex]{regular expression}. If
\code{regions_regex} is given but \code{regions_replace} is not, then only
the part of each entry in \code{regions} matching the regex
is used to define regions (using \code{\link[stringr]{str_extract}}). If
\code{regions_replace} is also used, then the regex is instead replaced by
\code{regions_replace} (using \code{\link[stringr]{str_replace}}).
\code{NA_character_} is treated the same way as \code{NULL}.}
\item{regions_replace}{(\code{character} scalar, or \code{NULL})
Replacement string for \code{regions_regex}.
\code{NA_character_} is treated the same way as \code{NULL}.}
\item{output}{(\code{character} scalar or named list of \code{character}
vectors) If a \code{character} scalar, then the name to be used for the
(single) output region. In this case the region will be the concatenation
of all the regions in \code{order}. Alternatively, a list where the names
are the names of the output regions, and the values are \code{character}
vectors giving the regions which should be concatenated for each output
region.}
\item{use_output}{(one of \code{"first"}, \code{"second"}, or \code{"no"}) If
one of the regions given by \code{output} is also present in
\code{seqtabs}, then the \code{seqtabs} version is used preferentially
if \code{use_output == "first"}, as a backup value when one of the
subregions/domains is missing if \code{use_output == "second"}, or not
at all if \code{use_output == "no"}.}
\item{order}{(\code{character} vector) The order in which the
sub-regions/domains should be concatenated to produce the output(s).}
\item{read_column}{(\code{character} scalar) Column name from the
\code{seqtabs} which uniquely identifies each read (but different
regions extracted from the same read should have the same ID.)}
\item{asv_column}{(\code{character} scalar) Column name from the
\code{seqtabs} which gives the denoised sequences.}
\item{rawtabs}{(\code{list} of \code{data.frame}) Data sources of the same
format as \code{seqtabs}, with columns \code{read_column} and
\code{raw_column}. There should be the same number of these as of
\code{seqtabs}, and they should correspond to the sub-regions/domains
specified in \code{regions}. The default is to look for \code{raw_column} in
\code{seqtabs}.}
\item{raw_column}{(\code{character} scalar, or \code{NULL})
Column name from the \code{rawtabs} which gives the raw sequences. If
\code{NULL} or \code{NA_character_}, then consensus sequences will not
be used as a backup when no denoised sequence is present.}
\item{raw_regions}{(\code{character} vector with the same length as
\code{rawtabs}) The names of the regions/domains represented by each
of the tables in \code{rawtabs}. These will be processed using
\code{regions_regex} and \code{regions_replace}, if given.}
\item{sample_column}{(\code{character} scalar, or \code{NULL}) An optional
column name from the \code{seqtabs} which identifies which sample each
sequence is from. If given, this is used (after possible modification
by \code{sample_regex} and \code{sample_replace}) to identify
different samples for \code{\link[dada2]{isBimeraDenovoTable}}.
\code{NA_character_} is treated the same way as \code{NULL}.}
\item{sample_regex}{(\code{character} scalar, or \code{NULL}) A
\link[stringi:stringi-search-regex]{regular expression}. If
\code{sample_regex} is given but \code{sample_replace} is not, then
only the part of each entry in \code{sample_column} matching the
regex is used to define samples (using
\code{\link[stringr]{str_extract}}). If \code{sample_replace} is also
used, then the regex is instead replaced by \code{sample_replace}
(using \code{\link[stringr]{str_replace}}). \code{NA_character_} is
treated the same way as \code{NULL}.}
\item{sample_replace}{(\code{character} scalar, or \code{NULL})
Replacement string for \code{sample_regex}. \code{NA_character_} is
treated the same way as \code{NULL}.}
\item{chimera_offset}{(\code{integer}) By default, bimeras are checked for
sub-regions/domains 1, 2, 3; 3, 4, 5; 5, 6, 7; etc. This is appropriate
if the domains alternate variable, conserved, variable, etc. If a
more conserved domain is first, use \code{chimera_offset = 1}.}
\item{allow_map}{(\code{logical} scalar) If \code{TRUE} and if \code{asvs}
contains non-missing values, attempt to map each raw read without a
corresponding ASV to the nearest ASV.}
\item{allow_consensus}{(\code{logical} scalar) If \code{TRUE} and if
\code{allow_map} is \code{FALSE} or there are no non-missing values in
\code{asvs}, then attempt to make a consensus of all raw reads.}
\item{allow_raw}{(\code{logical} scalar) If \code{TRUE}, then after mapping
and/or consensus building, remaining raw reads are taken as they are.
If \code{FALSE}, the corresponding results will be \code{NA}.}
\item{...}{additional arguments passed to \code{\link[dada2]{isBimeraDenovo}}
or \code{\link[dada2]{isBimeraDenovoTable}}.}
}
\value{
a \code{\link[tibble]{tibble}} with column "\code{seq.id}" and
\code{sample_column} (if given), as well as one column for each value
of \code{regions} and \code{output}, representing the
sub-regions/domains and the concatenated full region.
}
\description{
The sequences from each denoised sub-region/domain are concatenated to create
a denoised sequence
for the long region. Additionally, de-novo bimera detection is performed
using \code{\link[dada2]{isBimeraDenovo}} or
\code{\link[dada2]{isBimeraDenovoTable}} on
sets of three consecutive sub-regions/domains; in the intended application,
these sets will be variable--conserved--variable.
}
\details{
When not all sub-regions/domains for a given read have been successfully
denoised with DADA, then the missing regions are constructed using
\code{\link{cluster_consensus}}.
}