-
Notifications
You must be signed in to change notification settings - Fork 24
/
get_file.R
178 lines (165 loc) · 5.67 KB
/
get_file.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#' @rdname files
#'
#' @title Download dataverse file as a raw binary
#'
#' @description Download Dataverse File(s). `get_file_*`
#' functions return a raw binary file, which cannot be readily analyzed in R.
#' To use the objects as dataframes, see the `get_dataframe_*` functions at
#' \link{get_dataframe_by_name} instead.
#'
#' @details This function provides access to data files from a Dataverse entry.
#' `get_file` is a general wrapper,
#' and can take either dataverse objects, file IDs, or a filename and dataset.
#' Internally, all functions download each file by `get_file_by_id`.
#' `get_file_by_name` is a shorthand for running `get_file` by
#' specifying a file name (`filename`) and dataset (`dataset`).
#' `get_file_by_doi` obtains a file by its file DOI, bypassing the
#' `dataset` argument.
#'
#' @param file An integer specifying a file identifier; or a vector of integers
#' specifying file identifiers; or, if used with the prefix \code{"doi:"}, a
#' character with the file-specific DOI; or, if used without the prefix, a
#' filename accompanied by a dataset DOI in the `dataset` argument, or an object of
#' class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.
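#' @param dataset A character string giving the DOI of the dataset that
#' contains the file. Required when `file` is a filename rather than a numeric
#' file ID, a file DOI, or a \dQuote{dataverse_file} object.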
#' @param format A character string specifying a file format for download.
#' By default, this is \dQuote{original} (the original file format). If `NULL`,
#' no query is added, so ingested files are returned in their ingested TSV form.
#' For tabular datasets, the option \dQuote{bundle} downloads the bundle
#' of the original and archival versions, as well as the documentation.
#' See <https://guides.dataverse.org/en/latest/api/dataaccess.html> for details.
#' @param vars A character vector specifying one or more variable names, used to
#' extract a subset of the data.
#'
#' @template envvars
#' @template dots
#' @template ds
#'
#' @return \code{get_file} returns a raw vector (or list of raw vectors,
#' if \code{length(file) > 1}), which can be saved locally with the `writeBin`
#' function. To load datasets into the R environment as a dataframe, see
#' \link{get_dataframe_by_name}.
#'
#' @seealso To load the objects as dataframes, see \link{get_dataframe_by_name}.
#'
#' @examples
#' \dontrun{
#'
#' # 1. Using filename and dataverse
#' f1 <- get_file_by_name(
#' filename = "nlsw88.tab",
#' dataset = "10.70122/FK2/PPIAXE",
#' server = "demo.dataverse.org"
#' )
#'
#' # 2. Using file DOI
#' f2 <- get_file_by_doi(
#' filedoi = "10.70122/FK2/PPIAXE/MHDB0O",
#' server = "demo.dataverse.org"
#' )
#'
#' # 3. Two-steps: Find ID from get_dataset
#' d3 <- get_dataset("doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org")
#' f3 <- get_file(d3$files$id[1], server = "demo.dataverse.org")
#'
#' # 4. Retrieve multiple raw data in list
#' f4_vec <- get_dataset(
#' "doi:10.70122/FK2/PPIAXE",
#' server = "demo.dataverse.org"
#' )$files$id
#'
#' f4 <- get_file(f4_vec, server = "demo.dataverse.org")
#' length(f4)
#'
#' # Write binary files
#' # (see `get_dataframe_by_name` to load in environment)
#' # The appropriate file extension needs to be assigned by the user.
#' writeBin(f1, "nlsw88.dta")
#' writeBin(f2, "nlsw88.dta")
#'
#' writeBin(f4[[1]], "nlsw88.rds") # originally a rds file
#' writeBin(f4[[2]], "nlsw88.dta") # originally a dta file
#' }
#'
#' @export
get_file <- function(
  file,
  dataset = NULL,
  format = c("original", "bundle"),
  vars = NULL,
  key = Sys.getenv("DATAVERSE_KEY"),
  server = Sys.getenv("DATAVERSE_SERVER"),
  original = TRUE,
  ...
) {
  format <- match.arg(format)

  # Resolve `file` to the identifier(s) handed to get_file_by_id(). The cases
  # are mutually exclusive, so they form a single chain; `&&` is used because
  # every test here is a scalar condition.
  if (is.numeric(file)) {
    # numeric file ID(s) are used as-is
    fileid <- file
  } else if (inherits(file, "dataverse_file")) {
    fileid <- get_fileid.dataverse_file(file, key = key, server = server)
  } else if (!is.null(dataset)) {
    # `file` is looked up within `dataset`
    fileid <- get_fileid.character(
      dataset,
      file,
      key = key,
      server = server,
      ...
    )
  } else if (length(file) > 0L && all(grepl(pattern = "^doi", x = file))) {
    # DOI(s) are allowed without a dataset. all() keeps the condition scalar
    # when several DOIs are supplied; the length check rejects empty input,
    # for which all() would otherwise be TRUE.
    fileid <- file
  } else {
    stop("When 'file' is a character (non-global ID), dataset must be specified.")
  }

  # Main step: download each file through get_file_by_id()
  out <- vector("list", length(fileid))
  for (i in seq_along(fileid)) {
    out[[i]] <- get_file_by_id(
      fileid = fileid[i],
      dataset = dataset,
      format = format,
      vars = vars,
      key = key,
      server = server,
      original = original,
      ...
    )
  }

  # A single file is returned bare; several files are returned as a list
  if (length(out) == 1L) {
    out[[1L]]
  } else {
    out
  }
}
#' @rdname files
#'
#' @param filename Filename of the dataset, with file extension as shown in Dataverse
#' (for example, if nlsw88.dta was the original but is displayed as the ingested
#' nlsw88.tab, use the ingested version.)
#'
#' @export
get_file_by_name <- function(
  filename,
  dataset,
  format = c("original", "bundle"),
  vars = NULL,
  key = Sys.getenv("DATAVERSE_KEY"),
  server = Sys.getenv("DATAVERSE_SERVER"),
  original = TRUE,
  ...
) {
  format <- match.arg(format)

  # Look up the file ID for `filename` within `dataset`. `key` is forwarded so
  # that a caller-supplied API key is also used for this lookup, as get_file()
  # does when it calls get_fileid.character().
  fileid <- get_fileid.character(
    x = dataset,
    file = filename,
    key = key,
    server = server,
    ...
  )

  # Download the file through get_file_by_id() using the resolved ID
  get_file_by_id(
    fileid,
    format = format,
    vars = vars,
    key = key,
    server = server,
    original = original,
    ...
  )
}