1 |
#' Download Census Population Data |
|
2 |
#' |
|
3 |
#' Download American Community Survey population data from the U.S. Census Bureau. |
|
4 |
#' |
|
5 |
#' @param year Data year. |
|
6 |
#' @param out_dir Directory to download the file to. |
|
7 |
#' @param state_only Logical; if \code{TRUE}, will only load state data. |
|
8 |
#' Will still download county data. |
|
9 |
#' @param overwrite Logical; if \code{TRUE}, will re-download and overwrite existing data. |
|
10 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages. |
|
11 |
#' @returns A \code{data.frame} including \code{GEOID} and \code{region_name} |
|
12 |
#' for states and counties, along with their population, in total and within |
|
13 |
#' age brackets. |
|
14 |
#' @examples |
|
15 |
#' if (file.exists("../../resources/census_population_2021.csv.xz")) { |
|
16 |
#' dcf_load_census(2021, "../../resources")[1:10, ] |
|
17 |
#' } |
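#'
#' # a minimal sketch of a fresh download (not run; requires internet access):
#' \dontrun{
#' states <- dcf_load_census(2021, tempdir(), state_only = TRUE)
#' }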
|
18 |
#' @export |
|
19 | ||
20 |
dcf_load_census <- function( |
|
21 |
year = 2021, |
|
22 |
out_dir = NULL, |
|
23 |
state_only = FALSE, |
|
24 |
overwrite = FALSE, |
|
25 |
verbose = TRUE |
|
26 |
) { |
|
27 | 2x |
out_file <- paste0(out_dir, "/census_population_", year, ".csv.xz") |
28 | 2x |
write_out <- !is.null(out_dir) |
29 | 2x |
if (!overwrite && write_out && file.exists(out_file)) { |
30 | 1x |
if (verbose) { |
31 | 1x |
cli::cli_progress_step("reading in existing file") |
32 |
} |
|
33 | 1x |
invisible(as.data.frame(vroom::vroom( |
34 | 1x |
out_file, |
35 | 1x |
delim = ",", |
36 | 1x |
col_types = list( |
37 | 1x |
GEOID = "c", |
38 | 1x |
region_name = "c", |
39 | 1x |
Total = "i", |
40 | 1x |
`<10 Years` = "i", |
41 | 1x |
`10-14 Years` = "i", |
42 | 1x |
`15-19 Years` = "i", |
43 | 1x |
`20-39 Years` = "i", |
44 | 1x |
`40-64 Years` = "i", |
45 | 1x |
`65+ Years` = "i" |
46 |
), |
|
47 | 1x |
n_max = if (state_only) 52L else Inf |
48 |
))) |
|
49 |
} else { |
|
50 |
# GEOID to region name mapping |
|
51 | 1x |
id_url <- "https://www2.census.gov/geo/docs/reference/codes2020/national_" |
52 | 1x |
if (verbose) { |
53 | 1x |
cli::cli_progress_step("downloading state IDs map") |
54 |
} |
|
55 | 1x |
state_ids <- vroom::vroom( |
56 | 1x |
paste0(id_url, "state2020.txt"), |
57 | 1x |
delim = "|", |
58 | 1x |
col_types = list( |
59 | 1x |
STATE = "c", |
60 | 1x |
STATEFP = "c", |
61 | 1x |
STATENS = "c", |
62 | 1x |
STATE_NAME = "c" |
63 |
) |
|
64 |
) |
|
65 | 1x |
if (verbose) { |
66 | 1x |
cli::cli_progress_step("downloading county IDs map") |
67 |
} |
|
68 | 1x |
county_ids <- vroom::vroom( |
69 | 1x |
paste0(id_url, "county2020.txt"), |
70 | 1x |
delim = "|", |
71 | 1x |
col_types = list( |
72 | 1x |
STATE = "c", |
73 | 1x |
STATEFP = "c", |
74 | 1x |
COUNTYFP = "c", |
75 | 1x |
COUNTYNS = "c", |
76 | 1x |
COUNTYNAME = "c", |
77 | 1x |
CLASSFP = "c", |
78 | 1x |
FUNCSTAT = "c" |
79 |
) |
|
80 |
) |
|
81 | 1x |
region_name <- structure(
82 | 1x |
sub( |
83 | 1x |
" County", |
84 |
"", |
|
85 | 1x |
c( |
86 | 1x |
state_ids$STATE_NAME, |
87 | 1x |
paste0(county_ids$COUNTYNAME, ", ", county_ids$STATE) |
88 |
), |
|
89 | 1x |
fixed = TRUE |
90 |
), |
|
91 | 1x |
names = c( |
92 | 1x |
state_ids$STATEFP, |
93 | 1x |
paste0(county_ids$STATEFP, county_ids$COUNTYFP) |
94 |
) |
|
95 |
) |
|
96 | ||
97 |
# population data |
|
98 | ||
99 |
## age group labels from IDs |
|
100 | 1x |
if (verbose) { |
101 | 1x |
cli::cli_progress_step("downloading ACS variable lables") |
102 |
} |
|
103 | 1x |
labels <- vroom::vroom( |
104 | 1x |
paste0( |
105 | 1x |
"https://www2.census.gov/programs-surveys/acs/summary_file/", |
106 | 1x |
min(2021L, year), |
107 | 1x |
"/sequence-based-SF/documentation/user_tools/ACS_5yr_Seq_Table_Number_Lookup.txt" |
108 |
), |
|
109 | 1x |
delim = ",", |
110 | 1x |
col_types = list( |
111 | 1x |
`File ID` = "c", |
112 | 1x |
`Table ID` = "c", |
113 | 1x |
`Sequence Number` = "c", |
114 | 1x |
`Line Number` = "d", |
115 | 1x |
`Start Position` = "i", |
116 | 1x |
`Total Cells in Table` = "c", |
117 | 1x |
`Total Cells in Sequence` = "i", |
118 | 1x |
`Table Title` = "c", |
119 | 1x |
`Subject Area` = "c" |
120 |
) |
|
121 |
) |
|
122 | 1x |
variable_labels <- structure( |
123 | 1x |
labels$`Table Title`, |
124 | 1x |
names = paste0( |
125 | 1x |
labels$`Table ID`, |
126 | 1x |
"_E", |
127 | 1x |
formatC(labels$`Line Number`, width = 3L, flag = "0")
128 |
) |
|
129 |
) |
|
130 | ||
131 |
## age group counts |
|
132 | 1x |
url <- paste0( |
133 | 1x |
"https://www2.census.gov/programs-surveys/acs/summary_file/", |
134 | 1x |
year, |
135 | 1x |
"/table-based-SF/data/5YRData/acsdt5y", |
136 | 1x |
year, |
137 | 1x |
"-b01001.dat" |
138 |
) |
|
139 | 1x |
if (verbose) { |
140 | 1x |
cli::cli_progress_step("downloading population data") |
141 |
} |
|
142 | 1x |
data <- vroom::vroom(url, delim = "|", col_types = list(GEO_ID = "c")) |
143 | 1x |
data <- data[ |
144 | 1x |
grep("0[45]00000US", data$GEO_ID), |
145 | 1x |
grep("E", colnames(data), fixed = TRUE) |
146 |
] |
|
147 | 1x |
colnames(data)[-1L] <- variable_labels[colnames(data)[-1L]] |
148 | ||
149 | 1x |
age_groups <- list( |
150 | 1x |
Total = "Total:", |
151 | 1x |
`<10 Years` = c("Under 5 years", "5 to 9 years"), |
152 | 1x |
`10-14 Years` = "10 to 14 years", |
153 | 1x |
`15-19 Years` = c("15 to 17 years", "18 and 19 years"), |
154 | 1x |
`20-39 Years` = c( |
155 | 1x |
"20 years", |
156 | 1x |
"21 years", |
157 | 1x |
"22 to 24 years", |
158 | 1x |
"25 to 29 years", |
159 | 1x |
"30 to 34 years", |
160 | 1x |
"35 to 39 years" |
161 |
), |
|
162 | 1x |
`40-64 Years` = c( |
163 | 1x |
"40 to 44 years", |
164 | 1x |
"45 to 49 years", |
165 | 1x |
"50 to 54 years", |
166 | 1x |
"55 to 59 years", |
167 | 1x |
"60 and 61 years", |
168 | 1x |
"62 to 64 years" |
169 |
), |
|
170 | 1x |
`65+ Years` = c( |
171 | 1x |
"65 and 66 years", |
172 | 1x |
"67 to 69 years", |
173 | 1x |
"70 to 74 years", |
174 | 1x |
"75 to 79 years", |
175 | 1x |
"80 to 84 years", |
176 | 1x |
"85 years and over" |
177 |
) |
|
178 |
) |
|
179 | 1x |
if (verbose) { |
180 | 1x |
cli::cli_progress_step("agregating across sex and fine age groups") |
181 |
} |
|
182 | 1x |
pop <- cbind( |
183 | 1x |
data.frame(GEOID = substring(data$GEO_ID, 10L), region_name = ""), |
184 | 1x |
do.call( |
185 | 1x |
cbind, |
186 | 1x |
lapply(age_groups, function(l) rowSums(data[, colnames(data) %in% l])) |
187 |
) |
|
188 |
) |
|
189 | 1x |
pop$region_name <- region_name[pop$GEOID]
190 | 1x |
states <- pop[1L:52L, ] |
191 | 1x |
health_regions <- as.data.frame(do.call( |
192 | 1x |
rbind, |
193 | 1x |
lapply( |
194 | 1x |
split( |
195 | 1x |
states[, -(1L:2L)], |
196 | 1x |
dcf_to_health_region(states$GEOID, "hhs_") |
197 |
), |
|
198 | 1x |
colSums |
199 |
) |
|
200 |
)) |
|
201 | 1x |
health_regions$GEOID <- rownames(health_regions) |
202 | 1x |
health_regions$region_name <- sub( |
203 | 1x |
"hhs_", |
204 | 1x |
"Health Region ", |
205 | 1x |
rownames(health_regions), |
206 | 1x |
fixed = TRUE |
207 |
) |
|
208 | 1x |
pop <- rbind(pop, health_regions[, colnames(pop)]) |
209 | ||
210 | 1x |
if (write_out) { |
211 | 1x |
if (verbose) { |
212 | 1x |
cli::cli_progress_step("writing output") |
213 |
} |
|
214 | 1x |
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) |
215 | 1x |
vroom::vroom_write(pop, out_file, ",") |
216 |
} |
|
217 | 1x |
invisible(if (state_only) states else pop) |
218 |
} |
|
219 |
} |
1 |
#' Add documentation of a dataset to a datapackage
|
2 |
#' |
|
3 |
#' Add information about variables in a dataset to a \code{datapackage.json} metadata file. |
|
4 |
#' |
|
5 |
#' @param filename A character vector of paths to plain-text tabular data files, relative to \code{dir}. |
|
6 |
#' @param meta Information about each data file. A list with a list entry for each entry in |
|
7 |
#' \code{filename}; see details. If a single list is provided for multiple data files, it will apply to all. |
|
8 |
#' @param packagename Package to add the metadata to; path to the \code{.json} file relative to |
|
9 |
#' \code{dir}, or a list with the read-in version. |
|
10 |
#' @param dir Directory in which to look for \code{filename}, and write \code{packagename}. |
|
11 |
#' @param write Logical; if \code{FALSE}, returns the \code{paths} metadata without reading or rewriting |
|
12 |
#' \code{packagename}. |
|
13 |
#' @param refresh Logical; if \code{FALSE}, will retain any existing dataset information. |
|
14 |
#' @param sha A number specifying the Secure Hash Algorithm function, |
|
15 |
#' if \code{openssl} is available (checked with \code{Sys.which('openssl')}). |
|
16 |
#' @param pretty Logical; if \code{TRUE}, will pretty-print the datapackage. |
|
17 |
#' @param summarize_ids Logical; if \code{TRUE}, will include ID columns in schema field summaries. |
|
18 |
#' @param open_after Logical; if \code{TRUE}, opens the written datapackage after saving. |
|
19 |
#' @param verbose Logical; if \code{FALSE}, will not show status messages. |
|
20 |
#' @details |
|
21 |
#' \code{meta} should be a list with an unnamed entry for each entry in \code{filename},
|
22 |
#' and each entry can include a named entry for any of these: |
|
23 |
#' \describe{ |
|
24 |
#' \item{source}{ |
|
25 |
#' A list or list of lists with entries for at least \code{name}, and ideally for \code{url}. |
|
26 |
#' } |
|
27 |
#' \item{ids}{ |
|
28 |
#' A list or list of lists with entries for at least \code{variable} (the name of a variable in the dataset). |
|
29 |
#' Might also include \code{map} with a list or path to a JSON file resulting in a list with an |
|
30 |
#' entry for each ID, containing additional information about that entity, to be read in as its features.
|
31 |
#' All files will be loaded to help with aggregation, but local files will be included in the datapackage, |
|
32 |
#' whereas hosted files will be loaded client-side. |
|
33 |
#' } |
|
34 |
#' \item{time}{ |
|
35 |
#' A string giving the name of a variable in the dataset representing a repeated observation of the same entity. |
|
36 |
#' } |
|
37 |
#' \item{variables}{ |
|
38 |
#' A list with named entries providing more information about the variables in the dataset. |
|
39 |
#' See \code{\link{dcf_measure_info}}. |
|
40 |
#' } |
|
41 |
#' } |
|
42 |
#' @examples |
|
43 |
#' \dontrun{ |
|
44 |
#' # write example data |
|
45 |
#' write.csv(mtcars, "mtcars.csv") |
|
46 |
#' |
|
47 |
#' # add it to an existing datapackage.json file in the current working directory |
|
48 |
#' dcf_datapackage_add("mtcars.csv") |
|
49 |
#' } |
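#'
#' # a fuller sketch with source, ID, and time entries; the "model" and
#' # "year" columns are hypothetical (mtcars has neither), so substitute
#' # columns that exist in your data
#' \dontrun{
#' dcf_datapackage_add(
#'   "mtcars.csv",
#'   meta = list(
#'     source = list(name = "Motor Trend", url = "https://example.com"),
#'     ids = list(variable = "model"),
#'     time = "year"
#'   )
#' )
#' }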
|
50 |
#' @return An invisible version of the updated datapackage, which is also written to |
|
51 |
#' \code{datapackage.json} if \code{write = TRUE}. |
|
52 |
#' @seealso Initialize the \code{datapackage.json} file with \code{\link{dcf_datapackage_init}}. |
|
53 |
#' @export |
|
54 | ||
55 |
dcf_datapackage_add <- function( |
|
56 |
filename, |
|
57 |
meta = list(), |
|
58 |
packagename = "datapackage.json", |
|
59 |
dir = ".", |
|
60 |
write = TRUE, |
|
61 |
refresh = TRUE, |
|
62 |
sha = "512", |
|
63 |
pretty = FALSE, |
|
64 |
summarize_ids = FALSE, |
|
65 |
open_after = FALSE, |
|
66 |
verbose = interactive() |
|
67 |
) { |
|
68 | 3x |
if (missing(filename)) { |
69 | ! |
cli::cli_abort("{.arg filename} must be specified") |
70 |
} |
|
71 | 3x |
setnames <- names(filename) |
72 | 3x |
if (file.exists(filename[[1]])) { |
73 | ! |
if (dir == ".") { |
74 | ! |
dir <- dirname(filename[[1]]) |
75 |
} |
|
76 | ! |
filename <- basename(filename) |
77 |
} |
|
78 | 3x |
if (any(!file.exists(paste0(dir, "/", filename)))) { |
79 | ! |
filename <- filename[!file.exists(paste0(dir, "/", filename))]
80 | ! |
cli::cli_abort("{?a file/files} did not exist: {filename}") |
81 |
} |
|
82 | 3x |
package <- if ( |
83 | 3x |
is.character(packagename) && file.exists(paste0(dir, "/", packagename)) |
84 |
) { |
|
85 | 3x |
paste0(dir, "/", packagename) |
86 |
} else { |
|
87 | ! |
packagename |
88 |
} |
|
89 | 3x |
if (write) { |
90 | 3x |
if (is.character(package)) { |
91 | 3x |
package <- paste0(dir, "/", packagename) |
92 | 3x |
package <- if (file.exists(package)) { |
93 | 3x |
packagename <- package |
94 | 3x |
jsonlite::read_json(package) |
95 |
} else { |
|
96 | ! |
dcf_datapackage_init( |
97 | ! |
if (!is.null(setnames)) setnames[[1]] else filename[[1]], |
98 | ! |
dir = dir |
99 |
) |
|
100 |
} |
|
101 |
} |
|
102 | 3x |
if (!is.list(package)) { |
103 | ! |
cli::cli_abort(c( |
104 | ! |
"{.arg package} does not appear to be in the right format", |
105 | ! |
i = "this should be (or be read in from JSON as) a list with a {.code resource} entry" |
106 |
)) |
|
107 |
} |
|
108 |
} |
|
109 | 3x |
if (!is.list(package)) { |
110 | ! |
package <- list() |
111 |
} |
|
112 | 3x |
single_meta <- FALSE |
113 | 3x |
metas <- if (!is.null(names(meta))) { |
114 | 3x |
if (!is.null(setnames) && all(setnames %in% names(meta))) { |
115 | ! |
meta[setnames] |
116 |
} else { |
|
117 | 3x |
single_meta <- TRUE |
118 | 3x |
if (length(meta$variables) == 1 && is.character(meta$variables)) { |
119 | ! |
if (!file.exists(meta$variables)) { |
120 | ! |
meta$variables <- paste0(dir, "/", meta$variables) |
121 |
} |
|
122 | ! |
if (file.exists(meta$variables)) { |
123 | ! |
meta$variables <- jsonlite::read_json(meta$variables) |
124 |
} |
|
125 |
} |
|
126 | 3x |
meta$variables <- replace_equations(meta$variables) |
127 | 3x |
meta |
128 |
} |
|
129 |
} else { |
|
130 | ! |
meta[seq_along(filename)] |
131 |
} |
|
132 | 3x |
if (!single_meta) { |
133 | ! |
metas <- lapply(metas, function(m) { |
134 | ! |
m$variables <- replace_equations(m$variables) |
135 | ! |
m |
136 |
}) |
|
137 |
} |
|
138 | 3x |
collect_metadata <- function(file) { |
139 | 3x |
f <- paste0(dir, "/", filename[[file]]) |
140 | 3x |
m <- if (single_meta) meta else metas[[file]] |
141 | 3x |
format <- if (grepl(".parquet", f, fixed = TRUE)) { |
142 | ! |
"parquet" |
143 | 3x |
} else if (grepl(".csv", f, fixed = TRUE)) { |
144 | 3x |
"csv" |
145 | 3x |
} else if (grepl(".rds", f, fixed = TRUE)) { |
146 | ! |
"rds" |
147 |
} else { |
|
148 | ! |
"tsv" |
149 |
} |
|
150 | 3x |
if (is.na(format)) { |
151 | ! |
format <- "rds" |
152 |
} |
|
153 | 3x |
info <- file.info(f) |
154 | 3x |
metas <- list() |
155 | 3x |
unpack_meta <- function(n) { |
156 | 9x |
if (!length(m[[n]])) { |
157 | 3x |
list() |
158 | 6x |
} else if (is.list(m[[n]][[1]])) { |
159 | ! |
m[[n]] |
160 |
} else { |
|
161 | 6x |
list(m[[n]]) |
162 |
} |
|
163 |
} |
|
164 | 3x |
ids <- unpack_meta("ids") |
165 | 3x |
idvars <- NULL |
166 | 3x |
for (i in seq_along(ids)) { |
167 | 3x |
if (is.list(ids[[i]])) { |
168 | 3x |
if ( |
169 | ! |
length(ids[[i]]$map) == 1 && |
170 | ! |
is.character(ids[[i]]$map) && |
171 | ! |
file.exists(ids[[i]]$map) |
172 |
) { |
|
173 | ! |
ids[[i]]$map_content <- paste( |
174 | ! |
readLines(ids[[i]]$map, warn = FALSE), |
175 | ! |
collapse = "" |
176 |
) |
|
177 |
} |
|
178 |
} else { |
|
179 | 3x |
ids[[i]] <- list(variable = ids[[i]]) |
180 |
} |
|
181 | 3x |
if (!ids[[i]]$variable %in% idvars) idvars <- c(idvars, ids[[i]]$variable) |
182 |
} |
|
183 | 3x |
data <- if (format == "rds") { |
184 | ! |
tryCatch(readRDS(f), error = function(e) NULL) |
185 | 3x |
} else if (format == "parquet") { |
186 | ! |
tryCatch(arrow::read_parquet(f), error = function(e) NULL) |
187 |
} else { |
|
188 | 3x |
attempt_read(f, c("geography", "time", idvars)) |
189 |
} |
|
190 | 3x |
if (is.null(data)) { |
191 | ! |
cli::cli_abort(c( |
192 | ! |
paste0("failed to read in the data file ({.file {f}})"), |
193 | ! |
i = "check that it is in a compatible format" |
194 |
)) |
|
195 |
} |
|
196 | 3x |
if (!all(rownames(data) == seq_len(nrow(data)))) { |
197 | ! |
data <- cbind(`_row` = rownames(data), data) |
198 |
} |
|
199 | 3x |
timevar <- unlist(unpack_meta("time")) |
200 | 3x |
times <- if (is.null(timevar)) rep(1, nrow(data)) else data[[timevar]] |
201 | 3x |
times_unique <- unique(times) |
202 | 3x |
if (!single_meta) { |
203 | ! |
varinf <- unpack_meta("variables") |
204 | ! |
if (length(varinf) == 1 && is.character(varinf[[1]])) { |
205 | ! |
if (!file.exists(varinf[[1]])) { |
206 | ! |
varinf[[1]] <- paste0(dir, "/", varinf[[1]]) |
207 |
} |
|
208 | ! |
if (file.exists(varinf[[1]])) { |
209 | ! |
if (varinf[[1]] %in% names(metas)) { |
210 | ! |
varinf <- metas[[varinf[[1]]]] |
211 |
} else { |
|
212 | ! |
varinf <- metas[[varinf[[1]]]] <- dcf_measure_info( |
213 | ! |
varinf[[1]], |
214 | ! |
write = FALSE, |
215 | ! |
render = TRUE |
216 |
) |
|
217 |
} |
|
218 | ! |
varinf <- varinf[varinf != ""] |
219 |
} |
|
220 |
} |
|
221 | ! |
varinf_full <- names(varinf) |
222 | ! |
varinf_suf <- sub("^[^:]+:", "", varinf_full) |
223 |
} |
|
224 | 3x |
res <- list( |
225 | 3x |
bytes = as.integer(info$size), |
226 | 3x |
encoding = stringi::stri_enc_detect(f)[[1]][1, 1], |
227 | 3x |
md5 = tools::md5sum(f)[[1]], |
228 | 3x |
format = format, |
229 | 3x |
name = if (!is.null(setnames)) { |
230 | ! |
setnames[file] |
231 | 3x |
} else if (!is.null(m$name)) { |
232 | ! |
m$name |
233 |
} else { |
|
234 | 3x |
sub("\\.[^.]*$", "", basename(filename[[file]])) |
235 |
}, |
|
236 | 3x |
filename = filename[[file]], |
237 | 3x |
versions = get_versions(f), |
238 | 3x |
source = unpack_meta("source"), |
239 | 3x |
ids = ids, |
240 | 3x |
id_length = if (length(idvars)) { |
241 | 3x |
id_lengths <- nchar(data[[idvars[1]]]) |
242 | 3x |
id_lengths <- id_lengths[!is.na(id_lengths)] |
243 | ! |
if (all(id_lengths == id_lengths[1])) id_lengths[1] else 0 |
244 |
} else { |
|
245 | ! |
0 |
246 |
}, |
|
247 | 3x |
time = timevar, |
248 | 3x |
profile = "data-resource", |
249 | 3x |
created = as.character(info$mtime), |
250 | 3x |
last_modified = as.character(info$ctime), |
251 | 3x |
row_count = nrow(data), |
252 | 3x |
entity_count = if (length(idvars)) { |
253 | 3x |
length(unique(data[[idvars[1]]])) |
254 |
} else { |
|
255 | ! |
nrow(data) |
256 |
}, |
|
257 | 3x |
schema = list( |
258 | 3x |
fields = lapply( |
259 | 3x |
if (summarize_ids) { |
260 | 3x |
colnames(data) |
261 |
} else { |
|
262 | ! |
colnames(data)[!colnames(data) %in% idvars] |
263 |
}, |
|
264 | 3x |
function(cn) { |
265 | 9x |
v <- data[[cn]] |
266 | 9x |
invalid <- !is.finite(v) |
267 | 9x |
r <- list(name = cn, duplicates = sum(duplicated(v))) |
268 | 9x |
if (!single_meta) { |
269 | ! |
if (cn %in% varinf_full) { |
270 | ! |
r$info <- varinf[[cn]] |
271 | ! |
} else if (cn %in% varinf_suf) { |
272 | ! |
r$info <- varinf[[which(varinf_suf == cn)]] |
273 |
} |
|
274 | ! |
r$info <- r$info[r$info != ""] |
275 |
} |
|
276 | 9x |
su <- !is.na(v) |
277 | 9x |
if (any(su)) { |
278 | 9x |
r$time_range <- which(times_unique %in% range(times[su])) - 1 |
279 | 9x |
r$time_range <- if (length(r$time_range)) { |
280 | 9x |
r$time_range[c(1, length(r$time_range))] |
281 |
} else { |
|
282 | ! |
c(-1, -1) |
283 |
} |
|
284 |
} else { |
|
285 | ! |
r$time_range <- c(-1, -1) |
286 |
} |
|
287 | 9x |
if (!is.character(v) && all(invalid)) { |
288 | ! |
r$type <- "unknown" |
289 | ! |
r$missing <- length(v) |
290 | 9x |
} else if (is.numeric(v)) { |
291 | 3x |
r$type <- if (all(invalid | as.integer(v) == v)) { |
292 | 3x |
"integer" |
293 |
} else { |
|
294 | ! |
"float" |
295 |
} |
|
296 | 3x |
r$missing <- sum(invalid) |
297 | 3x |
r$mean <- round(mean(v, na.rm = TRUE), 6) |
298 | 3x |
r$sd <- round(stats::sd(v, na.rm = TRUE), 6) |
299 | 3x |
r$min <- round(min(v, na.rm = TRUE), 6) |
300 | 3x |
r$max <- round(max(v, na.rm = TRUE), 6) |
301 |
} else { |
|
302 | 6x |
r$type <- "string" |
303 | 6x |
if (!is.factor(v)) { |
304 | 6x |
v <- as.factor(as.character(v)) |
305 |
} |
|
306 | 6x |
r$missing <- sum(is.na(v) | is.nan(v) | grepl("^[\\s.-]$", v)) |
307 | 6x |
r$table <- structure(as.list(tabulate(v)), names = levels(v)) |
308 |
} |
|
309 | 9x |
r |
310 |
} |
|
311 |
) |
|
312 |
) |
|
313 |
) |
|
314 | 3x |
if (!single_meta && "_references" %in% names(varinf)) { |
315 | ! |
res[["_references"]] <- varinf[["_references"]] |
316 |
} |
|
317 | 3x |
if (Sys.which("openssl") != "") { |
318 | 3x |
res[[paste0("sha", sha)]] <- calculate_sha(f, sha) |
319 |
} |
|
320 | 3x |
res |
321 |
} |
|
322 | 3x |
metadata <- lapply(seq_along(filename), collect_metadata) |
323 | 3x |
if (single_meta) { |
324 | 3x |
package$measure_info <- lapply(meta$variables, function(e) e[e != ""]) |
325 |
} |
|
326 | 3x |
names <- vapply(metadata, "[[", "", "filename") |
327 | 3x |
for (resource in package$resources) { |
328 | 2x |
if (length(resource$versions)) { |
329 | ! |
su <- which(names %in% resource$filename) |
330 | ! |
if (length(su)) { |
331 | ! |
if (length(metadata[[su]]$versions)) { |
332 | ! |
metadata[[su]]$versions <- rbind( |
333 | ! |
metadata[[su]]$versions, |
334 | ! |
if (is.data.frame(resource$versions)) { |
335 | ! |
resource$versions |
336 |
} else { |
|
337 | ! |
as.data.frame(do.call(cbind, resource$versions)) |
338 |
} |
|
339 |
) |
|
340 | ! |
metadata[[su]]$versions <- metadata[[su]]$versions[ |
341 | ! |
!duplicated(metadata[[su]]$versions), |
342 |
] |
|
343 |
} |
|
344 |
} |
|
345 |
} |
|
346 |
} |
|
347 | 3x |
if (refresh) { |
348 | 3x |
package$resources <- metadata |
349 |
} else { |
|
350 | ! |
package$resources <- c( |
351 | ! |
metadata, |
352 | ! |
package$resources[ |
353 | ! |
!(vapply(package$resources, "[[", "", "filename") %in% names) |
354 |
] |
|
355 |
) |
|
356 |
} |
|
357 | 3x |
if (write) { |
358 | 3x |
packagename <- if (is.character(packagename)) { |
359 | 3x |
packagename |
360 |
} else { |
|
361 | ! |
"datapackage.json" |
362 |
} |
|
363 | 3x |
jsonlite::write_json( |
364 | 3x |
package, |
365 | 3x |
if (file.exists(packagename)) { |
366 | 3x |
packagename |
367 |
} else { |
|
368 | ! |
paste0(dir, "/", packagename) |
369 |
}, |
|
370 | 3x |
auto_unbox = TRUE, |
371 | 3x |
digits = 6, |
372 | 3x |
dataframe = "columns", |
373 | 3x |
pretty = pretty |
374 |
) |
|
375 | 3x |
if (verbose) { |
376 | ! |
cli::cli_bullets(c( |
377 | ! |
v = paste( |
378 | ! |
if (refresh) "updated resource in" else "added resource to", |
379 | ! |
"datapackage.json:" |
380 |
), |
|
381 | ! |
"*" = paste0("{.path ", packagename, "}") |
382 |
)) |
|
383 | ! |
if (open_after) rstudioapi::navigateToFile(packagename) |
384 |
} |
|
385 |
} |
|
386 | 3x |
invisible(package) |
387 |
} |
|
388 | ||
389 |
get_versions <- function(file) { |
|
390 | 3x |
log <- suppressWarnings(system2( |
391 | 3x |
"git", |
392 | 3x |
c("log", file), |
393 | 3x |
stdout = TRUE |
394 |
)) |
|
395 | 3x |
if (is.null(attr(log, "status"))) { |
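# a missing status attribute means the git call succeeded; split the log
# into per-commit chunks of hash, author, date, and message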
396 | 1x |
log_entries <- strsplit(paste(log, collapse = "|"), "commit ")[[ |
397 | 1x |
1 |
398 |
]] |
|
399 | 1x |
log_entries <- do.call( |
400 | 1x |
rbind, |
401 | 1x |
Filter( |
402 | 1x |
function(x) length(x) == 4L, |
403 | 1x |
strsplit( |
404 | 1x |
log_entries[log_entries != ""], |
405 | 1x |
"\\|+(?:[^:]+:)?\\s*" |
406 |
) |
|
407 |
) |
|
408 |
) |
|
409 | 1x |
if (length(log_entries)) { |
410 | 1x |
colnames(log_entries) <- c( |
411 | 1x |
"hash", |
412 | 1x |
"author", |
413 | 1x |
"date", |
414 | 1x |
"message" |
415 |
) |
|
416 | 1x |
as.data.frame(log_entries) |
417 |
} |
|
418 |
} |
|
419 |
} |
|
420 | ||
421 |
attempt_read <- function(file, id_cols) { |
|
422 | 3x |
tryCatch( |
423 |
{ |
|
424 | 3x |
sep <- if (grepl(".csv", file, fixed = TRUE)) "," else "\t" |
425 | 3x |
cols <- scan(file, "", nlines = 1, sep = sep, quiet = TRUE) |
426 | 3x |
types <- rep("?", length(cols)) |
427 | 3x |
types[cols %in% id_cols] <- "c" |
428 | 3x |
arrow::read_delim_arrow( |
429 | 3x |
gzfile(file), |
430 | 3x |
sep, |
431 | 3x |
col_names = cols, |
432 | 3x |
col_types = paste(types, collapse = ""), |
433 | 3x |
skip = 1 |
434 |
) |
|
435 |
}, |
|
436 | 3x |
error = function(e) NULL |
437 |
) |
|
438 |
} |
|
439 | ||
440 |
calculate_sha <- function(file, level) { |
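# shell out to the openssl CLI (e.g., "openssl dgst -sha512 <file>")
# and keep the digest portion of its output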
|
441 | 3x |
if (Sys.which("openssl") != "") { |
442 | 3x |
tryCatch( |
443 | 3x |
strsplit( |
444 | 3x |
system2( |
445 | 3x |
"openssl", |
446 | 3x |
c("dgst", paste0("-sha", level), shQuote(file)), |
447 | 3x |
TRUE |
448 |
), |
|
449 |
" ", |
|
450 | 3x |
fixed = TRUE |
451 | 3x |
)[[1]][2], |
452 | 3x |
error = function(e) "" |
453 |
) |
|
454 |
} else { |
|
455 |
"" |
|
456 |
} |
|
457 |
} |
1 |
#' Standardize Epic Data |
|
2 |
#' |
|
3 |
#' Standardize a raw Epic data table. |
|
4 |
#' |
|
5 |
#' @param raw_data Raw Epic data, such as that returned by \link{dcf_read_epic}.
|
6 |
#' @returns A standardized form of \code{data}. |
|
7 |
#' @section Standardization: |
|
8 |
#' \itemize{ |
|
9 |
#' \item Collapse location columns (\code{state} or \code{county}) to a single |
|
10 |
#' \code{geography} column, and region names to IDs. |
|
11 |
#' \item Collapse time columns (\code{year}, \code{month}, or \code{week}) to a single |
|
12 |
#' \code{time} column, and clean up value formatting. |
|
13 |
#' \item Drop rows with no values across value columns. |
|
14 |
#' } |
|
15 |
#' @examples |
|
16 |
#' \dontrun{ |
|
17 |
#' raw_data <- dcf_read_epic("data/epic/raw/flu.csv.xz") |
|
18 |
#' standard_data <- dcf_process_epic_raw(raw_data) |
|
19 |
#' } |
|
20 |
#' |
|
21 |
#' @export |
|
22 | ||
23 |
dcf_standardize_epic <- function(raw_data) { |
|
24 | 1x |
region_names <- epic_id_maps$regions |
25 | 1x |
names(region_names) <- gsub( |
26 | 1x |
" (?:CITY AND BOROUGH|BOROUGH|PARISH|MUNICIPALITY|MUNICIPIO)|[.']", |
27 |
"", |
|
28 | 1x |
toupper(names(region_names)) |
29 |
) |
|
30 | 1x |
cols <- colnames(raw_data) |
31 | 1x |
time_col <- which(cols == "year") |
32 | 1x |
if (length(time_col)) { |
33 | 1x |
colnames(raw_data)[time_col] <- "time" |
34 | 1x |
raw_data$time <- as.integer(substring( |
35 | 1x |
raw_data$time, |
36 | 1x |
nchar(raw_data$time) - 4L |
37 |
)) |
|
38 |
} |
|
39 | 1x |
month_col <- which(cols == "month") |
40 | 1x |
if (length(month_col)) { |
41 | ! |
raw_data$time <- paste0( |
42 | ! |
raw_data$time, |
43 |
"-", |
|
44 | ! |
epic_id_maps$months[raw_data$month] |
45 |
) |
|
46 |
} |
|
47 | 1x |
week_col <- which(cols == "week") |
48 | 1x |
if (length(week_col)) { |
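# turn week labels of the form "<Month> <n>" into "<year>-<month number>-<n>"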
49 | 1x |
raw_data$time <- paste0( |
50 | 1x |
raw_data$time, |
51 |
"-", |
|
52 | 1x |
vapply( |
53 | 1x |
strsplit(raw_data$week, "[^A-Za-z0-9]"), |
54 | 1x |
function(p) { |
55 | 4x |
paste0( |
56 | 4x |
epic_id_maps$months[[p[[1L]]]], |
57 |
"-", |
|
58 | 4x |
formatC(as.integer(p[[2L]]), width = 2L, flag = "0") |
59 |
) |
|
60 |
}, |
|
61 |
"" |
|
62 |
) |
|
63 |
) |
|
64 |
} |
|
65 | 1x |
geo_col <- grep("^(?:state|county)", cols) |
66 | 1x |
if (length(geo_col)) { |
67 | 1x |
colnames(raw_data)[geo_col] <- "geography" |
68 | 1x |
raw_data$geography <- toupper(raw_data$geography) |
69 | 1x |
missing_geo <- !(raw_data$geography %in% names(region_names)) |
70 | 1x |
if (any(missing_geo)) { |
71 | 1x |
geo <- sub( |
72 | 1x |
"LA ", |
73 | 1x |
"LA", |
74 | 1x |
sub("^SAINT", "ST", raw_data$geography[missing_geo]), |
75 | 1x |
fixed = TRUE |
76 |
) |
|
77 | 1x |
if (any(grepl(", VA", geo, fixed = TRUE))) { |
78 | 1x |
geo[geo == "SALEM, VA"] <- "SALEM CITY, VA" |
79 | 1x |
geo[geo == "RADFORD, VA"] <- "RADFORD CITY, VA" |
80 | 1x |
geo[geo == "DONA ANA, NM"] <- "DO\u00d1A ANA, NM" |
81 | 1x |
geo[geo == "MATANUSKA SUSITNA, AK"] <- "MATANUSKA-SUSITNA, AK" |
82 |
} |
|
83 | 1x |
raw_data$geography[missing_geo] <- geo |
84 |
} |
|
85 | 1x |
missing_regions <- raw_data$geography[ |
86 | 1x |
!(raw_data$geography %in% names(region_names)) |
87 |
] |
|
88 | 1x |
if (length(missing_regions)) { |
89 | ! |
cli::cli_warn( |
90 | ! |
'unrecognized regions: {paste(unique(missing_regions), collapse = "; ")}' |
91 |
) |
|
92 |
} |
|
93 | 1x |
raw_data$geography <- region_names[raw_data$geography] |
94 | 1x |
raw_data <- raw_data[!is.na(raw_data$geography), ] |
95 |
} |
|
96 | 1x |
raw_data <- raw_data[, |
97 | 1x |
!(colnames(raw_data) %in% c("state", "county", "year", "month", "week")) |
98 |
] |
|
99 | 1x |
raw_data[ |
100 | 1x |
rowSums( |
101 | 1x |
!is.na(raw_data[, |
102 | 1x |
!(colnames(raw_data) %in% c("geography", "time", "age")) |
103 |
]) |
|
104 |
) != |
|
105 | 1x |
0L, |
106 |
] |
|
107 |
} |
1 |
#' Download Data from the CDC |
|
2 |
#' |
|
3 |
#' Download data and metadata from the Centers for Disease Control and Prevention (CDC). |
|
4 |
#' |
|
5 |
#' @param id ID of the resource (e.g., \code{ijqb-a7ye}). |
|
6 |
#' @param out_dir Directory in which to save the metadata and data files. |
|
7 |
#' @param state The state ID of a previous download; if provided, will only download if the |
|
8 |
#' new state does not match. |
|
9 |
#' @param parquet Logical; if \code{TRUE}, will convert the downloaded CSV file to Parquet. |
|
10 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages. |
|
11 |
#' @returns The state ID of the downloaded files; |
|
12 |
#' downloads files (\code{<id>.json} and \code{<id>.csv.xz}) to \code{out_dir} |
|
13 |
#' @section \code{data.cdc.gov} URLs: |
|
14 |
#' |
|
15 |
#' For each resource ID, there are 3 relevant CDC URLs: |
|
16 |
#' \itemize{ |
|
17 |
#' \item \strong{\code{resource/<id>}}: This redirects to the resource's main page, |
|
18 |
#' with displayed metadata and a data preview |
|
19 |
#' (e.g., \href{https://data.cdc.gov/resource/ijqb-a7ye}{data.cdc.gov/resource/ijqb-a7ye}). |
|
20 |
#' \item \strong{\code{api/views/<id>}}: This is a direct link to the underlying |
|
21 |
#' JSON metadata (e.g., \href{https://data.cdc.gov/api/views/ijqb-a7ye}{data.cdc.gov/api/views/ijqb-a7ye}). |
|
22 |
#' \item \strong{\code{api/views/<id>/rows.csv}}: This is a direct link to the full |
|
23 |
#' CSV dataset (e.g., \href{https://data.cdc.gov/api/views/ijqb-a7ye/rows.csv}{data.cdc.gov/api/views/ijqb-a7ye/rows.csv}). |
|
24 |
#' } |
|
25 |
#' |
|
26 |
#' @examples |
|
27 |
#' \dontrun{ |
|
28 |
#' dcf_download_cdc("ijqb-a7ye") |
|
29 |
#' } |
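#'
#' # a sketch of skipping re-download when the resource is unchanged:
#' \dontrun{
#' state <- dcf_download_cdc("ijqb-a7ye")
#'
#' # later runs only re-download if the resource was updated
#' state <- dcf_download_cdc("ijqb-a7ye", state = state)
#' }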
|
30 |
#' @export |
|
31 | ||
32 |
dcf_download_cdc <- function( |
|
33 |
id, |
|
34 |
out_dir = "raw", |
|
35 |
state = NULL, |
|
36 |
parquet = FALSE, |
|
37 |
verbose = TRUE |
|
38 |
) { |
|
39 | 2x |
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE) |
40 | 2x |
if (verbose) { |
41 | 2x |
resource_url <- paste0("https://data.cdc.gov/resource/", id) |
42 | 2x |
cli::cli_h1( |
43 | 2x |
"downloading resource {.url {resource_url}}" |
44 |
) |
|
45 |
} |
|
46 | 2x |
url <- paste0("https://data.cdc.gov/api/views/", id) |
47 | 2x |
initial_timeout <- options(timeout = 99999L)$timeout |
48 | 2x |
on.exit(options(timeout = initial_timeout)) |
49 | 2x |
if (verbose) { |
50 | 2x |
cli::cli_progress_step("metadata: {.url {url}}") |
51 |
} |
|
52 | 2x |
metadata_file <- paste0(tempdir(), "/", id, ".json") |
53 | 2x |
status <- utils::download.file(url, metadata_file, quiet = TRUE) |
54 | 2x |
if (status != 0L) { |
55 | ! |
cli::cli_abort("failed to download metadata") |
56 |
} |
|
57 | 2x |
metadata <- jsonlite::read_json(metadata_file) |
58 | 2x |
new_state <- if (is.null(metadata$rowsUpdatedAt)) { |
59 | ! |
as.list(tools::md5sum(metadata_file)) |
60 |
} else { |
|
61 | 2x |
metadata$rowsUpdatedAt |
62 |
} |
|
63 | 2x |
if (!identical(new_state, state)) { |
64 | 2x |
file.rename(metadata_file, paste0(out_dir, "/", id, ".json")) |
65 | 2x |
data_url <- paste0(url, "/rows.csv") |
66 | 2x |
out_path <- paste0(out_dir, "/", id, ".csv") |
67 | 2x |
if (verbose) { |
68 | 2x |
cli::cli_progress_step("data: {.url {data_url}}") |
69 |
} |
|
70 | 2x |
status <- utils::download.file(data_url, out_path, quiet = TRUE) |
71 | 2x |
if (status != 0L) { |
72 | ! |
cli::cli_abort("failed to download data") |
73 |
} |
|
74 | 2x |
if (verbose) { |
75 | 2x |
cli::cli_progress_step("compressing data") |
76 |
} |
|
77 | 2x |
if (parquet) { |
78 | 1x |
data <- vroom::vroom(out_path, show_col_types = FALSE) |
79 | 1x |
arrow::write_parquet( |
80 | 1x |
data, |
81 | 1x |
sub(".csv", ".parquet", out_path, fixed = TRUE),
82 | 1x |
sub(".csv", ".parquet", out_path, fixed = TRUE) |
83 |
) |
|
84 | 1x |
unlink(out_path) |
85 |
} else { |
|
86 | 1x |
unlink(paste0(out_path, ".xz")) |
87 | 1x |
status <- system2("xz", c("-f", out_path)) |
88 | 1x |
if (status != 0L) { |
89 | ! |
cli::cli_abort("failed to compress data") |
90 |
} |
|
91 |
} |
|
92 | 2x |
if (verbose) { |
93 | 2x |
cli::cli_progress_done() |
94 |
} |
|
95 | 2x |
invisible(new_state) |
96 |
} else { |
|
97 | ! |
unlink(metadata_file) |
98 | ! |
invisible(state) |
99 |
} |
|
100 |
} |
1 |
#' Make a measurement metadata file |
|
2 |
#' |
|
3 |
#' Make a \code{measure_info.json} file, or add measure entries to an existing one. |
|
4 |
#' |
|
5 |
#' @param path Path to the \code{measure_info.json} file, existing or to be created. |
|
6 |
#' @param ... Lists containing individual measure items. See the Measure Entries section. |
|
7 |
#' @param info A list containing measurement information to be added. |
|
8 |
#' @param references A list containing citation entries. See the Reference Entries section. |
|
9 |
#' @param strict Logical; if \code{TRUE}, will only allow recognized entries and values. |
|
10 |
#' @param include_empty Logical; if \code{FALSE}, will omit entries that have not been provided. |
|
11 |
#' @param overwrite_entry Logical; if \code{TRUE}, will replace rather than add to an existing entry. |
|
12 |
#' @param render Path to save a version of \code{path} to, with dynamic entries expanded. See the |
|
13 |
#' Dynamic Entries section. |
|
14 |
#' @param overwrite Logical; if \code{TRUE}, will overwrite rather than add to an existing \code{path}. |
|
15 |
#' @param write Logical; if \code{FALSE}, will not write the build or rendered measure info. |
|
16 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages. |
|
17 |
#' @param open_after Logical; if \code{FALSE}, will not open the measure file after writing/updating. |
|
18 |
#' @section Measure Entries: |
|
19 |
#' Measure entries are named by the unique variable name with any of these entries (if \code{strict}): |
|
20 |
#' \itemize{ |
|
21 |
#' \item \strong{\code{id}}: Unique identifier of the measure; same as the entry name. |
|
22 |
#' This is meant to correspond to the column name containing the measure in data files. |
|
23 |
#' It should be minimal in length while still being unique across all files within the project. |
|
24 |
#' It should only contain the characters \code{a-z}, \code{0-9}, or \code{_}. |
|
25 |
#' \item \strong{\code{short_name}}: Shortest possible display name. |
|
26 |
#' \item \strong{\code{long_name}}: Longer display name. |
|
27 |
#' \item \strong{\code{category}}: Arbitrary category for the measure. |
|
28 |
#' \item \strong{\code{short_description}}: Shortest possible description. |
|
29 |
#' \item \strong{\code{long_description}}: Complete description. Either description can include |
|
30 |
#' TeX-style equations, enclosed in escaped square brackets (e.g., |
|
31 |
#' \code{"The equation \\\\[a_{i} = b^\\\\frac{c}{d}\\\\] was used."}; or \code{$...$}, |
|
32 |
#' \code{\\\\(...\\\\)}, or \code{\\\\begin{math}...\\\\end{math}}). The final enclosing symbol must be |
|
33 |
#' followed by a space or the end of the string. These are pre-rendered to MathML with
|
34 |
#' \code{\link[katex]{katex_mathml}}. |
|
35 |
#' \item \strong{\code{statement}}: String with dynamic references to entity features |
|
36 |
#' (e.g., \code{"measure value = {value}"}). References can include: |
|
37 |
#' \itemize{ |
|
38 |
#' \item \code{value}: Value of a currently displaying variable at a current time. |
|
39 |
#' \item \code{region_name}: Alias of \code{features.name}. |
|
40 |
#' \item \code{features.<entry>}: An entity feature, coming from \code{entity_info.json} or GeoJSON properties. |
|
41 |
#' All entities have at least \code{name} and \code{id} entries (e.g., \code{"{features.id}"}). |
|
42 |
#' \item \code{variables.<entry>}: A variable feature such as \code{name} which is the same as |
|
43 |
#' \code{id} (e.g., \code{"{variables.name}"}). |
|
44 |
#' \item \code{data.<variable>}: The value of another variable at a current time (e.g., \code{"{data.variable_a}"}). |
|
45 |
#' } |
|
46 |
#' \item \strong{\code{measure_type}}: Type of the measure's value. Recognized types are displayed in a special way: |
|
47 |
#' \itemize{ |
|
48 |
#' \item \code{year} or \code{integer} show as entered (usually as whole numbers). Other numeric |
|
49 |
#' types are rounded to show a set number of digits. |
|
50 |
#' \item \code{percent} shows as \code{{value}\%}. |
|
51 |
#' \item \code{minutes} shows as \code{{value} minutes}. |
|
52 |
#' \item \code{dollar} shows as \code{${value}}. |
|
53 |
#' \item \code{internet speed} shows as \code{{value} Mbps}. |
|
54 |
#' } |
|
55 |
#' \item \strong{\code{unit}}: Prefix or suffix associated with the measure's type, such as \code{\%} for \code{percent}, |
|
56 |
#' or \code{Mbps} for \code{rate}. |
|
57 |
#' \item \strong{\code{time_resolution}}: Temporal resolution of the variable, such as \code{year} or \code{week}. |
|
58 |
#' \item \strong{\code{restrictions}}: A license or description of restrictions that may apply to the measure. |
|
59 |
#' \item \strong{\code{sources}}: A list or list of lists containing source information, including any of these entries:
|
60 |
#' \itemize{ |
|
61 |
#' \item \code{name}: Name of the source (such as an organization name). |
|
62 |
#' \item \code{url}: General URL of the source (such as an organization's website). |
|
63 |
#' \item \code{location}: More specific description of the source (such as the name of a particular data product).
|
64 |
#' \item \code{location_url}: More direct URL to the resource (such as a page listing data products). |
|
65 |
#' \item \code{date_accessed}: Date of retrieval (arbitrary format). |
|
66 |
#' } |
|
67 |
#' \item \strong{\code{citations}}: A vector of reference ids (the names of \code{reference} entries; e.g., \code{c("ref1", "ref3")}). |
|
68 |
#' \item \strong{\code{categories}}: A named list of categories, with any of the other measure entries, or a |
|
69 |
#' \code{default} entry giving a default category name. See the Dynamic Entries section. |
|
70 |
#' \item \strong{\code{variants}}: A named list of variants, with any of the other measure entries, or a |
|
71 |
#' \code{default} entry giving a default variant name. See the Dynamic Entries section. |
|
72 |
#' } |
|
73 |
#' @section Dynamic Entries: |
|
74 |
#' You may have several closely related variables in a dataset, which share sections of metadata, |
|
75 |
#' or have formulaic differences. In cases like this, the \code{categories} and/or \code{variants} entries |
|
76 |
#' can be used along with dynamic notation to construct multiple entries from a single template. |
|
77 |
#' |
|
78 |
#' Though functionally the same, \code{categories} might include broken-out subsets of some total |
|
79 |
#' (such as race groups, as categories of a total population), whereas \code{variants} may be different |
|
80 |
#' transformations of the same variable (such as raw counts versus percentages). |
|
81 |
#' |
|
82 |
#' In dynamic entries, \code{{category}} or \code{{variant}} refers to entries in the \code{categories} |
|
83 |
#' or \code{variants} lists. By default, these are replaced with the name of each entry in those lists
|
84 |
#' (e.g., \code{"variable_{category}"} where \code{categories = "a"} would become \code{"variable_a"}). |
|
85 |
#' A \code{default} entry would change this behavior (e.g., with \code{categories = list(a = list(default = "b"))}
|
86 |
#' that would become \code{"variable_b"}). Adding \code{.name} would force the original behavior (e.g., |
|
87 |
#' \code{"variable_{category.name}"} would be \code{"variable_a"}). A name of \code{"blank"} is treated as |
|
88 |
#' an empty string. |
|
89 |
#' |
|
90 |
#' When notation appears in a measure info entry, it will first default to a matching name in the \code{categories}
|
91 |
#' or \code{variants} list; for example, \code{short_name} in \code{list(short_name = "variable {category}")} with |
|
92 |
#' \code{categories = list(a = list(short_name = "(category a)"))} would become \code{"variable (category a)"}. |
|
93 |
#' To force this behavior, the entry name can be included in the notation (e.g., |
|
94 |
#' \code{"{category.short_name}"} would be \code{"variable (category a)"} in any entry). |
|
95 |
#' |
|
96 |
#' Only string entries are processed dynamically -- any list-like entries (such as |
|
97 |
#' \code{source}, \code{citations}, or \code{layer}) appearing in |
|
98 |
#' \code{categories} or \code{variants} entries will fully replace the base entry. |
|
99 |
#' |
|
100 |
#' Dynamic entries can be kept dynamic when passed to a data site, but can be rendered for other uses, |
|
101 |
#' where the rendered version will have each dynamic entry replaced with all unique combinations of |
|
102 |
#' \code{categories} and \code{variants} entries, assuming both are used in the dynamic entry's name |
|
103 |
#' (e.g., \code{"variable_{category}_{variant}"}). See Examples. |
|
104 |
#' @section Reference Entries: |
|
105 |
#' Reference entries can be included in a \code{_references} entry, and should have names corresponding to |
|
106 |
#' those included in any of the measures' \code{citations} entries. These can include any of these entries:
|
107 |
#' \itemize{ |
|
108 |
#' \item \strong{\code{id}}: The reference id, same as the entry name. |
|
109 |
#' \item \strong{\code{author}}: A list or list of lists specifying one or more authors. These can include |
|
110 |
#' entries for \code{given} and \code{family} names. |
|
111 |
#' \item \strong{\code{year}}: Year of the publication. |
|
112 |
#' \item \strong{\code{title}}: Title of the publication. |
|
113 |
#' \item \strong{\code{journal}}: Journal in which the publication appears. |
|
114 |
#' \item \strong{\code{volume}}: Volume number of the journal. |
|
115 |
#' \item \strong{\code{page}}: Page number of the journal. |
|
116 |
#' \item \strong{\code{doi}}: Digital Object Identifier, from which a link is made (\code{https://doi.org/{doi}}). |
|
117 |
#' \item \strong{\code{version}}: Version number of software. |
|
118 |
#' \item \strong{\code{url}}: Link to the publication, alternative to a DOI. |
|
119 |
#' } |
|
120 |
#' @examples |
|
121 |
#' path <- tempfile() |
|
122 |
#' |
|
123 |
#' # make an initial file |
|
124 |
#' dcf_measure_info(path, "measure_name" = list( |
|
125 |
#' id = "measure_name", |
|
126 |
#' short_description = "A measure.", |
|
127 |
#' statement = "This entity has {value} measure units." |
|
128 |
#' ), verbose = FALSE) |
|
129 |
#' |
|
130 |
#' # add another measure to that |
|
131 |
#' measure_info <- dcf_measure_info(path, "measure_two" = list( |
|
132 |
#' id = "measure_two", |
|
133 |
#' short_description = "Another measure.", |
|
134 |
#' statement = "This entity has {value} measure units." |
|
135 |
#' ), verbose = FALSE) |
|
136 |
#' names(measure_info) |
|
137 |
#' |
|
138 |
#' # add a dynamic measure, and make a rendered version |
|
139 |
#' measure_info_rendered <- dcf_measure_info( |
|
140 |
#' path, |
|
141 |
#' "measure_{category}_{variant.name}" = list( |
|
142 |
#' id = "measure_{category}_{variant.name}", |
|
143 |
#' short_description = "Another measure ({category}; {variant}).", |
|
144 |
#' statement = "This entity has {value} {category} {variant}s.", |
|
145 |
#' categories = c("a", "b"), |
|
146 |
#' variants = list(u1 = list(default = "U1"), u2 = list(default = "U2")) |
|
147 |
#' ), |
|
148 |
#' render = TRUE, verbose = FALSE |
|
149 |
#' ) |
|
150 |
#' names(measure_info_rendered) |
|
151 |
#' measure_info_rendered[["measure_a_u1"]]$statement |
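#'
#' # add a reference entry and cite it from a measure:
#' measure_info <- dcf_measure_info(
#'   path,
#'   "measure_name" = list(citations = "ref1"),
#'   references = list(
#'     ref1 = list(
#'       author = list(given = "First", family = "Author"),
#'       year = 2020,
#'       title = "A Title"
#'     )
#'   ),
#'   verbose = FALSE
#' )
#' names(measure_info$`_references`)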
|
152 |
#' @return An invisible list containing measurement metadata (the rendered version if made). |
|
153 |
#' @export |
|
154 | ||
155 |
dcf_measure_info <- function( |
|
156 |
path, |
|
157 |
..., |
|
158 |
info = list(), |
|
159 |
references = list(), |
|
160 |
strict = FALSE, |
|
161 |
include_empty = TRUE, |
|
162 |
overwrite_entry = FALSE, |
|
163 |
render = NULL, |
|
164 |
overwrite = FALSE, |
|
165 |
write = TRUE, |
|
166 |
verbose = TRUE, |
|
167 |
open_after = interactive() |
|
168 |
) { |
|
169 | 14x |
if (write) { |
170 | 7x |
if (missing(path) || !is.character(path)) { |
171 | ! |
cli::cli_abort( |
172 | ! |
"enter a path to the measure_info.json file as {.arg path}" |
173 |
) |
|
174 |
} |
|
175 | 7x |
dir.create(dirname(path), FALSE, TRUE) |
176 |
} |
|
177 | 14x |
built <- list() |
178 | 14x |
if (!overwrite && is.character(path) && file.exists(path)) { |
179 | 12x |
if (verbose) { |
180 | 4x |
cli::cli_bullets(c( |
181 | 4x |
i = "updating existing file: {.path {basename(path)}}" |
182 |
)) |
|
183 |
} |
|
184 | 12x |
built <- jsonlite::read_json(path) |
185 | 12x |
if (all(c("id", "measure_type") %in% names(built))) { |
186 | ! |
built <- list(built) |
187 | ! |
names(built) <- built[[1]]$id |
188 |
} |
|
189 |
} |
|
190 | 14x |
if (length(references)) { |
191 | 1x |
references <- c(references, built$`_references`) |
192 | 1x |
references <- references[!duplicated(names(references))] |
193 | 1x |
built$`_references` <- references |
194 |
} else { |
|
195 | 13x |
references <- built$`_references` |
196 |
} |
|
197 | 14x |
defaults <- list( |
198 | 14x |
id = "", |
199 | 14x |
short_name = "", |
200 | 14x |
long_name = "", |
201 | 14x |
category = "", |
202 | 14x |
short_description = "", |
203 | 14x |
long_description = "", |
204 | 14x |
statement = "", |
205 | 14x |
measure_type = "", |
206 | 14x |
unit = "", |
207 | 14x |
time_resolution = "", |
208 | 14x |
restrictions = "", |
209 | 14x |
sources = list(), |
210 | 14x |
citations = list() |
211 |
) |
|
212 | 14x |
if (!is.list(info)) { |
213 | ! |
info <- sapply(info, function(name) list()) |
214 |
} |
|
215 | 14x |
info <- c(list(...), info) |
216 | 14x |
if (length(info) && is.null(names(info))) { |
217 | ! |
cli::cli_abort("supplied measure entries must be named") |
218 |
} |
|
219 | 14x |
for (n in names(info)) { |
220 | 7x |
if (overwrite_entry || is.null(built[[n]])) { |
221 | 6x |
l <- info[[n]] |
222 |
} else { |
|
223 | 1x |
l <- c(info[[n]], built[[n]]) |
224 | 1x |
l <- l[!duplicated(names(l))] |
225 |
} |
|
226 | 7x |
if (is.null(l$id)) { |
227 | 2x |
l$id <- n |
228 |
} |
|
229 | 7x |
if (strict) { |
230 | 1x |
su <- names(l) %in% names(defaults) |
231 | 1x |
if (verbose && any(!su)) { |
232 | 1x |
cli::cli_warn(paste0( |
233 | 1x |
"unrecognized {?entry/entries} in ", |
234 | 1x |
n, |
235 | 1x |
": {names(l)[!su]}" |
236 |
)) |
|
237 |
} |
|
238 | 1x |
if (include_empty) { |
239 | ! |
for (e in names(l)) { |
240 | ! |
if (!is.null(defaults[[e]])) { |
241 | ! |
defaults[[e]] <- l[[e]] |
242 |
} |
|
243 |
} |
|
244 | ! |
l <- defaults |
245 |
} else { |
|
246 | 1x |
l <- l[su] |
247 |
} |
|
248 | 6x |
} else if (include_empty) { |
249 | 5x |
su <- !names(defaults) %in% names(l) |
250 | 5x |
if (any(su)) l <- c(l, defaults[su]) |
251 |
} |
|
252 | 7x |
if (!is.null(l$categories) && !is.list(l$categories)) { |
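# promote a plain character vector of categories to a named list of defaults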
253 | 1x |
l$categories <- structure( |
254 | 1x |
lapply(l$categories, function(e) list(default = e)), |
255 | 1x |
names = l$categories |
256 |
) |
|
257 |
} |
|
258 | 7x |
if (!is.null(l$variants) && !is.list(l$variants)) { |
259 | ! |
l$variants <- structure( |
260 | ! |
lapply(l$variants, function(e) list(default = e)), |
261 | ! |
names = l$variants
262 |
) |
|
263 |
} |
|
264 | 7x |
if (verbose && !is.null(l$citations)) { |
265 | 5x |
su <- !l$citations %in% names(references) |
266 | 5x |
if (any(su)) { |
267 | 1x |
cli::cli_warn( |
268 | 1x |
"no matching reference entry for {.val {l$citations[su]}} in {.val {n}}" |
269 |
) |
|
270 |
} |
|
271 |
} |
|
272 | 7x |
built[[n]] <- l |
273 |
} |
|
274 | 14x |
built <- built[order(grepl("^_", names(built)))] |
275 | 14x |
if (write) { |
276 | 7x |
if (verbose) { |
277 | 5x |
cli::cli_bullets(c(i = "writing info to {.path {path}}")) |
278 |
} |
|
279 | 7x |
jsonlite::write_json(built, path, auto_unbox = TRUE, pretty = TRUE) |
280 |
} |
|
281 | 14x |
if (!is.null(render)) { |
282 | 8x |
expanded <- list() |
283 | 8x |
for (name in names(built)) { |
284 | 14x |
expanded <- c( |
285 | 14x |
expanded, |
286 | 14x |
if (grepl("{", name, fixed = TRUE)) { |
287 | 1x |
render_info(built[name]) |
288 |
} else { |
|
289 | 13x |
structure(list(built[[name]]), names = name) |
290 |
} |
|
291 |
) |
|
292 |
} |
|
293 | 8x |
changed <- !identical(built, expanded) |
294 | 8x |
built <- expanded |
295 | 8x |
if (write && changed) { |
296 | 1x |
path <- if (is.character(render)) { |
297 | ! |
render |
298 |
} else { |
|
299 | 1x |
sub("\\.json", "_rendered.json", path, TRUE) |
300 |
} |
|
301 | 1x |
if (verbose) { |
302 | 1x |
cli::cli_bullets(c(i = "writing rendered info to {.path {path}}")) |
303 |
} |
|
304 | 1x |
jsonlite::write_json(built, path, auto_unbox = TRUE, pretty = TRUE) |
305 |
} |
|
306 |
} |
|
307 | 14x |
if (open_after) { |
308 | ! |
rstudioapi::navigateToFile(path) |
309 |
} |
|
310 | 14x |
invisible(built) |
311 |
} |
|
312 | ||
313 |
replace_equations <- function(info) { |
|
314 | 3x |
lapply(info, function(e) { |
315 | ! |
if (!is.list(e)) e <- list(default = e) |
316 | 4x |
descriptions <- grep("description", names(e), fixed = TRUE) |
317 | 4x |
if (length(descriptions)) { |
318 | 4x |
for (d in descriptions) { |
319 | 8x |
p <- gregexpr( |
320 | 8x |
"(?:\\$|\\\\\\[|\\\\\\(|\\\\begin\\{math\\})(.+?)(?:\\$|\\\\\\]|\\\\\\)|\\\\end\\{math\\})(?=\\s|$)", |
321 | 8x |
e[[d]], |
322 | 8x |
perl = TRUE |
323 | 8x |
)[[1]] |
324 | 8x |
if (p[[1]] != -1) { |
325 | ! |
re <- paste("", e[[d]], "") |
326 | ! |
fm <- regmatches(e[[d]], p) |
327 | ! |
for (i in seq_along(p)) { |
328 | ! |
mp <- attr(p, "capture.start")[i, ] |
329 | ! |
eq <- substring(e[[d]], mp, mp + attr(p, "capture.length")[i, ] - 1) |
330 | ! |
parsed <- tryCatch( |
331 | ! |
katex::katex_mathml(eq), |
332 | ! |
error = function(e) NULL |
333 |
) |
|
334 | ! |
if (!is.null(parsed)) { |
335 | ! |
re <- paste( |
336 | ! |
strsplit(re, fm[[i]], fixed = TRUE)[[1]], |
337 | ! |
collapse = sub("^<[^>]*>", "", sub("<[^>]*>$", "", parsed)) |
338 |
) |
|
339 |
} |
|
340 |
} |
|
341 | ! |
e[[d]] <- gsub("^ | $", "", re) |
342 |
} |
|
343 |
} |
|
344 |
} |
|
345 | ! |
if (is.list(e$categories)) e$categories <- replace_equations(e$categories) |
346 | ! |
if (is.list(e$variants)) e$variants <- replace_equations(e$variants) |
347 | 4x |
e |
348 |
}) |
|
349 |
} |
|
350 | ||
351 |
preprocess <- function(l) { |
|
352 | ! |
if (!is.list(l)) l <- sapply(l, function(n) list()) |
353 | 2x |
ns <- names(l) |
354 | 2x |
for (i in seq_along(l)) { |
355 | 4x |
name <- if (ns[i] == "blank") "" else ns[i] |
356 | 4x |
l[[i]]$name <- name |
357 | 1x |
if (is.null(l[[i]]$default)) l[[i]]$default <- name |
358 |
} |
|
359 | 2x |
l |
360 |
} |
|
361 | ||
362 |
replace_dynamic <- function(e, p, s, v = NULL, default = "default") { |
|
363 | 68x |
m <- gregexpr(p, e) |
364 | 68x |
if (m[[1]][[1]] != -1) { |
365 | 16x |
t <- regmatches(e, m)[[1]] |
366 | 16x |
tm <- structure(gsub("\\{[^.]+\\.?|\\}", "", t), names = t) |
367 | 16x |
tm <- tm[!duplicated(names(tm))] |
368 | 16x |
tm[tm == ""] <- default |
369 | 16x |
for (tar in names(tm)) { |
370 | 32x |
us <- (if (is.null(v) || substring(tar, 2, 2) == "c") s else v) |
371 | 32x |
entry <- tm[[tar]] |
372 | 32x |
if (is.null(us[[entry]]) && grepl("description", entry, fixed = TRUE)) { |
373 | 8x |
entry <- default <- "description" |
374 |
} |
|
375 | 20x |
if (is.null(us[[entry]]) && entry == default) entry <- "default" |
376 | 32x |
if (is.null(us[[entry]])) |
377 | ! |
cli::cli_abort("failed to render measure info from {tar}") |
378 | 32x |
e <- gsub(tar, us[[entry]], e, fixed = TRUE) |
379 |
} |
|
380 |
} |
|
381 | 68x |
e |
382 |
} |
|
383 | ||
384 |
prepare_source <- function(o, s, p) { |
|
385 | 8x |
if (length(o)) { |
386 | 8x |
lapply(o, function(e) { |
387 | 2x |
if (is.character(e) && length(e) == 1) replace_dynamic(e, p, s) else e |
388 |
}) |
|
389 |
} else { |
|
390 | ! |
list(name = "", default = "") |
391 |
} |
|
392 |
} |
|
393 | ||
394 |
render_info_names <- function(infos) { |
|
395 | ! |
r <- lapply(names(infos), function(n) render_info(infos[n], TRUE)) |
396 | ! |
structure(rep(names(infos), vapply(r, length, 0)), names = unlist(r)) |
397 |
} |
|
398 | ||
399 |
render_info <- function(info, names_only = FALSE) { |
|
400 | 1x |
base_name <- names(info) |
401 | 1x |
base <- info[[1]] |
402 | 1x |
if (is.null(base$categories) && is.null(base$variants)) { |
403 | ! |
return(if (names_only) base_name else info) |
404 |
} |
|
405 | 1x |
categories <- preprocess(base$categories) |
406 | 1x |
variants <- preprocess(base$variants) |
407 | 1x |
base$categories <- NULL |
408 | 1x |
base$variants <- NULL |
409 | 1x |
expanded <- NULL |
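# enumerate every category-variant name combination, joined with a "|||"
# placeholder so the pairs can be split back apart below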
410 | 1x |
vars <- strsplit( |
411 | 1x |
as.character(outer( |
412 | 1x |
if (is.null(names(categories))) "" else names(categories), |
413 | 1x |
if (is.null(names(variants))) "" else names(variants), |
414 | 1x |
paste, |
415 | 1x |
sep = "|||" |
416 |
)), |
|
417 |
"|||", |
|
418 | 1x |
fixed = TRUE |
419 |
) |
|
420 | 1x |
for (var in vars) { |
421 | 4x |
cs <- if (var[1] == "") list() else categories[[var[1]]] |
422 | 4x |
vs <- if (length(var) == 1 || var[2] == "") list() else variants[[var[2]]] |
423 | 4x |
cs <- prepare_source(cs, vs, "\\{variants?(?:\\.[^}]+?)?\\}") |
424 | 4x |
vs <- prepare_source(vs, cs, "\\{categor(?:y|ies)(?:\\.[^}]+?)?\\}") |
425 | 4x |
s <- c(cs, vs[!names(vs) %in% names(cs)]) |
426 | 4x |
p <- "\\{(?:categor(?:y|ies)|variants?)(?:\\.[^}]+?)?\\}" |
427 | 4x |
key <- replace_dynamic(base_name, p, cs, vs) |
428 | 4x |
if (names_only) { |
429 | ! |
expanded <- c(expanded, key) |
430 |
} else { |
|
431 | 4x |
expanded[[key]] <- c( |
432 | 4x |
structure( |
433 | 4x |
lapply(names(base), function(n) { |
434 | 52x |
e <- base[[n]] |
435 | 52x |
if (is.character(e) && length(e) == 1) |
436 | 44x |
e <- replace_dynamic(e, p, cs, vs, n) |
437 | 52x |
e |
438 |
}), |
|
439 | 4x |
names = names(base) |
440 |
), |
|
441 | 4x |
s[ |
442 | 4x |
!names(s) %in% |
443 | 4x |
c( |
444 | 4x |
"default", |
445 | 4x |
"name", |
446 | 4x |
if (any(base[c("long_description", "short_description")] != "")) |
447 | 4x |
"description", |
448 | 4x |
names(base) |
449 |
) |
|
450 |
] |
|
451 |
) |
|
452 |
} |
|
453 |
} |
|
454 | 1x |
expanded |
455 |
} |
1 |
#' Read Epic Cosmos Data |
|
2 |
#' |
|
3 |
#' Read in metadata and data from an Epic Cosmos file. |
|
4 |
#' |
|
5 |
#' @param path Path to the file. |
|
6 |
#' @param path_root Directory containing \code{path}, if it is not full. |
|
7 |
#' @returns A list with entries for \code{metadata} (a list) and \code{data} (a \code{data.frame}).
|
8 |
#' |
|
9 |
#' @examples |
|
10 |
#' # write an example file |
|
11 |
#' path <- tempfile(fileext = ".csv") |
|
12 |
#' raw_lines <- c( |
|
13 |
#' "metadata field,metadata value,", |
|
14 |
#' ",,", |
|
15 |
#' ",Measures,Value Name", |
|
16 |
#' "Year,Measure 1,", |
|
17 |
#' "2020,m1,1", |
|
18 |
#' ",m2,2", |
|
19 |
#' "2021,m1,3", |
|
20 |
#' ",m2,4" |
|
21 |
#' ) |
|
22 |
#' writeLines(raw_lines, path) |
|
23 |
#' |
|
24 |
#' # read it in |
|
25 |
#' dcf_read_epic(basename(path), dirname(path)) |
|
26 |
#' |
|
27 |
#' @export |
|
28 | ||
29 |
dcf_read_epic <- function(path, path_root = ".") { |
|
30 | 2x |
full_path <- if (file.exists(path)) { |
31 | 2x |
path |
32 |
} else { |
|
33 | ! |
sub("//", "/", paste0(path_root, "/", path), fixed = TRUE) |
34 |
} |
|
35 | 2x |
lines <- readLines(full_path, n = 25L, skipNul = FALSE) |
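# Epic exports open with metadata rows; blank (comma-only) lines
# separate them from the data table below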
36 | 2x |
metadata_break <- grep("^[, ]*$", lines) |
37 | 2x |
if (!length(metadata_break)) { |
38 | ! |
cli::cli_abort( |
39 | ! |
"path does not appear to point to a file in the Epic format (no metadata separation)" |
40 |
) |
|
41 |
} |
|
42 | 2x |
meta_end <- min(metadata_break) - 1L |
43 | 2x |
data_start <- (if (length(metadata_break) == 1L) { |
44 | 2x |
metadata_break |
45 |
} else { |
|
46 | ! |
max(metadata_break[ |
47 | ! |
metadata_break == c(-1L, metadata_break[-1L]) |
48 |
]) |
|
49 |
}) + |
|
50 | 2x |
1L |
51 | 2x |
meta <- c( |
52 | 2x |
list( |
53 | 2x |
file = path, |
54 | 2x |
md5 = unname(tools::md5sum(full_path)), |
55 | 2x |
date_processed = Sys.time(), |
56 | 2x |
standard_name = "" |
57 |
), |
|
58 | 2x |
as.list(unlist(lapply( |
59 | 2x |
strsplit(sub(",+$", "", lines[seq_len(meta_end)]), ",", fixed = TRUE), |
60 | 2x |
function(r) { |
61 | 2x |
l <- list(paste(r[-1L], collapse = ",")) |
62 | 2x |
if (l[[1]] == "") { |
63 | ! |
r <- strsplit(r, ": ", fixed = TRUE)[[1L]] |
64 | ! |
l <- list(paste(r[-1L], collapse = ",")) |
65 |
} |
|
66 | 2x |
names(l) <- r[[1L]] |
67 | 2x |
l[[1L]] <- gsub('^"|"$', "", l[[1L]]) |
68 | 2x |
l |
69 |
} |
|
70 |
))) |
|
71 |
) |
|
72 | 2x |
standard_names <- c( |
73 | 2x |
vaccine_mmr = "MMR receipt", |
74 | 2x |
rsv_tests = "RSV tests", |
75 | 2x |
flu = "Influenza", |
76 | 2x |
self_harm = "self-harm", |
77 | 2x |
covid = "COVID", |
78 | 2x |
rsv = "RSV", |
79 | 2x |
obesity = "BMI", |
80 | 2x |
obesity = "obesity", |
81 | 2x |
all_encounters = "All ED Encounters" |
82 |
) |
|
83 | 2x |
meta_string <- paste(unlist(meta), collapse = " ") |
84 | 2x |
for (i in seq_along(standard_names)) { |
85 | 12x |
if (grepl(standard_names[[i]], meta_string, fixed = TRUE)) { |
86 | 2x |
meta$standard_name = names(standard_names)[[i]] |
87 | 2x |
break |
88 |
} |
|
89 |
} |
|
90 | 2x |
id_cols <- seq_len( |
91 | 2x |
length(strsplit(lines[data_start], "^,|Measures,")[[1L]]) - 1L |
92 |
) |
|
93 | 2x |
header <- c( |
94 | 2x |
strsplit(lines[data_start + 1L], ",", fixed = TRUE)[[1L]][id_cols], |
95 | 2x |
strsplit(lines[data_start], ",", fixed = TRUE)[[1L]][-id_cols] |
96 |
) |
|
97 | 2x |
data <- arrow::read_csv_arrow( |
98 | 2x |
full_path, |
99 | 2x |
col_names = header, |
100 | 2x |
col_types = paste(rep("c", length(header)), collapse = ""), |
101 | 2x |
na = c("", "-"), |
102 | 2x |
skip = data_start + 1L |
103 |
) |
|
104 | 2x |
percents <- grep("^(?:Percent|Base|RSV test)", header) |
105 | 2x |
if (length(percents)) { |
106 | ! |
for (col in percents) { |
107 | ! |
data[[col]] <- sub("%", "", data[[col]], fixed = TRUE) |
108 |
} |
|
109 |
} |
|
110 | 2x |
number <- grep("Number", header, fixed = TRUE) |
111 | 2x |
if (length(number)) { |
112 | ! |
for (col in number) { |
113 | ! |
data[[col]][data[[col]] == "10 or fewer"] <- 5L |
114 |
} |
|
115 |
} |
|
116 | 2x |
for (col in id_cols) { |
117 | 10x |
data[[col]] <- vctrs::vec_fill_missing(data[[col]], "down") |
118 |
} |
|
119 | 2x |
if (all(c("Measures", "Base Patient") %in% colnames(data))) { |
120 | ! |
data <- Reduce( |
121 | ! |
merge, |
122 | ! |
lapply(split(data, data$Measures), function(d) { |
123 | ! |
measure <- d$Measures[[1L]] |
124 | ! |
d[[measure]] <- d[["Base Patient"]] |
125 | ! |
d[, !(colnames(d) %in% c("Measures", "Base Patient"))] |
126 |
}) |
|
127 |
) |
|
128 |
} |
|
129 | 2x |
colnames(data) <- standard_columns(colnames(data)) |
130 | 2x |
if (meta$standard_name == "obesity") { |
131 | ! |
meta$standard_name <- paste0( |
132 | ! |
meta$standard_name, |
133 |
"_", |
|
134 | ! |
if ("state" %in% colnames(data)) "state" else "county" |
135 |
) |
|
136 |
} |
|
137 | 2x |
if ("age" %in% colnames(data)) { |
138 | 2x |
std_age <- standard_age(data$age) |
139 | 2x |
missed_ages <- (data$age != "No value") & is.na(std_age) |
140 | 2x |
if (any(missed_ages)) { |
141 | ! |
std_age[missed_ages] <- data$age[missed_ages] |
142 | ! |
missed_levels <- unique(data$age[missed_ages]) |
143 | ! |
cli::cli_warn("missed age levels: {.field {missed_levels}}") |
144 |
} |
|
145 | 2x |
data$age <- std_age |
146 |
} |
|
147 | 2x |
list(metadata = meta, data = data) |
148 |
} |
|
149 | ||
150 |
standard_age <- function(age) { |
|
151 | 2x |
c( |
152 | 2x |
`less than 1 years` = "<1 Years", |
153 | 2x |
`1 and < 2 years` = "1-2 Years", |
154 | 2x |
`2 and < 3 years` = "2-3 Years", |
155 | 2x |
`3 and < 4 years` = "3-4 Years", |
156 | 2x |
`1 and < 5 years` = "1-4 Years", |
157 | 2x |
`1 year or more and less than 5 years` = "1-4 Years", |
158 | 2x |
`4 and < 5 years` = "4-5 Years", |
159 | 2x |
`less than 5 years` = "<5 Years", |
160 | 2x |
`5 and < 6 years` = "5-6 Years", |
161 | 2x |
`5 and < 18 years` = "5-17 Years", |
162 | 2x |
`5 years or more and less than 18 years (1)` = "5-17 Years", |
163 | 2x |
`6 and < 7 years` = "6-7 Years", |
164 | 2x |
`6 years or more` = "6+ Years", |
165 | 2x |
`7 and < 8 years` = "7-8 Years", |
166 | 2x |
`8 and < 9 years` = "8-9 Years", |
167 | 2x |
`9 years or more` = "9+ Years", |
168 | 2x |
`less than 10 years` = "<10 Years", |
169 | 2x |
`10 and < 15 years` = "10-14 Years", |
170 | 2x |
`15 and < 20 years` = "15-19 Years", |
171 | 2x |
`18 and < 40 years` = "18-39 Years", |
172 | 2x |
`18 and < 50 years` = "18-49 Years", |
173 | 2x |
`18 years or more and less than 50 years` = "18-49 Years", |
174 | 2x |
`20 and < 40 years` = "20-39 Years", |
175 | 2x |
`40 and < 65 years` = "40-64 Years", |
176 | 2x |
`50 and < 65 years` = "50-64 Years", |
177 | 2x |
`50 years or more and less than 64 years` = "50-64 Years", |
178 | 2x |
`65 years or more` = "65+ Years", |
179 | 2x |
`65 and < 110 years` = "65+ Years", |
180 | 2x |
`total` = "Total" |
181 |
)[ |
|
182 | 2x |
sub("^[^a-z0-9]+|:.*$", "", tolower(age)) |
183 |
] |
|
184 |
} |
|
185 | ||
186 |
standard_columns <- function(cols) { |
|
187 | 2x |
cols <- gsub(" ", "_", sub("number of ", "n_", tolower(cols)), fixed = TRUE) |
188 | 2x |
cols[grep("^age", cols)] <- "age" |
189 | 2x |
cols[grep("^state", cols)] <- "state" |
190 | 2x |
cols[grep("^county", cols)] <- "county" |
191 | 2x |
cols[grep("bmi_30", cols)] <- "bmi_30_49.8" |
192 | 2x |
cols[grep("hemoglobin_a1c_7", cols)] <- "hemoglobin_a1c_7" |
193 | 2x |
cols[grep("mmr_receipt", cols)] <- "mmr_receipt" |
194 | 2x |
cols[grep("^rsv_tests", cols)] <- "rsv_tests" |
195 | 2x |
cols |
196 |
} |
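As a quick reference for the lookup table above, a sketch of how the internal `standard_age()` helper normalizes raw Epic age labels (case and leading punctuation are stripped before matching; unmatched labels come back as `NA`):

```R
# assumes the file above is sourced; standard_age() is internal to the package
standard_age(c("Less than 1 years", "65 and < 110 years", "Total", "unknown"))
# -> "<1 Years" "65+ Years" "Total" NA (a named character vector)
```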
1 |
#' Check Data Sources |
|
2 |
#' |
|
3 |
#' Check the data files and measure info of source projects. |
|
4 |
#' |
|
5 |
#' @param names Name or names of source projects. |
|
6 |
#' @param project_dir Path to the Data Collection Framework project. |
|
7 |
#' @param verbose Logical; if \code{FALSE}, will not print status messages. |
|
8 |
#' @returns A list with an entry for each source, containing a character vector |
|
9 |
#' including any issue codes: |
|
10 |
#' \itemize{ |
|
11 |
#' \item \code{not_compressed}: The file does not appear to be compressed. |
|
12 |
#' \item \code{cant_read}: Failed to read the file in. |
|
13 |
#' \item \code{geography_missing}: File does not contain a \code{geography} column. |
|
14 |
#' \item \code{geography_nas}: The file's \code{geography} column contains NAs. |
|
15 |
#' \item \code{time_missing}: File does not contain a \code{time} column. |
|
16 |
#' \item \code{time_nas}: The file's \code{time} column contains NAs. |
|
17 |
#' \item \code{missing_info: {column_name}}: The file's indicated column does not have |
|
18 |
#' a matching entry in \code{measure_info.json}. |
|
19 |
#' } |
|
20 |
#' @examples |
|
21 |
#' \dontrun{ |
|
22 |
#' dcf_check_sources("gtrends") |
|
23 |
#' } |
|
24 |
#' @export |
|
25 | ||
26 |
dcf_check_sources <- function( |
|
27 |
names = NULL, |
|
28 |
project_dir = ".", |
|
29 |
verbose = TRUE |
|
30 |
) { |
|
31 | 3x |
settings <- dcf_read_settings(project_dir) |
32 | 3x |
base_dir <- paste0(project_dir, "/", settings$data_dir) |
33 | 3x |
if (is.null(names)) { |
34 | 2x |
names <- list.dirs(base_dir, recursive = FALSE, full.names = FALSE) |
35 |
} |
|
36 | 3x |
issues <- list() |
37 | 3x |
for (name in names) { |
38 | 4x |
source_dir <- paste0(base_dir, "/", name, "/") |
39 | 4x |
if (!dir.exists(source_dir)) { |
40 | ! |
cli::cli_abort("specify the name of an existing data source") |
41 |
} |
|
42 | 4x |
process_file <- paste0(source_dir, "process.json") |
43 | 4x |
dcf_add_source(name, project_dir, FALSE) |
44 | 4x |
if (!file.exists(process_file)) { |
45 | ! |
cli::cli_abort("{name} does not appear to be a data source project") |
46 |
} |
|
47 | 4x |
process <- dcf_process_record(process_file) |
48 | 4x |
info_file <- paste0(source_dir, "measure_info.json") |
49 | 4x |
info <- tryCatch( |
50 | 4x |
dcf_measure_info( |
51 | 4x |
info_file, |
52 | 4x |
render = TRUE, |
53 | 4x |
write = FALSE, |
54 | 4x |
verbose = FALSE, |
55 | 4x |
open_after = FALSE |
56 |
), |
|
57 | 4x |
error = function(e) NULL |
58 |
) |
|
59 | 4x |
if (is.null(info)) { |
60 | ! |
cli::cli_abort("{.file {info_file}} is malformed") |
61 |
} |
|
62 | 4x |
if (verbose) { |
63 | 4x |
cli::cli_bullets(c("", "Checking data source {.strong {name}}")) |
64 |
} |
|
65 | 4x |
data_files <- list.files( |
66 | 4x |
paste0(source_dir, "standard"), |
67 | 4x |
"\\.(?:csv|parquet)", |
68 | 4x |
full.names = TRUE |
69 |
) |
|
70 | 4x |
source_issues <- list() |
71 | 4x |
for (file in list.files( |
72 | 4x |
paste0(source_dir, "raw"), |
73 | 4x |
"csv$", |
74 | 4x |
full.names = TRUE |
75 |
)) { |
|
76 | 1x |
source_issues[[file]] <- list(data = "not_compressed") |
77 |
} |
|
78 | 4x |
if (length(data_files)) { |
79 | 3x |
for (file in data_files) { |
80 | 3x |
issue_messages <- NULL |
81 | 3x |
if (verbose) { |
82 | 3x |
cli::cli_progress_step("checking file {.file {file}}", spinner = TRUE) |
83 |
} |
|
84 | 3x |
data_issues <- NULL |
85 | 3x |
measure_issues <- NULL |
86 | 3x |
data <- tryCatch( |
87 | 3x |
if (grepl("parquet$", file)) { |
88 | ! |
dplyr::collect(arrow::read_parquet(file)) |
89 |
} else { |
|
90 | 3x |
con <- gzfile(file) |
91 | 3x |
on.exit(close(con)) |
92 | 3x |
vroom::vroom(con, show_col_types = FALSE) |
93 |
}, |
|
94 | 3x |
error = function(e) NULL |
95 |
) |
|
96 | 3x |
if (is.null(data)) { |
97 | ! |
data_issues <- c(data_issues, "cant_read") |
98 |
} else { |
|
99 | 3x |
if (grepl("csv$", file)) { |
100 | 1x |
data_issues <- c(data_issues, "not_compressed") |
101 | 1x |
if (verbose) { |
102 | 1x |
issue_messages <- c( |
103 | 1x |
issue_messages, |
104 | 1x |
"file is not compressed" |
105 |
) |
|
106 |
} |
|
107 |
} |
|
108 | 3x |
if (!("geography" %in% colnames(data))) { |
109 | ! |
data_issues <- c(data_issues, "geography_missing") |
110 | ! |
if (verbose) { |
111 | ! |
issue_messages <- c( |
112 | ! |
issue_messages, |
113 | ! |
"missing {.emph geography} column" |
114 |
) |
|
115 |
} |
|
116 | 3x |
} else if (anyNA(data$geography)) { |
117 | 1x |
data_issues <- c(data_issues, "geography_nas") |
118 | 1x |
if (verbose) { |
119 | 1x |
issue_messages <- c( |
120 | 1x |
issue_messages, |
121 | 1x |
"{.emph geography} column contains NAs" |
122 |
) |
|
123 |
} |
|
124 |
} |
|
125 | 3x |
if (!("time" %in% colnames(data))) { |
126 | ! |
data_issues <- c(data_issues, "time_missing") |
127 | ! |
if (verbose) { |
128 | ! |
issue_messages <- c( |
129 | ! |
issue_messages, |
130 | ! |
"missing {.emph time} column" |
131 |
) |
|
132 |
} |
|
133 | 3x |
} else if (anyNA(data$time)) { |
134 | 1x |
data_issues <- c(data_issues, "time_nas") |
135 | 1x |
if (verbose) { |
136 | 1x |
issue_messages <- c( |
137 | 1x |
issue_messages, |
138 | 1x |
"{.emph time} column contains NAs" |
139 |
) |
|
140 |
} |
|
141 |
} |
|
142 | 3x |
for (col in colnames(data)) { |
143 | 9x |
if (!(col %in% c("geography", "time")) && !(col %in% names(info))) { |
144 | 1x |
measure_issues <- c(measure_issues, paste("missing_info:", col)) |
145 | 1x |
if (verbose) { |
146 | 1x |
issue_messages <- c( |
147 | 1x |
issue_messages, |
148 | 1x |
paste0( |
149 | 1x |
"{.emph ", |
150 | 1x |
col, |
151 | 1x |
"} column does not have an entry in measure_info" |
152 |
) |
|
153 |
) |
|
154 |
} |
|
155 |
} |
|
156 |
} |
|
157 |
} |
|
158 | 3x |
file_issues <- list() |
159 | 3x |
if (length(data_issues)) { |
160 | 1x |
file_issues$data <- data_issues |
161 |
} |
|
162 | 3x |
if (length(measure_issues)) { |
163 | 1x |
file_issues$measures <- measure_issues |
164 |
} |
|
165 | 3x |
source_issues[[file]] <- file_issues |
166 | 3x |
if (verbose) { |
167 | 3x |
if (length(issue_messages)) { |
168 | 1x |
cli::cli_progress_done(result = "failed") |
169 | 1x |
cli::cli_bullets(structure( |
170 | 1x |
issue_messages, |
171 | 1x |
names = rep(" ", length(issue_messages)) |
172 |
)) |
|
173 |
} else { |
|
174 | 2x |
cli::cli_progress_done() |
175 |
} |
|
176 |
} |
|
177 |
} |
|
178 |
} else { |
|
179 | 1x |
if (verbose) cli::cli_alert_info("no standard data files found to check") |
180 |
} |
|
181 | 4x |
if (!identical(process$check_results, source_issues)) { |
182 | 3x |
process$checked <- Sys.time() |
183 | 3x |
process$check_results <- source_issues |
184 | 3x |
dcf_process_record(process_file, process) |
185 |
} |
|
186 | 4x |
issues[[name]] <- source_issues |
187 |
} |
|
188 | ||
189 | 3x |
invisible(issues) |
190 |
} |
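A usage sketch (the source name is illustrative): the invisible return maps each source to per-file issue codes, so it can be flattened for a quick summary.

```R
# run from a Data Collection Framework project root
issues <- dcf_check_sources("gtrends", project_dir = ".")
# flatten one source's entries to a named vector of issue codes
unlist(issues$gtrends)
```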
1 |
#' Process Epic Staging Files |
|
2 |
#' |
|
3 |
#' Process Epic staging files, lightly standardizing them and moving them to raw. |
|
4 |
#' |
|
5 |
#' @param staging_dir Directory containing the staging files. |
|
6 |
#' @param out_dir Directory to write new raw files to. |
|
7 |
#' @param verbose Logical; if \code{FALSE}, will not show status messages. |
|
8 |
#' @param cleanup Logical; if \code{FALSE}, will not remove staging files after they are processed. |
|
9 |
#' @returns \code{NULL} if no staging files are found. |
|
10 |
#' Otherwise, a list with entries for \code{data} and \code{metadata}. |
|
11 |
#' Each of these is a list with entries for each recognized standard name, |
|
12 |
#' with potentially combined outputs similar to \code{\link{dcf_read_epic}}. |
|
13 |
#' |
|
14 |
#' @examples |
|
15 |
#' \dontrun{ |
|
16 |
#' # run from a source project |
|
17 |
#' dcf_process_epic_staging() |
|
18 |
#' } |
|
19 |
#' |
|
20 |
#' @export |
|
21 | ||
22 |
dcf_process_epic_staging <- function( |
|
23 |
staging_dir = "raw/staging", |
|
24 |
out_dir = "raw", |
|
25 |
verbose = TRUE, |
|
26 |
cleanup = TRUE |
|
27 |
) { |
|
28 | 1x |
files <- sort(list.files( |
29 | 1x |
staging_dir, |
30 | 1x |
"csv", |
31 | 1x |
full.names = TRUE, |
32 | 1x |
recursive = TRUE |
33 |
)) |
|
34 | 1x |
files <- files[!grepl("census", files)] |
35 | 1x |
if (!length(files)) { |
36 | ! |
if (verbose) { |
37 | ! |
cli::cli_progress_message("no staging files found") |
38 |
} |
|
39 | ! |
return(NULL) |
40 |
} |
|
41 | 1x |
id_cols <- c("state", "county", "age", "year", "month", "week") |
42 | 1x |
metadata <- list() |
43 | 1x |
data <- list() |
44 | 1x |
for (file in files) { |
45 | 2x |
if (verbose) { |
46 | 2x |
cli::cli_progress_step("processing file {.file {file}}", spinner = TRUE) |
47 |
} |
|
48 | 2x |
epic <- tryCatch(dcf_read_epic(file), error = function(e) NULL) |
49 | 2x |
if (is.null(epic)) { |
50 | ! |
if (verbose) { |
51 | ! |
cli::cli_progress_done(result = "failed") |
52 |
} |
|
53 | ! |
next |
54 |
} |
|
55 | 2x |
if (epic$metadata$standard_name == "") { |
56 | ! |
if (verbose) { |
57 | ! |
cli::cli_progress_update( |
58 | ! |
status = "failed to identify standard type for {.file {file}}" |
59 |
) |
|
60 | ! |
cli::cli_progress_done(result = "failed") |
61 |
} |
|
62 | ! |
next |
63 |
} |
|
64 | 2x |
name <- epic$metadata$standard_name |
65 | 2x |
metadata[[name]] <- c(list(epic$metadata), metadata[[name]]) |
66 | 2x |
file_id_cols <- id_cols[id_cols %in% colnames(epic$data)] |
67 | 2x |
epic$data <- epic$data[ |
68 | 2x |
rowMeans(is.na(epic$data[, |
69 | 2x |
!(colnames(epic$data) %in% file_id_cols), |
70 | 2x |
drop = FALSE |
71 |
])) != |
|
72 | 2x |
1, |
73 |
] |
|
74 | 2x |
n_col <- grep("^n_", colnames(epic$data)) |
75 | 2x |
if (length(n_col)) { |
76 | ! |
colnames(epic$data)[[n_col]] <- paste0("n_", epic$metadata$standard_name) |
77 |
} |
|
78 | 2x |
if (!is.null(data[[name]])) { |
79 | 1x |
cols <- colnames(data[[name]]) |
80 | 1x |
cols <- cols[!(cols %in% colnames(epic$data))] |
81 | 1x |
if (length(cols)) { |
82 | ! |
epic$data[, cols] <- NA |
83 |
} |
|
84 | 1x |
epic$data <- epic$data[, colnames(data[[name]])] |
85 | 1x |
file_id_cols <- id_cols[id_cols %in% colnames(data[[name]])] |
86 | 1x |
data[[name]] <- data[[name]][ |
87 | 1x |
!(do.call(paste, data[[name]][, file_id_cols]) %in% |
88 | 1x |
do.call(paste, epic$data[, file_id_cols])), |
89 |
] |
|
90 |
} |
|
91 | 2x |
data[[name]] <- rbind(epic$data, data[[name]]) |
92 | 2x |
if (verbose) cli::cli_progress_done() |
93 |
} |
|
94 | 1x |
for (name in names(metadata)) { |
95 | 1x |
if (verbose) { |
96 | 1x |
cli::cli_progress_step( |
97 | 1x |
"writing standard raw output for {.field {name}}", |
98 | 1x |
spinner = TRUE |
99 |
) |
|
100 |
} |
|
101 | 1x |
paths <- paste0(out_dir, "/", name, ".", c("json", "csv.xz")) |
102 | 1x |
jsonlite::write_json( |
103 | 1x |
metadata[[name]], |
104 | 1x |
paths[[1L]], |
105 | 1x |
auto_unbox = TRUE, |
106 | 1x |
pretty = TRUE |
107 |
) |
|
108 | 1x |
vroom::vroom_write(data[[name]], paths[[2L]]) |
109 | 1x |
if (cleanup) { |
110 | 1x |
unlink(vapply(metadata[[name]], "[[", "", "file")) |
111 |
} |
|
112 | 1x |
if (verbose) cli::cli_progress_done() |
113 |
} |
|
114 | 1x |
return(list(metadata = metadata, data = data)) |
115 |
} |
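A sketch of inspecting the combined output (run from a source project; assumes staging files exist):

```R
res <- dcf_process_epic_staging("raw/staging", "raw")
if (!is.null(res)) {
  names(res$data) # recognized standard names, e.g. "flu" or "rsv"
  res$metadata[[names(res$data)[[1L]]]] # per-file metadata for the first name
}
```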
1 |
#' Run Data Project Processes |
|
2 |
#' |
|
3 |
#' Operates over data source and bundle projects, optionally running the source |
|
4 |
#' ingest scripts, then collecting metadata. |
|
5 |
#' |
|
6 |
#' @param name Name of a source project to process. If not specified, all projects will be processed. |
|
7 |
#' @param project_dir Path to the project directory. If not specified, and being called |
|
8 |
#' from a source project, this will be assumed to be two steps back from the working directory. |
|
9 |
#' @param ingest Logical; if \code{FALSE}, will re-process standardized data without running |
|
10 |
#' ingestion scripts. Only applies to source projects. |
|
11 |
#' @param is_auto Logical; if \code{TRUE}, will skip process scripts marked as manual. |
|
12 |
#' @param force Logical; if \code{TRUE}, will ignore process frequencies |
|
13 |
#' (will run scripts even if recently run). |
|
14 |
#' @param clear_state Logical; if \code{TRUE}, will clear stored states before processing. |
|
15 |
#' @returns A list with processing results: |
|
16 |
#' \itemize{ |
|
17 |
#' \item \code{timings}: How many seconds the scripts took to run. |
|
18 |
#' \item \code{logs}: The captured output of the scripts. |
|
19 |
#' } |
|
20 |
#' Each of these has an entry for each project. |
|
21 |
#' |
|
22 |
#' A `datapackage.json` file is also created or updated in each source's `standard` directory. |
|
23 |
#' @examples |
|
24 |
#' \dontrun{ |
|
25 |
#' # run from a directory containing a `data` directory containing the source |
|
26 |
#' dcf_process("source_name") |
|
27 |
#' |
|
28 |
#' # run without executing the ingestion script |
|
29 |
#' dcf_process("source_name", ingest = FALSE) |
|
30 |
#' } |
|
31 |
#' @export |
|
32 | ||
33 |
dcf_process <- function( |
|
34 |
name = NULL, |
|
35 |
project_dir = ".", |
|
36 |
ingest = TRUE, |
|
37 |
is_auto = FALSE, |
|
38 |
force = FALSE, |
|
39 |
clear_state = FALSE |
|
40 |
) { |
|
41 | 4x |
settings_file <- paste0(project_dir, "/settings.json") |
42 | 4x |
from_project <- file.exists(settings_file) |
43 | 4x |
if (from_project) { |
44 | 4x |
source_dir <- paste0( |
45 | 4x |
project_dir, |
46 |
"/", |
|
47 | 4x |
jsonlite::read_json(settings_file)$data_dir |
48 |
) |
|
49 |
} else { |
|
50 | ! |
source_dir <- "../.." |
51 | ! |
name <- basename(getwd()) |
52 |
} |
|
53 | ||
54 | 4x |
sources <- if (is.null(name)) { |
55 | 2x |
list.files( |
56 | 2x |
source_dir, |
57 | 2x |
"process\\.json", |
58 | 2x |
recursive = TRUE, |
59 | 2x |
full.names = TRUE |
60 |
) |
|
61 |
} else { |
|
62 | 2x |
process_files <- paste0(source_dir, "/", name, "/process.json") |
63 | 2x |
if (any(!file.exists(process_files))) { |
64 | ! |
cli::cli_abort( |
65 | ! |
"missing process file{?/s}: {.emph {process_files[!file.exists(process_files)]}}" |
66 |
) |
|
67 |
} |
|
68 | 2x |
process_files |
69 |
} |
|
70 | 4x |
decide_to_run <- function(process_script) { |
71 | 4x |
if (is_auto && process_script$manual) { |
72 | ! |
return(FALSE) |
73 |
} |
|
74 | 4x |
if ( |
75 | 4x |
force || process_script$last_run == "" || process_script$frequency == 0L |
76 |
) { |
|
77 | 4x |
return(TRUE) |
78 |
} |
|
79 | 4x |
if ( |
80 | ! |
difftime(Sys.time(), as.POSIXct(process_script$last_run), units = "day") > |
81 | ! |
process_script$frequency |
82 |
) { |
|
83 | ! |
return(TRUE) |
84 |
} |
|
85 | ! |
FALSE |
86 |
} |
|
87 | 4x |
collect_env <- new.env() |
88 | 4x |
collect_env$timings <- list() |
89 | 4x |
collect_env$logs <- list() |
90 | 4x |
process_source <- function(process_file) { |
91 | 4x |
process_def <- dcf_process_record(process_file) |
92 | 4x |
if (clear_state) { |
93 | ! |
process_def$raw_state <- NULL |
94 | ! |
process_def$standard_state <- NULL |
95 | ! |
dcf_process_record(process_file, process_def) |
96 |
} |
|
97 | 4x |
name <- process_def$name |
98 | 4x |
dcf_add_source(name, project_dir, open_after = FALSE) |
99 | 4x |
for (si in seq_along(process_def$scripts)) { |
100 | 4x |
st <- proc.time()[[3]] |
101 | 4x |
process_script <- process_def$scripts[[si]] |
102 | 4x |
run_current <- ingest && decide_to_run(process_script) |
103 | 4x |
base_dir <- dirname(process_file) |
104 | 4x |
standard_dir <- paste0(base_dir, "/standard") |
105 | 4x |
script <- paste0(base_dir, "/", process_script$path) |
106 | 4x |
file_ref <- if (run_current) paste0(" ({.emph ", script, "})") else NULL |
107 | 4x |
cli::cli_progress_step( |
108 | 4x |
paste0("processing source {.strong ", name, "}", file_ref), |
109 | 4x |
spinner = TRUE |
110 |
) |
|
111 | 4x |
env <- new.env() |
112 | 4x |
env$dcf_process_continue <- TRUE |
113 | 4x |
status <- if (run_current) { |
114 | 4x |
tryCatch( |
115 | 4x |
list( |
116 | 4x |
log = utils::capture.output( |
117 | 4x |
source(script, env, chdir = TRUE), |
118 | 4x |
type = "message" |
119 |
), |
|
120 | 4x |
success = TRUE |
121 |
), |
|
122 | 4x |
error = function(e) list(log = e$message, success = FALSE) |
123 |
) |
|
124 |
} else { |
|
125 | ! |
list(log = "", success = TRUE) |
126 |
} |
|
127 | 4x |
collect_env$logs[[name]] <- status$log |
128 | 4x |
if (run_current) { |
129 | 4x |
process_script$last_run <- Sys.time() |
130 | 4x |
process_script$run_time <- proc.time()[[3]] - st |
131 | 4x |
process_script$last_status <- status |
132 | 4x |
process_def$scripts[[si]] <- process_script |
133 |
} |
|
134 | 4x |
if (status$success) { |
135 | 4x |
collect_env$timings[[name]] <- process_script$run_time |
136 |
} |
|
137 | ! |
if (!env$dcf_process_continue) break |
138 |
} |
|
139 | 4x |
process_def_current <- dcf_process_record(process_file) |
140 | 4x |
if ( |
141 | 4x |
is.null(process_def_current$raw_state) || |
142 | 4x |
!identical(process_def$raw_state, process_def_current$raw_state) |
143 |
) { |
|
144 | 4x |
process_def_current$scripts <- process_def$scripts |
145 | 4x |
dcf_process_record(process_file, process_def_current) |
146 |
} |
|
147 | 4x |
data_files <- list.files(standard_dir, "\\.(?:csv|parquet)") |
148 | 4x |
if (length(data_files)) { |
149 | 4x |
measure_info_file <- paste0(base_dir, "/measure_info.json") |
150 | 4x |
standard_state <- as.list(tools::md5sum(c( |
151 | 4x |
measure_info_file, |
152 | 4x |
paste0(standard_dir, "/", data_files) |
153 |
))) |
|
154 | 4x |
if (!identical(process_def_current$standard_state, standard_state)) { |
155 | 3x |
measure_info <- dcf_measure_info( |
156 | 3x |
measure_info_file, |
157 | 3x |
include_empty = FALSE, |
158 | 3x |
render = TRUE, |
159 | 3x |
write = FALSE, |
160 | 3x |
open_after = FALSE, |
161 | 3x |
verbose = FALSE |
162 |
) |
|
163 | 3x |
measure_sources <- list() |
164 | 3x |
for (info in measure_info) { |
165 | 4x |
for (s in info$sources) { |
166 | 4x |
if ( |
167 | ! |
!is.null(s$location) && |
168 | ! |
!(s$location %in% names(measure_sources)) |
169 |
) { |
|
170 | ! |
measure_sources[[s$location]] <- s |
171 |
} |
|
172 |
} |
|
173 |
} |
|
174 | 3x |
if (!file.exists(paste0(standard_dir, "/datapackage.json"))) { |
175 | ! |
dcf_datapackage_init(name, dir = standard_dir, quiet = TRUE) |
176 |
} |
|
177 | 3x |
dcf_datapackage_add( |
178 | 3x |
data_files, |
179 | 3x |
meta = list( |
180 | 3x |
source = unname(measure_sources), |
181 | 3x |
base_dir = base_dir, |
182 | 3x |
ids = "geography", |
183 | 3x |
time = "time", |
184 | 3x |
variables = measure_info |
185 |
), |
|
186 | 3x |
dir = standard_dir, |
187 | 3x |
pretty = TRUE, |
188 | 3x |
summarize_ids = TRUE, |
189 | 3x |
verbose = FALSE |
190 |
) |
|
191 | 3x |
process_def_current$standard_state <- standard_state |
192 | 3x |
dcf_process_record(process_file, process_def_current) |
193 |
} |
|
194 | 4x |
cli::cli_progress_done(result = if (status$success) "done" else "failed") |
195 |
} else { |
|
196 | ! |
cli::cli_progress_done(result = "failed") |
197 | ! |
cli::cli_bullets( |
198 | ! |
c(" " = "no standard data files found in {.path {process_file}}") |
199 |
) |
|
200 |
} |
|
201 |
} |
|
202 | 4x |
process_bundle <- function(process_file) { |
203 | 1x |
process_def <- dcf_process_record(process_file) |
204 | 1x |
if (clear_state) { |
205 | ! |
process_def$source_state <- NULL |
206 | ! |
process_def$dist_state <- NULL |
207 | ! |
dcf_process_record(process_file, process_def) |
208 |
} |
|
209 | 1x |
name <- process_def$name |
210 | 1x |
dcf_add_bundle(name, project_dir, open_after = FALSE) |
211 | 1x |
for (si in seq_along(process_def$scripts)) { |
212 | 1x |
st <- proc.time()[[3]] |
213 | 1x |
process_script <- process_def$scripts[[si]] |
214 | 1x |
base_dir <- dirname(process_file) |
215 | 1x |
script <- paste0(base_dir, "/", process_script$path) |
216 | 1x |
measure_info_file <- paste0(base_dir, "/measure_info.json") |
217 | 1x |
run_current <- TRUE |
218 | 1x |
if (length(process_def$source_files)) { |
219 | 1x |
standard_files <- paste0(source_dir, "/", process_def$source_files) |
220 | 1x |
standard_state <- as.list(tools::md5sum(c( |
221 | 1x |
measure_info_file, |
222 | 1x |
paste0(source_dir, "/", process_def$source_files) |
223 |
))) |
|
224 | 1x |
run_current <- !identical(standard_state, process_def$source_state) |
225 |
} |
|
226 | 1x |
if (run_current) { |
227 | 1x |
cli::cli_progress_step( |
228 | 1x |
paste0( |
229 | 1x |
"processing bundle {.strong ", |
230 | 1x |
name, |
231 | 1x |
"} ({.emph ", |
232 | 1x |
script, |
233 |
"})" |
|
234 |
), |
|
235 | 1x |
spinner = TRUE |
236 |
) |
|
237 | 1x |
env <- new.env() |
238 | 1x |
env$dcf_process_continue <- TRUE |
239 | 1x |
status <- tryCatch( |
240 | 1x |
list( |
241 | 1x |
log = utils::capture.output( |
242 | 1x |
source(script, env, chdir = TRUE), |
243 | 1x |
type = "message" |
244 |
), |
|
245 | 1x |
success = TRUE |
246 |
), |
|
247 | 1x |
error = function(e) list(log = e$message, success = FALSE) |
248 |
) |
|
249 | 1x |
collect_env$logs[[name]] <- status$log |
250 | 1x |
if (run_current) { |
251 | 1x |
process_script$last_run <- Sys.time() |
252 | 1x |
process_script$run_time <- proc.time()[[3]] - st |
253 | 1x |
process_script$last_status <- status |
254 | 1x |
process_def$scripts[[si]] <- process_script |
255 |
} |
|
256 | 1x |
if (status$success) { |
257 | 1x |
collect_env$timings[[name]] <- process_script$run_time |
258 |
} |
|
259 | ! |
if (!env$dcf_process_continue) break |
260 |
} |
|
261 |
} |
|
262 | 1x |
process_def_current <- dcf_process_record(process_file) |
263 | 1x |
dist_files <- list.files(paste0(base_dir, "/dist")) |
264 | 1x |
if (length(dist_files)) { |
265 | 1x |
measure_info_file <- paste0(base_dir, "/measure_info.json") |
266 | 1x |
dist_state <- as.list(tools::md5sum(c( |
267 | 1x |
measure_info_file, |
268 | 1x |
paste0(base_dir, "/dist/", dist_files) |
269 |
))) |
|
270 | 1x |
if (!identical(process_def_current$dist_state, dist_state)) { |
271 | 1x |
process_def_current$scripts <- process_def$scripts |
272 | 1x |
process_def_current$dist_state <- dist_state |
273 | 1x |
dcf_process_record(process_file, process_def_current) |
274 |
} |
|
275 | 1x |
cli::cli_progress_done(result = if (status$success) "done" else "failed") |
276 |
} else { |
|
277 | ! |
cli::cli_progress_done(result = "failed") |
278 | ! |
cli::cli_bullets( |
279 | ! |
c(" " = "no standard data files found in {.path {process_file}}") |
280 |
) |
|
281 |
} |
|
282 |
} |
|
283 | 4x |
for (process_file in sources[order( |
284 | 4x |
vapply( |
285 | 4x |
sources, |
286 | 4x |
function(f) { |
287 | 5x |
type <- jsonlite::read_json(f)$type |
288 | 5x |
is.null(type) || type == "bundle" |
289 |
}, |
|
290 | 4x |
TRUE |
291 |
) |
293 |
)]) { |
|
294 | 5x |
process_def <- dcf_process_record(process_file) |
295 | 5x |
if (is.null(process_def$type) || process_def$type == "source") { |
296 | 4x |
process_source(process_file) |
297 |
} else { |
|
298 | 1x |
process_bundle(process_file) |
299 |
} |
|
300 |
} |
|
301 | 4x |
invisible(list(timings = collect_env$timings, logs = collect_env$logs)) |
302 |
} |
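For example, to re-process standardized data across all projects without re-running ingestion scripts, then inspect the documented return value (a sketch, run from a project root):

```R
res <- dcf_process(project_dir = ".", ingest = FALSE)
res$timings # seconds each script took, by project
res$logs    # captured script output, by project
```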
1 |
dcf_read_settings <- function(project_dir = ".") { |
|
2 | 17x |
settings_file <- paste0(project_dir, "/settings.json") |
3 | 17x |
if (!file.exists(settings_file)) { |
4 | ! |
cli::cli_abort( |
5 | ! |
"{.arg project_dir} ({project_dir}) does not appear to be a Data Collection Framework project" |
6 |
) |
|
7 |
} |
|
8 | 17x |
jsonlite::read_json(settings_file) |
9 |
} |
1 |
#' Retrieve A Data File |
|
2 |
#' |
|
3 |
#' Load a data file from a source data project, or list versions of the file. |
|
4 |
#' |
|
5 |
#' @param path Path to the file. |
|
6 |
#' @param date Date of the version to load; a \code{Date}, or \code{character} in the format |
|
7 |
#' \code{YYYY-MM-DD}. Will match to the nearest version. |
|
8 |
#' @param commit_hash SHA signature of the committed version; |
|
9 |
#' can be the first 6 or so characters. Ignored if \code{date} is provided. |
|
10 |
#' @param versions Logical; if \code{TRUE}, will return a list of available versions, |
|
11 |
#' rather than a file path. |
|
12 |
#' @returns If \code{versions} is \code{TRUE}, a \code{data.frame} with columns for |
|
13 |
#' the \code{hash}, \code{author}, \code{date}, and \code{message} of each commit. |
|
14 |
#' Otherwise, the path to a temporary file, if one was extracted. |
|
15 |
#' |
|
16 |
#' @examples |
|
17 |
#' path <- "../../../pophive/data/wastewater/raw/flua.csv.xz" |
|
18 |
#' if (file.exists(path)) { |
|
19 |
#' # list versions |
|
20 |
#' versions <- dcf_get_file(path, versions = TRUE) |
|
21 |
#' print(versions[, c("date", "hash")]) |
|
22 |
#' |
|
23 |
#' # extract a version to a temporary file |
|
24 |
#' temp_path <- dcf_get_file(path, "2025-05") |
|
25 |
#' basename(temp_path) |
|
26 |
#' } |
|
27 |
#' |
|
28 |
#' @export |
|
29 | ||
30 |
dcf_get_file <- function( |
|
31 |
path, |
|
32 |
date = NULL, |
|
33 |
commit_hash = NULL, |
|
34 |
versions = FALSE |
|
35 |
) { |
|
36 | 3x |
if (missing(path)) { |
37 | ! |
cli::cli_abort("specify a path") |
38 |
} |
|
39 | 3x |
if (!file.exists(path)) { |
40 | ! |
cli::cli_abort("path does not exist") |
41 |
} |
|
42 | 3x |
vs <- data.frame( |
43 | 3x |
hash = character(), |
44 | 3x |
author = character(), |
45 | 3x |
date = character(), |
46 | 3x |
message = character() |
47 |
) |
|
48 | 3x |
if (versions || !is.null(date)) { |
49 | 2x |
commits <- sys::exec_internal("git", c("log", path)) |
50 | 2x |
if (commits$status == 0L) { |
51 | 2x |
commits <- do.call( |
52 | 2x |
rbind, |
53 | 2x |
Filter( |
54 | 2x |
function(e) length(e) == 4L, |
55 | 2x |
strsplit( |
56 | 2x |
strsplit(rawToChar(commits$stdout), "commit ", fixed = TRUE)[[1L]], |
57 | 2x |
"\\n+(?:[^:]+:)?\\s*" |
58 |
) |
|
59 |
) |
|
60 |
) |
|
61 | 2x |
colnames(commits) <- colnames(vs) |
62 | 2x |
vs <- as.data.frame(commits) |
63 |
} else { |
|
64 | ! |
cli::cli_abort("failed to git log: {rawToChar(commits$stderr)}") |
65 |
} |
|
66 |
} |
|
67 | 3x |
if (versions) { |
68 | 1x |
return(vs) |
69 |
} |
|
70 | 2x |
if (!is.null(date)) { |
71 | 1x |
if (nrow(vs) == 0L) { |
72 | ! |
return(path) |
73 |
} |
|
74 | 1x |
if (is.character(date)) { |
75 | 1x |
date <- as.POSIXct( |
76 | 1x |
date, |
77 | 1x |
tryFormats = c( |
78 | 1x |
"%Y-%m-%d %H:%M:%S", |
79 | 1x |
"%Y-%m-%d %H:%M", |
80 | 1x |
"%Y-%m-%d", |
81 | 1x |
"%Y-%m", |
82 | 1x |
"%Y" |
83 |
), |
|
84 | 1x |
tz = "UTC" |
85 |
) |
|
86 |
} |
|
87 | 1x |
commit_hash <- vs$hash[which.min(abs( |
88 | 1x |
as.POSIXct(vs$date, "%a %b %d %H:%M:%S %Y", tz = "UTC") - date |
89 |
))] |
|
90 |
} |
|
91 | 2x |
if (is.null(commit_hash)) { |
92 | ! |
return(path) |
93 |
} |
|
94 | 2x |
name_parts <- strsplit(basename(path), ".", fixed = TRUE)[[1L]] |
95 | 2x |
out_path <- paste0( |
96 | 2x |
tempdir(), |
97 |
"/", |
|
98 | 2x |
name_parts[[1L]], |
99 |
"-", |
|
100 | 2x |
substring(commit_hash, 1L, 6L), |
101 |
".", |
|
102 | 2x |
paste(name_parts[-1L], collapse = ".") |
103 |
) |
|
104 | 2x |
if (file.exists(out_path)) { |
105 | ! |
return(out_path) |
106 |
} |
|
107 | 2x |
status <- sys::exec_wait( |
108 | 2x |
"git", |
109 | 2x |
c("show", paste0(commit_hash, ":", path)), |
110 | 2x |
std_out = out_path |
111 |
) |
|
112 | 2x |
if (status != 0L) { |
113 | ! |
cli::cli_abort("failed to git show: {rawToChar(status$stderr)}") |
114 |
} |
|
115 | 2x |
out_path |
116 |
} |
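Beyond the date matching shown in the examples, a version can also be pulled directly by an abbreviated commit hash (the path and hash here are illustrative):

```R
temp_path <- dcf_get_file("data/epic/raw/flu.csv.xz", commit_hash = "a1b2c3")
```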
1 |
#' Adds a Bundle Project |
|
2 |
#' |
|
3 |
#' Establishes a new data bundle project, used to prepare outputs from standardized datasets. |
|
4 |
#' |
|
5 |
#' @param name Name of the bundle. |
|
6 |
#' @param project_dir Path to the Data Collection Framework project. |
|
7 |
#' @param source_files Character vector of paths to standard files from source projects. |
|
8 |
#' @param open_after Logical; if \code{FALSE}, will not open the project. |
|
9 |
#' @returns Nothing; creates default files and directories. |
|
10 |
#' @section Project: |
|
11 |
#' |
|
12 |
#' Within a bundle project, there are two files to edit: |
|
13 |
#' \itemize{ |
|
14 |
#' \item \strong{\code{build.R}}: This is the primary script, which is automatically rerun. |
|
15 |
#' It should read standard-format files from source projects, |
|
16 |
#' then write prepared outputs to \code{dist/}. |
|
17 |
#' This file is sourced from its location during processing, so any system paths |
|
18 |
#' must be relative to itself. |
|
19 |
#' \item \strong{\code{measure_info.json}}: This is where you can record information |
|
20 |
#' about the variables included in the standardized data files. |
|
21 |
#' See \code{\link{dcf_measure_info}}. |
|
22 |
#' } |
|
23 |
#' |
|
24 |
#' @examples |
|
25 |
#' project_dir <- paste0(tempdir(), "/temp_project") |
|
26 |
#' dcf_init("temp_project", dirname(project_dir)) |
|
27 |
#' dcf_add_bundle("bundle_name", project_dir) |
|
28 |
#' list.files(paste0(project_dir, "/data/bundle_name")) |
|
29 |
#' |
|
30 |
#' @export |
|
31 | ||
32 |
dcf_add_bundle <- function( |
|
33 |
name, |
|
34 |
project_dir = ".", |
|
35 |
source_files = NULL, |
|
36 |
open_after = interactive() |
|
37 |
) { |
|
38 | 2x |
if (missing(name)) { |
39 | ! |
cli::cli_abort("specify a name") |
40 |
} |
|
41 | 2x |
name <- gsub("[^A-Za-z0-9]+", "_", name) |
42 | 2x |
settings <- dcf_read_settings(project_dir) |
43 | 2x |
base_dir <- paste(c(project_dir, settings$data_dir, name), collapse = "/") |
44 | 2x |
dir.create(paste0(base_dir, "/dist"), showWarnings = FALSE, recursive = TRUE) |
45 | 2x |
paths <- paste0( |
46 | 2x |
base_dir, |
47 |
"/", |
|
48 | 2x |
c( |
49 | 2x |
"README.md", |
50 | 2x |
"project.Rproj", |
51 | 2x |
"process.json", |
52 | 2x |
"measure_info.json", |
53 | 2x |
"build.R" |
54 |
) |
|
55 |
) |
|
56 | 2x |
if (!file.exists(paths[[1L]])) { |
57 | 1x |
writeLines( |
58 | 1x |
paste0( |
59 | 1x |
c( |
60 | 1x |
paste("#", name), |
61 |
"", |
|
62 | 1x |
"This is a Data Collection Framework data bundle project, initialized with `dcf::dcf_add_bundle`.", |
63 |
"", |
|
64 | 1x |
"You can us the `dcf` package to rebuild the bundle:", |
65 |
"", |
|
66 | 1x |
"```R", |
67 | 1x |
paste0('dcf::dcf_process("', name, '", "..")'), |
68 |
"```" |
|
69 |
), |
|
70 | 1x |
collapse = "\n" |
71 |
), |
|
72 | 1x |
paths[[1L]] |
73 |
) |
|
74 |
} |
|
75 | 2x |
if (!file.exists(paths[[2L]])) { |
76 | 1x |
writeLines("Version: 1.0\n", paths[[2L]]) |
77 |
} |
|
78 | 2x |
if (!file.exists(paths[[3L]])) { |
79 | 1x |
jsonlite::write_json( |
80 | 1x |
list( |
81 | 1x |
name = name, |
82 | 1x |
type = "bundle", |
83 | 1x |
scripts = list( |
84 | 1x |
list( |
85 | 1x |
path = "build.R", |
86 | 1x |
last_run = "", |
87 | 1x |
run_time = "", |
88 | 1x |
last_status = list(log = "", success = TRUE) |
89 |
) |
|
90 |
), |
|
91 | 1x |
source_files = source_files |
92 |
), |
|
93 | 1x |
paths[[3L]], |
94 | 1x |
auto_unbox = TRUE, |
95 | 1x |
pretty = TRUE |
96 |
) |
|
97 |
} |
|
98 | 2x |
if (!file.exists(paths[[4L]])) { |
99 | 1x |
writeLines("{}\n", paths[[4L]]) |
100 |
} |
|
101 | 2x |
if (!file.exists(paths[[5L]])) { |
102 | 1x |
writeLines( |
103 | 1x |
paste0( |
104 | 1x |
c( |
105 | 1x |
"# read data from data source projects", |
106 | 1x |
"# and write to this project's `dist` directory", |
107 |
"" |
|
108 |
), |
|
109 | 1x |
collapse = "\n" |
110 |
), |
|
111 | 1x |
paths[[5L]] |
112 |
) |
|
113 |
} |
|
114 | ! |
if (open_after) rstudioapi::openProject(paths[[2L]], newSession = TRUE) |
115 |
} |
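The roxygen example covers the minimal case; a sketch with `source_files`, which are recorded in `process.json` and later used by `dcf_process` to decide whether the bundle needs rebuilding (paths are illustrative, relative to the data directory):

```R
dcf_add_bundle(
  "flu_bundle",
  project_dir = "temp_project",
  source_files = c(
    "epic/standard/flu.csv.xz",
    "gtrends/standard/flu.csv.xz"
  ),
  open_after = FALSE
)
```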
1 |
#' Initialize a Data Collection Project |
|
2 |
#' |
|
3 |
#' Establishes a new data collection framework project. |
|
4 |
#' |
|
5 |
#' @param name Name of the project. Defaults to the current directory name. |
|
6 |
#' @param base_dir Path to the parent of the project directory (where the \code{name} |
|
7 |
#' directory should be created). If \code{name} is not specified, will treat the current |
|
8 |
#' directory as \code{name}, and \code{".."} as \code{base_dir}. |
|
9 |
#' @param data_dir Name of the directory to store projects in, relative to \code{base_dir}. |
|
10 |
#' @param github_account Name of the GitHub account that will host the repository. |
|
11 |
#' @param branch Name of the repository's branch. |
|
12 |
#' @param repo_name Name of the repository. |
|
13 |
#' @param use_git Logical; if \code{TRUE}, will initialize a git repository. |
|
14 |
#' @param open_after Logical; if \code{TRUE}, will open the project in a new RStudio instance. |
|
15 |
#' @returns Nothing; creates default files and directories. |
|
16 |
#' @section Data Collection Project: |
|
17 |
#' |
|
18 |
#' A data collection project starts with a \code{settings.json} file, which |
|
19 |
#' specifies where source and bundle projects live (a \code{data} subdirectory by default). |
|
20 |
#' |
|
21 |
#' The bulk of the project will then be in the source and bundle projects, as created |
|
22 |
#' by the \code{\link{dcf_add_source}} and \code{\link{dcf_add_bundle}}. |
|
23 |
#' |
|
24 |
#' Once these sub-projects are in place, they can be operated over by the |
|
25 |
#' \code{\link{dcf_build}}, which processes each sub-project using |
|
26 |
#' \code{\link{dcf_process}}, and checks them with \code{\link{dcf_check_sources}}, |
|
27 |
#' resulting in a report. |
|
28 |
#' |
|
29 |
#' @examples |
|
30 |
#' base_dir <- tempdir() |
|
31 |
#' dcf_init("project_name", base_dir) |
|
32 |
#' list.files(paste0(base_dir, "/project_name")) |
|
33 |
#' |
|
34 |
#' @export |
|
35 | ||
36 |
dcf_init <- function( |
|
37 |
name, |
|
38 |
base_dir = ".", |
|
39 |
data_dir = "data", |
|
40 |
github_account = "", |
|
41 |
branch = "main", |
|
42 |
repo_name = name, |
|
43 |
use_git = TRUE, |
|
44 |
open_after = FALSE |
|
45 |
) { |
|
46 | 1x |
if (missing(name)) { |
47 | ! |
name <- basename(getwd()) |
48 | ! |
base_dir <- ".." |
49 |
} else { |
|
50 | 1x |
name <- gsub("[^A-Za-z0-9]+", "_", name) |
51 |
} |
|
52 | 1x |
base_path <- paste0(base_dir, "/", name, "/") |
53 | 1x |
dir.create(base_path, showWarnings = FALSE, recursive = TRUE) |
54 | 1x |
paths <- paste0( |
55 | 1x |
base_path, |
56 | 1x |
c( |
57 | 1x |
"project.Rproj", |
58 | 1x |
"settings.json", |
59 | 1x |
"README.md", |
60 | 1x |
"scripts/build.R", |
61 | 1x |
".github/workflows/build.yaml", |
62 | 1x |
".gitignore" |
63 |
) |
|
64 |
) |
|
65 |
if ( |
|
66 | 1x |
!file.exists(paths[[1L]]) && !length(list.files(base_path, "\\.Rproj$")) |
67 |
) { |
|
68 | 1x |
writeLines("Version: 1.0\n", paths[[1L]]) |
69 |
} |
|
70 | 1x |
if (!file.exists(paths[[2L]])) { |
71 | 1x |
jsonlite::write_json( |
72 | 1x |
list( |
73 | 1x |
name = name, |
74 | 1x |
data_dir = data_dir, |
75 | 1x |
github_account = github_account, |
76 | 1x |
branch = "main", |
77 | 1x |
repo_name = repo_name |
78 |
), |
|
79 | 1x |
paths[[2L]], |
80 | 1x |
auto_unbox = TRUE, |
81 | 1x |
pretty = TRUE |
82 |
) |
|
83 |
} |
|
84 | 1x |
if (!file.exists(paths[[3L]])) { |
85 | 1x |
writeLines( |
86 | 1x |
paste0( |
87 | 1x |
c( |
88 | 1x |
paste("#", name), |
89 |
"", |
|
90 | 1x |
"This is a Data Collection Framework project, initialized with `dcf::dcf_init`.", |
91 |
"", |
|
92 | 1x |
"You can us the `dcf` package to check the source projects:", |
93 |
"", |
|
94 | 1x |
"```R", |
95 | 1x |
paste0('dcf::dcf_check_sources()'), |
96 |
"```", |
|
97 |
"", |
|
98 | 1x |
"And process them:", |
99 |
"", |
|
100 | 1x |
"```R", |
101 | 1x |
paste0('dcf::dcf_process()'), |
102 |
"```" |
|
103 |
), |
|
104 | 1x |
collapse = "\n" |
105 |
), |
|
106 | 1x |
paths[[3L]] |
107 |
) |
|
108 |
} |
|
109 | 1x |
if (!file.exists(paths[[4L]])) { |
110 | 1x |
dir.create(dirname(paths[[4L]]), showWarnings = FALSE) |
111 | 1x |
writeLines( |
112 | 1x |
paste(c("library(dcf)", "dcf_build()"), collapse = "\n"), |
113 | 1x |
paths[[4L]] |
114 |
) |
|
115 |
} |
|
116 | 1x |
if (!file.exists(paths[[5L]])) { |
117 | 1x |
dir.create(dirname(paths[[5L]]), recursive = TRUE, showWarnings = FALSE) |
118 | 1x |
file.copy(system.file("workflows/build.yaml", package = "dcf"), paths[[5L]]) |
119 |
} |
|
120 | 1x |
if (!file.exists(paths[[6L]])) { |
121 | 1x |
writeLines( |
122 | 1x |
paste( |
123 | 1x |
c( |
124 | 1x |
"*.Rproj", |
125 | 1x |
".Rproj.user", |
126 | 1x |
".Rprofile", |
127 | 1x |
".Rhistory", |
128 | 1x |
".Rdata", |
129 | 1x |
".DS_Store", |
130 | 1x |
"renv" |
131 |
), |
|
132 | 1x |
collapse = "\n" |
133 |
), |
|
134 | 1x |
paths[[6L]] |
135 |
) |
|
136 |
} |
|
137 | ! |
if (open_after) rstudioapi::openProject(paths[[1L]], newSession = TRUE) |
138 |
} |
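A sketch of initializing with GitHub metadata, which is written to `settings.json` and later used by `dcf_status_diagram` to build links (the account name is hypothetical):

```R
dcf_init(
  "my_project",
  base_dir = tempdir(),
  github_account = "example-org",
  repo_name = "my_project"
)
```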
1 |
#' Run a Project's Build Process |
|
2 |
#' |
|
3 |
#' Build a Data Collection Framework project, |
|
4 |
#' which involves processing and checking all data projects. |
|
5 |
#' |
|
6 |
#' @param project_dir Path to the Data Collection Framework project to be built. |
|
7 |
#' @param is_auto Logical; if \code{FALSE}, will run \code{\link{dcf_process}} as if it were run |
|
8 |
#' manually. |
|
9 |
#' @param ... Passes arguments to \code{\link{dcf_process}}. |
|
10 |
#' @param make_diagram Logical; if \code{FALSE}, will not make a \code{status.md} diagram. |
|
11 |
#' @returns A version of the project report, which is also written to |
|
12 |
#' \code{project_dir/report.json.gz}. |
|
13 |
#' @examples |
|
14 |
#' project_file <- "../../../pophive" |
|
15 |
#' if (file.exists(project_file)) { |
|
16 |
#' report <- dcf_build(project_file) |
|
17 |
#' } |
|
18 |
#' @export |
|
19 | ||
20 |
dcf_build <- function( |
|
21 |
project_dir = ".", |
|
22 |
is_auto = TRUE, |
|
23 |
..., |
|
24 |
make_diagram = TRUE |
|
25 |
) { |
|
26 | 2x |
settings <- dcf_read_settings(project_dir) |
27 | 2x |
data_dir <- paste0(project_dir, "/", settings$data_dir) |
28 | 2x |
processes <- list.files( |
29 | 2x |
data_dir, |
30 | 2x |
"process\\.json", |
31 | 2x |
recursive = TRUE, |
32 | 2x |
full.names = TRUE |
33 |
) |
|
34 | 2x |
process_state <- tools::md5sum(processes) |
35 | 2x |
process <- dcf_process(project_dir = project_dir, is_auto = is_auto, ...) |
36 | 2x |
issues <- dcf_check_sources(project_dir = project_dir) |
37 | 2x |
report_file <- paste0(project_dir, "/report.json.gz") |
38 |
if ( |
|
39 | 2x |
!identical( |
40 | 2x |
process_state, |
41 | 2x |
tools::md5sum(list.files( |
42 | 2x |
data_dir, |
43 | 2x |
"process\\.json", |
44 | 2x |
recursive = TRUE, |
45 | 2x |
full.names = TRUE |
46 |
)) |
|
47 |
) |
|
48 |
) { |
|
49 | 2x |
datapackages <- list.files( |
50 | 2x |
data_dir, |
51 | 2x |
"datapackage\\.json", |
52 | 2x |
recursive = TRUE, |
53 | 2x |
full.names = TRUE |
54 |
) |
|
55 | 2x |
names(datapackages) <- list.dirs( |
56 | 2x |
data_dir, |
57 | 2x |
recursive = FALSE, |
58 | 2x |
full.names = FALSE |
59 |
) |
|
60 | 2x |
names(processes) <- names(datapackages) |
61 | 2x |
report <- list( |
62 | 2x |
date = Sys.time(), |
63 | 2x |
settings = settings, |
64 | 2x |
source_times = process$timings, |
65 | 2x |
logs = process$logs, |
66 | 2x |
issues = issues, |
67 | 2x |
metadata = lapply(datapackages, jsonlite::read_json), |
68 | 2x |
processes = lapply(processes, jsonlite::read_json) |
69 |
) |
|
70 | 2x |
jsonlite::write_json( |
71 | 2x |
report, |
72 | 2x |
gzfile(report_file), |
73 | 2x |
auto_unbox = TRUE, |
74 | 2x |
dataframe = "columns" |
75 |
) |
|
76 | 2x |
if (make_diagram) { |
77 | 2x |
writeLines( |
78 | 2x |
dcf_status_diagram(project_dir), |
79 | 2x |
paste0(project_dir, "/status.md") |
80 |
) |
|
81 |
} |
|
82 | 2x |
invisible(report) |
83 |
} else { |
|
84 | ! |
invisible(jsonlite::read_json(report_file)) |
85 |
} |
|
86 |
} |
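The invisible return mirrors what is written to `report.json.gz`, so a build can be inspected directly (a sketch, run from a project root):

```R
report <- dcf_build(".", make_diagram = FALSE)
names(report)         # date, settings, source_times, logs, issues, metadata, processes
length(report$issues) # one entry per checked source
```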
1 |
#' Make a Status Diagram |
|
2 |
#' |
|
3 |
#' Make a Data Collection Project status diagram. |
|
4 |
#' |
|
5 |
#' @param project_dir Path to the Data Collection Framework project to be built. |
|
6 |
#' @returns A character vector of the status diagram, which \code{\link{dcf_build}} also writes to |
|
7 |
#' the \code{project_dir/status.md} file. |
|
8 |
#' @examples |
|
9 |
#' \dontrun{ |
|
10 |
#' dcf_status_diagram("project_directory") |
|
11 |
#' } |
|
12 |
#' @export |
|
13 | ||
14 |
dcf_status_diagram <- function(project_dir = ".") { |
|
15 | 2x |
report_file <- paste0(project_dir, "/report.json.gz") |
16 | 2x |
if (!file.exists(report_file)) { |
17 | ! |
cli::cli_abort("no report file found") |
18 |
} |
|
19 | 2x |
report <- jsonlite::read_json(report_file) |
20 | 2x |
data_dir <- if (is.null(report$settings$data_dir)) "data" else |
21 | 2x |
report$settings$data_dir |
22 | 2x |
branch <- if (is.null(report$settings$branch)) "main" else |
23 | 2x |
report$settings$branch |
24 | 2x |
repo <- if (report$settings$github_account == "") { |
25 | 2x |
NULL |
26 |
} else { |
|
27 | ! |
paste0(report$settings$github_account, "/", report$settings$repo_name) |
28 |
} |
|
29 | 2x |
indent <- " " |
30 | 2x |
d <- c( |
31 | 2x |
'classDef source fill:#ffcdb4, stroke:#303', |
32 | 2x |
'classDef pass fill:#66bb6a, stroke:#303', |
33 | 2x |
'classDef warn fill:#ffa726, stroke:#303', |
34 | 2x |
'classDef fail fill:#f44336, stroke:#303' |
35 |
) |
|
36 | 2x |
sources <- NULL |
37 | 2x |
source_ids <- list() |
38 | 2x |
file_ids <- NULL |
39 | 2x |
relationships <- NULL |
40 | 2x |
projects <- NULL |
41 | 2x |
node_id <- 0L |
42 | 2x |
for (project_meta in report$metadata[names(sort(vapply( |
43 | 2x |
report$processes, |
44 | 2x |
function(p) !is.na(p$type) && p$type == "bundle", |
45 | 2x |
TRUE |
46 |
)))]) { |
|
47 | 3x |
name <- project_meta$name |
48 | 3x |
timing <- report$source_times[[name]] |
49 | 3x |
issues <- report$issues[[name]] |
50 | 3x |
measures <- report$metadata[[name]]$measure_info |
51 | 3x |
process <- report$processes[[name]] |
52 | 3x |
contents <- NULL |
53 | 3x |
node_id <- node_id + 1L |
54 | 3x |
if (!is.null(process$type) && process$type == "bundle") { |
55 | 1x |
dist_files <- grep( |
56 | 1x |
"measure_info", |
57 | 1x |
names(process$dist_state), |
58 | 1x |
value = TRUE, |
59 | 1x |
invert = TRUE |
60 |
) |
|
61 | 1x |
for (filename in basename(dist_files)) { |
62 | 1x |
contents <- c( |
63 | 1x |
contents, |
64 | 1x |
paste0( |
65 | 1x |
"n", |
66 | 1x |
node_id, |
67 |
'["`', |
|
68 | 1x |
if (is.null(repo)) filename else |
69 | 1x |
make_link( |
70 | 1x |
paste0( |
71 | 1x |
"https://github.com/", |
72 | 1x |
repo, |
73 | 1x |
"/blob/", |
74 | 1x |
branch, |
75 |
"/", |
|
76 | 1x |
data_dir, |
77 |
"/", |
|
78 | 1x |
name, |
79 | 1x |
"/dist/", |
80 | 1x |
filename |
81 |
), |
|
82 | 1x |
filename |
83 |
), |
|
84 |
'`"]' |
|
85 |
) |
|
86 |
) |
|
87 |
} |
|
88 | 1x |
relationships <- c( |
89 | 1x |
relationships, |
90 | 1x |
paste0("n", file_ids[process$source_files], " --> ", name) |
91 |
) |
|
92 |
} else { |
|
93 | 2x |
for (r in project_meta$resources) { |
94 | 2x |
file_path <- paste0( |
95 |
"./", |
|
96 | 2x |
report$settings$data_dir, |
97 |
"/", |
|
98 | 2x |
name, |
99 | 2x |
"/standard/", |
100 | 2x |
r$filename |
101 |
) |
|
102 | 2x |
file_ids[paste0(name, "/standard/", r$filename)] <- node_id |
103 | 2x |
file_issues <- issues[[file_path]] |
104 | 2x |
measure_sources <- NULL |
105 | 2x |
for (field in r$schema$fields) { |
106 | 6x |
field_source <- measures[[field$name]]$source |
107 | 6x |
for (s in field_source) { |
108 | ! |
if (is.null(source_ids[[s$name]])) { |
109 | ! |
source_id <- paste0("s", length(source_ids)) |
110 | ! |
source_ids[[s$name]] <- source_id |
111 | ! |
sources[[source_id]] <- list( |
112 | ! |
id = source_id, |
113 | ! |
general = make_link(s$url, s$name), |
114 | ! |
specific = NULL |
115 |
) |
|
116 |
} |
|
117 | ! |
source_id <- source_ids[[s$name]] |
118 | ! |
if (!is.null(s$location)) { |
119 | ! |
sources[[source_id]]$specific <- unique(c( |
120 | ! |
sources[[source_id]]$specific, |
121 | ! |
make_link(s$location_url, s$location) |
122 |
)) |
|
123 |
} |
|
124 | ! |
relationships <- unique(c( |
125 | ! |
relationships, |
126 | ! |
paste0(source_id, " --> n", node_id) |
127 |
)) |
|
128 |
} |
|
129 |
} |
|
130 | 2x |
contents <- c( |
131 | 2x |
contents, |
132 | 2x |
paste0( |
133 | 2x |
"n", |
134 | 2x |
node_id, |
135 |
'["`', |
|
136 | 2x |
if (is.null(repo)) r$filename else |
137 | 2x |
make_link( |
138 | 2x |
paste0( |
139 | 2x |
"https://github.com/", |
140 | 2x |
repo, |
141 | 2x |
"/blob/", |
142 | 2x |
branch, |
143 |
"/", |
|
144 | 2x |
data_dir, |
145 |
"/", |
|
146 | 2x |
name, |
147 | 2x |
"/standard/", |
148 | 2x |
r$filename |
149 |
), |
|
150 | 2x |
r$filename |
151 |
), |
|
152 | 2x |
if (length(file_issues)) make_list(unlist(file_issues)), |
153 | 2x |
paste0('`"]:::', if (length(file_issues)) "warn" else "pass") |
154 |
) |
|
155 |
) |
|
156 |
} |
|
157 |
} |
|
158 | 3x |
projects <- c( |
159 | 3x |
projects, |
160 | 3x |
c( |
161 | 3x |
paste0( |
162 | 3x |
"subgraph ", |
163 | 3x |
name, |
164 |
'["`', |
|
165 | 3x |
if (is.null(repo)) name else |
166 | 3x |
make_link( |
167 | 3x |
paste0( |
168 | 3x |
"https://github.com/", |
169 | 3x |
repo, |
170 | 3x |
"/tree/", |
171 | 3x |
branch, |
172 |
"/", |
|
173 | 3x |
data_dir, |
174 |
"/", |
|
175 | 3x |
name |
176 |
), |
|
177 | 3x |
name |
178 |
), |
|
179 |
'`"]' |
|
180 |
), |
|
181 | 3x |
paste0(indent, contents), |
182 | 3x |
"end" |
183 |
) |
|
184 |
) |
|
185 |
} |
|
186 | 2x |
c( |
187 | 2x |
"```mermaid", |
188 | 2x |
"flowchart LR", |
189 | 2x |
paste0( |
190 | 2x |
indent, |
191 | 2x |
c( |
192 | 2x |
d, |
193 | 2x |
vapply( |
194 | 2x |
sources, |
195 | 2x |
function(s) |
196 | 2x |
paste( |
197 | 2x |
c( |
198 | 2x |
s$id, |
199 | 2x |
'("`<h4>', |
200 | 2x |
s$general, |
201 | 2x |
"</h4>", |
202 | 2x |
if (length(s$specific)) paste0("<br/>", make_list(s$specific)), |
203 | 2x |
'`"):::source' |
204 |
), |
|
205 | 2x |
collapse = "" |
206 |
), |
|
207 |
"" |
|
208 |
), |
|
209 | 2x |
projects, |
210 | 2x |
relationships |
211 |
) |
|
212 |
), |
|
213 |
"```" |
|
214 |
) |
|
215 |
} |
|
216 | ||
217 |
make_link <- function(url, name = NULL) { |
|
218 | ! |
paste0( |
219 | ! |
'<a href="', |
220 | ! |
url, |
221 | ! |
'" target="_blank" rel="noreferrer">', |
222 | ! |
if (is.null(name)) sub("https?://(?:www\\.)?", "", url) else name, |
223 | ! |
"</a>" |
224 |
) |
|
225 |
} |
|
226 | ||
227 |
make_list <- function(items) { |
|
228 | ! |
paste0( |
229 | ! |
"<ul>", |
230 | ! |
paste(vapply(items, function(i) paste0("<br/><li>", i, "</li>"), ""), collapse = ""), |
231 | ! |
"</ul>" |
232 |
) |
|
233 |
} |
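These helpers emit the HTML fragments embedded in the mermaid nodes; for instance:

```R
make_link("https://example.com", "Example")
# -> <a href="https://example.com" target="_blank" rel="noreferrer">Example</a>
```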
1 |
#' Interact with a Source Process File |
|
2 |
#' |
|
3 |
#' Read or update the current source process file. |
|
4 |
#' |
|
5 |
#' @param path Path to the process JSON file. |
|
6 |
#' @param updated An updated version of the process definition. If specified, will |
|
7 |
#' write this as the new process file, rather than reading any existing file. |
|
8 |
#' @returns The process definition of the source project. |
|
9 |
#' @examples |
|
10 |
#' epic_process_file <- "../../data/epic/process.json" |
|
11 |
#' if (file.exists(epic_process_file)) { |
|
12 |
#' dcf_process_record(path = epic_process_file) |
|
13 |
#' } |
|
14 |
#' @export |
|
15 | ||
16 |
dcf_process_record <- function(path = "process.json", updated = NULL) { |
|
17 | 30x |
if (is.null(updated)) { |
18 | 19x |
if (!file.exists(path)) { |
19 | ! |
cli::cli_abort("process file {path} does not exist") |
20 |
} |
|
21 | 19x |
jsonlite::read_json(path) |
22 |
} else { |
|
23 | 11x |
jsonlite::write_json(updated, path, auto_unbox = TRUE, pretty = TRUE) |
24 | 11x |
updated |
25 |
} |
|
26 |
} |
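Since the same function both reads and writes, updates are read-modify-write round trips (the path is illustrative):

```R
def <- dcf_process_record("data/epic/process.json")
def$checked <- Sys.time()
dcf_process_record("data/epic/process.json", def) # writes and returns the update
```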
1 |
#' Update renv.lock |
|
2 |
#' |
|
3 |
#' Updates the \code{renv.lock} file with dependencies found in project scripts. |
|
4 |
#' |
|
5 |
#' @param project_dir Directory of the Data Collection project. |
|
6 |
#' @param refresh Logical; if \code{FALSE}, will update an existing |
|
7 |
#' \code{renv.lock} file, rather than recreating it. |
|
8 |
#' @returns Nothing; writes an \code{renv.lock} file. |
|
9 |
#' @examples |
|
10 |
#' \dontrun{ |
|
11 |
#' dcf_update_lock() |
|
12 |
#' } |
|
13 |
#' @export |
|
14 | ||
15 |
dcf_update_lock <- function( |
|
16 |
project_dir = ".", |
|
17 |
refresh = TRUE |
|
18 |
) { |
|
19 | 1x |
settings <- dcf_read_settings(project_dir) |
20 | 1x |
extra <- unique( |
21 | 1x |
renv::dependencies(list.files( |
22 | 1x |
paste0(project_dir, "/", settings$data_dir), |
23 | 1x |
"\\.[Rr]$", |
24 | 1x |
recursive = TRUE, |
25 | 1x |
full.names = TRUE |
26 | 1x |
))$Package |
27 |
) |
|
28 | 1x |
not_installed <- !(extra %in% rownames(utils::installed.packages())) |
29 | ! |
if (any(not_installed)) utils::install.packages(extra[not_installed]) |
30 | 1x |
if (refresh) unlink(paste0(project_dir, "/renv.lock")) |
31 | 1x |
renv::snapshot(packages = extra, lockfile = paste0(project_dir, "/renv.lock")) |
32 |
} |
1 |
#' Create a datapackage.json template |
|
2 |
#' |
|
3 |
#' Initialize dataset documentation with a \code{datapackage.json} template, based on a |
|
4 |
#' \href{https://specs.frictionlessdata.io/data-package}{Data Package} standard. |
|
5 |
#' |
|
6 |
#' @param name A unique name for the dataset; allowed characters are \code{[a-z._/-]}. |
|
7 |
#' @param title A display name for the dataset; if not specified, will be a formatted version of \code{name}. |
|
8 |
#' @param dir Directory in which to save the \code{datapackage.json} file. |
|
9 |
#' @param ... passes arguments to \code{\link{dcf_datapackage_add}}. |
|
10 |
#' @param write Logical; if \code{FALSE}, the package object will not be written to a file. |
|
11 |
#' @param overwrite Logical; if \code{TRUE} and \code{write} is \code{TRUE}, an existing |
|
12 |
#' \code{datapackage.json} file will be overwritten. |
|
13 |
#' @param quiet Logical; if \code{TRUE}, will not print messages or navigate to files. |
|
14 |
#' @examples |
|
15 |
#' \dontrun{ |
|
16 |
#' # make a template datapackage.json file in the current working directory |
|
17 |
#' dcf_datapackage_init("mtcars", "Motor Trend Car Road Tests") |
|
18 |
#' } |
|
19 |
#' @return An invisible list with the content written to the \code{datapackage.json} file. |
|
20 |
#' @seealso Add basic information about a dataset with \code{\link{dcf_datapackage_add}}. |
|
21 |
#' @export |
|
22 | ||
23 |
dcf_datapackage_init <- function( |
|
24 |
name, |
|
25 |
title = name, |
|
26 |
dir = ".", |
|
27 |
..., |
|
28 |
write = TRUE, |
|
29 |
overwrite = FALSE, |
|
30 |
quiet = !interactive() |
|
31 |
) { |
|
32 | 2x |
if (missing(name)) { |
33 | ! |
cli::cli_abort("{.arg name} must be specified") |
34 |
} |
|
35 | 2x |
package <- list( |
36 | 2x |
name = name, |
37 | 2x |
title = if (title == name) { |
38 | 2x |
gsub("\\b(\\w)", "\\U\\1", gsub("[._/-]", " ", name), perl = TRUE) |
39 |
} else { |
|
40 | ! |
title |
41 |
}, |
|
42 | 2x |
licence = list( |
43 | 2x |
url = "http://opendatacommons.org/licenses/pddl", |
44 | 2x |
name = "Open Data Commons Public Domain", |
45 | 2x |
version = "1.0", |
46 | 2x |
id = "odc-pddl" |
47 |
), |
|
48 | 2x |
resources = list() |
49 |
) |
|
50 | 2x |
package_path <- normalizePath(paste0(dir, "/datapackage.json"), "/", FALSE) |
51 | 2x |
if (write && !overwrite && file.exists(package_path)) { |
52 | ! |
cli::cli_abort(c( |
53 | ! |
"datapackage ({.path {package_path}}) already exists", |
54 | ! |
i = "add {.code overwrite = TRUE} to overwrite it" |
55 |
)) |
|
56 |
} |
|
57 | 2x |
if (length(list(...))) { |
58 | ! |
package$resources <- dcf_datapackage_add(..., dir = dir, write = FALSE) |
59 |
} |
|
60 | 2x |
if (write) { |
61 | 2x |
if (!dir.exists(dir)) { |
62 | ! |
dir.create(dir, recursive = TRUE) |
63 |
} |
|
64 | 2x |
jsonlite::write_json( |
65 | 2x |
package, |
66 | 2x |
package_path, |
67 | 2x |
auto_unbox = TRUE, |
68 | 2x |
digits = 6, |
69 | 2x |
pretty = TRUE |
70 |
) |
|
71 | 2x |
if (!quiet) { |
72 | ! |
cli::cli_bullets(c( |
73 | ! |
v = "created metadata template for {name}:", |
74 | ! |
"*" = paste0("{.path ", package_path, "}") |
75 |
)) |
|
76 | ! |
rstudioapi::navigateToFile(package_path) |
77 |
} |
|
78 |
} |
|
79 | 2x |
invisible(package) |
80 |
} |
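With `write = FALSE` the template object can be inspected or embedded without touching disk; note how the title is derived when not supplied:

```R
pkg <- dcf_datapackage_init("motor_trend", write = FALSE, quiet = TRUE)
pkg$title # "Motor Trend", title-cased from the name
```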
#' Add a Source Project
#'
#' Establishes a new data source project, used to collect and prepare data from a new source.
#'
#' @param name Name of the source.
#' @param project_dir Path to the Data Collection Framework project.
#' @param open_after Logical; if \code{FALSE}, will not open the project.
#' @returns Nothing; creates default files and directories.
#' @section Project:
#'
#' Within a source project, there are two files to edit:
#' \itemize{
#'   \item \strong{\code{ingest.R}}: This is the primary script, which is automatically rerun.
#'     It should store raw data and resources in \code{raw/} where possible,
#'     then use what's in \code{raw/} to produce standard-format files in \code{standard/}.
#'     This file is sourced from its location during processing, so any system paths
#'     must be relative to itself (a minimal sketch follows this function's definition).
#'   \item \strong{\code{measure_info.json}}: This is where you can record information
#'     about the variables included in the standardized data files.
#'     See \code{\link{dcf_measure_info}}.
#' }
#'
#' @examples
#' project_dir <- paste0(tempdir(), "/temp_project")
#' dcf_init("temp_project", dirname(project_dir))
#' dcf_add_source("source_name", project_dir)
#' list.files(paste0(project_dir, "/data/source_name"))
#'
#' @export

dcf_add_source <- function(
  name,
  project_dir = ".",
  open_after = interactive()
) {
  if (missing(name)) {
    cli::cli_abort("specify a name")
  }
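  # sanitize the name for use in file paths, then scaffold the source's
  # directories under the project's configured data directory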
  name <- gsub("[^A-Za-z0-9]+", "_", name)
  settings <- dcf_read_settings(project_dir)
  base_dir <- paste0(project_dir, "/", settings$data_dir)
  base_path <- paste0(base_dir, "/", name, "/")
  dir.create(paste0(base_path, "raw"), showWarnings = FALSE, recursive = TRUE)
  dir.create(paste0(base_path, "standard"), showWarnings = FALSE)
  paths <- paste0(
    base_path,
    c(
      "measure_info.json",
      "ingest.R",
      "project.Rproj",
      "standard/datapackage.json",
      "process.json",
      "README.md"
    )
  )
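  # each template below is written only if its file does not already exist:
  # paths[[1]] measure_info.json, [[2]] ingest.R, [[3]] project.Rproj,
  # [[4]] standard/datapackage.json, [[5]] process.json, [[6]] README.md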
  if (!file.exists(paths[[1L]])) {
    dcf_measure_info(
      paths[[1L]],
      example_variable = list(),
      verbose = FALSE,
      open_after = FALSE
    )
  }
  if (!file.exists(paths[[2L]])) {
    writeLines(
      paste0(
        c(
          "#",
          "# Download",
          "#",
          "",
          "# add files to the `raw` directory",
          "",
          "#",
          "# Reformat",
          "#",
          "",
          "# read from the `raw` directory, and write to the `standard` directory",
          ""
        ),
        collapse = "\n"
      ),
      paths[[2L]]
    )
  }
  if (!file.exists(paths[[3L]])) {
    writeLines("Version: 1.0\n", paths[[3L]])
  }
  if (!file.exists(paths[[4L]])) {
    dcf_datapackage_init(
      name,
      dir = paste0(base_path, "standard"),
      quiet = TRUE
    )
  }

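  # seed run metadata for ingest.R; these fields appear to be updated when
  # the project is processed (e.g., by dcf_process)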
  if (!file.exists(paths[[5L]])) {
    jsonlite::write_json(
      list(
        name = name,
        type = "source",
        scripts = list(
          list(
            path = "ingest.R",
            manual = FALSE,
            frequency = 0L,
            last_run = "",
            run_time = "",
            last_status = list(log = "", success = TRUE)
          )
        ),
        checked = "",
        check_results = list()
      ),
      paths[[5L]],
      auto_unbox = TRUE,
      pretty = TRUE
    )
  }
  if (!file.exists(paths[[6L]])) {
    writeLines(
      paste0(
        c(
          paste("#", name),
          "",
          "This is a dcf data source project, initialized with `dcf::dcf_add_source`.",
          "",
          "You can use the `dcf` package to check the project:",
          "",
          "```R",
          paste0('dcf_check_source("', name, '", "..")'),
          "```",
          "",
          "And process it:",
          "",
          "```R",
          paste0('dcf_process("', name, '", "..")'),
          "```"
        ),
        collapse = "\n"
      ),
      paths[[6L]]
    )
  }
  if (open_after) rstudioapi::openProject(paths[[3L]], newSession = TRUE)
}
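
# Illustrative sketch (not part of the package source): the shape of a
# minimal `ingest.R`, following the Project section above. The URL and file
# names are hypothetical; paths are relative to the script's own location.
if (FALSE) {
  #
  # Download
  #
  raw_file <- "raw/example.csv"
  if (!file.exists(raw_file)) {
    download.file("https://example.com/example.csv", raw_file)
  }

  #
  # Reformat
  #
  # read from the `raw` directory, and write to the `standard` directory
  data <- read.csv(raw_file)
  write.csv(data, "standard/example.csv", row.names = FALSE)
}
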
#' Map States to Health Regions
#'
#' Maps state FIPS codes to U.S. Department of Health and Human Services (HHS) regions.
#'
#' @param geoids Character vector of GEOIDs.
#' @param prefix A prefix to add to region IDs.
#' @returns A vector of Health Region names of the same length as \code{geoids}.
#' @examples
#' dcf_to_health_region(c("01", "01001", "02", "02001"))
#' @export

dcf_to_health_region <- function(geoids, prefix = "Region ") {
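  # HHS regions (1-10) keyed by two-digit state or territory FIPS code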
  regions <- c(
    "01" = 4,
    "02" = 10,
    "04" = 9,
    "05" = 6,
    "06" = 9,
    "08" = 8,
    "09" = 1,
    "10" = 3,
    "11" = 3,
    "12" = 4,
    "13" = 4,
    "15" = 9,
    "16" = 10,
    "17" = 5,
    "18" = 5,
    "19" = 7,
    "20" = 7,
    "21" = 4,
    "22" = 6,
    "23" = 1,
    "24" = 3,
    "25" = 1,
    "26" = 5,
    "27" = 5,
    "28" = 4,
    "29" = 7,
    "30" = 8,
    "31" = 7,
    "32" = 9,
    "33" = 1,
    "34" = 2,
    "35" = 6,
    "36" = 2,
    "37" = 4,
    "38" = 8,
    "39" = 5,
    "40" = 6,
    "41" = 10,
    "42" = 3,
    "44" = 1,
    "45" = 4,
    "46" = 8,
    "47" = 4,
    "48" = 6,
    "49" = 8,
    "50" = 1,
    "51" = 3,
    "53" = 10,
    "54" = 3,
    "55" = 5,
    "56" = 8,
    "66" = 9,
    "72" = 2,
    "74" = 2
  )
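  # `regions[] <-` prefixes the values while keeping names; county GEOIDs
  # begin with their state's two-digit FIPS code, so the first two
  # characters index the map (unmatched codes yield NA)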
  regions[] <- paste0(prefix, regions)
  unname(regions[substring(geoids, 1L, 2L)])
}
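
# Illustrative usage sketch (not part of the package source):
if (FALSE) {
  dcf_to_health_region("51059") # "Region 3" (a Virginia county GEOID)
  dcf_to_health_region("02", prefix = "HHS ") # "HHS 10"
}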