#' Run Data Project Processes
#'
#' Operates over data source and bundle projects, optionally running the source
#' ingest scripts, then collecting metadata.
#'
#' @param name Name of a source project to process. Will default to the name of the
#' current working directory.
#' @param project_dir Path to the project directory. If not specified, and being called
#' from a source project, this will be assumed to be two steps back from the working directory.
#' @param ingest Logical; if \code{FALSE}, will re-process standardized data without running
#' ingestion scripts. Only applies to source projects.
#' @param is_auto Logical; if \code{TRUE}, will skip process scripts marked as manual.
#' @param force Logical; if \code{TRUE}, will ignore process frequencies
#' (will run scripts even if recently run).
#' @param clear_state Logical; if \code{TRUE}, will clear stored states before processing.
#' @returns A list with processing results:
#' \itemize{
#' \item \code{timings}: How many seconds the scripts took to run.
#' \item \code{logs}: The captured output of the scripts.
#' }
#' Each element has an entry for each project.
#'
#' A `datapackage.json` file is also created / updated in each source's `standard` directory.
#' @examples
#' \dontrun{
#' # run from a directory containing a `data` directory containing the source
#' dcf_process("source_name")
#'
#' # run without executing the ingestion script
#' dcf_process("source_name", ingest = FALSE)
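#'
#' # the returned list contains per-project timings and logs, as documented above
#' results <- dcf_process("source_name")
#' results$timings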
#' }
#' @export
dcf_process <- function(
  name = NULL,
  project_dir = ".",
  ingest = TRUE,
  is_auto = FALSE,
  force = FALSE,
  clear_state = FALSE
) {
  settings_file <- paste0(project_dir, "/settings.json")
  from_project <- file.exists(settings_file)
  if (from_project) {
    source_dir <- paste0(
      project_dir,
      "/",
      jsonlite::read_json(settings_file)$data_dir
    )
  } else {
    # assume we were called from within a source project
    project_dir <- "../.."
    source_dir <- ".."
    name <- basename(getwd())
  }
  sources <- if (is.null(name)) {
    list.files(
      source_dir,
      "process\\.json",
      recursive = TRUE,
      full.names = TRUE
    )
  } else {
    process_files <- paste0(source_dir, "/", name, "/process.json")
    if (any(!file.exists(process_files))) {
      cli::cli_abort(
        "missing process file{?/s}: {.emph {process_files[!file.exists(process_files)]}}"
      )
    }
    process_files
  }
  # determine whether a script is due, given its manual flag, frequency, and last run time
  decide_to_run <- function(process_script) {
    if (is_auto && process_script$manual) {
      return(FALSE)
    }
    if (
      force || process_script$last_run == "" || process_script$frequency == 0L
    ) {
      return(TRUE)
    }
    if (
      difftime(Sys.time(), as.POSIXct(process_script$last_run), units = "day") >
        process_script$frequency
    ) {
      return(TRUE)
    }
    FALSE
  }
  collect_env <- new.env()
  collect_env$timings <- list()
  collect_env$logs <- list()
  process_source <- function(process_file) {
    process_def <- dcf_process_record(process_file)
    if (clear_state) {
      raw_states <- grep("raw_state", names(process_def), fixed = TRUE)
      if (length(raw_states)) {
        process_def[raw_states] <- NULL
      }
      process_def$standard_state <- NULL
      dcf_process_record(process_file, process_def)
    }
    name <- process_def$name
    dcf_add_source(name, project_dir, open_after = FALSE)
    for (si in seq_along(process_def$scripts)) {
      st <- proc.time()[[3]]
      process_script <- process_def$scripts[[si]]
      run_current <- ingest && decide_to_run(process_script)
      base_dir <- dirname(process_file)
      standard_dir <- paste0(base_dir, "/standard")
      script <- paste0(base_dir, "/", process_script$path)
      file_ref <- if (run_current) paste0(" ({.emph ", script, "})") else NULL
      cli::cli_progress_step(
        paste0("processing source {.strong ", name, "}", file_ref),
        spinner = TRUE
      )
      env <- new.env()
      env$dcf_process_continue <- TRUE
      status <- if (ingest) {
        tryCatch(
          list(
            log = utils::capture.output(
              source(script, env, chdir = TRUE),
              type = "message"
            ),
            success = TRUE
          ),
          error = function(e) {
            cli::cli_warn("script {.file {script}} failed: {e$message}")
            list(log = e$message, success = FALSE)
          }
        )
      } else {
        list(log = "", success = TRUE)
      }
      collect_env$logs[[name]] <- status$log
      if (run_current) {
        process_script$last_run <- Sys.time()
        process_script$run_time <- proc.time()[[3]] - st
        process_script$last_status <- status
        process_def$scripts[[si]] <- process_script
      }
      if (status$success) {
        collect_env$timings[[name]] <- process_script$run_time
      }
      if (!env$dcf_process_continue) break
    }
    process_def_current <- dcf_process_record(process_file)
    if (
      is.null(process_def_current$raw_state) ||
        !identical(process_def$raw_state, process_def_current$raw_state)
    ) {
      process_def_current$scripts <- process_def$scripts
      dcf_process_record(process_file, process_def_current)
    }
    data_files <- list.files(standard_dir, "\\.(?:csv|parquet|json)")
    data_files <- data_files[!grepl("datapackage", data_files, fixed = TRUE)]
    if (length(data_files)) {
      measure_info_file <- paste0(base_dir, "/measure_info.json")
      standard_state <- as.list(tools::md5sum(c(
        measure_info_file,
        paste0(standard_dir, "/", data_files)
      )))
      if (!identical(process_def_current$standard_state, standard_state)) {
        measure_info <- dcf_measure_info(
          measure_info_file,
          include_empty = FALSE,
          render = TRUE,
          write = FALSE,
          open_after = FALSE,
          verbose = FALSE
        )
        # collect unique source entries across measures, keyed by location
        measure_sources <- list()
        for (measure_id in names(measure_info)) {
          measure_info[[measure_id]]$id <- measure_id
          info <- measure_info[[measure_id]]
          for (s in info$sources) {
            if (
              is.list(s) &&
                !is.null(s$location) &&
                !(s$location %in% names(measure_sources))
            ) {
              measure_sources[[s$location]] <- s
            }
          }
        }
        if (!file.exists(paste0(standard_dir, "/datapackage.json"))) {
          dcf_datapackage_init(name, dir = standard_dir, quiet = TRUE)
        }
        base_meta <- list(
          source = unname(measure_sources),
          base_dir = base_dir,
          ids = "geography",
          time = "time",
          variables = measure_info
        )
        dcf_datapackage_add(
          data_files,
          meta = if (length(process_def_current$vintages)) {
            vintages <- process_def_current$vintages
            lapply(structure(data_files, names = data_files), function(f) {
              base_meta$vintage <- vintages[[f]]
              base_meta
            })
          } else {
            base_meta
          },
          dir = standard_dir,
          pretty = TRUE,
          summarize_ids = TRUE,
          verbose = FALSE
        )
        process_def_current$standard_state <- standard_state
        dcf_process_record(process_file, process_def_current)
      }
      cli::cli_progress_done(result = if (status$success) "done" else "failed")
    } else {
      cli::cli_progress_done(result = "failed")
      cli::cli_bullets(
        c(" " = "no standard data files found in {.path {process_file}}")
      )
    }
  }
  process_bundle <- function(process_file) {
    process_def <- dcf_process_record(process_file)
    source_files <- if (length(process_def$source_files)) {
      if (!is.null(names(process_def$source_files))) {
        names(process_def$source_files)
      } else {
        process_def$source_files
      }
    } else {
      NULL
    }
    if (clear_state) {
      process_def$source_state <- NULL
      process_def$dist_state <- NULL
      dcf_process_record(process_file, process_def)
    }
    name <- process_def$name
    dcf_add_bundle(name, project_dir, open_after = FALSE)
    for (si in seq_along(process_def$scripts)) {
      st <- proc.time()[[3]]
      process_script <- process_def$scripts[[si]]
      base_dir <- dirname(process_file)
      script <- paste0(base_dir, "/", process_script$path)
      run_current <- TRUE
      standard_state <- NULL
      if (length(source_files)) {
        # only rerun if the source files have changed since the last run
        standard_files <- paste0(source_dir, "/", source_files)
        standard_state <- as.list(tools::md5sum(standard_files))
        run_current <- !identical(standard_state, process_def$source_state)
      }
      if (run_current) {
        cli::cli_progress_step(
          paste0(
            "processing bundle {.strong ",
            name,
            "} ({.emph ",
            script,
            "})"
          ),
          spinner = TRUE
        )
        env <- new.env()
        env$dcf_process_continue <- TRUE
        status <- tryCatch(
          list(
            log = utils::capture.output(
              source(script, env, chdir = TRUE),
              type = "message"
            ),
            success = TRUE
          ),
          error = function(e) {
            cli::cli_warn("script {.file {script}} failed: {e$message}")
            list(log = e$message, success = FALSE)
          }
        )
        collect_env$logs[[name]] <- status$log
        if (run_current) {
          process_script$last_run <- Sys.time()
          process_script$run_time <- proc.time()[[3]] - st
          process_script$last_status <- status
          process_def$scripts[[si]] <- process_script
        }
        if (status$success) {
          collect_env$timings[[name]] <- process_script$run_time
        }
        if (!env$dcf_process_continue) break
      }
    }
    process_def_current <- dcf_process_record(process_file)
    dist_dir <- paste0(base_dir, "/dist")
    dist_files <- grep(
      "datapackage",
      list.files(dist_dir, recursive = TRUE),
      fixed = TRUE,
      invert = TRUE,
      value = TRUE
    )
    if (length(dist_files)) {
      dist_state <- as.list(tools::md5sum(paste0(
        base_dir,
        "/dist/",
        dist_files
      )))
      if (!identical(process_def_current$dist_state, dist_state)) {
        process_def_current$scripts <- process_def$scripts
        process_def_current$dist_state <- dist_state
        process_def_current$standard_state <- standard_state
        dcf_process_record(process_file, process_def_current)

        # merge with standard measure infos
        measure_info <- dcf_measure_info(
          paste0(base_dir, "/measure_info.json"),
          include_empty = FALSE,
          render = TRUE,
          write = FALSE,
          open_after = FALSE,
          verbose = FALSE
        )
        if (length(source_files)) {
          source_measure_info_files <- list.files(
            sub("/.*$", "", source_files),
            "datapackage\\.json",
            recursive = TRUE
          )
          if (length(source_measure_info_files)) {
            source_measure_info <- Reduce(
              c,
              lapply(
                source_measure_info_files,
                function(f) jsonlite::read_json(f)$measure_info
              )
            )
            for (measure_id in names(measure_info)) {
              info <- measure_info[[measure_id]]
              info$id <- measure_id
              source_id <- if (!is.null(info$source_id)) {
                info$source_id
              } else {
                measure_id
              }
              source_info <- source_measure_info[[source_id]]
              if (!is.null(source_info)) {
                # fill empty entries from the source's measure info,
                # and merge list-like entries
                for (entry_name in names(source_info)) {
                  if (
                    is.null(info[[entry_name]]) ||
                      (is.character(info[[entry_name]]) &&
                        info[[entry_name]] == "")
                  ) {
                    info[[entry_name]] <- source_info[[entry_name]]
                  } else if (is.list(info[[entry_name]])) {
                    info[[entry_name]] <- unique(c(
                      info[[entry_name]],
                      source_info[[entry_name]]
                    ))
                  }
                }
              }
              measure_info[[measure_id]] <- info
            }
          }
        }
        measure_sources <- list()
        for (info in measure_info) {
          for (s in info$sources) {
            if (
              is.list(s) &&
                !is.null(s$location) &&
                !(s$location %in% names(measure_sources))
            ) {
              measure_sources[[s$location]] <- s
            }
          }
        }
        if (!file.exists(paste0(dist_dir, "/datapackage.json"))) {
          dcf_datapackage_init(name, dir = dist_dir, quiet = TRUE)
        }
        metas <- list(
          source = unname(measure_sources),
          base_dir = base_dir,
          ids = "geography",
          time = "time",
          variables = measure_info
        )
        if (!is.null(names(process_def_current$source_files))) {
          # inherit the latest vintage of each source's resources
          bundle_source_files <- names(process_def_current$source_files)
          package_files <- paste0(
            dirname(base_dir),
            "/",
            dirname(bundle_source_files),
            "/datapackage.json"
          )
          vintages <- process_def_current$vintages
          for (i in seq_along(bundle_source_files)) {
            package_file <- package_files[[i]]
            source_dist_files <- process_def_current$source_files[[bundle_source_files[[
              i
            ]]]]
            if (file.exists(package_file)) {
              package <- jsonlite::read_json(package_file)
              for (resource in package$resources) {
                if (length(resource$vintage)) {
                  for (dist_file in source_dist_files) {
                    vintages[[dist_file]] <- max(
                      vintages[[dist_file]],
                      resource$vintage
                    )
                  }
                }
              }
            }
          }
          if (length(vintages)) {
            metas <- lapply(
              structure(dist_files, names = dist_files),
              function(dist_file) {
                metas$vintage <- vintages[[dist_file]]
                metas
              }
            )
          }
        } else if (length(process_def_current$vintages)) {
          vintages <- process_def_current$vintages
          metas <- lapply(
            structure(dist_files, names = dist_files),
            function(f) {
              metas$vintage <- vintages[[f]]
              metas
            }
          )
        }
        dcf_datapackage_add(
          dist_files,
          meta = metas,
          dir = dist_dir,
          pretty = TRUE,
          summarize_ids = TRUE,
          verbose = FALSE
        )
      }
      cli::cli_progress_done(result = if (status$success) "done" else "failed")
    } else {
      cli::cli_progress_done(result = "failed")
      cli::cli_bullets(
        c(" " = "no standard data files found in {.path {process_file}}")
      )
    }
  }
  # process source projects first, then the bundles that depend on them
  for (process_file in sources[order(
    vapply(
      sources,
      function(f) {
        type <- jsonlite::read_json(f)$type
        is.null(type) || type != "bundle"
      },
      TRUE
    ),
    decreasing = TRUE
  )]) {
    process_def <- dcf_process_record(process_file)
    if (is.null(process_def$type) || process_def$type == "source") {
      process_source(process_file)
    } else {
      process_bundle(process_file)
    }
  }
  invisible(list(timings = collect_env$timings, logs = collect_env$logs))
}

#' Make a Measurement Metadata File
#'
#' Make a \code{measure_info.json} file, or add measure entries to an existing one.
#'
#' @param path Path to the \code{measure_info.json} file, existing or to be created.
#' @param ... Lists containing individual measure items. See the Measure Entries section.
#' @param info A list containing measurement information to be added.
#' @param references A list containing citation entries. See the Reference Entries section.
#' @param sources A list containing source entries. See the Source Entries section.
#' @param strict Logical; if \code{TRUE}, will only allow recognized entries and values.
#' @param include_empty Logical; if \code{FALSE}, will omit entries that have not been provided.
#' @param overwrite_entry Logical; if \code{TRUE}, will replace rather than add to an existing entry.
#' @param render Path to save a version of \code{path} to, with dynamic entries expanded. See the
#' Dynamic Entries section.
#' @param overwrite Logical; if \code{TRUE}, will overwrite rather than add to an existing \code{path}.
#' @param write Logical; if \code{FALSE}, will not write the built or rendered measure info.
#' @param verbose Logical; if \code{FALSE}, will not display status messages.
#' @param open_after Logical; if \code{FALSE}, will not open the measure file after writing/updating.
#' @section Measure Entries:
#' Measure entries are named by the unique variable name, with any of these entries (if \code{strict}):
#' \itemize{
#' \item \strong{\code{id}}: Unique identifier of the measure; same as the entry name.
#' This is meant to correspond to the column name containing the measure in data files.
#' It should be minimal in length while still being unique across all files within the project.
#' It should only contain the characters \code{a-z}, \code{0-9}, or \code{_}.
#' \item \strong{\code{short_name}}: Shortest possible display name.
#' \item \strong{\code{long_name}}: Longer display name.
#' \item \strong{\code{category}}: Arbitrary category for the measure.
#' \item \strong{\code{short_description}}: Shortest possible description.
#' \item \strong{\code{long_description}}: Complete description. Either description can include
#' TeX-style equations, enclosed in escaped square brackets (e.g.,
#' \code{"The equation \\\\[a_{i} = b^\\\\frac{c}{d}\\\\] was used."}; or \code{$...$},
#' \code{\\\\(...\\\\)}, or \code{\\\\begin{math}...\\\\end{math}}). The final enclosing symbol must be
#' followed by a space or the end of the string. These are pre-rendered to MathML with
#' \code{\link[katex]{katex_mathml}}.
#' \item \strong{\code{statement}}: String with dynamic references to entity features
#' (e.g., \code{"measure value = {value}"}). References can include:
#' \itemize{
#' \item \code{value}: Value of a currently displaying variable at a current time.
#' \item \code{region_name}: Alias of \code{features.name}.
#' \item \code{features.<entry>}: An entity feature, coming from \code{entity_info.json} or GeoJSON properties.
#' All entities have at least \code{name} and \code{id} entries (e.g., \code{"{features.id}"}).
#' \item \code{variables.<entry>}: A variable feature such as \code{name} which is the same as
#' \code{id} (e.g., \code{"{variables.name}"}).
#' \item \code{data.<variable>}: The value of another variable at a current time (e.g., \code{"{data.variable_a}"}).
#' }
#' \item \strong{\code{measure_type}}: Type of the measure's value. Recognized types are displayed in a special way:
#' \itemize{
#' \item \code{year} or \code{integer} show as entered (usually as whole numbers). Other numeric
#' types are rounded to show a set number of digits.
#' \item \code{percent} shows as \code{{value}\%}.
#' \item \code{minutes} shows as \code{{value} minutes}.
#' \item \code{dollar} shows as \code{${value}}.
#' \item \code{internet speed} shows as \code{{value} Mbps}.
#' }
#' \item \strong{\code{unit}}: Prefix or suffix associated with the measure's type, such as \code{\%} for \code{percent},
#' or \code{Mbps} for \code{internet speed}.
#' \item \strong{\code{time_resolution}}: Temporal resolution of the variable, such as \code{year} or \code{week}.
#' \item \strong{\code{restrictions}}: A license or description of restrictions that may apply to the measure.
#' \item \strong{\code{sources}}: A list or list of lists containing source information, including any of these entries:
#' \itemize{
#' \item \code{id}: An ID found in the \code{_sources} entry, to inherit entries from.
#' \item \code{name}: Name of the source (such as an organization name).
#' \item \code{url}: General URL of the source (such as an organization's website).
#' \item \code{location}: More specific description of the source (such as the name of a particular data product).
#' \item \code{location_url}: More direct URL to the resource (such as a page listing data products).
#' }
#' \item \strong{\code{citations}}: A vector of reference ids (the names of \code{reference} entries; e.g., \code{c("ref1", "ref3")}).
#' \item \strong{\code{categories}}: A named list of categories, with any of the other measure entries, or a
#' \code{default} entry giving a default category name. See the Dynamic Entries section.
#' \item \strong{\code{variants}}: A named list of variants, with any of the other measure entries, or a
#' \code{default} entry giving a default variant name. See the Dynamic Entries section.
#' }
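#'
#' For example, a single measure entry using a few of these fields (all values
#' are illustrative):
#' \preformatted{
#' dcf_measure_info(path, cases_total = list(
#'   id = "cases_total",
#'   short_name = "Cases",
#'   measure_type = "integer",
#'   time_resolution = "week",
#'   statement = "{features.name} had {value} cases.",
#'   sources = list(list(id = "source_a"))
#' ))}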
#' @section Dynamic Entries:
#' You may have several closely related variables in a dataset, which share sections of metadata,
#' or have formulaic differences. In cases like this, the \code{categories} and/or \code{variants} entries
#' can be used along with dynamic notation to construct multiple entries from a single template.
#'
#' Though functionally the same, \code{categories} might include broken-out subsets of some total
#' (such as race groups, as categories of a total population), whereas \code{variants} may be different
#' transformations of the same variable (such as raw counts versus percentages).
#'
#' In dynamic entries, \code{{category}} or \code{{variant}} refers to entries in the \code{categories}
#' or \code{variants} lists. By default, these are replaced with the name of each entry in those lists
#' (e.g., \code{"variable_{category}"} where \code{categories = "a"} would become \code{"variable_a"}).
#' A \code{default} entry would change this behavior (e.g., with \code{categories = list(a = list(default = "b"))}
#' that would become \code{"variable_b"}). Adding \code{.name} would force the original behavior (e.g.,
#' \code{"variable_{category.name}"} would be \code{"variable_a"}). A name of \code{"blank"} is treated as
#' an empty string.
#'
#' When notation appears in a measure info entry, it will first default to a matching name in the \code{categories}
#' or \code{variants} list; for example, \code{short_name} in \code{list(short_name = "variable {category}")} with
#' \code{categories = list(a = list(short_name = "(category a)"))} would become \code{"variable (category a)"}.
#' To force this behavior, the entry name can be included in the notation (e.g.,
#' \code{"{category.short_name}"} would be \code{"variable (category a)"} in any entry).
#'
#' Only string entries are processed dynamically -- any list-like entries (such as
#' \code{source}, \code{citations}, or \code{layer}) appearing in
#' \code{categories} or \code{variants} entries will fully replace the base entry.
#'
#' Dynamic entries can be kept dynamic when passed to a data site, but can be rendered for other uses,
#' where the rendered version will have each dynamic entry replaced with all unique combinations of
#' \code{categories} and \code{variants} entries, assuming both are used in the dynamic entry's name
#' (e.g., \code{"variable_{category}_{variant}"}). See Examples.
#' @section Reference Entries:
#' Reference entries can be included in a \code{_references} entry, and should have names corresponding to
#' those included in any of the measures' \code{citations} entries. These can include any of these entries:
#' \itemize{
#' \item \strong{\code{id}}: The reference id, same as the entry name.
#' \item \strong{\code{author}}: A list or list of lists specifying one or more authors. These can include
#' entries for \code{given} and \code{family} names.
#' \item \strong{\code{year}}: Year of the publication.
#' \item \strong{\code{title}}: Title of the publication.
#' \item \strong{\code{journal}}: Journal in which the publication appears.
#' \item \strong{\code{volume}}: Volume number of the journal.
#' \item \strong{\code{page}}: Page number of the journal.
#' \item \strong{\code{doi}}: Digital Object Identifier, from which a link is made (\code{https://doi.org/{doi}}).
#' \item \strong{\code{version}}: Version number of software.
#' \item \strong{\code{url}}: Link to the publication, alternative to a DOI.
#' }
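#'
#' For example (illustrative values):
#' \preformatted{
#' references = list(
#'   ref1 = list(
#'     author = list(list(given = "Jane", family = "Doe")),
#'     year = 2020,
#'     title = "An Example Paper",
#'     journal = "Journal of Examples",
#'     volume = "10",
#'     doi = "10.0000/example"
#'   )
#' )}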
#' @section Source Entries:
#' Source entries can be included in a \code{_sources} entry, and should have names corresponding to those
#' included in any of the measures' \code{sources} entries. These can include any of these entries:
#' \itemize{
#' \item \strong{\code{name}}: Name of the source.
#' \item \strong{\code{url}}: Link to the source's site.
#' \item \strong{\code{description}}: A description of the source.
#' \item \strong{\code{notes}}: A list of additional notes about the source.
#' \item \strong{\code{organization}}: Name of a higher-level organization that the source is a part of.
#' \item \strong{\code{organization_url}}: Link to the organization's site.
#' \item \strong{\code{category}}: A top-level category classification.
#' \item \strong{\code{subcategory}}: A lower-level category classification.
#' }
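#'
#' For example (illustrative values):
#' \preformatted{
#' sources = list(
#'   source_a = list(
#'     name = "Source A",
#'     url = "https://example.com",
#'     organization = "Organization A"
#'   )
#' )}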
#' @examples
#' path <- tempfile()
#'
#' # make an initial file
#' dcf_measure_info(path, "measure_name" = list(
#'   id = "measure_name",
#'   short_description = "A measure.",
#'   statement = "This entity has {value} measure units."
#' ), verbose = FALSE)
#'
#' # add another measure to that
#' measure_info <- dcf_measure_info(path, "measure_two" = list(
#'   id = "measure_two",
#'   short_description = "Another measure.",
#'   statement = "This entity has {value} measure units."
#' ), verbose = FALSE)
#' names(measure_info)
#'
#' # add a dynamic measure, and make a rendered version
#' measure_info_rendered <- dcf_measure_info(
#'   path,
#'   "measure_{category}_{variant.name}" = list(
#'     id = "measure_{category}_{variant.name}",
#'     short_description = "Another measure ({category}; {variant}).",
#'     statement = "This entity has {value} {category} {variant}s.",
#'     categories = c("a", "b"),
#'     variants = list(u1 = list(default = "U1"), u2 = list(default = "U2"))
#'   ),
#'   render = TRUE, verbose = FALSE
#' )
#' names(measure_info_rendered)
#' measure_info_rendered[["measure_a_u1"]]$statement
#' @return An invisible list containing measurement metadata (the rendered version if made).
#' @export
dcf_measure_info <- function(
  path,
  ...,
  info = list(),
  references = list(),
  sources = list(),
  strict = FALSE,
  include_empty = TRUE,
  overwrite_entry = FALSE,
  render = NULL,
  overwrite = FALSE,
  write = TRUE,
  verbose = TRUE,
  open_after = interactive()
) {
  if (write) {
    if (missing(path) || !is.character(path)) {
      cli::cli_abort(
        "enter a path to the measure_info.json file as {.arg path}"
      )
    }
    dir.create(dirname(path), showWarnings = FALSE, recursive = TRUE)
  }
  built <- list()
  if (!overwrite && is.character(path) && file.exists(path)) {
    if (verbose) {
      cli::cli_bullets(c(
        i = "updating existing file: {.path {basename(path)}}"
      ))
    }
    built <- jsonlite::read_json(path)
    if (all(c("id", "measure_type") %in% names(built))) {
      # the file contains a single, unnested measure entry
      built <- list(built)
      names(built) <- built[[1]]$id
    }
  }
  if (length(references)) {
    references <- c(references, built$`_references`)
    references <- references[!duplicated(names(references))]
    built$`_references` <- references
  } else {
    references <- built$`_references`
  }
  if (length(sources)) {
    sources <- c(sources, built$`_sources`)
    sources <- sources[!duplicated(names(sources))]
    built$`_sources` <- sources
  } else {
    sources <- built$`_sources`
  }
  defaults <- list(
    id = "",
    short_name = "",
    long_name = "",
    category = "",
    short_description = "",
    long_description = "",
    statement = "",
    measure_type = "",
    unit = "",
    time_resolution = "",
    restrictions = "",
    sources = list(),
    citations = list()
  )
  if (!is.list(info)) {
    info <- sapply(info, function(name) list())
  }
  info <- c(list(...), info)
  if (length(info) && is.null(names(info))) {
    cli::cli_abort("supplied measure entries must be named")
  }
  for (n in names(info)) {
    if (overwrite_entry || is.null(built[[n]])) {
      l <- info[[n]]
    } else {
      l <- c(info[[n]], built[[n]])
      l <- l[!duplicated(names(l))]
    }
    if (is.null(l$id)) {
      l$id <- n
    }
    if (strict) {
      su <- names(l) %in% names(defaults)
      if (verbose && any(!su)) {
        cli::cli_warn(paste0(
          "unrecognized {?entry/entries} in ",
          n,
          ": {names(l)[!su]}"
        ))
      }
      if (include_empty) {
        for (e in names(l)) {
          if (!is.null(defaults[[e]])) {
            defaults[[e]] <- l[[e]]
          }
        }
        l <- defaults
      } else {
        l <- l[su]
      }
    } else if (include_empty) {
      su <- !names(defaults) %in% names(l)
      if (any(su)) l <- c(l, defaults[su])
    }
    # normalize character categories/variants to lists with default entries
    if (!is.null(l$categories) && !is.list(l$categories)) {
      l$categories <- structure(
        lapply(l$categories, function(e) list(default = e)),
        names = l$categories
      )
    }
    if (!is.null(l$variants) && !is.list(l$variants)) {
      l$variants <- structure(
        lapply(l$variants, function(e) list(default = e)),
        names = l$variants
      )
    }
    if (verbose && !is.null(l$citations)) {
      su <- !l$citations %in% names(references)
      if (any(su)) {
        cli::cli_warn(
          "no matching reference entry for {.val {l$citations[su]}} in {.val {n}}"
        )
      }
    }
    if (verbose && !is.null(l$sources)) {
      l_sources <- if (is.character(l$sources)) l$sources else
        Filter(
          nchar,
          vapply(
            l$sources,
            function(s)
              if (is.character(s)) s else if (is.null(s$id)) "" else s$id,
            ""
          )
        )
      if (length(l_sources)) {
        su <- !l_sources %in% names(sources)
        if (any(su)) {
          cli::cli_warn(
            "no matching source entry for {.val {l_sources[su]}} in {.val {n}}"
          )
        }
      }
    }
    built[[n]] <- l
  }
  # move special entries (`_references`, `_sources`) to the end
  built <- built[order(grepl("^_", names(built)))]
  if (write) {
    if (verbose) {
      cli::cli_bullets(c(i = "writing info to {.path {path}}"))
    }
    jsonlite::write_json(built, path, auto_unbox = TRUE, pretty = TRUE)
  }
  if (!is.null(render)) {
    expanded <- list()
    for (name in names(built)) {
      expanded <- c(
        expanded,
        if (grepl("{", name, fixed = TRUE)) {
          render_info(built[name])
        } else {
          structure(list(built[[name]]), names = name)
        }
      )
    }
    changed <- !identical(built, expanded)
    built <- expanded
    if (write && changed) {
      path <- if (is.character(render)) {
        render
      } else {
        sub("\\.json", "_rendered.json", path, TRUE)
      }
      if (verbose) {
        cli::cli_bullets(c(i = "writing rendered info to {.path {path}}"))
      }
      jsonlite::write_json(built, path, auto_unbox = TRUE, pretty = TRUE)
    }
  }
  if (open_after) {
    rstudioapi::navigateToFile(path)
  }
  invisible(built)
}

replace_equations <- function(info) {
  lapply(info, function(e) {
    if (!is.list(e)) e <- list(default = e)
    descriptions <- grep("description", names(e), fixed = TRUE)
    if (length(descriptions)) {
      for (d in descriptions) {
        # match TeX-style equations ($...$, \[...\], \(...\), or
        # \begin{math}...\end{math}) followed by a space or the end of the string
        p <- gregexpr(
          "(?:\\$|\\\\\\[|\\\\\\(|\\\\begin\\{math\\})(.+?)(?:\\$|\\\\\\]|\\\\\\)|\\\\end\\{math\\})(?=\\s|$)",
          e[[d]],
          perl = TRUE
        )[[1]]
        if (p[[1]] != -1) {
          re <- paste("", e[[d]], "")
          fm <- regmatches(e[[d]], p)
          for (i in seq_along(p)) {
            mp <- attr(p, "capture.start")[i, ]
            eq <- substring(e[[d]], mp, mp + attr(p, "capture.length")[i, ] - 1)
            parsed <- tryCatch(
              katex::katex_mathml(eq),
              error = function(e) NULL
            )
            if (!is.null(parsed)) {
              re <- paste(
                strsplit(re, fm[[i]], fixed = TRUE)[[1]],
                collapse = sub("^<[^>]*>", "", sub("<[^>]*>$", "", parsed))
              )
            }
          }
          e[[d]] <- gsub("^ | $", "", re)
        }
      }
    }
    if (is.list(e$categories)) e$categories <- replace_equations(e$categories)
    if (is.list(e$variants)) e$variants <- replace_equations(e$variants)
    e
  })
}

preprocess <- function(l) {
  if (!is.list(l)) l <- sapply(l, function(n) list())
  ns <- names(l)
  for (i in seq_along(l)) {
    # a name of "blank" is treated as an empty string
    name <- if (ns[i] == "blank") "" else ns[i]
    l[[i]]$name <- name
    if (is.null(l[[i]]$default)) l[[i]]$default <- name
  }
  l
}

replace_dynamic <- function(e, p, s, v = NULL, default = "default") {
  m <- gregexpr(p, e)
  if (m[[1]][[1]] != -1) {
    t <- regmatches(e, m)[[1]]
    tm <- structure(gsub("\\{[^.]+\\.?|\\}", "", t), names = t)
    tm <- tm[!duplicated(names(tm))]
    tm[tm == ""] <- default
    for (tar in names(tm)) {
      us <- (if (is.null(v) || substring(tar, 2, 2) == "c") s else v)
      entry <- tm[[tar]]
      if (is.null(us[[entry]]) && grepl("description", entry, fixed = TRUE)) {
        entry <- default <- "description"
      }
      if (is.null(us[[entry]]) && entry == default) entry <- "default"
      if (is.null(us[[entry]]))
        cli::cli_abort("failed to render measure info from {tar}")
      e <- gsub(tar, us[[entry]], e, fixed = TRUE)
    }
  }
  e
}

prepare_source <- function(o, s, p) {
  if (length(o)) {
    lapply(o, function(e) {
      if (is.character(e) && length(e) == 1) replace_dynamic(e, p, s) else e
    })
  } else {
    list(name = "", default = "")
  }
}

render_info_names <- function(infos) {
  r <- lapply(names(infos), function(n) render_info(infos[n], TRUE))
  structure(rep(names(infos), vapply(r, length, 0)), names = unlist(r))
}

render_info <- function(info, names_only = FALSE) {
  base_name <- names(info)
  base <- info[[1]]
  if (is.null(base$categories) && is.null(base$variants)) {
    return(if (names_only) base_name else info)
  }
  categories <- preprocess(base$categories)
  variants <- preprocess(base$variants)
  base$categories <- NULL
  base$variants <- NULL
  expanded <- NULL
  # all combinations of category and variant names
  vars <- strsplit(
    as.character(outer(
      if (is.null(names(categories))) "" else names(categories),
      if (is.null(names(variants))) "" else names(variants),
      paste,
      sep = "|||"
    )),
    "|||",
    fixed = TRUE
  )
  for (var in vars) {
    cs <- if (var[1] == "") list() else categories[[var[1]]]
    vs <- if (length(var) == 1 || var[2] == "") list() else variants[[var[2]]]
    cs <- prepare_source(cs, vs, "\\{variants?(?:\\.[^}]+?)?\\}")
    vs <- prepare_source(vs, cs, "\\{categor(?:y|ies)(?:\\.[^}]+?)?\\}")
    s <- c(cs, vs[!names(vs) %in% names(cs)])
    p <- "\\{(?:categor(?:y|ies)|variants?)(?:\\.[^}]+?)?\\}"
    key <- replace_dynamic(base_name, p, cs, vs)
    if (names_only) {
      expanded <- c(expanded, key)
    } else {
      expanded[[key]] <- c(
        structure(
          lapply(names(base), function(n) {
            e <- base[[n]]
            if (is.character(e) && length(e) == 1)
              e <- replace_dynamic(e, p, cs, vs, n)
            e
          }),
          names = names(base)
        ),
        s[
          !names(s) %in%
            c(
              "default",
              "name",
              if (any(base[c("long_description", "short_description")] != ""))
                "description",
              names(base)
            )
        ]
      )
    }
  }
  expanded
}

#' Retrieve a Data File
#'
#' Load a data file from a source data project, or list versions of the file.
#'
#' @param path Path to the file.
#' @param date Date of the version to load; a \code{Date}, or \code{character} in the format
#' \code{YYYY-MM-DD}. Will match to the nearest version.
#' @param commit_hash SHA signature of the committed version;
#' can be the first 6 or so characters. Ignored if \code{date} is provided.
#' @param versions Logical; if \code{TRUE}, will return a list of available versions,
#' rather than a path to the file.
#' @returns If \code{versions} is \code{TRUE}, a \code{data.frame} with columns for
#' the \code{hash}, \code{author}, \code{date}, and \code{message} of each commit.
#' Otherwise, the path to a temporary file, if one was extracted.
#'
#' @examples
#' path <- "../../../pophive/data/wastewater/raw/flua.csv.xz"
#' if (file.exists(path)) {
#' # list versions
#' versions <- dcf_get_file(path, versions = TRUE)
#' print(versions[, c("date", "hash")])
#'
#' # extract a version to a temporary file
#' temp_path <- dcf_get_file(path, "2025-05")
#' basename(temp_path)
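#'
#' # or retrieve a version by its (possibly abbreviated) commit hash
#' temp_path <- dcf_get_file(path, commit_hash = versions$hash[[1]])
#' basename(temp_path)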
#' }
#'
#' @export
dcf_get_file <- function(
  path,
  date = NULL,
  commit_hash = NULL,
  versions = FALSE
) {
  if (missing(path)) {
    cli::cli_abort("specify a path")
  }
  if (!file.exists(path)) {
    cli::cli_abort("path does not exist")
  }
  vs <- data.frame(
    hash = character(),
    author = character(),
    date = character(),
    message = character()
  )
  if (versions || !is.null(date)) {
    commits <- sys::exec_internal("git", c("log", path))
    if (commits$status == 0L) {
      # parse `git log` output into hash, author, date, and message columns
      commits <- do.call(
        rbind,
        Filter(
          function(e) length(e) == 4L,
          strsplit(
            strsplit(rawToChar(commits$stdout), "commit ", fixed = TRUE)[[1L]],
            "\\n+(?:[^:]+:)?\\s*"
          )
        )
      )
      colnames(commits) <- colnames(vs)
      vs <- as.data.frame(commits)
    } else {
      cli::cli_abort("failed to git log: {rawToChar(commits$stderr)}")
    }
  }
  if (versions) {
    return(vs)
  }
  if (!is.null(date)) {
    if (nrow(vs) == 0L) {
      return(path)
    }
    if (is.character(date)) {
      date <- as.POSIXct(
        date,
        tryFormats = c(
          "%Y-%m-%d %H:%M:%S",
          "%Y-%m-%d %H:%M",
          "%Y-%m-%d",
          "%Y-%m",
          "%Y"
        ),
        tz = "UTC"
      )
    }
    # select the commit nearest the requested date
    commit_hash <- vs$hash[which.min(abs(
      as.POSIXct(vs$date, "%a %b %d %H:%M:%S %Y", tz = "UTC") - date
    ))]
  }
  if (is.null(commit_hash)) {
    return(path)
  }
  name_parts <- strsplit(basename(path), ".", fixed = TRUE)[[1L]]
  out_path <- paste0(
    tempdir(),
    "/",
    name_parts[[1L]],
    "-",
    substring(commit_hash, 1L, 6L),
    ".",
    paste(name_parts[-1L], collapse = ".")
  )
  if (file.exists(out_path)) {
    return(out_path)
  }
  status <- sys::exec_wait(
    "git",
    c("show", paste0(commit_hash, ":", path)),
    std_out = out_path
  )
  if (status != 0L) {
    cli::cli_abort("failed to git show {.val {commit_hash}}")
  }
  out_path
}

#' Add a Bundle Project
#'
#' Establishes a new data bundle project, used to prepare outputs from standardized datasets.
#'
#' @param name Name of the bundle.
#' @param project_dir Path to the Data Collection Framework project.
#' @param source_files A list or character vector, with names as paths to standard files
#' from source projects (relative to the project's data directory), and distribution file
#' names as entries. This associates input with output files, allowing for calculation of
#' a source state, and metadata inheritance from source files.
#' @param open_after Logical; if \code{FALSE}, will not open the project.
#' @returns Nothing; creates default files and directories.
#' @section Project:
#'
#' Within a bundle project, there are two files to edit:
#' \itemize{
#' \item \strong{\code{build.R}}: This is the primary script, which is automatically rerun.
#' It should read data from the \code{standard} directory of source projects,
#' and write to its own \code{dist} directory.
#' \item \strong{\code{measure_info.json}}: This should list all non-ID variable names
#' in the data files within \code{dist}. These will inherit the standard measure info
#' if found in the source projects referred to in \code{source_files}.
#' If the \code{dist} name is different, but should still inherit standard measure info,
#' a \code{source_id} entry with the original measure ID will be used to identify the original
#' measure info.
#' See \code{\link{dcf_measure_info}}.
#' }
#'
#' @examples
#' project_dir <- paste0(tempdir(), "/temp_project")
#' dcf_init("temp_project", dirname(project_dir))
#' dcf_add_bundle("bundle_name", project_dir)
#' list.files(paste0(project_dir, "/data/bundle_name"))
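#'
#' # `source_files` associates standard inputs with dist outputs
#' # (hypothetical paths; the inputs must exist):
#' # dcf_add_bundle(
#' #   "bundle_name", project_dir,
#' #   source_files = c("source_a/standard/data.csv.gz" = "data.csv.gz")
#' # )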
#'
#' @export
dcf_add_bundle <- function(
  name,
  project_dir = ".",
  source_files = NULL,
  open_after = interactive()
) {
  if (missing(name)) {
    cli::cli_abort("specify a name")
  }
  name <- gsub("[^A-Za-z0-9]+", "_", name)
  settings <- dcf_read_settings(project_dir)
  if (!is.null(source_files)) {
    files <- paste0(
      settings$data_dir,
      "/",
      if (is.null(names(source_files))) source_files else names(source_files)
    )
    su <- !file.exists(files)
    if (any(su)) {
      cli::cli_abort(
        "source file{? doesn't/s don't} exist: {files[su]}"
      )
    }
  }
  base_dir <- paste(c(project_dir, settings$data_dir, name), collapse = "/")
  dir.create(paste0(base_dir, "/dist"), showWarnings = FALSE, recursive = TRUE)
  paths <- paste0(
    base_dir,
    "/",
    c(
      "README.md",
      "project.Rproj",
      "process.json",
      "measure_info.json",
      "build.R"
    )
  )
  if (!file.exists(paths[[1L]])) {
    writeLines(
      paste0(
        c(
          paste("#", name),
          "",
          "This is a Data Collection Framework data bundle project, initialized with `dcf::dcf_add_bundle`.",
          "",
          "You can use the `dcf` package to rebuild the bundle:",
          "",
          "```R",
          paste0('dcf::dcf_process("', name, '", "..")'),
          "```"
        ),
        collapse = "\n"
      ),
      paths[[1L]]
    )
  }
  if (!file.exists(paths[[2L]])) {
    writeLines("Version: 1.0\n", paths[[2L]])
  }
  if (!file.exists(paths[[3L]])) {
    jsonlite::write_json(
      list(
        name = name,
        type = "bundle",
        scripts = list(
          list(
            path = "build.R",
            last_run = "",
            run_time = "",
            last_status = list(log = "", success = TRUE)
          )
        ),
        source_files = if (!is.null(names(source_files)))
          as.list(source_files) else source_files
      ),
      paths[[3L]],
      auto_unbox = TRUE,
      pretty = TRUE
    )
  }
  if (!file.exists(paths[[4L]])) {
    writeLines("{}\n", paths[[4L]])
  }
  if (!file.exists(paths[[5L]])) {
    writeLines(
      paste0(
        c(
          "# read data from data source projects",
          "# and write to this project's `dist` directory",
          ""
        ),
        collapse = "\n"
      ),
      paths[[5L]]
    )
  }
  if (open_after) rstudioapi::openProject(paths[[2L]], newSession = TRUE)
}

#' Map States to Health Regions
#'
#' Maps state FIPS codes to Health and Human Services (HHS) regions.
#'
#' @param geoids Character vector of GEOIDs; the first two characters are used as the state FIPS code.
#' @param prefix A prefix to add to region IDs.
#' @returns A vector of Health Region names the same length as \code{geoids}.
#' @examples
#' dcf_to_health_region(c("01", "01001", "02", "02001"))
#' @export
dcf_to_health_region <- function(geoids, prefix = "Region ") {
  # state FIPS code to region number
  regions <- c(
    "01" = 4, "02" = 10, "04" = 9, "05" = 6, "06" = 9, "08" = 8, "09" = 1,
    "10" = 3, "11" = 3, "12" = 4, "13" = 4, "15" = 9, "16" = 10, "17" = 5,
    "18" = 5, "19" = 7, "20" = 7, "21" = 4, "22" = 6, "23" = 1, "24" = 3,
    "25" = 1, "26" = 5, "27" = 5, "28" = 4, "29" = 7, "30" = 8, "31" = 7,
    "32" = 9, "33" = 1, "34" = 2, "35" = 6, "36" = 2, "37" = 4, "38" = 8,
    "39" = 5, "40" = 6, "41" = 10, "42" = 3, "44" = 1, "45" = 4, "46" = 8,
    "47" = 4, "48" = 6, "49" = 8, "50" = 1, "51" = 3, "53" = 10, "54" = 3,
    "55" = 5, "56" = 8, "72" = 2, "66" = 6, "74" = 2
  )
  regions[] <- paste0(prefix, regions)
  unname(regions[substring(geoids, 1L, 2L)])
}

#' Check Data Projects
#'
#' Check the data files and measure info of subprojects.
#'
#' @param names Name or names of projects.
#' @param project_dir Path to the Data Collection Framework project.
#' @param verbose Logical; if \code{FALSE}, will not print status messages.
#' @returns A list with an entry for each source, containing a character vector
#' including any issue codes:
#' \itemize{
#' \item \code{not_compressed}: The file does not appear to be compressed.
#' \item \code{cant_read}: Failed to read the file in.
#' \item \code{geography_missing}: File does not contain a \code{geography} column.
#' \item \code{geography_nas}: The file's \code{geography} column contains NAs.
#' \item \code{time_missing}: File does not contain a \code{time} column.
#' \item \code{time_nas}: The file's \code{time} column contains NAs.
#' \item \code{missing_info: {column_name}}: The file's indicated column does not have
#' a matching entry in \code{measure_info.json}.
#' }
#' @examples
#' \dontrun{
#' dcf_check("gtrends")
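#'
#' # collect issue codes for every project in the current framework project
#' issues <- dcf_check(verbose = FALSE)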
#' }
#' @export
| 25 | ||
| 26 |
dcf_check <- function( |
|
| 27 |
names = NULL, |
|
| 28 |
project_dir = ".", |
|
| 29 |
verbose = TRUE |
|
| 30 |
) {
|
|
| 31 |
if ( |
|
| 32 | 3x |
is.null(names) && !file.exists(paste0(project_dir, "/", "settings.json")) |
| 33 |
) {
|
|
| 34 | ! |
project_dir <- "../.." |
| 35 | ! |
names <- basename(getwd()) |
| 36 |
} |
|
| 37 | ||
| 38 | 3x |
settings <- dcf_read_settings(project_dir) |
| 39 | 3x |
base_dir <- paste0(project_dir, "/", settings$data_dir) |
| 40 | 3x |
if (is.null(names)) {
|
| 41 | 2x |
names <- list.dirs(base_dir, recursive = FALSE, full.names = FALSE) |
| 42 |
} |
|
| 43 | 3x |
issues <- list() |
| 44 | 3x |
for (name in names) {
|
| 45 | 4x |
source_dir <- paste0(base_dir, "/", name, "/") |
| 46 | 4x |
if (!dir.exists(source_dir)) {
|
| 47 | ! |
cli::cli_abort("specify the name of an existing data project")
|
| 48 |
} |
|
| 49 | 4x |
process_file <- paste0(source_dir, "process.json") |
| 50 | 4x |
if (!file.exists(process_file)) {
|
| 51 | ! |
cli::cli_abort("{name} does not appear to be a data project")
|
| 52 |
} |
|
| 53 | 4x |
process <- dcf_process_record(process_file) |
| 54 | 4x |
is_bundle <- !is.null(process$type) && process$type == "bundle" |
| 55 | 4x |
info_file <- paste0(source_dir, "measure_info.json") |
| 56 | 4x |
info <- tryCatch( |
| 57 | 4x |
dcf_measure_info( |
| 58 | 4x |
info_file, |
| 59 | 4x |
render = TRUE, |
| 60 | 4x |
write = FALSE, |
| 61 | 4x |
verbose = FALSE, |
| 62 | 4x |
open_after = FALSE |
| 63 |
), |
|
| 64 | 4x |
error = function(e) NULL |
| 65 |
) |
|
| 66 | 4x |
if (is.null(info)) {
|
| 67 | ! |
cli::cli_abort("{.file {info_file}} is malformed")
|
| 68 |
} |
|
| 69 | 4x |
if (verbose) {
|
| 70 | 4x |
cli::cli_bullets(c("", "Checking project {.strong {name}}"))
|
| 71 |
} |
|
| 72 | 4x |
data_files <- list.files( |
| 73 | 4x |
paste0(source_dir, if (is_bundle) "dist" else "standard"), |
| 74 | 4x |
"\\.(?:csv|parquet|json)", |
| 75 | 4x |
full.names = TRUE |
| 76 |
) |
|
| 77 | 4x |
data_files <- data_files[!grepl("datapackage", data_files, fixed = TRUE)]
|
| 78 | 4x |
source_issues <- list() |
| 79 | 4x |
for (file in list.files( |
| 80 | 4x |
paste0(source_dir, "raw"), |
| 81 | 4x |
"csv$", |
| 82 | 4x |
full.names = TRUE |
| 83 |
)) {
|
|
| 84 | 1x |
source_issues[[file]] <- list(data = "not_compressed") |
| 85 |
} |
|
| 86 | 4x |
if (length(data_files)) {
|
| 87 | 4x |
for (file in data_files) {
|
| 88 | 4x |
issue_messages <- NULL |
| 89 | 4x |
if (verbose) {
|
| 90 | 4x |
cli::cli_progress_step("checking file {.file {file}}", spinner = TRUE)
|
| 91 |
} |
|
| 92 | 4x |
data_issues <- NULL |
| 93 | 4x |
measure_issues <- NULL |
| 94 | 4x |
data <- tryCatch( |
| 95 | 4x |
if (grepl(".parquet", file, fixed = TRUE)) {
|
| 96 | ! |
dplyr::collect(arrow::read_parquet(file)) |
| 97 | 4x |
} else if (grepl(".json", file, fixed = TRUE)) {
|
| 98 | 1x |
as.data.frame(jsonlite::read_json(file, simplifyVector = TRUE)) |
| 99 |
} else {
|
|
| 100 | 3x |
con <- gzfile(file) |
| 101 | 3x |
on.exit(close(con)) |
| 102 | 3x |
vroom::vroom(con, show_col_types = FALSE) |
| 103 |
}, |
|
| 104 | 4x |
error = function(e) NULL |
| 105 |
) |
|
| 106 | 4x |
if (is.null(data)) {
|
| 107 | ! |
data_issues <- c(data_issues, "cant_read") |
| 108 |
} else {
|
|
| 109 | 4x |
if (grepl("csv$", file)) {
|
| 110 | 1x |
data_issues <- c(data_issues, "not_compressed") |
| 111 | 1x |
if (verbose) {
|
| 112 | 1x |
issue_messages <- c( |
| 113 | 1x |
issue_messages, |
| 114 | 1x |
"file is not compressed" |
| 115 |
) |
|
| 116 |
} |
|
| 117 |
} |
|
| 118 | 4x |
if (!("geography" %in% colnames(data))) {
|
| 119 | ! |
data_issues <- c(data_issues, "geography_missing") |
| 120 | ! |
if (verbose) {
|
| 121 | ! |
issue_messages <- c( |
| 122 | ! |
issue_messages, |
| 123 | ! |
"missing {.emph geography} column"
|
| 124 |
) |
|
| 125 |
} |
|
| 126 | 4x |
} else if (anyNA(data$geography)) {
|
| 127 | 1x |
data_issues <- c(data_issues, "geography_nas") |
| 128 | 1x |
if (verbose) {
|
| 129 | 1x |
issue_messages <- c( |
| 130 | 1x |
issue_messages, |
| 131 | 1x |
"{.emph geography} column contains NAs"
|
| 132 |
) |
|
| 133 |
} |
|
| 134 |
} |
|
| 135 | 4x |
if (!("time" %in% colnames(data))) {
|
| 136 | ! |
data_issues <- c(data_issues, "time_missing") |
| 137 | ! |
if (verbose) {
|
| 138 | ! |
issue_messages <- c( |
| 139 | ! |
issue_messages, |
| 140 | ! |
"missing {.emph time} column"
|
| 141 |
) |
|
| 142 |
} |
|
| 143 | 4x |
} else if (anyNA(data$time)) {
|
| 144 | 1x |
data_issues <- c(data_issues, "time_nas") |
| 145 | 1x |
if (verbose) {
|
| 146 | 1x |
issue_messages <- c( |
| 147 | 1x |
issue_messages, |
| 148 | 1x |
"{.emph time} column contains NAs"
|
| 149 |
) |
|
| 150 |
} |
|
| 151 |
} |
|
| 152 | 4x |
for (col in colnames(data)) {
|
| 153 | 12x |
if (!(col %in% c("geography", "time")) && !(col %in% names(info))) {
|
| 154 | 1x |
measure_issues <- c(measure_issues, paste("missing_info:", col))
|
| 155 | 1x |
if (verbose) {
|
| 156 | 1x |
issue_messages <- c( |
| 157 | 1x |
issue_messages, |
| 158 | 1x |
paste0( |
| 159 | 1x |
"{.emph ",
|
| 160 | 1x |
col, |
| 161 | 1x |
"} column does not have an entry in measure_info" |
| 162 |
) |
|
| 163 |
) |
|
| 164 |
} |
|
| 165 |
} |
|
| 166 |
} |
|
| 167 |
} |
|
| 168 | 4x |
file_issues <- list() |
| 169 | 4x |
if (length(data_issues)) {
|
| 170 | 1x |
file_issues$data <- data_issues |
| 171 |
} |
|
| 172 | 4x |
if (length(measure_issues)) {
|
| 173 | 1x |
file_issues$measures <- measure_issues |
| 174 |
} |
|
| 175 | 4x |
source_issues[[file]] <- file_issues |
| 176 | 4x |
if (verbose) {
|
| 177 | 4x |
if (length(issue_messages)) {
|
| 178 | 1x |
cli::cli_progress_done(result = "failed") |
| 179 | 1x |
cli::cli_bullets(structure( |
| 180 | 1x |
issue_messages, |
| 181 | 1x |
names = rep(" ", length(issue_messages))
|
| 182 |
)) |
|
| 183 |
} else {
|
|
| 184 | 3x |
cli::cli_progress_done() |
| 185 |
} |
|
| 186 |
} |
|
| 187 |
} |
|
| 188 |
} else {
|
|
| 189 | ! |
if (verbose) cli::cli_alert_info("no standard data files found to check")
|
| 190 |
} |
|
| 191 | 4x |
if (!identical(process$check_results, source_issues)) {
|
| 192 | 3x |
process$checked <- Sys.time() |
| 193 | 3x |
process$check_results <- source_issues |
| 194 | 3x |
dcf_process_record(process_file, process) |
| 195 |
} |
|
| 196 | 4x |
issues[[name]] <- source_issues |
| 197 |
} |
|
| 198 | ||
| 199 | 3x |
invisible(issues) |
| 200 |
} |
| 1 |
#' Standardize Epic Data |
|
| 2 |
#' |
|
| 3 |
#' Standardize a raw Epic data table. |
|
| 4 |
#' |
|
| 5 |
#' @param raw_data Raw Epic data, such as returned from \link{dcf_read_epic}.
|
|
| 6 |
#' @returns A standardized form of \code{data}.
|
|
| 7 |
#' @section Standardization: |
|
| 8 |
#' \itemize{
|
|
| 9 |
#' \item Collapse location columns (\code{state} or \code{county}) to a single
|
|
| 10 |
#' \code{geography} column, and region names to IDs.
|
|
| 11 |
#' \item Collapse time columns (\code{year}, \code{month}, or \code{week}) to a single
|
|
| 12 |
#' \code{time} column, and clean up value formatting.
|
|
| 13 |
#' \item Drop rows with no values across value columns. |
|
| 14 |
#' } |
|
| 15 |
#' @examples |
|
| 16 |
#' \dontrun{
|
|
| 17 |
#' raw_data <- dcf_read_epic("data/epic/raw/flu.csv.xz")
|
|
| 18 |
#' standard_data <- dcf_standardize_epic(raw_data) |
|
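#' |
#' # the standardized table uses geography and time as ID columns |
#' head(standard_data[, c("geography", "time")]) |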
| 19 |
#' } |
|
| 20 |
#' |
|
| 21 |
#' @export |
|
| 22 | ||
| 23 |
dcf_standardize_epic <- function(raw_data) {
|
|
| 24 | 1x |
region_names <- epic_id_maps$regions |
| 25 | 1x |
names(region_names) <- gsub( |
| 26 | 1x |
" (?:CITY AND BOROUGH|BOROUGH|PARISH|MUNICIPALITY|MUNICIPIO)|[.']", |
| 27 |
"", |
|
| 28 | 1x |
toupper(names(region_names)) |
| 29 |
) |
|
| 30 | 1x |
cols <- colnames(raw_data) |
| 31 | 1x |
time_col <- which(cols == "year") |
| 32 | 1x |
if (length(time_col)) {
|
| 33 | 1x |
colnames(raw_data)[time_col] <- "time" |
| 34 | 1x |
raw_data$time <- as.integer(substring( |
| 35 | 1x |
raw_data$time, |
| 36 | 1x |
1L, |
| 37 | 1x |
4L |
| 38 |
)) |
|
| 39 |
} |
|
| 40 | 1x |
month_col <- which(cols == "month") |
| 41 | 1x |
if (length(month_col)) {
|
| 42 | ! |
raw_data$time <- paste0( |
| 43 | ! |
raw_data$time, |
| 44 |
"-", |
|
| 45 | ! |
epic_id_maps$months[raw_data$month] |
| 46 |
) |
|
| 47 |
} |
|
| 48 | 1x |
week_col <- which(cols == "week") |
| 49 | 1x |
if (length(week_col)) {
|
| 50 | 1x |
raw_data$time <- paste0( |
| 51 | 1x |
raw_data$time, |
| 52 |
"-", |
|
| 53 | 1x |
vapply( |
| 54 | 1x |
strsplit(raw_data$week, "[^A-Za-z0-9]"), |
| 55 | 1x |
function(p) {
|
| 56 | 4x |
paste0( |
| 57 | 4x |
epic_id_maps$months[[p[[1L]]]], |
| 58 |
"-", |
|
| 59 | 4x |
formatC(as.integer(p[[2L]]), width = 2L, flag = "0") |
| 60 |
) |
|
| 61 |
}, |
|
| 62 |
"" |
|
| 63 |
) |
|
| 64 |
) |
|
| 65 |
} |
|
| 66 | 1x |
geo_col <- grep("^(?:state|county)", cols)
|
| 67 | 1x |
if (length(geo_col)) {
|
| 68 | 1x |
colnames(raw_data)[geo_col] <- "geography" |
| 69 | 1x |
raw_data$geography <- toupper(raw_data$geography) |
| 70 | 1x |
missing_geo <- !(raw_data$geography %in% names(region_names)) |
| 71 | 1x |
if (any(missing_geo)) {
|
| 72 | 1x |
geo <- sub( |
| 73 | 1x |
"LA ", |
| 74 | 1x |
"LA", |
| 75 | 1x |
sub("^SAINT", "ST", raw_data$geography[missing_geo]),
|
| 76 | 1x |
fixed = TRUE |
| 77 |
) |
|
| 78 | 1x |
if (any(grepl(", VA", geo, fixed = TRUE))) {
|
| 79 | 1x |
geo[geo == "SALEM, VA"] <- "SALEM CITY, VA" |
| 80 | 1x |
geo[geo == "RADFORD, VA"] <- "RADFORD CITY, VA" |
| 81 | 1x |
geo[geo == "DONA ANA, NM"] <- "DO\u00d1A ANA, NM" |
| 82 | 1x |
geo[geo == "MATANUSKA SUSITNA, AK"] <- "MATANUSKA-SUSITNA, AK" |
| 83 |
} |
|
| 84 | 1x |
raw_data$geography[missing_geo] <- geo |
| 85 |
} |
|
| 86 | 1x |
missing_regions <- raw_data$geography[ |
| 87 | 1x |
!(raw_data$geography %in% names(region_names)) |
| 88 |
] |
|
| 89 | 1x |
if (length(missing_regions)) {
|
| 90 | ! |
cli::cli_warn( |
| 91 | ! |
'unrecognized regions: {paste(unique(missing_regions), collapse = "; ")}'
|
| 92 |
) |
|
| 93 |
} |
|
| 94 | 1x |
raw_data$geography <- region_names[raw_data$geography] |
| 95 | 1x |
raw_data <- raw_data[!is.na(raw_data$geography), ] |
| 96 |
} |
|
| 97 | 1x |
raw_data <- raw_data[, |
| 98 | 1x |
!(colnames(raw_data) %in% c("state", "county", "year", "month", "week"))
|
| 99 |
] |
|
| 100 | 1x |
raw_data[ |
| 101 | 1x |
rowSums( |
| 102 | 1x |
!is.na(raw_data[, |
| 103 | 1x |
!(colnames(raw_data) %in% c("geography", "time", "age"))
|
| 104 |
]) |
|
| 105 |
) != |
|
| 106 | 1x |
0L, |
| 107 |
] |
|
| 108 |
} |
| 1 |
#' Add Documentation of a Dataset to a Datapackage |
|
| 2 |
#' |
|
| 3 |
#' Add information about variables in a dataset to a \code{datapackage.json} metadata file.
|
|
| 4 |
#' |
|
| 5 |
#' @param filename A character vector of paths to plain-text tabular data files, relative to \code{dir}.
|
|
| 6 |
#' @param meta Information about each data file. A list with a list entry for each entry in |
|
| 7 |
#' \code{filename}; see details. If a single list is provided for multiple data files, it will apply to all.
|
|
| 8 |
#' @param packagename Package to add the metadata to; path to the \code{.json} file relative to
|
|
| 9 |
#' \code{dir}, or a list with the read-in version.
|
|
| 10 |
#' @param dir Directory in which to look for \code{filename}, and write \code{packagename}.
|
|
| 11 |
#' @param write Logical; if \code{FALSE}, returns the resource metadata without reading or rewriting
|
|
| 12 |
#' \code{packagename}.
|
|
| 13 |
#' @param refresh Logical; if \code{FALSE}, will retain any existing dataset information.
|
|
| 14 |
#' @param sha A number specifying the Secure Hash Algorithm function, |
|
| 15 |
#' if \code{openssl} is available (checked with \code{Sys.which('openssl')}).
|
|
| 16 |
#' @param pretty Logical; if \code{TRUE}, will pretty-print the datapackage.
|
|
| 17 |
#' @param summarize_ids Logical; if \code{TRUE}, will include ID columns in schema field summaries.
|
|
| 18 |
#' @param open_after Logical; if \code{TRUE}, opens the written datapackage after saving.
|
|
| 19 |
#' @param verbose Logical; if \code{FALSE}, will not show status messages.
|
|
| 20 |
#' @details |
|
| 21 |
#' \code{meta} should be a list with an unnamed entry for each entry in \code{filename},
|
|
| 22 |
#' and each entry can include a named entry for any of these: |
|
| 23 |
#' \describe{
|
|
| 24 |
#' \item{source}{
|
|
| 25 |
#' A list or list of lists with entries for at least \code{name}, and ideally for \code{url}.
|
|
| 26 |
#' } |
|
| 27 |
#' \item{ids}{
|
|
| 28 |
#' A list or list of lists with entries for at least \code{variable} (the name of a variable in the dataset).
|
|
| 29 |
#' Might also include \code{map} with a list or path to a JSON file resulting in a list with an
|
|
| 30 |
#' entry for each ID, and additional information about that entity, to be read in as its features. |
|
| 31 |
#' All files will be loaded to help with aggregation, but local files will be included in the datapackage, |
|
| 32 |
#' whereas hosted files will be loaded client-side. |
|
| 33 |
#' } |
|
| 34 |
#' \item{time}{
|
|
| 35 |
#' A string giving the name of a variable in the dataset representing a repeated observation of the same entity. |
|
| 36 |
#' } |
|
| 37 |
#' \item{variables}{
|
|
| 38 |
#' A list with named entries providing more information about the variables in the dataset. |
|
| 39 |
#' See \code{\link{dcf_measure_info}}.
|
|
| 40 |
#' } |
|
| 41 |
#' \item{vintage}{
|
|
| 42 |
#' A string specifying the time and/or location at which the data were produced. |
|
| 43 |
#' } |
|
| 44 |
#' } |
|
| 45 |
#' @examples |
|
| 46 |
#' \dontrun{
|
|
| 47 |
#' # write example data |
|
| 48 |
#' write.csv(mtcars, "mtcars.csv") |
|
| 49 |
#' |
|
| 50 |
#' # add it to an existing datapackage.json file in the current working directory |
|
| 51 |
#' dcf_datapackage_add("mtcars.csv")
|
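#'
#' # with per-file metadata (illustrative source and vintage values)
#' dcf_datapackage_add(
#'   "mtcars.csv",
#'   meta = list(list(
#'     source = list(name = "Motor Trend", url = "https://example.com"),
#'     vintage = "1974"
#'   ))
#' )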
|
| 52 |
#' } |
|
| 53 |
#' @returns An invisible version of the updated datapackage, which is also written to |
|
| 54 |
#' \code{datapackage.json} if \code{write = TRUE}.
|
|
| 55 |
#' @seealso Initialize the \code{datapackage.json} file with \code{\link{dcf_datapackage_init}}.
|
|
| 56 |
#' @export |
|
| 57 | ||
| 58 |
dcf_datapackage_add <- function( |
|
| 59 |
filename, |
|
| 60 |
meta = list(), |
|
| 61 |
packagename = "datapackage.json", |
|
| 62 |
dir = ".", |
|
| 63 |
write = TRUE, |
|
| 64 |
refresh = TRUE, |
|
| 65 |
sha = "512", |
|
| 66 |
pretty = FALSE, |
|
| 67 |
summarize_ids = FALSE, |
|
| 68 |
open_after = FALSE, |
|
| 69 |
verbose = interactive() |
|
| 70 |
) {
|
|
| 71 | 4x |
if (missing(filename)) {
|
| 72 | ! |
cli::cli_abort("{.arg filename} must be specified")
|
| 73 |
} |
|
| 74 | 4x |
setnames <- names(filename) |
| 75 | 4x |
if (file.exists(filename[[1]])) {
|
| 76 | ! |
if (dir == ".") {
|
| 77 | ! |
dir <- dirname(filename[[1]]) |
| 78 |
} |
|
| 79 | ! |
filename <- basename(filename) |
| 80 |
} |
|
| 81 | 4x |
if (any(!file.exists(paste0(dir, "/", filename)))) {
|
| 82 | ! |
filename <- filename[!file.exists(filename)] |
| 83 | ! |
cli::cli_abort("{?a file/files} did not exist: {filename}")
|
| 84 |
} |
|
| 85 | 4x |
package <- if ( |
| 86 | 4x |
is.character(packagename) && file.exists(paste0(dir, "/", packagename)) |
| 87 |
) {
|
|
| 88 | 4x |
paste0(dir, "/", packagename) |
| 89 |
} else {
|
|
| 90 | ! |
packagename |
| 91 |
} |
|
| 92 | 4x |
if (write) {
|
| 93 | 4x |
if (is.character(package)) {
|
| 94 | 4x |
package <- paste0(dir, "/", packagename) |
| 95 | 4x |
package <- if (file.exists(package)) {
|
| 96 | 4x |
packagename <- package |
| 97 | 4x |
jsonlite::read_json(package) |
| 98 |
} else {
|
|
| 99 | ! |
dcf_datapackage_init( |
| 100 | ! |
if (!is.null(setnames)) setnames[[1]] else filename[[1]], |
| 101 | ! |
dir = dir |
| 102 |
) |
|
| 103 |
} |
|
| 104 |
} |
|
| 105 | 4x |
if (!is.list(package)) {
|
| 106 | ! |
cli::cli_abort(c( |
| 107 | ! |
"{.arg package} does not appear to be in the right format",
|
| 108 | ! |
i = "this should be (or be read in from JSON as) a list with a {.code resource} entry"
|
| 109 |
)) |
|
| 110 |
} |
|
| 111 |
} |
|
| 112 | 4x |
if (!is.list(package)) {
|
| 113 | ! |
package <- list() |
| 114 |
} |
|
| 115 | 4x |
single_meta <- FALSE |
| 116 | 4x |
metas <- if (!is.null(names(meta))) {
|
| 117 | 4x |
meta_names <- if (is.null(setnames)) filename else setnames |
| 118 | 4x |
if (all(meta_names %in% names(meta))) {
|
| 119 | 4x |
meta[meta_names] |
| 120 |
} else {
|
|
| 121 | ! |
single_meta <- TRUE |
| 122 | ! |
if (length(meta$variables) == 1 && is.character(meta$variables)) {
|
| 123 | ! |
if (!file.exists(meta$variables)) {
|
| 124 | ! |
meta$variables <- paste0(dir, "/", meta$variables) |
| 125 |
} |
|
| 126 | ! |
if (file.exists(meta$variables)) {
|
| 127 | ! |
meta$variables <- jsonlite::read_json(meta$variables) |
| 128 |
} |
|
| 129 |
} |
|
| 130 | ! |
meta$variables <- replace_equations(meta$variables) |
| 131 | ! |
meta |
| 132 |
} |
|
| 133 |
} else {
|
|
| 134 | ! |
meta[seq_along(filename)] |
| 135 |
} |
|
| 136 | 4x |
if (!single_meta) {
|
| 137 | 4x |
metas <- lapply(metas, function(m) {
|
| 138 | 4x |
m$variables <- replace_equations(m$variables) |
| 139 | 4x |
m |
| 140 |
}) |
|
| 141 |
} |
|
| 142 | 4x |
collect_metadata <- function(file) {
|
| 143 | 4x |
f <- paste0(dir, "/", filename[[file]]) |
| 144 | 4x |
m <- if (single_meta) meta else metas[[file]] |
| 145 | 4x |
format <- if (grepl(".parquet", f, fixed = TRUE)) {
|
| 146 | ! |
"parquet" |
| 147 | 4x |
} else if (grepl(".json", f, fixed = TRUE)) {
|
| 148 | 1x |
"json" |
| 149 | 4x |
} else if (grepl(".csv", f, fixed = TRUE)) {
|
| 150 | 3x |
"csv" |
| 151 | 4x |
} else if (grepl(".rds", f, fixed = TRUE)) {
|
| 152 | ! |
"rds" |
| 153 |
} else {
|
|
| 154 | ! |
"tsv" |
| 155 |
} |
|
| 156 | 4x |
if (is.na(format)) {
|
| 157 | ! |
format <- "rds" |
| 158 |
} |
|
| 159 | 4x |
info <- file.info(f) |
| 160 | 4x |
metas <- list() |
| 161 | 4x |
unpack_meta <- function(n) {
|
| 162 | 20x |
if (!length(m[[n]])) {
|
| 163 | 5x |
list() |
| 164 | 15x |
} else if (is.list(m[[n]][[1]])) {
|
| 165 | 4x |
m[[n]] |
| 166 |
} else {
|
|
| 167 | 11x |
list(m[[n]]) |
| 168 |
} |
|
| 169 |
} |
|
| 170 | 4x |
vintage <- unlist(unpack_meta("vintage"))
|
| 171 | 4x |
ids <- unpack_meta("ids")
|
| 172 | 4x |
idvars <- NULL |
| 173 | 4x |
for (i in seq_along(ids)) {
|
| 174 | 4x |
if (is.list(ids[[i]])) {
|
| 175 | 4x |
if ( |
| 176 | ! |
length(ids[[i]]$map) == 1 && |
| 177 | ! |
is.character(ids[[i]]$map) && |
| 178 | ! |
file.exists(ids[[i]]$map) |
| 179 |
) {
|
|
| 180 | ! |
ids[[i]]$map_content <- paste( |
| 181 | ! |
readLines(ids[[i]]$map, warn = FALSE), |
| 182 | ! |
collapse = "" |
| 183 |
) |
|
| 184 |
} |
|
| 185 |
} else {
|
|
| 186 | 4x |
ids[[i]] <- list(variable = ids[[i]]) |
| 187 |
} |
|
| 188 | 4x |
if (!ids[[i]]$variable %in% idvars) idvars <- c(idvars, ids[[i]]$variable) |
| 189 |
} |
|
| 190 | 4x |
data <- if (format == "rds") {
|
| 191 | ! |
tryCatch(readRDS(f), error = function(e) NULL) |
| 192 | 4x |
} else if (format == "parquet") {
|
| 193 | ! |
tryCatch(arrow::read_parquet(f), error = function(e) NULL) |
| 194 | 4x |
} else if (format == "json") {
|
| 195 | 1x |
tryCatch( |
| 196 | 1x |
as.data.frame(jsonlite::read_json(f, simplifyVector = TRUE)), |
| 197 | 1x |
error = function(e) NULL |
| 198 |
) |
|
| 199 |
} else {
|
|
| 200 | 3x |
attempt_read(f, c("geography", "time", idvars))
|
| 201 |
} |
|
| 202 | 4x |
if (is.null(data)) {
|
| 203 | ! |
cli::cli_warn(c( |
| 204 | ! |
paste0("failed to read in the data file ({.file {f}})"),
|
| 205 | ! |
i = "check that it is in a compatible format" |
| 206 |
)) |
|
| 207 | ! |
return(NULL) |
| 208 |
} |
|
| 209 | 4x |
if (!all(rownames(data) == seq_len(nrow(data)))) {
|
| 210 | ! |
data <- cbind(`_row` = rownames(data), data) |
| 211 |
} |
|
| 212 | 4x |
timevar <- unlist(unpack_meta("time"))
|
| 213 | 4x |
times <- if (is.null(timevar)) rep(1, nrow(data)) else data[[timevar]] |
| 214 | 4x |
times_unique <- unique(times) |
| 215 | 4x |
if (!single_meta) {
|
| 216 | 4x |
varinf <- unpack_meta("variables")
|
| 217 | 4x |
if (length(varinf) == 1 && is.character(varinf[[1]])) {
|
| 218 | ! |
if (!file.exists(varinf[[1]])) {
|
| 219 | ! |
varinf[[1]] <- paste0(dir, "/", varinf[[1]]) |
| 220 |
} |
|
| 221 | ! |
if (file.exists(varinf[[1]])) {
|
| 222 | ! |
if (varinf[[1]] %in% names(metas)) {
|
| 223 | ! |
varinf <- metas[[varinf[[1]]]] |
| 224 |
} else {
|
|
| 225 | ! |
varinf <- metas[[varinf[[1]]]] <- dcf_measure_info( |
| 226 | ! |
varinf[[1]], |
| 227 | ! |
write = FALSE, |
| 228 | ! |
render = TRUE |
| 229 |
) |
|
| 230 |
} |
|
| 231 | ! |
varinf <- varinf[varinf != ""] |
| 232 |
} |
|
| 233 |
} |
|
| 234 | 4x |
varinf_full <- names(varinf) |
| 235 | 4x |
varinf_suf <- sub("^[^:]+:", "", varinf_full)
|
| 236 |
} |
|
| 237 | 4x |
created <- as.character(info$mtime) |
| 238 | 4x |
res <- list( |
| 239 | 4x |
bytes = as.integer(info$size), |
| 240 | 4x |
encoding = stringi::stri_enc_detect(f)[[1]][1, 1], |
| 241 | 4x |
md5 = tools::md5sum(f)[[1]], |
| 242 | 4x |
format = format, |
| 243 | 4x |
name = if (!is.null(setnames)) {
|
| 244 | ! |
setnames[file] |
| 245 | 4x |
} else if (!is.null(m$name)) {
|
| 246 | ! |
m$name |
| 247 |
} else {
|
|
| 248 | 4x |
sub("\\.[^.]*$", "", basename(filename[[file]]))
|
| 249 |
}, |
|
| 250 | 4x |
filename = filename[[file]], |
| 251 | 4x |
versions = get_versions(f), |
| 252 | 4x |
source = unpack_meta("source"),
|
| 253 | 4x |
ids = ids, |
| 254 | 4x |
id_length = if (length(idvars)) {
|
| 255 | 4x |
id_lengths <- nchar(data[[idvars[1]]]) |
| 256 | 4x |
id_lengths <- id_lengths[!is.na(id_lengths)] |
| 257 | ! |
if (all(id_lengths == id_lengths[1])) id_lengths[1] else 0 |
| 258 |
} else {
|
|
| 259 | ! |
0 |
| 260 |
}, |
|
| 261 | 4x |
time = timevar, |
| 262 | 4x |
profile = "data-resource", |
| 263 | 4x |
created = as.character(info$mtime), |
| 264 | 4x |
last_modified = as.character(info$ctime), |
| 265 | 4x |
vintage = if (length(vintage)) vintage else NULL, |
| 266 | 4x |
row_count = nrow(data), |
| 267 | 4x |
entity_count = if (length(idvars)) {
|
| 268 | 4x |
length(unique(data[[idvars[1]]])) |
| 269 |
} else {
|
|
| 270 | ! |
nrow(data) |
| 271 |
}, |
|
| 272 | 4x |
schema = list( |
| 273 | 4x |
fields = lapply( |
| 274 | 4x |
if (summarize_ids) {
|
| 275 | 4x |
colnames(data) |
| 276 |
} else {
|
|
| 277 | ! |
colnames(data)[!colnames(data) %in% idvars] |
| 278 |
}, |
|
| 279 | 4x |
function(cn) {
|
| 280 | 12x |
v <- data[[cn]] |
| 281 | 12x |
invalid <- !is.finite(v) |
| 282 | 12x |
r <- list(name = cn, duplicates = sum(duplicated(v))) |
| 283 | 12x |
if (!single_meta) {
|
| 284 | 12x |
if (cn %in% varinf_full) {
|
| 285 | 2x |
r$info <- varinf[[cn]] |
| 286 | 10x |
} else if (cn %in% varinf_suf) {
|
| 287 | ! |
r$info <- varinf[[which(varinf_suf == cn)]] |
| 288 |
} |
|
| 289 | 12x |
r$info <- r$info[r$info != ""] |
| 290 |
} |
|
| 291 | 12x |
su <- !is.na(v) |
| 292 | 12x |
if (any(su)) {
|
| 293 | 12x |
r$time_range <- which(times_unique %in% range(times[su])) - 1 |
| 294 | 12x |
r$time_range <- if (length(r$time_range)) {
|
| 295 | 12x |
r$time_range[c(1, length(r$time_range))] |
| 296 |
} else {
|
|
| 297 | ! |
c(-1, -1) |
| 298 |
} |
|
| 299 |
} else {
|
|
| 300 | ! |
r$time_range <- c(-1, -1) |
| 301 |
} |
|
| 302 | 12x |
if (!is.character(v) && all(invalid)) {
|
| 303 | ! |
r$type <- "unknown" |
| 304 | ! |
r$missing <- length(v) |
| 305 | 12x |
} else if (is.numeric(v)) {
|
| 306 | 5x |
r$type <- if (all(invalid | as.integer(v) == v)) {
|
| 307 | 5x |
"integer" |
| 308 |
} else {
|
|
| 309 | ! |
"float" |
| 310 |
} |
|
| 311 | 5x |
r$missing <- sum(invalid) |
| 312 | 5x |
r$mean <- round(mean(v, na.rm = TRUE), 6) |
| 313 | 5x |
r$sd <- round(stats::sd(v, na.rm = TRUE), 6) |
| 314 | 5x |
r$min <- round(min(v, na.rm = TRUE), 6) |
| 315 | 5x |
r$max <- round(max(v, na.rm = TRUE), 6) |
| 316 |
} else {
|
|
| 317 | 7x |
r$type <- "string" |
| 318 | 7x |
v <- as.factor(as.character(v)) |
| 319 | 7x |
r$missing <- sum(is.na(v) | is.nan(v) | grepl("^[\\s.-]$", v))
|
| 320 | 7x |
r$table <- structure(as.list(tabulate(v)), names = levels(v)) |
| 321 |
} |
|
| 322 | 12x |
r |
| 323 |
} |
|
| 324 |
) |
|
| 325 |
) |
|
| 326 |
) |
|
| 327 | 4x |
if (!single_meta && "_references" %in% names(varinf)) {
|
| 328 | ! |
res[["_references"]] <- varinf[["_references"]] |
| 329 |
} |
|
| 330 | 4x |
if (Sys.which("openssl") != "") {
|
| 331 | 4x |
res[[paste0("sha", sha)]] <- calculate_sha(f, sha)
|
| 332 |
} |
|
| 333 | 4x |
res |
| 334 |
} |
|
| 335 | 4x |
metadata <- Filter(length, lapply(seq_along(filename), collect_metadata)) |
| 336 | 4x |
if (single_meta) {
|
| 337 | ! |
package$measure_info <- lapply(meta$variables, function(e) e[e != ""]) |
| 338 |
} |
|
| 339 | 4x |
names <- vapply(metadata, "[[", "", "filename") |
| 340 | 4x |
for (resource in package$resources) {
|
| 341 | 2x |
if (length(resource$versions)) {
|
| 342 | ! |
su <- which(names %in% resource$filename) |
| 343 | ! |
if (length(su)) {
|
| 344 | ! |
if (length(metadata[[su]]$versions)) {
|
| 345 | ! |
metadata[[su]]$versions <- rbind( |
| 346 | ! |
metadata[[su]]$versions, |
| 347 | ! |
if (is.data.frame(resource$versions)) {
|
| 348 | ! |
resource$versions |
| 349 |
} else {
|
|
| 350 | ! |
as.data.frame(do.call(cbind, resource$versions)) |
| 351 |
} |
|
| 352 |
) |
|
| 353 | ! |
metadata[[su]]$versions <- metadata[[su]]$versions[ |
| 354 | ! |
!duplicated(metadata[[su]]$versions), |
| 355 |
] |
|
| 356 |
} |
|
| 357 |
} |
|
| 358 |
} |
|
| 359 |
} |
|
| 360 | 4x |
if (refresh) {
|
| 361 | 4x |
package$resources <- metadata |
| 362 |
} else {
|
|
| 363 | ! |
package$resources <- c( |
| 364 | ! |
metadata, |
| 365 | ! |
package$resources[ |
| 366 | ! |
!(vapply(package$resources, "[[", "", "filename") %in% names) |
| 367 |
] |
|
| 368 |
) |
|
| 369 |
} |
|
| 370 | 4x |
if (write) {
|
| 371 | 4x |
packagename <- if (is.character(packagename)) {
|
| 372 | 4x |
packagename |
| 373 |
} else {
|
|
| 374 | ! |
"datapackage.json" |
| 375 |
} |
|
| 376 | 4x |
jsonlite::write_json( |
| 377 | 4x |
package, |
| 378 | 4x |
if (file.exists(packagename)) {
|
| 379 | 4x |
packagename |
| 380 |
} else {
|
|
| 381 | ! |
paste0(dir, "/", packagename) |
| 382 |
}, |
|
| 383 | 4x |
auto_unbox = TRUE, |
| 384 | 4x |
digits = 6, |
| 385 | 4x |
dataframe = "columns", |
| 386 | 4x |
pretty = pretty |
| 387 |
) |
|
| 388 | 4x |
if (verbose) {
|
| 389 | ! |
cli::cli_bullets(c( |
| 390 | ! |
v = paste( |
| 391 | ! |
if (refresh) "updated resource in" else "added resource to", |
| 392 | ! |
"datapackage.json:" |
| 393 |
), |
|
| 394 | ! |
"*" = paste0("{.path ", packagename, "}")
|
| 395 |
)) |
|
| 396 | ! |
if (open_after) rstudioapi::navigateToFile(packagename) |
| 397 |
} |
|
| 398 |
} |
|
| 399 | 4x |
invisible(package) |
| 400 |
} |
|
| 401 | ||
| 402 |
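# collect a file's git commit history as a data.frame (hash, author, date, message)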
get_versions <- function(file) {
|
|
| 403 | 4x |
log <- suppressWarnings(system2( |
| 404 | 4x |
"git", |
| 405 | 4x |
c("log", file),
|
| 406 | 4x |
stdout = TRUE |
| 407 |
)) |
|
| 408 | 4x |
if (is.null(attr(log, "status"))) {
|
| 409 | 2x |
log_entries <- strsplit(paste(log, collapse = "|"), "commit ")[[ |
| 410 | 2x |
1 |
| 411 |
]] |
|
| 412 | 2x |
log_entries <- do.call( |
| 413 | 2x |
rbind, |
| 414 | 2x |
Filter( |
| 415 | 2x |
function(x) length(x) == 4L, |
| 416 | 2x |
strsplit( |
| 417 | 2x |
log_entries[log_entries != ""], |
| 418 | 2x |
"\\|+(?:[^:]+:)?\\s*" |
| 419 |
) |
|
| 420 |
) |
|
| 421 |
) |
|
| 422 | 2x |
if (length(log_entries)) {
|
| 423 | 1x |
colnames(log_entries) <- c( |
| 424 | 1x |
"hash", |
| 425 | 1x |
"author", |
| 426 | 1x |
"date", |
| 427 | 1x |
"message" |
| 428 |
) |
|
| 429 | 1x |
as.data.frame(log_entries) |
| 430 |
} |
|
| 431 |
} |
|
| 432 |
} |
|
| 433 | ||
| 434 |
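# attempt to read a delimited text file with arrow, keeping ID columns as character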
attempt_read <- function(file, id_cols) {
|
|
| 435 | 3x |
tryCatch( |
| 436 |
{
|
|
| 437 | 3x |
sep <- if (grepl(".csv", file, fixed = TRUE)) "," else "\t"
|
| 438 | 3x |
cols <- scan(file, "", nlines = 1, sep = sep, quiet = TRUE) |
| 439 | 3x |
types <- rep("?", length(cols))
|
| 440 | 3x |
types[cols %in% id_cols] <- "c" |
| 441 | 3x |
arrow::read_delim_arrow( |
| 442 | 3x |
gzfile(file), |
| 443 | 3x |
sep, |
| 444 | 3x |
col_names = cols, |
| 445 | 3x |
col_types = paste(types, collapse = ""), |
| 446 | 3x |
skip = 1 |
| 447 |
) |
|
| 448 |
}, |
|
| 449 | 3x |
error = function(e) NULL |
| 450 |
) |
|
| 451 |
} |
|
| 452 | ||
| 453 |
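# hash a file by shelling out to the openssl CLI; returns "" if unavailable or on error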
calculate_sha <- function(file, level) {
|
|
| 454 | 4x |
if (Sys.which("openssl") != "") {
|
| 455 | 4x |
tryCatch( |
| 456 | 4x |
strsplit( |
| 457 | 4x |
system2( |
| 458 | 4x |
"openssl", |
| 459 | 4x |
c("dgst", paste0("-sha", level), shQuote(file)),
|
| 460 | 4x |
TRUE |
| 461 |
), |
|
| 462 |
" ", |
|
| 463 | 4x |
fixed = TRUE |
| 464 | 4x |
)[[1]][2], |
| 465 | 4x |
error = function(e) "" |
| 466 |
) |
|
| 467 |
} else {
|
|
| 468 |
"" |
|
| 469 |
} |
|
| 470 |
} |
| 1 |
#' Process Epic Staging Files |
|
| 2 |
#' |
|
| 3 |
#' Process Epic staging files, lightly standardizing them and moving them to the raw directory. |
|
| 4 |
#' |
|
| 5 |
#' @param staging_dir Directory containing the staging files. |
|
| 6 |
#' @param out_dir Directory to write new raw files to. |
|
| 7 |
#' @param verbose Logical; if \code{FALSE}, will not show status messages.
|
|
| 8 |
#' @param cleanup Logical; if \code{FALSE}, will not remove staging files after being processed.
|
|
| 9 |
#' @param ... Passes additional arguments to \code{\link{dcf_read_epic}}.
|
|
| 10 |
#' @returns \code{NULL} if no staging files are found.
|
|
| 11 |
#' Otherwise, a list with entries for \code{data} and \code{metadata}.
|
|
| 12 |
#' Each of these is a list with an entry for each recognized standard name, |
|
| 13 |
#' with potentially combined outputs similar to \code{\link{dcf_read_epic}}.
|
|
| 14 |
#' |
|
| 15 |
#' @examples |
|
| 16 |
#' \dontrun{
|
|
| 17 |
#' # run from a source project |
|
| 18 |
#' dcf_process_epic_staging() |
|
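#' |
#' # keep the staging files and inspect the combined outputs |
#' staged <- dcf_process_epic_staging(cleanup = FALSE) |
#' names(staged$data) |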
| 19 |
#' } |
|
| 20 |
#' |
|
| 21 |
#' @export |
|
| 22 | ||
| 23 |
dcf_process_epic_staging <- function( |
|
| 24 |
staging_dir = "raw/staging", |
|
| 25 |
out_dir = "raw", |
|
| 26 |
verbose = TRUE, |
|
| 27 |
cleanup = TRUE, |
|
| 28 |
... |
|
| 29 |
) {
|
|
| 30 | 1x |
files <- sort(list.files( |
| 31 | 1x |
staging_dir, |
| 32 | 1x |
"csv", |
| 33 | 1x |
full.names = TRUE, |
| 34 | 1x |
recursive = TRUE |
| 35 |
)) |
|
| 36 | 1x |
files <- files[!grepl("census", files)]
|
| 37 | 1x |
if (!length(files)) {
|
| 38 | ! |
if (verbose) {
|
| 39 | ! |
cli::cli_progress_message("no staging files found")
|
| 40 |
} |
|
| 41 | ! |
return(NULL) |
| 42 |
} |
|
| 43 | 1x |
id_cols <- c("state", "county", "age", "year", "month", "week")
|
| 44 | 1x |
metadata <- list() |
| 45 | 1x |
data <- list() |
| 46 | 1x |
for (file in files) {
|
| 47 | 2x |
if (verbose) {
|
| 48 | 2x |
cli::cli_progress_step("processing file {.file {file}}", spinner = TRUE)
|
| 49 |
} |
|
| 50 | 2x |
epic <- tryCatch(dcf_read_epic(file, ...), error = function(e) NULL) |
| 51 | 2x |
if (is.null(epic)) {
|
| 52 | ! |
if (verbose) {
|
| 53 | ! |
cli::cli_progress_done(result = "failed") |
| 54 |
} |
|
| 55 | ! |
next |
| 56 |
} |
|
| 57 | 2x |
if (epic$metadata$standard_name == "") {
|
| 58 | ! |
if (verbose) {
|
| 59 | ! |
cli::cli_warn("failed to identify standard type for {.file {file}}")
|
| 60 | ! |
cli::cli_progress_done(result = "failed") |
| 61 |
} |
|
| 62 | ! |
next |
| 63 |
} |
|
| 64 | 2x |
name <- epic$metadata$standard_name |
| 65 | 2x |
metadata[[name]] <- c(list(epic$metadata), metadata[[name]]) |
| 66 | 2x |
file_id_cols <- id_cols[id_cols %in% colnames(epic$data)] |
| 67 | 2x |
epic$data <- epic$data[ |
| 68 | 2x |
rowMeans(is.na(epic$data[, |
| 69 | 2x |
!(colnames(epic$data) %in% file_id_cols), |
| 70 | 2x |
drop = FALSE |
| 71 |
])) != |
|
| 72 | 2x |
1, |
| 73 |
] |
|
| 74 | 2x |
n_col <- grep("^n_", colnames(epic$data))
|
| 75 | 2x |
if (length(n_col)) {
|
| 76 | ! |
colnames(epic$data)[[n_col]] <- paste0("n_", epic$metadata$standard_name)
|
| 77 |
} |
|
| 78 | 2x |
if (!is.null(data[[name]])) {
|
| 79 | 1x |
cols <- colnames(data[[name]]) |
| 80 | 1x |
cols <- cols[!(cols %in% colnames(epic$data))] |
| 81 | 1x |
if (length(cols)) {
|
| 82 | ! |
epic$data[, cols] <- NA |
| 83 |
} |
|
| 84 | 1x |
epic$data <- epic$data[, colnames(data[[name]])] |
| 85 | 1x |
file_id_cols <- id_cols[id_cols %in% colnames(data[[name]])] |
| 86 | 1x |
data[[name]] <- data[[name]][ |
| 87 | 1x |
!(do.call(paste, data[[name]][, file_id_cols]) %in% |
| 88 | 1x |
do.call(paste, epic$data[, file_id_cols])), |
| 89 |
] |
|
| 90 |
} |
|
| 91 | 2x |
data[[name]] <- rbind(epic$data, data[[name]]) |
| 92 | 2x |
if (verbose) cli::cli_progress_done() |
| 93 |
} |
|
| 94 | 1x |
for (name in names(metadata)) {
|
| 95 | 1x |
if (verbose) {
|
| 96 | 1x |
cli::cli_progress_step( |
| 97 | 1x |
"writing standard raw output for {.field {name}}",
|
| 98 | 1x |
spinner = TRUE |
| 99 |
) |
|
| 100 |
} |
|
| 101 | 1x |
paths <- paste0(out_dir, "/", name, ".", c("json", "csv.xz"))
|
| 102 | 1x |
jsonlite::write_json( |
| 103 | 1x |
metadata[[name]], |
| 104 | 1x |
paths[[1L]], |
| 105 | 1x |
auto_unbox = TRUE, |
| 106 | 1x |
pretty = TRUE |
| 107 |
) |
|
| 108 | 1x |
vroom::vroom_write(data[[name]], paths[[2L]]) |
| 109 | 1x |
if (cleanup) {
|
| 110 | 1x |
unlink(vapply(metadata[[name]], "[[", "", "file")) |
| 111 |
} |
|
| 112 | 1x |
if (verbose) cli::cli_progress_done() |
| 113 |
} |
|
| 114 | 1x |
return(list(metadata = metadata, data = data)) |
| 115 |
} |
| 1 |
#' Create a datapackage.json template |
|
| 2 |
#' |
|
| 3 |
#' Initialize dataset documentation with a \code{datapackage.json} template, based on the
|
|
| 4 |
#' \href{https://specs.frictionlessdata.io/data-package}{Data Package} standard.
|
|
| 5 |
#' |
|
| 6 |
#' @param name A unique name for the dataset; allowed characters are \code{[a-z._/-]}.
|
|
| 7 |
#' @param title A display name for the dataset; if not specified, will be a formatted version of \code{name}.
|
|
| 8 |
#' @param dir Directory in which to save the \code{datapackage.json} file.
|
|
| 9 |
#' @param ... passes arguments to \code{\link{dcf_datapackage_add}}.
|
|
| 10 |
#' @param write Logical; if \code{FALSE}, the package object will not be written to a file.
|
|
| 11 |
#' @param overwrite Logical; if \code{TRUE} and \code{write} is \code{TRUE}, an existing
|
|
| 12 |
#' \code{datapackage.json} file will be overwritten.
|
|
| 13 |
#' @param quiet Logical; if \code{TRUE}, will not print messages or navigate to files.
|
|
| 14 |
#' @examples |
|
| 15 |
#' \dontrun{
|
|
| 16 |
#' # make a template datapackage.json file in the current working directory |
|
| 17 |
#' dcf_datapackage_init("mtcars", "Motor Trend Car Road Tests")
|
|
| 18 |
#' } |
|
| 19 |
#' @returns An invisible list with the content written to the \code{datapackage.json} file.
|
|
| 20 |
#' @seealso Add basic information about a dataset with \code{\link{dcf_datapackage_add}}.
|
|
| 21 |
#' @export |
|
| 22 | ||
| 23 |
dcf_datapackage_init <- function( |
|
| 24 |
name, |
|
| 25 |
title = name, |
|
| 26 |
dir = ".", |
|
| 27 |
..., |
|
| 28 |
write = TRUE, |
|
| 29 |
overwrite = FALSE, |
|
| 30 |
quiet = !interactive() |
|
| 31 |
) {
|
|
| 32 | 2x |
if (missing(name)) {
|
| 33 | ! |
cli::cli_abort("{.arg name} must be specified")
|
| 34 |
} |
|
| 35 | 2x |
package <- list( |
| 36 | 2x |
name = name, |
| 37 | 2x |
title = if (title == name) {
|
| 38 | 2x |
gsub("\\b(\\w)", "\\U\\1", gsub("[._/-]", " ", name), perl = TRUE)
|
| 39 |
} else {
|
|
| 40 | ! |
title |
| 41 |
}, |
|
| 42 | 2x |
licence = list( |
| 43 | 2x |
url = "http://opendatacommons.org/licenses/pddl", |
| 44 | 2x |
name = "Open Data Commons Public Domain", |
| 45 | 2x |
version = "1.0", |
| 46 | 2x |
id = "odc-pddl" |
| 47 |
), |
|
| 48 | 2x |
resources = list() |
| 49 |
) |
|
| 50 | 2x |
package_path <- normalizePath(paste0(dir, "/datapackage.json"), "/", FALSE) |
| 51 | 2x |
if (write && !overwrite && file.exists(package_path)) {
|
| 52 | ! |
cli::cli_abort(c( |
| 53 | ! |
"datapackage ({.path {package_path}}) already exists",
|
| 54 | ! |
i = "add {.code overwrite = TRUE} to overwrite it"
|
| 55 |
)) |
|
| 56 |
} |
|
| 57 | 2x |
if (length(list(...))) {
|
| 58 | ! |
package$resources <- dcf_datapackage_add(..., dir = dir, write = FALSE) |
| 59 |
} |
|
| 60 | 2x |
if (write) {
|
| 61 | 2x |
if (!dir.exists(dir)) {
|
| 62 | ! |
dir.create(dir, recursive = TRUE) |
| 63 |
} |
|
| 64 | 2x |
jsonlite::write_json( |
| 65 | 2x |
package, |
| 66 | 2x |
package_path, |
| 67 | 2x |
auto_unbox = TRUE, |
| 68 | 2x |
digits = 6, |
| 69 | 2x |
pretty = TRUE |
| 70 |
) |
|
| 71 | 2x |
if (!quiet) {
|
| 72 | ! |
cli::cli_bullets(c( |
| 73 | ! |
v = "created metadata template for {name}:",
|
| 74 | ! |
"*" = paste0("{.path ", package_path, "}")
|
| 75 |
)) |
|
| 76 | ! |
rstudioapi::navigateToFile(package_path) |
| 77 |
} |
|
| 78 |
} |
|
| 79 | 2x |
invisible(package) |
| 80 |
} |
| 1 |
#' Download Medicare Disparities Data |
|
| 2 |
#' |
|
| 3 |
#' Download data from the Centers for Medicare & Medicaid Services (CMS) |
|
| 4 |
#' \href{https://data.cms.gov/tools/mapping-medicare-disparities-by-population}{Mapping Medicare Disparities by Population} (MMD) tool.
|
|
| 5 |
#' |
|
| 6 |
#' @param measure Name or letter code of the measure to download. |
|
| 7 |
#' @param population The population code; either \code{f} (Medicare Fee For Service) or
|
|
| 8 |
#' \code{m} (Medicare Advantage).
|
|
| 9 |
#' @param year Year(s) to download (e.g., \code{2015:2020}). If not specified, all available
|
|
| 10 |
#' years will be included. |
|
| 11 |
#' @param geography Geography code(s) to include: \code{n} (national), \code{s} (state),
|
|
| 12 |
#' and \code{c} (county). If not specified, all available geographies will be included.
|
|
| 13 |
#' @param adjust,condition,sex,age,race,dual_elig,medicare_elig One or more codes indicating |
|
| 14 |
#' the variable levels to include (see \code{\link{dcf_standardize_cmsmmd}}).
|
|
| 15 |
#' If \code{"."}, values will be across all levels, whereas if \code{NULL}, all available levels
|
|
| 16 |
#' will be included (aggregated and disaggregated). See the Making Requests section. |
|
| 17 |
#' @param refresh_codebook Logical; if \code{TRUE}, will re-download the codebook even if it
|
|
| 18 |
#' exists in the temporary location (which is cleared each R session). |
|
| 19 |
#' @param codebook_only Logical; if \code{TRUE}, will return the codebook without downloading data.
|
|
| 20 |
#' @param row_limit Maximum number of rows to return in each request. The API limit appears to be 100,000. |
|
| 21 |
#' @param out_file Path to the CSV or Parquet file to write data to. |
|
| 22 |
#' @param state The codebook state (MD5 hash) recorded during a previous download; |
|
| 23 |
#' if provided, will only download if the new state does not match. |
|
| 24 |
#' @param parquet Logical; if \code{TRUE}, will convert the downloaded CSV file to Parquet.
|
|
| 25 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages.
|
|
| 26 |
#' @section Making Requests: |
|
| 27 |
#' The API operates over several large files, partitioned by measure, year, |
|
| 28 |
#' adjust, and dual and Medicare eligibility. These are identified with the codebook
|
| 29 |
#' (\code{dcf_download_cmsmmd(codebook_only = TRUE)}).
|
|
| 30 |
#' |
|
| 31 |
#' The files are larger than the API's limit, so requests for each file have to be broken up |
|
| 32 |
#' by the other variables within them (sex, age, race, and condition). |
|
| 33 |
#' |
|
| 34 |
#' For best performance, make requests as big as possible while staying under 100,000 rows |
|
| 35 |
#' each (e.g., by setting \code{sex}, \code{age}, or \code{race} to \code{NULL}).
|
|
| 36 |
#' @returns \code{dcf_download_cmsmmd}: A list:
|
|
| 37 |
#' \itemize{
|
|
| 38 |
#' \item \strong{\code{codebook}}: The codebook.
|
|
| 39 |
#' \item \strong{\code{codebook_state}}: MD5 hash of the codebook.
|
|
| 40 |
#' \item \strong{\code{data}}: The downloaded data.
|
|
| 41 |
#' } |
|
| 42 |
#' @examples |
|
| 43 |
#' # find the codes associated with menu values |
|
| 44 |
#' variable_codes <- dcf_standardize_cmsmmd() |
|
| 45 |
#' variable_codes[c( |
|
| 46 |
#' "sex", "age", "race", |
|
| 47 |
#' "adjust", "dual_elig", "medicare_elig" |
|
| 48 |
#' )] |
|
| 49 |
#' |
|
| 50 |
#' # look at the codebook which defines source files |
|
| 51 |
#' codebook <- dcf_download_cmsmmd(codebook_only = TRUE) |
|
| 52 |
#' codebook |
|
| 53 |
#' |
|
| 54 |
#' \dontrun{
|
|
| 55 |
#' # download data |
|
| 56 |
#' downloaded <- dcf_download_cmsmmd( |
|
| 57 |
#' "preventive care", |
|
| 58 |
#' population = "f", |
|
| 59 |
#' race = ".", |
|
| 60 |
#' sex = ".", |
|
| 61 |
#' age = NULL, |
|
| 62 |
#' condition = c(83, 85, 86, 88, 89, 95, 101, 102, 104, 105:107), |
|
| 63 |
#' adjust = 1 |
|
| 64 |
#' ) |
|
| 65 |
#' |
|
| 66 |
#' # convert codes to levels |
|
| 67 |
#' data_standard <- dcf_standardize_cmsmmd(downloaded$data) |
|
| 68 |
#' } |
|
| 69 |
#' @export |
|
| 70 | ||
| 71 |
dcf_download_cmsmmd <- function( |
|
| 72 |
measure, |
|
| 73 |
population = NULL, |
|
| 74 |
year = NULL, |
|
| 75 |
geography = NULL, |
|
| 76 |
adjust = NULL, |
|
| 77 |
condition = NULL, |
|
| 78 |
sex = c(1:2, "."), |
|
| 79 |
age = c(0:4, "."), |
|
| 80 |
race = c(1:6, "."), |
|
| 81 |
dual_elig = ".", |
|
| 82 |
medicare_elig = ".", |
|
| 83 |
refresh_codebook = FALSE, |
|
| 84 |
codebook_only = FALSE, |
|
| 85 |
row_limit = 9999999, |
|
| 86 |
out_file = NULL, |
|
| 87 |
state = NULL, |
|
| 88 |
parquet = FALSE, |
|
| 89 |
verbose = TRUE |
|
| 90 |
) {
|
|
| 91 |
# load codebook |
|
| 92 | 3x |
codebook_file <- paste0(tempdir(), "/codebook_crosswalk.csv") |
| 93 | 3x |
if (refresh_codebook || !file.exists(codebook_file)) {
|
| 94 | 1x |
if (verbose) cli::cli_progress_step("retrieving codebook")
|
| 95 | 1x |
codebook_req <- curl::curl_fetch_disk( |
| 96 | 1x |
"https://data.cms.gov/mmd-population/assets/codebook_crosswalk.csv", |
| 97 | 1x |
codebook_file |
| 98 |
) |
|
| 99 | 1x |
if (codebook_req$status_code != 200) {
|
| 100 | ! |
unlink(codebook_file) |
| 101 | ! |
cli::cli_abort("failed to retrieve codebook: {codebook_req$status_code}")
|
| 102 |
} |
|
| 103 | 1x |
if (verbose) cli::cli_progress_done() |
| 104 |
} |
|
| 105 | 3x |
new_state <- tools::md5sum(codebook_file) |
| 106 | 3x |
if (!is.null(state)) {
|
| 107 | 1x |
if (state == new_state) {
|
| 108 | 1x |
if (verbose) {
|
| 109 | 1x |
cli::cli_alert_info("codebook has not changed since last download")
|
| 110 |
} |
|
| 111 | 1x |
return(invisible(NULL)) |
| 112 |
} |
|
| 113 |
} |
|
| 114 | 2x |
codebook <- vroom::vroom( |
| 115 | 2x |
codebook_file, |
| 116 | 2x |
col_types = list( |
| 117 | 2x |
elig = "c", |
| 118 | 2x |
race_code = "c", |
| 119 | 2x |
sex_code = "c", |
| 120 | 2x |
adjust = "c", |
| 121 | 2x |
dual = "c" |
| 122 |
), |
|
| 123 | 2x |
na = " " |
| 124 |
) |
|
| 125 | 2x |
codebook$description <- tolower(codebook$description) |
| 126 | ||
| 127 | 1x |
if (codebook_only) return(codebook) |
| 128 | ||
| 129 |
# identify source(s) |
|
| 130 | 1x |
if (missing(measure)) {
|
| 131 | ! |
cli::cli_abort("specify a measure: {.value {unique(codebook$description)}}")
|
| 132 |
} |
|
| 133 | ||
| 134 | 1x |
measure_descriptions <- unique(codebook$description) |
| 135 | 1x |
if (nchar(measure) == 1L) {
|
| 136 | ! |
codebook <- codebook[codebook$measure == tolower(measure), ] |
| 137 |
} else {
|
|
| 138 | 1x |
codebook <- codebook[grepl(measure, codebook$description), ] |
| 139 |
} |
|
| 140 | 1x |
if (!nrow(codebook)) {
|
| 141 | ! |
cli::cli_abort( |
| 142 | ! |
paste( |
| 143 | ! |
'measure "{measure}" does not match the available measures:',
|
| 144 | ! |
"{.value {measure_descriptions}}"
|
| 145 |
) |
|
| 146 |
) |
|
| 147 |
} |
|
| 148 | ||
| 149 | 1x |
if (is.null(population)) population <- codebook$population[[1L]] |
| 150 | 1x |
codebook <- codebook[ |
| 151 | 1x |
filter_codebook(codebook$population, population, "population"), |
| 152 |
] |
|
| 153 | ||
| 154 | 1x |
if (!is.null(year)) {
|
| 155 | 1x |
if (is.numeric(year)) year <- as.character(year) |
| 156 | 1x |
if (any(nchar(year) > 2L)) {
|
| 157 | 1x |
year <- substring( |
| 158 | 1x |
year, |
| 159 | 1x |
nchar(year) - |
| 160 | 1x |
as.integer(as.character(cut( |
| 161 | 1x |
as.numeric(year), |
| 162 | 1x |
c( |
| 163 | 1x |
-Inf, |
| 164 | 1x |
if (population == "m") 2014L else 2019L, |
| 165 | 1x |
Inf |
| 166 |
), |
|
| 167 | 1x |
c(0L, 1L) |
| 168 |
))) |
|
| 169 |
) |
|
| 170 |
} |
|
| 171 | 1x |
codebook <- codebook[ |
| 172 | 1x |
filter_codebook(codebook$year, year, "year"), |
| 173 |
] |
|
| 174 |
} |
|
| 175 | 1x |
if (missing(dual_elig)) dual_elig <- codebook$dual[[1L]] |
| 176 | 1x |
if (!is.null(dual_elig)) {
|
| 177 | 1x |
codebook <- codebook[ |
| 178 | 1x |
filter_codebook(codebook$dual, dual_elig, "dual_elig"), |
| 179 |
] |
|
| 180 |
} |
|
| 181 | 1x |
if (missing(medicare_elig)) medicare_elig <- codebook$elig[[1L]] |
| 182 | 1x |
if (!is.null(medicare_elig)) {
|
| 183 | 1x |
codebook <- codebook[ |
| 184 | 1x |
filter_codebook(codebook$elig, medicare_elig, "medicare_elig"), |
| 185 |
] |
|
| 186 |
} |
|
| 187 | ||
| 188 |
# make requests |
|
| 189 | 1x |
param_sets <- expand.grid( |
| 190 | 1x |
Filter( |
| 191 | 1x |
length, |
| 192 | 1x |
list( |
| 193 | 1x |
fltr = adjust, |
| 194 | 1x |
agecat = age, |
| 195 | 1x |
sexcat = sex, |
| 196 | 1x |
racecat = race, |
| 197 | 1x |
condition = condition, |
| 198 | 1x |
"_source" = unique(codebook$url) |
| 199 |
) |
|
| 200 |
), |
|
| 201 | 1x |
stringsAsFactors = FALSE |
| 202 |
) |
|
| 203 | 1x |
param_sets[param_sets == "." | param_sets == "all"] <- ".|IS NULL" |
| 204 | 1x |
data_url <- paste0( |
| 205 | 1x |
"https://data.cms.gov/data-api/v1/mmd-tool/?_size=", |
| 206 | 1x |
row_limit, |
| 207 |
"&" |
|
| 208 |
) |
|
| 209 | 1x |
n_requests <- nrow(param_sets) |
| 210 | 1x |
all_data <- list() |
| 211 | 1x |
if (verbose) cli::cli_h1("making requests to {data_url}")
|
| 212 | 1x |
for (i in seq_len(n_requests)) {
|
| 213 | 2x |
params <- as.list(param_sets[i, ]) |
| 214 | 2x |
param_string <- paste0(names(params), "=", params, collapse = "&") |
| 215 | 2x |
req <- curl::curl_fetch_memory(utils::URLencode(paste0( |
| 216 | 2x |
data_url, |
| 217 | 2x |
param_string |
| 218 |
))) |
|
| 219 | 2x |
if (req$status_code != 200) {
|
| 220 | ! |
cli::cli_abort( |
| 221 | ! |
"a request failed: ({req$status_code}) {rawToChar(req$content)}"
|
| 222 |
) |
|
| 223 |
} |
|
| 224 | 2x |
all_data[[i]] <- jsonlite::fromJSON(rawToChar(req$content)) |
| 225 | 2x |
if (verbose) |
| 226 | 2x |
cli::cli_progress_step( |
| 227 | 2x |
paste0( |
| 228 | 2x |
i, |
| 229 | 2x |
" of ", |
| 230 | 2x |
n_requests, |
| 231 |
" (",
|
|
| 232 | 2x |
nrow(all_data[[i]]), |
| 233 | 2x |
" rows): ", |
| 234 | 2x |
param_string |
| 235 |
), |
|
| 236 | 2x |
spinner = TRUE |
| 237 |
) |
|
| 238 |
} |
|
| 239 | 1x |
if (verbose) cli::cli_progress_done() |
| 240 | ||
| 241 | 1x |
all_data <- do.call(rbind, all_data) |
| 242 | 1x |
if (!is.null(out_file)) {
|
| 243 | 1x |
dir.create(dirname(out_file), showWarnings = FALSE, recursive = TRUE) |
| 244 | 1x |
if (parquet || grepl(".parquet", out_file, fixed = TRUE)) {
|
| 245 | 1x |
if (verbose) cli::cli_progress_step("writing to Parquet")
|
| 246 | 1x |
arrow::write_parquet( |
| 247 | 1x |
all_data, |
| 248 | 1x |
compression = "gzip", |
| 249 | 1x |
sub(".csv", ".parquet", out_file, fixed = TRUE)
|
| 250 |
) |
|
| 251 |
} else {
|
|
| 252 | ! |
if (verbose) cli::cli_progress_step("writing to CSV")
|
| 253 | ! |
vroom::vroom_write(all_data, out_file, ",") |
| 254 |
} |
|
| 255 |
} |
|
| 256 | 1x |
invisible(list( |
| 257 | 1x |
codebook_state = new_state, |
| 258 | 1x |
codebook = codebook, |
| 259 | 1x |
data = all_data |
| 260 |
)) |
|
| 261 |
} |
|
| 262 | ||
| 263 |
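# flag codebook rows matching the requested values, aborting if none match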
filter_codebook <- function(x, values, column) {
|
|
| 264 | 4x |
su <- x %in% values |
| 265 | 4x |
if (!any(su)) {
|
| 266 | ! |
cli::cli_abort( |
| 267 | ! |
paste( |
| 268 | ! |
'{.arg {column}} does not contain specified values.',
|
| 269 | ! |
"Available values: {.value {unique(x)}}"
|
| 270 |
) |
|
| 271 |
) |
|
| 272 |
} |
|
| 273 | 4x |
su |
| 274 |
} |
|
| 275 | ||
| 276 |
#' @rdname dcf_download_cmsmmd |
|
| 277 |
#' @param raw_data The raw data as downloaded with \code{dcf_download_cmsmmd} to be standardized.
|
|
| 278 |
#' @returns \code{dcf_standardize_cmsmmd}: If \code{raw_data} is \code{NULL}, a list
|
|
| 279 |
#' with an entry for each API parameter, containing named vectors with level codes as names |
|
| 280 |
#' mapping to level values (as they appear in the tool's menus). |
|
| 281 |
#' Otherwise, a version of \code{raw_data} with coded values converted to labels.
|
|
| 282 |
#' @export |
|
| 283 | ||
| 284 |
dcf_standardize_cmsmmd <- function(raw_data = NULL) {
|
|
| 285 |
# extracted from https://data.cms.gov/mmd-population/js/menus.js |
|
| 286 | 1x |
menu <- jsonlite::read_json( |
| 287 | 1x |
system.file( |
| 288 | 1x |
"support_data/cms_mmd_levels.json.gz", |
| 289 | 1x |
package = "dcf" |
| 290 |
) |
|
| 291 |
) |
|
| 292 | 1x |
levels <- lapply( |
| 293 | 1x |
menu, |
| 294 | 1x |
function(e) {
|
| 295 | 18x |
options <- Filter( |
| 296 | 18x |
length, |
| 297 | 18x |
lapply(e$options, function(o) {
|
| 298 | 269x |
if (length(o$val)) {
|
| 299 | 255x |
o$val <- as.character(o$val) |
| 300 | 255x |
o |
| 301 |
} |
|
| 302 |
}) |
|
| 303 |
) |
|
| 304 | 18x |
c( |
| 305 | 18x |
structure( |
| 306 | 18x |
vapply(options, "[[", "", "disp"), |
| 307 | 18x |
names = vapply(options, "[[", "", "val") |
| 308 |
) |
|
| 309 |
) |
|
| 310 |
} |
|
| 311 |
) |
|
| 312 | 1x |
names(levels) <- vapply(menu, "[[", "", "id") |
| 313 | 1x |
levels$geography["n"] <- "National" |
| 314 | 1x |
levels$fltr <- levels$adjust |
| 315 | 1x |
levels$sex <- levels$sexcat <- levels$sex_code |
| 316 | 1x |
levels$race <- levels$racecat <- levels$race_code |
| 317 | 1x |
levels$age <- levels$agecat <- levels$age_group |
| 318 | 1x |
levels$eligcat <- levels$medicare_elig <- levels$eligibility |
| 319 | 1x |
levels$dual_elig <- levels$dual |
| 320 | ! |
if (is.null(raw_data)) return(levels) |
| 321 | 1x |
for (col in colnames(raw_data)) {
|
| 322 | 12x |
col_levels <- levels[[col]] |
| 323 | 12x |
if (!is.null(col_levels)) {
|
| 324 | 10x |
values <- as.character(raw_data[[col]]) |
| 325 | 10x |
values[values == ""] <- "." |
| 326 | 10x |
raw_data[[col]] <- c(col_levels, "." = "All")[values] |
| 327 |
} |
|
| 328 |
} |
|
| 329 | 1x |
raw_data |
| 330 |
} |
| 1 |
#' Read Epic Cosmos Data |
|
| 2 |
#' |
|
| 3 |
#' Read in metadata and data from an Epic Cosmos file. |
|
| 4 |
#' |
|
| 5 |
#' @param path Path to the file. |
|
| 6 |
#' @param path_root Directory containing \code{path}, if it is not full.
|
|
| 7 |
#' @param standard_names A character vector with standard dataset names as names, and |
|
| 8 |
#' fixed patterns to search for in the metadata as values (in lowercase; e.g., |
|
| 9 |
#' \code{c(condition = "condition name")}).
|
|
| 10 |
#' These take precedence over the existing set of standard names, so make sure |
|
| 11 |
#' the pattern is sufficiently specific to the target dataset. |
|
| 12 |
#' @returns A list with \code{data.frame} entries for \code{metadata} and \code{data}.
|
|
| 13 |
#' |
|
| 14 |
#' @examples |
|
| 15 |
#' # write an example file |
|
| 16 |
#' path <- tempfile(fileext = ".csv") |
|
| 17 |
#' raw_lines <- c( |
|
| 18 |
#' "metadata field,metadata value,", |
|
| 19 |
#' ",,", |
|
| 20 |
#' ",Measures,Value Name", |
|
| 21 |
#' "Year,Measure 1,", |
|
| 22 |
#' "2020,m1,1", |
|
| 23 |
#' ",m2,2", |
|
| 24 |
#' "2021,m1,3", |
|
| 25 |
#' ",m2,4" |
|
| 26 |
#' ) |
|
| 27 |
#' writeLines(raw_lines, path) |
|
| 28 |
#' |
|
| 29 |
#' # read it in |
|
| 30 |
#' dcf_read_epic(basename(path), dirname(path)) |
|
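#' |
#' # supply a custom standard-name pattern (illustrative values) |
#' dcf_read_epic( |
#'   basename(path), |
#'   dirname(path), |
#'   standard_names = c(example = "metadata value") |
#' ) |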
| 31 |
#' |
|
| 32 |
#' @export |
|
| 33 | ||
| 34 |
dcf_read_epic <- function(path, path_root = ".", standard_names = NULL) {
|
|
| 35 | 5x |
full_path <- if (file.exists(path)) {
|
| 36 | 5x |
path |
| 37 |
} else {
|
|
| 38 | ! |
sub("//", "/", paste0(path_root, "/", path), fixed = TRUE)
|
| 39 |
} |
|
| 40 | 5x |
lines <- readLines(full_path, n = 25L, skipNul = FALSE) |
| 41 | 5x |
metadata_break <- grep("^[, ]*$", lines)
|
| 42 | 5x |
if (!length(metadata_break)) {
|
| 43 | ! |
cli::cli_abort( |
| 44 | ! |
"path does not appear to point to a file in the Epic format (no metadata separation)" |
| 45 |
) |
|
| 46 |
} |
|
| 47 | 5x |
meta_end <- min(metadata_break) - 1L |
| 48 | 5x |
data_start <- (if (length(metadata_break) == 1L) {
|
| 49 | 3x |
metadata_break |
| 50 |
} else {
|
|
| 51 | 2x |
max(metadata_break[ |
| 52 | 2x |
metadata_break == c(-1L, metadata_break[-1L]) |
| 53 |
]) |
|
| 54 |
}) + |
|
| 55 | 5x |
1L |
| 56 | 5x |
meta <- c( |
| 57 | 5x |
list( |
| 58 | 5x |
file = path, |
| 59 | 5x |
md5 = unname(tools::md5sum(full_path)), |
| 60 | 5x |
date_processed = Sys.time(), |
| 61 | 5x |
standard_name = "" |
| 62 |
), |
|
| 63 | 5x |
as.list(unlist(lapply( |
| 64 | 5x |
strsplit(sub(",+$", "", lines[seq_len(meta_end)]), ",", fixed = TRUE),
|
| 65 | 5x |
function(r) {
|
| 66 | 6x |
l <- list(paste(r[-1L], collapse = ",")) |
| 67 | 6x |
if (l[[1]] == "") {
|
| 68 | ! |
r <- strsplit(r, ": ", fixed = TRUE)[[1L]] |
| 69 | ! |
l <- list(paste(r[-1L], collapse = ",")) |
| 70 |
} |
|
| 71 | 6x |
names(l) <- r[[1L]] |
| 72 | 6x |
l[[1L]] <- gsub('^"|"$', "", l[[1L]])
|
| 73 | 6x |
l |
| 74 |
} |
|
| 75 |
))) |
|
| 76 |
) |
|
| 77 | 5x |
standard_names <- c( |
| 78 | 5x |
tolower(standard_names), |
| 79 | 5x |
vaccine_mmr = "mmr receipt", |
| 80 | 5x |
rsv_tests = "rsv tests", |
| 81 | 5x |
flu = "influenza", |
| 82 | 5x |
self_harm = "self-harm", |
| 83 | 5x |
covid = "covid", |
| 84 | 5x |
rsv = "rsv", |
| 85 | 5x |
obesity = "bmi", |
| 86 | 5x |
obesity = "obesity", |
| 87 | 5x |
hba1c = "hba1c", |
| 88 | 5x |
ed_opioid = "opioid", |
| 89 | 5x |
ed_firearm = "firearm", |
| 90 | 5x |
ed_workplace = "workplace", |
| 91 | 5x |
ed_fall = "diagnoses: fall", |
| 92 | 5x |
ed_drowning = "drowning", |
| 93 | 5x |
all_encounters = "all ed encounters", |
| 94 | 5x |
all_patients = "all patients" |
| 95 |
) |
|
| 96 | 5x |
standard_names <- standard_names[!duplicated(standard_names)] |
| 97 | 5x |
meta_string <- tolower(paste(unlist(meta), collapse = " ")) |
| 98 | 5x |
for (i in seq_along(standard_names)) {
|
| 99 | 24x |
if (grepl(standard_names[[i]], meta_string, fixed = TRUE)) {
|
| 100 | 5x |
meta$standard_name <- names(standard_names)[[i]] |
| 101 | 5x |
break |
| 102 |
} |
|
| 103 |
} |
|
| 104 | 5x |
header_rows <- data_start + c(0L, 1L) |
| 105 | 5x |
lines[header_rows] <- gsub( |
| 106 |
',(?=[^",]+")', |
|
| 107 |
"", |
|
| 108 | 5x |
lines[header_rows], |
| 109 | 5x |
perl = TRUE |
| 110 |
) |
|
| 111 | 5x |
header <- strsplit(lines[header_rows[[2L]]], ",", fixed = TRUE)[[1L]] |
| 112 | 5x |
id_cols <- which(header != "") |
| 113 | 5x |
header <- c( |
| 114 | 5x |
header[id_cols], |
| 115 | 5x |
strsplit(lines[data_start], ",", fixed = TRUE)[[1L]][-id_cols] |
| 116 |
) |
|
| 117 | 5x |
data <- arrow::read_csv_arrow( |
| 118 | 5x |
full_path, |
| 119 | 5x |
col_names = header, |
| 120 | 5x |
col_types = paste(rep("c", length(header)), collapse = ""),
|
| 121 | 5x |
na = c("", "-"),
|
| 122 | 5x |
skip = data_start + 1L |
| 123 |
) |
|
| 124 | 5x |
percents <- grep("^(?:Percent|Base|RSV test)", header, ignore.case = TRUE)
|
| 125 | 5x |
if (length(percents)) {
|
| 126 | ! |
for (col in percents) {
|
| 127 | ! |
data[[col]] <- sub("%", "", data[[col]], fixed = TRUE)
|
| 128 |
} |
|
| 129 |
} |
|
| 130 | 5x |
number <- grep("Number", header, fixed = TRUE)
|
| 131 | 5x |
if (length(number)) {
|
| 132 | ! |
for (col in number) {
|
| 133 | ! |
data[[col]][data[[col]] == "10 or fewer"] <- 5L |
| 134 |
} |
|
| 135 |
} |
|
| 136 | 5x |
for (col in id_cols) {
|
| 137 | 18x |
data[[col]] <- vctrs::vec_fill_missing(data[[col]], "down") |
| 138 |
} |
|
| 139 | 5x |
if (all(c("Measures", "Base Patient") %in% colnames(data))) {
|
| 140 | ! |
data <- Reduce( |
| 141 | ! |
merge, |
| 142 | ! |
lapply(split(data, data$Measures), function(d) {
|
| 143 | ! |
measure <- d$Measures[[1L]] |
| 144 | ! |
d[[measure]] <- d[["Base Patient"]] |
| 145 | ! |
d[, !(colnames(d) %in% c("Measures", "Base Patient"))]
|
| 146 |
}) |
|
| 147 |
) |
|
| 148 |
} |
|
| 149 | 5x |
colnames(data) <- standard_columns(colnames(data)) |
| 150 | 5x |
if (meta$standard_name == "obesity") {
|
| 151 | ! |
meta$standard_name <- paste0( |
| 152 | ! |
meta$standard_name, |
| 153 |
"_", |
|
| 154 | ! |
if ("state" %in% colnames(data)) "state" else "county"
|
| 155 |
) |
|
| 156 |
} else if ( |
|
| 157 | 5x |
meta$standard_name == "all_encounters" && "week" %in% colnames(data) |
| 158 |
) {
|
|
| 159 | ! |
meta$standard_name <- "all_encounters_weekly" |
| 160 |
} |
|
| 161 | 5x |
if ("age" %in% colnames(data)) {
|
| 162 | 5x |
std_age <- standard_age(data$age) |
| 163 | 5x |
missed_ages <- (data$age != "No value") & is.na(std_age) |
| 164 | 5x |
if (any(missed_ages)) {
|
| 165 | ! |
std_age[missed_ages] <- data$age[missed_ages] |
| 166 | ! |
missed_levels <- unique(data$age[missed_ages]) |
| 167 | ! |
cli::cli_warn("missed age levels: {.field {missed_levels}}")
|
| 168 |
} |
|
| 169 | 5x |
data$age <- std_age |
| 170 |
} |
|
| 171 | 5x |
if (!("Year" %in% colnames(data))) {
|
| 172 | 5x |
if (!is.null(meta[["Session Date Range"]])) {
|
| 173 | 1x |
data$Year <- meta[["Session Date Range"]] |
| 174 |
} else {
|
|
| 175 | 4x |
year <- gsub( |
| 176 |
"^_|\\.$", |
|
| 177 |
"", |
|
| 178 | 4x |
regmatches( |
| 179 | 4x |
basename(path), |
| 180 | 4x |
gregexpr("_\\d{4}\\.", basename(path))
|
| 181 | 4x |
)[[1L]] |
| 182 |
) |
|
| 183 | 4x |
if (length(year)) {
|
| 184 | 1x |
data$Year <- year |
| 185 |
} |
|
| 186 |
} |
|
| 187 |
} |
|
| 188 | 5x |
list(metadata = meta, data = data) |
| 189 |
} |
|
| 190 | ||
| 191 |
standard_age <- function(age) {
|
|
| 192 | 5x |
c( |
| 193 | 5x |
`less than 1` = "<1 Years", |
| 194 | 5x |
`1 and < 2` = "1-2 Years", |
| 195 | 5x |
`2 and < 3` = "2-3 Years", |
| 196 | 5x |
`3 and < 4` = "3-4 Years", |
| 197 | 5x |
`1 and < 5` = "1-4 Years", |
| 198 | 5x |
`1 year or more and less than 5` = "1-4 Years", |
| 199 | 5x |
`4 and < 5` = "4-5 Years", |
| 200 | 5x |
`less than 5` = "<5 Years", |
| 201 | 5x |
`5 and < 6` = "5-6 Years", |
| 202 | 5x |
`5 and < 18` = "5-17 Years", |
| 203 | 5x |
`5 or more and less than 18 (1)` = "5-17 Years", |
| 204 | 5x |
`6 and < 7` = "6-7 Years", |
| 205 | 5x |
`6 or more` = "6+ Years", |
| 206 | 5x |
`7 and < 8` = "7-8 Years", |
| 207 | 5x |
`8 and < 9` = "8-9 Years", |
| 208 | 5x |
`9 or more` = "9+ Years", |
| 209 | 5x |
`less than 10` = "<10 Years", |
| 210 | 5x |
`10 and < 15` = "10-14 Years", |
| 211 | 5x |
`less than 15` = "<15 Years", |
| 212 | 5x |
`15 and < 20` = "15-19 Years", |
| 213 | 5x |
`15 and < 25` = "15-25 Years", |
| 214 | 5x |
`less than 18` = "<18 Years", |
| 215 | 5x |
`18 and < 25` = "18-24 Years", |
| 216 | 5x |
`18 and < 40` = "18-39 Years", |
| 217 | 5x |
`18 and < 45` = "18-44 Years", |
| 218 | 5x |
`18 and < 50` = "18-49 Years", |
| 219 | 5x |
`18 or more and less than 50` = "18-49 Years", |
| 220 | 5x |
`20 and < 40` = "20-39 Years", |
| 221 | 5x |
`25 and < 35` = "25-34 Years", |
| 222 | 5x |
`25 and < 45` = "25-45 Years", |
| 223 | 5x |
`35 and < 45` = "35-44 Years", |
| 224 | 5x |
`40 and < 65` = "40-64 Years", |
| 225 | 5x |
`45 and < 55` = "45-54 Years", |
| 226 | 5x |
`45 and < 65` = "45-64 Years", |
| 227 | 5x |
`45-64` = "45-64 Years", |
| 229 | 5x |
`50 and < 65` = "50-64 Years", |
| 230 | 5x |
`50 or more and less than 64` = "50-64 Years", |
| 231 | 5x |
`55 and < 65` = "55-64 Years", |
| 232 | 5x |
`less than 65` = "<65 Years", |
| 233 | 5x |
`65 and < 110` = "65+ Years", |
| 234 | 5x |
`65 or more` = "65+ Years", |
| 235 | 5x |
`65+` = "65+ Years", |
| 236 | 5x |
`total` = "Total" |
| 237 |
)[ |
|
| 238 | 5x |
sub(" [Yy]ears", "", sub("^[^a-z0-9]+|:.*$", "", tolower(age)))
|
| 239 |
] |
|
| 240 |
} |
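# Illustrative use (a minimal sketch; inputs assumed to match the table above):
# standard_age(c("Less than 5 years", "65+", "Total"))
# returns "<5 Years", "65+ Years", and "Total".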
|
| 241 | ||
| 242 |
standard_columns <- function(cols) {
|
|
| 243 | 5x |
cols <- gsub(" ", "_", sub("number of ", "n_", tolower(cols)), fixed = TRUE)
|
| 244 | 5x |
cols[grep("^age", cols)] <- "age"
|
| 245 | 5x |
cols[grep("^state", cols)] <- "state"
|
| 246 | 5x |
cols[grep("^county", cols)] <- "county"
|
| 247 | 5x |
cols[grep("bmi_30", cols, fixed = TRUE)] <- "bmi_30_49.8"
|
| 248 | 5x |
cols[grep("hemoglobin_a1c_7", cols, fixed = TRUE)] <- "hemoglobin_a1c_7"
|
| 249 | 5x |
cols[grep("mmr_receipt", cols, fixed = TRUE)] <- "mmr_receipt"
|
| 250 | 5x |
cols[grep("opioid", cols, fixed = TRUE)] <- "ed_opioid"
|
| 251 | 5x |
cols[grep("^rsv_tests", cols)] <- "rsv_tests"
|
| 252 | 5x |
cols[grep("total:", cols, fixed = TRUE)] <- "total"
|
| 253 | 5x |
cols |
| 254 |
} |
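# Illustrative use (input names assumed):
# standard_columns(c("Age Group", "Number of RSV Tests"))
# returns c("age", "n_rsv_tests").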
| 1 |
#' Download CDC WISQARS Reports |
|
| 2 |
#' |
|
| 3 |
#' Download reports data from the CDC's |
|
| 4 |
#' \href{https://wisqars.cdc.gov/reports}{Web-based Injury Statistics Query and Reporting System}.
|
|
| 5 |
#' |
|
| 6 |
#' @param file File to save the report to (\code{csv} or \code{parquet}).
|
|
| 7 |
#' @param year_start Earliest year to include. |
|
| 8 |
#' @param year_end Latest year to include. |
|
| 9 |
#' @param geography State or region code. |
|
| 10 |
#' @param intent Intent ID or name: |
|
| 11 |
#' \tabular{lll}{
|
|
| 12 |
#' \code{0} \tab \code{all} \tab All \cr
|
|
| 13 |
#' \code{1} \tab \code{unintentional} \tab Unintentional \cr
|
|
| 14 |
#' \code{2} \tab \code{suicide} \tab Suicide \cr
|
|
| 15 |
#' \code{3} \tab \code{homicide} \tab Homicide \cr
|
|
| 16 |
#' \code{4} \tab \code{homicide_legal} \tab Homicide and Legal Intervention \cr
|
|
| 17 |
#' \code{5} \tab \code{undetermined} \tab Undetermined \cr
|
|
| 18 |
#' \code{6} \tab \code{legal} \tab Legal Intervention \cr
|
|
| 19 |
#' \code{8} \tab \code{violence} \tab Violence-related \cr
|
|
| 20 |
#' } |
|
| 21 |
#' @param disposition Patient disposition for nonfatal outcomes: one or multiple of \code{all} (0),
|
|
| 22 |
#' \code{treated} (1; treated and released), \code{transfered} (2), \code{hospitalized} (3), or
|
|
| 23 |
#' \code{observed} (4; observed/left AMA/unknown).
|
|
| 24 |
#' @param mechanism Mechanism code; defaults to \code{20810} (all injury) for fatal outcomes, or \code{3000} for nonfatal.
|
|
| 25 |
#' Other codes appear in the URL in the \code{m} parameter when submitting the filter.
|
|
| 26 |
#' @param group_ages Logical; if \code{TRUE}, will group ages into 5-year bins; if \code{FALSE}, will use the custom \code{age_min} to \code{age_max} range. If \code{NULL} (default), will include all ages, unless \code{age_min} or \code{age_max} is specified.
|
|
| 27 |
#' @param age_min Youngest age to include. |
|
| 28 |
#' @param age_max Oldest age to include. |
|
| 29 |
#' @param sex Sex groups to include: one or multiple of \code{all} (0), \code{male} (1),
|
|
| 30 |
#' \code{female} (2), or \code{unknown} (3).
|
|
| 31 |
#' @param race Race groups to include: one or multiple of \code{all} (0), \code{white} (1),
|
|
| 32 |
#' \code{black} (2), \code{aa} (3; American Indian or Alaska Native), \code{asian} (4),
|
|
| 33 |
#' \code{pi} (5; Hawaiian Native or Pacific Islander), \code{more} (6; more than one race).
|
|
| 34 |
#' These levels apply if \code{race_reporting} is \code{single} (default) -- provide
|
|
| 35 |
#' these by index for other \code{race_reporting} levels.
|
|
| 36 |
#' @param race_reporting How to group race groups, between \code{none} (0), \code{bridge} (1),
|
|
| 37 |
#' \code{single} (2), or \code{aapi} (3).
|
|
| 38 |
#' @param ethnicity Which ethnic groups to include: one or multiple of \code{all} (0),
|
|
| 39 |
#' \code{non_hispanic} (1), \code{hispanic} (2), or \code{unknown} (3).
|
|
| 40 |
#' @param YPLL Age to use when calculating Years of Potential Life Lost. |
|
| 41 |
#' @param metro Region type filter: \code{TRUE} for only metropolitan / urban, or \code{FALSE}
|
|
| 42 |
#' for only non-metropolitan / rural. Will include all region types if \code{NULL} (default).
|
|
| 43 |
#' @param group_by One or more variables to group by. These are uppercased and sometimes |
|
| 44 |
#' abbreviated or encoded; see the \code{r1} through \code{r4} URL parameters.
|
|
| 45 |
#' @param fatal_outcome Logical; if \code{FALSE}, will return non-fatal results.
|
|
| 46 |
#' @param brain_injury_only Logical; if \code{TRUE}, will return only traumatic brain injury results.
|
|
| 47 |
#' @param include_total Logical; if \code{TRUE}, will include a total row.
|
|
| 48 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages.
|
|
| 49 |
#' @returns A list containing the parameters of the request. The returned data are written to \code{file}.
|
|
| 50 |
#' @examples |
|
| 51 |
#' file <- "../../../wisqars.csv.xz" |
|
| 52 |
#' if (file.exists(file)) {
|
|
| 53 |
#' dcf_download_wisqars(file, verbose = FALSE) |
|
| 54 |
#' vroom::vroom(file) |
|
| 55 |
#' } |
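#'
#' # a sketch of a nonfatal request grouped by sex; the "sex" grouping level
#' # is an assumption -- see the r1-r6 URL parameters for valid levels
#' \dontrun{
#' dcf_download_wisqars(
#'   "wisqars_nonfatal.csv",
#'   fatal_outcome = FALSE,
#'   year_start = 2019,
#'   year_end = 2021,
#'   group_by = "sex"
#' )
#' }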
|
| 56 |
#' @export |
|
| 57 | ||
| 58 |
dcf_download_wisqars <- function( |
|
| 59 |
file, |
|
| 60 |
fatal_outcome = TRUE, |
|
| 61 |
brain_injury_only = FALSE, |
|
| 62 |
year_start = 2018, |
|
| 63 |
year_end = year_start, |
|
| 64 |
geography = "00", |
|
| 65 |
intent = "all", |
|
| 66 |
disposition = "all", |
|
| 67 |
mechanism = if (fatal_outcome) 20810 else 3000, |
|
| 68 |
group_ages = NULL, |
|
| 69 |
age_min = 0, |
|
| 70 |
age_max = 199, |
|
| 71 |
sex = "all", |
|
| 72 |
race = "all", |
|
| 73 |
race_reporting = "single", |
|
| 74 |
ethnicity = "all", |
|
| 75 |
YPLL = 65, |
|
| 76 |
metro = NULL, |
|
| 77 |
group_by = NULL, |
|
| 78 |
include_total = FALSE, |
|
| 79 |
verbose = TRUE |
|
| 80 |
) {
|
|
| 81 | 2x |
intents <- list( |
| 82 | 2x |
all = 0, |
| 83 | 2x |
unintentional = 1, |
| 84 | 2x |
violence = 8, |
| 85 | 2x |
homicide_legal = 4, |
| 86 | 2x |
homicide = 3, |
| 87 | 2x |
legal = 6, |
| 88 | 2x |
suicide = 2, |
| 89 | 2x |
undetermined = 5 |
| 90 |
) |
|
| 91 | 2x |
dispositions <- list( |
| 92 | 2x |
all = 0, |
| 93 | 2x |
treated = 1, |
| 94 | 2x |
transfered = 2, |
| 95 | 2x |
hospitalized = 3, |
| 96 | 2x |
observed = 4 |
| 97 |
) |
|
| 98 | 2x |
sexes <- list( |
| 99 | 2x |
all = 0, |
| 100 | 2x |
male = 1, |
| 101 | 2x |
female = 2, |
| 102 | 2x |
unknown = 3 |
| 103 |
) |
|
| 104 | 2x |
races <- list( |
| 105 | 2x |
all = 0, |
| 106 | 2x |
white = 1, |
| 107 | 2x |
black = 2, |
| 108 | 2x |
aa = 3, |
| 109 | 2x |
asian = 4, |
| 110 | 2x |
pi = 5, |
| 111 | 2x |
more = 6 |
| 112 |
) |
|
| 113 | 2x |
race_reportings <- list( |
| 114 | 2x |
none = 0, |
| 115 | 2x |
bridge = 1, |
| 116 | 2x |
single = 2, |
| 117 | 2x |
aapi = 3 |
| 118 |
) |
|
| 119 | 2x |
ethnicities <- list( |
| 120 | 2x |
all = 0, |
| 121 | 2x |
non_hispanic = 1, |
| 122 | 2x |
hispanic = 2, |
| 123 | 2x |
unknown = 3 |
| 124 |
) |
|
| 125 | 2x |
if (missing(group_ages) && (!missing(age_min) || !missing(age_max))) {
|
| 126 | ! |
group_ages <- FALSE |
| 127 |
} |
|
| 128 | 2x |
params <- list( |
| 129 | 2x |
TotalLine = if (include_total) "YES" else "NO", |
| 130 | 2x |
intent = if (is.character(intent)) intents[[tolower(intent)]] else intent, |
| 131 | 2x |
mech = mechanism, |
| 132 | 2x |
sex = paste( |
| 133 | 2x |
vapply(sex, function(l) if (is.character(l)) sexes[[l]] else l, 0), |
| 134 | 2x |
collapse = "," |
| 135 |
), |
|
| 136 | 2x |
race = paste( |
| 137 | 2x |
vapply(race, function(l) if (is.character(l)) races[[l]] else l, 0), |
| 138 | 2x |
collapse = "," |
| 139 |
), |
|
| 140 | 2x |
race_yr = if (is.character(race_reporting)) |
| 141 | 2x |
race_reportings[[race_reporting]] else race_reporting, |
| 142 | 2x |
year1 = year_start, |
| 143 | 2x |
year2 = year_end, |
| 144 | 2x |
agebuttn = if (is.null(group_ages)) "ALL" else if (group_ages) "5Yr" else |
| 145 | 2x |
"custom", |
| 146 | 2x |
fiveyr1 = age_min, |
| 147 | 2x |
fiveyr2 = age_max, |
| 148 | 2x |
c_age1 = age_min, |
| 149 | 2x |
c_age2 = age_max, |
| 150 | 2x |
groupby1 = "NONE", |
| 151 | 2x |
groupby2 = "NONE", |
| 152 | 2x |
groupby3 = "NONE", |
| 153 | 2x |
groupby4 = "NONE", |
| 154 | 2x |
groupby5 = "NONE", |
| 155 | 2x |
groupby6 = "NONE" |
| 156 |
) |
|
| 157 | 2x |
if (fatal_outcome) {
|
| 158 | 1x |
params$state <- geography |
| 159 | 1x |
params$ethnicty <- paste( |
| 160 | 1x |
vapply( |
| 161 | 1x |
ethnicity, |
| 162 | 1x |
function(l) if (is.character(l)) ethnicities[[l]] else l, |
| 163 | 1x |
0 |
| 164 |
), |
|
| 165 | 1x |
collapse = "," |
| 166 |
) |
|
| 167 | 1x |
params$ypllage <- YPLL |
| 168 | 1x |
params$urbrul <- if (is.null(metro)) 0 else if (metro) 1 else 2 |
| 169 | 1x |
params$tbi <- if (brain_injury_only) 1L else 0L |
| 170 |
} else {
|
|
| 171 | 1x |
params$groupby1 <- "NONE1" |
| 172 | 1x |
params$groupby2 <- "NONE2" |
| 173 | 1x |
params$groupby3 <- "NONE3" |
| 174 | 1x |
params$groupby4 <- "NONE4" |
| 175 | 1x |
params$groupby5 <- "NONE5" |
| 176 | 1x |
params$groupby6 <- "NONE6" |
| 177 | 1x |
params$outcome <- "NFI" |
| 178 | 1x |
params$racethn <- 0 |
| 179 | 1x |
params$disp <- paste( |
| 180 | 1x |
vapply( |
| 181 | 1x |
disposition, |
| 182 | 1x |
function(l) if (is.character(l)) dispositions[[l]] else l, |
| 183 | 1x |
0 |
| 184 |
), |
|
| 185 | 1x |
collapse = "," |
| 186 |
) |
|
| 187 |
} |
|
| 188 | 2x |
for (group in seq_along(group_by)) {
|
| 189 | ! |
params[[paste0("groupby", group)]] <- toupper(group_by[[group]])
|
| 190 |
} |
|
| 191 | 2x |
params <- lapply(params, as.character) |
| 192 | 2x |
if (fatal_outcome) {
|
| 193 | 1x |
params$app_id <- 1002 |
| 194 | 1x |
params$component_id <- 1000 |
| 195 |
} |
|
| 196 | ||
| 197 | 2x |
if (verbose) {
|
| 198 | 2x |
url <- paste0( |
| 199 | 2x |
"https://wisqars.cdc.gov/reports/?o=", |
| 200 | 2x |
if (fatal_outcome) "MORT" else "NFI" |
| 201 |
) |
|
| 202 | 2x |
if (!fatal_outcome) {
|
| 203 | 1x |
url <- paste0(url, "&g=00&me=") |
| 204 |
} |
|
| 205 | 2x |
url_param_map <- list( |
| 206 | 2x |
year1 = "y1", |
| 207 | 2x |
year2 = "y2", |
| 208 | 2x |
tbi = "t", |
| 209 | 2x |
disp = "d", |
| 210 | 2x |
state = "g", |
| 211 | 2x |
ethnicty = "e", |
| 212 | 2x |
intent = "i", |
| 213 | 2x |
mech = "m", |
| 214 | 2x |
sex = "s", |
| 215 | 2x |
race = "r", |
| 216 | 2x |
agebuttn = "a", |
| 217 | 2x |
urbrul = "me", |
| 218 | 2x |
race_yr = "ry", |
| 219 | 2x |
ypllage = "yp", |
| 220 | 2x |
fiveyr1 = "g1", |
| 221 | 2x |
fiveyr2 = "g2", |
| 222 | 2x |
c_age1 = "a1", |
| 223 | 2x |
c_age2 = "a2", |
| 224 | 2x |
groupby1 = "r1", |
| 225 | 2x |
groupby2 = "r2", |
| 226 | 2x |
groupby3 = "r3", |
| 227 | 2x |
groupby4 = "r4", |
| 228 | 2x |
groupby5 = "r5", |
| 229 | 2x |
groupby6 = "r6" |
| 230 |
) |
|
| 231 | 2x |
for (k in names(params)) {
|
| 232 | 48x |
url_key <- url_param_map[[k]] |
| 233 | 48x |
if (!is.null(url_key)) {
|
| 234 | 42x |
for (value in params[[k]]) url <- paste0(url, "&", url_key, "=", value) |
| 235 |
} |
|
| 236 |
} |
|
| 237 | 2x |
cli::cli_alert_info("requesting report {.url {url}}")
|
| 238 |
} |
|
| 239 | ||
| 240 | 2x |
handler <- curl::new_handle() |
| 241 | 2x |
curl::handle_setheaders(handler, "Content-Type" = "application/json") |
| 242 | 2x |
curl::handle_setopt( |
| 243 | 2x |
handler, |
| 244 | 2x |
copypostfields = jsonlite::toJSON( |
| 245 | 2x |
list(parameters = params), |
| 246 | 2x |
auto_unbox = TRUE |
| 247 |
) |
|
| 248 |
) |
|
| 249 | 2x |
req <- curl::curl_fetch_memory( |
| 250 | 2x |
paste0( |
| 251 | 2x |
"https://wisqars.cdc.gov/api/cost-", |
| 252 | 2x |
if (fatal_outcome) "fatal" else "nonfatal" |
| 253 |
), |
|
| 254 | 2x |
handle = handler |
| 255 |
) |
|
| 256 | 2x |
if (req$status_code == 200) {
|
| 257 | 2x |
dir.create(dirname(file), showWarnings = FALSE, recursive = TRUE) |
| 258 | 2x |
data <- jsonlite::fromJSON(rawToChar(req$content)) |
| 259 | 2x |
if (!length(data)) {
|
| 260 | ! |
cli::cli_warn("no rows in data, so no file written")
|
| 261 |
} else {
|
|
| 262 | 2x |
if (grepl("parquet", file)) {
|
| 263 | ! |
arrow::write_parquet(data, file, compression = "gzip") |
| 264 |
} else {
|
|
| 265 | 2x |
vroom::vroom_write(data, file, ",") |
| 266 |
} |
|
| 267 |
} |
|
| 268 |
} else {
|
|
| 269 | ! |
cli::cli_abort("request failed: {req$status_code}")
|
| 270 |
} |
|
| 271 | 2x |
invisible(params) |
| 272 |
} |
| 1 |
#' Make a Status Diagram |
|
| 2 |
#' |
|
| 3 |
#' Make a Data Collection Project status diagram. |
|
| 4 |
#' |
|
| 5 |
#' @param project_dir Path to the Data Collection Framework project to be built. |
|
| 6 |
#' @param out_file File name of the file to write within \code{project_dir}.
|
|
| 7 |
#' @returns A character vector of the status diagram, which is also written to |
|
| 8 |
#' \code{out_file} within \code{project_dir} (\code{status.md} by default).
|
|
| 9 |
#' @examples |
|
| 10 |
#' \dontrun{
|
|
| 11 |
#' dcf_status_diagram("project_directory")
|
|
| 12 |
#' } |
|
| 13 |
#' @export |
|
| 14 | ||
| 15 |
dcf_status_diagram <- function(project_dir = ".", out_file = "status.md") {
|
|
| 16 | 3x |
report_file <- paste0(project_dir, "/report.json.gz") |
| 17 | 3x |
if (!file.exists(report_file)) {
|
| 18 | ! |
cli::cli_abort("no report file found")
|
| 19 |
} |
|
| 20 | 3x |
report <- jsonlite::read_json(report_file) |
| 21 | 3x |
data_dir <- if (is.null(report$settings$data_dir)) {
|
| 22 | ! |
"data" |
| 23 |
} else {
|
|
| 24 | 3x |
report$settings$data_dir |
| 25 |
} |
|
| 26 | 3x |
branch <- if (is.null(report$settings$branch)) {
|
| 27 | ! |
"main" |
| 28 |
} else {
|
|
| 29 | 3x |
report$settings$branch |
| 30 |
} |
|
| 31 | 3x |
repo <- if (report$settings$github_account == "") {
|
| 32 | 3x |
NULL |
| 33 |
} else {
|
|
| 34 | ! |
paste0(report$settings$github_account, "/", report$settings$repo_name) |
| 35 |
} |
|
| 36 | 3x |
indent <- " " |
| 37 | 3x |
d <- c( |
| 38 | 3x |
'classDef pass stroke:#66bb6a', |
| 39 | 3x |
'classDef warn stroke:#ffa726', |
| 40 | 3x |
'classDef fail stroke:#f44336' |
| 41 |
) |
|
| 42 | 3x |
sources <- NULL |
| 43 | 3x |
source_ids <- list() |
| 44 | 3x |
file_ids <- NULL |
| 45 | 3x |
relationships <- NULL |
| 46 | 3x |
projects <- NULL |
| 47 | 3x |
node_id <- 0L |
| 48 | 3x |
for (name in names(sort(vapply( |
| 49 | 3x |
report$processes, |
| 50 | 3x |
function(p) !is.null(p$type) && p$type == "bundle", |
| 51 | 3x |
TRUE |
| 52 |
)))) {
|
|
| 53 | 5x |
timing <- report$source_times[[name]] |
| 54 | 5x |
issues <- report$issues[[name]] |
| 55 | 5x |
if (length(issues)) {
|
| 56 | 5x |
names(issues) <- sub( |
| 57 | 5x |
paste0(data_dir, "/"), |
| 58 |
"", |
|
| 59 | 5x |
sub( |
| 60 | 5x |
paste0(project_dir, "/"), |
| 61 |
"", |
|
| 62 | 5x |
sub("^\\.*/", "", names(issues)),
|
| 63 | 5x |
fixed = TRUE |
| 64 |
), |
|
| 65 | 5x |
fixed = TRUE |
| 66 |
) |
|
| 67 |
} |
|
| 68 | 5x |
metas <- report$metadata[grep( |
| 69 | 5x |
paste0("^", name, "/"),
|
| 70 | 5x |
names(report$metadata) |
| 71 |
)] |
|
| 72 | 5x |
measures <- if (length(metas)) metas[[1L]]$measure_info else list() |
| 73 | 5x |
process <- report$processes[[name]] |
| 74 | 5x |
contents <- NULL |
| 75 | 5x |
if (!is.null(process$type) && process$type == "bundle") {
|
| 76 | 2x |
dist_files <- grep( |
| 77 | 2x |
"measure_info", |
| 78 | 2x |
names(process$dist_state), |
| 79 | 2x |
value = TRUE, |
| 80 | 2x |
invert = TRUE |
| 81 |
) |
|
| 82 | 2x |
for (filename in sub( |
| 83 | 2x |
paste0("^[./]*", data_dir, "/", name, "/(?:dist|standard)/"),
|
| 84 |
"", |
|
| 85 | 2x |
dist_files |
| 86 |
)) {
|
|
| 87 | 2x |
node_id <- node_id + 1L |
| 88 | 2x |
contents <- c( |
| 89 | 2x |
contents, |
| 90 | 2x |
paste0( |
| 91 | 2x |
"n", |
| 92 | 2x |
node_id, |
| 93 |
'["`', |
|
| 94 | 2x |
if (is.null(repo)) {
|
| 95 | 2x |
filename |
| 96 |
} else {
|
|
| 97 | ! |
make_link( |
| 98 | ! |
paste0( |
| 99 | ! |
"https://github.com/", |
| 100 | ! |
repo, |
| 101 | ! |
"/blob/", |
| 102 | ! |
branch, |
| 103 |
"/", |
|
| 104 | ! |
data_dir, |
| 105 |
"/", |
|
| 106 | ! |
name, |
| 107 | ! |
"/dist/", |
| 108 | ! |
filename |
| 109 |
), |
|
| 110 | ! |
filename |
| 111 |
) |
|
| 112 |
}, |
|
| 113 |
'`"]' |
|
| 114 |
) |
|
| 115 |
) |
|
| 116 |
} |
|
| 117 | 2x |
file_nodes <- file_ids[ |
| 118 | 2x |
if (!is.null(names(process$source_files))) {
|
| 119 | 2x |
names(process$source_files) |
| 120 |
} else {
|
|
| 121 | ! |
unlist(process$source_files) |
| 122 |
} |
|
| 123 |
] |
|
| 124 | 2x |
file_nodes <- file_nodes[!is.na(file_nodes)] |
| 125 | 2x |
if (length(file_nodes)) {
|
| 126 | 2x |
relationships <- c( |
| 127 | 2x |
relationships, |
| 128 | 2x |
paste0("n", file_nodes, " --> ", name)
|
| 129 |
) |
|
| 130 |
} |
|
| 131 |
} else {
|
|
| 132 | 3x |
measure_sources <- measures[["_sources"]] |
| 133 | 3x |
if (is.null(measure_sources)) {
|
| 134 | 3x |
measure_sources <- measures[["_source"]] |
| 135 |
} |
|
| 136 | 3x |
for (project_meta in metas) {
|
| 137 | 3x |
for (r in project_meta$resources) {
|
| 138 | 3x |
node_id <- node_id + 1L |
| 139 | 3x |
file_path <- paste0( |
| 140 | 3x |
name, |
| 141 | 3x |
"/standard/", |
| 142 | 3x |
r$filename |
| 143 |
) |
|
| 144 | 3x |
file_ids[file_path] <- node_id |
| 145 | 3x |
file_issues <- issues[[file_path]] |
| 146 | 3x |
for (field in r$schema$fields) {
|
| 147 | 9x |
field_source <- measures[[field$name]]$sources |
| 148 | 9x |
if (!is.null(names(field_source))) {
|
| 149 | ! |
field_source <- list(field_source) |
| 150 |
} |
|
| 151 | 9x |
for (s in field_source) {
|
| 152 | ! |
if (is.character(s)) {
|
| 153 | ! |
s <- list(id = s) |
| 154 |
} |
|
| 155 | ! |
if (!is.null(s$id)) {
|
| 156 | ! |
s <- if (is.null(measure_sources[[s$id]])) {
|
| 157 | ! |
c(s, list(name = s$id)) |
| 158 |
} else {
|
|
| 159 | ! |
c(s, measure_sources[[s$id]]) |
| 160 |
} |
|
| 161 |
} |
|
| 162 | ! |
if (is.null(source_ids[[s$name]])) {
|
| 163 | ! |
source_id <- paste0("s", length(source_ids))
|
| 164 | ! |
source_ids[[s$name]] <- source_id |
| 165 | ! |
sources[[source_id]] <- list( |
| 166 | ! |
id = source_id, |
| 167 | ! |
general = make_link(s$url, s$name), |
| 168 | ! |
specific = NULL |
| 169 |
) |
|
| 170 | ! |
parent_id <- source_id |
| 171 |
} else {
|
|
| 172 | ! |
parent_id <- source_ids[[s$name]] |
| 173 |
} |
|
| 174 | ! |
if (is.null(s$location_url)) {
|
| 175 | ! |
source_id <- source_ids[[s$name]] |
| 176 |
} else {
|
|
| 177 | ! |
if (is.null(source_ids[[s$location_url]])) {
|
| 178 | ! |
source_id <- paste0("s", length(source_ids))
|
| 179 | ! |
source_ids[[s$location_url]] <- source_id |
| 180 | ! |
relationships <- unique(c( |
| 181 | ! |
relationships, |
| 182 | ! |
paste0( |
| 183 | ! |
parent_id, |
| 184 |
"---", |
|
| 185 | ! |
source_id, |
| 186 |
'["', |
|
| 187 | ! |
make_link(s$location_url, s$location), |
| 188 |
'"]' |
|
| 189 |
) |
|
| 190 |
)) |
|
| 191 | ! |
sources[[parent_id]]$specific <- c( |
| 192 | ! |
sources[[parent_id]]$specific, |
| 193 | ! |
source_id |
| 194 |
) |
|
| 195 |
} else {
|
|
| 196 | ! |
source_id <- source_ids[[s$location_url]] |
| 197 |
} |
|
| 198 |
} |
|
| 199 | ! |
relationships <- unique(c( |
| 200 | ! |
relationships, |
| 201 | ! |
paste0(source_id, " --> n", node_id) |
| 202 |
)) |
|
| 203 |
} |
|
| 204 |
} |
|
| 205 | 3x |
failed <- is.null(report$source_times[[name]]) |
| 206 | 3x |
contents <- c( |
| 207 | 3x |
contents, |
| 208 | 3x |
paste0( |
| 209 | 3x |
"n", |
| 210 | 3x |
node_id, |
| 211 |
'["`', |
|
| 212 | 3x |
if (is.null(repo)) {
|
| 213 | 3x |
r$filename |
| 214 |
} else {
|
|
| 215 | ! |
make_link( |
| 216 | ! |
paste0( |
| 217 | ! |
"https://github.com/", |
| 218 | ! |
repo, |
| 219 | ! |
"/blob/", |
| 220 | ! |
branch, |
| 221 |
"/", |
|
| 222 | ! |
data_dir, |
| 223 |
"/", |
|
| 224 | ! |
name, |
| 225 | ! |
"/standard/", |
| 226 | ! |
r$filename |
| 227 |
), |
|
| 228 | ! |
r$filename |
| 229 |
) |
|
| 230 |
}, |
|
| 231 | 3x |
if (length(file_issues)) {
|
| 232 | ! |
paste0("<br/><br/>", make_list(unlist(file_issues)))
|
| 233 |
}, |
|
| 234 | 3x |
if (failed) {
|
| 235 | ! |
paste0( |
| 236 | ! |
if (length(file_issues)) "<br />" else "<br /><br />", |
| 237 | ! |
"Script Failed:<br />", |
| 238 | ! |
gsub( |
| 239 |
'[`"]', |
|
| 240 |
"'", |
|
| 241 | ! |
paste(report$logs[[name]], collapse = "<br />") |
| 242 |
) |
|
| 243 |
) |
|
| 244 |
}, |
|
| 245 | 3x |
paste0( |
| 246 |
'`"]:::', |
|
| 247 | 3x |
if (failed) {
|
| 248 | ! |
"fail" |
| 249 | 3x |
} else if (length(file_issues)) {
|
| 250 | ! |
"warn" |
| 251 |
} else {
|
|
| 252 | 3x |
"pass" |
| 253 |
} |
|
| 254 |
) |
|
| 255 |
) |
|
| 256 |
) |
|
| 257 |
} |
|
| 258 |
} |
|
| 259 |
} |
|
| 260 | 5x |
projects <- c( |
| 261 | 5x |
projects, |
| 262 | 5x |
c( |
| 263 | 5x |
paste0( |
| 264 | 5x |
"subgraph ", |
| 265 | 5x |
name, |
| 266 |
'["`', |
|
| 267 | 5x |
if (is.null(repo)) {
|
| 268 | 5x |
name |
| 269 |
} else {
|
|
| 270 | ! |
make_link( |
| 271 | ! |
paste0( |
| 272 | ! |
"https://github.com/", |
| 273 | ! |
repo, |
| 274 | ! |
"/tree/", |
| 275 | ! |
branch, |
| 276 |
"/", |
|
| 277 | ! |
data_dir, |
| 278 |
"/", |
|
| 279 | ! |
name |
| 280 |
), |
|
| 281 | ! |
name |
| 282 |
) |
|
| 283 |
}, |
|
| 284 |
'`"]' |
|
| 285 |
), |
|
| 286 | 5x |
paste0(indent, c("direction LR", contents)),
|
| 287 | 5x |
"end" |
| 288 |
) |
|
| 289 |
) |
|
| 290 |
} |
|
| 291 | 3x |
out <- c( |
| 292 | 3x |
"```mermaid", |
| 293 | 3x |
"flowchart LR", |
| 294 | 3x |
paste0( |
| 295 | 3x |
indent, |
| 296 | 3x |
c( |
| 297 | 3x |
d, |
| 298 | 3x |
vapply( |
| 299 | 3x |
sources, |
| 300 | 3x |
function(s) paste(c(s$id, '(("', s$general, '"))'), collapse = ""),
|
| 301 |
"" |
|
| 302 |
), |
|
| 303 | 3x |
projects, |
| 304 | 3x |
relationships |
| 305 |
) |
|
| 306 |
), |
|
| 307 |
"```" |
|
| 308 |
) |
|
| 309 | 3x |
if (is.character(out_file) && out_file != "") {
|
| 310 | 3x |
writeLines(out, paste0(project_dir, "/", out_file)) |
| 311 |
} |
|
| 312 | 3x |
invisible(out) |
| 313 |
} |
|
| 314 | ||
| 315 |
make_link <- function(url, name = NULL) {
|
|
| 316 | ! |
paste0( |
| 317 | ! |
'<strong><a href="', |
| 318 | ! |
url, |
| 319 | ! |
'" target="_blank" rel="noreferrer">', |
| 320 | ! |
if (is.null(name)) sub("https?://(?:www\\.)?", "", url) else name,
|
| 321 | ! |
"</a></strong>" |
| 322 |
) |
|
| 323 |
} |
|
| 324 | ||
| 325 |
make_list <- function(items) {
|
|
| 326 | ! |
paste0( |
| 327 | ! |
"<ul>", |
| 328 | ! |
paste( |
| 329 | ! |
vapply( |
| 330 | ! |
items, |
| 331 | ! |
function(i) paste0("<li><code>", i, "</code></li>"),
|
| 332 |
"" |
|
| 333 |
), |
|
| 334 | ! |
collapse = "" |
| 335 |
), |
|
| 336 | ! |
"</ul>" |
| 337 |
) |
|
| 338 |
} |
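# Illustrative outputs:
# make_link("https://example.com", "CDC") returns
# '<strong><a href="https://example.com" target="_blank" rel="noreferrer">CDC</a></strong>'
# make_list(c("issue 1", "issue 2")) returns
# "<ul><li><code>issue 1</code></li><li><code>issue 2</code></li></ul>"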
| 1 |
#' Interact with a Source Process File |
|
| 2 |
#' |
|
| 3 |
#' Read or update the current source process file. |
|
| 4 |
#' |
|
| 5 |
#' @param path Path to the process JSON file. |
|
| 6 |
#' @param updated An updated version of the process definition. If specified, will
|
| 7 |
#' write this as the new process file, rather than reading any existing file. |
|
| 8 |
#' @returns The process definition of the source project. |
|
| 9 |
#' @examples |
|
| 10 |
#' epic_process_file <- "../../data/epic/process.json" |
|
| 11 |
#' if (file.exists(epic_process_file)) {
|
|
| 12 |
#' dcf_process_record(path = epic_process_file) |
|
| 13 |
#' } |
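#'
#' # a sketch of an update, assuming the default script structure
#' # written by dcf_add_source:
#' \dontrun{
#' process <- dcf_process_record(path = epic_process_file)
#' process$scripts[[1]]$manual <- TRUE
#' dcf_process_record(path = epic_process_file, updated = process)
#' }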
|
| 14 |
#' @export |
|
| 15 | ||
| 16 |
dcf_process_record <- function(path = "process.json", updated = NULL) {
|
|
| 17 | 33x |
if (is.null(updated)) {
|
| 18 | 20x |
if (!file.exists(path)) {
|
| 19 | ! |
cli::cli_abort("process file {path} does not exist")
|
| 20 |
} |
|
| 21 | 20x |
jsonlite::read_json(path) |
| 22 |
} else {
|
|
| 23 | ! |
if (is.null(updated$type)) updated$type <- "source" |
| 24 | 13x |
jsonlite::write_json(updated, path, auto_unbox = TRUE, pretty = TRUE) |
| 25 | 13x |
updated |
| 26 |
} |
|
| 27 |
} |
| 1 |
#' Initialize a Data Collection Project |
|
| 2 |
#' |
|
| 3 |
#' Establishes a new data collection framework project. |
|
| 4 |
#' |
|
| 5 |
#' @param name Name of the source. Defaults to the current directory name. |
|
| 6 |
#' @param base_dir Path to the parent of the project directory (where the \code{name}
|
|
| 7 |
#' directory should be created). If \code{name} is not specified, will treat the current
|
|
| 8 |
#' directory as \code{name}, and \code{".."} as \code{base_dir}.
|
|
| 9 |
#' @param data_dir Name of the directory to store projects in, relative to \code{base_dir}.
|
|
| 10 |
#' @param github_account Name of the GitHub account that will host the repository. |
|
| 11 |
#' @param branch Name of the repository's branch. |
|
| 12 |
#' @param repo_name Name of the repository. |
|
| 13 |
#' @param use_git Logical; if \code{TRUE}, will initialize a git repository.
|
|
| 14 |
#' @param open_after Logical; if \code{TRUE}, will open the project in a new RStudio instance.
|
|
| 15 |
#' @returns Nothing; creates default files and directories. |
|
| 16 |
#' @section Data Collection Project: |
|
| 17 |
#' |
|
| 18 |
#' A data collection project starts with a \code{settings.json} file, which
|
|
| 19 |
#' specifies where source and bundle projects live (a \code{data} subdirectory by default).
|
|
| 20 |
#' |
|
| 21 |
#' The bulk of the project will then be in the source and bundle projects, as created |
|
| 22 |
#' by the \code{\link{dcf_add_source}} and \code{\link{dcf_add_bundle}} functions.
|
|
| 23 |
#' |
|
| 24 |
#' Once these sub-projects are in place, they can be operated over by the |
|
| 25 |
#' \code{\link{dcf_build}} function, which processes each sub-project using
|
|
| 26 |
#' \code{\link{dcf_process}}, and checks them with \code{\link{dcf_check}},
|
|
| 27 |
#' resulting in a report. |
|
| 28 |
#' |
|
| 29 |
#' @examples |
|
| 30 |
#' base_dir <- tempdir() |
|
| 31 |
#' dcf_init("project_name", base_dir)
|
|
| 32 |
#' list.files(paste0(base_dir, "/project_name")) |
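#'
#' # a sketch of the fuller workflow (assumes a source is then added):
#' \dontrun{
#' dcf_add_source("source_name", paste0(base_dir, "/project_name"))
#' dcf_build(paste0(base_dir, "/project_name"))
#' }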
|
| 33 |
#' |
|
| 34 |
#' @export |
|
| 35 | ||
| 36 |
dcf_init <- function( |
|
| 37 |
name, |
|
| 38 |
base_dir = ".", |
|
| 39 |
data_dir = "data", |
|
| 40 |
github_account = "", |
|
| 41 |
branch = "main", |
|
| 42 |
repo_name = name, |
|
| 43 |
use_git = TRUE, |
|
| 44 |
open_after = FALSE |
|
| 45 |
) {
|
|
| 46 | 1x |
if (missing(name)) {
|
| 47 | ! |
name <- basename(getwd()) |
| 48 | ! |
base_dir <- ".." |
| 49 |
} else {
|
|
| 50 | 1x |
name <- gsub("[^A-Za-z0-9]+", "_", name)
|
| 51 |
} |
|
| 52 | 1x |
base_path <- paste0(base_dir, "/", name, "/") |
| 53 | 1x |
dir.create(base_path, showWarnings = FALSE, recursive = TRUE) |
| 54 | 1x |
paths <- paste0( |
| 55 | 1x |
base_path, |
| 56 | 1x |
c( |
| 57 | 1x |
"project.Rproj", |
| 58 | 1x |
"settings.json", |
| 59 | 1x |
"README.md", |
| 60 | 1x |
"scripts/build.R", |
| 61 | 1x |
".github/workflows/build.yaml", |
| 62 | 1x |
".gitignore" |
| 63 |
) |
|
| 64 |
) |
|
| 65 |
if ( |
|
| 66 | 1x |
!file.exists(paths[[1L]]) && !length(list.files(base_path, "\\.Rproj$")) |
| 67 |
) {
|
|
| 68 | 1x |
writeLines("Version: 1.0\n", paths[[1L]])
|
| 69 |
} |
|
| 70 | 1x |
if (!file.exists(paths[[2L]])) {
|
| 71 | 1x |
jsonlite::write_json( |
| 72 | 1x |
list( |
| 73 | 1x |
name = name, |
| 74 | 1x |
data_dir = data_dir, |
| 75 | 1x |
github_account = github_account, |
| 76 | 1x |
branch = "main", |
| 77 | 1x |
repo_name = repo_name |
| 78 |
), |
|
| 79 | 1x |
paths[[2L]], |
| 80 | 1x |
auto_unbox = TRUE, |
| 81 | 1x |
pretty = TRUE |
| 82 |
) |
|
| 83 |
} |
|
| 84 | 1x |
if (!file.exists(paths[[3L]])) {
|
| 85 | 1x |
writeLines( |
| 86 | 1x |
paste0( |
| 87 | 1x |
c( |
| 88 | 1x |
paste("#", name),
|
| 89 |
"", |
|
| 90 | 1x |
"This is a Data Collection Framework project, initialized with `dcf::dcf_init`.", |
| 91 |
"", |
|
| 92 | 1x |
"You can us the `dcf` package to check the source projects:", |
| 93 |
"", |
|
| 94 | 1x |
"```R", |
| 95 | 1x |
'dcf::dcf_check_source()', |
|
| 96 |
"```", |
|
| 97 |
"", |
|
| 98 | 1x |
"And process them:", |
| 99 |
"", |
|
| 100 | 1x |
"```R", |
| 101 | 1x |
'dcf::dcf_process()', |
|
| 102 |
"```" |
|
| 103 |
), |
|
| 104 | 1x |
collapse = "\n" |
| 105 |
), |
|
| 106 | 1x |
paths[[3L]] |
| 107 |
) |
|
| 108 |
} |
|
| 109 | 1x |
if (!file.exists(paths[[4L]])) {
|
| 110 | 1x |
dir.create(dirname(paths[[4L]]), showWarnings = FALSE) |
| 111 | 1x |
writeLines( |
| 112 | 1x |
paste(c("library(dcf)", "dcf_build()"), collapse = "\n"),
|
| 113 | 1x |
paths[[4L]] |
| 114 |
) |
|
| 115 |
} |
|
| 116 | 1x |
if (!file.exists(paths[[5L]])) {
|
| 117 | 1x |
dir.create(dirname(paths[[5L]]), recursive = TRUE, showWarnings = FALSE) |
| 118 | 1x |
file.copy(system.file("workflows/build.yaml", package = "dcf"), paths[[5L]])
|
| 119 |
} |
|
| 120 | 1x |
if (!file.exists(paths[[6L]])) {
|
| 121 | 1x |
writeLines( |
| 122 | 1x |
paste( |
| 123 | 1x |
c( |
| 124 | 1x |
"*.Rproj", |
| 125 | 1x |
".Rproj.user", |
| 126 | 1x |
"*.Rprofile", |
| 127 | 1x |
"*.Rhistory", |
| 128 | 1x |
"*.Rdata", |
| 129 | 1x |
".DS_Store", |
| 130 | 1x |
"renv" |
| 131 |
), |
|
| 132 | 1x |
collapse = "\n" |
| 133 |
), |
|
| 134 | 1x |
paths[[6L]] |
| 135 |
) |
|
| 136 |
} |
|
| 137 | ! |
if (open_after) rstudioapi::openProject(paths[[1L]], newSession = TRUE) |
| 138 |
} |
| 1 |
#' Run a Project's Build Process |
|
| 2 |
#' |
|
| 3 |
#' Build a Data Collection Framework project, |
|
| 4 |
#' which involves processing and checking all data projects. |
|
| 5 |
#' |
|
| 6 |
#' @param project_dir Path to the Data Collection Framework project to be built. |
|
| 7 |
#' @param is_auto Logical; if \code{FALSE}, will run \code{\link{dcf_process}} as if it were run
|
|
| 8 |
#' manually. |
|
| 9 |
#' @param ... Passes arguments to \code{\link{dcf_process}}.
|
|
| 10 |
#' @param make_diagram Logical; if \code{FALSE}, will not make a \code{status.md} diagram.
|
|
| 11 |
#' @param make_file_log Logical; if \code{FALSE}, will not make a \code{file_log.json} output.
|
|
| 12 |
#' @returns A version of the project report, which is also written to |
|
| 13 |
#' \code{project_dir/report.json.gz}.
|
|
| 14 |
#' @examples |
|
| 15 |
#' project_file <- "../../../pophive" |
|
| 16 |
#' if (file.exists(project_file)) {
|
|
| 17 |
#' report <- dcf_build(project_file) |
|
| 18 |
#' } |
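#'
#' # a sketch: rebuild without the status diagram or file log
#' \dontrun{
#' report <- dcf_build(project_file, make_diagram = FALSE, make_file_log = FALSE)
#' }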
|
| 19 |
#' @export |
|
| 20 | ||
| 21 |
dcf_build <- function( |
|
| 22 |
project_dir = ".", |
|
| 23 |
is_auto = TRUE, |
|
| 24 |
..., |
|
| 25 |
make_diagram = TRUE, |
|
| 26 |
make_file_log = TRUE |
|
| 27 |
) {
|
|
| 28 | 2x |
settings <- dcf_read_settings(project_dir) |
| 29 | 2x |
data_dir <- paste0(project_dir, "/", settings$data_dir) |
| 30 | 2x |
processes <- list.files( |
| 31 | 2x |
data_dir, |
| 32 | 2x |
"process\\.json", |
| 33 | 2x |
recursive = TRUE, |
| 34 | 2x |
full.names = TRUE |
| 35 |
) |
|
| 36 | 2x |
process_state <- tools::md5sum(processes) |
| 37 | 2x |
process <- dcf_process(project_dir = project_dir, is_auto = is_auto, ...) |
| 38 | 2x |
issues <- dcf_check(project_dir = project_dir) |
| 39 | 2x |
report_file <- paste0(project_dir, "/report.json.gz") |
| 40 |
if ( |
|
| 41 | 2x |
!identical( |
| 42 | 2x |
process_state, |
| 43 | 2x |
tools::md5sum(list.files( |
| 44 | 2x |
data_dir, |
| 45 | 2x |
"process\\.json", |
| 46 | 2x |
recursive = TRUE, |
| 47 | 2x |
full.names = TRUE |
| 48 |
)) |
|
| 49 |
) |
|
| 50 |
) {
|
|
| 51 | 2x |
datapackages <- list.files( |
| 52 | 2x |
data_dir, |
| 53 | 2x |
"datapackage\\.json", |
| 54 | 2x |
recursive = TRUE, |
| 55 | 2x |
full.names = TRUE |
| 56 |
) |
|
| 57 | 2x |
names(datapackages) <- dirname(sub( |
| 58 |
"^/", |
|
| 59 |
"", |
|
| 60 | 2x |
sub(data_dir, "", datapackages, fixed = TRUE) |
| 61 |
)) |
|
| 62 | 2x |
names(processes) <- dirname(sub( |
| 63 |
"^/", |
|
| 64 |
"", |
|
| 65 | 2x |
sub(data_dir, "", processes, fixed = TRUE) |
| 66 |
)) |
|
| 67 | 2x |
report <- list( |
| 68 | 2x |
date = Sys.time(), |
| 69 | 2x |
settings = settings, |
| 70 | 2x |
source_times = process$timings, |
| 71 | 2x |
logs = process$logs, |
| 72 | 2x |
issues = issues, |
| 73 | 2x |
metadata = lapply(datapackages, jsonlite::read_json), |
| 74 | 2x |
processes = lapply(processes, jsonlite::read_json) |
| 75 |
) |
|
| 76 | 2x |
jsonlite::write_json( |
| 77 | 2x |
report, |
| 78 | 2x |
gzfile(report_file), |
| 79 | 2x |
auto_unbox = TRUE, |
| 80 | 2x |
dataframe = "columns" |
| 81 |
) |
|
| 82 |
} else {
|
|
| 83 | ! |
report <- jsonlite::read_json(report_file) |
| 84 |
} |
|
| 85 | 2x |
if (make_file_log) {
|
| 86 | 2x |
file_log <- list() |
| 87 | 2x |
for (file_dir in names(report$metadata)) {
|
| 88 | 3x |
if (grepl("/dist", file_dir, fixed = TRUE)) {
|
| 89 | 1x |
p <- report$metadata[[file_dir]] |
| 90 | 1x |
for (p_file in p$resources) {
|
| 91 | 1x |
file_log[[paste0( |
| 92 | 1x |
settings$data_dir, |
| 93 |
"/", |
|
| 94 | 1x |
file_dir, |
| 95 |
"/", |
|
| 96 | 1x |
p_file$filename |
| 97 | 1x |
)]] <- list( |
| 98 | 1x |
updated = if (length(p_file$vintage)) p_file$vintage else |
| 99 | 1x |
p_file$last_modified, |
| 100 | 1x |
md5 = p_file$md5 |
| 101 |
) |
|
| 102 |
} |
|
| 103 |
} |
|
| 104 |
} |
|
| 105 | 2x |
jsonlite::write_json( |
| 106 | 2x |
file_log, |
| 107 | 2x |
paste0(project_dir, "/file_log.json"), |
| 108 | 2x |
auto_unbox = TRUE |
| 109 |
) |
|
| 110 |
} |
|
| 111 | 2x |
if (make_diagram) {
|
| 112 | 2x |
dcf_status_diagram(project_dir) |
| 113 |
} |
|
| 114 | 2x |
invisible(report) |
| 115 |
} |
| 1 |
#' Download Data from the CDC |
|
| 2 |
#' |
|
| 3 |
#' Download data and metadata from the Centers for Disease Control and Prevention (CDC). |
|
| 4 |
#' |
|
| 5 |
#' @param id ID of the resource (e.g., \code{ijqb-a7ye}).
|
|
| 6 |
#' @param out_dir Directory in which to save the metadata and data files. |
|
| 7 |
#' @param state The state ID of a previous download; if provided, will only download if the |
|
| 8 |
#' new state does not match. |
|
| 9 |
#' @param parquet Logical; if \code{TRUE}, will convert the downloaded CSV file to Parquet.
|
|
| 10 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages.
|
|
| 11 |
#' @returns The state ID of the downloaded files; |
|
| 12 |
#' downloads files (\code{<id>.json} and \code{<id>.csv.xz} or \code{<id>.parquet}) to \code{out_dir}.
|
|
| 13 |
#' @section \code{data.cdc.gov} URLs:
|
|
| 14 |
#' |
|
| 15 |
#' For each resource ID, there are 3 relevant CDC URLs: |
|
| 16 |
#' \itemize{
|
|
| 17 |
#' \item \strong{\code{resource/<id>}}: This redirects to the resource's main page,
|
|
| 18 |
#' with displayed metadata and a data preview |
|
| 19 |
#' (e.g., \href{https://data.cdc.gov/resource/ijqb-a7ye}{data.cdc.gov/resource/ijqb-a7ye}).
|
|
| 20 |
#' \item \strong{\code{api/views/<id>}}: This is a direct link to the underlying
|
|
| 21 |
#' JSON metadata (e.g., \href{https://data.cdc.gov/api/views/ijqb-a7ye}{data.cdc.gov/api/views/ijqb-a7ye}).
|
|
| 22 |
#' \item \strong{\code{api/views/<id>/rows.csv}}: This is a direct link to the full
|
|
| 23 |
#' CSV dataset (e.g., \href{https://data.cdc.gov/api/views/ijqb-a7ye/rows.csv}{data.cdc.gov/api/views/ijqb-a7ye/rows.csv}).
|
|
| 24 |
#' } |
|
| 25 |
#' |
|
| 26 |
#' @examples |
|
| 27 |
#' \dontrun{
|
|
| 28 |
#' dcf_download_cdc("ijqb-a7ye")
|
|
| 29 |
#' } |
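#'
#' # a sketch: pass the returned state back in to skip unchanged data
#' \dontrun{
#' state <- dcf_download_cdc("ijqb-a7ye", "raw", parquet = TRUE)
#' state <- dcf_download_cdc("ijqb-a7ye", "raw", state = state, parquet = TRUE)
#' }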
|
| 30 |
#' @export |
|
| 31 | ||
| 32 |
dcf_download_cdc <- function( |
|
| 33 |
id, |
|
| 34 |
out_dir = "raw", |
|
| 35 |
state = NULL, |
|
| 36 |
parquet = FALSE, |
|
| 37 |
verbose = TRUE |
|
| 38 |
) {
|
|
| 39 | 2x |
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE) |
| 40 | 2x |
if (verbose) {
|
| 41 | 2x |
resource_url <- paste0("https://data.cdc.gov/resource/", id)
|
| 42 | 2x |
cli::cli_h1( |
| 43 | 2x |
"downloading resource {.url {resource_url}}"
|
| 44 |
) |
|
| 45 |
} |
|
| 46 | 2x |
url <- paste0("https://data.cdc.gov/api/views/", id)
|
| 47 | 2x |
initial_timeout <- options(timeout = 99999L)$timeout |
| 48 | 2x |
on.exit(options(timeout = initial_timeout)) |
| 49 | 2x |
if (verbose) {
|
| 50 | 2x |
cli::cli_progress_step("metadata: {.url {url}}")
|
| 51 |
} |
|
| 52 | 2x |
metadata_file <- paste0(tempdir(), "/", id, ".json") |
| 53 | 2x |
status <- utils::download.file(url, metadata_file, quiet = TRUE) |
| 54 | 2x |
if (status != 0L) {
|
| 55 | ! |
cli::cli_abort("failed to download metadata")
|
| 56 |
} |
|
| 57 | 2x |
metadata <- jsonlite::read_json(metadata_file) |
| 58 | 2x |
new_state <- if (is.null(metadata$rowsUpdatedAt)) {
|
| 59 | ! |
as.list(tools::md5sum(metadata_file)) |
| 60 |
} else {
|
|
| 61 | 2x |
metadata$rowsUpdatedAt |
| 62 |
} |
|
| 63 | 2x |
if (!identical(new_state, state)) {
|
| 64 | 2x |
file.rename(metadata_file, paste0(out_dir, "/", id, ".json")) |
| 65 | 2x |
data_url <- paste0(url, "/rows.csv") |
| 66 | 2x |
out_path <- paste0(out_dir, "/", id, ".csv") |
| 67 | 2x |
if (verbose) {
|
| 68 | 2x |
cli::cli_progress_step("data: {.url {data_url}}")
|
| 69 |
} |
|
| 70 | 2x |
status <- utils::download.file(data_url, out_path, quiet = TRUE) |
| 71 | 2x |
if (status != 0L) {
|
| 72 | ! |
cli::cli_abort("failed to download data")
|
| 73 |
} |
|
| 74 | 2x |
if (parquet) {
|
| 75 | 1x |
if (verbose) {
|
| 76 | 1x |
cli::cli_progress_step("writing to Parquet")
|
| 77 |
} |
|
| 78 | 1x |
data <- arrow::read_csv_arrow(out_path) |
| 79 | 1x |
arrow::write_parquet( |
| 80 | 1x |
data, |
| 81 | 1x |
compression = "gzip", |
| 82 | 1x |
sub(".csv", ".parquet", out_path, fixed = TRUE)
|
| 83 |
) |
|
| 84 | 1x |
unlink(out_path) |
| 85 |
} else {
|
|
| 86 | 1x |
if (verbose) {
|
| 87 | 1x |
cli::cli_progress_step("compressing data")
|
| 88 |
} |
|
| 89 | 1x |
unlink(paste0(out_path, ".xz")) |
| 90 | 1x |
status <- system2("xz", c("-f", out_path))
|
| 91 | 1x |
if (status != 0L) {
|
| 92 | ! |
cli::cli_abort("failed to compress data")
|
| 93 |
} |
|
| 94 |
} |
|
| 95 | 2x |
if (verbose) {
|
| 96 | 2x |
cli::cli_progress_done() |
| 97 |
} |
|
| 98 | 2x |
invisible(new_state) |
| 99 |
} else {
|
|
| 100 | ! |
unlink(metadata_file) |
| 101 | ! |
invisible(state) |
| 102 |
} |
|
| 103 |
} |
| 1 |
#' Adds a Source Project |
|
| 2 |
#' |
|
| 3 |
#' Establishes a new data source project, used to collect and prepare data from a new source. |
|
| 4 |
#' |
|
| 5 |
#' @param name Name of the source. |
|
| 6 |
#' @param project_dir Path to the Data Collection Framework project. |
|
| 7 |
#' @param open_after Logical; if \code{FALSE}, will not open the project.
|
|
| 8 |
#' @returns Nothing; creates default files and directories. |
|
| 9 |
#' @section Project: |
|
| 10 |
#' |
|
| 11 |
#' Within a source project, there are two files to edit:
|
| 12 |
#' \itemize{
|
|
| 13 |
#' \item \strong{\code{ingest.R}}: This is the primary script, which is automatically rerun.
|
|
| 14 |
#' It should store raw data and resources in \code{raw/} where possible,
|
|
| 15 |
#' then use what's in \code{raw/} to produce standard-format files in \code{standard/}.
|
|
| 16 |
#' This file is sourced from its location during processing, so any system paths |
|
| 17 |
#' must be relative to that location.
|
| 18 |
#' \item \strong{\code{measure_info.json}}: This is where you can record information
|
|
| 19 |
#' about the variables included in the standardized data files. |
|
| 20 |
#' See \code{\link{dcf_measure_info}}.
|
|
| 21 |
#' } |
|
| 22 |
#' |
|
| 23 |
#' @examples |
|
| 24 |
#' project_dir <- paste0(tempdir(), "/temp_project") |
|
| 25 |
#' dcf_init("temp_project", dirname(project_dir))
|
|
| 26 |
#' dcf_add_source("source_name", project_dir)
|
|
| 27 |
#' list.files(paste0(project_dir, "/data/source_name")) |
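#'
#' # the new source project can then be processed (a sketch):
#' \dontrun{
#' dcf_process("source_name", project_dir)
#' }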
|
| 28 |
#' |
|
| 29 |
#' @export |
|
| 30 | ||
| 31 |
dcf_add_source <- function( |
|
| 32 |
name, |
|
| 33 |
project_dir = ".", |
|
| 34 |
open_after = interactive() |
|
| 35 |
) {
|
|
| 36 | 5x |
if (missing(name)) {
|
| 37 | ! |
cli::cli_abort("specify a name")
|
| 38 |
} |
|
| 39 | 5x |
name <- gsub("[^A-Za-z0-9]+", "_", name)
|
| 40 | 5x |
settings <- dcf_read_settings(project_dir) |
| 41 | 5x |
base_dir <- paste0(project_dir, "/", settings$data_dir) |
| 42 | 5x |
base_path <- paste0(base_dir, "/", name, "/") |
| 43 | 5x |
dir.create(paste0(base_path, "raw"), showWarnings = FALSE, recursive = TRUE) |
| 44 | 5x |
dir.create(paste0(base_path, "standard"), showWarnings = FALSE) |
| 45 | 5x |
paths <- paste0( |
| 46 | 5x |
base_path, |
| 47 | 5x |
c( |
| 48 | 5x |
"measure_info.json", |
| 49 | 5x |
"ingest.R", |
| 50 | 5x |
"project.Rproj", |
| 51 | 5x |
"standard/datapackage.json", |
| 52 | 5x |
"process.json", |
| 53 | 5x |
"README.md" |
| 54 |
) |
|
| 55 |
) |
|
| 56 | 5x |
if (!file.exists(paths[[1L]])) {
|
| 57 | 1x |
dcf_measure_info( |
| 58 | 1x |
paths[[1L]], |
| 59 | 1x |
example_variable = list(), |
| 60 | 1x |
verbose = FALSE, |
| 61 | 1x |
open_after = FALSE |
| 62 |
) |
|
| 63 |
} |
|
| 64 | 5x |
if (!file.exists(paths[[2L]])) {
|
| 65 | 1x |
writeLines( |
| 66 | 1x |
paste0( |
| 67 | 1x |
c( |
| 68 |
"#", |
|
| 69 | 1x |
"# Download", |
| 70 |
"#", |
|
| 71 |
"", |
|
| 72 | 1x |
"# add files to the `raw` directory", |
| 73 |
"", |
|
| 74 |
"#", |
|
| 75 | 1x |
"# Reformat", |
| 76 |
"#", |
|
| 77 |
"", |
|
| 78 | 1x |
"# read from the `raw` directory, and write to the `standard` directory", |
| 79 |
"" |
|
| 80 |
), |
|
| 81 | 1x |
collapse = "\n" |
| 82 |
), |
|
| 83 | 1x |
paths[[2L]] |
| 84 |
) |
|
| 85 |
} |
|
| 86 | 5x |
if (!file.exists(paths[[3L]])) {
|
| 87 | 1x |
writeLines("Version: 1.0\n", paths[[3L]])
|
| 88 |
} |
|
| 89 | 5x |
if (!file.exists(paths[[4L]])) {
|
| 90 | 1x |
dcf_datapackage_init( |
| 91 | 1x |
name, |
| 92 | 1x |
dir = paste0(base_path, "standard"), |
| 93 | 1x |
quiet = TRUE |
| 94 |
) |
|
| 95 |
} |
|
| 96 | ||
| 97 | 5x |
if (!file.exists(paths[[5L]])) {
|
| 98 | 1x |
dcf_process_record( |
| 99 | 1x |
paths[[5L]], |
| 100 | 1x |
list( |
| 101 | 1x |
name = name, |
| 102 | 1x |
type = "source", |
| 103 | 1x |
scripts = list( |
| 104 | 1x |
list( |
| 105 | 1x |
path = "ingest.R", |
| 106 | 1x |
manual = FALSE, |
| 107 | 1x |
frequency = 0L, |
| 108 | 1x |
last_run = "", |
| 109 | 1x |
run_time = "", |
| 110 | 1x |
last_status = list(log = "", success = TRUE) |
| 111 |
) |
|
| 112 |
), |
|
| 113 | 1x |
checked = "", |
| 114 | 1x |
check_results = list() |
| 115 |
) |
|
| 116 |
) |
|
| 117 |
} |
|
| 118 | 5x |
if (!file.exists(paths[[6L]])) {
|
| 119 | 1x |
writeLines( |
| 120 | 1x |
paste0( |
| 121 | 1x |
c( |
| 122 | 1x |
paste("#", name),
|
| 123 |
"", |
|
| 124 | 1x |
"This is a dcf data source project, initialized with `dcf::dcf_add_source`.", |
| 125 |
"", |
|
| 126 | 1x |
"You can us the `dcf` package to check the project:", |
| 127 |
"", |
|
| 128 | 1x |
"```R", |
| 129 | 1x |
paste0('dcf_check_source("', name, '", "..")'),
|
| 130 |
"```", |
|
| 131 |
"", |
|
| 132 | 1x |
"And process it:", |
| 133 |
"", |
|
| 134 | 1x |
"```R", |
| 135 | 1x |
paste0('dcf_process("', name, '", "..")'),
|
| 136 |
"```" |
|
| 137 |
), |
|
| 138 | 1x |
collapse = "\n" |
| 139 |
), |
|
| 140 | 1x |
paths[[6L]] |
| 141 |
) |
|
| 142 |
} |
|
| 143 | ! |
if (open_after) rstudioapi::openProject(paths[[3L]], newSession = TRUE) |
| 144 |
} |
| 1 |
dcf_read_settings <- function(project_dir = ".") {
|
|
| 2 | 13x |
settings_file <- paste0(project_dir, "/settings.json") |
| 3 | 13x |
if (!file.exists(settings_file)) {
|
| 4 | ! |
cli::cli_abort( |
| 5 | ! |
"{.arg project_dir} ({project_dir}) does not appear to be a Data Collection Framework project"
|
| 6 |
) |
|
| 7 |
} |
|
| 8 | 13x |
jsonlite::read_json(settings_file) |
| 9 |
} |
| 1 |
#' Download Census Population Data |
|
| 2 |
#' |
|
| 3 |
#' Download American Community Survey population data from the U.S. Census Bureau. |
|
| 4 |
#' |
|
| 5 |
#' @param year Data year. |
|
| 6 |
#' @param out_dir Directory to download the file to. |
|
| 7 |
#' @param state_only Logical; if \code{TRUE}, will only load state data.
|
|
| 8 |
#' Will still download county data. |
|
| 9 |
#' @param age_groups A list mapping lower-level age groups to high-level ones |
|
| 10 |
#' (e.g., \code{list(`<10 Years` = c("Under 5 years", "5 to 9 years"))}).
|
|
| 11 |
#' Or the name of a standard mapping (\code{"7"} or \code{"9"}).
|
|
| 12 |
#' If \code{FALSE}, will return the lowest-level age groups.
|
|
| 13 |
#' @param overwrite Logical; if \code{TRUE}, will re-download and overwrite existing data.
|
|
| 14 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages.
|
|
| 15 |
#' @returns A \code{data.frame} including \code{GEOID} and \code{region_name}
|
|
| 16 |
#' for states and counties, along with their population, in total and within |
|
| 17 |
#' age brackets. |
|
| 18 |
#' @examples |
|
| 19 |
#' if (file.exists("../../../pophive/census_population_2021.csv.xz")) {
|
|
| 20 |
#' dcf_load_census(2021, "../../../pophive")[1:10, ] |
|
| 21 |
#' } |
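#'
#' # a sketch of custom age groups, built from the fine ACS levels:
#' \dontrun{
#' dcf_load_census(2021, age_groups = list(
#'   `<18 Years` = c(
#'     "Under 5 years", "5 to 9 years", "10 to 14 years", "15 to 17 years"
#'   )
#' ))
#' }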
|
| 22 |
#' @export |
|
| 23 | ||
| 24 |
dcf_load_census <- function( |
|
| 25 |
year = 2021, |
|
| 26 |
out_dir = NULL, |
|
| 27 |
state_only = FALSE, |
|
| 28 |
age_groups = "9", |
|
| 29 |
overwrite = FALSE, |
|
| 30 |
verbose = TRUE |
|
| 31 |
) {
|
|
| 32 | 2x |
out_file <- paste0(out_dir, "/census_population_", year, ".csv.xz") |
| 33 | 2x |
write_out <- !is.null(out_dir) |
| 34 | 2x |
age_group_sets <- list( |
| 35 | 2x |
"7" = list( |
| 36 | 2x |
`<10 Years` = c("Under 5 years", "5 to 9 years"),
|
| 37 | 2x |
`10-14 Years` = "10 to 14 years", |
| 38 | 2x |
`15-19 Years` = c("15 to 17 years", "18 and 19 years"),
|
| 39 | 2x |
`20-39 Years` = c( |
| 40 | 2x |
"20 years", |
| 41 | 2x |
"21 years", |
| 42 | 2x |
"22 to 24 years", |
| 43 | 2x |
"25 to 29 years", |
| 44 | 2x |
"30 to 34 years", |
| 45 | 2x |
"35 to 39 years" |
| 46 |
), |
|
| 47 | 2x |
`40-64 Years` = c( |
| 48 | 2x |
"40 to 44 years", |
| 49 | 2x |
"45 to 49 years", |
| 50 | 2x |
"50 to 54 years", |
| 51 | 2x |
"55 to 59 years", |
| 52 | 2x |
"60 and 61 years", |
| 53 | 2x |
"62 to 64 years" |
| 54 |
), |
|
| 55 | 2x |
`65+ Years` = c( |
| 56 | 2x |
"65 and 66 years", |
| 57 | 2x |
"67 to 69 years", |
| 58 | 2x |
"70 to 74 years", |
| 59 | 2x |
"75 to 79 years", |
| 60 | 2x |
"80 to 84 years", |
| 61 | 2x |
"85 years and over" |
| 62 |
) |
|
| 63 |
), |
|
| 64 | 2x |
"9" = list( |
| 65 | 2x |
`<10 Years` = c("Under 5 years", "5 to 9 years"),
|
| 66 | 2x |
`10-18 Years` = c("10 to 14 years", "15 to 17 years"),
|
| 67 | 2x |
`18-24 Years` = c( |
| 68 | 2x |
"18 and 19 years", |
| 69 | 2x |
"20 years", |
| 70 | 2x |
"21 years", |
| 71 | 2x |
"22 to 24 years" |
| 72 |
), |
|
| 73 | 2x |
`25-34 Years` = c("25 to 29 years", "30 to 34 years"),
|
| 74 | 2x |
`35-44 Years` = c("35 to 39 years", "40 to 44 years"),
|
| 75 | 2x |
`45-54 Years` = c("45 to 49 years", "50 to 54 years"),
|
| 76 | 2x |
`55-64 Years` = c( |
| 77 | 2x |
"55 to 59 years", |
| 78 | 2x |
"60 and 61 years", |
| 79 | 2x |
"62 to 64 years" |
| 80 |
), |
|
| 81 | 2x |
`65+ Years` = c( |
| 82 | 2x |
"65 and 66 years", |
| 83 | 2x |
"67 to 69 years", |
| 84 | 2x |
"70 to 74 years", |
| 85 | 2x |
"75 to 79 years", |
| 86 | 2x |
"80 to 84 years", |
| 87 | 2x |
"85 years and over" |
| 88 |
) |
|
| 89 |
) |
|
| 90 |
) |
|
| 91 | 2x |
age_levels <- unlist(age_group_sets[[1L]]) |
| 92 | 2x |
if (!overwrite && write_out && file.exists(out_file)) {
|
| 93 | 1x |
if (verbose) {
|
| 94 | 1x |
cli::cli_progress_step("reading in existing file")
|
| 95 |
} |
|
| 96 | 1x |
pop <- as.data.frame(vroom::vroom( |
| 97 | 1x |
out_file, |
| 98 | 1x |
delim = ",", |
| 99 | 1x |
col_types = list(GEOID = "c", region_name = "c"), |
| 100 | 1x |
n_max = if (state_only) 52L else Inf |
| 101 |
)) |
|
| 102 |
} else {
|
|
| 103 |
# GEOID to region name mapping |
|
| 104 | 1x |
id_url <- "https://www2.census.gov/geo/docs/reference/codes2020/national_" |
| 105 | 1x |
if (verbose) {
|
| 106 | 1x |
cli::cli_progress_step("downloading state IDs map")
|
| 107 |
} |
|
| 108 | 1x |
state_ids <- vroom::vroom( |
| 109 | 1x |
paste0(id_url, "state2020.txt"), |
| 110 | 1x |
delim = "|", |
| 111 | 1x |
col_types = list( |
| 112 | 1x |
STATE = "c", |
| 113 | 1x |
STATEFP = "c", |
| 114 | 1x |
STATENS = "c", |
| 115 | 1x |
STATE_NAME = "c" |
| 116 |
) |
|
| 117 |
) |
|
| 118 | 1x |
if (verbose) {
|
| 119 | 1x |
cli::cli_progress_step("downloading county IDs map")
|
| 120 |
} |
|
| 121 | 1x |
county_ids <- vroom::vroom( |
| 122 | 1x |
paste0(id_url, "county2020.txt"), |
| 123 | 1x |
delim = "|", |
| 124 | 1x |
col_types = list( |
| 125 | 1x |
STATE = "c", |
| 126 | 1x |
STATEFP = "c", |
| 127 | 1x |
COUNTYFP = "c", |
| 128 | 1x |
COUNTYNS = "c", |
| 129 | 1x |
COUNTYNAME = "c", |
| 130 | 1x |
CLASSFP = "c", |
| 131 | 1x |
FUNCSTAT = "c" |
| 132 |
) |
|
| 133 |
) |
|
| 134 | 1x |
region_name <- structure( |
| 135 | 1x |
sub( |
| 136 | 1x |
" County", |
| 137 |
"", |
|
| 138 | 1x |
c( |
| 139 | 1x |
state_ids$STATE_NAME, |
| 140 | 1x |
paste0(county_ids$COUNTYNAME, ", ", county_ids$STATE) |
| 141 |
), |
|
| 142 | 1x |
fixed = TRUE |
| 143 |
), |
|
| 144 | 1x |
names = c( |
| 145 | 1x |
state_ids$STATEFP, |
| 146 | 1x |
paste0(county_ids$STATEFP, county_ids$COUNTYFP) |
| 147 |
) |
|
| 148 |
) |
|
| 149 | ||
| 150 |
# population data |
|
| 151 | ||
| 152 |
## age group labels from IDs |
|
| 153 | 1x |
if (verbose) {
|
| 154 | 1x |
cli::cli_progress_step("downloading ACS variable lables")
|
| 155 |
} |
|
| 156 | 1x |
labels <- vroom::vroom( |
| 157 | 1x |
paste0( |
| 158 | 1x |
"https://www2.census.gov/programs-surveys/acs/summary_file/", |
| 159 | 1x |
min(2021L, year), |
| 160 | 1x |
"/sequence-based-SF/documentation/user_tools/ACS_5yr_Seq_Table_Number_Lookup.txt" |
| 161 |
), |
|
| 162 | 1x |
delim = ",", |
| 163 | 1x |
col_types = list( |
| 164 | 1x |
`File ID` = "c", |
| 165 | 1x |
`Table ID` = "c", |
| 166 | 1x |
`Sequence Number` = "c", |
| 167 | 1x |
`Line Number` = "d", |
| 168 | 1x |
`Start Position` = "i", |
| 169 | 1x |
`Total Cells in Table` = "c", |
| 170 | 1x |
`Total Cells in Sequence` = "i", |
| 171 | 1x |
`Table Title` = "c", |
| 172 | 1x |
`Subject Area` = "c" |
| 173 |
) |
|
| 174 |
) |
|
| 175 | 1x |
variable_labels <- structure( |
| 176 | 1x |
labels$`Table Title`, |
| 177 | 1x |
names = paste0( |
| 178 | 1x |
labels$`Table ID`, |
| 179 | 1x |
"_E", |
| 180 | 1x |
formatC(labels$`Line Number`, width = 3L, flag = 0L) |
| 181 |
) |
|
| 182 |
) |
|
| 183 | ||
| 184 |
## age group counts |
|
| 185 | 1x |
url <- paste0( |
| 186 | 1x |
"https://www2.census.gov/programs-surveys/acs/summary_file/", |
| 187 | 1x |
year, |
| 188 | 1x |
"/table-based-SF/data/5YRData/acsdt5y", |
| 189 | 1x |
year, |
| 190 | 1x |
"-b01001.dat" |
| 191 |
) |
|
| 192 | 1x |
if (verbose) {
|
| 193 | 1x |
cli::cli_progress_step("downloading population data")
|
| 194 |
} |
|
| 195 | 1x |
data <- vroom::vroom(url, delim = "|", col_types = list(GEO_ID = "c")) |
| 196 | 1x |
data <- data[ |
| 197 | 1x |
grep("0[45]00000US", data$GEO_ID),
|
| 198 | 1x |
grep("E", colnames(data), fixed = TRUE)
|
| 199 |
] |
|
| 200 | 1x |
colnames(data)[-1L] <- variable_labels[colnames(data)[-1L]] |
| 201 | 1x |
if (verbose) {
|
| 202 | 1x |
cli::cli_progress_step("agregating across sex and fine age groups")
|
| 203 |
} |
|
| 204 | 1x |
pop <- cbind( |
| 205 | 1x |
data.frame( |
| 206 | 1x |
GEOID = substring(data$GEO_ID, 10L), |
| 207 | 1x |
region_name = "", |
| 208 | 1x |
Total = data[["Total:"]] |
| 209 |
), |
|
| 210 | 1x |
do.call( |
| 211 | 1x |
cbind, |
| 212 | 1x |
lapply( |
| 213 | 1x |
structure(age_levels, names = age_levels), |
| 214 | 1x |
function(l) rowSums(data[, colnames(data) == l]) |
| 215 |
) |
|
| 216 |
) |
|
| 217 |
) |
|
| 218 | 1x |
pop$region_name <- region_name[pop$GEOID] |
| 219 | 1x |
states <- pop[1L:52L, ] |
| 220 | 1x |
health_regions <- as.data.frame(do.call( |
| 221 | 1x |
rbind, |
| 222 | 1x |
lapply( |
| 223 | 1x |
split( |
| 224 | 1x |
states[, -(1L:2L)], |
| 225 | 1x |
dcf_to_health_region(states$GEOID, "hhs_") |
| 226 |
), |
|
| 227 | 1x |
colSums |
| 228 |
) |
|
| 229 |
)) |
|
| 230 | 1x |
health_regions$GEOID <- rownames(health_regions) |
| 231 | 1x |
health_regions$region_name <- sub( |
| 232 | 1x |
"hhs_", |
| 233 | 1x |
"Health Region ", |
| 234 | 1x |
rownames(health_regions), |
| 235 | 1x |
fixed = TRUE |
| 236 |
) |
|
| 237 | 1x |
pop <- rbind(pop, health_regions[, colnames(pop)]) |
| 238 | ||
| 239 | 1x |
if (write_out) {
|
| 240 | 1x |
if (verbose) {
|
| 241 | 1x |
cli::cli_progress_step("writing output")
|
| 242 |
} |
|
| 243 | 1x |
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) |
| 244 | 1x |
vroom::vroom_write(pop, out_file, ",") |
| 245 |
} |
|
| 246 |
} |
|
| 247 | ! |
if (is.numeric(age_groups)) age_groups <- as.character(age_groups) |
| 248 | 2x |
if (is.character(age_groups) && (age_groups %in% names(age_group_sets))) |
| 249 | 1x |
age_groups <- age_group_sets[[as.character(age_groups)]] |
| 250 | 2x |
if (is.list(age_groups)) {
|
| 251 | 1x |
pop <- cbind( |
| 252 | 1x |
pop[, !(colnames(pop) %in% age_levels)], |
| 253 | 1x |
do.call( |
| 254 | 1x |
cbind, |
| 255 | 1x |
lapply( |
| 256 | 1x |
age_groups, |
| 257 | 1x |
function(l) rowSums(pop[, colnames(pop) %in% l, drop = FALSE]) |
| 258 |
) |
|
| 259 |
) |
|
| 260 |
) |
|
| 261 |
} |
|
| 262 | 2x |
invisible(if (state_only) pop[1L:52L, ] else pop) |
| 263 |
} |
| 1 |
#' Update renv.lock |
|
| 2 |
#' |
|
| 3 |
#' Updates the \code{renv.lock} file with dependencies found in project scripts.
|
|
| 4 |
#' |
|
| 5 |
#' @param project_dir Directory of the Data Collection project. |
|
| 6 |
#' @param refresh Logical; if \code{FALSE}, will update an existing
|
|
| 7 |
#' \code{renv.lock} file, rather than recreating it.
|
|
| 8 |
#' @returns Nothing; writes an \code{renv.lock} file.
|
|
| 9 |
#' @examples |
|
| 10 |
#' \dontrun{
|
|
| 11 |
#' dcf_update_lock() |
|
| 12 |
#' } |
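#' \dontrun{
#' # a sketch: update an existing lockfile instead of recreating it
#' dcf_update_lock(refresh = FALSE)
#' }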
|
| 13 |
#' @export |
|
| 14 | ||
| 15 |
dcf_update_lock <- function( |
|
| 16 |
project_dir = ".", |
|
| 17 |
refresh = TRUE |
|
| 18 |
) {
|
|
| 19 | 1x |
settings <- dcf_read_settings(project_dir) |
| 20 | 1x |
extra <- unique( |
| 21 | 1x |
renv::dependencies(list.files( |
| 22 | 1x |
paste0(project_dir, "/", settings$data_dir), |
| 23 | 1x |
"\\.[Rr]$", |
| 24 | 1x |
recursive = TRUE, |
| 25 | 1x |
full.names = TRUE |
| 26 | 1x |
))$Package |
| 27 |
) |
|
| 28 | 1x |
not_installed <- !(extra %in% rownames(utils::installed.packages())) |
| 29 | ! |
if (any(not_installed)) utils::install.packages(extra[not_installed]) |
| 30 | 1x |
if (refresh) unlink(paste0(project_dir, "/renv.lock")) |
| 31 | 1x |
renv::snapshot(packages = extra, lockfile = paste0(project_dir, "/renv.lock")) |
| 32 |
} |