| 1 |
#' View Project Variables
#'
#' Get a list of measures (a codebook) that are included in a data collection project.
#'
#' @param project Path to a local project, or the GitHub account and repository name
#' (\code{"{account_name}/{repo_name}"}) of a remote project.
#' Or a report as returned from \code{\link{dcf_report}}.
#' @param exclude A character vector of variable names to exclude from the list (usually ID columns).
#' @param ... Additional arguments passed to \code{\link{dcf_report}}.
#' @returns A tibble containing variables:
#' \tabular{ll}{
#'   \code{name} \tab Name of the variable, as it appears in the data file. \cr
#'   \code{type} \tab The value's storage type. \cr
#'   \code{n} \tab Number of non-missing observations within the file. \cr
#'   \code{duplicates} \tab Number of duplicated values within the file. \cr
#'   \code{missing} \tab Number of missing values within the file. \cr
#'   \code{project_type} \tab The project type, between \code{source} and \code{bundle}. \cr
#'   \code{data_format} \tab The orientation of the data, between \code{wide} and \code{tall}. \cr
#'   \code{file} \tab The file containing the variable; a path relative to the project root. \cr
#'   \code{short_name} \tab Short name, if included in measure info. \cr
#'   \code{long_name} \tab Long name, if included in measure info. \cr
#'   \code{short_description} \tab Short description, if included in measure info. \cr
#'   \code{long_description} \tab Long description, if included in measure info. \cr
#'   \code{measure_type} \tab Higher-level description of type than storage type
#'     (e.g., \code{count} versus \code{integer}), if included in measure info. \cr
#'   \code{unit} \tab How a single value should be interpreted
#'     (e.g., \code{per 100k people} for a rate per 100k people), if included in measure info. \cr
#'   \code{time_resolution} \tab The measure's collection frequency, if included in measure info. \cr
#'   \code{category} \tab The measure's category, if included in measure info. \cr
#'   \code{subcategory} \tab The measure's subcategory, if included in measure info. \cr
#' }
#' @family data user interface functions
#' @examples
#' dcf_variables("dissc-yale/pophive_demo")
#' @export
dcf_variables <- function(
  project = ".",
  exclude = c("geography", "time", "age"),
  ...
) {
  # Accept either an already-computed report (a list) or anything
  # dcf_report() accepts (a path or "account/repo" string).
  report <- if (is.list(project)) project else dcf_report(project, ...)
  data_dir <- report$settings$data_dir
  # One row per variable (wide resources) or per level (tall resources),
  # across every project output and resource in the report.
  dplyr::as_tibble(do.call(
    rbind,
    lapply(names(report$metadata), function(project_output) {
      datapackage <- report$metadata[[project_output]]
      measure_info <- datapackage$measure_info
      do.call(
        rbind,
        lapply(datapackage$resources, function(resource) {
          file <- paste(data_dir, project_output, resource$filename, sep = "/")
          # files under a dist/ directory belong to bundle projects
          project_type <- if (grepl("/dist/", file, fixed = TRUE)) "bundle" else
            "source"
          n_rows <- resource$row_count
          data_format <- if (is.null(resource$data_format)) "wide" else
            resource$data_format
          do.call(
            rbind,
            Filter(
              length,
              if (data_format == "tall") {
                # tall data: one codebook row per declared level of each field;
                # note the `exclude` argument is not applied to level rows
                lapply(resource$schema$fields, function(field) {
                  if ("levels" %in% names(field$info)) {
                    do.call(
                      rbind,
                      lapply(
                        field$info$levels,
                        function(level) {
                          # per-level observation count; absent means 0
                          n <- field$table[[level$id]]
                          level_row(
                            level,
                            if (is.null(n)) 0L else n,
                            file,
                            project_type
                          )
                        }
                      )
                    )
                  }
                })
              } else {
                # wide data: one codebook row per field, skipping excluded IDs
                lapply(resource$schema$fields, function(field) {
                  if (length(exclude) && field$name %in% exclude) return(NULL)
                  info <- field$info
                  # some field infos are nested one level deeper
                  if ("info" %in% names(info)) info <- info$info
                  # fall back to the package-level measure info when the
                  # field itself carries none
                  if (is.null(info) && !is.null(measure_info)) {
                    info <- measure_info[[field$name]]
                  }
                  cbind(
                    data.frame(
                      name = field$name,
                      type = field$type,
                      n = n_rows - field$missing,
                      duplicates = field$duplicates,
                      missing = field$missing,
                      project_type = project_type,
                      data_format = "wide",
                      file = file
                    ),
                    unpack_info(info)
                  )
                })
              }
            )
          )
        })
      )
    })
  ))
}
|
| 112 | ||
| 113 |
# Flatten a measure-info list into a one-row data frame of the standard
# codebook fields, filling any absent field with NA. `name` and
# `description` act as fallbacks for `short_name` / `short_description`
# when the short variants are not present.
unpack_info <- function(info) {
  field_names <- c(
    "short_name",
    "long_name",
    "short_description",
    "long_description",
    "measure_type",
    "unit",
    "time_resolution",
    "category",
    "subcategory"
  )
  # start from a single all-NA row covering every standard field
  unpacked <- as.data.frame(
    as.list(structure(
      rep(NA_character_, length(field_names)),
      names = field_names
    ))
  )
  present <- names(info)
  if (("name" %in% present) && !("short_name" %in% present)) {
    info$short_name <- info$name
  }
  if (("description" %in% present) && !("short_description" %in% present)) {
    info$short_description <- info$description
  }
  # re-read names in case the fallbacks above added entries
  present <- names(info)
  for (field in field_names[field_names %in% present]) {
    unpacked[[field]] <- info[[field]]
  }
  unpacked
}
|
| 140 | ||
| 141 |
# Build one codebook row for a single level of a tall-format variable.
# `duplicates` and `missing` are not tracked per level, so both are NA,
# and `data_format` is always "tall".
level_row <- function(level, n, file, project_type) {
  base_cols <- data.frame(
    name = level$id,
    type = level$type,
    n = n,
    duplicates = NA,
    missing = NA,
    project_type = project_type,
    data_format = "tall",
    file = file
  )
  # append the standard measure-info columns for this level
  cbind(base_cols, unpack_info(level$info))
}
| 1 |
#' Process Epic Staging Files
#'
#' Process Epic staging files, lightly standardizing them and moving them to raw.
#'
#' @param staging_dir Directory containing the staging files.
#' @param out_dir Directory to write new raw files to.
#' @param verbose Logical; if \code{FALSE}, will not show status messages.
#' @param cleanup Logical; if \code{FALSE}, will not remove staging files after being processed.
#' @param ... Passes additional arguments to \code{\link{dcf_read_epic}}.
#' @returns \code{NULL} if no staging files are found.
#' Otherwise, a list with entries for \code{data} and \code{metadata}.
#' Each of these are lists with entries for each recognized standard name,
#' with potentially combined outputs similar to \code{\link{dcf_read_epic}}.
#'
#' @examples
#' \dontrun{
#' # run from a source project
#' dcf_process_epic_staging()
#' }
#'
#' @export
dcf_process_epic_staging <- function(
  staging_dir = "raw/staging",
  out_dir = "raw",
  verbose = TRUE,
  cleanup = TRUE,
  ...
) {
  files <- sort(list.files(
    staging_dir,
    "csv",
    full.names = TRUE,
    recursive = TRUE
  ))
  # census files are excluded from this standardization pass
  files <- files[!grepl("census", files)]
  if (!length(files)) {
    if (verbose) {
      cli::cli_progress_message("no staging files found")
    }
    return(NULL)
  }
  id_cols <- c("state", "county", "age", "year", "month", "week")
  metadata <- list()
  data <- list()
  for (file in files) {
    if (verbose) {
      cli::cli_progress_step("processing file {.file {file}}", spinner = TRUE)
    }
    # a file that fails to read is skipped rather than aborting the batch
    epic <- tryCatch(dcf_read_epic(file, ...), error = function(e) NULL)
    if (is.null(epic)) {
      if (verbose) {
        cli::cli_progress_done(result = "failed")
      }
      next
    }
    if (epic$metadata$standard_name == "") {
      if (verbose) {
        cli::cli_warn("failed to identify standard type for {.file {file}}")
        cli::cli_progress_done(result = "failed")
      }
      next
    }
    name <- epic$metadata$standard_name
    # prepend so the most recently processed file's metadata comes first
    metadata[[name]] <- c(list(epic$metadata), metadata[[name]])
    file_id_cols <- id_cols[id_cols %in% colnames(epic$data)]
    # drop rows where every non-ID value is missing
    epic$data <- epic$data[
      rowMeans(is.na(epic$data[,
        !(colnames(epic$data) %in% file_id_cols),
        drop = FALSE
      ])) !=
        1,
    ]
    n_col <- grep("^n_", colnames(epic$data))
    if (length(n_col)) {
      # standardize the count column's name; only the first match is renamed,
      # which also avoids an error if multiple `n_` columns are present
      # (the original `[[n_col]]` would fail on a length > 1 index)
      colnames(epic$data)[n_col[[1L]]] <- paste0(
        "n_",
        epic$metadata$standard_name
      )
    }
    if (!is.null(data[[name]])) {
      # align the new file's columns to the previously collected set,
      # padding any columns it lacks with NA
      cols <- colnames(data[[name]])
      cols <- cols[!(cols %in% colnames(epic$data))]
      if (length(cols)) {
        epic$data[, cols] <- NA
      }
      epic$data <- epic$data[, colnames(data[[name]])]
      # drop previously collected rows whose ID combination is superseded
      # by the newer file
      file_id_cols <- id_cols[id_cols %in% colnames(data[[name]])]
      data[[name]] <- data[[name]][
        !(do.call(paste, data[[name]][, file_id_cols]) %in%
          do.call(paste, epic$data[, file_id_cols])),
      ]
    }
    data[[name]] <- rbind(epic$data, data[[name]])
    if (verbose) cli::cli_progress_done()
  }
  for (name in names(metadata)) {
    if (verbose) {
      cli::cli_progress_step(
        "writing standard raw output for {.field {name}}",
        spinner = TRUE
      )
    }
    # write <out_dir>/<name>.json (metadata) and <out_dir>/<name>.csv.xz (data)
    paths <- paste0(out_dir, "/", name, ".", c("json", "csv.xz"))
    jsonlite::write_json(
      metadata[[name]],
      paths[[1L]],
      auto_unbox = TRUE,
      pretty = TRUE
    )
    vroom::vroom_write(data[[name]], paths[[2L]])
    if (cleanup) {
      # remove the staging files recorded in each metadata entry
      unlink(vapply(metadata[[name]], "[[", "", "file"))
    }
    # was cli_process_done(), the superseded cli API; use the progress API
    # consistently with the rest of this file
    if (verbose) cli::cli_progress_done()
  }
  return(list(metadata = metadata, data = data))
}
| 1 |
#' Run Data Project Processes |
|
| 2 |
#' |
|
| 3 |
#' Operates over data source and bundle projects, optionally running the source |
|
| 4 |
#' ingest scripts, then collecting metadata. |
|
| 5 |
#' |
|
| 6 |
#' @param name Name of a source project to process. Will default to the name of the |
|
| 7 |
#' current working directory. |
|
| 8 |
#' @param project_dir Path to the project directory. If not specified, and being called |
|
| 9 |
#' from a source project, this will be assumed to be two steps back from the working directory. |
|
| 10 |
#' @param run_scripts Logical; if \code{FALSE}, will rebuild datapackages without running
|
|
| 11 |
#' scripts. |
|
| 12 |
#' @param is_auto Logical; if \code{TRUE}, will skip process scripts marked as manual.
|
|
| 13 |
#' @param force Logical; if \code{TRUE}, will ignore process frequencies
|
|
| 14 |
#' (will run scripts even if recently run). |
|
| 15 |
#' @param clear_state Logical; if \code{TRUE}, will clear stored states before processing.
|
|
| 16 |
#' @returns A list with processing results: |
|
| 17 |
#' \itemize{
|
|
| 18 |
#' \item \code{timings}: How many seconds the scripts took to run.
|
|
| 19 |
#' \item \code{logs}: The captured output of the scripts.
|
|
| 20 |
#' } |
|
| 21 |
#' Each entry has an entry for each project. |
|
| 22 |
#' |
|
| 23 |
#' A `datapackage.json` file is also created / updated in each source's `standard` directory
|
| 24 |
#' and each bundle's `dist` directory. |
|
| 25 |
#' @examples |
|
| 26 |
#' \dontrun{
|
|
| 27 |
#' # run from a directory containing a `data` directory containing the source |
|
| 28 |
#' dcf_process("source_name")
|
|
| 29 |
#' |
|
| 30 |
#' # run without executing the ingestion script |
|
| 31 |
#' dcf_process("source_name", run_scripts = FALSE)
|
|
| 32 |
#' } |
|
| 33 |
#' @export |
|
| 34 | ||
| 35 |
dcf_process <- function( |
|
| 36 |
name = NULL, |
|
| 37 |
project_dir = ".", |
|
| 38 |
run_scripts = TRUE, |
|
| 39 |
is_auto = FALSE, |
|
| 40 |
force = FALSE, |
|
| 41 |
clear_state = FALSE |
|
| 42 |
) {
|
|
| 43 | 5x |
if (!is.null(name) && missing(project_dir) && dir.exists(name)) {
|
| 44 | ! |
project_dir <- name |
| 45 | ! |
name <- NULL |
| 46 |
} |
|
| 47 | 5x |
settings_file <- paste0(project_dir, "/settings.json") |
| 48 | 5x |
from_project <- file.exists(settings_file) |
| 49 | 5x |
if (from_project) {
|
| 50 | 4x |
source_dir <- paste0( |
| 51 | 4x |
project_dir, |
| 52 |
"/", |
|
| 53 | 4x |
dcf_attempt_read_json(settings_file)$data_dir |
| 54 |
) |
|
| 55 | 1x |
} else if (file.exists(paste0(project_dir, "../../settings.json"))) {
|
| 56 | ! |
project_dir <- normalizePath(project_dir, "/", FALSE) |
| 57 | ! |
source_dir <- dirname(project_dir) |
| 58 | ! |
project_dir <- dirname(dirname(project_dir)) |
| 59 | 1x |
} else if (is.null(name)) {
|
| 60 | 1x |
project_dir <- normalizePath(project_dir, "/", FALSE) |
| 61 | 1x |
source_dir <- dirname(project_dir) |
| 62 | 1x |
name <- basename(project_dir) |
| 63 | 1x |
project_dir <- source_dir |
| 64 |
} else {
|
|
| 65 | ! |
source_dir <- project_dir |
| 66 |
} |
|
| 67 | ||
| 68 | 5x |
sources <- if (is.null(name)) {
|
| 69 | 2x |
list.files( |
| 70 | 2x |
source_dir, |
| 71 | 2x |
"process\\.json", |
| 72 | 2x |
recursive = TRUE, |
| 73 | 2x |
full.names = TRUE |
| 74 |
) |
|
| 75 |
} else {
|
|
| 76 | 3x |
process_files <- paste0(source_dir, "/", name, "/process.json") |
| 77 | 3x |
if (any(!file.exists(process_files))) {
|
| 78 | ! |
cli::cli_abort( |
| 79 | ! |
"missing process file{?/s}: {.emph {process_files[!file.exists(process_files)]}}"
|
| 80 |
) |
|
| 81 |
} |
|
| 82 | 3x |
process_files |
| 83 |
} |
|
| 84 | 5x |
decide_to_run <- function(process_script) {
|
| 85 | 5x |
if (is_auto && process_script$manual) {
|
| 86 | ! |
return(FALSE) |
| 87 |
} |
|
| 88 | 5x |
if ( |
| 89 | 5x |
force || process_script$last_run == "" || process_script$frequency == 0L |
| 90 |
) {
|
|
| 91 | 5x |
return(TRUE) |
| 92 |
} |
|
| 93 | 5x |
if ( |
| 94 | ! |
difftime(Sys.time(), as.POSIXct(process_script$last_run), units = "day") > |
| 95 | ! |
process_script$frequency |
| 96 |
) {
|
|
| 97 | ! |
return(TRUE) |
| 98 |
} |
|
| 99 | ! |
FALSE |
| 100 |
} |
|
| 101 | 5x |
collect_env <- new.env() |
| 102 | 5x |
collect_env$timings <- list() |
| 103 | 5x |
collect_env$logs <- list() |
| 104 | 5x |
  # Run one source project's process scripts and refresh its standard
  # datapackage when the standard data files have changed. Reads
  # `project_dir`, `clear_state`, `run_scripts`, `decide_to_run`,
  # `collect_env`, and `sources` from the enclosing dcf_process() frame.
  process_source <- function(process_file) {
    process_def <- dcf_process_record(process_file)
    if (clear_state) {
      # drop stored raw/standard states so everything looks out of date
      raw_states <- grep("raw_state", names(process_def), fixed = TRUE)
      if (length(raw_states)) {
        process_def[raw_states] <- NULL
      }
      process_def$standard_state <- NULL
      dcf_process_record(process_file, process_def)
    }
    name <- process_def$name
    if (is.null(name)) {
      # fall back to the containing directory's name
      name <- basename(dirname(process_file))
    }
    # ensure the source project scaffolding exists (quiet, no git/workflow)
    dcf_add_source(
      name,
      project_dir,
      open_after = FALSE,
      use_git = FALSE,
      use_workflow = FALSE
    )
    base_dir <- dirname(process_file)
    for (si in seq_along(process_def$scripts)) {
      st <- proc.time()[[3]]
      process_script <- process_def$scripts[[si]]
      run_current <- decide_to_run(process_script)
      script <- paste0(base_dir, "/", process_script$path)
      file_ref <- if (run_current) paste0(" ({.emph ", script, "})") else NULL
      cli::cli_progress_step(
        paste0("processing source {.strong ", name, "}", file_ref),
        spinner = TRUE
      )
      # each script runs in a fresh environment; a script can set
      # `dcf_process_continue <- FALSE` to stop subsequent scripts
      env <- new.env()
      env$dcf_process_continue <- TRUE
      status <- if (run_scripts) {
        tryCatch(
          list(
            log = utils::capture.output(
              source(script, env, chdir = TRUE),
              type = "message"
            ),
            success = TRUE
          ),
          error = function(e) {
            cli::cli_warn("scripts {.file {script}} failed: {e$message}")
            list(log = e$message, success = FALSE)
          }
        )
      } else if (
        length(process_def$scripts) >= si &&
          !is.null(process_def$scripts[[si]]$last_status)
      ) {
        # not running scripts: reuse the status recorded on the last run
        process_def$scripts[[si]]$last_status
      } else {
        list(log = "", success = TRUE)
      }
      collect_env$logs[[name]] <- status$log
      if (run_current) {
        # record timing and outcome on this script's entry
        process_script$last_run <- Sys.time()
        process_script$run_time <- proc.time()[[3]] - st
        process_script$last_status <- status
        process_def$scripts[[si]] <- process_script
      }
      if (status$success) {
        collect_env$timings[[name]] <- process_script$run_time
      }
      if (!env$dcf_process_continue) break
    }
    # re-read the record in case a script updated it, then sync script states
    process_def_current <- dcf_process_record(process_file)
    if (
      is.null(process_def_current$raw_state) ||
        !identical(process_def$raw_state, process_def_current$raw_state)
    ) {
      if (is.null(process_def_current$name)) {
        process_def_current$name <- basename(dirname(process_file))
      }
      process_def_current$scripts <- process_def$scripts
      dcf_process_record(process_file, process_def_current)
    }
    standard_dir <- paste0(base_dir, "/standard")
    data_files <- list.files(standard_dir, "\\.(?:csv|parquet|json)")
    data_files <- data_files[!grepl("datapackage", data_files, fixed = TRUE)]
    if (length(data_files)) {
      # hash measure info + data files; rebuild the datapackage only when
      # something changed since the recorded standard_state
      measure_info_file <- paste0(base_dir, "/measure_info.json")
      standard_state <- as.list(tools::md5sum(c(
        measure_info_file,
        paste0(standard_dir, "/", data_files)
      )))
      if (!identical(process_def_current$standard_state, standard_state)) {
        measure_info <- dcf_measure_info(
          measure_info_file,
          include_empty = FALSE,
          render = TRUE,
          write = FALSE,
          open_after = FALSE,
          verbose = FALSE
        )
        # collect unique source entries (keyed by location) across measures
        measure_sources <- list()
        for (measure_id in names(measure_info)) {
          measure_info[[measure_id]]$id <- measure_id
          info <- measure_info[[measure_id]]
          for (s in info$sources) {
            if (
              is.list(s) &&
                !is.null(s$location) &&
                # NOTE(review): `sources` here is the outer vector of process
                # files, whose names are NULL, so this test is always TRUE;
                # `names(measure_sources)` may have been intended -- confirm
                # (assignment by location below de-duplicates regardless)
                !(s$location %in% names(sources))
            ) {
              measure_sources[[s$location]] <- s
            }
          }
        }
        if (!file.exists(paste0(standard_dir, "/datapackage.json"))) {
          dcf_datapackage_init(name, dir = standard_dir, quiet = TRUE)
        }
        base_meta <- list(
          source = unname(measure_sources),
          base_dir = base_dir,
          ids = "geography",
          time = "time",
          variables = measure_info
        )
        dcf_datapackage_add(
          data_files,
          # when vintages were recorded, attach a per-file vintage
          meta = if (length(process_def_current$vintages)) {
            vintages <- process_def_current$vintages
            lapply(structure(data_files, names = data_files), function(f) {
              base_meta$vintage <- vintages[[f]]
              base_meta
            })
          } else {
            base_meta
          },
          dir = standard_dir,
          pretty = TRUE,
          summarize_ids = TRUE,
          verbose = FALSE
        )
        process_def_current$standard_state <- standard_state
        dcf_process_record(process_file, process_def_current)
      }
      # NOTE(review): `status` is only assigned inside the script loop;
      # this line errors when `process_def$scripts` is empty -- confirm
      cli::cli_progress_done(result = if (status$success) "done" else "failed")
    } else {
      cli::cli_progress_done(result = "failed")
      cli::cli_bullets(
        c(" " = "no standard data files found in {.path {standard_dir}}")
      )
    }
  }
|
| 252 | 5x |
  # Run one bundle project's process scripts (only when its source files
  # changed) and refresh its dist datapackage, merging measure info pulled
  # from the source projects' datapackages. Reads `project_dir`, `source_dir`,
  # `clear_state`, `run_scripts`, `collect_env`, and `sources` from the
  # enclosing dcf_process() frame.
  process_bundle <- function(process_file) {
    process_def <- dcf_process_record(process_file)
    # source_files may be a named list (names are the file paths) or a
    # plain character vector of paths
    source_files <- if (length(process_def$source_files)) {
      if (!is.null(names(process_def$source_files))) {
        names(process_def$source_files)
      } else {
        process_def$source_files
      }
    } else {
      NULL
    }
    if (clear_state) {
      # drop stored states so the bundle looks out of date
      process_def$source_state <- NULL
      process_def$dist_state <- NULL
      dcf_process_record(process_file, process_def)
    }
    name <- process_def$name
    if (is.null(name)) {
      name <- basename(dirname(process_file))
    }
    # ensure the bundle project scaffolding exists (quiet, no git/workflow)
    dcf_add_bundle(
      name,
      project_dir,
      open_after = FALSE,
      use_git = FALSE,
      use_workflow = FALSE
    )
    base_dir <- dirname(process_file)
    for (si in seq_along(process_def$scripts)) {
      st <- proc.time()[[3]]
      process_script <- process_def$scripts[[si]]
      script <- paste0(base_dir, "/", process_script$path)
      run_current <- TRUE
      standard_state <- NULL
      if (length(source_files)) {
        # run only when the hash of the source files differs from the
        # recorded source_state
        standard_files <- paste0(source_dir, "/", source_files)
        standard_state <- as.list(tools::md5sum(paste0(
          source_dir,
          "/",
          source_files
        )))
        run_current <- !identical(standard_state, process_def$source_state)
      }
      if (run_current) {
        cli::cli_progress_step(
          paste0(
            "processing bundle {.strong ",
            name,
            "} ({.emph ",
            script,
            "})"
          ),
          spinner = TRUE
        )
        # scripts run in a fresh environment; a script can set
        # `dcf_process_continue <- FALSE` to stop subsequent scripts
        env <- new.env()
        env$dcf_process_continue <- TRUE
        status <- if (run_scripts) {
          tryCatch(
            list(
              log = utils::capture.output(
                source(script, env, chdir = TRUE),
                type = "message"
              ),
              success = TRUE
            ),
            error = function(e) {
              cli::cli_warn("scripts {.file {script}} failed: {e$message}")
              list(log = e$message, success = FALSE)
            }
          )
        } else if (
          length(process_def$scripts) >= si &&
            !is.null(process_def$scripts[[si]]$last_status)
        ) {
          # not running scripts: reuse the status recorded on the last run
          process_def$scripts[[si]]$last_status
        } else {
          list(log = "", success = TRUE)
        }
        collect_env$logs[[name]] <- status$log
        if (run_current) {
          # record timing and outcome on this script's entry
          process_script$last_run <- Sys.time()
          process_script$run_time <- proc.time()[[3]] - st
          process_script$last_status <- status
          process_def$scripts[[si]] <- process_script
        }
        if (status$success) {
          collect_env$timings[[name]] <- process_script$run_time
        }
        if (!env$dcf_process_continue) break
      }
    }
    process_def_current <- dcf_process_record(process_file)
    # md5 of each source project's datapackage.json, keyed by its path
    source_packages <- as.list(unlist(Filter(
      length,
      lapply(
        unique(paste0(
          vapply(paste0(source_dir, "/", source_files), dirname, ""),
          "/datapackage.json"
        )),
        function(package_file) {
          if (file.exists(package_file)) {
            tools::md5sum(package_file)
          }
        }
      )
    )))
    dist_dir <- paste0(base_dir, "/dist")
    dist_files <- grep(
      "datapackage",
      list.files(dist_dir, recursive = TRUE),
      fixed = TRUE,
      invert = TRUE,
      value = TRUE
    )
    if (length(dist_files)) {
      # state of the dist outputs plus the source packages they derive from
      dist_state <- c(
        as.list(tools::md5sum(paste0(
          base_dir,
          "/dist/",
          dist_files
        ))),
        source_packages
      )
      if (!identical(process_def_current$dist_state, dist_state)) {
        process_def_current$scripts <- process_def$scripts
        process_def_current$dist_state <- dist_state
        # NOTE(review): `standard_state` is only assigned inside the script
        # loop; this errors if `process_def$scripts` is empty -- confirm
        process_def_current$standard_state <- standard_state
        if (is.null(process_def_current$name)) {
          process_def_current$name <- basename(dirname(process_file))
        }
        dcf_process_record(process_file, process_def_current)

        # merge with standard measure infos
        measure_info <- dcf_measure_info(
          paste0(base_dir, "/measure_info.json"),
          include_empty = FALSE,
          render = TRUE,
          write = FALSE,
          open_after = FALSE,
          verbose = FALSE
        )
        if (length(source_packages)) {
          # load each source datapackage's name, resources, and measure info
          source_resources <- lapply(
            names(source_packages),
            function(f) {
              dp <- dcf_attempt_read_json(f)
              list(
                name = dp$name,
                data_dir = dirname(f),
                resources = dp$resources,
                measure_info = dp$measure_info
              )
            }
          )
          # Fill gaps in a measure's info from the matching field in the
          # source datapackages; recurses into `levels` entries.
          fill_source_info <- function(id, info) {
            info$id <- id
            if (!is.null(info$levels)) {
              for (level_id in names(info$levels)) {
                info$levels[[level_id]] <- fill_source_info(
                  level_id,
                  info$levels[[level_id]]
                )
              }
              return(info)
            }
            # a measure may point at a differently named source field
            source_id <- if (!is.null(info$source_id)) {
              info$source_id
            } else {
              id
            }
            source_info <- NULL
            for (package in source_resources) {
              has_package_info <- !is.null(package$measure_info)
              for (resource in package$resources) {
                # NOTE(review): reads `package$measure_info`, same as
                # `has_package_info`; `resource$measure_info` may have been
                # intended -- confirm
                has_resource_info <- !is.null(package$measure_info)
                for (field in resource$schema$fields) {
                  if (identical(source_id, field$name)) {
                    source_info <- field
                    # prefer resource-level info, then package-level
                    source_info$info <- if (
                      has_resource_info &&
                        (source_id %in% names(resource$measure_info))
                    ) {
                      resource$measure_info[[source_id]]
                    } else if (
                      has_package_info &&
                        (source_id %in% names(package$measure_info))
                    ) {
                      package$measure_info[[source_id]]
                    } else {
                      list()
                    }
                    # record where the field came from, with data_dir made
                    # relative to the project root
                    source_info$info$source_file <- list(
                      project = package$name,
                      data_dir = sub(
                        project_dir,
                        "",
                        package$data_dir,
                        fixed = TRUE
                      ),
                      file = resource$filename
                    )
                  }
                }
              }
            }
            if (!is.null(source_info)) {
              # copy source entries into any empty slots; union list entries
              for (entry_name in names(source_info)) {
                if (
                  is.null(info[[entry_name]]) ||
                    (is.character(info[[entry_name]]) &&
                      info[[entry_name]] == "")
                ) {
                  info[[entry_name]] <- source_info[[entry_name]]
                } else if (is.list(info[[entry_name]])) {
                  info[[entry_name]] <- unique(c(
                    info[[entry_name]],
                    source_info[[entry_name]]
                  ))
                }
              }
            }
            info
          }
          for (measure_id in names(measure_info)) {
            measure_info[[measure_id]] <- fill_source_info(
              measure_id,
              measure_info[[measure_id]]
            )
          }
        }
        # collect unique source entries (keyed by location) across measures
        measure_sources <- list()
        for (info in measure_info) {
          for (s in info$sources) {
            if (
              is.list(s) &&
                !is.null(s$location) &&
                # NOTE(review): `sources` is the outer vector of process
                # files (names are NULL), so this test is always TRUE;
                # `names(measure_sources)` may have been intended -- confirm
                !(s$location %in% names(sources))
            ) {
              measure_sources[[s$location]] <- s
            }
          }
        }
        if (!file.exists(paste0(dist_dir, "/datapackage.json"))) {
          dcf_datapackage_init(name, dir = dist_dir, quiet = TRUE)
        }
        metas <- list(
          source = unname(measure_sources),
          base_dir = base_dir,
          ids = "geography",
          time = "time",
          variables = measure_info
        )
        if (!is.null(names(process_def_current$source_files))) {
          # derive per-dist-file vintages from the source datapackages
          bundle_source_files <- names(process_def_current$source_files)
          package_files <- paste0(
            dirname(base_dir),
            "/",
            dirname(bundle_source_files),
            "/datapackage.json"
          )
          vintages <- process_def_current$vintages
          for (i in seq_along(bundle_source_files)) {
            package_file <- package_files[[i]]
            source_dist_files <- process_def_current$source_files[[bundle_source_files[[
              i
            ]]]]
            if (file.exists(package_file)) {
              package <- dcf_attempt_read_json(package_file)
              for (resource in package$resources) {
                if (length(resource$vintage)) {
                  # keep the newest vintage seen for each dist file
                  for (dist_file in source_dist_files) {
                    vintages[[dist_file]] <- max(
                      vintages[[dist_file]],
                      resource$vintage
                    )
                  }
                }
              }
            }
          }
          if (length(names(vintages)) && any(names(vintages) %in% dist_files)) {
            # expand to one meta per dist file, each carrying its vintage
            metas <- lapply(
              structure(dist_files, names = dist_files),
              function(dist_file) {
                metas$vintage <- vintages[[dist_file]]
                metas
              }
            )
          }
        } else if (length(process_def_current$vintages)) {
          # no per-source mapping: apply recorded vintages directly
          vintages <- process_def_current$vintages
          metas <- lapply(
            structure(dist_files, names = dist_files),
            function(f) {
              metas$vintage <- vintages[[f]]
              metas
            }
          )
        }
        dcf_datapackage_add(
          dist_files,
          meta = metas,
          dir = dist_dir,
          pretty = TRUE,
          summarize_ids = TRUE,
          verbose = FALSE
        )
      }
      # NOTE(review): `status` is only assigned inside the script loop when
      # `run_current` was TRUE; this errors if no script ran -- confirm
      cli::cli_progress_done(result = if (status$success) "done" else "failed")
    } else {
      cli::cli_progress_done(result = "failed")
      cli::cli_bullets(
        c(" " = "no standard data files found in {.path {process_file}}")
      )
    }
  }
|
| 568 | 5x |
for (process_file in sources[order( |
| 569 | 5x |
vapply( |
| 570 | 5x |
sources, |
| 571 | 5x |
function(f) {
|
| 572 | 7x |
type <- dcf_attempt_read_json(f, strict = FALSE)$type |
| 573 | 7x |
is.null(type) || type != "bundle" |
| 574 |
}, |
|
| 575 | 5x |
TRUE |
| 576 |
), |
|
| 577 | 5x |
decreasing = TRUE |
| 578 |
)]) {
|
|
| 579 | 7x |
process_def <- dcf_process_record(process_file) |
| 580 | 7x |
if (is.null(process_def)) {
|
| 581 | ! |
next |
| 582 |
} |
|
| 583 | 7x |
if (is.null(process_def$type) || process_def$type == "source") {
|
| 584 | 5x |
process_source(process_file) |
| 585 |
} else {
|
|
| 586 | 2x |
process_bundle(process_file) |
| 587 |
} |
|
| 588 |
} |
|
| 589 | 5x |
invisible(list(timings = collect_env$timings, logs = collect_env$logs)) |
| 590 |
} |
| 1 |
#' Make a Status Diagram |
|
| 2 |
#' |
|
| 3 |
#' Make a Data Collection Project status diagram. |
|
| 4 |
#' |
|
| 5 |
#' @param project_dir Path to the Data Collection Framework project to be built. |
|
| 6 |
#' @param out_file File name of the file to write within \code{project_dir}.
|
|
| 7 |
#' @returns A character vector of the status diagram, which is also written to |
|
| 8 |
#' the \code{project_dir/status.md} file.
|
|
| 9 |
#' @examples |
|
| 10 |
#' \dontrun{
|
|
| 11 |
#' dcf_status_diagram("project_directory")
|
|
| 12 |
#' } |
|
| 13 |
#' @export |
|
| 14 | ||
| 15 |
dcf_status_diagram <- function(project_dir = ".", out_file = "status.md") {
  # Requires the compressed report (as written by dcf_report) at the project root.
  report_file <- paste0(project_dir, "/report.json.gz")
  if (!file.exists(report_file)) {
    cli::cli_abort("no report file found")
  }
  report <- dcf_attempt_read_json(report_file)
  # Fall back to defaults when the corresponding settings are absent.
  data_dir <- if (is.null(report$settings$data_dir)) {
    "data"
  } else {
    report$settings$data_dir
  }
  branch <- if (is.null(report$settings$branch)) {
    "main"
  } else {
    report$settings$branch
  }
  # An empty github_account means there is no remote repository to link to.
  repo <- if (identical(report$settings$github_account, "")) {
    NULL
  } else {
    paste0(report$settings$github_account, "/", report$settings$repo_name)
  }
  indent <- " "
  # Mermaid class definitions used to color file nodes by status.
  d <- c(
    'classDef pass stroke:#66bb6a',
    'classDef warn stroke:#ffa726',
    'classDef fail stroke:#f44336'
  )
  sources <- NULL # source node definitions, keyed by generated source id
  source_ids <- list() # source name / location_url -> generated id ("s0", ...)
  file_ids <- NULL # standard file path -> numeric node id
  relationships <- NULL # mermaid edge lines
  projects <- NULL # subgraph blocks, one per process
  node_id <- 0L
  # Iterate source-type processes before bundles (FALSE sorts before TRUE) so
  # file_ids is populated before bundle projects look their source files up.
  for (name in names(sort(vapply(
    report$processes,
    function(p) !is.null(p$type) && p$type == "bundle",
    TRUE
  )))) {
    # NOTE(review): `timing` is assigned but never used below; `failed` is
    # later recomputed from report$source_times directly.
    timing <- report$source_times[[name]]
    issues <- report$issues[[name]]
    if (length(issues)) {
      # Normalize issue names to paths relative to the data directory.
      names(issues) <- sub(
        paste0(data_dir, "/"),
        "",
        sub(
          paste0(project_dir, "/"),
          "",
          sub("^\\.*/", "", names(issues)),
          fixed = TRUE
        ),
        fixed = TRUE
      )
    }
    # Metadata entries belonging to this process (keys prefixed "name/").
    metas <- report$metadata[grep(
      paste0("^", name, "/"),
      names(report$metadata)
    )]
    measures <- if (length(metas)) metas[[1L]]$measure_info else list()
    process <- report$processes[[name]]
    contents <- NULL
    if (!is.null(process$type) && identical(process$type, "bundle")) {
      # Bundle project: one node per distributed data file (metadata files
      # like measure_info / datapackage are excluded).
      dist_files <- grep(
        "measure_info|datapackage",
        names(process$dist_state),
        value = TRUE,
        invert = TRUE
      )
      for (filename in sub(
        paste0("^[./]*", data_dir, "/", name, "/(?:dist|standard)/"),
        "",
        dist_files
      )) {
        node_id <- node_id + 1L
        contents <- c(
          contents,
          paste0("n", node_id, '["`', filename, '`"]')
        )
      }
      # Connect the source files this bundle consumes (resolved to node ids
      # assigned while processing the source projects) to the bundle subgraph.
      file_nodes <- file_ids[
        if (!is.null(names(process$source_files))) {
          names(process$source_files)
        } else {
          unlist(process$source_files)
        }
      ]
      file_nodes <- file_nodes[!is.na(file_nodes)]
      if (length(file_nodes)) {
        relationships <- c(
          relationships,
          paste0("n", file_nodes, " --> ", name)
        )
      }
    } else {
      # Source project: one node per standard resource, with edges from the
      # data sources each of its fields cites.
      measure_sources <- measures[["_sources"]]
      if (is.null(measure_sources)) {
        measure_sources <- measures[["_source"]]
      }
      for (project_meta in metas) {
        for (r in project_meta$resources) {
          node_id <- node_id + 1L
          file_path <- paste0(
            name,
            "/standard/",
            r$filename
          )
          # Record this file's node id for later bundle lookups.
          file_ids[paste0(name, "/standard/", r$filename)] <- node_id
          file_issues <- issues[[file_path]]
          for (field in r$schema$fields) {
            field_source <- measures[[field$name]]$sources
            # A single named source entry is wrapped so the loop below always
            # iterates over a list of sources.
            if (!is.null(names(field_source))) {
              field_source <- list(field_source)
            }
            for (s in field_source) {
              # Bare strings are treated as source ids.
              if (is.character(s)) {
                s <- list(id = s)
              }
              # Resolve ids against the _sources registry, falling back to
              # using the id as the display name.
              if (!is.null(s$id)) {
                s <- if (is.null(measure_sources[[s$id]])) {
                  c(s, list(name = s$id))
                } else {
                  c(s, measure_sources[[s$id]])
                }
              }
              # Create (or reuse) the general source node for this name.
              if (is.null(source_ids[[s$name]])) {
                source_id <- paste0("s", length(source_ids))
                source_ids[[s$name]] <- source_id
                sources[[source_id]] <- list(
                  id = source_id,
                  general = make_link(s$url, s$name),
                  specific = NULL
                )
                parent_id <- source_id
              } else {
                parent_id <- source_ids[[s$name]]
              }
              # When a more specific location is given, attach it as a child
              # node of the general source.
              if (is.null(s$location_url)) {
                source_id <- source_ids[[s$name]]
              } else {
                if (is.null(source_ids[[s$location_url]])) {
                  source_id <- paste0("s", length(source_ids))
                  source_ids[[s$location_url]] <- source_id
                  relationships <- unique(c(
                    relationships,
                    paste0(
                      parent_id,
                      "---",
                      source_id,
                      '["',
                      make_link(s$location_url, s$location),
                      '"]'
                    )
                  ))
                  sources[[parent_id]]$specific <- c(
                    sources[[parent_id]]$specific,
                    source_id
                  )
                } else {
                  source_id <- source_ids[[s$location_url]]
                }
              }
              # Edge from the (possibly specific) source to this file's node.
              relationships <- unique(c(
                relationships,
                paste0(source_id, " --> n", node_id)
              ))
            }
          }
          failed <- is.null(report$source_times[[name]])
          # Node label: filename plus any issue list and failure log, styled
          # by the fail/warn/pass class.
          contents <- c(
            contents,
            paste0(
              "n",
              node_id,
              '["`',
              r$filename,
              if (length(file_issues)) {
                # Collapse per-measure issues ("measure: issue") into one
                # line per measure before rendering the HTML list.
                if (!is.null(file_issues$measures)) {
                  measure_issues <- do.call(
                    rbind,
                    strsplit(unlist(file_issues$measures), ": ")
                  )
                  measure_issues <- tapply(
                    measure_issues[, 2],
                    measure_issues[, 1],
                    paste,
                    collapse = ", "
                  )
                  file_issues$measures <- paste0(
                    names(measure_issues),
                    ": ",
                    measure_issues
                  )
                }
                paste0("<br/><br/>", make_list(unlist(file_issues)))
              },
              if (failed) {
                paste0(
                  if (length(file_issues)) "<br />" else "<br /><br />",
                  "Script Failed:<br />",
                  # Backticks and quotes would break the mermaid label.
                  gsub(
                    '[`"]',
                    "'",
                    paste(report$logs[[name]], collapse = "<br />")
                  )
                )
              },
              paste0(
                '`"]:::',
                if (failed) {
                  "fail"
                } else if (length(file_issues)) {
                  "warn"
                } else {
                  "pass"
                }
              )
            )
          )
        }
      }
    }
    # Wrap this project's file nodes in a subgraph, titled with a GitHub
    # link when a remote repository is configured.
    projects <- c(
      projects,
      c(
        paste0(
          "subgraph ",
          name,
          '["`',
          if (is.null(repo)) {
            name
          } else {
            make_link(
              paste0(
                "https://github.com/",
                repo,
                "/tree/",
                branch,
                "/",
                data_dir,
                "/",
                name
              ),
              name
            )
          },
          '`"]'
        ),
        paste0(indent, c("direction LR", contents)),
        "end"
      )
    )
  }
  # Assemble the fenced mermaid block: styles, source nodes, subgraphs, edges.
  out <- c(
    "```mermaid",
    "flowchart LR",
    paste0(
      indent,
      c(
        d,
        vapply(
          sources,
          function(s) paste(c(s$id, '(("', s$general, '"))'), collapse = ""),
          ""
        ),
        projects,
        relationships
      )
    ),
    "```"
  )
  if (is.character(out_file) && out_file != "") {
    writeLines(out, paste0(project_dir, "/", out_file))
  }
  invisible(out)
}
|
| 289 | ||
| 290 |
# Build a bold HTML link that opens in a new tab.
# When no display name is given, the URL itself is used, with its scheme and
# any leading "www." stripped.
make_link <- function(url, name = NULL) {
  label <- name
  if (is.null(label)) {
    label <- sub("https?://(?:www\\.)?", "", url)
  }
  opening <- paste0(
    '<strong><a href="',
    url,
    '" target="_blank" rel="noreferrer">'
  )
  paste0(opening, label, "</a></strong>")
}
|
| 299 | ||
| 300 |
# Render a character vector as an HTML unordered list, with each item
# wrapped in <li><code>...</code></li>.
make_list <- function(items) {
  wrapped <- vapply(
    items,
    function(item) paste0("<li><code>", item, "</code></li>"),
    ""
  )
  paste0("<ul>", paste(wrapped, collapse = ""), "</ul>")
}
| 1 |
#' Make a measurement metadata file |
|
| 2 |
#' |
|
| 3 |
#' Make a \code{measure_info.json} file, or add measure entries to an existing one.
|
|
| 4 |
#' |
|
| 5 |
#' @param path Path to the \code{measure_info.json} file, existing or to be created.
|
|
| 6 |
#' @param ... Lists containing individual measure items. See the Measure Entries section. |
|
| 7 |
#' @param info A list containing measurement information to be added. |
|
| 8 |
#' @param references A list containing citation entries. See the Reference Entries section. |
|
| 9 |
#' @param sources A list containing source entries. See the Sources Entries section. |
|
| 10 |
#' @param strict Logical; if \code{TRUE}, will only allow recognized entries and values.
|
|
| 11 |
#' @param include_empty Logical; if \code{FALSE}, will omit entries that have not been provided.
|
|
| 12 |
#' @param overwrite_entry Logical; if \code{TRUE}, will replace rather than add to an existing entry.
|
|
| 13 |
#' @param render Path to save a version of \code{path} to, with dynamic entries expanded. See the
|
|
| 14 |
#' Dynamic Entries section. |
|
| 15 |
#' @param overwrite Logical; if \code{TRUE}, will overwrite rather than add to an existing \code{path}.
|
|
| 16 |
#' @param write Logical; if \code{FALSE}, will not write the build or rendered measure info.
|
|
| 17 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages.
|
|
| 18 |
#' @param open_after Logical; if \code{FALSE}, will not open the measure file after writing/updating.
|
|
| 19 |
#' @section Measure Entries: |
|
| 20 |
#' Measure entries are named by the unique variable name with any of these entries (if \code{strict}):
|
|
| 21 |
#' \itemize{
|
|
| 22 |
#' \item \strong{\code{id}}: Unique identifier of the measure; same as the entry name.
|
|
| 23 |
#' This is meant to correspond to the column name containing the measure in data files. |
|
| 24 |
#' It should be minimal in length while still being unique across all files within the project. |
|
| 25 |
#' It should only contain the characters \code{a-z}, \code{0-9}, or \code{_}.
|
|
| 26 |
#' \item \strong{\code{short_name}}: Shortest possible display name.
|
|
| 27 |
#' \item \strong{\code{long_name}}: Longer display name.
|
|
| 28 |
#' \item \strong{\code{category}}: Arbitrary category for the measure.
|
|
| 29 |
#' \item \strong{\code{subcategory}}: Arbitrary subcategory for the measure.
|
|
| 30 |
#' \item \strong{\code{short_description}}: Shortest possible description.
|
|
| 31 |
#' \item \strong{\code{long_description}}: Complete description. Either description can include
|
|
| 32 |
#' TeX-style equations, enclosed in escaped square brackets (e.g., |
|
| 33 |
#' \code{"The equation \\\\[a_{i} = b^\\\\frac{c}{d}\\\\] was used."}; or \code{$...$},
|
|
| 34 |
#' \code{\\\\(...\\\\)}, or \code{\\\\begin{math}...\\\\end{math}}). The final enclosing symbol must be
|
|
| 35 |
#'   followed by a space or the end of the string. These are pre-rendered to MathML with |
|
| 36 |
#' \code{\link[katex]{katex_mathml}}.
|
|
| 37 |
#' \item \strong{\code{statement}}: String with dynamic references to entity features
|
|
| 38 |
#' (e.g., \code{"measure value = {value}"}). References can include:
|
|
| 39 |
#' \itemize{
|
|
| 40 |
#' \item \code{value}: Value of a currently displaying variable at a current time.
|
|
| 41 |
#' \item \code{region_name}: Alias of \code{features.name}.
|
|
| 42 |
#' \item \code{features.<entry>}: An entity feature, coming from \code{entity_info.json} or GeoJSON properties.
|
|
| 43 |
#' All entities have at least \code{name} and \code{id} entries (e.g., \code{"{features.id}"}).
|
|
| 44 |
#' \item \code{variables.<entry>}: A variable feature such as \code{name} which is the same as
|
|
| 45 |
#' \code{id} (e.g., \code{"{variables.name}"}).
|
|
| 46 |
#' \item \code{data.<variable>}: The value of another variable at a current time (e.g., \code{"{data.variable_a}"}).
|
|
| 47 |
#' } |
|
| 48 |
#' \item \strong{\code{measure_type}}: A measure's type, that is more specific than its storage type
|
|
| 49 |
#' (e.g., \code{rate} or \code{percent} for a \code{float}-type measure).
|
|
| 50 |
#' \item \strong{\code{unit}}: What a single value of the measure represents
|
|
| 51 |
#' (e.g., \code{person} for a count of people, or \code{per 100k people} for a rate per 100k people).
|
|
| 52 |
#' \item \strong{\code{time_resolution}}: Temporal resolution of the variable, such as \code{year} or \code{week}.
|
|
| 53 |
#' \item \strong{\code{restrictions}}: A license or description of restrictions that may apply to the measure.
|
|
| 54 |
#'   \item \strong{\code{sources}}: A list or list of lists containing source information, including any of these entries:
|
|
| 55 |
#' \itemize{
|
|
| 56 |
#' \item \code{id}: An ID found in the \code{_sources} entry, to inherit entries from.
|
|
| 57 |
#' \item \code{name}: Name of the source (such as an organization name).
|
|
| 58 |
#' \item \code{url}: General URL of the source (such as an organization's website).
|
|
| 59 |
#'     \item \code{location}: More specific description of the source (such as the name of a particular data product).
|
|
| 60 |
#' \item \code{location_url}: More direct URL to the resource (such as a page listing data products).
|
|
| 61 |
#' } |
|
| 62 |
#' \item \strong{\code{citations}}: A vector of reference ids (the names of \code{reference} entries; e.g., \code{c("ref1", "ref3")}).
|
|
| 63 |
#' \item \strong{\code{categories}}: A named list of categories, with any of the other measure entries, or a
|
|
| 64 |
#' \code{default} entry giving a default category name. See the Dynamic Entries section.
|
|
| 65 |
#' \item \strong{\code{variants}}: A named list of variants, with any of the other measure entries, or a
|
|
| 66 |
#' \code{default} entry giving a default variant name. See the Dynamic Entries section.
|
|
| 67 |
#' } |
|
| 68 |
#' |
|
| 69 |
#' @section Bundle Entries: |
|
| 70 |
#' Measures in bundle projects can inherit the information provided in source bundles. |
|
| 71 |
#' This will happen when either the measure has the same name as an existing measure |
|
| 72 |
#' (in which case, the info can be empty: \code{"existing_measure": {}}),
|
|
| 73 |
#' or when a special \code{source_id} entry maps to an existing measure
|
|
| 74 |
#' (\code{"new_measure": {"source_id": "existing_measure"}}).
|
|
| 75 |
#' |
|
| 76 |
#' If bundle files are in tall format, such that measures are stacked, they can be documented by |
|
| 77 |
#' (1) using a special \code{levels} entry to map levels of a variable that identifies the measure,
|
|
| 78 |
#' and (2) using a special \code{measure_column} entry for the variable containing values,
|
|
| 79 |
#' to point to that identifier variable: |
|
| 80 |
#' |
|
| 81 |
#' \enumerate{
|
|
| 82 |
#' \item \code{"measure": {"levels": {"existing_measure": {}, "new_measure": {"source_id": "existing_measure"}}}}
|
|
| 83 |
#' \item \code{"value": {"measure_column": "measure"}}
|
|
| 84 |
#' } |
|
| 85 |
#' |
|
| 86 |
#' @section Duplicate Names: |
|
| 87 |
#' It is strongly preferable that every distinct measure has a name that is unique across all |
|
| 88 |
#' files within a collection project. |
|
| 89 |
#' |
|
| 90 |
#' If names must be duplicated between files, they can be prefixed with the path to the file |
|
| 91 |
#' containing them, relative to the data directory (or standalone parent), separated by a bar |
|
| 92 |
#' (\code{|}; e.g., \code{subproject_name/dist/data.csv.gz|measure_name}).
|
|
| 93 |
#' |
|
| 94 |
#' @section Dynamic Entries: |
|
| 95 |
#' You may have several closely related variables in a dataset, which share sections of metadata, |
|
| 96 |
#' or have formulaic differences. In cases like this, the \code{categories} and/or \code{variants} entries
|
|
| 97 |
#' can be used along with dynamic notation to construct multiple entries from a single template. |
|
| 98 |
#' |
|
| 99 |
#' Though functionally the same, \code{categories} might include broken-out subsets of some total
|
|
| 100 |
#' (such as race groups, as categories of a total population), whereas \code{variants} may be different
|
|
| 101 |
#' transformations of the same variable (such as raw counts versus percentages). |
|
| 102 |
#' |
|
| 103 |
#' In dynamic entries, \code{{category}} or \code{{variant}} refers to entries in the \code{categories}
|
|
| 104 |
#' or \code{variants} lists. By default, these are replaced with the name of each entries in those lists
|
|
| 105 |
#' (e.g., \code{"variable_{category}"} where \code{categories = "a"} would become \code{"variable_a"}).
|
|
| 106 |
#' A \code{default} entry would change this behavior (e.g., with \code{categories = list(a = list(default = "b")}
|
|
| 107 |
#' that would become \code{"variable_b"}). Adding \code{.name} would force the original behavior (e.g.,
|
|
| 108 |
#' \code{"variable_{category.name}"} would be \code{"variable_a"}). A name of \code{"blank"} is treated as
|
|
| 109 |
#' an empty string. |
|
| 110 |
#' |
|
| 111 |
#' When notation appears in a measure info entry, they will first default to a matching name in the \code{categories}
|
|
| 112 |
#' or \code{variants} list; for example, \code{short_name} in \code{list(short_name = "variable {category}")} with
|
|
| 113 |
#' \code{categories = list(a = list(short_name = "(category a)"))} would become \code{"variable (category a)"}.
|
|
| 114 |
#' To force this behavior, the entry name can be included in the notation (e.g., |
|
| 115 |
#' \code{"{category.short_name}"} would be \code{"variable (category a)"} in any entry).
|
|
| 116 |
#' |
|
| 117 |
#' Only string entries are processed dynamically -- any list-like entries (such as |
|
| 118 |
#' \code{source}, \code{citations}, or \code{layer}) appearing in
|
|
| 119 |
#' \code{categories} or \code{variants} entries will fully replace the base entry.
|
|
| 120 |
#' |
|
| 121 |
#' Dynamic entries can be kept dynamic when passed to a data site, but can be rendered for other uses, |
|
| 122 |
#' where the rendered version will have each dynamic entry replaced with all unique combinations of |
|
| 123 |
#' \code{categories} and \code{variants} entries, assuming both are used in the dynamic entry's name
|
|
| 124 |
#' (e.g., \code{"variable_{category}_{variant}"}). See Examples.
|
|
| 125 |
#' |
|
| 126 |
#' @section Reference Entries: |
|
| 127 |
#' Reference entries can be included in a \code{_references} entry, and should have names corresponding to
|
|
| 128 |
#' those included in any of the measures' \code{citation} entries. These can include any of these entries:
|
|
| 129 |
#' \itemize{
|
|
| 130 |
#' \item \strong{\code{id}}: The reference id, same as the entry name.
|
|
| 131 |
#' \item \strong{\code{author}}: A list or list of lists specifying one or more authors. These can include
|
|
| 132 |
#' entries for \code{given} and \code{family} names.
|
|
| 133 |
#' \item \strong{\code{year}}: Year of the publication.
|
|
| 134 |
#' \item \strong{\code{title}}: Title of the publication.
|
|
| 135 |
#' \item \strong{\code{journal}}: Journal in which the publication appears.
|
|
| 136 |
#' \item \strong{\code{volume}}: Volume number of the journal.
|
|
| 137 |
#' \item \strong{\code{page}}: Page number of the journal.
|
|
| 138 |
#' \item \strong{\code{doi}}: Digital Object Identifier, from which a link is made (\code{https://doi.org/{doi}}).
|
|
| 139 |
#' \item \strong{\code{version}}: Version number of software.
|
|
| 140 |
#' \item \strong{\code{url}}: Link to the publication, alternative to a DOI.
|
|
| 141 |
#' } |
|
| 142 |
#' |
|
| 143 |
#' @section Source Entries: |
|
| 144 |
#' Source entries can be included in a \code{_sources} entry, and should have names corresponding to those
|
|
| 145 |
#' included in any of the measures' \code{sources} entry. These can include any of these entries:
|
|
| 146 |
#' \itemize{
|
|
| 147 |
#' \item \strong{\code{name}}: Name of the source.
|
|
| 148 |
#' \item \strong{\code{url}}: Link to the source's site.
|
|
| 149 |
#' \item \strong{\code{description}}: A description of the source.
|
|
| 150 |
#' \item \strong{\code{notes}}: A list of additional notes about the source.
|
|
| 151 |
#' \item \strong{\code{organization}}: Name of a higher-level organization that the source is a part of.
|
|
| 152 |
#' \item \strong{\code{organization_url}}: Link to the organization's site.
|
|
| 153 |
#' \item \strong{\code{category}}: A top-level category classification.
|
|
| 154 |
#' \item \strong{\code{subcategory}}: A lower-level category classification.
|
|
| 155 |
#' } |
|
| 156 |
#' @examples |
|
| 157 |
#' path <- tempfile() |
|
| 158 |
#' |
|
| 159 |
#' # make an initial file |
|
| 160 |
#' dcf_measure_info(path, "measure_name" = list( |
|
| 161 |
#' id = "measure_name", |
|
| 162 |
#' short_description = "A measure.", |
|
| 163 |
#' statement = "This entity has {value} measure units."
|
|
| 164 |
#' ), verbose = FALSE) |
|
| 165 |
#' |
|
| 166 |
#' # add another measure to that |
|
| 167 |
#' measure_info <- dcf_measure_info(path, "measure_two" = list( |
|
| 168 |
#' id = "measure_two", |
|
| 169 |
#' short_description = "Another measure.", |
|
| 170 |
#' statement = "This entity has {value} measure units."
|
|
| 171 |
#' ), verbose = FALSE) |
|
| 172 |
#' names(measure_info) |
|
| 173 |
#' |
|
| 174 |
#' # add a dynamic measure, and make a rendered version |
|
| 175 |
#' measure_info_rendered <- dcf_measure_info( |
|
| 176 |
#' path, |
|
| 177 |
#' "measure_{category}_{variant.name}" = list(
|
|
| 178 |
#' id = "measure_{category}_{variant.name}",
|
|
| 179 |
#' short_description = "Another measure ({category}; {variant}).",
|
|
| 180 |
#' statement = "This entity has {value} {category} {variant}s.",
|
|
| 181 |
#' categories = c("a", "b"),
|
|
| 182 |
#' variants = list(u1 = list(default = "U1"), u2 = list(default = "U2")) |
|
| 183 |
#' ), |
|
| 184 |
#' render = TRUE, verbose = FALSE |
|
| 185 |
#' ) |
|
| 186 |
#' names(measure_info_rendered) |
|
| 187 |
#' measure_info_rendered[["measure_a_u1"]]$statement |
|
| 188 |
#' @return An invisible list containing measurement metadata (the rendered version if made). |
|
| 189 |
#' @export |
|
| 190 | ||
| 191 |
dcf_measure_info <- function(
  path,
  ...,
  info = list(),
  references = list(),
  sources = list(),
  strict = FALSE,
  include_empty = TRUE,
  overwrite_entry = FALSE,
  render = NULL,
  overwrite = FALSE,
  write = TRUE,
  verbose = TRUE,
  open_after = interactive()
) {
  if (write) {
    if (missing(path) || !is.character(path)) {
      cli::cli_abort(
        "enter a path to the measure_info.json file as {.arg path}"
      )
    }
    dir.create(dirname(path), FALSE, TRUE)
  }
  built <- list()
  # Start from the existing file unless overwriting.
  if (!overwrite && is.character(path) && file.exists(path)) {
    if (verbose) {
      cli::cli_bullets(c(
        i = "updating existing file: {.path {basename(path)}}"
      ))
    }
    built <- dcf_attempt_read_json(path)
    # A file containing a single bare measure entry: wrap it so `built` is
    # always a named list of entries.
    if (all(c("id", "measure_type") %in% names(built))) {
      built <- list(built)
      names(built) <- built[[1]]$id
    }
  }
  # Merge supplied references/sources with any existing ones; newly supplied
  # entries win on name collisions.
  if (length(references)) {
    references <- c(references, built$`_references`)
    references <- references[!duplicated(names(references))]
    built$`_references` <- references
  } else {
    references <- built$`_references`
  }
  if (length(sources)) {
    sources <- c(sources, built$`_sources`)
    sources <- sources[!duplicated(names(sources))]
    built$`_sources` <- sources
  } else {
    sources <- built$`_sources`
  }
  # Template of recognized entries: used to fill in blanks, and (when strict)
  # to drop unrecognized entries.
  defaults <- list(
    id = "",
    short_name = "",
    long_name = "",
    category = "",
    short_description = "",
    long_description = "",
    statement = "",
    measure_type = "",
    unit = "",
    time_resolution = "",
    restrictions = "",
    sources = list(),
    citations = list()
  )
  recognized_columns <- c(
    names(defaults),
    "source_id",
    "levels",
    "measure_column"
  )
  # A bare character vector of measure names becomes named empty entries.
  if (!is.list(info)) {
    info <- sapply(info, function(name) list())
  }
  info <- c(list(...), info)
  if (length(info) && is.null(names(info))) {
    cli::cli_abort("supplied measure entries must be named")
  }
  for (n in names(info)) {
    # Merge with any existing entry unless replacing it outright.
    if (overwrite_entry || is.null(built[[n]])) {
      l <- info[[n]]
    } else {
      l <- c(info[[n]], built[[n]])
      l <- l[!duplicated(names(l))]
    }
    if (is.null(l$id)) {
      l$id <- n
    }
    # Standard entries are regular measures: not special ("_"-prefixed,
    # variants/categories containers) and not tall-format level maps.
    is_standard <- !startsWith(n, "_") &&
      !(n %in% c("variants", "categories")) &&
      is.null(l$levels)
    if (strict) {
      su <- names(l) %in% recognized_columns
      if (verbose && any(!su)) {
        cli::cli_warn(paste0(
          "unrecognized {?entry/entries} in ",
          n,
          ": {names(l)[!su]}"
        ))
      }
      if (include_empty && is_standard) {
        # Fill a fresh copy of the template; mutating `defaults` itself
        # would leak this entry's values into later entries in the loop.
        entry <- defaults
        for (e in names(l)) {
          if (!is.null(entry[[e]])) {
            entry[[e]] <- l[[e]]
          }
        }
        l <- entry
      } else {
        l <- l[su]
      }
    } else if (include_empty && is_standard) {
      su <- !names(defaults) %in% names(l)
      if (any(su)) l <- c(l, defaults[su])
    }
    # Coerce shorthand character vectors to named lists of default labels.
    if (!is.null(l$categories) && !is.list(l$categories)) {
      l$categories <- structure(
        lapply(l$categories, function(e) list(default = e)),
        names = l$categories
      )
    }
    if (!is.null(l$variants) && !is.list(l$variants)) {
      l$variants <- structure(
        lapply(l$variants, function(e) list(default = e)),
        # was `names = l$categories`, which named variants after categories
        names = l$variants
      )
    }
    if (verbose && !is.null(l$citations)) {
      su <- !l$citations %in% names(references)
      if (any(su)) {
        cli::cli_warn(
          "no matching reference entry for {.val {l$citations[su]}} in {.val {n}}"
        )
      }
    }
    if (verbose && !is.null(l$sources)) {
      # Collect source ids to validate: character entries are used as-is,
      # list entries contribute their `id` element (entries without an id
      # are skipped). Previously the extraction only ran on character
      # vectors, so list-form sources were compared raw against source
      # names and never matched.
      l_sources <- l$sources
      if (!is.character(l_sources) && !is.null(names(l_sources))) {
        # a single source entry, rather than a list of them
        l_sources <- list(l_sources)
      }
      l_sources <- Filter(
        nchar,
        vapply(
          l_sources,
          function(s) {
            if (is.character(s)) {
              s
            } else if (is.null(s$id)) {
              ""
            } else {
              s$id
            }
          },
          ""
        )
      )
      if (length(l_sources)) {
        su <- !l_sources %in% names(sources)
        if (any(su)) {
          cli::cli_warn(
            "no matching source entry for {.val {l_sources[su]}} in {.val {n}}"
          )
        }
      }
    }
    built[[n]] <- l
  }
  # Keep special "_"-prefixed entries (_references, _sources) at the end.
  built <- built[order(grepl("^_", names(built)))]
  if (write) {
    if (verbose) {
      cli::cli_bullets(c(i = "writing info to {.path {path}}"))
    }
    jsonlite::write_json(built, path, auto_unbox = TRUE, pretty = TRUE)
  }
  if (!is.null(render)) {
    # Expand dynamic entries (names containing "{...}" notation) into their
    # concrete category/variant combinations.
    expanded <- list()
    for (name in names(built)) {
      expanded <- c(
        expanded,
        if (grepl("{", name, fixed = TRUE)) {
          render_info(built[name])
        } else {
          structure(list(built[[name]]), names = name)
        }
      )
    }
    changed <- !identical(built, expanded)
    built <- expanded
    if (write && changed) {
      path <- if (is.character(render)) {
        render
      } else {
        sub("\\.json", "_rendered.json", path, TRUE)
      }
      if (verbose) {
        cli::cli_bullets(c(i = "writing rendered info to {.path {path}}"))
      }
      jsonlite::write_json(built, path, auto_unbox = TRUE, pretty = TRUE)
    }
  }
  if (open_after) {
    rstudioapi::navigateToFile(path)
  }
  invisible(built)
}
|
| 395 | ||
| 396 |
# Replace inline LaTeX equations in measure-info description fields with
# rendered MathML (via katex). Recognized delimiters: $...$, \[...\], \(...\),
# and \begin{math}...\end{math}, each followed by whitespace or end of string.
# Recurses into `categories` and `variants` sublists; entries that are not
# lists are first wrapped as `list(default = <value>)`.
replace_equations <- function(info) {
  lapply(info, function(e) {
    if (!is.list(e)) {
      e <- list(default = e)
    }
    # Any entry whose name contains "description" may hold equations.
    descriptions <- grep("description", names(e), fixed = TRUE)
    if (length(descriptions)) {
      for (d in descriptions) {
        # Capture group 1 is the equation body between the delimiters.
        p <- gregexpr(
          "(?:\\$|\\\\\\[|\\\\\\(|\\\\begin\\{math\\})(.+?)(?:\\$|\\\\\\]|\\\\\\)|\\\\end\\{math\\})(?=\\s|$)",
          e[[d]],
          perl = TRUE
        )[[1]]
        # gregexpr signals "no match" with a first position of -1.
        if (p[[1]] != -1) {
          # Pad with spaces so leading/trailing matches split cleanly below.
          re <- paste("", e[[d]], "")
          fm <- regmatches(e[[d]], p)
          for (i in seq_along(p)) {
            # Extract just the equation body using the capture offsets.
            mp <- attr(p, "capture.start")[i, ]
            eq <- substring(e[[d]], mp, mp + attr(p, "capture.length")[i, ] - 1)
            # Rendering failures leave the original text untouched.
            parsed <- tryCatch(
              katex::katex_mathml(eq),
              error = function(e) NULL
            )
            if (!is.null(parsed)) {
              # Splice the MathML (outer wrapper tags stripped) in place of
              # the full delimited match.
              re <- paste(
                strsplit(re, fm[[i]], fixed = TRUE)[[1]],
                collapse = sub("^<[^>]*>", "", sub("<[^>]*>$", "", parsed))
              )
            }
          }
          # Remove the padding spaces added above.
          e[[d]] <- gsub("^ | $", "", re)
        }
      }
    }
    # Recurse into nested category/variant definitions.
    if (is.list(e$categories)) {
      e$categories <- replace_equations(e$categories)
    }
    if (is.list(e$variants)) {
      e$variants <- replace_equations(e$variants)
    }
    e
  })
}
|
| 439 | ||
| 440 |
# Normalize a categories/variants specification into a named list of lists,
# ensuring each entry has `name` and `default` fields.
#
# @param l A named list of entry lists, or a bare character vector of entry
#   names (or NULL). The literal name "blank" maps to an empty name.
# @return A named list in which every entry has `name` and `default` set.
preprocess <- function(l) {
  if (!is.list(l)) {
    # A bare character vector (or NULL) of entry names: convert to a named
    # list of empty lists. Values become names when no names are present,
    # matching sapply's USE.NAMES behavior, but without sapply's type
    # instability (lapply always returns a list).
    if (is.null(names(l))) names(l) <- l
    l <- lapply(l, function(n) list())
  }
  ns <- names(l)
  for (i in seq_along(l)) {
    # "blank" is a placeholder for an intentionally empty name.
    name <- if (ns[i] == "blank") "" else ns[i]
    l[[i]]$name <- name
    # The default rendered value falls back to the (possibly empty) name.
    if (is.null(l[[i]]$default)) l[[i]]$default <- name
  }
  l
}
|
| 452 | ||
| 453 |
# Substitute dynamic template tokens (e.g. "{category.long_name}", "{variant}")
# in string `e` with values from the category/variant source lists.
#
# @param e A single string possibly containing tokens matching `p`.
# @param p Regular expression matching whole tokens, braces included.
# @param s Category source list (named values to substitute).
# @param v Optional variant source list; when NULL, `s` serves both.
# @param default Entry name to fall back on for bare tokens like "{category}".
# @return `e` with all matched tokens replaced; aborts if a token cannot be
#   resolved against its source list.
replace_dynamic <- function(e, p, s, v = NULL, default = "default") {
  m <- gregexpr(p, e)
  # gregexpr signals "no match" with a first position of -1.
  if (m[[1]][[1]] != -1) {
    t <- regmatches(e, m)[[1]]
    # Map each full token (names) to the entry name inside it (values):
    # "{category.long_name}" -> "long_name"; a bare "{category}" -> "".
    tm <- structure(gsub("\\{[^.]+\\.?|\\}", "", t), names = t)
    tm <- tm[!duplicated(names(tm))]
    # Bare tokens resolve to the `default` entry name.
    tm[tm == ""] <- default
    for (tar in names(tm)) {
      # Pick the source list: tokens whose second character is "c"
      # (i.e. "{categor...") read from `s`; others from `v` when provided.
      us <- (if (is.null(v) || substring(tar, 2, 2) == "c") s else v)
      entry <- tm[[tar]]
      # A missing *_description entry falls back to "description". NOTE:
      # this also rebinds `default` for all subsequent tokens in this call.
      if (is.null(us[[entry]]) && grepl("description", entry, fixed = TRUE)) {
        entry <- default <- "description"
      }
      # Still missing and we were after the default entry name: try "default".
      if (is.null(us[[entry]]) && entry == default) {
        entry <- "default"
      }
      if (is.null(us[[entry]])) {
        cli::cli_abort("failed to render measure info from {tar}")
      }
      # Literal (fixed-string) replacement of the token with its value.
      e <- gsub(tar, us[[entry]], e, fixed = TRUE)
    }
  }
  e
}
|
| 477 | ||
| 478 |
# Prepare one category/variant source list for rendering: resolve any
# single-string entries against the complementary source via replace_dynamic().
#
# @param o The source list to prepare (may be empty).
# @param s The complementary source list used to resolve tokens.
# @param p Token regular expression passed through to replace_dynamic().
# @return The prepared list, or a minimal skeleton when `o` is empty.
prepare_source <- function(o, s, p) {
  # Empty input: return a minimal skeleton so downstream lookups succeed.
  if (!length(o)) {
    return(list(name = "", default = ""))
  }
  lapply(o, function(entry) {
    is_scalar_string <- is.character(entry) && length(entry) == 1
    if (is_scalar_string) {
      replace_dynamic(entry, p, s)
    } else {
      entry
    }
  })
}
|
| 487 | ||
| 488 |
# Render every dynamic entry name in `infos`, mapping each rendered key back
# to the base entry it expanded from.
#
# @param infos A named list of measure-info entries.
# @return A named character vector: values are the base entry names, names
#   are the rendered (expanded) keys.
render_info_names <- function(infos) {
  base_names <- names(infos)
  # One entry name may render to several keys, so expand each separately.
  rendered <- lapply(base_names, function(n) render_info(infos[n], TRUE))
  counts <- vapply(rendered, length, 0)
  structure(rep(base_names, counts), names = unlist(rendered))
}
|
| 492 | ||
| 493 |
# Expand a single measure-info entry whose name/fields contain dynamic
# category/variant tokens into one concrete entry per category-variant
# combination.
#
# @param info A single-element named list (the entry to render).
# @param names_only Logical; if TRUE, return only the rendered entry names.
# @return A named list of rendered entries, or a character vector of rendered
#   names when `names_only` is TRUE. Entries without categories or variants
#   are returned unchanged.
render_info <- function(info, names_only = FALSE) {
  base_name <- names(info)
  base <- info[[1]]
  # Nothing dynamic to expand: return the entry (or its name) as-is.
  if (is.null(base$categories) && is.null(base$variants)) {
    return(if (names_only) base_name else info)
  }
  categories <- preprocess(base$categories)
  variants <- preprocess(base$variants)
  # The expansion axes are consumed here; they do not appear in the output.
  base$categories <- NULL
  base$variants <- NULL
  expanded <- NULL
  # Build every category x variant pair. "PARTSEP" is a sentinel separator;
  # an empty side is represented by "".
  vars <- strsplit(
    as.character(outer(
      if (is.null(names(categories))) "" else names(categories),
      if (is.null(names(variants))) "" else names(variants),
      paste,
      sep = "PARTSEP"
    )),
    "PARTSEP",
    fixed = TRUE
  )
  for (var in vars) {
    cs <- if (var[1] == "") list() else categories[[var[1]]]
    vs <- if (length(var) == 1 || var[2] == "") list() else variants[[var[2]]]
    # Each source may itself reference tokens from the other; resolve both.
    cs <- prepare_source(cs, vs, "\\{variants?(?:\\.[^}]+?)?\\}")
    vs <- prepare_source(vs, cs, "\\{categor(?:y|ies)(?:\\.[^}]+?)?\\}")
    # Combined lookup, category entries winning on name collisions.
    s <- c(cs, vs[!names(vs) %in% names(cs)])
    # Full token pattern: "{category...}" / "{categories...}" / "{variant...}".
    p <- "\\{(?:categor(?:y|ies)|variants?)(?:\\.[^}]+?)?\\}"
    key <- replace_dynamic(base_name, p, cs, vs)
    if (names_only) {
      expanded <- c(expanded, key)
    } else {
      # `expanded` starts NULL; `[[<-` promotes it to a list on first use.
      expanded[[key]] <- c(
        structure(
          lapply(names(base), function(n) {
            e <- base[[n]]
            # Only single strings can carry tokens; `n` is the fallback
            # entry name for bare tokens in this field.
            if (is.character(e) && length(e) == 1) {
              e <- replace_dynamic(e, p, cs, vs, n)
            }
            e
          }),
          names = names(base)
        ),
        # Append source entries not already covered by the base entry,
        # excluding bookkeeping fields. NOTE(review): this assumes
        # long_description/short_description exist in `base` (e.g. filled
        # by include_empty defaults) -- confirm against callers.
        s[
          !names(s) %in%
            c(
              "default",
              "name",
              if (any(base[c("long_description", "short_description")] != "")) {
                "description"
              },
              names(base)
            )
        ]
      )
    }
  }
  expanded
}
| 1 |
#' Check Data Projects |
|
| 2 |
#' |
|
| 3 |
#' Check the data files and measure info of subprojects. |
|
| 4 |
#' |
|
| 5 |
#' @param names Name or names of projects. |
|
| 6 |
#' @param project_dir Path to the Data Collection Framework project. |
|
| 7 |
#' @param verbose Logical; if \code{FALSE}, will not print status messages.
|
|
| 8 |
#' @returns A list with an entry for each source, containing a character vector |
|
| 9 |
#' including any issue codes: |
|
| 10 |
#' \itemize{
|
|
| 11 |
#' \item \code{not_compressed}: The file does not appear to be compressed.
|
|
| 12 |
#' \item \code{cant_read}: Failed to read the file in.
|
|
| 13 |
#' \item \code{geography_nas}: The file's \code{geography} column contains NAs.
|
|
| 14 |
#' \item \code{time_nas}: The file's \code{time} column contains NAs.
|
|
| 15 |
#' \item \code{missing_info: {column_name}}: The file's indicated column does not have
|
|
| 16 |
#' a matching entry in \code{measure_info.json}.
|
|
| 17 |
#' } |
|
| 18 |
#' @examples |
|
| 19 |
#' \dontrun{
|
|
| 20 |
#' dcf_check("gtrends")
|
|
| 21 |
#' } |
|
| 22 |
#' @export |
|
| 23 | ||
| 24 |
dcf_check <- function(
  names = NULL,
  project_dir = ".",
  verbose = TRUE
) {
  # Convenience: allow `dcf_check("path/to/project")`. If `project_dir` was
  # not supplied and the single `names` value is an existing directory, treat
  # it as the project directory. (Fix: this previously tested
  # `dir.exists(project_dir)` -- the default "." always exists, which made the
  # guard vacuous and reinterpreted ANY single name as a directory.)
  if (missing(project_dir) && length(names) == 1L && dir.exists(names)) {
    project_dir <- names
    names <- NULL
  }
  # If no names were given and `project_dir` is not a framework root (no
  # settings.json), assume it points inside a subproject and derive the root
  # and subproject name from the path.
  if (
    is.null(names) && !file.exists(paste0(project_dir, "/", "settings.json"))
  ) {
    project_dir <- normalizePath(project_dir, "/", FALSE)
    if (file.exists(paste0(project_dir, "/", "process.json"))) {
      # Pointed directly at a subproject directory.
      names <- basename(project_dir)
      project_dir <- dirname(project_dir)
    } else {
      # Assume a <root>/<data_dir>/<name> layout.
      names <- basename(project_dir)
      project_dir <- dirname(dirname(project_dir))
    }
  }

  settings <- dcf_read_settings(project_dir)
  base_dir <- paste0(project_dir, "/", settings$data_dir)
  if (is.null(names)) {
    # Default to every subdirectory that carries a process.json record.
    names <- list.dirs(base_dir, recursive = FALSE, full.names = FALSE)
    names <- names[file.exists(paste0(base_dir, "/", names, "/process.json"))]
  }
  issues <- list()
  for (name in names) {
    source_dir <- paste0(base_dir, "/", name, "/")
    if (!dir.exists(source_dir)) {
      cli::cli_abort("specify the name of an existing data project")
    }
    process_file <- paste0(source_dir, "process.json")
    if (!file.exists(process_file)) {
      cli::cli_abort("{name} does not appear to be a data project")
    }
    process <- dcf_process_record(process_file)
    if (is.null(process)) next
    is_bundle <- !is.null(process$type) && process$type == "bundle"
    info_file <- paste0(source_dir, "measure_info.json")
    # Read and render measure info; NULL signals a malformed file.
    info <- tryCatch(
      dcf_measure_info(
        info_file,
        render = TRUE,
        write = FALSE,
        verbose = FALSE,
        open_after = FALSE
      ),
      error = function(e) NULL
    )
    # Measures are identified by entry name or by a `source_id` field.
    # (Safe before the NULL check: names(NULL) and lapply(NULL, ...) are
    # both empty.)
    measure_ids <- unique(c(
      names(info),
      unlist(lapply(info, "[[", "source_id"))
    ))
    if (is.null(info)) {
      cli::cli_abort("{.file {info_file}} is malformed")
    }
    if (verbose) {
      cli::cli_bullets(c("", "Checking project {.strong {name}}"))
    }
    # Standardized output lives in dist/ for bundles, standard/ otherwise.
    data_files <- list.files(
      paste0(source_dir, if (is_bundle) "dist" else "standard"),
      "\\.(?:csv|parquet|json)",
      full.names = TRUE
    )
    data_files <- data_files[!grepl("datapackage", data_files, fixed = TRUE)]
    source_issues <- list()
    # Flag uncompressed raw CSVs up front.
    for (file in list.files(
      paste0(source_dir, "raw"),
      "csv$",
      full.names = TRUE
    )) {
      source_issues[[sub(
        paste0(project_dir, "/"),
        "",
        file,
        fixed = TRUE
      )]] <- list(
        data = "not_compressed"
      )
    }
    if (length(data_files)) {
      for (file in data_files) {
        file_relative_path <- sub(
          paste0(project_dir, "/"),
          "",
          file,
          fixed = TRUE
        )
        # Drop the leading data-dir component to get the file's scoped ID.
        file_id <- sub("^[^/]+/", "", file_relative_path)
        issue_messages <- NULL
        if (verbose) {
          cli::cli_progress_step("checking file {.file {file}}", spinner = TRUE)
        }
        data_issues <- NULL
        measure_issues <- NULL
        data <- attempt_read(file, c("geography", "time"))
        if (is.null(data)) {
          data_issues <- c(data_issues, "cant_read")
        } else {
          if (grepl("csv$", file)) {
            data_issues <- c(data_issues, "not_compressed")
            if (verbose) {
              issue_messages <- c(
                issue_messages,
                "file is not compressed"
              )
            }
          }
          if (("geography" %in% colnames(data)) && anyNA(data$geography)) {
            data_issues <- c(data_issues, "geography_nas")
            if (verbose) {
              issue_messages <- c(
                issue_messages,
                "{.emph geography} column contains NAs"
              )
            }
          }
          if (("time" %in% colnames(data)) && anyNA(data$time)) {
            data_issues <- c(data_issues, "time_nas")
            if (verbose) {
              issue_messages <- c(
                issue_messages,
                "{.emph time} column contains NAs"
              )
            }
          }
          # Every non-ID column must have measure info, keyed either by its
          # plain name or its file-scoped name ("<file_id>|<column>").
          for (col in colnames(data)) {
            col_id <- paste0(file_id, "|", col)
            if (
              !(col %in% c("geography", "time")) &&
                (!(col %in% measure_ids) && !(col_id %in% measure_ids))
            ) {
              measure_issues <- c(measure_issues, paste("missing_info:", col))
              if (verbose) {
                issue_messages <- c(
                  issue_messages,
                  paste0(
                    "{.emph ",
                    col,
                    "} column does not have an entry in measure_info"
                  )
                )
              }
            }
          }
        }
        file_issues <- list()
        if (length(data_issues)) {
          file_issues$data <- data_issues
        }
        if (length(measure_issues)) {
          file_issues$measures <- measure_issues
        }
        source_issues[[file_relative_path]] <- file_issues
        if (verbose) {
          if (length(issue_messages)) {
            cli::cli_progress_done(result = "failed")
            cli::cli_bullets(structure(
              issue_messages,
              names = rep(" ", length(issue_messages))
            ))
          } else {
            cli::cli_progress_done()
          }
        }
      }
    } else {
      if (verbose) cli::cli_alert_info("no standard data files found to check")
    }
    # Persist results only when they changed, to avoid needless file churn.
    if (!identical(process$check_results, source_issues)) {
      process$checked <- Sys.time()
      process$check_results <- source_issues
      dcf_process_record(process_file, process)
    }
    issues[[name]] <- source_issues
  }

  invisible(issues)
}
| 1 |
#' Adds documentation of a dataset to a datapackage |
|
| 2 |
#' |
|
| 3 |
#' Add information about variables in a dataset to a \code{datapackage.json} metadata file.
|
|
| 4 |
#' |
|
| 5 |
#' @param filename A character vector of paths to plain-text tabular data files, relative to \code{dir}.
|
|
| 6 |
#' @param meta Information about each data file. A list with a list entry for each entry in |
|
| 7 |
#' \code{filename}; see details. If a single list is provided for multiple data files, it will apply to all.
|
|
| 8 |
#' @param packagename Package to add the metadata to; path to the \code{.json} file relative to
|
|
| 9 |
#' \code{dir}, or a list with the read-in version.
|
|
| 10 |
#' @param dir Directory in which to look for \code{filename}, and write \code{packagename}.
|
|
| 11 |
#' @param write Logical; if \code{FALSE}, returns the \code{paths} metadata without reading or rewriting
|
|
| 12 |
#' \code{packagename}.
|
|
| 13 |
#' @param refresh Logical; if \code{FALSE}, will retain any existing dataset information.
|
|
| 14 |
#' @param sha A number specifying the Secure Hash Algorithm function, |
|
| 15 |
#' if \code{openssl} is available (checked with \code{Sys.which('openssl')}).
|
|
| 16 |
#' @param pretty Logical; if \code{TRUE}, will pretty-print the datapackage.
|
|
| 17 |
#' @param summarize_ids Logical; if \code{TRUE}, will include ID columns in schema field summaries.
|
|
| 18 |
#' @param open_after Logical; if \code{TRUE}, opens the written datapackage after saving.
|
|
| 19 |
#' @param verbose Logical; if \code{FALSE}, will not show status messages.
|
|
| 20 |
#' @details |
|
| 21 |
#' \code{meta} should be a list with unnamed entries for each entry in \code{filename},
|
|
| 22 |
#' and each entry can include a named entry for any of these: |
|
| 23 |
#' \describe{
|
|
| 24 |
#' \item{source}{
|
|
| 25 |
#' A list or list of lists with entries for at least \code{name}, and ideally for \code{url}.
|
|
| 26 |
#' } |
|
| 27 |
#' \item{ids}{
|
|
| 28 |
#' A list or list of lists with entries for at least \code{variable} (the name of a variable in the dataset).
|
|
| 29 |
#' Might also include \code{map} with a list or path to a JSON file resulting in a list with an
|
|
| 30 |
#' entry for each ID, and additional information about that entity, to be read in as map features. |
|
| 31 |
#' All files will be loaded to help with aggregation, but local files will be included in the datapackage, |
|
| 32 |
#' whereas hosted files will be loaded client-side. |
|
| 33 |
#' } |
|
| 34 |
#' \item{time}{
|
|
| 35 |
#' A string giving the name of a variable in the dataset representing a repeated observation of the same entity. |
|
| 36 |
#' } |
|
| 37 |
#' \item{variables}{
|
|
| 38 |
#' A list with named entries providing more information about the variables in the dataset. |
|
| 39 |
#' See \code{\link{dcf_measure_info}}.
|
|
| 40 |
#' } |
|
| 41 |
#' \item{vintage}{
|
|
| 42 |
#' A string specifying the time and/or location at which the data were produced. |
|
| 43 |
#' } |
|
| 44 |
#' } |
|
| 45 |
#' @examples |
|
| 46 |
#' \dontrun{
|
|
| 47 |
#' # write example data |
|
| 48 |
#' write.csv(mtcars, "mtcars.csv") |
|
| 49 |
#' |
|
| 50 |
#' # add it to an existing datapackage.json file in the current working directory |
|
| 51 |
#' dcf_datapackage_add("mtcars.csv")
|
|
| 52 |
#' } |
|
| 53 |
#' @return An invisible version of the updated datapackage, which is also written to |
|
| 54 |
#' \code{datapackage.json} if \code{write = TRUE}.
|
|
| 55 |
#' @seealso Initialize the \code{datapackage.json} file with \code{\link{dcf_datapackage_init}}.
|
|
| 56 |
#' @export |
|
| 57 | ||
| 58 |
dcf_datapackage_add <- function(
  filename,
  meta = list(),
  packagename = "datapackage.json",
  dir = ".",
  write = TRUE,
  refresh = TRUE,
  sha = "512",
  pretty = FALSE,
  summarize_ids = FALSE,
  open_after = FALSE,
  verbose = interactive()
) {
  if (missing(filename)) {
    cli::cli_abort("{.arg filename} must be specified")
  }
  # Names on `filename` act as dataset names for the resources.
  setnames <- names(filename)
  # If the first path resolves as-is, treat paths as self-contained:
  # derive `dir` from the first path and reduce all to basenames.
  if (file.exists(filename[[1L]])) {
    if (dir == ".") {
      dir <- dirname(filename[[1L]])
    }
    filename <- basename(filename)
  }
  if (any(!file.exists(paste0(dir, "/", filename)))) {
    # NOTE(review): this existence check drops the `dir` prefix used just
    # above, and the message placeholder reads "(unknown)" -- both look like
    # they should report the missing `paste0(dir, "/", filename)` paths;
    # confirm against the package's cli message conventions.
    filename <- filename[!file.exists(filename)]
    cli::cli_abort("{?a file/files} did not exist: (unknown)")
  }
  # Resolve `packagename` to a path when it names an existing file in `dir`;
  # otherwise keep whatever was passed (possibly an already-read list).
  package <- if (
    is.character(packagename) && file.exists(paste0(dir, "/", packagename))
  ) {
    paste0(dir, "/", packagename)
  } else {
    packagename
  }
  if (write) {
    if (is.character(package)) {
      package <- paste0(dir, "/", packagename)
      if (file.exists(package)) {
        # Remember the resolved path for the final write, then read it in.
        packagename <- package
        package <- dcf_attempt_read_json(package)
      } else {
        # No package file yet: initialize one named after the first dataset.
        package <- dcf_datapackage_init(
          if (!is.null(setnames)) setnames[[1]] else filename[[1]],
          dir = dir
        )
      }
    }
  }
  if (!is.list(package)) {
    package <- list()
  }
  # `meta` may be (a) one list per file keyed by file/dataset name, or
  # (b) a single metadata list applying to all files (`single_meta`).
  single_meta <- FALSE
  metas <- if (!is.null(names(meta))) {
    meta_names <- if (is.null(setnames)) filename else setnames
    if (all(meta_names %in% names(meta))) {
      meta[meta_names]
    } else {
      single_meta <- TRUE
      # `variables` given as a single path: read the measure info file.
      if (length(meta$variables) == 1L && is.character(meta$variables)) {
        if (!file.exists(meta$variables)) {
          meta$variables <- paste0(dir, "/", meta$variables)
        }
        if (file.exists(meta$variables)) {
          meta$variables <- dcf_attempt_read_json(meta$variables)
        }
      }
      meta$variables <- replace_equations(meta$variables)
      meta
    }
  } else {
    meta[seq_along(filename)]
  }
  if (!single_meta) {
    # Render LaTeX in each file's variable descriptions.
    metas <- lapply(metas, function(m) {
      m$variables <- replace_equations(m$variables)
      m
    })
  }
  # Build the full datapackage resource entry for file index `file`.
  collect_metadata <- function(file) {
    f <- paste0(dir, "/", filename[[file]])
    m <- if (single_meta) meta else metas[[file]]
    # Infer storage format from the file name (fixed-substring matches).
    format <- if (grepl(".parquet", f, fixed = TRUE)) {
      "parquet"
    } else if (grepl(".json", f, fixed = TRUE)) {
      "json"
    } else if (grepl(".csv", f, fixed = TRUE)) {
      "csv"
    } else if (grepl(".rds", f, fixed = TRUE)) {
      "rds"
    } else {
      "tsv"
    }
    # NOTE(review): `format` can never be NA given the if/else chain above;
    # this guard appears to be dead code.
    if (is.na(format)) {
      format <- "rds"
    }
    info <- file.info(f)
    # NOTE(review): this `metas` shadows the outer variable, so the
    # measure-info cache below only lives for a single collect_metadata
    # call -- confirm whether cross-file caching was intended.
    metas <- list()
    # Normalize a meta entry to a list of lists (or an empty list).
    unpack_meta <- function(n) {
      if (!length(m[[n]])) {
        list()
      } else if (is.list(m[[n]][[1L]])) {
        m[[n]]
      } else {
        list(m[[n]])
      }
    }
    vintage <- unlist(unpack_meta("vintage"))
    ids <- unpack_meta("ids")
    idvars <- NULL
    for (i in seq_along(ids)) {
      if (is.list(ids[[i]])) {
        # Inline local map files so they ship inside the datapackage.
        if (
          length(ids[[i]]$map) == 1L &&
            is.character(ids[[i]]$map) &&
            file.exists(ids[[i]]$map)
        ) {
          ids[[i]]$map_content <- paste(
            readLines(ids[[i]]$map, warn = FALSE),
            collapse = ""
          )
        }
      } else {
        # Bare strings name the ID variable directly.
        ids[[i]] <- list(variable = ids[[i]])
      }
      if (!ids[[i]]$variable %in% idvars) idvars <- c(idvars, ids[[i]]$variable)
    }
    data <- attempt_read(f, c("geography", "time", idvars))
    if (is.null(data)) {
      cli::cli_warn(c(
        paste0("failed to read in the data file ({.file {f}})"),
        i = "check that it is in a compatible format"
      ))
      return(NULL)
    }
    # Preserve meaningful row names as a `_row` column.
    if (!all(rownames(data) == seq_len(nrow(data)))) {
      data <- cbind(`_row` = rownames(data), data)
    }
    timevar <- unlist(unpack_meta("time"))
    # Without a time variable, treat all rows as one time point.
    times <- if (is.null(timevar)) rep(1L, nrow(data)) else data[[timevar]]
    times_unique <- unique(times)
    varinf <- unpack_meta("variables")
    # `variables` given as a single path: load (and cache) the measure info.
    if (length(varinf) == 1L && is.character(varinf[[1L]])) {
      if (!file.exists(varinf[[1L]])) {
        varinf[[1L]] <- paste0(dir, "/", varinf[[1L]])
      }
      if (file.exists(varinf[[1L]])) {
        if (varinf[[1L]] %in% names(metas)) {
          varinf <- metas[[varinf[[1L]]]]
        } else {
          varinf <- metas[[varinf[[1L]]]] <- dcf_measure_info(
            varinf[[1L]],
            write = FALSE,
            render = TRUE
          )
        }
        # Drop empty entries.
        varinf <- varinf[varinf != ""]
      }
    }
    # Known variable names, both fully scoped and with any scope prefix
    # ("path:" / "file|") stripped.
    varinf_full <- if (is.null(names(varinf))) "" else names(varinf)
    varinf_suf <- sub("^[^:]+:", "", varinf_full)
    # NOTE(review): `created` is assigned but unused; `res$created` below
    # recomputes the same value.
    created <- as.character(info$mtime)
    res <- list(
      bytes = as.integer(info$size),
      encoding = stringi::stri_enc_detect(f)[[1L]][1L, 1L],
      md5 = tools::md5sum(f)[[1L]],
      format = format,
      # Dataset name priority: explicit set name > meta name > basename
      # with the extension removed.
      name = if (!is.null(setnames)) {
        setnames[file]
      } else if (!is.null(m$name)) {
        m$name
      } else {
        sub("\\.[^.]*$", "", basename(filename[[file]]))
      },
      filename = filename[[file]],
      versions = get_versions(f),
      source = unpack_meta("source"),
      # Any variable declaring `levels` marks the dataset as tall.
      data_format = if (
        any(vapply(
          m$variables,
          function(info) !is.null(info$levels),
          TRUE
        ))
      ) {
        "tall"
      } else {
        "wide"
      },
      ids = ids,
      # Uniform ID width, or 0 when widths vary (or no IDs).
      id_length = if (length(idvars)) {
        id_lengths <- nchar(data[[idvars[1L]]])
        id_lengths <- id_lengths[!is.na(id_lengths)]
        if (all(id_lengths == id_lengths[1L])) id_lengths[1L] else 0L
      } else {
        0L
      },
      time = timevar,
      profile = "data-resource",
      created = as.character(info$mtime),
      last_modified = as.character(info$ctime),
      vintage = if (length(vintage)) vintage else NULL,
      row_count = nrow(data),
      entity_count = if (length(idvars)) {
        length(unique(data[[idvars[1L]]]))
      } else {
        nrow(data)
      },
      schema = list(
        # Per-column field summaries; ID columns are skipped unless
        # `summarize_ids` is TRUE.
        fields = lapply(
          if (summarize_ids) {
            colnames(data)
          } else {
            colnames(data)[!colnames(data) %in% idvars]
          },
          function(cn) {
            v <- data[[cn]]
            invalid <- !is.finite(v)
            r <- list(name = cn, duplicates = sum(duplicated(v)))
            # Attach measure info: exact name, scope-stripped name, or a
            # file-scoped match ("<f>|<cn>" suffix against known names).
            if (cn %in% varinf_full) {
              r$info <- varinf[[cn]]
            } else if (cn %in% varinf_suf) {
              r$info <- varinf[[which(varinf_suf == cn)]]
            } else {
              scoped_name <- paste0(f, "|", cn)
              scoped_name <- substring(
                scoped_name,
                unique(nchar(scoped_name) - nchar(varinf_full) + 1L)
              )
              if (sum(scoped_name %in% varinf_full) == 1L) {
                r$info <- varinf[[scoped_name[
                  scoped_name %in% varinf_full
                ]]]
              }
            }
            # Unwrap a nested `info` container if present.
            if ("info" %in% names(r$info)) {
              r$info <- r$info$info
            }
            # Clean the info entries: drop empties, and collapse scoped
            # references ("a|b") to the bare column name when it exists.
            r$info <- Filter(
              length,
              lapply(
                r$info,
                function(e) {
                  if (!identical(e, "")) {
                    if (
                      is.character(e) &&
                        length(e) == 1L &&
                        grepl("|", e, fixed = TRUE)
                    ) {
                      scoped_id <- strsplit(e, "|", fixed = TRUE)[[1L]][[2L]]
                      if (scoped_id %in% colnames(data)) scoped_id else e
                    } else {
                      e
                    }
                  }
                }
              )
            )
            # Observed time range as 0-based indices into times_unique;
            # c(-1, -1) means no non-missing observations.
            su <- !is.na(v)
            if (any(su) && !is.null(times)) {
              r$time_range <- which(times_unique %in% range(times[su])) - 1L
              r$time_range <- if (length(r$time_range)) {
                r$time_range[c(1L, length(r$time_range))]
              } else {
                c(-1L, -1L)
              }
            } else {
              r$time_range <- c(-1L, -1L)
            }
            # Type-specific summary statistics.
            if (!is.character(v) && all(invalid)) {
              r$type <- "unknown"
              r$missing <- length(v)
            } else if (is.numeric(v)) {
              r$type <- if (all(invalid | as.integer(v) == v)) {
                "integer"
              } else {
                "float"
              }
              r$missing <- sum(invalid)
              r$mean <- round(mean(v, na.rm = TRUE), 6L)
              r$sd <- round(stats::sd(v, na.rm = TRUE), 6L)
              r$min <- round(min(v, na.rm = TRUE), 6L)
              r$max <- round(max(v, na.rm = TRUE), 6L)
            } else {
              r$type <- "string"
              v <- as.factor(iconv(as.character(v), to = "UTF-8"))
              r$missing <- sum(is.na(v) | is.nan(v) | grepl("^[\\s.-]$", v))
              r$table <- structure(as.list(tabulate(v)), names = levels(v))
            }
            r
          }
        )
      )
    )
    if (!single_meta) {
      # Per-file measure info (empty entries dropped) plus any references.
      res$measure_info <- lapply(m$variables, function(e) e[e != ""])
      if ("_references" %in% names(varinf)) {
        res[["_references"]] <- varinf[["_references"]]
      }
    }
    # File hash, only when the openssl binary is available.
    if (Sys.which("openssl") != "") {
      res[[paste0("sha", sha)]] <- calculate_sha(f, sha)
    }
    res
  }
  # Files that failed to read produce NULL and are filtered out.
  metadata <- Filter(length, lapply(seq_along(filename), collect_metadata))
  if (single_meta) {
    package$measure_info <- lapply(meta$variables, function(e) e[e != ""])
  }
  names <- vapply(metadata, "[[", "", "filename")
  # Merge version histories from existing resources into the new metadata,
  # dropping duplicate rows.
  for (resource in package$resources) {
    if (length(resource$versions)) {
      su <- which(names %in% resource$filename)
      if (length(su)) {
        if (length(metadata[[su]]$versions)) {
          metadata[[su]]$versions <- rbind(
            metadata[[su]]$versions,
            if (is.data.frame(resource$versions)) {
              resource$versions
            } else {
              as.data.frame(do.call(cbind, resource$versions))
            }
          )
          metadata[[su]]$versions <- metadata[[su]]$versions[
            !duplicated(metadata[[su]]$versions),
          ]
        }
      }
    }
  }
  # refresh = TRUE replaces all resources; otherwise new entries are
  # prepended and non-conflicting old entries kept.
  if (refresh) {
    package$resources <- metadata
  } else {
    package$resources <- c(
      metadata,
      package$resources[
        !(vapply(package$resources, "[[", "", "filename") %in% names)
      ]
    )
  }
  if (write) {
    packagename <- if (is.character(packagename)) {
      packagename
    } else {
      "datapackage.json"
    }
    jsonlite::write_json(
      package,
      if (file.exists(packagename)) {
        packagename
      } else {
        paste0(dir, "/", packagename)
      },
      auto_unbox = TRUE,
      digits = 6L,
      dataframe = "columns",
      pretty = pretty
    )
    if (verbose) {
      cli::cli_bullets(c(
        v = paste(
          if (refresh) "updated resource in" else "added resource to",
          "datapackage.json:"
        ),
        "*" = paste0("{.path ", packagename, "}")
      ))
      if (open_after) rstudioapi::navigateToFile(packagename)
    }
  }
  invisible(package)
}
|
| 427 | ||
| 428 |
#' Retrieve the git commit history of a single file.
#'
#' Runs `git log <file>` and parses the output into a data frame with one
#' row per commit. Returns `NULL` (invisibly, by falling off the end) when
#' git fails (e.g., not a repository) or when no well-formed entries parse.
#'
#' @param file Path to the tracked file, relative to the working directory.
#' @return A `data.frame` with columns `hash`, `author`, `date`, and
#'   `message`, or `NULL` if the log is unavailable or empty.
get_versions <- function(file) {
  log <- suppressWarnings(system2(
    "git",
    # fix: quote the path so files with spaces or shell metacharacters
    # survive the shell invocation (consistent with calculate_sha below)
    c("log", shQuote(file)),
    stdout = TRUE
  ))
  # a non-NULL "status" attribute means git exited with an error
  if (is.null(attr(log, "status"))) {
    # collapse the multi-line log, then split into per-commit chunks
    log_entries <- strsplit(paste(log, collapse = "|"), "commit ")[[
      1L
    ]]
    # each entry splits on separators / "Field:" labels into
    # hash, author, date, message; drop malformed entries
    log_entries <- do.call(
      rbind,
      Filter(
        function(x) length(x) == 4L,
        strsplit(
          log_entries[log_entries != ""],
          "\\|+(?:[^:]+:)?\\s*"
        )
      )
    )
    if (length(log_entries)) {
      colnames(log_entries) <- c(
        "hash",
        "author",
        "date",
        "message"
      )
      as.data.frame(log_entries)
    }
  }
}
|
| 459 | ||
| 460 |
#' Best-effort reader that dispatches on the file's extension.
#'
#' Tries to read `file` as RDS, Parquet, JSON, or delimited text (CSV or
#' tab-separated), returning `NULL` on any read failure rather than erroring.
#'
#' @param file Path to the data file; extension is matched anywhere in the
#'   name (case-insensitively), so compressed names like `x.csv.xz` work.
#' @param id_cols Column names to force to string type when reading
#'   delimited files (prevents e.g. FIPS codes losing leading zeros).
#' @return The parsed data (data.frame / arrow Table), or `NULL` on failure.
attempt_read <- function(file, id_cols = c("geography", "time")) {
  if (grepl("\\.rds", file, ignore.case = TRUE)) {
    tryCatch(readRDS(file), error = function(e) NULL)
  } else if (grepl("\\.parquet", file, ignore.case = TRUE)) {
    tryCatch(arrow::read_parquet(file), error = function(e) NULL)
  } else if (grepl("\\.json", file, ignore.case = TRUE)) {
    tryCatch(
      as.data.frame(jsonlite::read_json(file, simplifyVector = TRUE)),
      error = function(e) NULL
    )
  } else {
    # delimited text: choose separator from extension, then sanity-check it
    tryCatch(
      {
        sep <- if (grepl("\\.csv", file, ignore.case = TRUE)) "," else "\t"
        # read the header row only to get column names
        cols <- scan(file, "", nlines = 1L, sep = sep, quiet = TRUE)
        # a single "column" containing tabs means the extension lied;
        # re-read the header as tab-delimited
        if (length(cols) == 1L && grepl("\t", cols, fixed = TRUE)) {
          cli::cli_warn("{file} appears to be tab-delimited")
          sep <- "\t"
          cols <- scan(file, "", nlines = 1L, sep = sep, quiet = TRUE)
        }
        # gzfile() transparently handles compressed input;
        # skip = 1L because the header was already consumed above
        arrow::read_delim_arrow(
          gzfile(file),
          sep,
          col_names = cols,
          skip = 1L,
          convert_options = arrow::csv_convert_options(
            # force id columns to string so codes keep leading zeros
            col_types = arrow::schema(structure(
              lapply(id_cols, function(x) arrow::string()),
              names = id_cols
            )),
            check_utf8 = FALSE
          )
        )
      },
      error = function(e) NULL
    )
  }
}
|
| 498 | ||
| 499 |
#' Compute a SHA digest of a file via the `openssl` command-line tool.
#'
#' @param file Path to the file to hash.
#' @param level SHA variant (e.g., `256` for `-sha256`).
#' @return The hex digest as a string, or `""` when the `openssl` binary is
#'   not on the PATH or the invocation fails.
calculate_sha <- function(file, level) {
  # no openssl binary available: report an empty digest rather than erroring
  if (Sys.which("openssl") == "") {
    return("")
  }
  tryCatch(
    {
      raw_output <- system2(
        "openssl",
        c("dgst", paste0("-sha", level), shQuote(file)),
        TRUE
      )
      # output looks like "SHA256(file)= <digest>"; keep the digest part
      fields <- strsplit(raw_output, " ", fixed = TRUE)[[1L]]
      fields[2L]
    },
    error = function(e) ""
  )
}
| 1 |
#' Download CDC WISQARS Reports |
|
| 2 |
#' |
|
| 3 |
#' Download reports data from the CDC's |
|
| 4 |
#' \href{https://wisqars.cdc.gov/reports}{Web-based Injury Statistics Query and Reporting System}.
|
|
| 5 |
#' |
|
| 6 |
#' @param file File to save the report to (\code{csv} or \code{parquet}).
|
|
| 7 |
#' @param year_start Earliest year to include. |
|
| 8 |
#' @param year_end Latest year to include. |
|
| 9 |
#' @param geography State or region code. |
|
| 10 |
#' @param intent Intent ID or name: |
|
| 11 |
#' \tabular{lll}{
|
|
| 12 |
#' \code{0} \tab \code{all} \tab All \cr
|
|
| 13 |
#' \code{1} \tab \code{unintentional} \tab Unintentional \cr
|
|
| 14 |
#' \code{2} \tab \code{suicide} \tab Suicide \cr
|
|
| 15 |
#' \code{3} \tab \code{homicide} \tab Homicide \cr
|
|
| 16 |
#' \code{4} \tab \code{homicide_legal} \tab Homicide and Legal Intervention \cr
|
|
| 17 |
#' \code{5} \tab \code{undetermined} \tab Undetermined \cr
|
|
| 18 |
#' \code{6} \tab \code{legal} \tab Legal Intervention \cr
|
|
| 19 |
#' \code{8} \tab \code{violence} \tab Violence-related \cr
|
|
| 20 |
#' } |
|
| 21 |
#' @param disposition Patient disposition given nonfatal: one or multiple of \code{all} (0),
|
|
| 22 |
#' \code{treated} (1; treated and released), \code{transfered} (2), \code{hospitalized} (3), or
|
|
| 23 |
#' \code{observed} (4; observed/left AMA/unknown).
|
|
| 24 |
#' @param mechanism Mechanism code; default to \code{20810} (all injury).
|
|
| 25 |
#' Other codes appear in the URL in the \code{m} parameter when submitting the filter.
|
|
| 26 |
#' @param group_ages Logical; if \code{FALSE}, will not group ages into 5-year bins.
|
|
| 27 |
#' @param age_min Youngest age to include. |
|
| 28 |
#' @param age_max Oldest age to include. |
|
| 29 |
#' @param sex Sex groups to include: one or multiple of \code{all} (0), \code{male} (1),
|
|
| 30 |
#' \code{female} (2), or \code{unknown} (3)..
|
|
| 31 |
#' @param race Race groups to include: one or multiple of \code{all} (0), \code{white} (1),
|
|
| 32 |
#' \code{black} (2), \code{aa} (3; American Indian or Alaska Native), \code{asian} (4),
|
|
| 33 |
#' \code{pi} (5; Hawaiian Native or Pacific Islander), \code{more} (6; more than one race).
|
|
| 34 |
#' These levels apply if \code{race_reporting} is \code{single} (default) -- provide
|
|
| 35 |
#' these by index for other \code{race_reporting} levels.
|
|
| 36 |
#' @param race_reporting How to group race groups, between \code{none} (0), \code{bridge} (1),
|
|
| 37 |
#' \code{single} (2), or \code{aapi} (3).
|
|
| 38 |
#' @param ethnicity Which ethnic groups to include: one or multiple of \code{all} (0),
|
|
| 39 |
#' \code{non_hispanic} (1), \code{hispanic} (2), or \code{unknown} (3).
|
|
| 40 |
#' @param YPLL Age to use when calculating Years of Potential Life Lost. |
|
| 41 |
#' @param metro Region type filter: \code{TRUE} for only metropolitan / urban, or \code{FALSE}
|
|
| 42 |
#' for only non-metropolitan / rural. Will include all region types if \code{NULL} (default).
|
|
| 43 |
#' @param group_by One or more variables to group by. These are uppercased and sometimes |
|
| 44 |
#' abbreviated or encoded; see the \code{r1} through \code{r4} URL parameters.
|
|
| 45 |
#' @param fatal_outcome Logical; if \code{FALSE}, will return non-fatal results.
|
|
| 46 |
#' @param brain_injury_only Logical; if \code{TRUE}, will return only traumatic brain injury results.
|
|
| 47 |
#' @param include_total Logical; if \code{FALSE}, will not include totals.
|
|
| 48 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages.
|
|
| 49 |
#' @returns A list containing the parameters of the request. The returned data are written to \code{file}.
|
|
| 50 |
#' @examples |
|
| 51 |
#' file <- "../../../wisqars.csv.xz" |
|
| 52 |
#' if (file.exists(file)) {
|
|
| 53 |
#' dcf_download_wisqars(file, verbose = FALSE) |
|
| 54 |
#' vroom::vroom(file) |
|
| 55 |
#' } |
|
| 56 |
#' @export |
|
| 57 | ||
| 58 |
dcf_download_wisqars <- function(
  file,
  fatal_outcome = TRUE,
  brain_injury_only = FALSE,
  year_start = 2018,
  year_end = year_start,
  geography = "00",
  intent = "all",
  disposition = "all",
  mechanism = if (fatal_outcome) 20810 else 3000,
  group_ages = NULL,
  age_min = 0,
  age_max = 199,
  sex = "all",
  race = "all",
  race_reporting = "single",
  ethnicity = "all",
  YPLL = 65,
  metro = NULL,
  group_by = NULL,
  include_total = FALSE,
  verbose = TRUE
) {
  # Maps from menu labels to the numeric codes the WISQARS API expects.
  intents <- list(
    all = 0,
    unintentional = 1,
    violence = 8,
    homicide_legal = 4,
    homicide = 3,
    legal = 6,
    suicide = 2,
    undetermined = 5
  )
  dispositions <- list(
    all = 0,
    treated = 1,
    transfered = 2,
    hospitalized = 3,
    observed = 4
  )
  sexes <- list(
    all = 0,
    male = 1,
    female = 2,
    unknown = 3
  )
  races <- list(
    all = 0,
    white = 1,
    black = 2,
    aa = 3,
    asian = 4,
    pi = 5,
    more = 6
  )
  race_reportings <- list(
    none = 0,
    bridge = 1,
    single = 2,
    aapi = 3
  )
  ethnicities <- list(
    all = 0,
    non_hispanic = 1,
    hispanic = 2,
    unknown = 3
  )
  # Supplying custom age bounds implies un-grouped ages unless the caller
  # explicitly set group_ages.
  if (missing(group_ages) && (!missing(age_min) || !missing(age_max))) {
    group_ages <- FALSE
  }
  # Request parameters shared between fatal and non-fatal reports.
  params <- list(
    TotalLine = if (include_total) "YES" else "NO",
    intent = if (is.character(intent)) intents[[tolower(intent)]] else 0L,
    mech = mechanism,
    sex = paste(
      vapply(sex, function(l) if (is.character(l)) sexes[[l]] else l, 0),
      collapse = ","
    ),
    race = paste(
      # fix: look race names up in `races` (was `sexes`, a copy-paste bug:
      # `sexes` has no asian/pi/more keys, and maps other names to the
      # wrong codes; `races` was defined but never used)
      vapply(race, function(l) if (is.character(l)) races[[l]] else l, 0),
      collapse = ","
    ),
    race_yr = if (is.character(race_reporting))
      race_reportings[[race_reporting]] else race_reporting,
    year1 = year_start,
    year2 = year_end,
    agebuttn = if (is.null(group_ages)) "ALL" else if (group_ages) "5Yr" else
      "custom",
    fiveyr1 = age_min,
    fiveyr2 = age_max,
    c_age1 = age_min,
    c_age2 = age_max,
    groupby1 = "NONE",
    groupby2 = "NONE",
    groupby3 = "NONE",
    groupby4 = "NONE",
    groupby5 = "NONE",
    groupby6 = "NONE"
  )
  if (fatal_outcome) {
    # Parameters only accepted by the fatal (mortality) endpoint.
    params$state <- geography
    # NOTE: "ethnicty" is the API's own (misspelled) parameter name.
    params$ethnicty <- paste(
      vapply(
        ethnicity,
        function(l) if (is.character(l)) ethnicities[[l]] else l,
        0
      ),
      collapse = ","
    )
    params$ypllage <- YPLL
    params$urbrul <- if (is.null(metro)) 0 else if (metro) 1 else 2
    params$tbi <- if (brain_injury_only) 1L else 0L
  } else {
    # The non-fatal endpoint uses numbered NONE placeholders and a
    # disposition filter instead.
    params$groupby1 <- "NONE1"
    params$groupby2 <- "NONE2"
    params$groupby3 <- "NONE3"
    params$groupby4 <- "NONE4"
    params$groupby5 <- "NONE5"
    params$groupby6 <- "NONE6"
    params$outcome <- "NFI"
    params$racethn <- 0
    params$disp <- paste(
      vapply(
        disposition,
        function(l) if (is.character(l)) dispositions[[l]] else l,
        0
      ),
      collapse = ","
    )
  }
  # Requested groupings replace the NONE placeholders positionally.
  for (group in seq_along(group_by)) {
    params[[paste0("groupby", group)]] <- toupper(group_by[[group]])
  }
  # The API expects all parameter values as strings.
  params <- lapply(params, as.character)
  if (fatal_outcome) {
    params$app_id <- 1002
    params$component_id <- 1000
  }

  if (verbose) {
    # Reconstruct the equivalent interactive-tool URL for the status message.
    url <- paste0(
      "https://wisqars.cdc.gov/reports/?o=",
      if (fatal_outcome) "MORT" else "NFI"
    )
    if (!fatal_outcome) {
      url <- paste0(url, "&g=00&me=")
    }
    # API parameter name -> abbreviated URL query-parameter name.
    url_param_map <- list(
      year1 = "y1",
      year2 = "y2",
      tbi = "t",
      disp = "d",
      state = "g",
      ethnicty = "e",
      intent = "i",
      mech = "m",
      sex = "s",
      race = "r",
      agebuttn = "a",
      urbrul = "me",
      race_yr = "ry",
      ypllage = "yp",
      fiveyr1 = "g1",
      fiveyr2 = "g2",
      c_age1 = "a1",
      c_age2 = "a2",
      groupby1 = "r1",
      groupby2 = "r2",
      groupby3 = "r3",
      groupby4 = "r4",
      groupby5 = "r5",
      groupby6 = "r6"
    )
    for (k in names(params)) {
      url_key <- url_param_map[[k]]
      if (!is.null(url_key)) {
        for (value in params[[k]]) url <- paste0(url, "&", url_key, "=", value)
      }
    }
    cli::cli_alert_info("requesting report {.url {url}}")
  }

  # POST the parameters as JSON to the report endpoint.
  handler <- curl::new_handle()
  curl::handle_setheaders(handler, "Content-Type" = "application/json")
  curl::handle_setopt(
    handler,
    copypostfields = jsonlite::toJSON(
      list(parameters = params),
      auto_unbox = TRUE
    )
  )
  req <- curl::curl_fetch_memory(
    paste0(
      "https://wisqars.cdc.gov/api/cost-",
      if (fatal_outcome) "fatal" else "nonfatal"
    ),
    handle = handler
  )
  if (req$status_code == 200) {
    dir.create(dirname(file), FALSE, TRUE)
    data <- jsonlite::fromJSON(rawToChar(req$content))
    if (!length(data)) {
      cli::cli_warn("no rows in data, so no file written")
    } else {
      # Output format chosen by the file name.
      if (grepl("parquet", file)) {
        arrow::write_parquet(data, file, compression = "gzip")
      } else {
        vroom::vroom_write(data, file, ",")
      }
    }
  } else {
    cli::cli_abort("request failed: {req$status_code}")
  }
  # Return the request parameters so the call can be reproduced.
  invisible(params)
}
| 1 |
#' Initialize a Data Collection Project |
|
| 2 |
#' |
|
| 3 |
#' Establishes a new data collection framework project. |
|
| 4 |
#' |
|
| 5 |
#' @param name Name of the source. Defaults to the current directory name. |
|
| 6 |
#' @param base_dir Path to the parent of the project directory (where the \code{name}
|
|
| 7 |
#' directory should be created). If \code{name} is not specified, will treat the current
|
|
| 8 |
#' directory as \code{name}, and \code{".."} as \code{base_dir}.
|
|
| 9 |
#' @param data_dir Name of the directory to store projects in, relative to \code{base_dir}.
|
|
| 10 |
#' @param github_account Name of the GitHub account that will host the repository. |
|
| 11 |
#' @param branch Name of the repository's branch. |
|
| 12 |
#' @param repo_name Name of the repository. |
|
| 13 |
#' @param use_git Logical; if \code{TRUE}, will initialize a git repository.
|
|
| 14 |
#' @param open_after Logical; if \code{TRUE}, will open the project in a new RStudio instance.
|
|
| 15 |
#' @returns Nothing; creates default files and directories. |
|
| 16 |
#' @section Data Collection Project: |
|
| 17 |
#' |
|
| 18 |
#' A data collection project starts with a \code{settings.json} file, which
|
|
| 19 |
#' specifies where source and bundle projects live (a \code{data} subdirectory by default).
|
|
| 20 |
#' |
|
| 21 |
#' The bulk of the project will then be in the source and bundle projects, as created |
|
| 22 |
#' by the \code{\link{dcf_add_source}} and \code{\link{dcf_add_bundle}}.
|
|
| 23 |
#' |
|
| 24 |
#' Once these sub-projects are in place, they can be operated over by the |
|
| 25 |
#' \code{\link{dcf_build}}, which processes each sub-project using
|
|
| 26 |
#' \code{\link{dcf_process}}, and checks them with \code{\link{dcf_check}},
|
|
| 27 |
#' resulting in a report. |
|
| 28 |
#' |
|
| 29 |
#' @examples |
|
| 30 |
#' base_dir <- tempdir() |
|
| 31 |
#' dcf_init("project_name", base_dir)
|
|
| 32 |
#' list.files(paste0(base_dir, "/project_name")) |
|
| 33 |
#' |
|
| 34 |
#' @export |
|
| 35 | ||
| 36 |
dcf_init <- function(
  name,
  base_dir = ".",
  data_dir = "data",
  github_account = "",
  branch = "main",
  repo_name = name,
  use_git = TRUE,
  open_after = FALSE
) {
  # With no name, treat the current directory as the project directory.
  if (missing(name)) {
    base_dir <- normalizePath(base_dir, "/", FALSE)
    name <- basename(base_dir)
  } else {
    # Sanitize the name for use as a directory / repository name.
    name <- gsub("[^A-Za-z0-9]+", "_", name)
  }
  base_path <- paste0(base_dir, "/", name, "/")
  dir.create(base_path, showWarnings = FALSE, recursive = TRUE)
  # Default files to create (each only written if absent).
  paths <- paste0(
    base_path,
    c(
      "project.Rproj",
      "settings.json",
      "README.md",
      "scripts/build.R",
      ".github/workflows/build.yaml",
      ".gitignore"
    )
  )
  # RStudio project file (skipped if any .Rproj already exists).
  if (
    !file.exists(paths[[1L]]) && !length(list.files(base_path, "\\.Rproj$"))
  ) {
    writeLines("Version: 1.0\n", paths[[1L]])
  }
  # Project settings.
  if (!file.exists(paths[[2L]])) {
    jsonlite::write_json(
      list(
        name = name,
        data_dir = data_dir,
        github_account = github_account,
        # fix: was hardcoded "main", silently ignoring the `branch` argument
        branch = branch,
        repo_name = repo_name
      ),
      paths[[2L]],
      auto_unbox = TRUE,
      pretty = TRUE
    )
  }
  # README with basic usage instructions.
  if (!file.exists(paths[[3L]])) {
    writeLines(
      paste0(
        c(
          paste("#", name),
          "",
          "This is a Data Collection Framework project, initialized with `dcf::dcf_init`.",
          "",
          # fix: "You can us the" -> "You can use the"
          "You can use the `dcf` package to check the source projects:",
          "",
          "```R",
          "dcf::dcf_check()",
          "```",
          "",
          "And process them:",
          "",
          "```R",
          "dcf::dcf_process()",
          "```"
        ),
        collapse = "\n"
      ),
      paths[[3L]]
    )
  }
  # Build script used by the CI workflow.
  if (!file.exists(paths[[4L]])) {
    dir.create(dirname(paths[[4L]]), showWarnings = FALSE)
    writeLines(
      paste(c("library(dcf)", "dcf_build()"), collapse = "\n"),
      paths[[4L]]
    )
  }
  # GitHub Actions workflow, copied from the package's template.
  if (!file.exists(paths[[5L]])) {
    dir.create(dirname(paths[[5L]]), recursive = TRUE, showWarnings = FALSE)
    file.copy(system.file("workflows/build.yaml", package = "dcf"), paths[[5L]])
  }
  if (use_git) {
    dcf_init_git(base_path)
    if (!file.exists(paths[[6L]])) {
      writeLines(
        paste(
          c(
            "*.Rproj",
            ".Rproj.user",
            "*.Rprofile",
            "*.Rhistory",
            "*.Rdata",
            ".DS_Store",
            "renv"
          ),
          collapse = "\n"
        ),
        paths[[6L]]
      )
    }
  }
  if (open_after) rstudioapi::openProject(paths[[1L]], newSession = TRUE)
}
| 1 |
#' Download Medicare Disparities Data |
|
| 2 |
#' |
|
| 3 |
#' Download data from the Centers for Medicare & Medicaid Services (CMS) |
|
| 4 |
#' \href{https://data.cms.gov/tools/mapping-medicare-disparities-by-population}{Mapping Medicare Disparities by Population} (MMD) tool.
|
|
| 5 |
#' |
|
| 6 |
#' @param measure Name or letter code of the measure to download. |
|
| 7 |
#' @param population The population code; either \code{f} (Medicare Fee For Service) or
|
|
| 8 |
#' \code{m} (Medicare Advantage).
|
|
| 9 |
#' @param year Year(s) to download (e.g., \code{2015:2020}). If not specified, all available
|
|
| 10 |
#' years will be included. |
|
| 11 |
#' @param geography Geography code(s) to include, between \code{n} (national), \code{s} (state),
|
|
| 12 |
#' and \code{c} (county). If not specified, all available geographies will be included.
|
|
| 13 |
#' @param adjust,condition,sex,age,race,dual_elig,medicare_elig One or more codes indicating |
|
| 14 |
#' the variable levels to include (see \code{\link{dcf_standardize_cmsmmd}}).
|
|
| 15 |
#' If \code{"."}, values will be across all levels, whereas if \code{NULL}, all available levels
|
|
| 16 |
#' will be included (aggregated and disaggregated). See the Making Requests section. |
|
| 17 |
#' @param refresh_codebook Logical; if \code{TRUE}, will re-download the codebook even if it
|
|
| 18 |
#' exists in the temporary location (which is cleared each R session). |
|
| 19 |
#' @param codebook_only Logical; if \code{TRUE}, will return the codebook without downloading data.
|
|
| 20 |
#' @param row_limit Maximum number of rows to return in each request. The API limit appears to be 100,000. |
|
| 21 |
#' @param out_file Path to the CSV or Parquet file to write data to. |
|
| 22 |
#' @param state The codebook state (MD5 hash) recorded during a previous download; |
|
| 23 |
#' if provided, will only download if the new state does not match. |
|
| 24 |
#' @param parquet Logical; if \code{TRUE}, will convert the downloaded CSV file to Parquet.
|
|
| 25 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages.
|
|
| 26 |
#' @section Making Requests: |
|
| 27 |
#' The API operates over several large files, partitioned by measure, year, |
|
| 28 |
#' adjust, and dual and medicaid eligibility. These are identified with the codebook |
|
| 29 |
#' (\code{dcf_download_cmsmmd(codebook_only = TRUE)}).
|
|
| 30 |
#' |
|
| 31 |
#' The files are larger than the API's limit, so requests for each file have to be broken up |
|
| 32 |
#' by the other variables within them (sex, age, race, and condition). |
|
| 33 |
#' |
|
| 34 |
#' For best performance, make requests as big as possible while staying under 100,000 rows |
|
| 35 |
#' each (e.g., by setting \code{sex}, \code{age}, or \code{race} to \code{NULL}).
|
|
| 36 |
#' @returns \code{dcf_download_cmsmmd}: A list:
|
|
| 37 |
#' \itemize{
|
|
| 38 |
#' \item \strong{\code{codebook}}: The codebook.
|
|
| 39 |
#' \item \strong{\code{codebook_state}}: MD5 hash of the codebook.
|
|
| 40 |
#' \item \strong{\code{data}}: The downloaded data.
|
|
| 41 |
#' } |
|
| 42 |
#' @examples |
|
| 43 |
#' # find the codes associated with menu values |
|
| 44 |
#' variable_codes <- dcf_standardize_cmsmmd() |
|
| 45 |
#' variable_codes[c( |
|
| 46 |
#' "sex", "age", "race", |
|
| 47 |
#' "adjust", "dual_elig", "medicare_elig" |
|
| 48 |
#' )] |
|
| 49 |
#' |
|
| 50 |
#' # look at the codebook which defines source files |
|
| 51 |
#' codebook <- dcf_download_cmsmmd(codebook_only = TRUE) |
|
| 52 |
#' codebook |
|
| 53 |
#' |
|
| 54 |
#' \dontrun{
|
|
| 55 |
#' # download data |
|
| 56 |
#' downloaded <- dcf_download_cmsmmd( |
|
| 57 |
#' "preventive care", |
|
| 58 |
#' population = "f", |
|
| 59 |
#' race = ".", |
|
| 60 |
#' sex = ".", |
|
| 61 |
#' age = NULL, |
|
| 62 |
#' condition = c(83, 85, 86, 88, 89, 95, 101, 102, 104, 105:107), |
|
| 63 |
#' adjust = 1 |
|
| 64 |
#' ) |
|
| 65 |
#' |
|
| 66 |
#' # convert codes to levels |
|
| 67 |
#' data_standard <- dcf_standardize_cmsmmd(downloaded$data) |
|
| 68 |
#' } |
|
| 69 |
#' @export |
|
| 70 | ||
| 71 |
dcf_download_cmsmmd <- function(
  measure,
  population = NULL,
  year = NULL,
  geography = NULL,
  adjust = NULL,
  condition = NULL,
  sex = c(1:2, "."),
  age = c(0:4, "."),
  race = c(1:6, "."),
  dual_elig = ".",
  medicare_elig = ".",
  refresh_codebook = FALSE,
  codebook_only = FALSE,
  row_limit = 9999999,
  out_file = NULL,
  state = NULL,
  parquet = FALSE,
  verbose = TRUE
) {
  # load codebook (cached in the session's tempdir to avoid re-downloading)
  codebook_file <- paste0(tempdir(), "/codebook_crosswalk.csv")
  if (refresh_codebook || !file.exists(codebook_file)) {
    if (verbose) cli::cli_progress_step("retrieving codebook")
    codebook_req <- curl::curl_fetch_disk(
      "https://data.cms.gov/mmd-population/assets/codebook_crosswalk.csv",
      codebook_file
    )
    if (codebook_req$status_code != 200) {
      # remove the partial/failed download so a later call retries
      unlink(codebook_file)
      cli::cli_abort("failed to retrieve codebook: {codebook_req$status_code}")
    }
    if (verbose) cli::cli_progress_done()
  }
  # hash the codebook; callers can pass this back as `state` to skip
  # re-downloading when the codebook is unchanged
  new_state <- tools::md5sum(codebook_file)
  if (!is.null(state)) {
    if (state == new_state) {
      if (verbose) {
        cli::cli_alert_info("codebook has not changed since last download")
      }
      return(invisible(NULL))
    }
  }
  # code columns are read as character so codes keep their exact form
  codebook <- vroom::vroom(
    codebook_file,
    col_types = list(
      elig = "c",
      race_code = "c",
      sex_code = "c",
      adjust = "c",
      dual = "c"
    ),
    na = " "
  )
  codebook$description <- tolower(codebook$description)

  if (codebook_only) return(codebook)

  # identify source(s)
  if (missing(measure)) {
    cli::cli_abort("specify a measure: {.value {unique(codebook$description)}}")
  }

  measure_descriptions <- unique(codebook$description)
  # a single letter is a measure code; anything longer is a pattern
  # matched against measure descriptions
  if (nchar(measure) == 1L) {
    codebook <- codebook[codebook$measure == tolower(measure), ]
  } else {
    codebook <- codebook[grepl(measure, codebook$description), ]
  }
  if (!nrow(codebook)) {
    cli::cli_abort(
      paste(
        'measure "{measure}" does not match the available measures:',
        "{.value {measure_descriptions}}"
      )
    )
  }

  # default to the first available population for the selected measure
  if (is.null(population)) population <- codebook$population[[1L]]
  codebook <- codebook[
    filter_codebook(codebook$population, population, "population"),
  ]

  if (!is.null(year)) {
    if (is.numeric(year)) year <- as.character(year)
    # the codebook stores years as 1- or 2-digit suffixes; trim 4-digit
    # years, keeping 1 digit before the population-specific cutoff year
    # and 2 digits after it
    if (any(nchar(year) > 2L)) {
      year <- substring(
        year,
        nchar(year) -
          as.integer(as.character(cut(
            as.numeric(year),
            c(
              -Inf,
              if (population == "m") 2014L else 2019L,
              Inf
            ),
            c(0L, 1L)
          )))
      )
    }
    codebook <- codebook[
      filter_codebook(codebook$year, year, "year"),
    ]
  }
  # when eligibility filters are left at defaults, follow whatever the
  # first remaining codebook row uses
  if (missing(dual_elig)) dual_elig <- codebook$dual[[1L]]
  if (!is.null(dual_elig)) {
    codebook <- codebook[
      filter_codebook(codebook$dual, dual_elig, "dual_elig"),
    ]
  }
  if (missing(medicare_elig)) medicare_elig <- codebook$elig[[1L]]
  if (!is.null(medicare_elig)) {
    codebook <- codebook[
      filter_codebook(codebook$elig, medicare_elig, "medicare_elig"),
    ]
  }

  # make requests: one per combination of the non-NULL filter variables
  # crossed with each source file URL
  param_sets <- expand.grid(
    Filter(
      length,
      list(
        fltr = adjust,
        agecat = age,
        sexcat = sex,
        racecat = race,
        condition = condition,
        "_source" = unique(codebook$url)
      )
    ),
    stringsAsFactors = FALSE
  )
  # "." / "all" select aggregate rows, which the API stores as "." or NULL
  param_sets[param_sets == "." | param_sets == "all"] <- ".|IS NULL"
  data_url <- paste0(
    "https://data.cms.gov/data-api/v1/mmd-tool/?_size=",
    row_limit,
    "&"
  )
  n_requests <- nrow(param_sets)
  all_data <- list()
  if (verbose) cli::cli_h1("making requests to {data_url}")
  for (i in seq_len(n_requests)) {
    params <- as.list(param_sets[i, ])
    param_string <- paste0(names(params), "=", params, collapse = "&")
    req <- curl::curl_fetch_memory(utils::URLencode(paste0(
      data_url,
      param_string
    )))
    if (req$status_code != 200) {
      cli::cli_abort(
        "a request failed: ({req$status_code}) {rawToChar(req$content)}"
      )
    }
    all_data[[i]] <- jsonlite::fromJSON(rawToChar(req$content))
    if (verbose)
      cli::cli_progress_step(
        paste0(
          i,
          " of ",
          n_requests,
          " (",
          nrow(all_data[[i]]),
          " rows): ",
          param_string
        ),
        spinner = TRUE
      )
  }
  if (verbose) cli::cli_progress_done()

  # combine per-request results into one data frame
  all_data <- do.call(rbind, all_data)
  if (!is.null(out_file)) {
    dir.create(dirname(out_file), showWarnings = FALSE, recursive = TRUE)
    if (parquet || grepl(".parquet", out_file, fixed = TRUE)) {
      if (verbose) cli::cli_progress_step("writing to Parquet")
      arrow::write_parquet(
        all_data,
        compression = "gzip",
        sub(".csv", ".parquet", out_file, fixed = TRUE)
      )
    } else {
      if (verbose) cli::cli_progress_step("writing to CSV")
      vroom::vroom_write(all_data, out_file, ",")
    }
  }
  invisible(list(
    codebook_state = new_state,
    codebook = codebook,
    data = all_data
  ))
}
|
| 262 | ||
| 263 |
#' Build a logical mask selecting codebook rows whose value of `x` appears
#' in `values`; abort with an informative message (naming `column` and
#' listing the available values) when nothing matches.
filter_codebook <- function(x, values, column) {
  keep <- x %in% values
  if (any(keep)) {
    return(keep)
  }
  cli::cli_abort(
    paste(
      "{.arg {column}} does not contain specified values.",
      "Available values: {.value {unique(x)}}"
    )
  )
}
|
| 275 | ||
| 276 |
#' @rdname dcf_download_cmsmmd |
|
| 277 |
#' @param raw_data The raw data as downloaded with \code{dcf_download_cmsmmd} to be standardized.
|
|
| 278 |
#' @returns \code{dcf_standardize_cmsmmd}: If \code{raw_data} is \code{NULL}, a list
|
|
| 279 |
#' with an entry for each API parameter, containing named vectors with level codes as names |
|
| 280 |
#' mapping to level values (as they appear in the tool's menus). |
|
| 281 |
#' Otherwise, a version of \code{raw_data} with coded values converted to labels.
|
|
| 282 |
#' @export |
|
| 283 | ||
| 284 |
dcf_standardize_cmsmmd <- function(raw_data = NULL) {
  # Menu definitions bundled with the package;
  # extracted from https://data.cms.gov/mmd-population/js/menus.js
  menu <- jsonlite::read_json(
    system.file(
      "support_data/cms_mmd_levels.json.gz",
      package = "dcf"
    )
  )
  # For each menu, build a named character vector mapping level codes
  # (option "val") to display labels (option "disp"), dropping options
  # that have no value.
  levels <- lapply(
    menu,
    function(e) {
      options <- Filter(
        length,
        lapply(e$options, function(o) {
          if (length(o$val)) {
            o$val <- as.character(o$val)
            o
          }
        })
      )
      c(
        structure(
          vapply(options, "[[", "", "disp"),
          names = vapply(options, "[[", "", "val")
        )
      )
    }
  )
  names(levels) <- vapply(menu, "[[", "", "id")
  # the "n" (national) geography is not in the menu; add it
  levels$geography["n"] <- "National"
  # alias menu ids to the parameter names used by the download API,
  # so both code sets can be standardized with the same table
  levels$fltr <- levels$adjust
  levels$sex <- levels$sexcat <- levels$sex_code
  levels$race <- levels$racecat <- levels$race_code
  levels$age <- levels$agecat <- levels$age_group
  levels$eligcat <- levels$medicare_elig <- levels$eligibility
  levels$dual_elig <- levels$dual
  # without data, return the full code -> label mapping
  if (is.null(raw_data)) return(levels)
  # otherwise, replace coded values in matching columns with labels;
  # empty strings are treated as "." (the aggregate "All" level)
  for (col in colnames(raw_data)) {
    col_levels <- levels[[col]]
    if (!is.null(col_levels)) {
      values <- as.character(raw_data[[col]])
      values[values == ""] <- "."
      raw_data[[col]] <- c(col_levels, "." = "All")[values]
    }
  }
  raw_data
}
| 1 |
#' Create a datapackage.json template
#'
#' Initialize dataset documentation with a \code{datapackage.json} template, based on a
#' \href{https://specs.frictionlessdata.io/data-package}{Data Package} standard.
#'
#' @param name A unique name for the dataset; allowed characters are \code{[a-z._/-]}.
#' @param title A display name for the dataset; if not specified, will be a formatted version of \code{name}.
#' @param dir Directory in which to save the \code{datapackage.json} file.
#' @param licenses A list or list of lists with a license definition; see
#' \href{https://specs.frictionlessdata.io/data-package/#licenses}{Data Package Licenses}.
#' @param ... passes arguments to \code{\link{dcf_datapackage_add}}.
#' @param write Logical; if \code{FALSE}, the package object will not be written to a file.
#' @param overwrite Logical; if \code{TRUE} and \code{write} is \code{TRUE}, an existing
#' \code{datapackage.json} file will be overwritten.
#' @param quiet Logical; if \code{TRUE}, will not print messages or navigate to files.
#' @examples
#' \dontrun{
#' # make a template datapackage.json file in the current working directory
#' dcf_datapackage_init("mtcars", "Motor Trend Car Road Tests")
#' }
#' @return An invisible list with the content written to the \code{datapackage.json} file.
#' @seealso Add basic information about a dataset with \code{\link{dcf_datapackage_add}}.
#' @export
dcf_datapackage_init <- function(
  name,
  title = name,
  dir = ".",
  licenses = list(),
  ...,
  write = TRUE,
  overwrite = FALSE,
  quiet = !interactive()
) {
  if (is.null(name)) {
    cli::cli_abort("{.arg name} must be provided")
  }
  package <- list(
    name = name,
    # when no explicit title was given (title defaults to name), derive one:
    # replace separators with spaces and title-case each word (\U uppercases
    # the captured letter; requires perl = TRUE)
    title = if (title == name) {
      gsub("\\b(\\w)", "\\U\\1", gsub("[._/-]", " ", name), perl = TRUE)
    } else {
      title
    },
    # a single (named) license definition is wrapped so `licenses` is
    # always a list of definitions in the output JSON
    licenses = if (is.null(names(licenses))) licenses else list(licenses),
    resources = list()
  )
  package_path <- normalizePath(paste0(dir, "/datapackage.json"), "/", FALSE)
  # refuse to clobber an existing file unless overwrite was requested
  if (write && !overwrite && file.exists(package_path)) {
    cli::cli_abort(c(
      "datapackage ({.path {package_path}}) already exists",
      i = "add {.code overwrite = TRUE} to overwrite it"
    ))
  }
  # any extra arguments are treated as a first resource definition
  if (length(list(...))) {
    package$resources <- dcf_datapackage_add(..., dir = dir, write = FALSE)
  }
  if (write) {
    if (!dir.exists(dir)) {
      dir.create(dir, recursive = TRUE)
    }
    jsonlite::write_json(
      package,
      package_path,
      auto_unbox = TRUE,
      digits = 6,
      pretty = TRUE
    )
    if (!quiet) {
      cli::cli_bullets(c(
        v = "created metadata template for {name}:",
        "*" = paste0("{.path ", package_path, "}")
      ))
      # NOTE(review): assumes an RStudio session; rstudioapi calls fail
      # outside one — only reached when quiet = FALSE (default requires
      # interactive()), but confirm rstudioapi availability is guaranteed
      rstudioapi::navigateToFile(package_path)
    }
  }
  invisible(package)
}
| 1 |
#' Adds a Bundle Project
#'
#' Establishes a new data bundle project, used to prepare outputs from standardized datasets.
#'
#' @param name Name of the bundle.
#' @param project_dir Path to the Data Collection Framework project.
#' @param source_files A list or character vector, with names as paths to standard files
#' from source projects (relative to the project's data directory), and distribution file
#' names as entries. This associates input with output files, allowing for calculation of
#' a source state, and metadata inheritance from source files.
#' @param open_after Logical; if \code{FALSE}, will not open the project.
#' @param use_git Logical; if \code{TRUE}, will initialize a git repository.
#' @param use_workflow Logical; if \code{TRUE}, will add a GitHub Actions workflow.
#' @returns Nothing; creates default files and directories.
#' @section Project Definition:
#'
#' The \strong{\code{process.json}} file defines the project with some initial attributes:
#' \itemize{
#'   \item \code{type} Always \code{bundle} to define this as a bundle project.
#'   \item \code{name} Name of the project.
#'   \item \code{scripts} List of script definitions.
#'   \item \code{source_files} A character array of paths to other files
#'     used within the scripts, relative to the overall project's \code{data} directory.
#'   \item \code{standard_state} State of the \code{source_files}: A list
#'     with keys as the file paths, relative to the overall project root, and values
#'     as the MD5 hash of those files.
#'   \item \code{dist_state} State of the \code{dist} directory: A list
#'     with keys as the file paths, relative to the overall project root, and values
#'     as the MD5 hash of those files.
#'   \item \code{checked} Timestamp when the project was last checked with \code{\link{dcf_check}}.
#'   \item \code{check_results} Results of the last check.
#' }
#'
#' Each \strong{\code{scripts}} entry points to a script to be run, with one default:
#' \itemize{
#'   \item \code{path} path to the script, relative to this project's root.
#'   \item \code{last_run} Timestamp of the last processing.
#'   \item \code{run_time} How long the script took to run last, in milliseconds.
#'   \item \code{last_status} Status of the last run; a list with entries for
#'     \code{success} (logical) and \code{log} (output of the script).
#' }
#'
#' @section Project Files:
#'
#' Within a bundle project, there are two files to edit:
#' \itemize{
#'   \item \strong{\code{build.R}}: This is the primary script, which is automatically rerun.
#'     It should read data from the \code{standard} directory of source projects,
#'     and write to its own \code{dist} directory.
#'   \item \strong{\code{measure_info.json}}: This should list all non-ID variable names
#'     in the data files within \code{dist}. These will inherit the standard measure info
#'     if found in the source projects referred to in \code{source_files}.
#'     If the \code{dist} name is different, but should still inherit standard measure info,
#'     a \code{source_id} entry with the original measure ID will be used to identify the original
#'     measure info.
#'     See \code{\link{dcf_measure_info}}.
#' }
#'
#' @examples
#' project_dir <- paste0(tempdir(), "/temp_project")
#' dcf_init("temp_project", dirname(project_dir))
#' dcf_add_bundle("bundle_name", project_dir)
#' list.files(paste0(project_dir, "/data/bundle_name"))
#'
#' @export
dcf_add_bundle <- function(
  name,
  project_dir = ".",
  source_files = NULL,
  open_after = interactive(),
  use_git = TRUE,
  use_workflow = FALSE
) {
  if (missing(name)) {
    cli::cli_abort("specify a name")
  }
  # sanitize the bundle name for use as a directory name
  name <- gsub("[^A-Za-z0-9]+", "_", name)
  is_standalone <- !file.exists(paste0(project_dir, "/settings.json"))
  data_dir <- dcf_read_settings(project_dir)$data_dir
  base_dir <- paste0(project_dir, "/", data_dir)
  base_path <- paste0(base_dir, "/", name, "/")
  if (!is.null(source_files)) {
    # when `source_files` is named, the names are the source paths and the
    # values are distribution file names; otherwise the entries themselves
    # are the source paths
    source_paths <- if (is.null(names(source_files))) {
      source_files
    } else {
      names(source_files)
    }
    su <- !file.exists(paste0(data_dir, "/", source_paths))
    if (any(su)) {
      # fix: previously interpolated `settings$data_dir[su]`, but no
      # `settings` object exists in this function; report the missing
      # source paths instead
      missing_files <- source_paths[su]
      cli::cli_abort(
        "source file{? doesn't/s don't} exist: {missing_files}"
      )
    }
  }
  dir.create(paste0(base_path, "/dist"), showWarnings = FALSE, recursive = TRUE)
  paths <- paste0(
    base_path,
    "/",
    c(
      "README.md",
      "project.Rproj",
      "process.json",
      "measure_info.json",
      "build.R",
      # NOTE(review): this duplicates entry 1, so the alternate README
      # template below (paths[[6L]]) is never written once the first README
      # exists — confirm whether a different filename was intended
      "README.md",
      ".gitignore",
      ".github/workflows/process.yaml"
    )
  )
  # README.md
  if (!file.exists(paths[[1L]])) {
    writeLines(
      paste0(
        c(
          paste("#", name),
          "",
          "This is a Data Collection Framework data bundle project, initialized with `dcf::dcf_add_bundle`.",
          "",
          "You can use the `dcf` package to rebuild the bundle:",
          "",
          "```R",
          paste0('dcf::dcf_process("', name, '", "..")'),
          "```"
        ),
        collapse = "\n"
      ),
      paths[[1L]]
    )
  }
  # project.Rproj
  if (!file.exists(paths[[2L]])) {
    writeLines("Version: 1.0\n", paths[[2L]])
  }
  # process.json: initial project definition (see Project Definition section)
  if (!file.exists(paths[[3L]])) {
    jsonlite::write_json(
      list(
        name = name,
        type = "bundle",
        scripts = list(
          list(
            path = "build.R",
            last_run = "",
            run_time = "",
            last_status = list(log = "", success = TRUE)
          )
        ),
        # named input (source path -> dist name) is stored as a list;
        # an unnamed vector is stored as-is
        source_files = if (!is.null(names(source_files)))
          as.list(source_files) else source_files
      ),
      paths[[3L]],
      auto_unbox = TRUE,
      pretty = TRUE
    )
  }
  # measure_info.json: empty object to be filled in by the author
  if (!file.exists(paths[[4L]])) {
    writeLines("{}\n", paths[[4L]])
  }
  # build.R: stub for the primary bundle script
  if (!file.exists(paths[[5L]])) {
    writeLines(
      paste0(
        c(
          "# read data from data source projects",
          "# and write to this project's `dist` directory",
          ""
        ),
        collapse = "\n"
      ),
      paths[[5L]]
    )
  }
  # alternate README (currently unreachable; see NOTE above on paths)
  if (!file.exists(paths[[6L]])) {
    writeLines(
      paste0(
        c(
          paste("#", name),
          "",
          "This is a dcf data bundle project, initialized with `dcf::dcf_add_bundle`.",
          "",
          "You can use the `dcf` package to check the project:",
          "",
          "```R",
          "dcf_check()",
          "```",
          "",
          "And process it:",
          "",
          "```R",
          "dcf_process()",
          "```"
        ),
        collapse = "\n"
      ),
      paths[[6L]]
    )
  }
  # git setup only applies to standalone bundles (no parent settings.json)
  if (is_standalone) {
    if (use_git) {
      dcf_init_git(base_path)
      if (!file.exists(paths[[7L]])) {
        writeLines(
          paste(
            c(
              "*.Rproj",
              ".Rproj.user",
              "*.Rprofile",
              "*.Rhistory",
              "*.Rdata",
              ".DS_Store",
              "renv"
            ),
            collapse = "\n"
          ),
          paths[[7L]]
        )
      }
    }
    if (use_workflow && !file.exists(paths[[8L]])) {
      dir.create(dirname(paths[[8L]]), recursive = TRUE, showWarnings = FALSE)
      file.copy(
        system.file("workflows/build.yaml", package = "dcf"),
        paths[[8L]]
      )
    }
  }
  if (open_after) rstudioapi::openProject(paths[[2L]], newSession = TRUE)
}
| 1 |
#' Read Epic Cosmos Data
#'
#' Read in metadata and data from an Epic Cosmos file.
#'
#' @param path Path to the file.
#' @param path_root Directory containing \code{path}, if it is not full.
#' @param standard_names A character vector with standard dataset names as names, and
#' fixed patterns to search for in the metadata as values (in lowercase; e.g.,
#' \code{c(condition = "condition name")}).
#' These take precedence over the existing set of standard names, so make sure
#' the pattern is sufficiently specific to the target dataset.
#' @returns A list with \code{data.frame} entries for \code{metadata} and \code{data}.
#'
#' @examples
#' # write an example file
#' path <- tempfile(fileext = ".csv")
#' raw_lines <- c(
#'   "metadata field,metadata value,",
#'   ",,",
#'   ",Measures,Value Name",
#'   "Year,Measure 1,",
#'   "2020,m1,1",
#'   ",m2,2",
#'   "2021,m1,3",
#'   ",m2,4"
#' )
#' writeLines(raw_lines, path)
#'
#' # read it in
#' dcf_read_epic(basename(path), dirname(path))
#'
#' @export
dcf_read_epic <- function(path, path_root = ".", standard_names = NULL) {
  # resolve `path` against `path_root` only when it does not exist as given
  full_path <- if (file.exists(path)) {
    path
  } else {
    sub("//", "/", paste0(path_root, "/", path), fixed = TRUE)
  }
  # only the first 25 lines are needed to locate metadata and headers
  lines <- readLines(full_path, n = 25L, skipNul = FALSE)
  # blank (comma/space-only) lines separate the metadata block from the data
  metadata_break <- grep("^[, ]*$", lines)
  if (!length(metadata_break)) {
    cli::cli_abort(
      "path does not appear to point to a file in the Epic format (no metadata separation)"
    )
  }
  meta_end <- min(metadata_break) - 1L
  # NOTE(review): for multiple breaks, `metadata_break == c(-1L,
  # metadata_break[-1L])` is TRUE for every element after the first, so this
  # reduces to max(metadata_break); confirm whether consecutive-run detection
  # was intended here
  data_start <- (if (length(metadata_break) == 1L) {
    metadata_break
  } else {
    max(metadata_break[
      metadata_break == c(-1L, metadata_break[-1L])
    ])
  }) +
    1L
  # assemble metadata: fixed file-level entries plus key-value pairs parsed
  # from the pre-break lines ("key,value" or "key: value" style)
  meta <- c(
    list(
      file = path,
      md5 = unname(tools::md5sum(full_path)),
      date_processed = Sys.time(),
      standard_name = ""
    ),
    as.list(unlist(lapply(
      strsplit(sub(",+$", "", lines[seq_len(meta_end)]), ",", fixed = TRUE),
      function(r) {
        l <- list(paste(r[-1L], collapse = ","))
        # no comma-delimited value; retry splitting on ": "
        if (l[[1]] == "") {
          r <- strsplit(r, ": ", fixed = TRUE)[[1L]]
          l <- list(paste(r[-1L], collapse = ","))
        }
        names(l) <- r[[1L]]
        # strip surrounding quotes from the value
        l[[1L]] <- gsub('^"|"$', "", l[[1L]])
        l
      }
    )))
  )
  # user-supplied patterns come first, so they take precedence over the
  # built-in dataset patterns; first match (below) wins
  standard_names <- c(
    tolower(standard_names),
    vaccine_mmr = "mmr receipt",
    rsv_tests = "rsv tests",
    flu = "influenza",
    self_harm = "self-harm",
    covid = "covid",
    rsv = "rsv",
    obesity = "bmi",
    obesity = "obesity",
    hba1c = "hba1c",
    ed_opioid = "opioid",
    ed_firearm = "firearm",
    ed_workplace = "workplace",
    ed_fall = "diagnoses: fall",
    ed_drowning = "drowning",
    all_encounters = "all ed encounters",
    all_patients = "all patients"
  )
  standard_names <- standard_names[!duplicated(standard_names)]
  meta_string <- tolower(paste(unlist(meta), collapse = " "))
  for (i in seq_along(standard_names)) {
    if (grepl(standard_names[[i]], meta_string, fixed = TRUE)) {
      meta$standard_name = names(standard_names)[[i]]
      break
    }
  }
  # the two header rows follow the metadata break; drop commas that appear
  # inside quoted header cells so a plain comma-split works below
  header_rows <- data_start + c(0L, 1L)
  lines[header_rows] <- gsub(
    ',(?=[^",]+")',
    "",
    lines[header_rows],
    perl = TRUE
  )
  # ID columns are the non-empty cells of the second header row; value
  # column names come from the first header row
  header <- strsplit(lines[header_rows[[2L]]], ",", fixed = TRUE)[[1L]]
  id_cols <- which(header != "")
  header <- c(
    header[id_cols],
    strsplit(lines[data_start], ",", fixed = TRUE)[[1L]][-id_cols]
  )
  # read everything as character; "" and "-" are treated as missing
  data <- arrow::read_csv_arrow(
    full_path,
    col_names = header,
    col_types = paste(rep("c", length(header)), collapse = ""),
    na = c("", "-"),
    skip = data_start + 1L
  )
  # strip "%" from percentage-like columns
  percents <- grep("^(?:Percent|Base|RSV test)", header, ignore.case = TRUE)
  if (length(percents)) {
    for (col in percents) {
      data[[col]] <- sub("%", "", data[[col]], fixed = TRUE)
    }
  }
  # replace the suppressed-count label with its midpoint; the column is
  # character, so 5L is coerced to "5"
  number <- grep("Number", header, fixed = TRUE)
  if (length(number)) {
    for (col in number) {
      data[[col]][data[[col]] == "10 or fewer"] <- 5L
    }
  }
  # ID cells are only written on the first row of each group; fill downward
  for (col in id_cols) {
    data[[col]] <- vctrs::vec_fill_missing(data[[col]], "down")
  }
  # long "Measures"/"Base Patient" layout: spread each measure into its own
  # column and merge the per-measure tables back together
  if (all(c("Measures", "Base Patient") %in% colnames(data))) {
    data <- Reduce(
      merge,
      lapply(split(data, data$Measures), function(d) {
        measure <- d$Measures[[1L]]
        d[[measure]] <- d[["Base Patient"]]
        d[, !(colnames(d) %in% c("Measures", "Base Patient"))]
      })
    )
  }
  colnames(data) <- standard_columns(colnames(data))
  # refine the dataset name based on the resolved columns
  if (meta$standard_name == "obesity") {
    meta$standard_name <- paste0(
      meta$standard_name,
      "_",
      if ("state" %in% colnames(data)) "state" else "county"
    )
  } else if (
    meta$standard_name == "all_encounters" && "week" %in% colnames(data)
  ) {
    meta$standard_name = "all_encounters_weekly"
  }
  # standardize age labels; keep any unrecognized originals and warn
  if ("age" %in% colnames(data)) {
    std_age <- standard_age(data$age)
    missed_ages <- (data$age != "No value") & is.na(std_age)
    if (any(missed_ages)) {
      std_age[missed_ages] <- data$age[missed_ages]
      missed_levels <- unique(data$age[missed_ages])
      cli::cli_warn("missed age levels: {.field {missed_levels}}")
    }
    data$age <- std_age
  }
  # derive a Year column when absent: prefer the metadata date range, then
  # a _YYYY. pattern in the file name
  if (!("year" %in% colnames(data))) {
    if (!is.null(meta[["Session Date Range"]])) {
      data$Year <- meta[["Session Date Range"]]
    } else {
      year <- gsub(
        "^_|\\.$",
        "",
        regmatches(
          basename(path),
          gregexpr("_\\d{4}\\.", basename(path))
        )[[1L]]
      )
      if (length(year)) {
        data$Year <- year
      }
    }
  }
  list(metadata = meta, data = data)
}
|
| 190 | ||
| 191 |
# Map raw Epic age labels to standardized age-group labels.
#
# Lookup is by name on a named character vector after normalizing the input:
# lowercase, strip any leading non-alphanumeric prefix and anything after a
# colon, then drop a trailing " years"/" Years". Labels with no entry map
# to NA (with an NA name).
standard_age <- function(age) {
  c(
    `less than 1` = "<1 Years",
    `1 and < 2` = "1-2 Years",
    `2 and < 3` = "2-3 Years",
    `3 and < 4` = "3-4 Years",
    `1 and < 5` = "1-4 Years",
    `1 year or more and less than 5` = "1-4 Years",
    `4 and < 5` = "4-5 Years",
    `less than 5` = "<5 Years",
    `5 and < 6` = "5-6 Years",
    `5 and < 18` = "5-17 Years",
    `5 or more and less than 18 (1)` = "5-17 Years",
    `6 and < 7` = "6-7 Years",
    `6 or more` = "6+ Years",
    `7 and < 8` = "7-8 Years",
    `8 and < 9` = "8-9 Years",
    `9 or more` = "9+ Years",
    `less than 10` = "<10 Years",
    `10 and < 15` = "10-14 Years",
    `less than 15` = "<15 Years",
    `15 and < 20` = "15-19 Years",
    `15 and < 25` = "15-25 Years",
    `less than 18` = "<18 Years",
    `18 and < 25` = "18-24 Years",
    `18 and < 40` = "18-39 Years",
    `18 and < 45` = "18-44 Years",
    `18 and < 50` = "18-49 Years",
    `18 or more and less than 50` = "18-49 Years",
    `20 and < 40` = "20-39 Years",
    `25 and < 35` = "25-34 Years",
    `25 and < 45` = "25-45 Years",
    `35 and < 45` = "35-44 Years",
    `40 and < 65` = "40-64 Years",
    `45 and < 55` = "45-54 Years",
    `45 and < 65` = "45-64 Years",
    `45-64` = "45-64 Years",
    # a duplicate `45 and < 65` entry was removed here: name-based
    # subsetting always uses the first match, so it was dead (and mapped
    # to the same value anyway)
    `50 and < 65` = "50-64 Years",
    `50 or more and less than 64` = "50-64 Years",
    `55 and < 65` = "55-64 Years",
    `less than 65` = "<65 Years",
    `65 and < 110` = "65+ Years",
    `65 or more` = "65+ Years",
    `65+` = "65+ Years",
    `total` = "Total"
  )[
    sub(" [Yy]ears", "", sub("^[^a-z0-9]+|:.*$", "", tolower(age)))
  ]
}
|
| 241 | ||
| 242 |
# Normalize Epic column names to the standard identifiers used downstream.
#
# Lowercases, rewrites "number of " to "n_", converts spaces to underscores,
# then collapses known column families to fixed names.
standard_columns <- function(cols) {
  out <- tolower(cols)
  out <- sub("number of ", "n_", out)
  out <- gsub(" ", "_", out, fixed = TRUE)
  # prefix-matched families
  out[startsWith(out, "age")] <- "age"
  out[startsWith(out, "state")] <- "state"
  out[startsWith(out, "county")] <- "county"
  out[startsWith(out, "rsv_tests")] <- "rsv_tests"
  # substring-matched families
  out[grepl("bmi_30", out, fixed = TRUE)] <- "bmi_30_49.8"
  out[grepl("hemoglobin_a1c_7", out, fixed = TRUE)] <- "hemoglobin_a1c_7"
  out[grepl("mmr_receipt", out, fixed = TRUE)] <- "mmr_receipt"
  out[grepl("opioid", out, fixed = TRUE)] <- "ed_opioid"
  out[grepl("total:", out, fixed = TRUE)] <- "total"
  out
}
| 1 |
#' Run a Project's Build Process
#'
#' Build a Data Collection Framework project,
#' which involves processing and checking all data projects.
#'
#' @param project_dir Path to the Data Collection Framework project to be built.
#' @param is_auto Logical; if \code{FALSE}, will run \code{\link{dcf_process}} as if it were run
#' manually.
#' @param ... Passes arguments to \code{\link{dcf_process}}.
#' @param make_diagram Logical; if \code{FALSE}, will not make a \code{status.md} diagram.
#' @param make_file_log Logical; if \code{FALSE}, will not make a \code{file_log.json} output.
#' @returns A version of the project report, which is also written to
#' \code{project_dir/report.json.gz}.
#' @examples
#' project_file <- "../../../pophive/pophive_demo"
#' if (file.exists(project_file)) {
#'   report <- dcf_build(project_file)
#' }
#' @export
dcf_build <- function(
  project_dir = ".",
  is_auto = TRUE,
  ...,
  make_diagram = TRUE,
  make_file_log = TRUE
) {
  settings <- dcf_read_settings(project_dir)
  is_standalone <- isTRUE(settings$standalone)
  # standalone projects keep data beside the project; otherwise under the
  # configured data directory
  data_dir <- if (is_standalone) {
    dirname(project_dir)
  } else {
    paste0(project_dir, "/", settings$data_dir)
  }

  processes <- list.files(
    data_dir,
    "process\\.json",
    recursive = TRUE,
    full.names = TRUE
  )
  # snapshot process.json hashes to detect whether processing changed anything
  process_state <- tools::md5sum(processes)
  report_file <- paste0(project_dir, "/report.json.gz")
  # fix: pass the caller's `is_auto` through (was hard-coded to TRUE,
  # making the documented parameter a no-op)
  process <- dcf_process(project_dir = project_dir, is_auto = is_auto, ...)
  issues <- dcf_check(project_dir = project_dir)
  if (
    !identical(
      process_state,
      tools::md5sum(list.files(
        data_dir,
        "process\\.json",
        recursive = TRUE,
        full.names = TRUE
      ))
    )
  ) {
    # something changed: rebuild the full report from current metadata
    datapackages <- list.files(
      data_dir,
      "datapackage\\.json",
      recursive = TRUE,
      full.names = TRUE
    )
    # name each entry by its project subdirectory (path relative to data_dir)
    names(datapackages) <- sub(
      "^/",
      "",
      sub(
        data_dir,
        "",
        sub("/datapackage.json", "", datapackages, fixed = TRUE),
        fixed = TRUE
      )
    )
    names(processes) <- sub(
      "^/",
      "",
      sub(
        data_dir,
        "",
        sub("/process.json", "", processes, fixed = TRUE),
        fixed = TRUE
      )
    )
    report <- list(
      date = Sys.time(),
      settings = settings,
      source_times = process$timings,
      logs = process$logs,
      issues = issues,
      metadata = lapply(datapackages, dcf_attempt_read_json),
      processes = lapply(processes, dcf_attempt_read_json)
    )
    # collect measure info by ID, recording (package, resource, field)
    # coordinates for fields that declare levels
    with_levels <- list()
    measures <- list()
    for (p in seq_along(report$metadata)) {
      for (r in seq_along(report$metadata[[p]]$resources)) {
        for (f in seq_along(
          report$metadata[[p]]$resources[[r]]$schema$fields
        )) {
          info <- report$metadata[[p]]$resources[[r]]$schema$fields[[f]]$info
          if (is.list(info) && !is.null(info$id)) {
            measures[[info$id]] <- list(
              project = report$metadata[[p]]$name,
              file = report$metadata[[p]]$resources[[r]]$filename,
              info = info
            )
            if (!is.null(info$levels)) {
              with_levels <- c(with_levels, list(c(p, r, f)))
            }
          }
        }
      }
    }
    # attach source measure info to each level (a level may point at
    # another measure via source_id; otherwise its own ID is used)
    for (cords in with_levels) {
      levels <- report$metadata[[cords[[1L]]]]$resources[[cords[[
        2L
      ]]]]$schema$fields[[cords[[3L]]]]$info$levels
      source_info <- list()
      for (level_id in names(levels)) {
        level <- levels[[level_id]]
        source_id <- if (!is.list(level) || is.null(level$source_id)) {
          level_id
        } else {
          level$source_id
        }
        source_info[[source_id]] <- measures[[source_id]]
      }
      report$metadata[[cords[[1L]]]]$resources[[cords[[
        2L
      ]]]]$schema$fields[[cords[[3L]]]]$info$source_info <- source_info
    }
    jsonlite::write_json(
      report,
      gzfile(report_file),
      auto_unbox = TRUE,
      dataframe = "columns"
    )
  } else {
    # nothing changed: reuse the previously written report
    report <- dcf_attempt_read_json(report_file)
  }
  if (make_file_log) {
    # log each distribution file with its update time and hash
    file_log <- list()
    for (file_dir in names(report$metadata)) {
      if (grepl("/dist", file_dir, fixed = TRUE)) {
        p <- report$metadata[[file_dir]]
        for (p_file in p$resources) {
          file_log[[paste0(
            settings$data_dir,
            "/",
            file_dir,
            "/",
            p_file$filename
          )]] <- list(
            # prefer the declared vintage; fall back to file modification time
            updated = if (length(p_file$vintage)) {
              p_file$vintage
            } else {
              p_file$last_modified
            },
            md5 = p_file$md5
          )
        }
      }
    }
    jsonlite::write_json(
      file_log,
      paste0(project_dir, "/file_log.json"),
      auto_unbox = TRUE
    )
  }
  if (make_diagram) {
    dcf_status_diagram(project_dir)
  }
  invisible(report)
}
| 1 |
#' Standardize Epic Data |
|
| 2 |
#' |
|
| 3 |
#' Standardize a raw Epic data table. |
|
| 4 |
#' |
|
| 5 |
#' @param raw_data Raw Epic data, such as returned from \link{dcf_read_epic}.
|
|
| 6 |
#' @returns A standardized form of \code{data}.
|
|
| 7 |
#' @section Standardization: |
|
| 8 |
#' \itemize{
|
|
| 9 |
#' \item Collapse location columns (\code{state} or \code{county}) to a single
|
|
| 10 |
#' \code{geography} column, and region names to IDs.
|
|
| 11 |
#' \item Collapse time columns (\code{year}, \code{month}, or \code{week}) to a single
|
|
| 12 |
#' \code{time} column, and clean up value formatting.
|
|
| 13 |
#' \item Drop rows with no values across value columns. |
|
| 14 |
#' } |
|
| 15 |
#' @examples |
|
| 16 |
#' \dontrun{
|
|
| 17 |
#' raw_data <- dcf_read_epic("data/epic/raw/flu.csv.xz")
|
|
| 18 |
#' standard_data <- dcf_process_epic_raw(raw_data) |
|
| 19 |
#' } |
|
| 20 |
#' |
|
| 21 |
#' @export |
|
| 22 | ||
| 23 |
dcf_standardize_epic <- function(raw_data) {
|
|
| 24 | 1x |
region_names <- epic_id_maps$regions |
| 25 | 1x |
names(region_names) <- gsub( |
| 26 | 1x |
" (?:CITY AND BOROUGH|BOROUGH|PARISH|MUNICIPALITY|MUNICIPIO)|[.']", |
| 27 |
"", |
|
| 28 | 1x |
toupper(names(region_names)) |
| 29 |
) |
|
| 30 | 1x |
cols <- colnames(raw_data) |
| 31 | 1x |
time_col <- which(cols == "year") |
| 32 | 1x |
if (length(time_col)) {
|
| 33 | 1x |
colnames(raw_data)[time_col] <- "time" |
| 34 | 1x |
raw_data$time <- as.integer(substring( |
| 35 | 1x |
raw_data$time, |
| 36 | 1x |
0L, |
| 37 | 1x |
4L |
| 38 |
)) |
|
| 39 |
} |
|
| 40 | 1x |
month_col <- which(cols == "month") |
| 41 | 1x |
if (length(month_col)) {
|
| 42 | ! |
raw_data$time <- paste0( |
| 43 | ! |
raw_data$time, |
| 44 |
"-", |
|
| 45 | ! |
epic_id_maps$months[raw_data$month] |
| 46 |
) |
|
| 47 |
} |
|
| 48 | 1x |
week_col <- which(cols == "week") |
| 49 | 1x |
if (length(week_col)) {
|
| 50 | 1x |
raw_data$time <- paste0( |
| 51 | 1x |
raw_data$time, |
| 52 |
"-", |
|
| 53 | 1x |
vapply( |
| 54 | 1x |
strsplit(raw_data$week, "[^A-Za-z0-9]"), |
| 55 | 1x |
function(p) {
|
| 56 | 4x |
paste0( |
| 57 | 4x |
epic_id_maps$months[[p[[1L]]]], |
| 58 |
"-", |
|
| 59 | 4x |
formatC(as.integer(p[[2L]]), width = 2L, flag = "0") |
| 60 |
) |
|
| 61 |
}, |
|
| 62 |
"" |
|
| 63 |
) |
|
| 64 |
) |
|
| 65 |
} |
|
| 66 | 1x |
geo_col <- grep("^(?:state|county)", cols)
|
| 67 | 1x |
if (length(geo_col)) {
|
| 68 | 1x |
colnames(raw_data)[geo_col] <- "geography" |
| 69 | 1x |
raw_data$geography <- toupper(raw_data$geography) |
| 70 | 1x |
missing_geo <- !(raw_data$geography %in% names(region_names)) |
| 71 | 1x |
if (any(missing_geo)) {
|
| 72 | 1x |
geo <- sub( |
| 73 | 1x |
"LA ", |
| 74 | 1x |
"LA", |
| 75 | 1x |
sub("^SAINT", "ST", raw_data$geography[missing_geo]),
|
| 76 | 1x |
fixed = TRUE |
| 77 |
) |
|
| 78 | 1x |
if (any(grepl(", VA", geo, fixed = TRUE))) {
|
| 79 | 1x |
geo[geo == "SALEM, VA"] <- "SALEM CITY, VA" |
| 80 | 1x |
geo[geo == "RADFORD, VA"] <- "RADFORD CITY, VA" |
| 81 | 1x |
geo[geo == "DONA ANA, NM"] <- "DO\u00d1A ANA, NM" |
| 82 | 1x |
geo[geo == "MATANUSKA SUSITNA, AK"] <- "MATANUSKA-SUSITNA, AK" |
| 83 |
} |
|
| 84 | 1x |
raw_data$geography[missing_geo] <- geo |
| 85 |
} |
|
| 86 | 1x |
missing_regions <- raw_data$geography[ |
| 87 | 1x |
!(raw_data$geography %in% names(region_names)) |
| 88 |
] |
|
| 89 | 1x |
if (length(missing_regions)) {
|
| 90 | ! |
cli::cli_warn( |
| 91 | ! |
'unrecognized regions: {paste(unique(missing_regions), collapse = "; ")}'
|
| 92 |
) |
|
| 93 |
} |
|
| 94 | 1x |
raw_data$geography <- region_names[raw_data$geography] |
| 95 | 1x |
raw_data <- raw_data[!is.na(raw_data$geography), ] |
| 96 |
} |
|
| 97 | 1x |
raw_data <- raw_data[, |
| 98 | 1x |
!(colnames(raw_data) %in% c("state", "county", "year", "month", "week"))
|
| 99 |
] |
|
| 100 | 1x |
raw_data[ |
| 101 | 1x |
rowSums( |
| 102 | 1x |
!is.na(raw_data[, |
| 103 | 1x |
!(colnames(raw_data) %in% c("geography", "time", "age"))
|
| 104 |
]) |
|
| 105 |
) != |
|
| 106 | 1x |
0L, |
| 107 |
] |
|
| 108 |
} |
| 1 |
#' Use Data from a Data Collection Project |
|
| 2 |
#' |
|
| 3 |
#' Load the standard or distribution data from a local or remote data collection project. |
|
| 4 |
#' |
|
| 5 |
#' @param variables A character vector of variable names to be loaded, or a selected |
|
| 6 |
#' subset of a project data dictionary, as returned from \code{\link{dcf_variables}}.
|
|
| 7 |
#' @param project Path to a local project, or the GitHub account and repository name |
|
| 8 |
#' (\code{"{account_name}/{repo_name}"}) of a remote project.
|
|
| 9 |
#' @param data_format The data format to select, between \code{tall} and \code{wide}.
|
|
| 10 |
#' Useful if there are duplicate measure names between files of different formats. |
|
| 11 |
#' @param project_type Project type to select, between \code{bundle} and \code{source}.
|
|
| 12 |
#' @param ... Additional arguments passed to \code{\link{dcf_report}}.
|
|
| 13 |
#' @param unify Logical; if \code{FALSE}, will return \code{data} as a list with entries for each
|
|
| 14 |
#' file. Otherwise (by default), will attempt to combine the loaded data. |
|
| 15 |
#' @param only_selected Logical; if \code{TRUE}, will drop columns that were not included in
|
|
| 16 |
#' \code{variables}, other than ID columns.
|
|
| 17 |
#' @param cache Path to a directory in which to store downloaded files. Within this directory, |
|
| 18 |
#' the repository structure will be recreated within an account-named directory. |
|
| 19 |
#' @param refresh Logical; if \code{TURE}, will download files even if they exist in the \code{cache}.
|
|
| 20 |
#' @param verbose Logical; if \code{FALSE}, will not show status messages.
|
|
| 21 |
#' @returns A list with entries for metadata (the datapackage resource entry for each file loaded) |
|
| 22 |
#' and data (a tibble or list of tibbles of the unified or separately loaded files). |
|
| 23 |
#' @family data user interface functions |
|
| 24 |
#' @examples |
|
| 25 |
#' # retrieve the full bundle file that includes the `epic_rsv` measure |
|
| 26 |
#' bundle <- dcf_data( |
|
| 27 |
#' "epic_rsv", |
|
| 28 |
#' "dissc-yale/pophive_demo", |
|
| 29 |
#' data_format = "tall", |
|
| 30 |
#' verbose = FALSE |
|
| 31 |
#' ) |
|
| 32 |
#' bundle$data |
|
| 33 |
#' |
|
| 34 |
#' if (require("ggplot2", quietly = TRUE)) {
|
|
| 35 |
#' # extract short names from metadata |
|
| 36 |
#' labels <- vapply( |
|
| 37 |
#' bundle$metadata[[1L]]$schema$fields[[3L]]$info$levels, |
|
| 38 |
#' function(measure) measure$info$short_name, |
|
| 39 |
#' "" |
|
| 40 |
#' ) |
|
| 41 |
#' |
|
| 42 |
#' # show trends from different measures over time |
|
| 43 |
#' bundle$data |> |
|
| 44 |
#' dplyr::filter( |
|
| 45 |
#' time >= as.Date("2024-01-01"),
|
|
| 46 |
#' measure != "epic_all_encounters" |
|
| 47 |
#' ) |> |
|
| 48 |
#' dplyr::mutate(measure = labels[measure]) |> |
|
| 49 |
#' ggplot(aes(x = time, y = value_scaled, color = measure)) + |
|
| 50 |
#' theme_dark() %+replace% |
|
| 51 |
#' theme(panel.background = element_rect(fill = FALSE, color = FALSE)) + |
|
| 52 |
#' geom_smooth( |
|
| 53 |
#' method = "gam", |
|
| 54 |
#' formula = y ~ s(x, bs = "cs", k = 50L) |
|
| 55 |
#' ) |
|
| 56 |
#' } |
|
| 57 |
#' @export |
|
| 58 | ||
| 59 |
dcf_data <- function( |
|
| 60 |
variables = NULL, |
|
| 61 |
project = ".", |
|
| 62 |
data_format = NULL, |
|
| 63 |
project_type = "bundle", |
|
| 64 |
..., |
|
| 65 |
unify = TRUE, |
|
| 66 |
only_selected = FALSE, |
|
| 67 |
cache = tempdir(), |
|
| 68 |
refresh = FALSE, |
|
| 69 |
verbose = TRUE |
|
| 70 |
) {
|
|
| 71 | 4x |
report <- dcf_report(project, ..., refresh = refresh) |
| 72 | 4x |
if (is.null(variables) || is.character(variables)) {
|
| 73 | 3x |
all_variables <- dcf_variables(report) |
| 74 | 3x |
selected <- all_variables[ |
| 75 | 3x |
all_variables$project_type == project_type & |
| 76 | 3x |
grepl( |
| 77 | 3x |
paste0("^(?:", paste(variables, collapse = "|"), ")$"),
|
| 78 | 3x |
all_variables$name |
| 79 |
), |
|
| 80 |
] |
|
| 81 |
} else {
|
|
| 82 | 1x |
if (!all(c("name", "file") %in% colnames(variables))) {
|
| 83 | ! |
cli::cli_abort( |
| 84 | ! |
"`variables` must include `name` and `file` columns if not a character vector" |
| 85 |
) |
|
| 86 |
} |
|
| 87 | 1x |
all_variables <- selected <- variables |
| 88 | 1x |
variables <- selected$name |
| 89 |
} |
|
| 90 | 4x |
if (!is.null(data_format)) {
|
| 91 | 2x |
selected <- selected[selected$data_format == data_format, ] |
| 92 |
} |
|
| 93 | 4x |
if (nrow(selected) == 0L) {
|
| 94 | ! |
cli::cli_abort("no variables found")
|
| 95 |
} |
|
| 96 | 4x |
not_found <- if (is.null(variables)) character() else |
| 97 | 4x |
variables[!(variables %in% selected$name)] |
| 98 | 4x |
if (length(not_found)) {
|
| 99 | ! |
cli::cli_abort("variable{?/s} not found: {not_found}")
|
| 100 |
} |
|
| 101 | 4x |
data_dir <- report$settings$data_dir |
| 102 | 4x |
files <- unique(selected$file) |
| 103 | 4x |
project_outputs <- gsub("^[^/]+/|/[^/]+$", "", files)
|
| 104 | 4x |
project_metadata <- report$metadata[ |
| 105 | 4x |
names(report$metadata) %in% project_outputs |
| 106 |
] |
|
| 107 | 4x |
file_metadata <- list() |
| 108 | 4x |
for (output in names(project_metadata)) {
|
| 109 | 6x |
datapackage <- project_metadata[[output]] |
| 110 | 6x |
for (i in seq_along(datapackage$resources)) {
|
| 111 | 15x |
resource_file <- paste( |
| 112 | 15x |
data_dir, |
| 113 | 15x |
output, |
| 114 | 15x |
datapackage$resources[[i]]$filename, |
| 115 | 15x |
sep = "/" |
| 116 |
) |
|
| 117 | 15x |
if (resource_file %in% files) {
|
| 118 | 6x |
file_metadata[[resource_file]] <- datapackage$resources[[i]] |
| 119 |
} |
|
| 120 |
} |
|
| 121 |
} |
|
| 122 | 4x |
file_metadata <- file_metadata[files] |
| 123 | ||
| 124 |
# download files to cache if needed |
|
| 125 | 4x |
if (identical(report$settings$report_url, "")) {
|
| 126 | 2x |
if (verbose) cli::cli_alert_info("loading files from local project")
|
| 127 | 2x |
project_root <- paste0(project, "/") |
| 128 |
} else {
|
|
| 129 | 2x |
project_root <- paste0( |
| 130 | 2x |
normalizePath(paste0(cache, "/", project), "/", FALSE), |
| 131 |
"/" |
|
| 132 |
) |
|
| 133 | 2x |
if (verbose) |
| 134 | 2x |
cli::cli_alert_info("downloading files to cache: {project_root}")
|
| 135 | 2x |
base_url <- dirname(report$settings$report_url) |
| 136 | 2x |
for (file in files) {
|
| 137 | 4x |
cached_file <- paste0(project_root, file) |
| 138 | 4x |
if (refresh || !file.exists(cached_file)) {
|
| 139 | 2x |
file_url <- paste0(base_url, "/", file) |
| 140 | 2x |
dir.create(dirname(cached_file), FALSE, TRUE) |
| 141 | 2x |
req <- curl::curl_fetch_disk(file_url, cached_file) |
| 142 | 2x |
if (req$status_code != 200L) {
|
| 143 | ! |
unlink(cached_file) |
| 144 | ! |
cli::cli_warn("failed to download {.url {file_url}}")
|
| 145 |
} |
|
| 146 |
} |
|
| 147 |
} |
|
| 148 | 2x |
if (verbose) cli::cli_alert_info("loading files from cache project")
|
| 149 |
} |
|
| 150 | ||
| 151 |
# load files |
|
| 152 | 4x |
n_files <- length(files) |
| 153 | 4x |
if (verbose) |
| 154 | 4x |
cli::cli_progress_bar("loading files", "download", total = n_files)
|
| 155 | 4x |
data <- list() |
| 156 | 4x |
data_tall <- structure(logical(n_files), names = files) |
| 157 | 4x |
for (file in files) {
|
| 158 | 6x |
data[[file]] <- attempt_read(paste0(project_root, file)) |
| 159 | 6x |
data[[file]]$source_file <- file |
| 160 | 6x |
if (identical(file_metadata[[file]]$data_format, "tall")) {
|
| 161 | 1x |
data_tall[[file]] <- TRUE |
| 162 |
} |
|
| 163 | 6x |
if (verbose) cli::cli_progress_update() |
| 164 |
} |
|
| 165 | 4x |
if (verbose) cli::cli_progress_done() |
| 166 | ||
| 167 | 4x |
if (unify) {
|
| 168 | 4x |
if (length(data) > 1L) {
|
| 169 | 2x |
all_cols <- unique(unlist(lapply(data, colnames))) |
| 170 | 2x |
if (!any(data_tall)) {
|
| 171 | 2x |
id_cols <- c("geography", "time", "age")
|
| 172 | 2x |
id_cols <- id_cols[id_cols %in% all_cols] |
| 173 | 2x |
if (only_selected) |
| 174 | ! |
all_cols <- all_cols[all_cols %in% c(id_cols, selected$name)] |
| 175 | 2x |
data <- dplyr::as_tibble(Reduce( |
| 176 | 2x |
function(x, y) merge(x, y, id_cols, all = TRUE), |
| 177 | 2x |
lapply( |
| 178 | 2x |
data, |
| 179 | 2x |
function(d) {
|
| 180 | 2x |
for (col in id_cols[!(id_cols %in% colnames(d))]) d[[col]] <- NA |
| 181 | ! |
if (only_selected) d <- d[, all_cols[all_cols %in% colnames(d)]] |
| 182 | 4x |
d[, colnames(d) != "source_file"] |
| 183 |
} |
|
| 184 |
) |
|
| 185 |
)) |
|
| 186 | ! |
} else if (all(data_tall)) {
|
| 187 | ! |
data <- do.call( |
| 188 | ! |
dplyr::bind_rows, |
| 189 | ! |
lapply(files, function(file) {
|
| 190 | ! |
d <- data[[file]] |
| 191 | ! |
if (only_selected) {
|
| 192 | ! |
measure_col <- unlist(lapply( |
| 193 | ! |
file_metadata[[file]]$schema$fields, |
| 194 | ! |
function(field) |
| 195 | ! |
if ("levels" %in% names(field$info)) field$name else NULL
|
| 196 |
)) |
|
| 197 | ! |
d <- d[d[[measure_col]] %in% selected$name, ] |
| 198 |
} |
|
| 199 | ! |
d[, all_cols] |
| 200 |
}) |
|
| 201 |
) |
|
| 202 |
} else {
|
|
| 203 | ! |
cli::cli_warn( |
| 204 | ! |
"datasets are in inconsistent formats, so will not be unified" |
| 205 |
) |
|
| 206 |
} |
|
| 207 |
} else {
|
|
| 208 | 2x |
data <- data[[1L]] |
| 209 |
} |
|
| 210 |
} |
|
| 211 | ||
| 212 | 4x |
invisible(list(metadata = file_metadata, data = data)) |
| 213 |
} |
| 1 |
#' Download Census Population Data |
|
| 2 |
#' |
|
| 3 |
#' Download American Community Survey population data from the U.S. Census Bureau. |
|
| 4 |
#' |
|
| 5 |
#' @param year Data year. |
|
| 6 |
#' @param out_dir Directory to download the file to. |
|
| 7 |
#' @param state_only Logical; if \code{TRUE}, will only load state data.
|
|
| 8 |
#' Will still download county data. |
|
| 9 |
#' @param age_groups A list mapping lower-level age groups to high-level ones |
|
| 10 |
#' (e.g., \code{list(`<10 Years` = c("Under 5 years", "5 to 9 years"))}).
|
|
| 11 |
#' Or the name of a standard mapping (\code{"7"} or \code{"9"}).
|
|
| 12 |
#' If \code{FALSE}, will return the lowest-level age groups.
|
|
| 13 |
#' @param overwrite Logical; if \code{TRUE}, will re-download and overwrite existing data.
|
|
| 14 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages.
|
|
| 15 |
#' @returns A \code{data.frame} including \code{GEOID} and \code{region_name}
|
|
| 16 |
#' for states and counties, along with their population, in total and within |
|
| 17 |
#' age brackets. |
|
| 18 |
#' @examples |
|
| 19 |
#' if (file.exists("../../../pophive/census_population_2021.csv.xz")) {
|
|
| 20 |
#' dcf_load_census(2021, "../../../pophive")[1:10, ] |
|
| 21 |
#' } |
|
| 22 |
#' @export |
|
| 23 | ||
| 24 |
dcf_load_census <- function( |
|
| 25 |
year = 2021, |
|
| 26 |
out_dir = NULL, |
|
| 27 |
state_only = FALSE, |
|
| 28 |
age_groups = "9", |
|
| 29 |
overwrite = FALSE, |
|
| 30 |
verbose = TRUE |
|
| 31 |
) {
|
|
| 32 | 2x |
out_file <- paste0(out_dir, "/census_population_", year, ".csv.xz") |
| 33 | 2x |
write_out <- !is.null(out_dir) |
| 34 | 2x |
age_group_sets <- list( |
| 35 | 2x |
"7" = list( |
| 36 | 2x |
`<10 Years` = c("Under 5 years", "5 to 9 years"),
|
| 37 | 2x |
`10-14 Years` = "10 to 14 years", |
| 38 | 2x |
`15-19 Years` = c("15 to 17 years", "18 and 19 years"),
|
| 39 | 2x |
`20-39 Years` = c( |
| 40 | 2x |
"20 years", |
| 41 | 2x |
"21 years", |
| 42 | 2x |
"22 to 24 years", |
| 43 | 2x |
"25 to 29 years", |
| 44 | 2x |
"30 to 34 years", |
| 45 | 2x |
"35 to 39 years" |
| 46 |
), |
|
| 47 | 2x |
`40-64 Years` = c( |
| 48 | 2x |
"40 to 44 years", |
| 49 | 2x |
"45 to 49 years", |
| 50 | 2x |
"50 to 54 years", |
| 51 | 2x |
"55 to 59 years", |
| 52 | 2x |
"60 and 61 years", |
| 53 | 2x |
"62 to 64 years" |
| 54 |
), |
|
| 55 | 2x |
`65+ Years` = c( |
| 56 | 2x |
"65 and 66 years", |
| 57 | 2x |
"67 to 69 years", |
| 58 | 2x |
"70 to 74 years", |
| 59 | 2x |
"75 to 79 years", |
| 60 | 2x |
"80 to 84 years", |
| 61 | 2x |
"85 years and over" |
| 62 |
) |
|
| 63 |
), |
|
| 64 | 2x |
"9" = list( |
| 65 | 2x |
`<10 Years` = c("Under 5 years", "5 to 9 years"),
|
| 66 | 2x |
`10-18 Years` = c("10 to 14 years", "15 to 17 years"),
|
| 67 | 2x |
`18-24 Years` = c( |
| 68 | 2x |
"18 and 19 years", |
| 69 | 2x |
"20 years", |
| 70 | 2x |
"21 years", |
| 71 | 2x |
"22 to 24 years" |
| 72 |
), |
|
| 73 | 2x |
`25-34 Years` = c("25 to 29 years", "30 to 34 years"),
|
| 74 | 2x |
`35-44 Years` = c("35 to 39 years", "40 to 44 years"),
|
| 75 | 2x |
`45-54 Years` = c("45 to 49 years", "50 to 54 years"),
|
| 76 | 2x |
`55-64 Years` = c( |
| 77 | 2x |
"55 to 59 years", |
| 78 | 2x |
"60 and 61 years", |
| 79 | 2x |
"62 to 64 years" |
| 80 |
), |
|
| 81 | 2x |
`65+ Years` = c( |
| 82 | 2x |
"65 and 66 years", |
| 83 | 2x |
"67 to 69 years", |
| 84 | 2x |
"70 to 74 years", |
| 85 | 2x |
"75 to 79 years", |
| 86 | 2x |
"80 to 84 years", |
| 87 | 2x |
"85 years and over" |
| 88 |
) |
|
| 89 |
) |
|
| 90 |
) |
|
| 91 | 2x |
age_levels <- unlist(age_group_sets[[1L]]) |
| 92 | 2x |
if (!overwrite && write_out && file.exists(out_file)) {
|
| 93 | 1x |
if (verbose) {
|
| 94 | 1x |
cli::cli_progress_step("reading in existing file")
|
| 95 |
} |
|
| 96 | 1x |
pop <- as.data.frame(vroom::vroom( |
| 97 | 1x |
out_file, |
| 98 | 1x |
delim = ",", |
| 99 | 1x |
col_types = list(GEOID = "c", region_name = "c"), |
| 100 | 1x |
n_max = if (state_only) 52L else Inf |
| 101 |
)) |
|
| 102 |
} else {
|
|
| 103 |
# GEOID to region name mapping |
|
| 104 | 1x |
id_url <- "https://www2.census.gov/geo/docs/reference/codes2020/national_" |
| 105 | 1x |
if (verbose) {
|
| 106 | 1x |
cli::cli_progress_step("downloading state IDs map")
|
| 107 |
} |
|
| 108 | 1x |
state_ids <- vroom::vroom( |
| 109 | 1x |
paste0(id_url, "state2020.txt"), |
| 110 | 1x |
delim = "|", |
| 111 | 1x |
col_types = list( |
| 112 | 1x |
STATE = "c", |
| 113 | 1x |
STATEFP = "c", |
| 114 | 1x |
STATENS = "c", |
| 115 | 1x |
STATE_NAME = "c" |
| 116 |
) |
|
| 117 |
) |
|
| 118 | 1x |
if (verbose) {
|
| 119 | 1x |
cli::cli_progress_step("downloading county IDs map")
|
| 120 |
} |
|
| 121 | 1x |
county_ids <- vroom::vroom( |
| 122 | 1x |
paste0(id_url, "county2020.txt"), |
| 123 | 1x |
delim = "|", |
| 124 | 1x |
col_types = list( |
| 125 | 1x |
STATE = "c", |
| 126 | 1x |
STATEFP = "c", |
| 127 | 1x |
COUNTYFP = "c", |
| 128 | 1x |
COUNTYNS = "c", |
| 129 | 1x |
COUNTYNAME = "c", |
| 130 | 1x |
CLASSFP = "c", |
| 131 | 1x |
FUNCSTAT = "c" |
| 132 |
) |
|
| 133 |
) |
|
| 134 | 1x |
region_name = structure( |
| 135 | 1x |
sub( |
| 136 | 1x |
" County", |
| 137 |
"", |
|
| 138 | 1x |
c( |
| 139 | 1x |
state_ids$STATE_NAME, |
| 140 | 1x |
paste0(county_ids$COUNTYNAME, ", ", county_ids$STATE) |
| 141 |
), |
|
| 142 | 1x |
fixed = TRUE |
| 143 |
), |
|
| 144 | 1x |
names = c( |
| 145 | 1x |
state_ids$STATEFP, |
| 146 | 1x |
paste0(county_ids$STATEFP, county_ids$COUNTYFP) |
| 147 |
) |
|
| 148 |
) |
|
| 149 | ||
| 150 |
# population data |
|
| 151 | ||
| 152 |
## age group labels from IDs |
|
| 153 | 1x |
if (verbose) {
|
| 154 | 1x |
cli::cli_progress_step("downloading ACS variable lables")
|
| 155 |
} |
|
| 156 | 1x |
labels <- vroom::vroom( |
| 157 | 1x |
paste0( |
| 158 | 1x |
"https://www2.census.gov/programs-surveys/acs/summary_file/", |
| 159 | 1x |
min(2021L, year), |
| 160 | 1x |
"/sequence-based-SF/documentation/user_tools/ACS_5yr_Seq_Table_Number_Lookup.txt" |
| 161 |
), |
|
| 162 | 1x |
delim = ",", |
| 163 | 1x |
col_types = list( |
| 164 | 1x |
`File ID` = "c", |
| 165 | 1x |
`Table ID` = "c", |
| 166 | 1x |
`Sequence Number` = "c", |
| 167 | 1x |
`Line Number` = "d", |
| 168 | 1x |
`Start Position` = "i", |
| 169 | 1x |
`Total Cells in Table` = "c", |
| 170 | 1x |
`Total Cells in Sequence` = "i", |
| 171 | 1x |
`Table Title` = "c", |
| 172 | 1x |
`Subject Area` = "c" |
| 173 |
) |
|
| 174 |
) |
|
| 175 | 1x |
variable_labels <- structure( |
| 176 | 1x |
labels$`Table Title`, |
| 177 | 1x |
names = paste0( |
| 178 | 1x |
labels$`Table ID`, |
| 179 | 1x |
"_E", |
| 180 | 1x |
formatC(labels$`Line Number`, width = 3L, flag = 0L) |
| 181 |
) |
|
| 182 |
) |
|
| 183 | ||
| 184 |
## age group counts |
|
| 185 | 1x |
url <- paste0( |
| 186 | 1x |
"https://www2.census.gov/programs-surveys/acs/summary_file/", |
| 187 | 1x |
year, |
| 188 | 1x |
"/table-based-SF/data/5YRData/acsdt5y", |
| 189 | 1x |
year, |
| 190 | 1x |
"-b01001.dat" |
| 191 |
) |
|
| 192 | 1x |
if (verbose) {
|
| 193 | 1x |
cli::cli_progress_step("downloading population data")
|
| 194 |
} |
|
| 195 | 1x |
data <- vroom::vroom(url, delim = "|", col_types = list(GEO_ID = "c")) |
| 196 | 1x |
data <- data[ |
| 197 | 1x |
grep("0[45]00000US", data$GEO_ID),
|
| 198 | 1x |
grep("E", colnames(data), fixed = TRUE)
|
| 199 |
] |
|
| 200 | 1x |
colnames(data)[-1L] <- variable_labels[colnames(data)[-1L]] |
| 201 | 1x |
if (verbose) {
|
| 202 | 1x |
cli::cli_progress_step("agregating across sex and fine age groups")
|
| 203 |
} |
|
| 204 | 1x |
pop <- cbind( |
| 205 | 1x |
data.frame( |
| 206 | 1x |
GEOID = substring(data$GEO_ID, 10L), |
| 207 | 1x |
region_name = "", |
| 208 | 1x |
Total = data[["Total:"]] |
| 209 |
), |
|
| 210 | 1x |
do.call( |
| 211 | 1x |
cbind, |
| 212 | 1x |
lapply( |
| 213 | 1x |
structure(age_levels, names = age_levels), |
| 214 | 1x |
function(l) rowSums(data[, colnames(data) == l]) |
| 215 |
) |
|
| 216 |
) |
|
| 217 |
) |
|
| 218 | 1x |
pop$region_name = region_name[pop$GEOID] |
| 219 | 1x |
states <- pop[1L:52L, ] |
| 220 | 1x |
health_regions <- as.data.frame(do.call( |
| 221 | 1x |
rbind, |
| 222 | 1x |
lapply( |
| 223 | 1x |
split( |
| 224 | 1x |
states[, -(1L:2L)], |
| 225 | 1x |
dcf_to_health_region(states$GEOID, "hhs_") |
| 226 |
), |
|
| 227 | 1x |
colSums |
| 228 |
) |
|
| 229 |
)) |
|
| 230 | 1x |
health_regions$GEOID <- rownames(health_regions) |
| 231 | 1x |
health_regions$region_name <- sub( |
| 232 | 1x |
"hhs_", |
| 233 | 1x |
"Health Region ", |
| 234 | 1x |
rownames(health_regions), |
| 235 | 1x |
fixed = TRUE |
| 236 |
) |
|
| 237 | 1x |
pop <- rbind(pop, health_regions[, colnames(pop)]) |
| 238 | ||
| 239 | 1x |
if (write_out) {
|
| 240 | 1x |
if (verbose) {
|
| 241 | 1x |
cli::cli_progress_step("writing output")
|
| 242 |
} |
|
| 243 | 1x |
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) |
| 244 | 1x |
vroom::vroom_write(pop, out_file, ",") |
| 245 |
} |
|
| 246 |
} |
|
| 247 | ! |
if (is.numeric(age_groups)) age_groups <- as.character(age_groups) |
| 248 | 2x |
if (is.character(age_groups) && (age_groups %in% names(age_group_sets))) |
| 249 | 1x |
age_groups <- age_group_sets[[as.character(age_groups)]] |
| 250 | 2x |
if (is.list(age_groups)) {
|
| 251 | 1x |
pop <- cbind( |
| 252 | 1x |
pop[, !(colnames(pop) %in% age_levels)], |
| 253 | 1x |
do.call( |
| 254 | 1x |
cbind, |
| 255 | 1x |
lapply( |
| 256 | 1x |
age_groups, |
| 257 | 1x |
function(l) rowSums(pop[, colnames(pop) %in% l, drop = FALSE]) |
| 258 |
) |
|
| 259 |
) |
|
| 260 |
) |
|
| 261 |
} |
|
| 262 | 2x |
invisible(if (state_only) pop[1L:52L, ] else pop) |
| 263 |
} |
| 1 |
#' Update renv.lock |
|
| 2 |
#' |
|
| 3 |
#' Updates the \code{renv.lock} file with dependencies found in project scripts.
|
|
| 4 |
#' |
|
| 5 |
#' @param project_dir Directory of the Data Collection project. |
|
| 6 |
#' @param refresh Logical; if \code{FALSE}, will update an existing
|
|
| 7 |
#' \code{renv.lock} file, rather than recreating it.
|
|
| 8 |
#' @returns Nothing; writes an \code{renv.lock} file.
|
|
| 9 |
#' @examples |
|
| 10 |
#' \dontrun{
|
|
| 11 |
#' dcf_update_lock() |
|
| 12 |
#' } |
|
| 13 |
#' @export |
|
| 14 | ||
| 15 |
dcf_update_lock <- function( |
|
| 16 |
project_dir = ".", |
|
| 17 |
refresh = TRUE |
|
| 18 |
) {
|
|
| 19 | 1x |
settings <- dcf_read_settings(project_dir) |
| 20 | 1x |
extra <- unique( |
| 21 | 1x |
renv::dependencies(list.files( |
| 22 | 1x |
paste0(project_dir, "/", settings$data_dir), |
| 23 | 1x |
"\\.[Rr]$", |
| 24 | 1x |
recursive = TRUE, |
| 25 | 1x |
full.names = TRUE |
| 26 | 1x |
))$Package |
| 27 |
) |
|
| 28 | 1x |
not_installed <- !(extra %in% rownames(utils::installed.packages())) |
| 29 | ! |
if (any(not_installed)) utils::install.packages(extra[not_installed]) |
| 30 | 1x |
if (refresh) unlink(paste0(project_dir, "/renv.lock")) |
| 31 | 1x |
renv::snapshot(packages = extra, lockfile = paste0(project_dir, "/renv.lock")) |
| 32 |
} |
| 1 |
#' Download Data from the CDC |
|
| 2 |
#' |
|
| 3 |
#' Download data and metadata from the Centers for Disease Control and Prevention (CDC). |
|
| 4 |
#' |
|
| 5 |
#' @param id ID of the resource (e.g., \code{ijqb-a7ye}).
|
|
| 6 |
#' @param out_dir Directory in which to save the metadata and data files. |
|
| 7 |
#' @param state The state ID of a previous download; if provided, will only download if the |
|
| 8 |
#' new state does not match. |
|
| 9 |
#' @param parquet Logical; if \code{TRUE}, will convert the downloaded CSV file to Parquet.
|
|
| 10 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages.
|
|
| 11 |
#' @returns The state ID of the downloaded files; |
|
| 12 |
#' downloads files (\code{<id>.json} and \code{<id>.csv.xz}) to \code{out_dir}
|
|
| 13 |
#' @section \code{data.cdc.gov} URLs:
|
|
| 14 |
#' |
|
| 15 |
#' For each resource ID, there are 3 relevant CDC URLs: |
|
| 16 |
#' \itemize{
|
|
| 17 |
#' \item \strong{\code{resource/<id>}}: This redirects to the resource's main page,
|
|
| 18 |
#' with displayed metadata and a data preview |
|
| 19 |
#' (e.g., \href{https://data.cdc.gov/resource/ijqb-a7ye}{data.cdc.gov/resource/ijqb-a7ye}).
|
|
| 20 |
#' \item \strong{\code{api/views/<id>}}: This is a direct link to the underlying
|
|
| 21 |
#' JSON metadata (e.g., \href{https://data.cdc.gov/api/views/ijqb-a7ye}{data.cdc.gov/api/views/ijqb-a7ye}).
|
|
| 22 |
#' \item \strong{\code{api/views/<id>/rows.csv}}: This is a direct link to the full
|
|
| 23 |
#' CSV dataset (e.g., \href{https://data.cdc.gov/api/views/ijqb-a7ye/rows.csv}{data.cdc.gov/api/views/ijqb-a7ye/rows.csv}).
|
|
| 24 |
#' } |
|
| 25 |
#' |
|
| 26 |
#' @examples |
|
| 27 |
#' \dontrun{
|
|
| 28 |
#' dcf_download_cdc("ijqb-a7ye")
|
|
| 29 |
#' } |
|
| 30 |
#' @export |
|
| 31 | ||
| 32 |
dcf_download_cdc <- function( |
|
| 33 |
id, |
|
| 34 |
out_dir = "raw", |
|
| 35 |
state = NULL, |
|
| 36 |
parquet = FALSE, |
|
| 37 |
verbose = TRUE |
|
| 38 |
) {
|
|
| 39 | 2x |
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE) |
| 40 | 2x |
if (verbose) {
|
| 41 | 2x |
resource_url <- paste0("https://data.cdc.gov/resource/", id)
|
| 42 | 2x |
cli::cli_h1( |
| 43 | 2x |
"downloading resource {.url {resource_url}}"
|
| 44 |
) |
|
| 45 |
} |
|
| 46 | 2x |
url <- paste0("https://data.cdc.gov/api/views/", id)
|
| 47 | 2x |
initial_timeout <- options(timeout = 99999L)$timeout |
| 48 | 2x |
on.exit(options(timeout = initial_timeout)) |
| 49 | 2x |
if (verbose) {
|
| 50 | 2x |
cli::cli_progress_step("metadata: {.url {url}}")
|
| 51 |
} |
|
| 52 | 2x |
metadata_file <- paste0(tempdir(), "/", id, ".json") |
| 53 | 2x |
status <- utils::download.file(url, metadata_file, quiet = TRUE) |
| 54 | 2x |
if (status != 0L) {
|
| 55 | ! |
cli::cli_abort("failed to download metadata")
|
| 56 |
} |
|
| 57 | 2x |
metadata <- dcf_attempt_read_json(metadata_file) |
| 58 | 2x |
new_state <- if (is.null(metadata$rowsUpdatedAt)) {
|
| 59 | ! |
as.list(tools::md5sum(metadata_file)) |
| 60 |
} else {
|
|
| 61 | 2x |
metadata$rowsUpdatedAt |
| 62 |
} |
|
| 63 | 2x |
if (!identical(new_state, state)) {
|
| 64 | 2x |
file.rename(metadata_file, paste0(out_dir, "/", id, ".json")) |
| 65 | 2x |
data_url <- paste0(url, "/rows.csv") |
| 66 | 2x |
out_path <- paste0(out_dir, "/", id, ".csv") |
| 67 | 2x |
if (verbose) {
|
| 68 | 2x |
cli::cli_progress_step("data: {.url {data_url}}")
|
| 69 |
} |
|
| 70 | 2x |
status <- utils::download.file(data_url, out_path, quiet = TRUE) |
| 71 | 2x |
if (status != 0L) {
|
| 72 | ! |
cli::cli_abort("failed to download data")
|
| 73 |
} |
|
| 74 | 2x |
if (parquet) {
|
| 75 | 1x |
if (verbose) {
|
| 76 | 1x |
cli::cli_progress_step("writing to Parquet")
|
| 77 |
} |
|
| 78 | 1x |
data <- arrow::read_csv_arrow(out_path) |
| 79 | 1x |
arrow::write_parquet( |
| 80 | 1x |
data, |
| 81 | 1x |
compression = "gzip", |
| 82 | 1x |
sub(".csv", ".parquet", out_path, fixed = TRUE)
|
| 83 |
) |
|
| 84 | 1x |
unlink(out_path) |
| 85 |
} else {
|
|
| 86 | 1x |
if (verbose) {
|
| 87 | 1x |
cli::cli_progress_step("compressing data")
|
| 88 |
} |
|
| 89 | 1x |
unlink(paste0(out_path, ".xz")) |
| 90 | 1x |
status <- system2("xz", c("-f", out_path))
|
| 91 | 1x |
if (status != 0L) {
|
| 92 | ! |
cli::cli_abort("failed to compress data")
|
| 93 |
} |
|
| 94 |
} |
|
| 95 | 2x |
if (verbose) {
|
| 96 | 2x |
cli::cli_progress_done() |
| 97 |
} |
|
| 98 | 2x |
invisible(new_state) |
| 99 |
} else {
|
|
| 100 | ! |
unlink(metadata_file) |
| 101 | ! |
invisible(state) |
| 102 |
} |
|
| 103 |
} |
| 1 |
#' Extract Older Files from Git History |
|
| 2 |
#' |
|
| 3 |
#' Extract a file from a data collection project's Git history, |
|
| 4 |
#' or list the available versions of the file. |
|
| 5 |
#' |
|
| 6 |
#' @param path Path to the file. |
|
| 7 |
#' @param date Date of the version to load; A \code{Date}, or \code{character} in the format
|
|
| 8 |
#' \code{YYYY-MM-DD}. Will match to the nearest version.
|
|
| 9 |
#' @param commit_hash SHA signature of the committed version; |
|
| 10 |
#' can be the first 6 or so characters. Ignored if \code{date} is provided.
|
|
| 11 |
#' @param versions Logical; if \code{TRUE}, will return a list of available version,
|
|
| 12 |
#' rather than a |
|
| 13 |
#' @returns If \code{versions} is \code{TRUE}, a \code{data.frame} with columns for
|
|
| 14 |
#' the \code{hash}, \code{author}, \code{date}, and \code{message} of each commit.
|
|
| 15 |
#' Otherwise, the path to a temporary file, if one was extracted. |
|
| 16 |
#' |
|
| 17 |
#' @examples |
|
| 18 |
#' path <- "../../../pophive/pophive_demo/data/wastewater/raw/flua.csv.xz" |
|
| 19 |
#' if (file.exists(path)) {
|
|
| 20 |
#' # list versions |
|
| 21 |
#' versions <- dcf_get_file(path, versions = TRUE) |
|
| 22 |
#' print(versions[, c("date", "hash")])
|
|
| 23 |
#' |
|
| 24 |
#' # extract a version to a temporary file |
|
| 25 |
#' temp_path <- dcf_get_file(path, "2025-05") |
|
| 26 |
#' basename(temp_path) |
|
| 27 |
#' } |
|
| 28 |
#' |
|
| 29 |
#' @export |
|
| 30 | ||
| 31 |
dcf_get_file <- function( |
|
| 32 |
path, |
|
| 33 |
date = NULL, |
|
| 34 |
commit_hash = NULL, |
|
| 35 |
versions = FALSE |
|
| 36 |
) {
|
|
| 37 | 3x |
if (missing(path)) {
|
| 38 | ! |
cli::cli_abort("specify a path")
|
| 39 |
} |
|
| 40 | 3x |
if (!file.exists(path)) {
|
| 41 | ! |
cli::cli_abort("path does not exist")
|
| 42 |
} |
|
| 43 | 3x |
vs <- data.frame( |
| 44 | 3x |
hash = character(), |
| 45 | 3x |
author = character(), |
| 46 | 3x |
date = character(), |
| 47 | 3x |
message = character() |
| 48 |
) |
|
| 49 | 3x |
if (versions || !is.null(date)) {
|
| 50 | 2x |
wd <- getwd() |
| 51 | 2x |
on.exit(setwd(wd)) |
| 52 | 2x |
setwd(dirname(path)) |
| 53 | 2x |
commits <- sys::exec_internal("git", c("log", basename(path)))
|
| 54 | 2x |
setwd(wd) |
| 55 | 2x |
if (commits$status == 0L) {
|
| 56 | 2x |
commits <- do.call( |
| 57 | 2x |
rbind, |
| 58 | 2x |
Filter( |
| 59 | 2x |
function(e) length(e) == 4L, |
| 60 | 2x |
strsplit( |
| 61 | 2x |
strsplit(rawToChar(commits$stdout), "commit ", fixed = TRUE)[[1L]], |
| 62 | 2x |
"\\n+(?:[^:]+:)?\\s*" |
| 63 |
) |
|
| 64 |
) |
|
| 65 |
) |
|
| 66 | 2x |
colnames(commits) <- colnames(vs) |
| 67 | 2x |
vs <- as.data.frame(commits) |
| 68 |
} else {
|
|
| 69 | ! |
cli::cli_abort("failed to git log: {rawToChar(commits$stderr)}")
|
| 70 |
} |
|
| 71 |
} |
|
| 72 | 3x |
if (versions) {
|
| 73 | 1x |
return(vs) |
| 74 |
} |
|
| 75 | 2x |
if (!is.null(date)) {
|
| 76 | 1x |
if (nrow(vs) == 0L) {
|
| 77 | ! |
return(path) |
| 78 |
} |
|
| 79 | 1x |
if (is.character(date)) {
|
| 80 | 1x |
date <- as.POSIXct( |
| 81 | 1x |
date, |
| 82 | 1x |
tryFormats = c( |
| 83 | 1x |
"%Y-%m-%d %H:%M:%S", |
| 84 | 1x |
"%Y-%m-%d %H:%M", |
| 85 | 1x |
"%Y-%m-%d", |
| 86 | 1x |
"%Y-%m", |
| 87 | 1x |
"%Y" |
| 88 |
), |
|
| 89 | 1x |
tz = "UTC" |
| 90 |
) |
|
| 91 |
} |
|
| 92 | 1x |
commit_hash <- vs$hash[which.min(abs( |
| 93 | 1x |
as.POSIXct(vs$date, "%a %b %d %H:%M:%S %Y", tz = "UTC") - date |
| 94 |
))] |
|
| 95 |
} |
|
| 96 | 2x |
if (is.null(commit_hash)) {
|
| 97 | ! |
return(path) |
| 98 |
} |
|
| 99 | 2x |
name_parts <- strsplit(basename(path), ".", fixed = TRUE)[[1L]] |
| 100 | 2x |
out_path <- paste0( |
| 101 | 2x |
tempdir(), |
| 102 |
"/", |
|
| 103 | 2x |
name_parts[[1L]], |
| 104 |
"-", |
|
| 105 | 2x |
substring(commit_hash, 1L, 6L), |
| 106 |
".", |
|
| 107 | 2x |
paste(name_parts[-1L], collapse = ".") |
| 108 |
) |
|
| 109 | 2x |
if (file.exists(out_path)) {
|
| 110 | ! |
return(out_path) |
| 111 |
} |
|
| 112 | 2x |
wd <- getwd() |
| 113 | 2x |
on.exit(setwd(wd), TRUE) |
| 114 | 2x |
setwd(dirname(path)) |
| 115 | 2x |
status <- sys::exec_wait( |
| 116 | 2x |
"git", |
| 117 | 2x |
c("show", paste0(commit_hash, ":./", basename(path))),
|
| 118 | 2x |
std_out = out_path |
| 119 |
) |
|
| 120 | 2x |
setwd(wd) |
| 121 | 2x |
if (status != 0L) {
|
| 122 | ! |
unlink(out_path) |
| 123 | ! |
cli::cli_abort("failed to git show: {status}")
|
| 124 |
} |
|
| 125 | 2x |
out_path |
| 126 |
} |
| 1 |
#' Adds a Source Project
#'
#' Establishes a new data source project, used to collect and prepare data from a new source.
#'
#' @param name Name of the source.
#' @param project_dir Path to the Data Collection Framework project.
#' @param open_after Logical; if \code{FALSE}, will not open the project.
#' @param use_git Logical; if \code{TRUE}, will initialize a git repository.
#' @param use_workflow Logical; if \code{TRUE}, will add a GitHub Actions workflow.
#' @returns Nothing; creates default files and directories.
#' @section Project Definition:
#'
#' The \strong{\code{process.json}} file defines the project with some initial attributes:
#' \itemize{
#'   \item \code{type} Always \code{source} to define this as a source project.
#'   \item \code{name} Name of the project.
#'   \item \code{scripts} List of script definitions.
#'   \item \code{checked} When the project was last checked with \code{\link{dcf_check}}.
#'   \item \code{check_results} Results of the last check.
#'   \item \code{standalone} Logical; \code{TRUE} if the source project does not exist
#'     within a broader collection project.
#'   \item \code{standard_state} State of the \code{standard} directory: A list
#'     with names as the file paths, relative to the overall project root, and values
#'     as the MD5 hash of those files.
#'   \item \code{raw_state} State of the \code{raw} directory, if
#'     set within a script.
#'   \item \code{vintages} A list with names as names of files found in the \code{standard}
#'     directory, and values as dates (of arbitrary format). This is a way to provide
#'     a date separate from the files' dates (e.g., if you have some other source for when the
#'     data were actually collected), which will be included in the named file's
#'     \code{datapackage.json}.
#' }
#'
#' Each \strong{\code{scripts}} entry points to a script to be run, with one default:
#' \itemize{
#'   \item \code{path} path to the script, relative to this project's root.
#'   \item \code{manual} Logical; if \code{TRUE}, will only run the script from
#'     \code{\link{dcf_process}} (not \code{\link{dcf_build}}).
#'   \item \code{frequency} How often to rerun the project, in days.
#'     This is checked against the last run timestamp when processed; it is a way
#'     to skip processing, but can only be as frequent as the overall process is run.
#'   \item \code{last_run} Timestamp of the last processing.
#'   \item \code{run_time} How long the script took to run last, in milliseconds.
#'   \item \code{last_status} Status of the last run; a list with entries for
#'     \code{success} (logical) and \code{log} (output of the script).
#' }
#'
#' See the \href{https://dissc-yale.github.io/dcf/articles/standards.html#scripts}{script standards}
#' for examples of using this within a sub-project script.
#'
#' @section Project Files:
#'
#' Within a source project, there are two files to edit:
#' \itemize{
#'   \item \strong{\code{ingest.R}}: This is the primary script, which is automatically rerun.
#'     It should store raw data and resources in \code{raw/} where possible,
#'     then use what's in \code{raw/} to produce standard-format files in \code{standard/}.
#'     This file is sourced from its location during processing, so any system paths
#'     must be relative to itself.
#'   \item \strong{\code{measure_info.json}}: This is where you can record information
#'     about the variables included in the standardized data files.
#'     See \code{\link{dcf_measure_info}}.
#' }
#'
#' @examples
#' project_dir <- paste0(tempdir(), "/temp_project")
#' dcf_init("temp_project", dirname(project_dir))
#' dcf_add_source("source_name", project_dir)
#' list.files(paste0(project_dir, "/data/source_name"))
#' @export
dcf_add_source <- function(
  name,
  project_dir = ".",
  open_after = interactive(),
  use_git = TRUE,
  use_workflow = FALSE
) {
  if (is.null(name)) {
    cli::cli_abort("provide a name")
  }
  # Sanitize the name so it is safe to use as a directory name.
  name <- gsub("[^A-Za-z0-9]+", "_", name)
  # A missing settings.json means this source is not nested in a broader
  # collection project; recorded in process.json below.
  is_standalone <- !file.exists(paste0(project_dir, "/settings.json"))
  data_dir <- dcf_read_settings(project_dir)$data_dir
  base_dir <- paste0(project_dir, "/", data_dir)
  base_path <- paste0(base_dir, "/", name, "/")
  dir.create(paste0(base_path, "raw"), showWarnings = FALSE, recursive = TRUE)
  dir.create(paste0(base_path, "standard"), showWarnings = FALSE)
  # Default files; each is only created below if it does not already exist,
  # so re-running on an existing source project is non-destructive.
  paths <- paste0(
    base_path,
    c(
      "measure_info.json",
      "ingest.R",
      "project.Rproj",
      "standard/datapackage.json",
      "process.json",
      "README.md",
      ".gitignore",
      ".github/workflows/process.yaml"
    )
  )
  if (!file.exists(paths[[1L]])) {
    # Seed measure_info.json with a single example entry.
    dcf_measure_info(
      paths[[1L]],
      example_variable = list(),
      verbose = FALSE,
      open_after = FALSE
    )
  }
  if (!file.exists(paths[[2L]])) {
    # Skeleton ingest.R with Download / Reformat section headers.
    writeLines(
      paste0(
        c(
          "#",
          "# Download",
          "#",
          "",
          "# add files to the `raw` directory",
          "",
          "#",
          "# Reformat",
          "#",
          "",
          "# read from the `raw` directory, and write to the `standard` directory",
          ""
        ),
        collapse = "\n"
      ),
      paths[[2L]]
    )
  }
  if (!file.exists(paths[[3L]])) {
    # Minimal RStudio project file.
    writeLines("Version: 1.0\n", paths[[3L]])
  }
  if (!file.exists(paths[[4L]])) {
    # Initialize the standard-output datapackage.
    dcf_datapackage_init(
      name,
      dir = paste0(base_path, "standard"),
      quiet = TRUE
    )
  }
  if (!file.exists(paths[[5L]])) {
    # Initial process.json; see the Project Definition section above.
    dcf_process_record(
      paths[[5L]],
      list(
        name = name,
        type = "source",
        scripts = list(
          list(
            path = "ingest.R",
            manual = FALSE,
            frequency = 0L,
            last_run = "",
            run_time = "",
            last_status = list(log = "", success = TRUE)
          )
        ),
        checked = "",
        check_results = list(),
        standalone = is_standalone
      )
    )
  }
  if (!file.exists(paths[[6L]])) {
    # Starter README describing how to check and process the project.
    writeLines(
      paste0(
        c(
          paste("#", name),
          "",
          "This is a dcf data source project, initialized with `dcf::dcf_add_source`.",
          "",
          "You can use the `dcf` package to check the project:",
          "",
          "```R",
          "dcf_check()",
          "```",
          "",
          "And process it:",
          "",
          "```R",
          "dcf_process()",
          "```"
        ),
        collapse = "\n"
      ),
      paths[[6L]]
    )
  }
  # git / workflow setup only applies to standalone projects; nested sources
  # inherit these from the enclosing collection project.
  if (is_standalone) {
    if (use_git) {
      dcf_init_git(base_path)
      if (!file.exists(paths[[7L]])) {
        writeLines(
          paste(
            c(
              "*.Rproj",
              ".Rproj.user",
              "*.Rprofile",
              "*.Rhistory",
              "*.Rdata",
              ".DS_Store",
              "renv"
            ),
            collapse = "\n"
          ),
          paths[[7L]]
        )
      }
    }
    if (use_workflow && !file.exists(paths[[8L]])) {
      # Copy the packaged GitHub Actions workflow into place.
      dir.create(dirname(paths[[8L]]), recursive = TRUE, showWarnings = FALSE)
      file.copy(
        system.file("workflows/build.yaml", package = "dcf"),
        paths[[8L]]
      )
    }
  }
  # Open the .Rproj file in a new RStudio session (no-op outside RStudio).
  if (open_after) rstudioapi::openProject(paths[[3L]], newSession = TRUE)
}
| 1 |
#' Retrieve Project Report
#'
#' Retrieve the report file from a local or remote project.
#'
#' @param project Path to a local project, or the GitHub account and repository name
#'   (\code{"{account_name}/{repo_name}"}) of a remote project.
#' @param branch Name of the remote repository branch.
#' @param commit Commit hash; overrides \code{branch}.
#' @param provider Base URL of the remote repository provider.
#' @param cache Directory to store retrieved report in (at \code{{cache}/{project}/report.json.gz}).
#' @param refresh Logical; if \code{TRUE}, will always retrieve a fresh copy of the report,
#'   even if a copy exists in \code{cache}.
#' @returns A data collection project report (invisibly): a list with entries
#'   \code{date}, \code{settings} (with \code{report_url} added), \code{source_times},
#'   \code{issues}, \code{logs}, \code{metadata}, and \code{processes}.
#' @family data user interface functions
#' @examples
#' report <- dcf_report("dissc-yale/pophive_demo")
#' report$date
#' jsonlite::toJSON(report$settings, auto_unbox = TRUE, pretty = TRUE)
#' @export
dcf_report <- function(
  project = "dissc-yale/pophive_demo",
  branch = "main",
  commit = NULL,
  provider = "https://github.com",
  cache = tempdir(),
  refresh = FALSE
) {
  if (dir.exists(project)) {
    # Local project: read the report in place; no URL to record.
    report_file <- paste0(project, "/report.json.gz")
    if (!file.exists(report_file)) {
      cli::cli_abort("report does not exist at {report_file}")
    }
    report_url <- ""
  } else {
    # Remote project: resolve the raw-file URL for the requested ref
    # (a commit hash takes precedence over a branch name).
    report_file <- paste0(cache, "/", project, "/report.json.gz")
    report_url <- paste0(
      provider,
      "/",
      project,
      "/raw/",
      if (is.null(commit)) paste0("refs/heads/", branch) else commit,
      "/report.json.gz"
    )
    # Only hit the network when asked to refresh or when no cached copy exists.
    if (refresh || !file.exists(report_file)) {
      dir.create(dirname(report_file), showWarnings = FALSE, recursive = TRUE)
      req <- curl::curl_fetch_disk(report_url, report_file)
      if (req$status_code != 200L) {
        # Remove the partial/error body so a later call does not treat it
        # as a valid cached report.
        unlink(report_file)
        cli::cli_abort(
          "failed to retrieve report at {report_url}: {req$status_code}, {req$content}"
        )
      }
    }
  }
  report <- dcf_attempt_read_json(report_file)
  # Record where the report came from ("" for local projects).
  report$settings$report_url <- report_url
  invisible(report)
}
| 1 |
#' Map States to Health Regions
#'
#' Maps state-level FIPS codes (the first two characters of a GEOID) to
#' U.S. Department of Health and Human Services (HHS) regions.
#'
#' @param geoids Character vector of GEOIDs; only the first two characters
#'   (the state FIPS code) are used for the lookup.
#' @param prefix A prefix to add to region IDs.
#' @returns A character vector of Health Region names the same length as
#'   \code{geoids}; \code{NA} for unrecognized state codes.
#' @examples
#' dcf_to_health_region(c("01", "01001", "02", "02001"))
#' @export
dcf_to_health_region <- function(geoids, prefix = "Region ") {
  # State FIPS code -> HHS region number, per the HHS regional offices listing.
  regions <- c(
    "01" = 4,
    "02" = 10,
    "04" = 9,
    "05" = 6,
    "06" = 9,
    "08" = 8,
    "09" = 1,
    "10" = 3,
    "11" = 3,
    "12" = 4,
    "13" = 4,
    "15" = 9,
    "16" = 10,
    "17" = 5,
    "18" = 5,
    "19" = 7,
    "20" = 7,
    "21" = 4,
    "22" = 6,
    "23" = 1,
    "24" = 3,
    "25" = 1,
    "26" = 5,
    "27" = 5,
    "28" = 4,
    "29" = 7,
    "30" = 8,
    "31" = 7,
    "32" = 9,
    "33" = 1,
    "34" = 2,
    "35" = 6,
    "36" = 2,
    "37" = 4,
    "38" = 8,
    "39" = 5,
    "40" = 6,
    "41" = 10,
    "42" = 3,
    "44" = 1,
    "45" = 4,
    "46" = 8,
    "47" = 4,
    "48" = 6,
    "49" = 8,
    "50" = 1,
    "51" = 3,
    "53" = 10,
    "54" = 3,
    "55" = 5,
    "56" = 8,
    "60" = 9, # American Samoa (added; HHS Region 9)
    "66" = 9, # Guam: was 6, but HHS places Guam in Region 9
    "69" = 9, # Northern Mariana Islands (added; HHS Region 9)
    "72" = 2, # Puerto Rico
    "74" = 2, # U.S. Minor Outlying Islands; kept for backward compatibility,
    # though 78 (Virgin Islands) was likely the intended code
    "78" = 2 # U.S. Virgin Islands (added; HHS Region 2)
  )
  # Prepend the prefix while preserving names (regions becomes character),
  # then look up by the two-digit state portion of each GEOID; codes not in
  # the table yield NA.
  regions[] <- paste0(prefix, regions)
  unname(regions[substring(geoids, 1L, 2L)])
}
| 1 |
# Read a JSON file, returning NULL (strict = FALSE) or aborting (strict = TRUE)
# when the file is missing or malformed.
#
# @param path Path to the JSON file.
# @param ... Additional arguments passed to `jsonlite::read_json`.
# @param strict Logical; if `TRUE`, a failed read is an error rather than NULL.
# @returns The parsed JSON content, or `NULL` on failure when `strict = FALSE`.
dcf_attempt_read_json <- function(path, ..., strict = TRUE) {
  # Any read/parse error collapses to NULL; strict mode converts that to an abort.
  contents <- tryCatch(jsonlite::read_json(path, ...), error = function(e) NULL)
  if (strict && is.null(contents)) {
    # Fixed: the {.file ...} cli inline-markup span was missing its closing
    # brace ("{.file {path}"), which malformed the abort message itself.
    cli::cli_abort("failed to read {.file {path}}")
  }
  contents
}
|
| 8 |
# Read a project's settings.json.
#
# @param project_dir Path to the Data Collection Framework project.
# @param strict Logical; if `TRUE`, a missing settings file is an error;
#   otherwise a default standalone settings list is returned.
# @returns The parsed settings list, or (when the file is missing and
#   `strict = FALSE`) a default list with `name`, `data_dir`, and `standalone`.
dcf_read_settings <- function(project_dir = ".", strict = FALSE) {
  settings_file <- paste0(project_dir, "/settings.json")
  if (!file.exists(settings_file)) {
    if (strict) {
      cli::cli_abort(
        "{.arg project_dir} ({project_dir}) does not appear to be a Data Collection Framework project"
      )
    } else {
      # No settings file: treat the directory itself as a standalone project.
      # Arguments named (original used positional "/" and FALSE for winslash
      # and mustWork).
      return(list(
        name = basename(normalizePath(project_dir, winslash = "/", mustWork = FALSE)),
        data_dir = ".",
        standalone = TRUE
      ))
    }
  }
  dcf_attempt_read_json(settings_file)
}
|
| 25 |
# Initialize a git repository in `dir` if one does not already exist.
#
# @param dir Path to the directory to initialize.
# @returns Nothing; runs `git init` in `dir` as a side effect.
dcf_init_git <- function(dir) {
  # Fixed: the original checked `paste0(dir, ".git")`, which omits the path
  # separator and only works when `dir` happens to end in "/" (as the caller's
  # `base_path` does); for any other path it checked e.g. "proj.git".
  if (!dir.exists(file.path(dir, ".git"))) {
    wd <- getwd()
    # Restore the working directory even if git errors; add = TRUE preserves
    # any handlers a caller may have registered.
    on.exit(setwd(wd), add = TRUE)
    setwd(dir)
    system2("git", "init")
    setwd(wd)
  }
}
| 1 |
#' Interact with a Process File
#'
#' Read or update the current process file.
#'
#' See the \href{https://dissc-yale.github.io/dcf/articles/standards.html#scripts}{script standards}
#' for examples of using this within a sub-project script.
#'
#' @param path Path to the process JSON file.
#' @param updated An updated version of the process definition. If specified, will
#'   write this as the new process file, rather than reading any existing file.
#' @returns The process definition of the source project.
#' @examples
#' epic_process_file <- "../../../pophive/pophive_demo/data/epic/process.json"
#' if (file.exists(epic_process_file)) {
#'   dcf_process_record(epic_process_file)
#' }
#' @export
dcf_process_record <- function(path = "process.json", updated = NULL) {
  # Write mode: persist `updated` (defaulting its type) and return it.
  if (!is.null(updated)) {
    if (is.null(updated$type)) {
      updated$type <- "source"
    }
    jsonlite::write_json(updated, path, auto_unbox = TRUE, pretty = TRUE)
    return(updated)
  }
  # Read mode: the file must exist.
  if (!file.exists(path)) {
    cli::cli_abort("process file {path} does not exist")
  }
  record <- dcf_attempt_read_json(path)
  # Fill in defaults: the containing directory's name, and a "source" type.
  if (is.null(record$name)) {
    record$name <- basename(dirname(path))
  }
  if (is.null(record$type)) {
    record$type <- "source"
  }
  record
}