1 |
#' Download Census Population Data |
|
2 |
#' |
|
3 |
#' Download American Community Survey population data from the U.S. Census Bureau. |
|
4 |
#' |
|
5 |
#' @param year Data year. |
|
6 |
#' @param out_dir Directory to download the file to. |
|
7 |
#' @param state_only Logical; if \code{TRUE}, will only load state data. |
|
8 |
#' Will still download county data. |
|
9 |
#' @param overwrite Logical; if \code{TRUE}, will re-download and overwrite existing data. |
|
10 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages. |
|
11 |
#' @returns A \code{data.frame} including \code{GEOID} and \code{region_name} |
|
12 |
#' for states and counties, along with their population, in total and within |
|
13 |
#' age brackets. |
|
14 |
#' @examples |
|
15 |
#' if (file.exists("../../resources/census_population_2021.csv.xz")) { |
|
16 |
#' dcf_load_census(2021, "../../resources")[1:10, ] |
|
17 |
#' } |
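#'
#' # a minimal sketch of a fresh download (not run; requires internet access):
#' \dontrun{
#' states <- dcf_load_census(2021, tempdir(), state_only = TRUE)
#' }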
|
18 |
#' @export |
|
19 | ||
20 |
dcf_load_census <- function( |
|
21 |
year = 2021, |
|
22 |
out_dir = NULL, |
|
23 |
state_only = FALSE, |
|
24 |
overwrite = FALSE, |
|
25 |
verbose = TRUE |
|
26 |
) { |
|
27 | 2x |
out_file <- paste0(out_dir, "/census_population_", year, ".csv.xz") |
28 | 2x |
write_out <- !is.null(out_dir) |
29 | 2x |
if (!overwrite && write_out && file.exists(out_file)) { |
30 | 1x |
if (verbose) { |
31 | 1x |
cli::cli_progress_step("reading in existing file") |
32 |
} |
|
33 | 1x |
invisible(as.data.frame(vroom::vroom( |
34 | 1x |
out_file, |
35 | 1x |
delim = ",", |
36 | 1x |
col_types = list( |
37 | 1x |
GEOID = "c", |
38 | 1x |
region_name = "c", |
39 | 1x |
Total = "i", |
40 | 1x |
`<10 Years` = "i", |
41 | 1x |
`10-14 Years` = "i", |
42 | 1x |
`15-19 Years` = "i", |
43 | 1x |
`20-39 Years` = "i", |
44 | 1x |
`40-64 Years` = "i", |
45 | 1x |
`65+ Years` = "i" |
46 |
), |
|
47 | 1x |
n_max = if (state_only) 52L else Inf |
48 |
))) |
|
49 |
} else { |
|
50 |
# GEOID to region name mapping |
|
51 | 1x |
id_url <- "https://www2.census.gov/geo/docs/reference/codes2020/national_" |
52 | 1x |
if (verbose) { |
53 | 1x |
cli::cli_progress_step("downloading state IDs map") |
54 |
} |
|
55 | 1x |
state_ids <- vroom::vroom( |
56 | 1x |
paste0(id_url, "state2020.txt"), |
57 | 1x |
delim = "|", |
58 | 1x |
col_types = list( |
59 | 1x |
STATE = "c", |
60 | 1x |
STATEFP = "c", |
61 | 1x |
STATENS = "c", |
62 | 1x |
STATE_NAME = "c" |
63 |
) |
|
64 |
) |
|
65 | 1x |
if (verbose) { |
66 | 1x |
cli::cli_progress_step("downloading county IDs map") |
67 |
} |
|
68 | 1x |
county_ids <- vroom::vroom( |
69 | 1x |
paste0(id_url, "county2020.txt"), |
70 | 1x |
delim = "|", |
71 | 1x |
col_types = list( |
72 | 1x |
STATE = "c", |
73 | 1x |
STATEFP = "c", |
74 | 1x |
COUNTYFP = "c", |
75 | 1x |
COUNTYNS = "c", |
76 | 1x |
COUNTYNAME = "c", |
77 | 1x |
CLASSFP = "c", |
78 | 1x |
FUNCSTAT = "c" |
79 |
) |
|
80 |
) |
|
81 | 1x |
region_name <- structure(
82 | 1x |
sub( |
83 | 1x |
" County", |
84 |
"", |
|
85 | 1x |
c( |
86 | 1x |
state_ids$STATE_NAME, |
87 | 1x |
paste0(county_ids$COUNTYNAME, ", ", county_ids$STATE) |
88 |
), |
|
89 | 1x |
fixed = TRUE |
90 |
), |
|
91 | 1x |
names = c( |
92 | 1x |
state_ids$STATEFP, |
93 | 1x |
paste0(county_ids$STATEFP, county_ids$COUNTYFP) |
94 |
) |
|
95 |
) |
|
96 | ||
97 |
# population data |
|
98 | ||
99 |
## age group labels from IDs |
|
100 | 1x |
if (verbose) { |
101 | 1x |
cli::cli_progress_step("downloading ACS variable lables") |
102 |
} |
|
103 | 1x |
labels <- vroom::vroom( |
104 | 1x |
paste0( |
105 | 1x |
"https://www2.census.gov/programs-surveys/acs/summary_file/", |
106 | 1x |
min(2021L, year), |
107 | 1x |
"/sequence-based-SF/documentation/user_tools/ACS_5yr_Seq_Table_Number_Lookup.txt" |
108 |
), |
|
109 | 1x |
delim = ",", |
110 | 1x |
col_types = list( |
111 | 1x |
`File ID` = "c", |
112 | 1x |
`Table ID` = "c", |
113 | 1x |
`Sequence Number` = "c", |
114 | 1x |
`Line Number` = "d", |
115 | 1x |
`Start Position` = "i", |
116 | 1x |
`Total Cells in Table` = "c", |
117 | 1x |
`Total Cells in Sequence` = "i", |
118 | 1x |
`Table Title` = "c", |
119 | 1x |
`Subject Area` = "c" |
120 |
) |
|
121 |
) |
|
122 | 1x |
variable_labels <- structure( |
123 | 1x |
labels$`Table Title`, |
124 | 1x |
names = paste0( |
125 | 1x |
labels$`Table ID`, |
126 | 1x |
"_E", |
127 | 1x |
formatC(labels$`Line Number`, width = 3L, flag = "0")
128 |
) |
|
129 |
) |
|
130 | ||
131 |
## age group counts |
|
132 | 1x |
url <- paste0( |
133 | 1x |
"https://www2.census.gov/programs-surveys/acs/summary_file/", |
134 | 1x |
year, |
135 | 1x |
"/table-based-SF/data/5YRData/acsdt5y", |
136 | 1x |
year, |
137 | 1x |
"-b01001.dat" |
138 |
) |
|
139 | 1x |
if (verbose) { |
140 | 1x |
cli::cli_progress_step("downloading population data") |
141 |
} |
|
142 | 1x |
data <- vroom::vroom(url, delim = "|", col_types = list(GEO_ID = "c")) |
143 | 1x |
data <- data[ |
144 | 1x |
grep("0[45]00000US", data$GEO_ID), |
145 | 1x |
grep("E", colnames(data), fixed = TRUE) |
146 |
] |
|
147 | 1x |
colnames(data)[-1L] <- variable_labels[colnames(data)[-1L]] |
148 | ||
149 | 1x |
age_groups <- list( |
150 | 1x |
Total = "Total:", |
151 | 1x |
`<10 Years` = c("Under 5 years", "5 to 9 years"), |
152 | 1x |
`10-14 Years` = "10 to 14 years", |
153 | 1x |
`15-19 Years` = c("15 to 17 years", "18 and 19 years"), |
154 | 1x |
`20-39 Years` = c( |
155 | 1x |
"20 years", |
156 | 1x |
"21 years", |
157 | 1x |
"22 to 24 years", |
158 | 1x |
"25 to 29 years", |
159 | 1x |
"30 to 34 years", |
160 | 1x |
"35 to 39 years" |
161 |
), |
|
162 | 1x |
`40-64 Years` = c( |
163 | 1x |
"40 to 44 years", |
164 | 1x |
"45 to 49 years", |
165 | 1x |
"50 to 54 years", |
166 | 1x |
"55 to 59 years", |
167 | 1x |
"60 and 61 years", |
168 | 1x |
"62 to 64 years" |
169 |
), |
|
170 | 1x |
`65+ Years` = c( |
171 | 1x |
"65 and 66 years", |
172 | 1x |
"67 to 69 years", |
173 | 1x |
"70 to 74 years", |
174 | 1x |
"75 to 79 years", |
175 | 1x |
"80 to 84 years", |
176 | 1x |
"85 years and over" |
177 |
) |
|
178 |
) |
|
179 | 1x |
if (verbose) { |
180 | 1x |
cli::cli_progress_step("agregating across sex and fine age groups") |
181 |
} |
|
182 | 1x |
pop <- cbind( |
183 | 1x |
data.frame(GEOID = substring(data$GEO_ID, 10L), region_name = ""), |
184 | 1x |
do.call( |
185 | 1x |
cbind, |
186 | 1x |
lapply(age_groups, function(l) rowSums(data[, colnames(data) %in% l])) |
187 |
) |
|
188 |
) |
|
189 | 1x |
pop$region_name <- region_name[pop$GEOID]
190 | 1x |
states <- pop[1L:52L, ] |
191 | 1x |
health_regions <- as.data.frame(do.call( |
192 | 1x |
rbind, |
193 | 1x |
lapply( |
194 | 1x |
split( |
195 | 1x |
states[, -(1L:2L)], |
196 | 1x |
dcf_to_health_region(states$GEOID, "hhs_") |
197 |
), |
|
198 | 1x |
colSums |
199 |
) |
|
200 |
)) |
|
201 | 1x |
health_regions$GEOID <- rownames(health_regions) |
202 | 1x |
health_regions$region_name <- sub( |
203 | 1x |
"hhs_", |
204 | 1x |
"Health Region ", |
205 | 1x |
rownames(health_regions), |
206 | 1x |
fixed = TRUE |
207 |
) |
|
208 | 1x |
pop <- rbind(pop, health_regions[, colnames(pop)]) |
209 | ||
210 | 1x |
if (write_out) { |
211 | 1x |
if (verbose) { |
212 | 1x |
cli::cli_progress_step("writing output") |
213 |
} |
|
214 | 1x |
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) |
215 | 1x |
vroom::vroom_write(pop, out_file, ",") |
216 |
} |
|
217 | 1x |
invisible(if (state_only) states else pop) |
218 |
} |
|
219 |
} |
1 |
#' Add documentation of a dataset to a datapackage
|
2 |
#' |
|
3 |
#' Add information about variables in a dataset to a \code{datapackage.json} metadata file. |
|
4 |
#' |
|
5 |
#' @param filename A character vector of paths to plain-text tabular data files, relative to \code{dir}. |
|
6 |
#' @param meta Information about each data file. A list with a list entry for each entry in |
|
7 |
#' \code{filename}; see details. If a single list is provided for multiple data files, it will apply to all. |
|
8 |
#' @param packagename Package to add the metadata to; path to the \code{.json} file relative to |
|
9 |
#' \code{dir}, or a list with the read-in version. |
|
10 |
#' @param dir Directory in which to look for \code{filename}, and write \code{packagename}. |
|
11 |
#' @param write Logical; if \code{FALSE}, returns the \code{paths} metadata without reading or rewriting |
|
12 |
#' \code{packagename}. |
|
13 |
#' @param refresh Logical; if \code{FALSE}, will retain any existing dataset information. |
|
14 |
#' @param sha A number specifying the Secure Hash Algorithm function, |
|
15 |
#' if \code{openssl} is available (checked with \code{Sys.which('openssl')}). |
|
16 |
#' @param pretty Logical; if \code{TRUE}, will pretty-print the datapackage. |
|
17 |
#' @param summarize_ids Logical; if \code{TRUE}, will include ID columns in schema field summaries. |
|
18 |
#' @param open_after Logical; if \code{TRUE}, opens the written datapackage after saving. |
|
19 |
#' @param verbose Logical; if \code{FALSE}, will not show status messages. |
|
20 |
#' @details |
|
21 |
#' \code{meta} should be a list with an unnamed entry for each entry in \code{filename},
|
22 |
#' and each entry can include a named entry for any of these: |
|
23 |
#' \describe{ |
|
24 |
#' \item{source}{ |
|
25 |
#' A list or list of lists with entries for at least \code{name}, and ideally for \code{url}. |
|
26 |
#' } |
|
27 |
#' \item{ids}{ |
|
28 |
#' A list or list of lists with entries for at least \code{variable} (the name of a variable in the dataset). |
|
29 |
#' Might also include \code{map} with a list or path to a JSON file resulting in a list with an |
|
30 |
#' entry for each ID, containing additional information about that entity, to be read in as its features.
|
31 |
#' All files will be loaded to help with aggregation, but local files will be included in the datapackage, |
|
32 |
#' whereas hosted files will be loaded client-side. |
|
33 |
#' } |
|
34 |
#' \item{time}{ |
|
35 |
#' A string giving the name of a variable in the dataset representing a repeated observation of the same entity. |
|
36 |
#' } |
|
37 |
#' \item{variables}{ |
|
38 |
#' A list with named entries providing more information about the variables in the dataset. |
|
39 |
#' See \code{\link{dcf_measure_info}}. |
|
40 |
#' } |
|
41 |
#' } |
|
42 |
#' @examples |
|
43 |
#' \dontrun{ |
|
44 |
#' # write example data |
|
45 |
#' write.csv(mtcars, "mtcars.csv") |
|
46 |
#' |
|
47 |
#' # add it to an existing datapackage.json file in the current working directory |
|
48 |
#' dcf_datapackage_add("mtcars.csv") |
|
49 |
#' } |
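#'
#' # a fuller sketch with source, ID, and time entries; the "model" and
#' # "year" columns are hypothetical (mtcars has neither), so substitute
#' # columns that exist in your data
#' \dontrun{
#' dcf_datapackage_add(
#'   "mtcars.csv",
#'   meta = list(
#'     source = list(name = "Motor Trend", url = "https://example.com"),
#'     ids = list(variable = "model"),
#'     time = "year"
#'   )
#' )
#' }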
|
50 |
#' @return An invisible version of the updated datapackage, which is also written to |
|
51 |
#' \code{datapackage.json} if \code{write = TRUE}. |
|
52 |
#' @seealso Initialize the \code{datapackage.json} file with \code{\link{dcf_datapackage_init}}. |
|
53 |
#' @export |
|
54 | ||
55 |
dcf_datapackage_add <- function( |
|
56 |
filename, |
|
57 |
meta = list(), |
|
58 |
packagename = "datapackage.json", |
|
59 |
dir = ".", |
|
60 |
write = TRUE, |
|
61 |
refresh = TRUE, |
|
62 |
sha = "512", |
|
63 |
pretty = FALSE, |
|
64 |
summarize_ids = FALSE, |
|
65 |
open_after = FALSE, |
|
66 |
verbose = interactive() |
|
67 |
) { |
|
68 | 3x |
if (missing(filename)) { |
69 | ! |
cli::cli_abort("{.arg filename} must be specified") |
70 |
} |
|
71 | 3x |
setnames <- names(filename) |
72 | 3x |
if (file.exists(filename[[1]])) { |
73 | ! |
if (dir == ".") { |
74 | ! |
dir <- dirname(filename[[1]]) |
75 |
} |
|
76 | ! |
filename <- basename(filename) |
77 |
} |
|
78 | 3x |
if (any(!file.exists(paste0(dir, "/", filename)))) { |
79 | ! |
filename <- filename[!file.exists(paste0(dir, "/", filename))]
80 | ! |
cli::cli_abort("{?a file/files} did not exist: {filename}") |
81 |
} |
|
82 | 3x |
package <- if ( |
83 | 3x |
is.character(packagename) && file.exists(paste0(dir, "/", packagename)) |
84 |
) { |
|
85 | 3x |
paste0(dir, "/", packagename) |
86 |
} else { |
|
87 | ! |
packagename |
88 |
} |
|
89 | 3x |
if (write) { |
90 | 3x |
if (is.character(package)) { |
91 | 3x |
package <- paste0(dir, "/", packagename) |
92 | 3x |
package <- if (file.exists(package)) { |
93 | 3x |
packagename <- package |
94 | 3x |
jsonlite::read_json(package) |
95 |
} else { |
|
96 | ! |
dcf_datapackage_init( |
97 | ! |
if (!is.null(setnames)) setnames[[1]] else filename[[1]], |
98 | ! |
dir = dir |
99 |
) |
|
100 |
} |
|
101 |
} |
|
102 | 3x |
if (!is.list(package)) { |
103 | ! |
cli::cli_abort(c( |
104 | ! |
"{.arg package} does not appear to be in the right format", |
105 | ! |
i = "this should be (or be read in from JSON as) a list with a {.code resource} entry" |
106 |
)) |
|
107 |
} |
|
108 |
} |
|
109 | 3x |
if (!is.list(package)) { |
110 | ! |
package <- list() |
111 |
} |
|
112 | 3x |
single_meta <- FALSE |
113 | 3x |
metas <- if (!is.null(names(meta))) { |
114 | 3x |
if (!is.null(setnames) && all(setnames %in% names(meta))) { |
115 | ! |
meta[setnames] |
116 |
} else { |
|
117 | 3x |
single_meta <- TRUE |
118 | 3x |
if (length(meta$variables) == 1 && is.character(meta$variables)) { |
119 | ! |
if (!file.exists(meta$variables)) { |
120 | ! |
meta$variables <- paste0(dir, "/", meta$variables) |
121 |
} |
|
122 | ! |
if (file.exists(meta$variables)) { |
123 | ! |
meta$variables <- jsonlite::read_json(meta$variables) |
124 |
} |
|
125 |
} |
|
126 | 3x |
meta$variables <- replace_equations(meta$variables) |
127 | 3x |
meta |
128 |
} |
|
129 |
} else { |
|
130 | ! |
meta[seq_along(filename)] |
131 |
} |
|
132 | 3x |
if (!single_meta) { |
133 | ! |
metas <- lapply(metas, function(m) { |
134 | ! |
m$variables <- replace_equations(m$variables) |
135 | ! |
m |
136 |
}) |
|
137 |
} |
|
138 | 3x |
collect_metadata <- function(file) { |
139 | 3x |
f <- paste0(dir, "/", filename[[file]]) |
140 | 3x |
m <- if (single_meta) meta else metas[[file]] |
141 | 3x |
format <- if (grepl(".parquet", f, fixed = TRUE)) { |
142 | ! |
"parquet" |
143 | 3x |
} else if (grepl(".csv", f, fixed = TRUE)) { |
144 | 3x |
"csv" |
145 | 3x |
} else if (grepl(".rds", f, fixed = TRUE)) { |
146 | ! |
"rds" |
147 |
} else { |
|
148 | ! |
"tsv" |
149 |
} |
|
150 | 3x |
if (is.na(format)) { |
151 | ! |
format <- "rds" |
152 |
} |
|
153 | 3x |
info <- file.info(f) |
154 | 3x |
metas <- list() |
155 | 3x |
unpack_meta <- function(n) { |
156 | 9x |
if (!length(m[[n]])) { |
157 | 3x |
list() |
158 | 6x |
} else if (is.list(m[[n]][[1]])) { |
159 | ! |
m[[n]] |
160 |
} else { |
|
161 | 6x |
list(m[[n]]) |
162 |
} |
|
163 |
} |
|
164 | 3x |
ids <- unpack_meta("ids") |
165 | 3x |
idvars <- NULL |
166 | 3x |
for (i in seq_along(ids)) { |
167 | 3x |
if (is.list(ids[[i]])) { |
168 | 3x |
if ( |
169 | ! |
length(ids[[i]]$map) == 1 && |
170 | ! |
is.character(ids[[i]]$map) && |
171 | ! |
file.exists(ids[[i]]$map) |
172 |
) { |
|
173 | ! |
ids[[i]]$map_content <- paste( |
174 | ! |
readLines(ids[[i]]$map, warn = FALSE), |
175 | ! |
collapse = "" |
176 |
) |
|
177 |
} |
|
178 |
} else { |
|
179 | 3x |
ids[[i]] <- list(variable = ids[[i]]) |
180 |
} |
|
181 | 3x |
if (!ids[[i]]$variable %in% idvars) idvars <- c(idvars, ids[[i]]$variable) |
182 |
} |
|
183 | 3x |
data <- if (format == "rds") { |
184 | ! |
tryCatch(readRDS(f), error = function(e) NULL) |
185 | 3x |
} else if (format == "parquet") { |
186 | ! |
tryCatch(arrow::read_parquet(f), error = function(e) NULL) |
187 |
} else { |
|
188 | 3x |
attempt_read(f, c("geography", "time", idvars)) |
189 |
} |
|
190 | 3x |
if (is.null(data)) { |
191 | ! |
cli::cli_abort(c( |
192 | ! |
paste0("failed to read in the data file ({.file {f}})"), |
193 | ! |
i = "check that it is in a compatible format" |
194 |
)) |
|
195 |
} |
|
196 | 3x |
if (!all(rownames(data) == seq_len(nrow(data)))) { |
197 | ! |
data <- cbind(`_row` = rownames(data), data) |
198 |
} |
|
199 | 3x |
timevar <- unlist(unpack_meta("time")) |
200 | 3x |
times <- if (is.null(timevar)) rep(1, nrow(data)) else data[[timevar]] |
201 | 3x |
times_unique <- unique(times) |
202 | 3x |
if (!single_meta) { |
203 | ! |
varinf <- unpack_meta("variables") |
204 | ! |
if (length(varinf) == 1 && is.character(varinf[[1]])) { |
205 | ! |
if (!file.exists(varinf[[1]])) { |
206 | ! |
varinf[[1]] <- paste0(dir, "/", varinf[[1]]) |
207 |
} |
|
208 | ! |
if (file.exists(varinf[[1]])) { |
209 | ! |
if (varinf[[1]] %in% names(metas)) { |
210 | ! |
varinf <- metas[[varinf[[1]]]] |
211 |
} else { |
|
212 | ! |
varinf <- metas[[varinf[[1]]]] <- dcf_measure_info( |
213 | ! |
varinf[[1]], |
214 | ! |
write = FALSE, |
215 | ! |
render = TRUE |
216 |
) |
|
217 |
} |
|
218 | ! |
varinf <- varinf[varinf != ""] |
219 |
} |
|
220 |
} |
|
221 | ! |
varinf_full <- names(varinf) |
222 | ! |
varinf_suf <- sub("^[^:]+:", "", varinf_full) |
223 |
} |
|
224 | 3x |
res <- list( |
225 | 3x |
bytes = as.integer(info$size), |
226 | 3x |
encoding = stringi::stri_enc_detect(f)[[1]][1, 1], |
227 | 3x |
md5 = tools::md5sum(f)[[1]], |
228 | 3x |
format = format, |
229 | 3x |
name = if (!is.null(setnames)) { |
230 | ! |
setnames[file] |
231 | 3x |
} else if (!is.null(m$name)) { |
232 | ! |
m$name |
233 |
} else { |
|
234 | 3x |
sub("\\.[^.]*$", "", basename(filename[[file]])) |
235 |
}, |
|
236 | 3x |
filename = filename[[file]], |
237 | 3x |
versions = get_versions(f), |
238 | 3x |
source = unpack_meta("source"), |
239 | 3x |
ids = ids, |
240 | 3x |
id_length = if (length(idvars)) { |
241 | 3x |
id_lengths <- nchar(data[[idvars[1]]]) |
242 | 3x |
id_lengths <- id_lengths[!is.na(id_lengths)] |
243 | ! |
if (all(id_lengths == id_lengths[1])) id_lengths[1] else 0 |
244 |
} else { |
|
245 | ! |
0 |
246 |
}, |
|
247 | 3x |
time = timevar, |
248 | 3x |
profile = "data-resource", |
249 | 3x |
created = as.character(info$mtime), |
250 | 3x |
last_modified = as.character(info$ctime), |
251 | 3x |
row_count = nrow(data), |
252 | 3x |
entity_count = if (length(idvars)) { |
253 | 3x |
length(unique(data[[idvars[1]]])) |
254 |
} else { |
|
255 | ! |
nrow(data) |
256 |
}, |
|
257 | 3x |
schema = list( |
258 | 3x |
fields = lapply( |
259 | 3x |
if (summarize_ids) { |
260 | 3x |
colnames(data) |
261 |
} else { |
|
262 | ! |
colnames(data)[!colnames(data) %in% idvars] |
263 |
}, |
|
264 | 3x |
function(cn) { |
265 | 9x |
v <- data[[cn]] |
266 | 9x |
invalid <- !is.finite(v) |
267 | 9x |
r <- list(name = cn, duplicates = sum(duplicated(v))) |
268 | 9x |
if (!single_meta) { |
269 | ! |
if (cn %in% varinf_full) { |
270 | ! |
r$info <- varinf[[cn]] |
271 | ! |
} else if (cn %in% varinf_suf) { |
272 | ! |
r$info <- varinf[[which(varinf_suf == cn)]] |
273 |
} |
|
274 | ! |
r$info <- r$info[r$info != ""] |
275 |
} |
|
276 | 9x |
su <- !is.na(v) |
277 | 9x |
if (any(su)) { |
278 | 9x |
r$time_range <- which(times_unique %in% range(times[su])) - 1 |
279 | 9x |
r$time_range <- if (length(r$time_range)) { |
280 | 9x |
r$time_range[c(1, length(r$time_range))] |
281 |
} else { |
|
282 | ! |
c(-1, -1) |
283 |
} |
|
284 |
} else { |
|
285 | ! |
r$time_range <- c(-1, -1) |
286 |
} |
|
287 | 9x |
if (!is.character(v) && all(invalid)) { |
288 | ! |
r$type <- "unknown" |
289 | ! |
r$missing <- length(v) |
290 | 9x |
} else if (is.numeric(v)) { |
291 | 3x |
r$type <- if (all(invalid | as.integer(v) == v)) { |
292 | 3x |
"integer" |
293 |
} else { |
|
294 | ! |
"float" |
295 |
} |
|
296 | 3x |
r$missing <- sum(invalid) |
297 | 3x |
r$mean <- round(mean(v, na.rm = TRUE), 6) |
298 | 3x |
r$sd <- round(stats::sd(v, na.rm = TRUE), 6) |
299 | 3x |
r$min <- round(min(v, na.rm = TRUE), 6) |
300 | 3x |
r$max <- round(max(v, na.rm = TRUE), 6) |
301 |
} else { |
|
302 | 6x |
r$type <- "string" |
303 | 6x |
if (!is.factor(v)) { |
304 | 6x |
v <- as.factor(as.character(v)) |
305 |
} |
|
306 | 6x |
r$missing <- sum(is.na(v) | is.nan(v) | grepl("^[\\s.-]$", v)) |
307 | 6x |
r$table <- structure(as.list(tabulate(v)), names = levels(v)) |
308 |
} |
|
309 | 9x |
r |
310 |
} |
|
311 |
) |
|
312 |
) |
|
313 |
) |
|
314 | 3x |
if (!single_meta && "_references" %in% names(varinf)) { |
315 | ! |
res[["_references"]] <- varinf[["_references"]] |
316 |
} |
|
317 | 3x |
if (Sys.which("openssl") != "") { |
318 | 3x |
res[[paste0("sha", sha)]] <- calculate_sha(f, sha) |
319 |
} |
|
320 | 3x |
res |
321 |
} |
|
322 | 3x |
metadata <- lapply(seq_along(filename), collect_metadata) |
323 | 3x |
if (single_meta) { |
324 | 3x |
package$measure_info <- lapply(meta$variables, function(e) e[e != ""]) |
325 |
} |
|
326 | 3x |
names <- vapply(metadata, "[[", "", "filename") |
327 | 3x |
for (resource in package$resources) { |
328 | 2x |
if (length(resource$versions)) { |
329 | ! |
su <- which(names %in% resource$filename) |
330 | ! |
if (length(su)) { |
331 | ! |
if (length(metadata[[su]]$versions)) { |
332 | ! |
metadata[[su]]$versions <- rbind( |
333 | ! |
metadata[[su]]$versions, |
334 | ! |
if (is.data.frame(resource$versions)) { |
335 | ! |
resource$versions |
336 |
} else { |
|
337 | ! |
as.data.frame(do.call(cbind, resource$versions)) |
338 |
} |
|
339 |
) |
|
340 | ! |
metadata[[su]]$versions <- metadata[[su]]$versions[ |
341 | ! |
!duplicated(metadata[[su]]$versions), |
342 |
] |
|
343 |
} |
|
344 |
} |
|
345 |
} |
|
346 |
} |
|
347 | 3x |
if (refresh) { |
348 | 3x |
package$resources <- metadata |
349 |
} else { |
|
350 | ! |
package$resources <- c( |
351 | ! |
metadata, |
352 | ! |
package$resources[ |
353 | ! |
!(vapply(package$resources, "[[", "", "filename") %in% names) |
354 |
] |
|
355 |
) |
|
356 |
} |
|
357 | 3x |
if (write) { |
358 | 3x |
packagename <- if (is.character(packagename)) { |
359 | 3x |
packagename |
360 |
} else { |
|
361 | ! |
"datapackage.json" |
362 |
} |
|
363 | 3x |
jsonlite::write_json( |
364 | 3x |
package, |
365 | 3x |
if (file.exists(packagename)) { |
366 | 3x |
packagename |
367 |
} else { |
|
368 | ! |
paste0(dir, "/", packagename) |
369 |
}, |
|
370 | 3x |
auto_unbox = TRUE, |
371 | 3x |
digits = 6, |
372 | 3x |
dataframe = "columns", |
373 | 3x |
pretty = pretty |
374 |
) |
|
375 | 3x |
if (verbose) { |
376 | ! |
cli::cli_bullets(c( |
377 | ! |
v = paste( |
378 | ! |
if (refresh) "updated resource in" else "added resource to", |
379 | ! |
"datapackage.json:" |
380 |
), |
|
381 | ! |
"*" = paste0("{.path ", packagename, "}") |
382 |
)) |
|
383 | ! |
if (open_after) rstudioapi::navigateToFile(packagename) |
384 |
} |
|
385 |
} |
|
386 | 3x |
invisible(package) |
387 |
} |
|
388 | ||
389 |
get_versions <- function(file) { |
|
390 | 3x |
log <- suppressWarnings(system2( |
391 | 3x |
"git", |
392 | 3x |
c("log", file), |
393 | 3x |
stdout = TRUE |
394 |
)) |
|
395 | 3x |
if (is.null(attr(log, "status"))) { |
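# a missing status attribute means the git call succeeded; split the log
# into per-commit chunks of hash, author, date, and message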
396 | 1x |
log_entries <- strsplit(paste(log, collapse = "|"), "commit ")[[ |
397 | 1x |
1 |
398 |
]] |
|
399 | 1x |
log_entries <- do.call( |
400 | 1x |
rbind, |
401 | 1x |
Filter( |
402 | 1x |
function(x) length(x) == 4L, |
403 | 1x |
strsplit( |
404 | 1x |
log_entries[log_entries != ""], |
405 | 1x |
"\\|+(?:[^:]+:)?\\s*" |
406 |
) |
|
407 |
) |
|
408 |
) |
|
409 | 1x |
if (length(log_entries)) { |
410 | 1x |
colnames(log_entries) <- c( |
411 | 1x |
"hash", |
412 | 1x |
"author", |
413 | 1x |
"date", |
414 | 1x |
"message" |
415 |
) |
|
416 | 1x |
as.data.frame(log_entries) |
417 |
} |
|
418 |
} |
|
419 |
} |
|
420 | ||
421 |
attempt_read <- function(file, id_cols) { |
|
422 | 3x |
tryCatch( |
423 |
{ |
|
424 | 3x |
sep <- if (grepl(".csv", file, fixed = TRUE)) "," else "\t" |
425 | 3x |
cols <- scan(file, "", nlines = 1, sep = sep, quiet = TRUE) |
426 | 3x |
types <- rep("?", length(cols)) |
427 | 3x |
types[cols %in% id_cols] <- "c" |
428 | 3x |
arrow::read_delim_arrow( |
429 | 3x |
gzfile(file), |
430 | 3x |
sep, |
431 | 3x |
col_names = cols, |
432 | 3x |
col_types = paste(types, collapse = ""), |
433 | 3x |
skip = 1 |
434 |
) |
|
435 |
}, |
|
436 | 3x |
error = function(e) NULL |
437 |
) |
|
438 |
} |
|
439 | ||
440 |
calculate_sha <- function(file, level) { |
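# shell out to the openssl CLI (e.g., "openssl dgst -sha512 <file>")
# and keep the digest portion of its output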
|
441 | 3x |
if (Sys.which("openssl") != "") { |
442 | 3x |
tryCatch( |
443 | 3x |
strsplit( |
444 | 3x |
system2( |
445 | 3x |
"openssl", |
446 | 3x |
c("dgst", paste0("-sha", level), shQuote(file)), |
447 | 3x |
TRUE |
448 |
), |
|
449 |
" ", |
|
450 | 3x |
fixed = TRUE |
451 | 3x |
)[[1]][2], |
452 | 3x |
error = function(e) "" |
453 |
) |
|
454 |
} else { |
|
455 |
"" |
|
456 |
} |
|
457 |
} |
1 |
#' Standardize Epic Data |
|
2 |
#' |
|
3 |
#' Standardize a raw Epic data table. |
|
4 |
#' |
|
5 |
#' @param raw_data Raw Epic data, such as that returned by \link{dcf_read_epic}.
|
6 |
#' @returns A standardized form of \code{data}. |
|
7 |
#' @section Standardization: |
|
8 |
#' \itemize{ |
|
9 |
#' \item Collapse location columns (\code{state} or \code{county}) to a single |
|
10 |
#' \code{geography} column, and region names to IDs. |
|
11 |
#' \item Collapse time columns (\code{year}, \code{month}, or \code{week}) to a single |
|
12 |
#' \code{time} column, and clean up value formatting. |
|
13 |
#' \item Drop rows with no values across value columns. |
|
14 |
#' } |
|
15 |
#' @examples |
|
16 |
#' \dontrun{ |
|
17 |
#' raw_data <- dcf_read_epic("data/epic/raw/flu.csv.xz") |
|
18 |
#' standard_data <- dcf_process_epic_raw(raw_data) |
|
19 |
#' } |
|
20 |
#' |
|
21 |
#' @export |
|
22 | ||
23 |
dcf_standardize_epic <- function(raw_data) { |
|
24 | 1x |
region_names <- epic_id_maps$regions |
25 | 1x |
names(region_names) <- gsub( |
26 | 1x |
" (?:CITY AND BOROUGH|BOROUGH|PARISH|MUNICIPALITY|MUNICIPIO)|[.']", |
27 |
"", |
|
28 | 1x |
toupper(names(region_names)) |
29 |
) |
|
30 | 1x |
cols <- colnames(raw_data) |
31 | 1x |
time_col <- which(cols == "year") |
32 | 1x |
if (length(time_col)) { |
33 | 1x |
colnames(raw_data)[time_col] <- "time" |
34 | 1x |
raw_data$time <- as.integer(substring( |
35 | 1x |
raw_data$time, |
36 | 1x |
nchar(raw_data$time) - 4L |
37 |
)) |
|
38 |
} |
|
39 | 1x |
month_col <- which(cols == "month") |
40 | 1x |
if (length(month_col)) { |
41 | ! |
raw_data$time <- paste0( |
42 | ! |
raw_data$time, |
43 |
"-", |
|
44 | ! |
epic_id_maps$months[raw_data$month] |
45 |
) |
|
46 |
} |
|
47 | 1x |
week_col <- which(cols == "week") |
48 | 1x |
if (length(week_col)) { |
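# turn week labels of the form "<Month> <n>" into "<year>-<month number>-<n>"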
49 | 1x |
raw_data$time <- paste0( |
50 | 1x |
raw_data$time, |
51 |
"-", |
|
52 | 1x |
vapply( |
53 | 1x |
strsplit(raw_data$week, "[^A-Za-z0-9]"), |
54 | 1x |
function(p) { |
55 | 4x |
paste0( |
56 | 4x |
epic_id_maps$months[[p[[1L]]]], |
57 |
"-", |
|
58 | 4x |
formatC(as.integer(p[[2L]]), width = 2L, flag = "0") |
59 |
) |
|
60 |
}, |
|
61 |
"" |
|
62 |
) |
|
63 |
) |
|
64 |
} |
|
65 | 1x |
geo_col <- grep("^(?:state|county)", cols) |
66 | 1x |
if (length(geo_col)) { |
67 | 1x |
colnames(raw_data)[geo_col] <- "geography" |
68 | 1x |
raw_data$geography <- toupper(raw_data$geography) |
69 | 1x |
missing_geo <- !(raw_data$geography %in% names(region_names)) |
70 | 1x |
if (any(missing_geo)) { |
71 | 1x |
geo <- sub( |
72 | 1x |
"LA ", |
73 | 1x |
"LA", |
74 | 1x |
sub("^SAINT", "ST", raw_data$geography[missing_geo]), |
75 | 1x |
fixed = TRUE |
76 |
) |
|
77 | 1x |
if (any(grepl(", VA", geo, fixed = TRUE))) { |
78 | 1x |
geo[geo == "SALEM, VA"] <- "SALEM CITY, VA" |
79 | 1x |
geo[geo == "RADFORD, VA"] <- "RADFORD CITY, VA" |
80 | 1x |
geo[geo == "DONA ANA, NM"] <- "DO\u00d1A ANA, NM" |
81 | 1x |
geo[geo == "MATANUSKA SUSITNA, AK"] <- "MATANUSKA-SUSITNA, AK" |
82 |
} |
|
83 | 1x |
raw_data$geography[missing_geo] <- geo |
84 |
} |
|
85 | 1x |
missing_regions <- raw_data$geography[ |
86 | 1x |
!(raw_data$geography %in% names(region_names)) |
87 |
] |
|
88 | 1x |
if (length(missing_regions)) { |
89 | ! |
cli::cli_warn( |
90 | ! |
'unrecognized regions: {paste(unique(missing_regions), collapse = "; ")}' |
91 |
) |
|
92 |
} |
|
93 | 1x |
raw_data$geography <- region_names[raw_data$geography] |
94 | 1x |
raw_data <- raw_data[!is.na(raw_data$geography), ] |
95 |
} |
|
96 | 1x |
raw_data <- raw_data[, |
97 | 1x |
!(colnames(raw_data) %in% c("state", "county", "year", "month", "week")) |
98 |
] |
|
99 | 1x |
raw_data[ |
100 | 1x |
rowSums( |
101 | 1x |
!is.na(raw_data[, |
102 | 1x |
!(colnames(raw_data) %in% c("geography", "time", "age")) |
103 |
]) |
|
104 |
) != |
|
105 | 1x |
0L, |
106 |
] |
|
107 |
} |
1 |
#' Download Data from the CDC |
|
2 |
#' |
|
3 |
#' Download data and metadata from the Centers for Disease Control and Prevention (CDC). |
|
4 |
#' |
|
5 |
#' @param id ID of the resource (e.g., \code{ijqb-a7ye}). |
|
6 |
#' @param out_dir Directory in which to save the metadata and data files. |
|
7 |
#' @param state The state ID of a previous download; if provided, will only download if the |
|
8 |
#' new state does not match. |
|
9 |
#' @param parquet Logical; if \code{TRUE}, will convert the downloaded CSV file to Parquet. |
|
10 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages. |
|
11 |
#' @returns The state ID of the downloaded files; |
|
12 |
#' downloads files (\code{<id>.json} and \code{<id>.csv.xz}) to \code{out_dir} |
|
13 |
#' @section \code{data.cdc.gov} URLs: |
|
14 |
#' |
|
15 |
#' For each resource ID, there are 3 relevant CDC URLs: |
|
16 |
#' \itemize{ |
|
17 |
#' \item \strong{\code{resource/<id>}}: This redirects to the resource's main page, |
|
18 |
#' with displayed metadata and a data preview |
|
19 |
#' (e.g., \href{https://data.cdc.gov/resource/ijqb-a7ye}{data.cdc.gov/resource/ijqb-a7ye}). |
|
20 |
#' \item \strong{\code{api/views/<id>}}: This is a direct link to the underlying |
|
21 |
#' JSON metadata (e.g., \href{https://data.cdc.gov/api/views/ijqb-a7ye}{data.cdc.gov/api/views/ijqb-a7ye}). |
|
22 |
#' \item \strong{\code{api/views/<id>/rows.csv}}: This is a direct link to the full |
|
23 |
#' CSV dataset (e.g., \href{https://data.cdc.gov/api/views/ijqb-a7ye/rows.csv}{data.cdc.gov/api/views/ijqb-a7ye/rows.csv}). |
|
24 |
#' } |
|
25 |
#' |
|
26 |
#' @examples |
|
27 |
#' \dontrun{ |
|
28 |
#' dcf_download_cdc("ijqb-a7ye") |
|
29 |
#' } |
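#'
#' # a sketch of skipping re-download when the resource is unchanged:
#' \dontrun{
#' state <- dcf_download_cdc("ijqb-a7ye")
#'
#' # later runs only re-download if the resource was updated
#' state <- dcf_download_cdc("ijqb-a7ye", state = state)
#' }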
|
30 |
#' @export |
|
31 | ||
32 |
dcf_download_cdc <- function( |
|
33 |
id, |
|
34 |
out_dir = "raw", |
|
35 |
state = NULL, |
|
36 |
parquet = FALSE, |
|
37 |
verbose = TRUE |
|
38 |
) { |
|
39 | 2x |
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE) |
40 | 2x |
if (verbose) { |
41 | 2x |
resource_url <- paste0("https://data.cdc.gov/resource/", id) |
42 | 2x |
cli::cli_h1( |
43 | 2x |
"downloading resource {.url {resource_url}}" |
44 |
) |
|
45 |
} |
|
46 | 2x |
url <- paste0("https://data.cdc.gov/api/views/", id) |
47 | 2x |
initial_timeout <- options(timeout = 99999L)$timeout |
48 | 2x |
on.exit(options(timeout = initial_timeout)) |
49 | 2x |
if (verbose) { |
50 | 2x |
cli::cli_progress_step("metadata: {.url {url}}") |
51 |
} |
|
52 | 2x |
metadata_file <- paste0(tempdir(), "/", id, ".json") |
53 | 2x |
status <- utils::download.file(url, metadata_file, quiet = TRUE) |
54 | 2x |
if (status != 0L) { |
55 | ! |
cli::cli_abort("failed to download metadata") |
56 |
} |
|
57 | 2x |
metadata <- jsonlite::read_json(metadata_file) |
58 | 2x |
new_state <- if (is.null(metadata$rowsUpdatedAt)) { |
59 | ! |
as.list(tools::md5sum(metadata_file)) |
60 |
} else { |
|
61 | 2x |
metadata$rowsUpdatedAt |
62 |
} |
|
63 | 2x |
if (!identical(new_state, state)) { |
64 | 2x |
file.rename(metadata_file, paste0(out_dir, "/", id, ".json")) |
65 | 2x |
data_url <- paste0(url, "/rows.csv") |
66 | 2x |
out_path <- paste0(out_dir, "/", id, ".csv") |
67 | 2x |
if (verbose) { |
68 | 2x |
cli::cli_progress_step("data: {.url {data_url}}") |
69 |
} |
|
70 | 2x |
status <- utils::download.file(data_url, out_path, quiet = TRUE) |
71 | 2x |
if (status != 0L) { |
72 | ! |
cli::cli_abort("failed to download data") |
73 |
} |
|
74 | 2x |
if (verbose) { |
75 | 2x |
cli::cli_progress_step("compressing data") |
76 |
} |
|
77 | 2x |
if (parquet) { |
78 | 1x |
data <- vroom::vroom(out_path, show_col_types = FALSE) |
79 | 1x |
arrow::write_parquet( |
80 | 1x |
data, |
81 | 1x |
sub(".csv", ".parquet", out_path, fixed = TRUE),
82 | 1x |
sub(".csv", ".parquet", out_path, fixed = TRUE) |
83 |
) |
|
84 | 1x |
unlink(out_path) |
85 |
} else { |
|
86 | 1x |
unlink(paste0(out_path, ".xz")) |
87 | 1x |
status <- system2("xz", c("-f", out_path)) |
88 | 1x |
if (status != 0L) { |
89 | ! |
cli::cli_abort("failed to compress data") |
90 |
} |
|
91 |
} |
|
92 | 2x |
if (verbose) { |
93 | 2x |
cli::cli_progress_done() |
94 |
} |
|
95 | 2x |
invisible(new_state) |
96 |
} else { |
|
97 | ! |
unlink(metadata_file) |
98 | ! |
invisible(state) |
99 |
} |
|
100 |
} |
1 |
#' Make a measurement metadata file |
|
2 |
#' |
|
3 |
#' Make a \code{measure_info.json} file, or add measure entries to an existing one. |
|
4 |
#' |
|
5 |
#' @param path Path to the \code{measure_info.json} file, existing or to be created. |
|
6 |
#' @param ... Lists containing individual measure items. See the Measure Entries section. |
|
7 |
#' @param info A list containing measurement information to be added. |
|
8 |
#' @param references A list containing citation entries. See the Reference Entries section. |
|
9 |
#' @param strict Logical; if \code{TRUE}, will only allow recognized entries and values. |
|
10 |
#' @param include_empty Logical; if \code{FALSE}, will omit entries that have not been provided. |
|
11 |
#' @param overwrite_entry Logical; if \code{TRUE}, will replace rather than add to an existing entry. |
|
12 |
#' @param render Path to save a version of \code{path} to, with dynamic entries expanded. See the |
|
13 |
#' Dynamic Entries section. |
|
14 |
#' @param overwrite Logical; if \code{TRUE}, will overwrite rather than add to an existing \code{path}. |
|
15 |
#' @param write Logical; if \code{FALSE}, will not write the build or rendered measure info. |
|
16 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages. |
|
17 |
#' @param open_after Logical; if \code{FALSE}, will not open the measure file after writing/updating. |
|
18 |
#' @section Measure Entries: |
|
19 |
#' Measure entries are named by the unique variable name with any of these entries (if \code{strict}): |
|
20 |
#' \itemize{ |
|
21 |
#' \item \strong{\code{id}}: Unique identifier of the measure; same as the entry name. |
|
22 |
#' This is meant to correspond to the column name containing the measure in data files. |
|
23 |
#' It should be minimal in length while still being unique across all files within the project. |
|
24 |
#' It should only contain the characters \code{a-z}, \code{0-9}, or \code{_}. |
|
25 |
#' \item \strong{\code{short_name}}: Shortest possible display name. |
|
26 |
#' \item \strong{\code{long_name}}: Longer display name. |
|
27 |
#' \item \strong{\code{category}}: Arbitrary category for the measure. |
|
28 |
#' \item \strong{\code{short_description}}: Shortest possible description. |
|
29 |
#' \item \strong{\code{long_description}}: Complete description. Either description can include |
|
30 |
#' TeX-style equations, enclosed in escaped square brackets (e.g., |
|
31 |
#' \code{"The equation \\\\[a_{i} = b^\\\\frac{c}{d}\\\\] was used."}; or \code{$...$}, |
|
32 |
#' \code{\\\\(...\\\\)}, or \code{\\\\begin{math}...\\\\end{math}}). The final enclosing symbol must be |
|
33 |
#' followed by a space or the end of the string. These are pre-rendered to MathML with
|
34 |
#' \code{\link[katex]{katex_mathml}}. |
|
35 |
#' \item \strong{\code{statement}}: String with dynamic references to entity features |
|
36 |
#' (e.g., \code{"measure value = {value}"}). References can include: |
|
37 |
#' \itemize{ |
|
38 |
#' \item \code{value}: Value of a currently displaying variable at a current time. |
|
39 |
#' \item \code{region_name}: Alias of \code{features.name}. |
|
40 |
#' \item \code{features.<entry>}: An entity feature, coming from \code{entity_info.json} or GeoJSON properties. |
|
41 |
#' All entities have at least \code{name} and \code{id} entries (e.g., \code{"{features.id}"}). |
|
42 |
#' \item \code{variables.<entry>}: A variable feature such as \code{name} which is the same as |
|
43 |
#' \code{id} (e.g., \code{"{variables.name}"}). |
|
44 |
#' \item \code{data.<variable>}: The value of another variable at a current time (e.g., \code{"{data.variable_a}"}). |
|
45 |
#' } |
|
46 |
#' \item \strong{\code{measure_type}}: Type of the measure's value. Recognized types are displayed in a special way: |
|
47 |
#' \itemize{ |
|
48 |
#' \item \code{year} or \code{integer} show as entered (usually as whole numbers). Other numeric |
|
49 |
#' types are rounded to show a set number of digits. |
|
50 |
#' \item \code{percent} shows as \code{{value}\%}. |
|
51 |
#' \item \code{minutes} shows as \code{{value} minutes}. |
|
52 |
#' \item \code{dollar} shows as \code{${value}}. |
|
53 |
#' \item \code{internet speed} shows as \code{{value} Mbps}. |
|
54 |
#' } |
|
55 |
#' \item \strong{\code{unit}}: Prefix or suffix associated with the measure's type, such as \code{\%} for \code{percent}, |
|
56 |
#' or \code{Mbps} for \code{rate}. |
|
57 |
#' \item \strong{\code{time_resolution}}: Temporal resolution of the variable, such as \code{year} or \code{week}. |
|
58 |
#' \item \strong{\code{restrictions}}: A license or description of restrictions that may apply to the measure. |
|
59 |
#' \item \strong{\code{sources}}: A list or list of lists containing source information, including any of these entries:
|
60 |
#' \itemize{ |
|
61 |
#' \item \code{name}: Name of the source (such as an organization name). |
|
62 |
#' \item \code{url}: General URL of the source (such as an organization's website). |
|
63 |
#' \item \code{location}: More specific description of the source (such as the name of a particular data product).
|
64 |
#' \item \code{location_url}: More direct URL to the resource (such as a page listing data products). |
|
65 |
#' \item \code{date_accessed}: Date of retrieval (arbitrary format). |
|
66 |
#' } |
|
67 |
#' \item \strong{\code{citations}}: A vector of reference ids (the names of \code{reference} entries; e.g., \code{c("ref1", "ref3")}). |
|
68 |
#' \item \strong{\code{categories}}: A named list of categories, with any of the other measure entries, or a |
|
69 |
#' \code{default} entry giving a default category name. See the Dynamic Entries section. |
|
70 |
#' \item \strong{\code{variants}}: A named list of variants, with any of the other measure entries, or a |
|
71 |
#' \code{default} entry giving a default variant name. See the Dynamic Entries section. |
|
72 |
#' } |
|
73 |
#' @section Dynamic Entries: |
|
74 |
#' You may have several closely related variables in a dataset, which share sections of metadata, |
|
75 |
#' or have formulaic differences. In cases like this, the \code{categories} and/or \code{variants} entries |
|
76 |
#' can be used along with dynamic notation to construct multiple entries from a single template. |
|
77 |
#' |
|
78 |
#' Though functionally the same, \code{categories} might include broken-out subsets of some total |
|
79 |
#' (such as race groups, as categories of a total population), whereas \code{variants} may be different |
|
80 |
#' transformations of the same variable (such as raw counts versus percentages). |
|
81 |
#' |
|
82 |
#' In dynamic entries, \code{{category}} or \code{{variant}} refers to entries in the \code{categories} |
|
83 |
#' or \code{variants} lists. By default, these are replaced with the name of each entry in those lists
|
84 |
#' (e.g., \code{"variable_{category}"} where \code{categories = "a"} would become \code{"variable_a"}). |
|
85 |
#' A \code{default} entry would change this behavior (e.g., with \code{categories = list(a = list(default = "b"))}
|
86 |
#' that would become \code{"variable_b"}). Adding \code{.name} would force the original behavior (e.g., |
|
87 |
#' \code{"variable_{category.name}"} would be \code{"variable_a"}). A name of \code{"blank"} is treated as |
|
88 |
#' an empty string. |
|
89 |
#' |
|
90 |
#' When notation appears in a measure info entry, it will first default to a matching name in the \code{categories}
|
91 |
#' or \code{variants} list; for example, \code{short_name} in \code{list(short_name = "variable {category}")} with |
|
92 |
#' \code{categories = list(a = list(short_name = "(category a)"))} would become \code{"variable (category a)"}. |
|
93 |
#' To force this behavior, the entry name can be included in the notation (e.g., |
|
94 |
#' \code{"{category.short_name}"} would be \code{"variable (category a)"} in any entry). |
|
95 |
#' |
|
96 |
#' Only string entries are processed dynamically -- any list-like entries (such as |
|
97 |
#' \code{source}, \code{citations}, or \code{layer}) appearing in |
|
98 |
#' \code{categories} or \code{variants} entries will fully replace the base entry. |
|
99 |
#' |
|
100 |
#' Dynamic entries can be kept dynamic when passed to a data site, but can be rendered for other uses, |
|
101 |
#' where the rendered version will have each dynamic entry replaced with all unique combinations of |
|
102 |
#' \code{categories} and \code{variants} entries, assuming both are used in the dynamic entry's name |
|
103 |
#' (e.g., \code{"variable_{category}_{variant}"}). See Examples. |
|
104 |
#' @section Reference Entries: |
|
105 |
#' Reference entries can be included in a \code{_references} entry, and should have names corresponding to |
|
106 |
#' those included in any of the measures' \code{citations} entries. These can include any of these entries:
|
107 |
#' \itemize{ |
|
108 |
#' \item \strong{\code{id}}: The reference id, same as the entry name. |
|
109 |
#' \item \strong{\code{author}}: A list or list of lists specifying one or more authors. These can include |
|
110 |
#' entries for \code{given} and \code{family} names. |
|
111 |
#' \item \strong{\code{year}}: Year of the publication. |
|
112 |
#' \item \strong{\code{title}}: Title of the publication. |
|
113 |
#' \item \strong{\code{journal}}: Journal in which the publication appears. |
|
114 |
#' \item \strong{\code{volume}}: Volume number of the journal. |
|
115 |
#' \item \strong{\code{page}}: Page number of the journal. |
|
116 |
#' \item \strong{\code{doi}}: Digital Object Identifier, from which a link is made (\code{https://doi.org/{doi}}). |
|
117 |
#' \item \strong{\code{version}}: Version number of software. |
|
118 |
#' \item \strong{\code{url}}: Link to the publication, alternative to a DOI. |
|
119 |
#' } |
|
120 |
#' @examples |
|
121 |
#' path <- tempfile() |
|
122 |
#' |
|
123 |
#' # make an initial file |
|
124 |
#' dcf_measure_info(path, "measure_name" = list( |
|
125 |
#' id = "measure_name", |
|
126 |
#' short_description = "A measure.", |
|
127 |
#' statement = "This entity has {value} measure units." |
|
128 |
#' ), verbose = FALSE) |
|
129 |
#' |
|
130 |
#' # add another measure to that |
|
131 |
#' measure_info <- dcf_measure_info(path, "measure_two" = list( |
|
132 |
#' id = "measure_two", |
|
133 |
#' short_description = "Another measure.", |
|
134 |
#' statement = "This entity has {value} measure units." |
|
135 |
#' ), verbose = FALSE) |
|
136 |
#' names(measure_info) |
|
137 |
#' |
|
138 |
#' # add a dynamic measure, and make a rendered version |
|
139 |
#' measure_info_rendered <- dcf_measure_info( |
|
140 |
#' path, |
|
141 |
#' "measure_{category}_{variant.name}" = list( |
|
142 |
#' id = "measure_{category}_{variant.name}", |
|
143 |
#' short_description = "Another measure ({category}; {variant}).", |
|
144 |
#' statement = "This entity has {value} {category} {variant}s.", |
|
145 |
#' categories = c("a", "b"), |
|
146 |
#' variants = list(u1 = list(default = "U1"), u2 = list(default = "U2")) |
|
147 |
#' ), |
|
148 |
#' render = TRUE, verbose = FALSE |
|
149 |
#' ) |
|
150 |
#' names(measure_info_rendered) |
|
151 |
#' measure_info_rendered[["measure_a_u1"]]$statement |
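#'
#' # add a reference entry and cite it from a measure:
#' measure_info <- dcf_measure_info(
#'   path,
#'   "measure_name" = list(citations = "ref1"),
#'   references = list(
#'     ref1 = list(
#'       author = list(given = "First", family = "Author"),
#'       year = 2020,
#'       title = "A Title"
#'     )
#'   ),
#'   verbose = FALSE
#' )
#' names(measure_info$`_references`)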
|
152 |
#' @return An invisible list containing measurement metadata (the rendered version if made). |
|
153 |
#' @export |
|
154 | ||
155 |
dcf_measure_info <- function( |
|
156 |
path, |
|
157 |
..., |
|
158 |
info = list(), |
|
159 |
references = list(), |
|
160 |
strict = FALSE, |
|
161 |
include_empty = TRUE, |
|
162 |
overwrite_entry = FALSE, |
|
163 |
render = NULL, |
|
164 |
overwrite = FALSE, |
|
165 |
write = TRUE, |
|
166 |
verbose = TRUE, |
|
167 |
open_after = interactive() |
|
168 |
) { |
|
169 | 14x |
if (write) { |
170 | 7x |
if (missing(path) || !is.character(path)) { |
171 | ! |
cli::cli_abort( |
172 | ! |
"enter a path to the measure_info.json file as {.arg path}" |
173 |
) |
|
174 |
} |
|
175 | 7x |
dir.create(dirname(path), FALSE, TRUE) |
176 |
} |
|
177 | 14x |
built <- list() |
178 | 14x |
if (!overwrite && is.character(path) && file.exists(path)) { |
179 | 12x |
if (verbose) { |
180 | 4x |
cli::cli_bullets(c( |
181 | 4x |
i = "updating existing file: {.path {basename(path)}}" |
182 |
)) |
|
183 |
} |
|
184 | 12x |
built <- jsonlite::read_json(path) |
185 | 12x |
if (all(c("id", "measure_type") %in% names(built))) { |
186 | ! |
built <- list(built) |
187 | ! |
names(built) <- built[[1]]$id |
188 |
} |
|
189 |
} |
|
190 | 14x |
if (length(references)) { |
191 | 1x |
references <- c(references, built$`_references`) |
192 | 1x |
references <- references[!duplicated(names(references))] |
193 | 1x |
built$`_references` <- references |
194 |
} else { |
|
195 | 13x |
references <- built$`_references` |
196 |
} |
|
197 | 14x |
defaults <- list( |
198 | 14x |
id = "", |
199 | 14x |
short_name = "", |
200 | 14x |
long_name = "", |
201 | 14x |
category = "", |
202 | 14x |
short_description = "", |
203 | 14x |
long_description = "", |
204 | 14x |
statement = "", |
205 | 14x |
measure_type = "", |
206 | 14x |
unit = "", |
207 | 14x |
time_resolution = "", |
208 | 14x |
restrictions = "", |
209 | 14x |
sources = list(), |
210 | 14x |
citations = list() |
211 |
) |
|
212 | 14x |
if (!is.list(info)) { |
213 | ! |
info <- sapply(info, function(name) list()) |
214 |
} |
|
215 | 14x |
info <- c(list(...), info) |
216 | 14x |
if (length(info) && is.null(names(info))) { |
217 | ! |
cli::cli_abort("supplied measure entries must be named") |
218 |
} |
|
219 | 14x |
for (n in names(info)) { |
220 | 7x |
if (overwrite_entry || is.null(built[[n]])) { |
221 | 6x |
l <- info[[n]] |
222 |
} else { |
|
223 | 1x |
l <- c(info[[n]], built[[n]]) |
224 | 1x |
l <- l[!duplicated(names(l))] |
225 |
} |
|
226 | 7x |
if (is.null(l$id)) { |
227 | 2x |
l$id <- n |
228 |
} |
|
229 | 7x |
if (strict) { |
230 | 1x |
su <- names(l) %in% names(defaults) |
231 | 1x |
if (verbose && any(!su)) { |
232 | 1x |
cli::cli_warn(paste0( |
233 | 1x |
"unrecognized {?entry/entries} in ", |
234 | 1x |
n, |
235 | 1x |
": {names(l)[!su]}" |
236 |
)) |
|
237 |
} |
|
238 | 1x |
if (include_empty) { |
239 | ! |
for (e in names(l)) { |
240 | ! |
if (!is.null(defaults[[e]])) { |
241 | ! |
defaults[[e]] <- l[[e]] |
242 |
} |
|
243 |
} |
|
244 | ! |
l <- defaults |
245 |
} else { |
|
246 | 1x |
l <- l[su] |
247 |
} |
|
248 | 6x |
} else if (include_empty) { |
249 | 5x |
su <- !names(defaults) %in% names(l) |
250 | 5x |
if (any(su)) l <- c(l, defaults[su]) |
251 |
} |
|
252 | 7x |
if (!is.null(l$categories) && !is.list(l$categories)) { |
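# promote a plain character vector of categories to a named list of defaults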
253 | 1x |
l$categories <- structure( |
254 | 1x |
lapply(l$categories, function(e) list(default = e)), |
255 | 1x |
names = l$categories |
256 |
) |
|
257 |
} |
|
258 | 7x |
if (!is.null(l$variants) && !is.list(l$variants)) { |
259 | ! |
l$variants <- structure( |
260 | ! |
lapply(l$variants, function(e) list(default = e)), |
261 | ! |
names = l$variants
262 |
) |
|
263 |
} |
|
264 | 7x |
if (verbose && !is.null(l$citations)) { |
265 | 5x |
su <- !l$citations %in% names(references) |
266 | 5x |
if (any(su)) { |
267 | 1x |
cli::cli_warn( |
268 | 1x |
"no matching reference entry for {.val {l$citations[su]}} in {.val {n}}" |
269 |
) |
|
270 |
} |
|
271 |
} |
|
272 | 7x |
built[[n]] <- l |
273 |
} |
|
274 | 14x |
built <- built[order(grepl("^_", names(built)))] |
275 | 14x |
if (write) { |
276 | 7x |
if (verbose) { |
277 | 5x |
cli::cli_bullets(c(i = "writing info to {.path {path}}")) |
278 |
} |
|
279 | 7x |
jsonlite::write_json(built, path, auto_unbox = TRUE, pretty = TRUE) |
280 |
} |
|
281 | 14x |
if (!is.null(render)) { |
282 | 8x |
expanded <- list() |
283 | 8x |
for (name in names(built)) { |
284 | 14x |
expanded <- c( |
285 | 14x |
expanded, |
286 | 14x |
if (grepl("{", name, fixed = TRUE)) { |
287 | 1x |
render_info(built[name]) |
288 |
} else { |
|
289 | 13x |
structure(list(built[[name]]), names = name) |
290 |
} |
|
291 |
) |
|
292 |
} |
|
293 | 8x |
changed <- !identical(built, expanded) |
294 | 8x |
built <- expanded |
295 | 8x |
if (write && changed) { |
296 | 1x |
path <- if (is.character(render)) { |
297 | ! |
render |
298 |
} else { |
|
299 | 1x |
sub("\\.json", "_rendered.json", path, TRUE) |
300 |
} |
|
301 | 1x |
if (verbose) { |
302 | 1x |
cli::cli_bullets(c(i = "writing rendered info to {.path {path}}")) |
303 |
} |
|
304 | 1x |
jsonlite::write_json(built, path, auto_unbox = TRUE, pretty = TRUE) |
305 |
} |
|
306 |
} |
|
307 | 14x |
if (open_after) { |
308 | ! |
rstudioapi::navigateToFile(path) |
309 |
} |
|
310 | 14x |
invisible(built) |
311 |
} |
|
312 | ||
313 |
replace_equations <- function(info) { |
|
314 | 3x |
lapply(info, function(e) { |
315 | ! |
if (!is.list(e)) e <- list(default = e) |
316 | 4x |
descriptions <- grep("description", names(e), fixed = TRUE) |
317 | 4x |
if (length(descriptions)) { |
318 | 4x |
for (d in descriptions) { |
319 | 8x |
p <- gregexpr( |
320 | 8x |
"(?:\\$|\\\\\\[|\\\\\\(|\\\\begin\\{math\\})(.+?)(?:\\$|\\\\\\]|\\\\\\)|\\\\end\\{math\\})(?=\\s|$)", |
321 | 8x |
e[[d]], |
322 | 8x |
perl = TRUE |
323 | 8x |
)[[1]] |
324 | 8x |
if (p[[1]] != -1) { |
325 | ! |
re <- paste("", e[[d]], "") |
326 | ! |
fm <- regmatches(e[[d]], p) |
327 | ! |
for (i in seq_along(p)) { |
328 | ! |
mp <- attr(p, "capture.start")[i, ] |
329 | ! |
eq <- substring(e[[d]], mp, mp + attr(p, "capture.length")[i, ] - 1) |
330 | ! |
parsed <- tryCatch( |
331 | ! |
katex::katex_mathml(eq), |
332 | ! |
error = function(e) NULL |
333 |
) |
|
334 | ! |
if (!is.null(parsed)) { |
335 | ! |
re <- paste( |
336 | ! |
strsplit(re, fm[[i]], fixed = TRUE)[[1]], |
337 | ! |
collapse = sub("^<[^>]*>", "", sub("<[^>]*>$", "", parsed)) |
338 |
) |
|
339 |
} |
|
340 |
} |
|
341 | ! |
e[[d]] <- gsub("^ | $", "", re) |
342 |
} |
|
343 |
} |
|
344 |
} |
|
345 | ! |
if (is.list(e$categories)) e$categories <- replace_equations(e$categories) |
346 | ! |
if (is.list(e$variants)) e$variants <- replace_equations(e$variants) |
347 | 4x |
e |
348 |
}) |
|
349 |
} |
|
350 | ||
351 |
preprocess <- function(l) { |
|
352 | ! |
if (!is.list(l)) l <- sapply(l, function(n) list()) |
353 | 2x |
ns <- names(l) |
354 | 2x |
for (i in seq_along(l)) { |
355 | 4x |
name <- if (ns[i] == "blank") "" else ns[i] |
356 | 4x |
l[[i]]$name <- name |
357 | 1x |
if (is.null(l[[i]]$default)) l[[i]]$default <- name |
358 |
} |
|
359 | 2x |
l |
360 |
} |
|
361 | ||
362 |
replace_dynamic <- function(e, p, s, v = NULL, default = "default") { |
|
363 | 68x |
m <- gregexpr(p, e) |
364 | 68x |
if (m[[1]][[1]] != -1) { |
365 | 16x |
t <- regmatches(e, m)[[1]] |
366 | 16x |
tm <- structure(gsub("\\{[^.]+\\.?|\\}", "", t), names = t) |
367 | 16x |
tm <- tm[!duplicated(names(tm))] |
368 | 16x |
tm[tm == ""] <- default |
369 | 16x |
for (tar in names(tm)) { |
370 | 32x |
us <- (if (is.null(v) || substring(tar, 2, 2) == "c") s else v) |
371 | 32x |
entry <- tm[[tar]] |
372 | 32x |
if (is.null(us[[entry]]) && grepl("description", entry, fixed = TRUE)) { |
373 | 8x |
entry <- default <- "description" |
374 |
} |
|
375 | 20x |
if (is.null(us[[entry]]) && entry == default) entry <- "default" |
376 | 32x |
if (is.null(us[[entry]])) |
377 | ! |
cli::cli_abort("failed to render measure info from {tar}") |
378 | 32x |
e <- gsub(tar, us[[entry]], e, fixed = TRUE) |
379 |
} |
|
380 |
} |
|
381 | 68x |
e |
382 |
} |
|
383 | ||
384 |
prepare_source <- function(o, s, p) { |
|
385 | 8x |
if (length(o)) { |
386 | 8x |
lapply(o, function(e) { |
387 | 2x |
if (is.character(e) && length(e) == 1) replace_dynamic(e, p, s) else e |
388 |
}) |
|
389 |
} else { |
|
390 | ! |
list(name = "", default = "") |
391 |
} |
|
392 |
} |
|
393 | ||
394 |
render_info_names <- function(infos) { |
|
395 | ! |
r <- lapply(names(infos), function(n) render_info(infos[n], TRUE)) |
396 | ! |
structure(rep(names(infos), vapply(r, length, 0)), names = unlist(r)) |
397 |
} |
|
398 | ||
399 |
render_info <- function(info, names_only = FALSE) { |
|
400 | 1x |
base_name <- names(info) |
401 | 1x |
base <- info[[1]] |
402 | 1x |
if (is.null(base$categories) && is.null(base$variants)) { |
403 | ! |
return(if (names_only) base_name else info) |
404 |
} |
|
405 | 1x |
categories <- preprocess(base$categories) |
406 | 1x |
variants <- preprocess(base$variants) |
407 | 1x |
base$categories <- NULL |
408 | 1x |
base$variants <- NULL |
409 | 1x |
expanded <- NULL |
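# enumerate every category-variant name combination, joined with a "|||"
# placeholder so the pairs can be split back apart below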
410 | 1x |
vars <- strsplit( |
411 | 1x |
as.character(outer( |
412 | 1x |
if (is.null(names(categories))) "" else names(categories), |
413 | 1x |
if (is.null(names(variants))) "" else names(variants), |
414 | 1x |
paste, |
415 | 1x |
sep = "|||" |
416 |
)), |
|
417 |
"|||", |
|
418 | 1x |
fixed = TRUE |
419 |
) |
|
420 | 1x |
for (var in vars) { |
421 | 4x |
cs <- if (var[1] == "") list() else categories[[var[1]]] |
422 | 4x |
vs <- if (length(var) == 1 || var[2] == "") list() else variants[[var[2]]] |
423 | 4x |
cs <- prepare_source(cs, vs, "\\{variants?(?:\\.[^}]+?)?\\}") |
424 | 4x |
vs <- prepare_source(vs, cs, "\\{categor(?:y|ies)(?:\\.[^}]+?)?\\}") |
425 | 4x |
s <- c(cs, vs[!names(vs) %in% names(cs)]) |
426 | 4x |
p <- "\\{(?:categor(?:y|ies)|variants?)(?:\\.[^}]+?)?\\}" |
427 | 4x |
key <- replace_dynamic(base_name, p, cs, vs) |
428 | 4x |
if (names_only) { |
429 | ! |
expanded <- c(expanded, key) |
430 |
} else { |
|
431 | 4x |
expanded[[key]] <- c( |
432 | 4x |
structure( |
433 | 4x |
lapply(names(base), function(n) { |
434 | 52x |
e <- base[[n]] |
435 | 52x |
if (is.character(e) && length(e) == 1) |
436 | 44x |
e <- replace_dynamic(e, p, cs, vs, n) |
437 | 52x |
e |
438 |
}), |
|
439 | 4x |
names = names(base) |
440 |
), |
|
441 | 4x |
s[ |
442 | 4x |
!names(s) %in% |
443 | 4x |
c( |
444 | 4x |
"default", |
445 | 4x |
"name", |
446 | 4x |
if (any(base[c("long_description", "short_description")] != "")) |
447 | 4x |
"description", |
448 | 4x |
names(base) |
449 |
) |
|
450 |
] |
|
451 |
) |
|
452 |
} |
|
453 |
} |
|
454 | 1x |
expanded |
455 |
} |
1 |
#' Read Epic Cosmos Data |
|
2 |
#' |
|
3 |
#' Read in metadata and data from an Epic Cosmos file. |
|
4 |
#' |
|
5 |
#' @param path Path to the file. |
|
6 |
#' @param path_root Directory containing \code{path}, if it is not full. |
|
7 |
#' @returns A list with entries for \code{metadata} (a list) and \code{data} (a \code{data.frame}).
|
8 |
#' |
|
9 |
#' @examples |
|
10 |
#' # write an example file |
|
11 |
#' path <- tempfile(fileext = ".csv") |
|
12 |
#' raw_lines <- c( |
|
13 |
#' "metadata field,metadata value,", |
|
14 |
#' ",,", |
|
15 |
#' ",Measures,Value Name", |
|
16 |
#' "Year,Measure 1,", |
|
17 |
#' "2020,m1,1", |
|
18 |
#' ",m2,2", |
|
19 |
#' "2021,m1,3", |
|
20 |
#' ",m2,4" |
|
21 |
#' ) |
|
22 |
#' writeLines(raw_lines, path) |
|
23 |
#' |
|
24 |
#' # read it in |
|
25 |
#' dcf_read_epic(basename(path), dirname(path)) |
|
26 |
#' |
|
27 |
#' @export |
|
28 | ||
29 |
dcf_read_epic <- function(path, path_root = ".") { |
|
30 | 2x |
full_path <- if (file.exists(path)) { |
31 | 2x |
path |
32 |
} else { |
|
33 | ! |
sub("//", "/", paste0(path_root, "/", path), fixed = TRUE) |
34 |
} |
|
35 | 2x |
lines <- readLines(full_path, n = 25L, skipNul = FALSE) |
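# Epic exports open with metadata rows; blank (comma-only) lines
# separate them from the data table below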
36 | 2x |
metadata_break <- grep("^[, ]*$", lines) |
37 | 2x |
if (!length(metadata_break)) { |
38 | ! |
cli::cli_abort( |
39 | ! |
"path does not appear to point to a file in the Epic format (no metadata separation)" |
40 |
) |
|
41 |
} |
|
42 | 2x |
meta_end <- min(metadata_break) - 1L |
43 | 2x |
data_start <- (if (length(metadata_break) == 1L) { |
44 | 2x |
metadata_break |
45 |
} else { |
|
46 | ! |
max(metadata_break[ |
47 | ! |
metadata_break == c(-1L, metadata_break[-1L]) |
48 |
]) |
|
49 |
}) + |
|
50 | 2x |
1L |
51 | 2x |
meta <- c( |
52 | 2x |
list( |
53 | 2x |
file = path, |
54 | 2x |
md5 = unname(tools::md5sum(full_path)), |
55 | 2x |
date_processed = Sys.time(), |
56 | 2x |
standard_name = "" |
57 |
), |
|
58 | 2x |
as.list(unlist(lapply( |
59 | 2x |
strsplit(sub(",+$", "", lines[seq_len(meta_end)]), ",", fixed = TRUE), |
60 | 2x |
function(r) { |
61 | 2x |
l <- list(paste(r[-1L], collapse = ",")) |
62 | 2x |
if (l[[1]] == "") { |
63 | ! |
r <- strsplit(r, ": ", fixed = TRUE)[[1L]] |
64 | ! |
l <- list(paste(r[-1L], collapse = ",")) |
65 |
} |
|
66 | 2x |
names(l) <- r[[1L]] |
67 | 2x |
l[[1L]] <- gsub('^"|"$', "", l[[1L]]) |
68 | 2x |
l |
69 |
} |
|
70 |
))) |
|
71 |
) |
|
72 | 2x |
standard_names <- c( |
73 | 2x |
vaccine_mmr = "MMR receipt", |
74 | 2x |
rsv_tests = "RSV tests", |
75 | 2x |
flu = "Influenza", |
76 | 2x |
self_harm = "self-harm", |
77 | 2x |
covid = "COVID", |
78 | 2x |
rsv = "RSV", |
79 | 2x |
obesity = "BMI", |
80 | 2x |
obesity = "obesity", |
81 | 2x |
all_encounters = "All ED Encounters" |
82 |
) |
|
83 | 2x |
meta_string <- paste(unlist(meta), collapse = " ") |
84 | 2x |
for (i in seq_along(standard_names)) { |
85 | 12x |
if (grepl(standard_names[[i]], meta_string, fixed = TRUE)) { |
86 | 2x |
meta$standard_name = names(standard_names)[[i]] |
87 | 2x |
break |
88 |
} |
|
89 |
} |
|
90 | 2x |
id_cols <- seq_len( |
91 | 2x |
length(strsplit(lines[data_start], "^,|Measures,")[[1L]]) - 1L |
92 |
) |
|
93 | 2x |
header <- c( |
94 | 2x |
strsplit(lines[data_start + 1L], ",", fixed = TRUE)[[1L]][id_cols], |
95 | 2x |
strsplit(lines[data_start], ",", fixed = TRUE)[[1L]][-id_cols] |
96 |
) |
|
97 | 2x |
data <- arrow::read_csv_arrow( |
98 | 2x |
full_path, |
99 | 2x |
col_names = header, |
100 | 2x |
col_types = paste(rep("c", length(header)), collapse = ""), |
101 | 2x |
na = c("", "-"), |
102 | 2x |
skip = data_start + 1L |
103 |
) |
|
104 | 2x |
percents <- grep("^(?:Percent|Base|RSV test)", header) |
105 | 2x |
if (length(percents)) { |
106 | ! |
for (col in percents) { |
107 | ! |
data[[col]] <- sub("%", "", data[[col]], fixed = TRUE) |
108 |
} |
|
109 |
} |
|
110 | 2x |
number <- grep("Number", header, fixed = TRUE) |
111 | 2x |
if (length(number)) { |
112 | ! |
for (col in number) { |
113 | ! |
data[[col]][data[[col]] == "10 or fewer"] <- 5L |
114 |
} |
|
115 |
} |
|
116 | 2x |
for (col in id_cols) { |
117 | 10x |
data[[col]] <- vctrs::vec_fill_missing(data[[col]], "down") |
118 |
} |
|
119 | 2x |
if (all(c("Measures", "Base Patient") %in% colnames(data))) { |
120 | ! |
data <- Reduce( |
121 | ! |
merge, |
122 | ! |
lapply(split(data, data$Measures), function(d) { |
123 | ! |
measure <- d$Measures[[1L]] |
124 | ! |
d[[measure]] <- d[["Base Patient"]] |
125 | ! |
d[, !(colnames(d) %in% c("Measures", "Base Patient"))] |
126 |
}) |
|
127 |
) |
|
128 |
} |
|
129 | 2x |
colnames(data) <- standard_columns(colnames(data)) |
130 | 2x |
if (meta$standard_name == "obesity") { |
131 | ! |
meta$standard_name <- paste0( |
132 | ! |
meta$standard_name, |
133 |
"_", |
|
134 | ! |
if ("state" %in% colnames(data)) "state" else "county" |
135 |
) |
|
136 |
} |
|
137 | 2x |
if ("age" %in% colnames(data)) { |
138 | 2x |
std_age <- standard_age(data$age) |
139 | 2x |
missed_ages <- (data$age != "No value") & is.na(std_age) |
140 | 2x |
if (any(missed_ages)) { |
141 | ! |
std_age[missed_ages] <- data$age[missed_ages] |
142 | ! |
missed_levels <- unique(data$age[missed_ages]) |
143 | ! |
cli::cli_warn("missed age levels: {.field {missed_levels}}") |
144 |
} |
|
145 | 2x |
data$age <- std_age |
146 |
} |
|
147 | 2x |
list(metadata = meta, data = data) |
148 |
} |
|
149 | ||
150 |
standard_age <- function(age) { |
|
151 | 2x |
c( |
152 | 2x |
`less than 1 years` = "<1 Years", |
153 | 2x |
`1 and < 2 years` = "1-2 Years", |
154 | 2x |
`2 and < 3 years` = "2-3 Years", |
155 | 2x |
`3 and < 4 years` = "3-4 Years", |
156 | 2x |
`1 and < 5 years` = "1-4 Years", |
157 | 2x |
`1 year or more and less than 5 years` = "1-4 Years", |
158 | 2x |
`4 and < 5 years` = "4-5 Years", |
159 | 2x |
`less than 5 years` = "<5 Years", |
160 | 2x |
`5 and < 6 years` = "5-6 Years", |
161 | 2x |
`5 and < 18 years` = "5-17 Years", |
162 | 2x |
`5 years or more and less than 18 years (1)` = "5-17 Years", |
163 | 2x |
`6 and < 7 years` = "6-7 Years", |
164 | 2x |
`6 years or more` = "6+ Years", |
165 | 2x |
`7 and < 8 years` = "7-8 Years", |
166 | 2x |
`8 and < 9 years` = "8-9 Years", |
167 | 2x |
`9 years or more` = "9+ Years", |
168 | 2x |
`less than 10 years` = "<10 Years", |
169 | 2x |
`10 and < 15 years` = "10-14 Years", |
170 | 2x |
`15 and < 20 years` = "15-19 Years", |
171 | 2x |
`18 and < 40 years` = "18-39 Years", |
172 | 2x |
`18 and < 50 years` = "18-49 Years", |
173 | 2x |
`18 years or more and less than 50 years` = "18-49 Years", |
174 | 2x |
`20 and < 40 years` = "20-39 Years", |
175 | 2x |
`40 and < 65 years` = "40-64 Years", |
176 | 2x |
`50 and < 65 years` = "50-64 Years", |
177 | 2x |
`50 years or more and less than 64 years` = "50-64 Years", |
178 | 2x |
`65 years or more` = "65+ Years", |
179 | 2x |
`65 and < 110 years` = "65+ Years", |
180 | 2x |
`total` = "Total" |
181 |
)[ |
|
182 | 2x |
sub("^[^a-z0-9]+|:.*$", "", tolower(age)) |
183 |
] |
|
184 |
} |
|
185 | ||
186 |
standard_columns <- function(cols) { |
|
187 | 2x |
cols <- gsub(" ", "_", sub("number of ", "n_", tolower(cols)), fixed = TRUE) |
188 | 2x |
cols[grep("^age", cols)] <- "age" |
189 | 2x |
cols[grep("^state", cols)] <- "state" |
190 | 2x |
cols[grep("^county", cols)] <- "county" |
191 | 2x |
cols[grep("bmi_30", cols)] <- "bmi_30_49.8" |
192 | 2x |
cols[grep("hemoglobin_a1c_7", cols)] <- "hemoglobin_a1c_7" |
193 | 2x |
cols[grep("mmr_receipt", cols)] <- "mmr_receipt" |
194 | 2x |
cols[grep("^rsv_tests", cols)] <- "rsv_tests" |
195 | 2x |
cols |
196 |
} |
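As a quick reference for the lookup table above, a sketch of how the internal `standard_age()` helper normalizes raw Epic age labels (case and leading punctuation are stripped before matching; unmatched labels come back as `NA`):

```R
# assumes the file above is sourced; standard_age() is internal to the package
standard_age(c("Less than 1 years", "65 and < 110 years", "Total", "unknown"))
# -> "<1 Years" "65+ Years" "Total" NA (a named character vector)
```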
1 |
#' Check Data Sources |
|
2 |
#' |
|
3 |
#' Check the data files and measure info of source projects. |
|
4 |
#' |
|
5 |
#' @param names Name or names of source projects. |
|
6 |
#' @param project_dir Path to the Data Collection Framework project. |
|
7 |
#' @param verbose Logical; if \code{FALSE}, will not print status messages. |
|
8 |
#' @returns A list with an entry for each source, containing a character vector |
|
9 |
#' including any issue codes: |
|
10 |
#' \itemize{ |
|
11 |
#' \item \code{not_compressed}: The file does not appear to be compressed. |
|
12 |
#' \item \code{cant_read}: Failed to read the file in. |
|
13 |
#' \item \code{geography_missing}: File does not contain a \code{geography} column. |
|
14 |
#' \item \code{geography_nas}: The file's \code{geography} column contains NAs. |
|
15 |
#' \item \code{time_missing}: File does not contain a \code{time} column. |
|
16 |
#' \item \code{time_nas}: The file's \code{time} column contains NAs. |
|
17 |
#' \item \code{missing_info: {column_name}}: The file's indicated column does not have |
|
18 |
#' a matching entry in \code{measure_info.json}. |
|
19 |
#' } |
|
20 |
#' @examples |
|
21 |
#' \dontrun{ |
|
22 |
#' dcf_check_sources("gtrends") |
|
23 |
#' } |
|
24 |
#' @export |
|
25 | ||
26 |
dcf_check_sources <- function( |
|
27 |
names = NULL, |
|
28 |
project_dir = ".", |
|
29 |
verbose = TRUE |
|
30 |
) { |
|
31 | 3x |
settings <- dcf_read_settings(project_dir) |
32 | 3x |
base_dir <- paste0(project_dir, "/", settings$data_dir) |
33 | 3x |
if (is.null(names)) { |
34 | 2x |
names <- list.dirs(base_dir, recursive = FALSE, full.names = FALSE) |
35 |
} |
|
36 | 3x |
issues <- list() |
37 | 3x |
for (name in names) { |
38 | 4x |
source_dir <- paste0(base_dir, "/", name, "/") |
39 | 4x |
if (!dir.exists(source_dir)) { |
40 | ! |
cli::cli_abort("specify the name of an existing data source") |
41 |
} |
|
42 | 4x |
process_file <- paste0(source_dir, "process.json") |
43 | 4x |
dcf_add_source(name, project_dir, FALSE) |
44 | 4x |
if (!file.exists(process_file)) { |
45 | ! |
cli::cli_abort("{name} does not appear to be a data source project") |
46 |
} |
|
47 | 4x |
process <- dcf_process_record(process_file) |
48 | 4x |
info_file <- paste0(source_dir, "measure_info.json") |
49 | 4x |
info <- tryCatch( |
50 | 4x |
dcf_measure_info( |
51 | 4x |
info_file, |
52 | 4x |
render = TRUE, |
53 | 4x |
write = FALSE, |
54 | 4x |
verbose = FALSE, |
55 | 4x |
open_after = FALSE |
56 |
), |
|
57 | 4x |
error = function(e) NULL |
58 |
) |
|
59 | 4x |
if (is.null(info)) { |
60 | ! |
cli::cli_abort("{.file {info_file}} is malformed") |
61 |
} |
|
62 | 4x |
if (verbose) { |
63 | 4x |
cli::cli_bullets(c("", "Checking data source {.strong {name}}")) |
64 |
} |
|
65 | 4x |
data_files <- list.files( |
66 | 4x |
paste0(source_dir, "standard"), |
67 | 4x |
"\\.(?:csv|parquet)", |
68 | 4x |
full.names = TRUE |
69 |
) |
|
70 | 4x |
source_issues <- list() |
71 | 4x |
for (file in list.files( |
72 | 4x |
paste0(source_dir, "raw"), |
73 | 4x |
"csv$", |
74 | 4x |
full.names = TRUE |
75 |
)) { |
|
76 | 1x |
source_issues[[file]] <- list(data = "not_compressed") |
77 |
} |
|
78 | 4x |
if (length(data_files)) { |
79 | 3x |
for (file in data_files) { |
80 | 3x |
issue_messages <- NULL |
81 | 3x |
if (verbose) { |
82 | 3x |
cli::cli_progress_step("checking file {.file {file}}", spinner = TRUE) |
83 |
} |
|
84 | 3x |
data_issues <- NULL |
85 | 3x |
measure_issues <- NULL |
86 | 3x |
data <- tryCatch( |
87 | 3x |
if (grepl("parquet$", file)) { |
88 | ! |
dplyr::collect(arrow::read_parquet(file)) |
89 |
} else { |
|
90 | 3x |
con <- gzfile(file) |
91 | 3x |
on.exit(close(con)) |
92 | 3x |
vroom::vroom(con, show_col_types = FALSE) |
93 |
}, |
|
94 | 3x |
error = function(e) NULL |
95 |
) |
|
96 | 3x |
if (is.null(data)) { |
97 | ! |
data_issues <- c(data_issues, "cant_read") |
98 |
} else { |
|
99 | 3x |
if (grepl("csv$", file)) { |
100 | 1x |
data_issues <- c(data_issues, "not_compressed") |
101 | 1x |
if (verbose) { |
102 | 1x |
issue_messages <- c( |
103 | 1x |
issue_messages, |
104 | 1x |
"file is not compressed" |
105 |
) |
|
106 |
} |
|
107 |
} |
|
108 | 3x |
if (!("geography" %in% colnames(data))) { |
109 | ! |
data_issues <- c(data_issues, "geography_missing") |
110 | ! |
if (verbose) { |
111 | ! |
issue_messages <- c( |
112 | ! |
issue_messages, |
113 | ! |
"missing {.emph geography} column" |
114 |
) |
|
115 |
} |
|
116 | 3x |
} else if (anyNA(data$geography)) { |
117 | 1x |
data_issues <- c(data_issues, "geography_nas") |
118 | 1x |
if (verbose) { |
119 | 1x |
issue_messages <- c( |
120 | 1x |
issue_messages, |
121 | 1x |
"{.emph geography} column contains NAs" |
122 |
) |
|
123 |
} |
|
124 |
} |
|
125 | 3x |
if (!("time" %in% colnames(data))) { |
126 | ! |
data_issues <- c(data_issues, "time_missing") |
127 | ! |
if (verbose) { |
128 | ! |
issue_messages <- c( |
129 | ! |
issue_messages, |
130 | ! |
"missing {.emph time} column" |
131 |
) |
|
132 |
} |
|
133 | 3x |
} else if (anyNA(data$time)) { |
134 | 1x |
data_issues <- c(data_issues, "time_nas") |
135 | 1x |
if (verbose) { |
136 | 1x |
issue_messages <- c( |
137 | 1x |
issue_messages, |
138 | 1x |
"{.emph time} column contains NAs" |
139 |
) |
|
140 |
} |
|
141 |
} |
|
142 | 3x |
for (col in colnames(data)) { |
143 | 9x |
if (!(col %in% c("geography", "time")) && !(col %in% names(info))) { |
144 | 1x |
measure_issues <- c(measure_issues, paste("missing_info:", col)) |
145 | 1x |
if (verbose) { |
146 | 1x |
issue_messages <- c( |
147 | 1x |
issue_messages, |
148 | 1x |
paste0( |
149 | 1x |
"{.emph ", |
150 | 1x |
col, |
151 | 1x |
"} column does not have an entry in measure_info" |
152 |
) |
|
153 |
) |
|
154 |
} |
|
155 |
} |
|
156 |
} |
|
157 |
} |
|
158 | 3x |
file_issues <- list() |
159 | 3x |
if (length(data_issues)) { |
160 | 1x |
file_issues$data <- data_issues |
161 |
} |
|
162 | 3x |
if (length(measure_issues)) { |
163 | 1x |
file_issues$measures <- measure_issues |
164 |
} |
|
165 | 3x |
source_issues[[file]] <- file_issues |
166 | 3x |
if (verbose) { |
167 | 3x |
if (length(issue_messages)) { |
168 | 1x |
cli::cli_progress_done(result = "failed") |
169 | 1x |
cli::cli_bullets(structure( |
170 | 1x |
issue_messages, |
171 | 1x |
names = rep(" ", length(issue_messages)) |
172 |
)) |
|
173 |
} else { |
|
174 | 2x |
cli::cli_progress_done() |
175 |
} |
|
176 |
} |
|
177 |
} |
|
178 |
} else { |
|
179 | 1x |
if (verbose) cli::cli_alert_info("no standard data files found to check") |
180 |
} |
|
181 | 4x |
if (!identical(process$check_results, source_issues)) { |
182 | 3x |
process$checked <- Sys.time() |
183 | 3x |
process$check_results <- source_issues |
184 | 3x |
dcf_process_record(process_file, process) |
185 |
} |
|
186 | 4x |
issues[[name]] <- source_issues |
187 |
} |
|
188 | ||
189 | 3x |
invisible(issues) |
190 |
} |
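A usage sketch (the source name is illustrative): the invisible return maps each source to per-file issue codes, so it can be flattened for a quick summary.

```R
# run from a Data Collection Framework project root
issues <- dcf_check_sources("gtrends", project_dir = ".")
# flatten one source's entries to a named vector of issue codes
unlist(issues$gtrends)
```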
1 |
#' Process Epic Staging Files |
|
2 |
#' |
|
3 |
#' Process Epic staging files, lightly standardizing them and moving them to raw. |
|
4 |
#' |
|
5 |
#' @param staging_dir Directory containing the staging files. |
|
6 |
#' @param out_dir Directory to write new raw files to. |
|
7 |
#' @param verbose Logical; if \code{FALSE}, will not show status messages. |
|
8 |
#' @param cleanup Logical; if \code{FALSE}, will not remove staging files after they are processed. |
|
9 |
#' @returns \code{NULL} if no staging files are found. |
|
10 |
#' Otherwise, a list with entries for \code{data} and \code{metadata}. |
|
11 |
#' Each of these is a list with entries for each recognized standard name, |
|
12 |
#' with potentially combined outputs similar to \code{\link{dcf_read_epic}}. |
|
13 |
#' |
|
14 |
#' @examples |
|
15 |
#' \dontrun{ |
|
16 |
#' # run from a source project |
|
17 |
#' dcf_process_epic_staging() |
|
18 |
#' } |
|
19 |
#' |
|
20 |
#' @export |
|
21 | ||
22 |
dcf_process_epic_staging <- function( |
|
23 |
staging_dir = "raw/staging", |
|
24 |
out_dir = "raw", |
|
25 |
verbose = TRUE, |
|
26 |
cleanup = TRUE |
|
27 |
) { |
|
28 | 1x |
files <- sort(list.files( |
29 | 1x |
staging_dir, |
30 | 1x |
"csv", |
31 | 1x |
full.names = TRUE, |
32 | 1x |
recursive = TRUE |
33 |
)) |
|
34 | 1x |
files <- files[!grepl("census", files)] |
35 | 1x |
if (!length(files)) { |
36 | ! |
if (verbose) { |
37 | ! |
cli::cli_progress_message("no staging files found") |
38 |
} |
|
39 | ! |
return(NULL) |
40 |
} |
|
41 | 1x |
id_cols <- c("state", "county", "age", "year", "month", "week") |
42 | 1x |
metadata <- list() |
43 | 1x |
data <- list() |
44 | 1x |
for (file in files) { |
45 | 2x |
if (verbose) { |
46 | 2x |
cli::cli_progress_step("processing file {.file {file}}", spinner = TRUE) |
47 |
} |
|
48 | 2x |
epic <- tryCatch(dcf_read_epic(file), error = function(e) NULL) |
49 | 2x |
if (is.null(epic)) { |
50 | ! |
if (verbose) { |
51 | ! |
cli::cli_progress_done(result = "failed") |
52 |
} |
|
53 | ! |
next |
54 |
} |
|
55 | 2x |
if (epic$metadata$standard_name == "") { |
56 | ! |
if (verbose) { |
57 | ! |
cli::cli_progress_update( |
58 | ! |
status = "failed to identify standard type for {.file {file}}" |
59 |
) |
|
60 | ! |
cli::cli_progress_done(result = "failed") |
61 |
} |
|
62 | ! |
next |
63 |
} |
|
64 | 2x |
name <- epic$metadata$standard_name |
65 | 2x |
metadata[[name]] <- c(list(epic$metadata), metadata[[name]]) |
66 | 2x |
file_id_cols <- id_cols[id_cols %in% colnames(epic$data)] |
67 | 2x |
epic$data <- epic$data[ |
68 | 2x |
rowMeans(is.na(epic$data[, |
69 | 2x |
!(colnames(epic$data) %in% file_id_cols), |
70 | 2x |
drop = FALSE |
71 |
])) != |
|
72 | 2x |
1, |
73 |
] |
|
74 | 2x |
n_col <- grep("^n_", colnames(epic$data)) |
75 | 2x |
if (length(n_col)) { |
76 | ! |
colnames(epic$data)[[n_col]] <- paste0("n_", epic$metadata$standard_name) |
77 |
} |
|
78 | 2x |
if (!is.null(data[[name]])) { |
79 | 1x |
cols <- colnames(data[[name]]) |
80 | 1x |
cols <- cols[!(cols %in% colnames(epic$data))] |
81 | 1x |
if (length(cols)) { |
82 | ! |
epic$data[, cols] <- NA |
83 |
} |
|
84 | 1x |
epic$data <- epic$data[, colnames(data[[name]])] |
85 | 1x |
file_id_cols <- id_cols[id_cols %in% colnames(data[[name]])] |
86 | 1x |
data[[name]] <- data[[name]][ |
87 | 1x |
!(do.call(paste, data[[name]][, file_id_cols]) %in% |
88 | 1x |
do.call(paste, epic$data[, file_id_cols])), |
89 |
] |
|
90 |
} |
|
91 | 2x |
data[[name]] <- rbind(epic$data, data[[name]]) |
92 | 2x |
if (verbose) cli::cli_progress_done() |
93 |
} |
|
94 | 1x |
for (name in names(metadata)) { |
95 | 1x |
if (verbose) { |
96 | 1x |
cli::cli_progress_step( |
97 | 1x |
"writing standard raw output for {.field {name}}", |
98 | 1x |
spinner = TRUE |
99 |
) |
|
100 |
} |
|
101 | 1x |
paths <- paste0(out_dir, "/", name, ".", c("json", "csv.xz")) |
102 | 1x |
jsonlite::write_json( |
103 | 1x |
metadata[[name]], |
104 | 1x |
paths[[1L]], |
105 | 1x |
auto_unbox = TRUE, |
106 | 1x |
pretty = TRUE |
107 |
) |
|
108 | 1x |
vroom::vroom_write(data[[name]], paths[[2L]]) |
109 | 1x |
if (cleanup) { |
110 | 1x |
unlink(vapply(metadata[[name]], "[[", "", "file")) |
111 |
} |
|
112 | 1x |
if (verbose) cli::cli_progress_done() |
113 |
} |
|
114 | 1x |
return(list(metadata = metadata, data = data)) |
115 |
} |
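A sketch of inspecting the combined output (run from a source project; assumes staging files exist):

```R
res <- dcf_process_epic_staging("raw/staging", "raw")
if (!is.null(res)) {
  names(res$data) # recognized standard names, e.g. "flu" or "rsv"
  res$metadata[[names(res$data)[[1L]]]] # per-file metadata for the first name
}
```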
1 |
#' Run Data Project Processes |
|
2 |
#' |
|
3 |
#' Operates over data source and bundle projects, optionally running the source |
|
4 |
#' ingest scripts, then collecting metadata. |
|
5 |
#' |
|
6 |
#' @param name Name of a source project to process. If not specified, all projects will be processed. |
|
7 |
#' @param project_dir Path to the project directory. If not specified, and being called |
|
8 |
#' from a source project, this will be assumed to be two steps back from the working directory. |
|
9 |
#' @param ingest Logical; if \code{FALSE}, will re-process standardized data without running |
|
10 |
#' ingestion scripts. Only applies to source projects. |
|
11 |
#' @param is_auto Logical; if \code{TRUE}, will skip process scripts marked as manual. |
|
12 |
#' @param force Logical; if \code{TRUE}, will ignore process frequencies |
|
13 |
#' (will run scripts even if recently run). |
|
14 |
#' @param clear_state Logical; if \code{TRUE}, will clear stored states before processing. |
|
15 |
#' @returns A list with processing results: |
|
16 |
#' \itemize{ |
|
17 |
#' \item \code{timings}: How many seconds the scripts took to run. |
|
18 |
#' \item \code{logs}: The captured output of the scripts. |
|
19 |
#' } |
|
20 |
#' Each of these has an entry for each project. |
|
21 |
#' |
|
22 |
#' A `datapackage.json` file is also created or updated in each source's `standard` directory. |
|
23 |
#' @examples |
|
24 |
#' \dontrun{ |
|
25 |
#' # run from a directory containing a `data` directory containing the source |
|
26 |
#' dcf_process("source_name") |
|
27 |
#' |
|
28 |
#' # run without executing the ingestion script |
|
29 |
#' dcf_process("source_name", ingest = FALSE) |
|
30 |
#' } |
|
31 |
#' @export |
|
32 | ||
33 |
dcf_process <- function( |
|
34 |
name = NULL, |
|
35 |
project_dir = ".", |
|
36 |
ingest = TRUE, |
|
37 |
is_auto = FALSE, |
|
38 |
force = FALSE, |
|
39 |
clear_state = FALSE |
|
40 |
) { |
|
41 | 4x |
settings_file <- paste0(project_dir, "/settings.json") |
42 | 4x |
from_project <- file.exists(settings_file) |
43 | 4x |
if (from_project) { |
44 | 4x |
source_dir <- paste0( |
45 | 4x |
project_dir, |
46 |
"/", |
|
47 | 4x |
jsonlite::read_json(settings_file)$data_dir |
48 |
) |
|
49 |
} else { |
|
50 | ! |
source_dir <- "../.." |
51 | ! |
name <- basename(getwd()) |
52 |
} |
|
53 | ||
54 | 4x |
sources <- if (is.null(name)) { |
55 | 2x |
list.files( |
56 | 2x |
source_dir, |
57 | 2x |
"process\\.json", |
58 | 2x |
recursive = TRUE, |
59 | 2x |
full.names = TRUE |
60 |
) |
|
61 |
} else { |
|
62 | 2x |
process_files <- paste0(source_dir, "/", name, "/process.json") |
63 | 2x |
if (any(!file.exists(process_files))) { |
64 | ! |
cli::cli_abort( |
65 | ! |
"missing process file{?/s}: {.emph {process_files[!file.exists(process_files)]}}" |
66 |
) |
|
67 |
} |
|
68 | 2x |
process_files |
69 |
} |
|
70 | 4x |
decide_to_run <- function(process_script) { |
71 | 4x |
if (is_auto && process_script$manual) { |
72 | ! |
return(FALSE) |
73 |
} |
|
74 | 4x |
if ( |
75 | 4x |
force || process_script$last_run == "" || process_script$frequency == 0L |
76 |
) { |
|
77 | 4x |
return(TRUE) |
78 |
} |
|
79 | 4x |
if ( |
80 | ! |
difftime(Sys.time(), as.POSIXct(process_script$last_run), units = "day") > |
81 | ! |
process_script$frequency |
82 |
) { |
|
83 | ! |
return(TRUE) |
84 |
} |
|
85 | ! |
FALSE |
86 |
} |
|
87 | 4x |
collect_env <- new.env() |
88 | 4x |
collect_env$timings <- list() |
89 | 4x |
collect_env$logs <- list() |
90 | 4x |
process_source <- function(process_file) { |
91 | 4x |
process_def <- dcf_process_record(process_file) |
92 | 4x |
if (clear_state) { |
93 | ! |
process_def$raw_state <- NULL |
94 | ! |
process_def$standard_state <- NULL |
95 | ! |
dcf_process_record(process_file, process_def) |
96 |
} |
|
97 | 4x |
name <- process_def$name |
98 | 4x |
dcf_add_source(name, project_dir, open_after = FALSE) |
99 | 4x |
for (si in seq_along(process_def$scripts)) { |
100 | 4x |
st <- proc.time()[[3]] |
101 | 4x |
process_script <- process_def$scripts[[si]] |
102 | 4x |
run_current <- ingest && decide_to_run(process_script) |
103 | 4x |
base_dir <- dirname(process_file) |
104 | 4x |
standard_dir <- paste0(base_dir, "/standard") |
105 | 4x |
script <- paste0(base_dir, "/", process_script$path) |
106 | 4x |
file_ref <- if (run_current) paste0(" ({.emph ", script, "})") else NULL |
107 | 4x |
cli::cli_progress_step( |
108 | 4x |
paste0("processing source {.strong ", name, "}", file_ref), |
109 | 4x |
spinner = TRUE |
110 |
) |
|
111 | 4x |
env <- new.env() |
112 | 4x |
env$dcf_process_continue <- TRUE |
113 | 4x |
status <- if (run_current) { |
114 | 4x |
tryCatch( |
115 | 4x |
list( |
116 | 4x |
log = utils::capture.output( |
117 | 4x |
source(script, env, chdir = TRUE), |
118 | 4x |
type = "message" |
119 |
), |
|
120 | 4x |
success = TRUE |
121 |
), |
|
122 | 4x |
error = function(e) list(log = e$message, success = FALSE) |
123 |
) |
|
124 |
} else { |
|
125 | ! |
list(log = "", success = TRUE) |
126 |
} |
|
127 | 4x |
collect_env$logs[[name]] <- status$log |
128 | 4x |
if (run_current) { |
129 | 4x |
process_script$last_run <- Sys.time() |
130 | 4x |
process_script$run_time <- proc.time()[[3]] - st |
131 | 4x |
process_script$last_status <- status |
132 | 4x |
process_def$scripts[[si]] <- process_script |
133 |
} |
|
134 | 4x |
if (status$success) { |
135 | 4x |
collect_env$timings[[name]] <- process_script$run_time |
136 |
} |
|
137 | ! |
if (!env$dcf_process_continue) break |
138 |
} |
|
139 | 4x |
process_def_current <- dcf_process_record(process_file) |
140 | 4x |
if ( |
141 | 4x |
is.null(process_def_current$raw_state) || |
142 | 4x |
!identical(process_def$raw_state, process_def_current$raw_state) |
143 |
) { |
|
144 | 4x |
process_def_current$scripts <- process_def$scripts |
145 | 4x |
dcf_process_record(process_file, process_def_current) |
146 |
} |
|
147 | 4x |
data_files <- list.files(standard_dir, "\\.(?:csv|parquet)") |
148 | 4x |
if (length(data_files)) { |
149 | 4x |
measure_info_file <- paste0(base_dir, "/measure_info.json") |
150 | 4x |
standard_state <- as.list(tools::md5sum(c( |
151 | 4x |
measure_info_file, |
152 | 4x |
paste0(standard_dir, "/", data_files) |
153 |
))) |
|
154 | 4x |
if (!identical(process_def_current$standard_state, standard_state)) { |
155 | 3x |
measure_info <- dcf_measure_info( |
156 | 3x |
measure_info_file, |
157 | 3x |
include_empty = FALSE, |
158 | 3x |
render = TRUE, |
159 | 3x |
write = FALSE, |
160 | 3x |
open_after = FALSE, |
161 | 3x |
verbose = FALSE |
162 |
) |
|
163 | 3x |
measure_sources <- list() |
164 | 3x |
for (info in measure_info) { |
165 | 4x |
for (s in info$sources) { |
166 | 4x |
if ( |
167 | ! |
!is.null(s$location) && |
168 | ! |
!(s$location %in% names(measure_sources)) |
169 |
) { |
|
170 | ! |
measure_sources[[s$location]] <- s |
171 |
} |
|
172 |
} |
|
173 |
} |
|
174 | 3x |
if (!file.exists(paste0(standard_dir, "/datapackage.json"))) { |
175 | ! |
dcf_datapackage_init(name, dir = standard_dir, quiet = TRUE) |
176 |
} |
|
177 | 3x |
dcf_datapackage_add( |
178 | 3x |
data_files, |
179 | 3x |
meta = list( |
180 | 3x |
source = unname(measure_sources), |
181 | 3x |
base_dir = base_dir, |
182 | 3x |
ids = "geography", |
183 | 3x |
time = "time", |
184 | 3x |
variables = measure_info |
185 |
), |
|
186 | 3x |
dir = standard_dir, |
187 | 3x |
pretty = TRUE, |
188 | 3x |
summarize_ids = TRUE, |
189 | 3x |
verbose = FALSE |
190 |
) |
|
191 | 3x |
process_def_current$standard_state <- standard_state |
192 | 3x |
dcf_process_record(process_file, process_def_current) |
193 |
} |
|
194 | 4x |
cli::cli_progress_done(result = if (status$success) "done" else "failed") |
195 |
} else { |
|
196 | ! |
cli::cli_progress_done(result = "failed") |
197 | ! |
cli::cli_bullets( |
198 | ! |
c(" " = "no standard data files found in {.path {process_file}}") |
199 |
) |
|
200 |
} |
|
201 |
} |
|
202 | 4x |
process_bundle <- function(process_file) { |
203 | 1x |
process_def <- dcf_process_record(process_file) |
204 | 1x |
if (clear_state) { |
205 | ! |
process_def$source_state <- NULL |
206 | ! |
process_def$dist_state <- NULL |
207 | ! |
dcf_process_record(process_file, process_def) |
208 |
} |
|
209 | 1x |
name <- process_def$name |
210 | 1x |
dcf_add_bundle(name, project_dir, open_after = FALSE) |
211 | 1x |
for (si in seq_along(process_def$scripts)) { |
212 | 1x |
st <- proc.time()[[3]] |
213 | 1x |
process_script <- process_def$scripts[[si]] |
214 | 1x |
base_dir <- dirname(process_file) |
215 | 1x |
script <- paste0(base_dir, "/", process_script$path) |
216 | 1x |
measure_info_file <- paste0(base_dir, "/measure_info.json") |
217 | 1x |
run_current <- TRUE |
218 | 1x |
if (length(process_def$source_files)) { |
219 | 1x |
standard_files <- paste0(source_dir, "/", process_def$source_files) |
220 | 1x |
standard_state <- as.list(tools::md5sum(c( |
221 | 1x |
measure_info_file, |
222 | 1x |
paste0(source_dir, "/", process_def$source_files) |
223 |
))) |
|
224 | 1x |
run_current <- !identical(standard_state, process_def$source_state) |
225 |
} |
|
226 | 1x |
if (run_current) { |
227 | 1x |
cli::cli_progress_step( |
228 | 1x |
paste0( |
229 | 1x |
"processing bundle {.strong ", |
230 | 1x |
name, |
231 | 1x |
"} ({.emph ", |
232 | 1x |
script, |
233 |
"})" |
|
234 |
), |
|
235 | 1x |
spinner = TRUE |
236 |
) |
|
237 | 1x |
env <- new.env() |
238 | 1x |
env$dcf_process_continue <- TRUE |
239 | 1x |
status <- tryCatch( |
240 | 1x |
list( |
241 | 1x |
log = utils::capture.output( |
242 | 1x |
source(script, env, chdir = TRUE), |
243 | 1x |
type = "message" |
244 |
), |
|
245 | 1x |
success = TRUE |
246 |
), |
|
247 | 1x |
error = function(e) list(log = e$message, success = FALSE) |
248 |
) |
|
249 | 1x |
collect_env$logs[[name]] <- status$log |
250 | 1x |
if (run_current) { |
251 | 1x |
process_script$last_run <- Sys.time() |
252 | 1x |
process_script$run_time <- proc.time()[[3]] - st |
253 | 1x |
process_script$last_status <- status |
254 | 1x |
process_def$scripts[[si]] <- process_script |
255 |
} |
|
256 | 1x |
if (status$success) { |
257 | 1x |
collect_env$timings[[name]] <- process_script$run_time |
258 |
} |
|
259 | ! |
if (!env$dcf_process_continue) break |
260 |
} |
|
261 |
} |
|
262 | 1x |
process_def_current <- dcf_process_record(process_file) |
263 | 1x |
dist_files <- list.files(paste0(base_dir, "/dist")) |
264 | 1x |
if (length(dist_files)) { |
265 | 1x |
measure_info_file <- paste0(base_dir, "/measure_info.json") |
266 | 1x |
dist_state <- as.list(tools::md5sum(c( |
267 | 1x |
measure_info_file, |
268 | 1x |
paste0(base_dir, "/dist/", dist_files) |
269 |
))) |
|
270 | 1x |
if (!identical(process_def_current$dist_state, dist_state)) { |
271 | 1x |
process_def_current$scripts <- process_def$scripts |
272 | 1x |
process_def_current$dist_state <- dist_state |
273 | 1x |
dcf_process_record(process_file, process_def_current) |
274 |
} |
|
275 | 1x |
cli::cli_progress_done(result = if (status$success) "done" else "failed") |
276 |
} else { |
|
277 | ! |
cli::cli_progress_done(result = "failed") |
278 | ! |
cli::cli_bullets( |
279 | ! |
c(" " = "no standard data files found in {.path {process_file}}") |
280 |
) |
|
281 |
} |
|
282 |
} |
|
283 | 4x |
for (process_file in sources[order( |
284 | 4x |
vapply( |
285 | 4x |
sources, |
286 | 4x |
function(f) { |
287 | 5x |
type <- jsonlite::read_json(f)$type |
288 | 5x |
is.null(type) || type == "bundle" |
289 |
}, |
|
290 | 4x |
TRUE |
291 |
) |
293 |
)]) { |
|
294 | 5x |
process_def <- dcf_process_record(process_file) |
295 | 5x |
if (is.null(process_def$type) || process_def$type == "source") { |
296 | 4x |
process_source(process_file) |
297 |
} else { |
|
298 | 1x |
process_bundle(process_file) |
299 |
} |
|
300 |
} |
|
301 | 4x |
invisible(list(timings = collect_env$timings, logs = collect_env$logs)) |
302 |
} |
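For example, to re-process standardized data across all projects without re-running ingestion scripts, then inspect the documented return value (a sketch, run from a project root):

```R
res <- dcf_process(project_dir = ".", ingest = FALSE)
res$timings # seconds each script took, by project
res$logs    # captured script output, by project
```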
1 |
dcf_read_settings <- function(project_dir = ".") { |
|
2 | 17x |
settings_file <- paste0(project_dir, "/settings.json") |
3 | 17x |
if (!file.exists(settings_file)) { |
4 | ! |
cli::cli_abort( |
5 | ! |
"{.arg project_dir} ({project_dir}) does not appear to be a Data Collection Framework project" |
6 |
) |
|
7 |
} |
|
8 | 17x |
jsonlite::read_json(settings_file) |
9 |
} |
1 |
#' Retrieve A Data File |
|
2 |
#' |
|
3 |
#' Load a data file from a source data project, or list versions of the file. |
|
4 |
#' |
|
5 |
#' @param path Path to the file. |
|
6 |
#' @param date Date of the version to load; a \code{Date}, or \code{character} in the format |
|
7 |
#' \code{YYYY-MM-DD}. Will match to the nearest version. |
|
8 |
#' @param commit_hash SHA signature of the committed version; |
|
9 |
#' can be the first 6 or so characters. Ignored if \code{date} is provided. |
|
10 |
#' @param versions Logical; if \code{TRUE}, will return a list of available versions, |
|
11 |
#' rather than a file path. |
|
12 |
#' @returns If \code{versions} is \code{TRUE}, a \code{data.frame} with columns for |
|
13 |
#' the \code{hash}, \code{author}, \code{date}, and \code{message} of each commit. |
|
14 |
#' Otherwise, the path to a temporary file, if one was extracted. |
|
15 |
#' |
|
16 |
#' @examples |
|
17 |
#' path <- "../../../pophive/data/wastewater/raw/flua.csv.xz" |
|
18 |
#' if (file.exists(path)) { |
|
19 |
#' # list versions |
|
20 |
#' versions <- dcf_get_file(path, versions = TRUE) |
|
21 |
#' print(versions[, c("date", "hash")]) |
|
22 |
#' |
|
23 |
#' # extract a version to a temporary file |
|
24 |
#' temp_path <- dcf_get_file(path, "2025-05") |
|
25 |
#' basename(temp_path) |
|
26 |
#' } |
|
27 |
#' |
|
28 |
#' @export |
|
29 | ||
30 |
dcf_get_file <- function( |
|
31 |
path, |
|
32 |
date = NULL, |
|
33 |
commit_hash = NULL, |
|
34 |
versions = FALSE |
|
35 |
) { |
|
36 | 3x |
if (missing(path)) { |
37 | ! |
cli::cli_abort("specify a path") |
38 |
} |
|
39 | 3x |
if (!file.exists(path)) { |
40 | ! |
cli::cli_abort("path does not exist") |
41 |
} |
|
42 | 3x |
vs <- data.frame( |
43 | 3x |
hash = character(), |
44 | 3x |
author = character(), |
45 | 3x |
date = character(), |
46 | 3x |
message = character() |
47 |
) |
|
48 | 3x |
if (versions || !is.null(date)) { |
49 | 2x |
commits <- sys::exec_internal("git", c("log", path)) |
50 | 2x |
if (commits$status == 0L) { |
51 | 2x |
commits <- do.call( |
52 | 2x |
rbind, |
53 | 2x |
Filter( |
54 | 2x |
function(e) length(e) == 4L, |
55 | 2x |
strsplit( |
56 | 2x |
strsplit(rawToChar(commits$stdout), "commit ", fixed = TRUE)[[1L]], |
57 | 2x |
"\\n+(?:[^:]+:)?\\s*" |
58 |
) |
|
59 |
) |
|
60 |
) |
|
61 | 2x |
colnames(commits) <- colnames(vs) |
62 | 2x |
vs <- as.data.frame(commits) |
63 |
} else { |
|
64 | ! |
cli::cli_abort("failed to git log: {rawToChar(commits$stderr)}") |
65 |
} |
|
66 |
} |
|
67 | 3x |
if (versions) { |
68 | 1x |
return(vs) |
69 |
} |
|
70 | 2x |
if (!is.null(date)) { |
71 | 1x |
if (nrow(vs) == 0L) { |
72 | ! |
return(path) |
73 |
} |
|
74 | 1x |
if (is.character(date)) { |
75 | 1x |
date <- as.POSIXct( |
76 | 1x |
date, |
77 | 1x |
tryFormats = c( |
78 | 1x |
"%Y-%m-%d %H:%M:%S", |
79 | 1x |
"%Y-%m-%d %H:%M", |
80 | 1x |
"%Y-%m-%d", |
81 | 1x |
"%Y-%m", |
82 | 1x |
"%Y" |
83 |
), |
|
84 | 1x |
tz = "UTC" |
85 |
) |
|
86 |
} |
|
87 | 1x |
commit_hash <- vs$hash[which.min(abs( |
88 | 1x |
as.POSIXct(vs$date, "%a %b %d %H:%M:%S %Y", tz = "UTC") - date |
89 |
))] |
|
90 |
} |
|
91 | 2x |
if (is.null(commit_hash)) { |
92 | ! |
return(path) |
93 |
} |
|
94 | 2x |
name_parts <- strsplit(basename(path), ".", fixed = TRUE)[[1L]] |
95 | 2x |
out_path <- paste0( |
96 | 2x |
tempdir(), |
97 |
"/", |
|
98 | 2x |
name_parts[[1L]], |
99 |
"-", |
|
100 | 2x |
substring(commit_hash, 1L, 6L), |
101 |
".", |
|
102 | 2x |
paste(name_parts[-1L], collapse = ".") |
103 |
) |
|
104 | 2x |
if (file.exists(out_path)) { |
105 | ! |
return(out_path) |
106 |
} |
|
107 | 2x |
status <- sys::exec_wait( |
108 | 2x |
"git", |
109 | 2x |
c("show", paste0(commit_hash, ":", path)), |
110 | 2x |
std_out = out_path |
111 |
) |
|
112 | 2x |
if (status != 0L) { |
113 | ! |
cli::cli_abort("failed to git show: {rawToChar(status$stderr)}") |
114 |
} |
|
115 | 2x |
out_path |
116 |
} |
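Beyond the date matching shown in the examples, a version can also be pulled directly by an abbreviated commit hash (the path and hash here are illustrative):

```R
temp_path <- dcf_get_file("data/epic/raw/flu.csv.xz", commit_hash = "a1b2c3")
```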
1 |
#' Adds a Bundle Project |
|
2 |
#' |
|
3 |
#' Establishes a new data bundle project, used to prepare outputs from standardized datasets. |
|
4 |
#' |
|
5 |
#' @param name Name of the bundle. |
|
6 |
#' @param project_dir Path to the Data Collection Framework project. |
|
7 |
#' @param source_files Character vector of paths to standard files from source projects. |
|
8 |
#' @param open_after Logical; if \code{FALSE}, will not open the project. |
|
9 |
#' @returns Nothing; creates default files and directories. |
|
10 |
#' @section Project: |
|
11 |
#' |
|
12 |
#' Within a bundle project, there are two files to edit: |
|
13 |
#' \itemize{ |
|
14 |
#' \item \strong{\code{build.R}}: This is the primary script, which is automatically rerun. |
|
15 |
#' It should read standard-format files from source projects, |
|
16 |
#' then write prepared outputs to \code{dist/}. |
|
17 |
#' This file is sourced from its location during processing, so any system paths |
|
18 |
#' must be relative to itself. |
|
19 |
#' \item \strong{\code{measure_info.json}}: This is where you can record information |
|
20 |
#' about the variables included in the standardized data files. |
|
21 |
#' See \code{\link{dcf_measure_info}}. |
|
22 |
#' } |
|
23 |
#' |
|
24 |
#' @examples |
|
25 |
#' project_dir <- paste0(tempdir(), "/temp_project") |
|
26 |
#' dcf_init("temp_project", dirname(project_dir)) |
|
27 |
#' dcf_add_bundle("bundle_name", project_dir) |
|
28 |
#' list.files(paste0(project_dir, "/data/bundle_name")) |
|
29 |
#' |
|
30 |
#' @export |
|
31 | ||
32 |
dcf_add_bundle <- function( |
|
33 |
name, |
|
34 |
project_dir = ".", |
|
35 |
source_files = NULL, |
|
36 |
open_after = interactive() |
|
37 |
) { |
|
38 | 2x |
if (missing(name)) { |
39 | ! |
cli::cli_abort("specify a name") |
40 |
} |
|
41 | 2x |
name <- gsub("[^A-Za-z0-9]+", "_", name) |
42 | 2x |
settings <- dcf_read_settings(project_dir) |
43 | 2x |
base_dir <- paste(c(project_dir, settings$data_dir, name), collapse = "/") |
44 | 2x |
dir.create(paste0(base_dir, "/dist"), showWarnings = FALSE, recursive = TRUE) |
45 | 2x |
paths <- paste0( |
46 | 2x |
base_dir, |
47 |
"/", |
|
48 | 2x |
c( |
49 | 2x |
"README.md", |
50 | 2x |
"project.Rproj", |
51 | 2x |
"process.json", |
52 | 2x |
"measure_info.json", |
53 | 2x |
"build.R" |
54 |
) |
|
55 |
) |
|
56 | 2x |
if (!file.exists(paths[[1L]])) { |
57 | 1x |
writeLines( |
58 | 1x |
paste0( |
59 | 1x |
c( |
60 | 1x |
paste("#", name), |
61 |
"", |
|
62 | 1x |
"This is a Data Collection Framework data bundle project, initialized with `dcf::dcf_add_bundle`.", |
63 |
"", |
|
64 | 1x |
"You can us the `dcf` package to rebuild the bundle:", |
65 |
"", |
|
66 | 1x |
"```R", |
67 | 1x |
paste0('dcf::dcf_process("', name, '", "..")'), |
68 |
"```" |
|
69 |
), |
|
70 | 1x |
collapse = "\n" |
71 |
), |
|
72 | 1x |
paths[[1L]] |
73 |
) |
|
74 |
} |
|
75 | 2x |
if (!file.exists(paths[[2L]])) { |
76 | 1x |
writeLines("Version: 1.0\n", paths[[2L]]) |
77 |
} |
|
78 | 2x |
if (!file.exists(paths[[3L]])) { |
79 | 1x |
jsonlite::write_json( |
80 | 1x |
list( |
81 | 1x |
name = name, |
82 | 1x |
type = "bundle", |
83 | 1x |
scripts = list( |
84 | 1x |
list( |
85 | 1x |
path = "build.R", |
86 | 1x |
last_run = "", |
87 | 1x |
run_time = "", |
88 | 1x |
last_status = list(log = "", success = TRUE) |
89 |
) |
|
90 |
), |
|
91 | 1x |
source_files = source_files |
92 |
), |
|
93 | 1x |
paths[[3L]], |
94 | 1x |
auto_unbox = TRUE, |
95 | 1x |
pretty = TRUE |
96 |
) |
|
97 |
} |
|
98 | 2x |
if (!file.exists(paths[[4L]])) { |
99 | 1x |
writeLines("{}\n", paths[[4L]]) |
100 |
} |
|
101 | 2x |
if (!file.exists(paths[[5L]])) { |
102 | 1x |
writeLines( |
103 | 1x |
paste0( |
104 | 1x |
c( |
105 | 1x |
"# read data from data source projects", |
106 | 1x |
"# and write to this project's `dist` directory", |
107 |
"" |
|
108 |
), |
|
109 | 1x |
collapse = "\n" |
110 |
), |
|
111 | 1x |
paths[[5L]] |
112 |
) |
|
113 |
} |
|
114 | ! |
if (open_after) rstudioapi::openProject(paths[[2L]], newSession = TRUE) |
115 |
} |
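The roxygen example covers the minimal case; a sketch with `source_files`, which are recorded in `process.json` and later used by `dcf_process` to decide whether the bundle needs rebuilding (paths are illustrative, relative to the data directory):

```R
dcf_add_bundle(
  "flu_bundle",
  project_dir = "temp_project",
  source_files = c(
    "epic/standard/flu.csv.xz",
    "gtrends/standard/flu.csv.xz"
  ),
  open_after = FALSE
)
```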
1 |
#' Initialize a Data Collection Project |
|
2 |
#' |
|
3 |
#' Establishes a new data collection framework project. |
|
4 |
#' |
|
5 |
#' @param name Name of the project. Defaults to the current directory name. |
|
6 |
#' @param base_dir Path to the parent of the project directory (where the \code{name} |
|
7 |
#' directory should be created). If \code{name} is not specified, will treat the current |
|
8 |
#' directory as \code{name}, and \code{".."} as \code{base_dir}. |
|
9 |
#' @param data_dir Name of the directory to store projects in, relative to \code{base_dir}. |
|
10 |
#' @param github_account Name of the GitHub account that will host the repository. |
|
11 |
#' @param branch Name of the repository's branch. |
|
12 |
#' @param repo_name Name of the repository. |
|
13 |
#' @param use_git Logical; if \code{TRUE}, will initialize a git repository. |
|
14 |
#' @param open_after Logical; if \code{TRUE}, will open the project in a new RStudio instance. |
|
15 |
#' @returns Nothing; creates default files and directories. |
|
16 |
#' @section Data Collection Project: |
|
17 |
#' |
|
18 |
#' A data collection project starts with a \code{settings.json} file, which |
|
19 |
#' specifies where source and bundle projects live (a \code{data} subdirectory by default). |
|
20 |
#' |
|
21 |
#' The bulk of the project will then be in the source and bundle projects, as created |
|
22 |
#' by the \code{\link{dcf_add_source}} and \code{\link{dcf_add_bundle}}. |
|
23 |
#' |
|
24 |
#' Once these sub-projects are in place, they can be operated over by the |
|
25 |
#' \code{\link{dcf_build}}, which processes each sub-project using |
|
26 |
#' \code{\link{dcf_process}}, and checks them with \code{\link{dcf_check_sources}}, |
|
27 |
#' resulting in a report. |
|
28 |
#' |
|
29 |
#' @examples |
|
30 |
#' base_dir <- tempdir() |
|
31 |
#' dcf_init("project_name", base_dir) |
|
32 |
#' list.files(paste0(base_dir, "/project_name")) |
|
33 |
#' |
|
34 |
#' @export |
|
35 | ||
36 |
dcf_init <- function( |
|
37 |
name, |
|
38 |
base_dir = ".", |
|
39 |
data_dir = "data", |
|
40 |
github_account = "", |
|
41 |
branch = "main", |
|
42 |
repo_name = name, |
|
43 |
use_git = TRUE, |
|
44 |
open_after = FALSE |
|
45 |
) { |
|
46 | 1x |
if (missing(name)) { |
47 | ! |
name <- basename(getwd()) |
48 | ! |
base_dir <- ".." |
49 |
} else { |
|
50 | 1x |
name <- gsub("[^A-Za-z0-9]+", "_", name) |
51 |
} |
|
52 | 1x |
base_path <- paste0(base_dir, "/", name, "/") |
53 | 1x |
dir.create(base_path, showWarnings = FALSE, recursive = TRUE) |
54 | 1x |
paths <- paste0( |
55 | 1x |
base_path, |
56 | 1x |
c( |
57 | 1x |
"project.Rproj", |
58 | 1x |
"settings.json", |
59 | 1x |
"README.md", |
60 | 1x |
"scripts/build.R", |
61 | 1x |
".github/workflows/build.yaml", |
62 | 1x |
".gitignore" |
63 |
) |
|
64 |
) |
|
65 |
if ( |
|
66 | 1x |
!file.exists(paths[[1L]]) && !length(list.files(base_path, "\\.Rproj$")) |
67 |
) { |
|
68 | 1x |
writeLines("Version: 1.0\n", paths[[1L]]) |
69 |
} |
|
70 | 1x |
if (!file.exists(paths[[2L]])) { |
71 | 1x |
jsonlite::write_json( |
72 | 1x |
list( |
73 | 1x |
name = name, |
74 | 1x |
data_dir = data_dir, |
75 | 1x |
github_account = github_account, |
76 | 1x |
branch = "main", |
77 | 1x |
repo_name = repo_name |
78 |
), |
|
79 | 1x |
paths[[2L]], |
80 | 1x |
auto_unbox = TRUE, |
81 | 1x |
pretty = TRUE |
82 |
) |
|
83 |
} |
|
84 | 1x |
if (!file.exists(paths[[3L]])) { |
85 | 1x |
writeLines( |
86 | 1x |
paste0( |
87 | 1x |
c( |
88 | 1x |
paste("#", name), |
89 |
"", |
|
90 | 1x |
"This is a Data Collection Framework project, initialized with `dcf::dcf_init`.", |
91 |
"", |
|
92 | 1x |
"You can us the `dcf` package to check the source projects:", |
93 |
"", |
|
94 | 1x |
"```R", |
95 | 1x |
paste0('dcf::dcf_check_sources()'), |
96 |
"```", |
|
97 |
"", |
|
98 | 1x |
"And process them:", |
99 |
"", |
|
100 | 1x |
"```R", |
101 | 1x |
paste0('dcf::dcf_process()'), |
102 |
"```" |
|
103 |
), |
|
104 | 1x |
collapse = "\n" |
105 |
), |
|
106 | 1x |
paths[[3L]] |
107 |
) |
|
108 |
} |
|
109 | 1x |
if (!file.exists(paths[[4L]])) { |
110 | 1x |
dir.create(dirname(paths[[4L]]), showWarnings = FALSE) |
111 | 1x |
writeLines( |
112 | 1x |
paste(c("library(dcf)", "dcf_build()"), collapse = "\n"), |
113 | 1x |
paths[[4L]] |
114 |
) |
|
115 |
} |
|
116 | 1x |
if (!file.exists(paths[[5L]])) { |
117 | 1x |
dir.create(dirname(paths[[5L]]), recursive = TRUE, showWarnings = FALSE) |
118 | 1x |
file.copy(system.file("workflows/build.yaml", package = "dcf"), paths[[5L]]) |
119 |
} |
|
120 | 1x |
if (!file.exists(paths[[6L]])) { |
121 | 1x |
writeLines( |
122 | 1x |
paste( |
123 | 1x |
c( |
124 | 1x |
"*.Rproj", |
125 | 1x |
".Rproj.user", |
126 | 1x |
".Rprofile", |
127 | 1x |
".Rhistory", |
128 | 1x |
".Rdata", |
129 | 1x |
".DS_Store", |
130 | 1x |
"renv" |
131 |
), |
|
132 | 1x |
collapse = "\n" |
133 |
), |
|
134 | 1x |
paths[[6L]] |
135 |
) |
|
136 |
} |
|
137 | ! |
if (open_after) rstudioapi::openProject(paths[[1L]], newSession = TRUE) |
138 |
} |
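A sketch of initializing with GitHub metadata, which is written to `settings.json` and later used by `dcf_status_diagram` to build links (the account name is hypothetical):

```R
dcf_init(
  "my_project",
  base_dir = tempdir(),
  github_account = "example-org",
  repo_name = "my_project"
)
```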
1 |
#' Run a Project's Build Process |
|
2 |
#' |
|
3 |
#' Build a Data Collection Framework project, |
|
4 |
#' which involves processing and checking all data projects. |
|
5 |
#' |
|
6 |
#' @param project_dir Path to the Data Collection Framework project to be built. |
|
7 |
#' @param is_auto Logical; if \code{FALSE}, will run \code{\link{dcf_process}} as if it were run |
|
8 |
#' manually. |
|
9 |
#' @param ... Passes arguments to \code{\link{dcf_process}}. |
|
10 |
#' @param make_diagram Logical; if \code{FALSE}, will not make a \code{status.md} diagram. |
|
11 |
#' @returns A version of the project report, which is also written to |
|
12 |
#' \code{project_dir/report.json.gz}. |
|
13 |
#' @examples |
|
14 |
#' project_file <- "../../../pophive" |
|
15 |
#' if (file.exists(project_file)) { |
|
16 |
#' report <- dcf_build(project_file) |
|
17 |
#' } |
|
18 |
#' @export |
|
19 | ||
20 |
dcf_build <- function( |
|
21 |
project_dir = ".", |
|
22 |
is_auto = TRUE, |
|
23 |
..., |
|
24 |
make_diagram = TRUE |
|
25 |
) { |
|
26 | 2x |
settings <- dcf_read_settings(project_dir) |
27 | 2x |
data_dir <- paste0(project_dir, "/", settings$data_dir) |
28 | 2x |
processes <- list.files( |
29 | 2x |
data_dir, |
30 | 2x |
"process\\.json", |
31 | 2x |
recursive = TRUE, |
32 | 2x |
full.names = TRUE |
33 |
) |
|
34 | 2x |
process_state <- tools::md5sum(processes) |
35 | 2x |
process <- dcf_process(project_dir = project_dir, is_auto = is_auto, ...) |
36 | 2x |
issues <- dcf_check_sources(project_dir = project_dir) |
37 | 2x |
report_file <- paste0(project_dir, "/report.json.gz") |
38 |
if ( |
|
39 | 2x |
!identical( |
40 | 2x |
process_state, |
41 | 2x |
tools::md5sum(list.files( |
42 | 2x |
data_dir, |
43 | 2x |
"process\\.json", |
44 | 2x |
recursive = TRUE, |
45 | 2x |
full.names = TRUE |
46 |
)) |
|
47 |
) |
|
48 |
) { |
|
49 | 2x |
datapackages <- list.files( |
50 | 2x |
data_dir, |
51 | 2x |
"datapackage\\.json", |
52 | 2x |
recursive = TRUE, |
53 | 2x |
full.names = TRUE |
54 |
) |
|
55 | 2x |
names(datapackages) <- list.dirs( |
56 | 2x |
data_dir, |
57 | 2x |
recursive = FALSE, |
58 | 2x |
full.names = FALSE |
59 |
) |
|
60 | 2x |
names(processes) <- names(datapackages) |
61 | 2x |
report <- list( |
62 | 2x |
date = Sys.time(), |
63 | 2x |
settings = settings, |
64 | 2x |
source_times = process$timings, |
65 | 2x |
logs = process$logs, |
66 | 2x |
issues = issues, |
67 | 2x |
metadata = lapply(datapackages, jsonlite::read_json), |
68 | 2x |
processes = lapply(processes, jsonlite::read_json) |
69 |
) |
|
70 | 2x |
jsonlite::write_json( |
71 | 2x |
report, |
72 | 2x |
gzfile(report_file), |
73 | 2x |
auto_unbox = TRUE, |
74 | 2x |
dataframe = "columns" |
75 |
) |
|
76 | 2x |
if (make_diagram) { |
77 | 2x |
writeLines( |
78 | 2x |
dcf_status_diagram(project_dir), |
79 | 2x |
paste0(project_dir, "/status.md") |
80 |
) |
|
81 |
} |
|
82 | 2x |
invisible(report) |
83 |
} else { |
|
84 | ! |
invisible(jsonlite::read_json(report_file)) |
85 |
} |
|
86 |
} |
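The invisible return mirrors what is written to `report.json.gz`, so a build can be inspected directly (a sketch, run from a project root):

```R
report <- dcf_build(".", make_diagram = FALSE)
names(report)         # date, settings, source_times, logs, issues, metadata, processes
length(report$issues) # one entry per checked source
```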
1 |
#' Make a Status Diagram |
|
2 |
#' |
|
3 |
#' Make a Data Collection Project status diagram. |
|
4 |
#' |
|
5 |
#' @param project_dir Path to the Data Collection Framework project to be built. |
|
6 |
#' @returns A character vector of the status diagram, which \code{\link{dcf_build}} also writes to |
|
7 |
#' the \code{project_dir/status.md} file. |
|
8 |
#' @examples |
|
9 |
#' \dontrun{ |
|
10 |
#' dcf_status_diagram("project_directory") |
|
11 |
#' } |
|
12 |
#' @export |
|
13 | ||
14 |
dcf_status_diagram <- function(project_dir = ".") { |
|
15 | 2x |
report_file <- paste0(project_dir, "/report.json.gz") |
16 | 2x |
if (!file.exists(report_file)) { |
17 | ! |
cli::cli_abort("no report file found") |
18 |
} |
|
19 | 2x |
report <- jsonlite::read_json(report_file) |
20 | 2x |
data_dir <- if (is.null(report$settings$data_dir)) "data" else |
21 | 2x |
report$settings$data_dir |
22 | 2x |
branch <- if (is.null(report$settings$branch)) "main" else |
23 | 2x |
report$settings$branch |
24 | 2x |
repo <- if (report$settings$github_account == "") { |
25 | 2x |
NULL |
26 |
} else { |
|
27 | ! |
paste0(report$settings$github_account, "/", report$settings$repo_name) |
28 |
} |
|
29 | 2x |
indent <- " " |
30 | 2x |
d <- c( |
31 | 2x |
'classDef source fill:#ffcdb4, stroke:#303', |
32 | 2x |
'classDef pass fill:#66bb6a, stroke:#303', |
33 | 2x |
'classDef warn fill:#ffa726, stroke:#303', |
34 | 2x |
'classDef fail fill:#f44336, stroke:#303' |
35 |
) |
|
36 | 2x |
sources <- NULL |
37 | 2x |
source_ids <- list() |
38 | 2x |
file_ids <- NULL |
39 | 2x |
relationships <- NULL |
40 | 2x |
projects <- NULL |
41 | 2x |
node_id <- 0L |
42 | 2x |
for (project_meta in report$metadata[names(sort(vapply( |
43 | 2x |
report$processes, |
44 | 2x |
function(p) !is.na(p$type) && p$type == "bundle", |
45 | 2x |
TRUE |
46 |
)))]) { |
|
47 | 3x |
name <- project_meta$name |
48 | 3x |
timing <- report$source_times[[name]] |
49 | 3x |
issues <- report$issues[[name]] |
50 | 3x |
measures <- report$metadata[[name]]$measure_info |
51 | 3x |
process <- report$processes[[name]] |
52 | 3x |
contents <- NULL |
53 | 3x |
node_id <- node_id + 1L |
54 | 3x |
if (!is.null(process$type) && process$type == "bundle") { |
55 | 1x |
dist_files <- grep( |
56 | 1x |
"measure_info", |
57 | 1x |
names(process$dist_state), |
58 | 1x |
value = TRUE, |
59 | 1x |
invert = TRUE |
60 |
) |
|
61 | 1x |
for (filename in basename(dist_files)) { |
62 | 1x |
contents <- c( |
63 | 1x |
contents, |
64 | 1x |
paste0( |
65 | 1x |
"n", |
66 | 1x |
node_id, |
67 |
'["`', |
|
68 | 1x |
if (is.null(repo)) filename else |
69 | 1x |
make_link( |
70 | 1x |
paste0( |
71 | 1x |
"https://github.com/", |
72 | 1x |
repo, |
73 | 1x |
"/blob/", |
74 | 1x |
branch, |
75 |
"/", |
|
76 | 1x |
data_dir, |
77 |
"/", |
|
78 | 1x |
name, |
79 | 1x |
"/dist/", |
80 | 1x |
filename |
81 |
), |
|
82 | 1x |
filename |
83 |
), |
|
84 |
'`"]' |
|
85 |
) |
|
86 |
) |
|
87 |
} |
|
88 | 1x |
relationships <- c( |
89 | 1x |
relationships, |
90 | 1x |
paste0("n", file_ids[process$source_files], " --> ", name) |
91 |
) |
|
92 |
} else { |
|
93 | 2x |
for (r in project_meta$resources) { |
94 | 2x |
file_path <- paste0( |
95 |
"./", |
|
96 | 2x |
report$settings$data_dir, |
97 |
"/", |
|
98 | 2x |
name, |
99 | 2x |
"/standard/", |
100 | 2x |
r$filename |
101 |
) |
|
102 | 2x |
file_ids[paste0(name, "/standard/", r$filename)] <- node_id |
103 | 2x |
file_issues <- issues[[file_path]] |
104 | 2x |
measure_sources <- NULL |
105 | 2x |
for (field in r$schema$fields) { |
106 | 6x |
field_source <- measures[[field$name]]$source |
107 | 6x |
for (s in field_source) { |
108 | ! |
if (is.null(source_ids[[s$name]])) { |
109 | ! |
source_id <- paste0("s", length(source_ids)) |
110 | ! |
source_ids[[s$name]] <- source_id |
111 | ! |
sources[[source_id]] <- list( |
112 | ! |
id = source_id, |
113 | ! |
general = make_link(s$url, s$name), |
114 | ! |
specific = NULL |
115 |
) |
|
116 |
} |
|
117 | ! |
source_id <- source_ids[[s$name]] |
118 | ! |
if (!is.null(s$location)) { |
119 | ! |
sources[[source_id]]$specific <- unique(c( |
120 | ! |
sources[[source_id]]$specific, |
121 | ! |
make_link(s$location_url, s$location) |
122 |
)) |
|
123 |
} |
|
124 | ! |
relationships <- unique(c( |
125 | ! |
relationships, |
126 | ! |
paste0(source_id, " --> n", node_id) |
127 |
)) |
|
128 |
} |
|
129 |
} |
|
130 | 2x |
contents <- c( |
131 | 2x |
contents, |
132 | 2x |
paste0( |
133 | 2x |
"n", |
134 | 2x |
node_id, |
135 |
'["`', |
|
136 | 2x |
if (is.null(repo)) r$filename else |
137 | 2x |
make_link( |
138 | 2x |
paste0( |
139 | 2x |
"https://github.com/", |
140 | 2x |
repo, |
141 | 2x |
"/blob/", |
142 | 2x |
branch, |
143 |
"/", |
|
144 | 2x |
data_dir, |
145 |
"/", |
|
146 | 2x |
name, |
147 | 2x |
"/standard/", |
148 | 2x |
r$filename |
149 |
), |
|
150 | 2x |
r$filename |
151 |
), |
|
152 | 2x |
if (length(file_issues)) make_list(unlist(file_issues)), |
153 | 2x |
paste0('`"]:::', if (length(file_issues)) "warn" else "pass") |
154 |
) |
|
155 |
) |
|
156 |
} |
|
157 |
} |
|
158 | 3x |
projects <- c( |
159 | 3x |
projects, |
160 | 3x |
c( |
161 | 3x |
paste0( |
162 | 3x |
"subgraph ", |
163 | 3x |
name, |
164 |
'["`', |
|
165 | 3x |
if (is.null(repo)) name else |
166 | 3x |
make_link( |
167 | 3x |
paste0( |
168 | 3x |
"https://github.com/", |
169 | 3x |
repo, |
170 | 3x |
"/tree/", |
171 | 3x |
branch, |
172 |
"/", |
|
173 | 3x |
data_dir, |
174 |
"/", |
|
175 | 3x |
name |
176 |
), |
|
177 | 3x |
name |
178 |
), |
|
179 |
'`"]' |
|
180 |
), |
|
181 | 3x |
paste0(indent, contents), |
182 | 3x |
"end" |
183 |
) |
|
184 |
) |
|
185 |
} |
|
186 | 2x |
c( |
187 | 2x |
"```mermaid", |
188 | 2x |
"flowchart LR", |
189 | 2x |
paste0( |
190 | 2x |
indent, |
191 | 2x |
c( |
192 | 2x |
d, |
193 | 2x |
vapply( |
194 | 2x |
sources, |
195 | 2x |
function(s) |
196 | 2x |
paste( |
197 | 2x |
c( |
198 | 2x |
s$id, |
199 | 2x |
'("`<h4>', |
200 | 2x |
s$general, |
201 | 2x |
"</h4>", |
202 | 2x |
if (length(s$specific)) paste0("<br/>", make_list(s$specific)), |
203 | 2x |
'`"):::source' |
204 |
), |
|
205 | 2x |
collapse = "" |
206 |
), |
|
207 |
"" |
|
208 |
), |
|
209 | 2x |
projects, |
210 | 2x |
relationships |
211 |
) |
|
212 |
), |
|
213 |
"```" |
|
214 |
) |
|
215 |
} |
|
216 | ||
217 |
make_link <- function(url, name = NULL) { |
|
218 | ! |
paste0( |
219 | ! |
'<a href="', |
220 | ! |
url, |
221 | ! |
'" target="_blank" rel="noreferrer">', |
222 | ! |
if (is.null(name)) sub("https?://(?:www\\.)?", "", url) else name, |
223 | ! |
"</a>" |
224 |
) |
|
225 |
} |
|
226 | ||
227 |
make_list <- function(items) { |
|
228 | ! |
paste0( |
229 | ! |
"<ul>", |
230 | ! |
paste(vapply(items, function(i) paste0("<br/><li>", i, "</li>"), ""), collapse = ""), |
231 | ! |
"</ul>" |
232 |
) |
|
233 |
} |
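These helpers emit the HTML fragments embedded in the mermaid nodes; for instance:

```R
make_link("https://example.com", "Example")
# -> <a href="https://example.com" target="_blank" rel="noreferrer">Example</a>
```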
1 |
#' Interact with a Source Process File |
|
2 |
#' |
|
3 |
#' Read or update the current source process file. |
|
4 |
#' |
|
5 |
#' @param path Path to the process JSON file. |
|
6 |
#' @param updated An updated version of the process definition. If specified, will |
|
7 |
#' write this as the new process file, rather than reading any existing file. |
|
8 |
#' @returns The process definition of the source project. |
|
9 |
#' @examples |
|
10 |
#' epic_process_file <- "../../data/epic/process.json" |
|
11 |
#' if (file.exists(epic_process_file)) { |
|
12 |
#' dcf_process_record(path = epic_process_file) |
|
13 |
#' } |
|
14 |
#' @export |
|
15 | ||
16 |
dcf_process_record <- function(path = "process.json", updated = NULL) { |
|
17 | 30x |
if (is.null(updated)) { |
18 | 19x |
if (!file.exists(path)) { |
19 | ! |
cli::cli_abort("process file {path} does not exist") |
20 |
} |
|
21 | 19x |
jsonlite::read_json(path) |
22 |
} else { |
|
23 | 11x |
jsonlite::write_json(updated, path, auto_unbox = TRUE, pretty = TRUE) |
24 | 11x |
updated |
25 |
} |
|
26 |
} |
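Since the same function both reads and writes, updates are read-modify-write round trips (the path is illustrative):

```R
def <- dcf_process_record("data/epic/process.json")
def$checked <- Sys.time()
dcf_process_record("data/epic/process.json", def) # writes and returns the update
```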
1 |
#' Update renv.lock |
|
2 |
#' |
|
3 |
#' Updates the \code{renv.lock} file with dependencies found in project scripts. |
|
4 |
#' |
|
5 |
#' @param project_dir Directory of the Data Collection project. |
|
6 |
#' @param refresh Logical; if \code{FALSE}, will update an existing |
|
7 |
#' \code{renv.lock} file, rather than recreating it. |
|
8 |
#' @returns Nothing; writes an \code{renv.lock} file. |
|
9 |
#' @examples |
|
10 |
#' \dontrun{ |
|
11 |
#' dcf_update_lock() |
|
12 |
#' } |
|
13 |
#' @export |
|
14 | ||
15 |
dcf_update_lock <- function( |
|
16 |
project_dir = ".", |
|
17 |
refresh = TRUE |
|
18 |
) { |
|
19 | 1x |
settings <- dcf_read_settings(project_dir) |
20 | 1x |
extra <- unique( |
21 | 1x |
renv::dependencies(list.files( |
22 | 1x |
paste0(project_dir, "/", settings$data_dir), |
23 | 1x |
"\\.[Rr]$", |
24 | 1x |
recursive = TRUE, |
25 | 1x |
full.names = TRUE |
26 | 1x |
))$Package |
27 |
) |
|
28 | 1x |
not_installed <- !(extra %in% rownames(utils::installed.packages())) |
29 | ! |
if (any(not_installed)) utils::install.packages(extra[not_installed]) |
30 | 1x |
if (refresh) unlink(paste0(project_dir, "/renv.lock")) |
31 | 1x |
renv::snapshot(packages = extra, lockfile = paste0(project_dir, "/renv.lock")) |
32 |
} |
1 |
#' Create a datapackage.json template |
|
2 |
#' |
|
3 |
#' Initialize dataset documentation with a \code{datapackage.json} template, based on a |
|
4 |
#' \href{https://specs.frictionlessdata.io/data-package}{Data Package} standard. |
|
5 |
#' |
|
6 |
#' @param name A unique name for the dataset; allowed characters are \code{[a-z._/-]}. |
|
7 |
#' @param title A display name for the dataset; if not specified, will be a formatted version of \code{name}. |
|
8 |
#' @param dir Directory in which to save the \code{datapackage.json} file. |
|
9 |
#' @param ... passes arguments to \code{\link{dcf_datapackage_add}}. |
|
10 |
#' @param write Logical; if \code{FALSE}, the package object will not be written to a file. |
|
11 |
#' @param overwrite Logical; if \code{TRUE} and \code{write} is \code{TRUE}, an existing |
|
12 |
#' \code{datapackage.json} file will be overwritten. |
|
13 |
#' @param quiet Logical; if \code{TRUE}, will not print messages or navigate to files. |
|
14 |
#' @examples |
|
15 |
#' \dontrun{ |
|
16 |
#' # make a template datapackage.json file in the current working directory |
|
17 |
#' dcf_datapackage_init("mtcars", "Motor Trend Car Road Tests") |
|
18 |
#' } |
|
19 |
#' @return An invisible list with the content written to the \code{datapackage.json} file. |
|
20 |
#' @seealso Add basic information about a dataset with \code{\link{dcf_datapackage_add}}. |
|
21 |
#' @export |
|
22 | ||
23 |
dcf_datapackage_init <- function( |
|
24 |
name, |
|
25 |
title = name, |
|
26 |
dir = ".", |
|
27 |
..., |
|
28 |
write = TRUE, |
|
29 |
overwrite = FALSE, |
|
30 |
quiet = !interactive() |
|
31 |
) { |
|
32 | 2x |
if (missing(name)) { |
33 | ! |
cli::cli_abort("{.arg name} must be specified") |
34 |
} |
|
35 | 2x |
package <- list( |
36 | 2x |
name = name, |
37 | 2x |
title = if (title == name) { |
38 | 2x |
gsub("\\b(\\w)", "\\U\\1", gsub("[._/-]", " ", name), perl = TRUE) |
39 |
} else { |
|
40 | ! |
title |
41 |
}, |
|
42 | 2x |
licence = list( |
43 | 2x |
url = "http://opendatacommons.org/licenses/pddl", |
44 | 2x |
name = "Open Data Commons Public Domain", |
45 | 2x |
version = "1.0", |
46 | 2x |
id = "odc-pddl" |
47 |
), |
|
48 | 2x |
resources = list() |
49 |
) |
|
50 | 2x |
package_path <- normalizePath(paste0(dir, "/datapackage.json"), "/", FALSE) |
51 | 2x |
if (write && !overwrite && file.exists(package_path)) { |
52 | ! |
cli::cli_abort(c( |
53 | ! |
"datapackage ({.path {package_path}}) already exists", |
54 | ! |
i = "add {.code overwrite = TRUE} to overwrite it" |
55 |
)) |
|
56 |
} |
|
57 | 2x |
if (length(list(...))) { |
58 | ! |
package$resources <- dcf_datapackage_add(..., dir = dir, write = FALSE) |
59 |
} |
|
60 | 2x |
if (write) { |
61 | 2x |
if (!dir.exists(dir)) { |
62 | ! |
dir.create(dir, recursive = TRUE) |
63 |
} |
|
64 | 2x |
jsonlite::write_json( |
65 | 2x |
package, |
66 | 2x |
package_path, |
67 | 2x |
auto_unbox = TRUE, |
68 | 2x |
digits = 6, |
69 | 2x |
pretty = TRUE |
70 |
) |
|
71 | 2x |
if (!quiet) { |
72 | ! |
cli::cli_bullets(c( |
73 | ! |
v = "created metadata template for {name}:", |
74 | ! |
"*" = paste0("{.path ", package_path, "}") |
75 |
)) |
|
76 | ! |
rstudioapi::navigateToFile(package_path) |
77 |
} |
|
78 |
} |
|
79 | 2x |
invisible(package) |
80 |
} |
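With `write = FALSE` the template object can be inspected or embedded without touching disk; note how the title is derived when not supplied:

```R
pkg <- dcf_datapackage_init("motor_trend", write = FALSE, quiet = TRUE)
pkg$title # "Motor Trend", title-cased from the name
```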
#' Add a Source Project
#'
#' Establishes a new data source project, used to collect and prepare data from a new source.
#'
#' @param name Name of the source.
#' @param project_dir Path to the Data Collection Framework project.
#' @param open_after Logical; if \code{FALSE}, will not open the project.
#' @returns Nothing; creates default files and directories.
#' @section Project:
#'
#' Within a source project, there are two files to edit:
#' \itemize{
#'   \item \strong{\code{ingest.R}}: This is the primary script, which is automatically rerun.
#'     It should store raw data and resources in \code{raw/} where possible,
#'     then use what's in \code{raw/} to produce standard-format files in \code{standard/}.
#'     This file is sourced from its location during processing, so any system paths
#'     must be relative to itself (a minimal sketch follows this function's definition).
#'   \item \strong{\code{measure_info.json}}: This is where you can record information
#'     about the variables included in the standardized data files.
#'     See \code{\link{dcf_measure_info}}.
#' }
#'
#' @examples
#' project_dir <- paste0(tempdir(), "/temp_project")
#' dcf_init("temp_project", dirname(project_dir))
#' dcf_add_source("source_name", project_dir)
#' list.files(paste0(project_dir, "/data/source_name"))
#'
#' @export

dcf_add_source <- function(
  name,
  project_dir = ".",
  open_after = interactive()
) {
  if (missing(name)) {
    cli::cli_abort("specify a name")
  }
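  # sanitize the name for use in file paths, then scaffold the source's
  # directories under the project's configured data directory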
  name <- gsub("[^A-Za-z0-9]+", "_", name)
  settings <- dcf_read_settings(project_dir)
  base_dir <- paste0(project_dir, "/", settings$data_dir)
  base_path <- paste0(base_dir, "/", name, "/")
  dir.create(paste0(base_path, "raw"), showWarnings = FALSE, recursive = TRUE)
  dir.create(paste0(base_path, "standard"), showWarnings = FALSE)
  paths <- paste0(
    base_path,
    c(
      "measure_info.json",
      "ingest.R",
      "project.Rproj",
      "standard/datapackage.json",
      "process.json",
      "README.md"
    )
  )
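  # each template below is written only if its file does not already exist:
  # paths[[1]] measure_info.json, [[2]] ingest.R, [[3]] project.Rproj,
  # [[4]] standard/datapackage.json, [[5]] process.json, [[6]] README.md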
  if (!file.exists(paths[[1L]])) {
    dcf_measure_info(
      paths[[1L]],
      example_variable = list(),
      verbose = FALSE,
      open_after = FALSE
    )
  }
  if (!file.exists(paths[[2L]])) {
    writeLines(
      paste0(
        c(
          "#",
          "# Download",
          "#",
          "",
          "# add files to the `raw` directory",
          "",
          "#",
          "# Reformat",
          "#",
          "",
          "# read from the `raw` directory, and write to the `standard` directory",
          ""
        ),
        collapse = "\n"
      ),
      paths[[2L]]
    )
  }
  if (!file.exists(paths[[3L]])) {
    writeLines("Version: 1.0\n", paths[[3L]])
  }
  if (!file.exists(paths[[4L]])) {
    dcf_datapackage_init(
      name,
      dir = paste0(base_path, "standard"),
      quiet = TRUE
    )
  }

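  # seed run metadata for ingest.R; these fields appear to be updated when
  # the project is processed (e.g., by dcf_process)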
  if (!file.exists(paths[[5L]])) {
    jsonlite::write_json(
      list(
        name = name,
        type = "source",
        scripts = list(
          list(
            path = "ingest.R",
            manual = FALSE,
            frequency = 0L,
            last_run = "",
            run_time = "",
            last_status = list(log = "", success = TRUE)
          )
        ),
        checked = "",
        check_results = list()
      ),
      paths[[5L]],
      auto_unbox = TRUE,
      pretty = TRUE
    )
  }
  if (!file.exists(paths[[6L]])) {
    writeLines(
      paste0(
        c(
          paste("#", name),
          "",
          "This is a dcf data source project, initialized with `dcf::dcf_add_source`.",
          "",
          "You can use the `dcf` package to check the project:",
          "",
          "```R",
          paste0('dcf_check_source("', name, '", "..")'),
          "```",
          "",
          "And process it:",
          "",
          "```R",
          paste0('dcf_process("', name, '", "..")'),
          "```"
        ),
        collapse = "\n"
      ),
      paths[[6L]]
    )
  }
  if (open_after) rstudioapi::openProject(paths[[3L]], newSession = TRUE)
}
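
# Illustrative sketch (not part of the package source): the shape of a
# minimal `ingest.R`, following the Project section above. The URL and file
# names are hypothetical; paths are relative to the script's own location.
if (FALSE) {
  #
  # Download
  #
  raw_file <- "raw/example.csv"
  if (!file.exists(raw_file)) {
    download.file("https://example.com/example.csv", raw_file)
  }

  #
  # Reformat
  #
  # read from the `raw` directory, and write to the `standard` directory
  data <- read.csv(raw_file)
  write.csv(data, "standard/example.csv", row.names = FALSE)
}
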
#' Map States to Health Regions
#'
#' Maps state FIPS codes to U.S. Department of Health and Human Services (HHS) regions.
#'
#' @param geoids Character vector of GEOIDs.
#' @param prefix A prefix to add to region IDs.
#' @returns A vector of Health Region names of the same length as \code{geoids}.
#' @examples
#' dcf_to_health_region(c("01", "01001", "02", "02001"))
#' @export

dcf_to_health_region <- function(geoids, prefix = "Region ") {
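  # HHS regions (1-10) keyed by two-digit state or territory FIPS code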
  regions <- c(
    "01" = 4,
    "02" = 10,
    "04" = 9,
    "05" = 6,
    "06" = 9,
    "08" = 8,
    "09" = 1,
    "10" = 3,
    "11" = 3,
    "12" = 4,
    "13" = 4,
    "15" = 9,
    "16" = 10,
    "17" = 5,
    "18" = 5,
    "19" = 7,
    "20" = 7,
    "21" = 4,
    "22" = 6,
    "23" = 1,
    "24" = 3,
    "25" = 1,
    "26" = 5,
    "27" = 5,
    "28" = 4,
    "29" = 7,
    "30" = 8,
    "31" = 7,
    "32" = 9,
    "33" = 1,
    "34" = 2,
    "35" = 6,
    "36" = 2,
    "37" = 4,
    "38" = 8,
    "39" = 5,
    "40" = 6,
    "41" = 10,
    "42" = 3,
    "44" = 1,
    "45" = 4,
    "46" = 8,
    "47" = 4,
    "48" = 6,
    "49" = 8,
    "50" = 1,
    "51" = 3,
    "53" = 10,
    "54" = 3,
    "55" = 5,
    "56" = 8,
    "66" = 9,
    "72" = 2,
    "74" = 2
  )
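  # `regions[] <-` prefixes the values while keeping names; county GEOIDs
  # begin with their state's two-digit FIPS code, so the first two
  # characters index the map (unmatched codes yield NA)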
  regions[] <- paste0(prefix, regions)
  unname(regions[substring(geoids, 1L, 2L)])
}
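
# Illustrative usage sketch (not part of the package source):
if (FALSE) {
  dcf_to_health_region("51059") # "Region 3" (a Virginia county GEOID)
  dcf_to_health_region("02", prefix = "HHS ") # "HHS 10"
}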