explore <- function(dataset, vars = "", byvar = "", fun = c("mean", "sd"), top = "fun", tabfilt = "", tabsort = "", tabslice = "", nr = Inf, data_filter = "", arr = "", rows = NULL, envir = parent.frame()) { tvars <- vars if (!is.empty(byvar)) tvars <- unique(c(tvars, byvar)) df_name <- if (is_string(dataset)) dataset else deparse(substitute(dataset)) dataset <- get_data(dataset, tvars, filt = data_filter, arr = arr, rows = rows, na.rm = FALSE, envir = envir) rm(tvars) ## in case : was used vars <- base::setdiff(colnames(dataset), byvar) ## converting data as needed for summarization dc <- get_class(dataset) fixer <- function(x, fun = as_integer) { if (is.character(x) || is.Date(x)) { x <- rep(NA, length(x)) } else if (is.factor(x)) { x_num <- sshhr(as.integer(as.character(x))) if (length(na.omit(x_num)) == 0) { x <- fun(x) } else { x <- x_num } } x } fixer_first <- function(x) { x <- fixer(x, function(x) as_integer(x == levels(x)[1])) } mean <- function(x, na.rm = TRUE) sshhr(base::mean(fixer_first(x), na.rm = na.rm)) sum <- function(x, na.rm = TRUE) sshhr(base::sum(fixer_first(x), na.rm = na.rm)) var <- function(x, na.rm = TRUE) sshhr(stats::var(fixer_first(x), na.rm = na.rm)) sd <- function(x, na.rm = TRUE) sshhr(stats::sd(fixer_first(x), na.rm = na.rm)) se <- function(x, na.rm = TRUE) sshhr(radiant.data::se(fixer_first(x), na.rm = na.rm)) me <- function(x, na.rm = TRUE) sshhr(radiant.data::me(fixer_first(x), na.rm = na.rm)) cv <- function(x, na.rm = TRUE) sshhr(radiant.data::cv(fixer_first(x), na.rm = na.rm)) prop <- function(x, na.rm = TRUE) sshhr(radiant.data::prop(fixer_first(x), na.rm = na.rm)) varprop <- function(x, na.rm = TRUE) sshhr(radiant.data::varprop(fixer_first(x), na.rm = na.rm)) sdprop <- function(x, na.rm = TRUE) sshhr(radiant.data::sdprop(fixer_first(x), na.rm = na.rm)) seprop <- function(x, na.rm = TRUE) sshhr(radiant.data::seprop(fixer_first(x), na.rm = na.rm)) meprop <- function(x, na.rm = TRUE) sshhr(radiant.data::meprop(fixer_first(x), na.rm = na.rm)) varpop <- function(x, na.rm = TRUE) sshhr(radiant.data::varpop(fixer_first(x), na.rm = na.rm)) sdpop <- function(x, na.rm = TRUE) sshhr(radiant.data::sdpop(fixer_first(x), na.rm = na.rm)) median <- function(x, na.rm = TRUE) sshhr(stats::median(fixer(x), na.rm = na.rm)) min <- function(x, na.rm = TRUE) sshhr(base::min(fixer(x), na.rm = na.rm)) max <- function(x, na.rm = TRUE) sshhr(base::max(fixer(x), na.rm = na.rm)) p01 <- function(x, na.rm = TRUE) sshhr(radiant.data::p01(fixer(x), na.rm = na.rm)) p025 <- function(x, na.rm = TRUE) sshhr(radiant.data::p025(fixer(x), na.rm = na.rm)) p05 <- function(x, na.rm = TRUE) sshhr(radiant.data::p05(fixer(x), na.rm = na.rm)) p10 <- function(x, na.rm = TRUE) sshhr(radiant.data::p10(fixer(x), na.rm = na.rm)) p25 <- function(x, na.rm = TRUE) sshhr(radiant.data::p25(fixer(x), na.rm = na.rm)) p75 <- function(x, na.rm = TRUE) sshhr(radiant.data::p75(fixer(x), na.rm = na.rm)) p90 <- function(x, na.rm = TRUE) sshhr(radiant.data::p90(fixer(x), na.rm = na.rm)) p95 <- function(x, na.rm = TRUE) sshhr(radiant.data::p95(fixer(x), na.rm = na.rm)) p975 <- function(x, na.rm = TRUE) sshhr(radiant.data::p975(fixer(x), na.rm = na.rm)) p99 <- function(x, na.rm = TRUE) sshhr(radiant.data::p99(fixer(x), na.rm = na.rm)) skew <- function(x, na.rm = TRUE) sshhr(radiant.data::skew(fixer(x), na.rm = na.rm)) kurtosi <- function(x, na.rm = TRUE) sshhr(radiant.data::kurtosi(fixer(x), na.rm = na.rm)) isLogNum <- "logical" == dc & names(dc) %in% base::setdiff(vars, byvar) if (sum(isLogNum) > 0) { dataset[, isLogNum] <- select(dataset, which(isLogNum)) %>% mutate_all(as.integer) dc[isLogNum] <- "integer" } if (is.empty(byvar)) { byvar <- c() tab <- summarise_all(dataset, fun, na.rm = TRUE) } else { ## convert categorical variables to factors if needed ## needed to deal with empty/missing values dataset[, byvar] <- select_at(dataset, .vars = byvar) %>% mutate_all(~ empty_level(.)) tab <- dataset %>% group_by_at(.vars = byvar) %>% summarise_all(fun, na.rm = TRUE) } ## adjust column names if (length(vars) == 1 || length(fun) == 1) { rng <- (length(byvar) + 1):ncol(tab) colnames(tab)[rng] <- paste0(vars, "_", fun) rm(rng) } ## setup regular expression to split variable/function column appropriately rex <- paste0("(.*?)_", glue('({glue_collapse(fun, "$|")}$)')) ## useful answer and comments: http://stackoverflow.com/a/27880388/1974918 tab <- gather(tab, "variable", "value", !!-(seq_along(byvar))) %>% extract(variable, into = c("variable", "fun"), regex = rex) %>% mutate(fun = factor(fun, levels = !!fun), variable = factor(variable, levels = vars)) %>% # mutate(variable = paste0(variable, " {", dc[variable], "}")) %>% spread("fun", "value") ## flip the table if needed if (top != "fun") { tab <- list(tab = tab, byvar = byvar, fun = fun) %>% flip(top) } nrow_tab <- nrow(tab) ## filtering the table if desired from Report > Rmd if (!is.empty(tabfilt)) { tab <- filter_data(tab, tabfilt) } ## sorting the table if desired from Report > Rmd if (!identical(tabsort, "")) { tabsort <- gsub(",", ";", tabsort) tab <- tab %>% arrange(!!!rlang::parse_exprs(tabsort)) } ## ensure factors ordered as in the (sorted) table if (!is.empty(byvar) && top != "byvar") { for (i in byvar) tab[[i]] <- tab[[i]] %>% (function(x) factor(x, levels = unique(x))) rm(i) } ## frequencies converted to doubles during gather/spread above check_int <- function(x) { if (is.double(x) && length(na.omit(x)) > 0) { x_int <- sshhr(as.integer(round(x, .Machine$double.rounding))) if (isTRUE(all.equal(x, x_int, check.attributes = FALSE))) x_int else x } else { x } } tab <- ungroup(tab) %>% mutate_all(check_int) ## slicing the table if desired if (!is.empty(tabslice)) { tab <- tab %>% slice_data(tabslice) %>% droplevels() } ## convert to data.frame to maintain attributes tab <- as.data.frame(tab, stringsAsFactors = FALSE) attr(tab, "radiant_nrow") <- nrow_tab if (!isTRUE(is.infinite(nr))) { ind <- if (nr > nrow(tab)) 1:nrow(tab) else 1:nr tab <- tab[ind, , drop = FALSE] rm(ind) } list( tab = tab, df_name = df_name, vars = vars, byvar = byvar, fun = fun, top = top, tabfilt = tabfilt, tabsort = tabsort, tabslice = tabslice, nr = nr, data_filter = data_filter, arr = arr, rows = rows ) %>% add_class("explore") } summary.explore <- function(object, dec = 3, ...) { cat("Explore\n") cat("Data :", object$df_name, "\n") if (!is.empty(object$data_filter)) { cat("Filter :", gsub("\\n", "", object$data_filter), "\n") } if (!is.empty(object$arr)) { cat("Arrange :", gsub("\\n", "", object$arr), "\n") } if (!is.empty(object$rows)) { cat("Slice :", gsub("\\n", "", object$rows), "\n") } if (!is.empty(object$tabfilt)) { cat("Table filter:", object$tabfilt, "\n") } if (!is.empty(object$tabsort[1])) { cat("Table sorted:", paste0(object$tabsort, collapse = ", "), "\n") } if (!is.empty(object$tabslice)) { cat("Table slice :", object$tabslice, "\n") } nr <- attr(object$tab, "radiant_nrow") if (!isTRUE(is.infinite(nr)) && !isTRUE(is.infinite(object$nr)) && object$nr < nr) { cat(paste0("Rows shown : ", object$nr, " (out of ", nr, ")\n")) } if (!is.empty(object$byvar[1])) { cat("Grouped by :", object$byvar, "\n") } cat("Functions :", paste0(object$fun, collapse = ", "), "\n") cat("Top :", c("fun" = "Function", "var" = "Variables", "byvar" = "Group by")[object$top], "\n") cat("\n") format_df(object$tab, dec = dec, mark = ",") %>% print(row.names = FALSE) invisible() } store.explore <- function(dataset, object, name, ...) { if (missing(name)) { object$tab } else { stop( paste0( "This function is deprecated. Use the code below instead:\n\n", name, " <- ", deparse(substitute(object)), "$tab\nregister(\"", name, ")" ), call. = FALSE ) } } flip <- function(expl, top = "fun") { cvars <- expl$byvar %>% (function(x) if (is.empty(x[1])) character(0) else x) if (top[1] == "var") { expl$tab %<>% gather(".function", "value", !!-(1:(length(cvars) + 1))) %>% spread("variable", "value") expl$tab[[".function"]] %<>% factor(., levels = expl$fun) } else if (top[1] == "byvar" && length(cvars) > 0) { expl$tab %<>% gather(".function", "value", !!-(1:(length(cvars) + 1))) %>% spread(!!cvars[1], "value") expl$tab[[".function"]] %<>% factor(., levels = expl$fun) ## ensure we don't have invalid column names colnames(expl$tab) <- fix_names(colnames(expl$tab)) } expl$tab } dtab.explore <- function(object, dec = 3, searchCols = NULL, order = NULL, pageLength = NULL, caption = NULL, ...) { style <- if (exists("bslib_current_version") && "4" %in% bslib_current_version()) "bootstrap4" else "bootstrap" tab <- object$tab cn_all <- colnames(tab) cn_num <- cn_all[sapply(tab, is.numeric)] cn_cat <- cn_all[-which(cn_all %in% cn_num)] isInt <- sapply(tab, is.integer) isDbl <- sapply(tab, is_double) dec <- ifelse(is.empty(dec) || dec < 0, 3, round(dec, 0)) top <- c("fun" = "Function", "var" = "Variables", "byvar" = paste0("Group by: ", object$byvar[1]))[object$top] sketch <- shiny::withTags( table( thead( tr( th(" ", colspan = length(cn_cat)), lapply(top, th, colspan = length(cn_num), class = "text-center") ), tr(lapply(cn_all, th)) ) ) ) if (!is.empty(caption)) { ## from https://github.com/rstudio/DT/issues/630#issuecomment-461191378 caption <- shiny::tags$caption(style = "caption-side: bottom; text-align: left; font-size:100%;", caption) } ## for display options see https://datatables.net/reference/option/dom dom <- if (nrow(tab) < 11) "t" else "ltip" fbox <- if (nrow(tab) > 5e6) "none" else list(position = "top") dt_tab <- DT::datatable( tab, container = sketch, caption = caption, selection = "none", rownames = FALSE, filter = fbox, ## must use fillContainer = FALSE to address ## see https://github.com/rstudio/DT/issues/367 ## https://github.com/rstudio/DT/issues/379 fillContainer = FALSE, style = style, options = list( dom = dom, stateSave = TRUE, ## store state searchCols = searchCols, order = order, columnDefs = list(list(orderSequence = c("desc", "asc"), targets = "_all")), autoWidth = TRUE, processing = FALSE, pageLength = { if (is.null(pageLength)) 10 else pageLength }, lengthMenu = list(c(5, 10, 25, 50, -1), c("5", "10", "25", "50", "All")) ), ## https://github.com/rstudio/DT/issues/146#issuecomment-534319155 callback = DT::JS('$(window).on("unload", function() { table.state.clear(); })') ) %>% DT::formatStyle(., cn_cat, color = "white", backgroundColor = "grey") ## rounding as needed if (sum(isDbl) > 0) { dt_tab <- DT::formatRound(dt_tab, names(isDbl)[isDbl], dec) } if (sum(isInt) > 0) { dt_tab <- DT::formatRound(dt_tab, names(isInt)[isInt], 0) } ## see https://github.com/yihui/knitr/issues/1198 dt_tab$dependencies <- c( list(rmarkdown::html_dependency_bootstrap("bootstrap")), dt_tab$dependencies ) dt_tab } ########################################### ## turn functions below into functional ... ########################################### n_obs <- function(x, ...) length(x) n_missing <- function(x, ...) sum(is.na(x)) p01 <- function(x, na.rm = TRUE) quantile(x, .01, na.rm = na.rm) p025 <- function(x, na.rm = TRUE) quantile(x, .025, na.rm = na.rm) p05 <- function(x, na.rm = TRUE) quantile(x, .05, na.rm = na.rm) p10 <- function(x, na.rm = TRUE) quantile(x, .1, na.rm = na.rm) p25 <- function(x, na.rm = TRUE) quantile(x, .25, na.rm = na.rm) p75 <- function(x, na.rm = TRUE) quantile(x, .75, na.rm = na.rm) p90 <- function(x, na.rm = TRUE) quantile(x, .90, na.rm = na.rm) p95 <- function(x, na.rm = TRUE) quantile(x, .95, na.rm = na.rm) p975 <- function(x, na.rm = TRUE) quantile(x, .975, na.rm = na.rm) p99 <- function(x, na.rm = TRUE) quantile(x, .99, na.rm = na.rm) cv <- function(x, na.rm = TRUE) { m <- mean(x, na.rm = na.rm) if (m == 0) { message("Mean should be greater than 0") NA } else { sd(x, na.rm = na.rm) / m } } se <- function(x, na.rm = TRUE) { if (na.rm) x <- na.omit(x) sd(x) / sqrt(length(x)) } me <- function(x, conf_lev = 0.95, na.rm = TRUE) { if (na.rm) x <- na.omit(x) se(x) * qt(conf_lev / 2 + .5, length(x) - 1, lower.tail = TRUE) } prop <- function(x, na.rm = TRUE) { if (na.rm) x <- na.omit(x) if (is.numeric(x)) { mean(x == max(x, 1)) ## gives proportion of max value in x } else if (is.factor(x)) { mean(x == levels(x)[1]) ## gives proportion of first level in x } else if (is.logical(x)) { mean(x) } else { NA } } varprop <- function(x, na.rm = TRUE) { p <- prop(x, na.rm = na.rm) p * (1 - p) } sdprop <- function(x, na.rm = TRUE) sqrt(varprop(x, na.rm = na.rm)) seprop <- function(x, na.rm = TRUE) { if (na.rm) x <- na.omit(x) sqrt(varprop(x, na.rm = FALSE) / length(x)) } meprop <- function(x, conf_lev = 0.95, na.rm = TRUE) { if (na.rm) x <- na.omit(x) seprop(x) * qnorm(conf_lev / 2 + .5, lower.tail = TRUE) } varpop <- function(x, na.rm = TRUE) { if (na.rm) x <- na.omit(x) n <- length(x) var(x) * ((n - 1) / n) } sdpop <- function(x, na.rm = TRUE) sqrt(varpop(x, na.rm = na.rm)) ln <- function(x, na.rm = TRUE) { if (na.rm) log(na.omit(x)) else log(x) } does_vary <- function(x, na.rm = TRUE) { ## based on http://stackoverflow.com/questions/4752275/test-for-equality-among-all-elements-of-a-single-vector if (length(x) == 1L) { FALSE } else { if (is.factor(x) || is.character(x)) { length(unique(x)) > 1 } else { abs(max(x, na.rm = na.rm) - min(x, na.rm = na.rm)) > .Machine$double.eps^0.5 } } } empty_level <- function(x) { if (!is.factor(x)) x <- as.factor(x) levs <- levels(x) if ("" %in% levs) { levs[levs == ""] <- "NA" x <- factor(x, levels = levs) x[is.na(x)] <- "NA" } else if (any(is.na(x))) { x <- factor(x, levels = unique(c(levs, "NA"))) x[is.na(x)] <- "NA" } x } modal <- function(x, na.rm = TRUE) { if (na.rm) x <- na.omit(x) unv <- unique(x) unv[which.max(tabulate(match(x, unv)))] }