diff --git a/radiant.basics/DESCRIPTION b/radiant.basics/DESCRIPTION index c1cd2c87709196ed923418130bb06c5703a6838d..73bbac24379fc0b9033f9e5f9af126fce918dc05 100644 --- a/radiant.basics/DESCRIPTION +++ b/radiant.basics/DESCRIPTION @@ -26,7 +26,8 @@ Imports: shiny.i18n, rlang (>= 1.0.6), ggpp, - nortest + nortest, + naniar Suggests: testthat (>= 2.0.0), pkgdown (>= 1.1.0), diff --git a/radiant.basics/NAMESPACE b/radiant.basics/NAMESPACE index d0ee331fa279e269b2a71ff062f3e7268810870f..b3bdc0eba1634a22e7085201f643c6e2ec419a5c 100644 --- a/radiant.basics/NAMESPACE +++ b/radiant.basics/NAMESPACE @@ -8,6 +8,7 @@ S3method(plot,cross_tabs) S3method(plot,goodness) S3method(plot,homo_variance_test) S3method(plot,mda) +S3method(plot,missing) S3method(plot,normality_test) S3method(plot,outlier) S3method(plot,prob_binom) @@ -30,6 +31,7 @@ S3method(summary,cross_tabs) S3method(summary,goodness) S3method(summary,homo_variance_test) S3method(summary,mda) +S3method(summary,missing) S3method(summary,normality_test) S3method(summary,outlier) S3method(summary,prob_binom) @@ -54,6 +56,7 @@ export(get_single_norm) export(goodness) export(homo_variance_test) export(mda) +export(missing) export(normality_test) export(outlier) export(prob_binom) diff --git a/radiant.basics/R/missing.R b/radiant.basics/R/missing.R index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..2f265c894808fb94a44e8fda255253f93ffcdfd7 100644 --- a/radiant.basics/R/missing.R +++ b/radiant.basics/R/missing.R @@ -0,0 +1,353 @@ +############################################ +## Missing Value Analysis +############################################ +#' @export +missing <- function(dataset, + vars = NULL, + data_filter = "", + envir = parent.frame()) { + + # 1. 基础参数处理 + df_name <- if (is_string(dataset)) dataset else deparse(substitute(dataset)) + + # 2. 数据提取 + # 2.1 获取完整数据集(不限制vars,保留所有字段) + full_dataset <- get_data( + dataset, + filt = data_filter, # 仅过滤行,不过滤列 + na.rm = FALSE, # 保留缺失值 + envir = envir + ) + + # 2.2 从完整数据集中筛选出用户选中的变量 + if (!is.null(vars) && length(vars) > 0) { + # 校验选中变量是否存在于完整数据集 + invalid_vars <- setdiff(vars, colnames(full_dataset)) + if (length(invalid_vars) > 0) { + stop(paste(i18n$t("Variables not found in dataset:"), paste(invalid_vars, collapse = ", ")), call. = FALSE) + } + dataset <- full_dataset[, vars, drop = FALSE] # 选中变量的子集 + } else { + # 未选变量时,默认分析所有变量 + dataset <- full_dataset + vars <- colnames(dataset) + } + + # 3. 数据校验(仅校验完整数据集非空) + if (nrow(full_dataset) == 0) stop(i18n$t("Dataset is empty."), call. = FALSE) + + # 4. 统计计算 + # 4.1 整体缺失统计 + overall_stats <- tibble::tibble( + total_samples = nrow(dataset), + complete_samples = sum(stats::complete.cases(dataset)), # 无任何缺失的样本数 + missing_samples = nrow(dataset) - sum(stats::complete.cases(dataset)), # 含缺失的样本数 + missing_sample_pct = round((missing_samples / total_samples) * 100, 2) + ) + + # 4.2 变量级缺失统计 + var_stats <- purrr::map_dfr(vars, function(var) { + var_data <- dataset[[var]] + n_missing <- sum(is.na(var_data)) + tibble::tibble( + variable = var, + var_type = class(var_data)[1], # 变量类型 + total_samples = length(var_data), + missing_count = n_missing, + missing_pct = round((n_missing / length(var_data)) * 100, 2), + valid_count = length(var_data) - n_missing, + valid_pct = round(((length(var_data) - n_missing) / length(var_data)) * 100, 2) + ) + }) + + # 4.3 单个变量缺失样本的填充率详情 + missing_sample_fill_rate <- purrr::map(vars, function(target_var) { + # 1. 找到目标变量(选中的)在完整数据集中的缺失行索引 + missing_row_idx <- which(is.na(full_dataset[[target_var]])) + if (length(missing_row_idx) == 0) { + return(list( + target_variable = target_var, + missing_sample_count = 0, + fill_rate_details = tibble::tibble(variable = character(0), fill_count = integer(0), fill_pct = numeric(0)) + )) + } + # 2. 提取缺失样本的完整数据集 + missing_samples_data <- full_dataset[missing_row_idx, , drop = FALSE] + # 3. 遍历完整数据集的所有字段计算填充率 + fill_rate <- purrr::map_dfr(colnames(full_dataset), function(var) { + n_valid <- sum(!is.na(missing_samples_data[[var]])) + tibble::tibble( + variable = var, + fill_count = n_valid, + fill_pct = round((n_valid / nrow(missing_samples_data)) * 100, 2) + ) + }) + list( + target_variable = target_var, + missing_sample_count = length(missing_row_idx), + fill_rate_details = fill_rate + ) + }) + names(missing_sample_fill_rate) <- vars + + # 5. 结果打包 + out <- structure( + list( + df_name = df_name, + vars = vars, + data_filter = if (data_filter == "") "None" else data_filter, + overall_stats = overall_stats, + var_stats = var_stats, + missing_sample_fill_rate = missing_sample_fill_rate, + raw_data = dataset + ), + class = "missing" + ) + out +} + +# ------------------------------ +# Summary方法:展示缺失值统计结果 +# ------------------------------ +#' @export +summary.missing <- function(object, dec = 2, ...) { + # 1. 基础信息 + cat(i18n$t("Missing Value Analysis Results\n")) + cat(i18n$t("Data :"), object$df_name, "\n") + cat(i18n$t("Variables :"), paste(object$vars, collapse = ", "), "\n") + cat(i18n$t("Filter :"), object$data_filter, "\n\n") + + # 2. 整体缺失统计 + cat("=== 1. Overall Missing Statistics ===\n") + overall_formatted <- object$overall_stats %>% + dplyr::mutate( + missing_sample_pct = paste0(missing_sample_pct, "%") + ) %>% + dplyr::rename( + "Total Samples" = total_samples, + "Complete Samples" = complete_samples, + "Samples with Missing Values" = missing_samples, + "Missing Sample %" = missing_sample_pct + ) + print(overall_formatted, row.names = FALSE, right = FALSE) + cat("\n") + + # 3. 变量级缺失统计 + cat("=== 2. Variable-wise Missing Statistics ===\n") + var_formatted <- object$var_stats %>% + dplyr::mutate( + missing_pct = paste0(missing_pct, "%"), + valid_pct = paste0(valid_pct, "%") + ) %>% + dplyr::rename( + "Variable" = variable, + "Variable Type" = var_type, + "Total Samples" = total_samples, + "Missing Count" = missing_count, + "Missing %" = missing_pct, + "Valid Count" = valid_count, + "Valid %" = valid_pct + ) + print(var_formatted, row.names = FALSE, right = FALSE) + cat("\n") + + # 4. 单个变量缺失样本的填充率详情 + cat("=== 3. Fill Rate Details of Samples with Missing Values (By Variable) ===\n") + for (var in object$vars) { + fill_rate_data <- object$missing_sample_fill_rate[[var]] + if (fill_rate_data$missing_sample_count == 0) { + cat(paste("•", var, ":", i18n$t("No missing values, no fill rate details.\n"))) + next + } + cat(paste("• Target Variable:", var, "|", i18n$t("Missing Sample Count:"), fill_rate_data$missing_sample_count, "\n")) + fill_formatted <- fill_rate_data$fill_rate_details %>% + dplyr::mutate(fill_pct = paste0(fill_pct, "%")) %>% + dplyr::rename( + "Variable" = variable, + "Fill Count" = fill_count, + "Fill %" = fill_pct + ) + print(fill_formatted, row.names = FALSE, right = FALSE) + cat("\n") + } + + invisible(object) +} + +# ------------------------------ +# Plot方法:生成缺失值可视化图表 +# ------------------------------ +#' @export +plot.missing <- function(x, + plots = c("heatmap", "barplot"), + shiny = FALSE, ...) { + # 1. 基础校验 + if (length(plots) == 0) { + return(ggplot2::ggplot() + + ggplot2::annotate("text", x = 1, y = 1, label = i18n$t("No plots selected")) + + ggplot2::theme_void()) + } + + plot_list <- list() + raw_data <- x$raw_data + + # 2. 缺失热图 + if ("heatmap" %in% plots) { + # 步骤1:数据预处理(先校验数据,再安全抽样) + if (nrow(raw_data) == 0) { + # 空数据时返回提示图 + heatmap_data <- ggplot2::ggplot() + + ggplot2::annotate("text", x = 1, y = 1, label = i18n$t("No data to display")) + + ggplot2::theme_void() + plot_list[["heatmap"]] <- heatmap_data + next + } + + # 步骤2:安全抽样(优先保留缺失样本,避免weight_by报错) + # 2.1 分离缺失样本和有效样本 + missing_sample_idx <- which(!stats::complete.cases(raw_data)) # 有缺失值的样本索引 + valid_sample_idx <- which(stats::complete.cases(raw_data)) # 无缺失值的样本索引 + + # 2.2 抽样逻辑:优先保留所有缺失样本,再随机补有效样本至3000 + max_samples <- 3000 + selected_idx <- c() + # 先加所有缺失样本 + if (length(missing_sample_idx) > 0) { + selected_idx <- c(selected_idx, missing_sample_idx) + } + # 再补有效样本(不超过max_samples) + need_samples <- max_samples - length(selected_idx) + if (need_samples > 0 && length(valid_sample_idx) > 0) { + selected_valid <- sample(valid_sample_idx, size = min(need_samples, length(valid_sample_idx))) + selected_idx <- c(selected_idx, selected_valid) + } + # 无缺失样本时,直接随机抽有效样本 + if (length(selected_idx) == 0) { + selected_idx <- sample(1:nrow(raw_data), size = min(max_samples, nrow(raw_data))) + } + + # 2.3 提取抽样后的数据并整理 + heatmap_prep <- raw_data[selected_idx, , drop = FALSE] %>% + tibble::rowid_to_column("sample_id") %>% # 新增样本ID + tidyr::pivot_longer(cols = -sample_id, names_to = "variable", values_to = "value") %>% + dplyr::mutate(is_missing = is.na(value)) # 标记缺失状态 + + # 步骤3:绘制优化热图(无报错+高对比) + heatmap_data <- ggplot2::ggplot( + heatmap_prep, + ggplot2::aes(x = variable, y = reorder(sample_id, -sample_id), fill = is_missing) + ) + + # 核心1:矩形块+白色边框(区分度拉满) + ggplot2::geom_tile(color = "white", size = 0.2) + + # 核心2:高对比配色(缺失=亮红,有效=深灰) + ggplot2::scale_fill_manual( + values = c("FALSE" = "#2c3e50", "TRUE" = "#e74c3c"), + labels = c("FALSE" = i18n$t("Valid"), "TRUE" = i18n$t("Missing")), + name = i18n$t("Data Status"), + drop = FALSE # 强制显示两种状态,避免无缺失时图例消失 + ) + + # 核心3:尺寸适配+标签优化 + ggplot2::theme_minimal() + + ggplot2::theme( + # 动态尺寸(避免压缩) + plot.width = ggplot2::unit(min(12, length(colnames(raw_data)) * 0.8), "in"), + plot.height = ggplot2::unit(min(8, nrow(heatmap_prep) * 0.006), "in"), + # 标签优化 + axis.text.x = ggplot2::element_text(angle = 45, hjust = 1, size = 10, face = "bold"), + axis.text.y = ggplot2::element_text(size = 6), + axis.title = ggplot2::element_text(size = 11, face = "bold"), + # 图例优化(置顶+横向) + legend.position = "top", + legend.direction = "horizontal", + legend.key.width = ggplot2::unit(1.5, "cm"), + legend.title = ggplot2::element_text(size = 10), + legend.text = ggplot2::element_text(size = 9), + # 去除网格线 + panel.grid = ggplot2::element_blank(), + # 标题样式 + plot.title = ggplot2::element_text(hjust = 0.5, size = 14, face = "bold", margin = ggplot2::margin(b = 10)), + plot.subtitle = ggplot2::element_text(hjust = 0.5, size = 11, color = "#666666") + ) + + # 标签设置 + ggplot2::labs( + title = i18n$t("Missing Value Distribution Heatmap"), + subtitle = i18n$t("Red = Missing, Dark Gray = Valid | Max 3000 samples (missing samples prioritized)"), + x = i18n$t("Variables"), + y = i18n$t("Sample ID") + ) + + plot_list[["heatmap"]] <- heatmap_data + } + + + # 3. 缺失条形图 + if ("barplot" %in% plots) { + barplot <- NULL + if (!"variable" %in% colnames(x$var_stats) || !"missing_pct" %in% colnames(x$var_stats)) { + barplot <- ggplot2::ggplot() + + ggplot2::annotate("text", x = 1, y = 1, label = paste("原始列名:", paste(colnames(x$var_stats), collapse = ", "))) + + ggplot2::theme_void() + } else { + # 提取数据 + bar_data <- data.frame( + variable = x$var_stats$variable, + missing_pct = x$var_stats$missing_pct + ) + bar_data$variable <- factor(bar_data$variable, levels = x$var_stats$variable) + + # ========== 加调试代码:打印bar_data内容 ========== + cat("=== bar_data真实内容 ===\n") + print(bar_data) + cat("========================\n") + + # 画图 + barplot <- ggplot2::ggplot(bar_data, ggplot2::aes(x = variable, y = missing_pct, fill = variable)) + + ggplot2::geom_bar(stat = "identity", alpha = 0.8) + + ggplot2::scale_fill_brewer(palette = "Set2") + + ggplot2::scale_y_continuous( + limits = c(0, max(bar_data$missing_pct) * 1.2), + breaks = function(limits) seq(0, limits[2], by = max(limits[2]/10, 0.5)), # 自动分刻度 + expand = c(0, 0) + ) + + ggplot2::geom_text( + aes(label = round(missing_pct, 2)), + vjust = -0.3, + size = 2.5, + color = "black" + ) + + ggplot2::labs( + x = "Variables", + y = "Missing %", + title = "Variable-wise Missing Percentage", + fill = NULL + ) + + ggplot2::theme_minimal() + + ggplot2::theme( + plot.title = ggplot2::element_text(hjust = 0.5, size = 12), + axis.text.x = ggplot2::element_text( + angle = if (nrow(bar_data) > 8) 60 else 45, + hjust = 1, + size = if (nrow(bar_data) > 10) 7 else 8 + ), + axis.text.y = ggplot2::element_text(size = 8), + legend.position = "none", + panel.grid.major.y = ggplot2::element_line(color = "gray90"), + panel.grid.minor.y = ggplot2::element_blank(), + panel.grid.major.x = ggplot2::element_blank() + ) + } + plot_list[["barplot"]] <- barplot + } + + + # 4. 组合图表 + combined_plot <- patchwork::wrap_plots(plot_list[plots], ncol = 1, guides = "collect") + + # 5. 输出 + if (shiny) { + print(combined_plot) + return(invisible(combined_plot)) + } else { + return(combined_plot) + } +} diff --git a/radiant.basics/inst/app/tools/analysis/missing_ui.R b/radiant.basics/inst/app/tools/analysis/missing_ui.R index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..142b2072c5b59fd3d10a3fefac1349eb9f5ed6cd 100644 --- a/radiant.basics/inst/app/tools/analysis/missing_ui.R +++ b/radiant.basics/inst/app/tools/analysis/missing_ui.R @@ -0,0 +1,241 @@ +############################################ +## Missing Value Analysis - UI +############################################ +## 1. 标签 +missing_plots <- c("heatmap", "barplot") +names(missing_plots) <- c( + i18n$t("Missing Heatmap"), + i18n$t("Missing Barplot") +) + +## 2. 函数形参 +missing_args <- as.list(formals(missing)) +missing_args <- missing_args[names(missing_args) %in% c("dataset", "vars", "data_filter")] + +## 3. 输入收集 +missing_inputs <- reactive({ + req(input$dataset) + inputs <- list( + dataset = input$dataset, + vars = input$missing_vars, # 用户选择的变量 + data_filter = if (input$show_filter) input$data_filter else "", + envir = r_data + ) + # 校验参数完整性 + for (arg in names(missing_args)) { + if (is.null(inputs[[arg]]) || length(inputs[[arg]]) == 0) { + inputs[[arg]] <- missing_args[[arg]] + } + } + inputs +}) + +## 4. 变量选择UI +output$ui_missing_vars <- renderUI({ + req(input$dataset) + current_data <- get_data(input$dataset, envir = r_data) + all_vars <- colnames(current_data) + + if (length(all_vars) == 0) { + return(div(class = "alert alert-warning", i18n$t("No variables in dataset. Please select another dataset."))) + } + + # 显示变量类型(数值型/分类型) + var_types <- sapply(current_data[, all_vars, drop = FALSE], function(col) class(col)[1]) + choices <- setNames(nm = paste0(all_vars, " {", var_types, "}"), object = all_vars) + + selectizeInput( + inputId = "missing_vars", + label = i18n$t("Select variables to analyze:"), + choices = choices, + selected = state_multiple("missing_vars", character(0)), + multiple = TRUE, + options = list(placeholder = i18n$t("Select one or more variables"), plugins = list("remove_button", "drag_drop")) + ) +}) + +## 5. 主UI(Summary + Plot标签页) +output$ui_missing <- renderUI({ + req(input$dataset) + tagList( + wellPanel( + # Summary标签页:变量选择 + 统计结果 + conditionalPanel( + condition = "input.tabs_missing == 'Summary'", + uiOutput("ui_missing_vars") + ), + # Plot标签页:可视化类型选择 + conditionalPanel( + condition = "input.tabs_missing == 'Plot'", + selectizeInput( + inputId = "missing_plots", + label = i18n$t("Select plots:"), + choices = missing_plots, + selected = state_multiple("missing_plots", missing_plots, "barplot"), # 默认选中条形图 + multiple = TRUE, + options = list(placeholder = i18n$t("Select plot types"), plugins = list("remove_button", "drag_drop")) + ) + ) + ), + # 帮助与报告(和离群值分析保持一致) + help_and_report( + modal_title = i18n$t("Missing Value Analysis"), + fun_name = "missing", + help_file = inclMD(file.path(getOption("radiant.path.basics"), "app/tools/help/missing.md")) + ) + ) +}) + +## 6. 图表尺寸计算 +missing_plot_dims <- reactive({ + req(.missing()) + plot_count <- length(input$missing_plots) + var_count <- length(.missing()$vars) + + # 基础高度 + base_height_px <- if ("heatmap" %in% input$missing_plots) { + min(500 + (nrow(.missing()$raw_data) * 0.5), 1200) # 样本越多,热图越高 + } else { + 400 + } + total_height_px <- base_height_px * plot_count + + # 限制最大/最小高度 + total_height_px <- min(total_height_px, 2000) + total_height_px <- max(total_height_px, 500) + + list( + width = 800, + height = total_height_px + ) +}) +missing_plot_width <- function() missing_plot_dims()$width +missing_plot_height <- function() missing_plot_dims()$height + +## 7. 输出面板 +output$missing <- renderUI({ + # 注册输出组件 + register_print_output("summary_missing", ".summary_missing") + register_plot_output("plot_missing", ".plot_missing", + height_fun = "missing_plot_height") + + # 标签页布局(Summary + Plot) + missing_panels <- tabsetPanel( + id = "tabs_missing", + tabPanel( + title = i18n$t("Summary"), + value = "Summary", + verbatimTextOutput("summary_missing", placeholder = TRUE) + ), + tabPanel( + title = i18n$t("Plot"), + value = "Plot", + download_link("dlp_missing"), # 下载按钮 + plotOutput("plot_missing", height = "100%"), + style = "margin-top: 10px;" + ) + ) + + # 集成到Data Quality菜单下 + stat_tab_panel( + menu = i18n$t("Basics > Data Quality"), + tool = i18n$t("Missing Value Analysis"), + tool_ui = "ui_missing", + output_panels = missing_panels + ) +}) + +## 8. 可用性检验 +missing_available <- reactive({ + req(input$dataset) + current_data <- get_data(input$dataset, envir = r_data) + + # 校验是否选择变量:未选则返回提示,阻止后续计算 + if (not_available(input$missing_vars)) { + return(i18n$t("Please select at least one variable to analyze.")) + } + + # 校验变量是否存在 + invalid_vars <- input$missing_vars[!input$missing_vars %in% colnames(current_data)] + if (length(invalid_vars) > 0) { + return(i18n$t(paste("Invalid variables: ", paste(invalid_vars, collapse = ", "), ". Please reselect.", sep = ""))) + } + + "available" +}) + +## 9. 计算核心 +.missing <- reactive({ + req(missing_available() == "available") + do.call(missing, missing_inputs()) +}) + +## 10. Summary输出 +.summary_missing <- reactive({ + req(missing_available() == "available") + summary(.missing()) +}) + +## 11. Plot输出 +.plot_missing <- reactive({ + req(missing_available() == "available") + validate(need(input$missing_plots, i18n$t("Please select at least one plot type first."))) + withProgress(message = i18n$t("Generating missing value plots..."), value = 0.5, { + p <- plot(.missing(), plots = input$missing_plots, shiny = TRUE) + setProgress(value = 1) + }) + p +}) + +## 12. 下载功能 +download_handler( + id = "dlp_missing", + fun = function(file) { + plot_obj <- .plot_missing() + width_in <- missing_plot_width() / 96 + height_in <- missing_plot_height() / 96 + + ggsave( + filename = file, + plot = plot_obj, + width = width_in, + height = height_in, + device = "png", + dpi = 300, + limitsize = FALSE, + bg = "white" + ) + }, + fn = function() paste0(input$dataset, "_missing_value_analysis"), + type = "png", + caption = i18n$t("Save missing value plots") +) + +## 13. 报告生成 +missing_report <- function() { + req(missing_available() == "available") + figs <- length(input$missing_plots) > 0 + update_report( + inp_main = clean_args(missing_inputs(), missing_args), + fun_name = "missing", + inp_out = if (figs) list("", list(plots = input$missing_plots)) else list(""), + outputs = if (figs) c("summary", "plot") else "summary", + figs = figs, + fig.width = missing_plot_width(), + fig.height = missing_plot_height() + ) +} + +## 14. 截图功能 +observeEvent(input$missing_report, { + r_info[["latest_screenshot"]] <- NULL + missing_report() +}) +observeEvent(input$missing_screenshot, { + r_info[["latest_screenshot"]] <- NULL + radiant_screenshot_modal("modal_missing_screenshot") +}) +observeEvent(input$modal_missing_screenshot, { + missing_report() + removeModal() +})