############################################ ## Missing Value Analysis ############################################ #' @export missing <- function(dataset, vars = NULL, data_filter = "", envir = parent.frame()) { # 1. 基础参数处理 df_name <- if (is_string(dataset)) dataset else deparse(substitute(dataset)) # 2. 数据提取 # 2.1 获取完整数据集(不限制vars,保留所有字段) full_dataset <- get_data( dataset, filt = data_filter, # 仅过滤行,不过滤列 na.rm = FALSE, # 保留缺失值 envir = envir ) # 2.2 从完整数据集中筛选出用户选中的变量 if (!is.null(vars) && length(vars) > 0) { # 校验选中变量是否存在于完整数据集 invalid_vars <- setdiff(vars, colnames(full_dataset)) if (length(invalid_vars) > 0) { stop(paste(i18n$t("Variables not found in dataset:"), paste(invalid_vars, collapse = ", ")), call. = FALSE) } dataset <- full_dataset[, vars, drop = FALSE] # 选中变量的子集 } else { # 未选变量时,默认分析所有变量 dataset <- full_dataset vars <- colnames(dataset) } # 3. 数据校验(仅校验完整数据集非空) if (nrow(full_dataset) == 0) stop(i18n$t("Dataset is empty."), call. = FALSE) # 4. 统计计算 # 4.1 整体缺失统计 overall_stats <- tibble::tibble( total_samples = nrow(dataset), complete_samples = sum(stats::complete.cases(dataset)), # 无任何缺失的样本数 missing_samples = nrow(dataset) - sum(stats::complete.cases(dataset)), # 含缺失的样本数 missing_sample_pct = round((missing_samples / total_samples) * 100, 2) ) # 4.2 变量级缺失统计 var_stats <- purrr::map_dfr(vars, function(var) { var_data <- dataset[[var]] n_missing <- sum(is.na(var_data)) tibble::tibble( variable = var, var_type = class(var_data)[1], # 变量类型 total_samples = length(var_data), missing_count = n_missing, missing_pct = round((n_missing / length(var_data)) * 100, 2), valid_count = length(var_data) - n_missing, valid_pct = round(((length(var_data) - n_missing) / length(var_data)) * 100, 2) ) }) # 4.3 单个变量缺失样本的填充率详情 missing_sample_fill_rate <- purrr::map(vars, function(target_var) { # 1. 找到目标变量(选中的)在完整数据集中的缺失行索引 missing_row_idx <- which(is.na(full_dataset[[target_var]])) if (length(missing_row_idx) == 0) { return(list( target_variable = target_var, missing_sample_count = 0, fill_rate_details = tibble::tibble(variable = character(0), fill_count = integer(0), fill_pct = numeric(0)) )) } # 2. 提取缺失样本的完整数据集 missing_samples_data <- full_dataset[missing_row_idx, , drop = FALSE] # 3. 遍历完整数据集的所有字段计算填充率 fill_rate <- purrr::map_dfr(colnames(full_dataset), function(var) { n_valid <- sum(!is.na(missing_samples_data[[var]])) tibble::tibble( variable = var, fill_count = n_valid, fill_pct = round((n_valid / nrow(missing_samples_data)) * 100, 2) ) }) list( target_variable = target_var, missing_sample_count = length(missing_row_idx), fill_rate_details = fill_rate ) }) names(missing_sample_fill_rate) <- vars # 5. 结果打包 out <- structure( list( df_name = df_name, vars = vars, data_filter = if (data_filter == "") "None" else data_filter, overall_stats = overall_stats, var_stats = var_stats, missing_sample_fill_rate = missing_sample_fill_rate, raw_data = dataset ), class = "missing" ) out } # ------------------------------ # Summary方法:展示缺失值统计结果 # ------------------------------ #' @export summary.missing <- function(object, dec = 2, ...) { # 1. 基础信息 cat(i18n$t("Missing Value Analysis Results\n")) cat(i18n$t("Data :"), object$df_name, "\n") cat(i18n$t("Variables :"), paste(object$vars, collapse = ", "), "\n") cat(i18n$t("Filter :"), object$data_filter, "\n\n") # 2. 整体缺失统计 cat("=== 1. Overall Missing Statistics ===\n") overall_formatted <- object$overall_stats %>% dplyr::mutate( missing_sample_pct = paste0(missing_sample_pct, "%") ) %>% dplyr::rename( "Total Samples" = total_samples, "Complete Samples" = complete_samples, "Samples with Missing Values" = missing_samples, "Missing Sample %" = missing_sample_pct ) print(overall_formatted, row.names = FALSE, right = FALSE) cat("\n") # 3. 变量级缺失统计 cat("=== 2. Variable-wise Missing Statistics ===\n") var_formatted <- object$var_stats %>% dplyr::mutate( missing_pct = paste0(missing_pct, "%"), valid_pct = paste0(valid_pct, "%") ) %>% dplyr::rename( "Variable" = variable, "Variable Type" = var_type, "Total Samples" = total_samples, "Missing Count" = missing_count, "Missing %" = missing_pct, "Valid Count" = valid_count, "Valid %" = valid_pct ) print(var_formatted, row.names = FALSE, right = FALSE) cat("\n") # 4. 单个变量缺失样本的填充率详情 cat("=== 3. Fill Rate Details of Samples with Missing Values (By Variable) ===\n") for (var in object$vars) { fill_rate_data <- object$missing_sample_fill_rate[[var]] if (fill_rate_data$missing_sample_count == 0) { cat(paste("•", var, ":", i18n$t("No missing values, no fill rate details.\n"))) next } cat(paste("• Target Variable:", var, "|", i18n$t("Missing Sample Count:"), fill_rate_data$missing_sample_count, "\n")) fill_formatted <- fill_rate_data$fill_rate_details %>% dplyr::mutate(fill_pct = paste0(fill_pct, "%")) %>% dplyr::rename( "Variable" = variable, "Fill Count" = fill_count, "Fill %" = fill_pct ) print(fill_formatted, row.names = FALSE, right = FALSE) cat("\n") } invisible(object) } # ------------------------------ # Plot方法:生成缺失值可视化图表 # ------------------------------ #' @export plot.missing <- function(x, plots = c("heatmap", "barplot"), shiny = FALSE, ...) { # 1. 基础校验 if (length(plots) == 0) { return(ggplot2::ggplot() + ggplot2::annotate("text", x = 1, y = 1, label = i18n$t("No plots selected")) + ggplot2::theme_void()) } plot_list <- list() raw_data <- x$raw_data # 2. 缺失热图 if ("heatmap" %in% plots) { # 步骤1:数据预处理(先校验数据,再安全抽样) if (nrow(raw_data) == 0) { # 空数据时返回提示图 heatmap_data <- ggplot2::ggplot() + ggplot2::annotate("text", x = 1, y = 1, label = i18n$t("No data to display")) + ggplot2::theme_void() plot_list[["heatmap"]] <- heatmap_data next } # 步骤2:安全抽样(优先保留缺失样本,避免weight_by报错) # 2.1 分离缺失样本和有效样本 missing_sample_idx <- which(!stats::complete.cases(raw_data)) # 有缺失值的样本索引 valid_sample_idx <- which(stats::complete.cases(raw_data)) # 无缺失值的样本索引 # 2.2 抽样逻辑:优先保留所有缺失样本,再随机补有效样本至3000 max_samples <- 3000 selected_idx <- c() # 先加所有缺失样本 if (length(missing_sample_idx) > 0) { selected_idx <- c(selected_idx, missing_sample_idx) } # 再补有效样本(不超过max_samples) need_samples <- max_samples - length(selected_idx) if (need_samples > 0 && length(valid_sample_idx) > 0) { selected_valid <- sample(valid_sample_idx, size = min(need_samples, length(valid_sample_idx))) selected_idx <- c(selected_idx, selected_valid) } # 无缺失样本时,直接随机抽有效样本 if (length(selected_idx) == 0) { selected_idx <- sample(1:nrow(raw_data), size = min(max_samples, nrow(raw_data))) } # 2.3 提取抽样后的数据并整理 heatmap_prep <- raw_data[selected_idx, , drop = FALSE] %>% tibble::rowid_to_column("sample_id") %>% # 新增样本ID tidyr::pivot_longer(cols = -sample_id, names_to = "variable", values_to = "value") %>% dplyr::mutate(is_missing = is.na(value)) # 标记缺失状态 # 步骤3:绘制优化热图(无报错+高对比) heatmap_data <- ggplot2::ggplot( heatmap_prep, ggplot2::aes(x = variable, y = reorder(sample_id, -sample_id), fill = is_missing) ) + # 核心1:矩形块+白色边框(区分度拉满) ggplot2::geom_tile(color = "white", size = 0.2) + # 核心2:高对比配色(缺失=亮红,有效=深灰) ggplot2::scale_fill_manual( values = c("FALSE" = "#2c3e50", "TRUE" = "#e74c3c"), labels = c("FALSE" = i18n$t("Valid"), "TRUE" = i18n$t("Missing")), name = i18n$t("Data Status"), drop = FALSE # 强制显示两种状态,避免无缺失时图例消失 ) + # 核心3:尺寸适配+标签优化 ggplot2::theme_minimal() + ggplot2::theme( # 动态尺寸(避免压缩) plot.width = ggplot2::unit(min(12, length(colnames(raw_data)) * 0.8), "in"), plot.height = ggplot2::unit(min(8, nrow(heatmap_prep) * 0.006), "in"), # 标签优化 axis.text.x = ggplot2::element_text(angle = 45, hjust = 1, size = 10, face = "bold"), axis.text.y = ggplot2::element_text(size = 6), axis.title = ggplot2::element_text(size = 11, face = "bold"), # 图例优化(置顶+横向) legend.position = "top", legend.direction = "horizontal", legend.key.width = ggplot2::unit(1.5, "cm"), legend.title = ggplot2::element_text(size = 10), legend.text = ggplot2::element_text(size = 9), # 去除网格线 panel.grid = ggplot2::element_blank(), # 标题样式 plot.title = ggplot2::element_text(hjust = 0.5, size = 14, face = "bold", margin = ggplot2::margin(b = 10)), plot.subtitle = ggplot2::element_text(hjust = 0.5, size = 11, color = "#666666") ) + # 标签设置 ggplot2::labs( title = i18n$t("Missing Value Distribution Heatmap"), subtitle = i18n$t("Red = Missing, Dark Gray = Valid | Max 3000 samples (missing samples prioritized)"), x = i18n$t("Variables"), y = i18n$t("Sample ID") ) plot_list[["heatmap"]] <- heatmap_data } # 3. 缺失条形图 if ("barplot" %in% plots) { barplot <- NULL if (!"variable" %in% colnames(x$var_stats) || !"missing_pct" %in% colnames(x$var_stats)) { barplot <- ggplot2::ggplot() + ggplot2::annotate("text", x = 1, y = 1, label = paste("原始列名:", paste(colnames(x$var_stats), collapse = ", "))) + ggplot2::theme_void() } else { # 提取数据 bar_data <- data.frame( variable = x$var_stats$variable, missing_pct = x$var_stats$missing_pct ) bar_data$variable <- factor(bar_data$variable, levels = x$var_stats$variable) # ========== 加调试代码:打印bar_data内容 ========== cat("=== bar_data真实内容 ===\n") print(bar_data) cat("========================\n") # 画图 barplot <- ggplot2::ggplot(bar_data, ggplot2::aes(x = variable, y = missing_pct, fill = variable)) + ggplot2::geom_bar(stat = "identity", alpha = 0.8) + ggplot2::scale_fill_brewer(palette = "Set2") + ggplot2::scale_y_continuous( limits = c(0, max(bar_data$missing_pct) * 1.2), breaks = function(limits) seq(0, limits[2], by = max(limits[2]/10, 0.5)), # 自动分刻度 expand = c(0, 0) ) + ggplot2::geom_text( aes(label = round(missing_pct, 2)), vjust = -0.3, size = 2.5, color = "black" ) + ggplot2::labs( x = "Variables", y = "Missing %", title = "Variable-wise Missing Percentage", fill = NULL ) + ggplot2::theme_minimal() + ggplot2::theme( plot.title = ggplot2::element_text(hjust = 0.5, size = 12), axis.text.x = ggplot2::element_text( angle = if (nrow(bar_data) > 8) 60 else 45, hjust = 1, size = if (nrow(bar_data) > 10) 7 else 8 ), axis.text.y = ggplot2::element_text(size = 8), legend.position = "none", panel.grid.major.y = ggplot2::element_line(color = "gray90"), panel.grid.minor.y = ggplot2::element_blank(), panel.grid.major.x = ggplot2::element_blank() ) } plot_list[["barplot"]] <- barplot } # 4. 组合图表 combined_plot <- patchwork::wrap_plots(plot_list[plots], ncol = 1, guides = "collect") # 5. 输出 if (shiny) { print(combined_plot) return(invisible(combined_plot)) } else { return(combined_plot) } }