Commit 4688178c authored by wuzekai's avatar wuzekai

update

parent be466d32
......@@ -26,7 +26,8 @@ Imports:
shiny.i18n,
rlang (>= 1.0.6),
ggpp,
nortest
nortest,
naniar
Suggests:
testthat (>= 2.0.0),
pkgdown (>= 1.1.0),
......
......@@ -8,6 +8,7 @@ S3method(plot,cross_tabs)
S3method(plot,goodness)
S3method(plot,homo_variance_test)
S3method(plot,mda)
S3method(plot,missing)
S3method(plot,normality_test)
S3method(plot,outlier)
S3method(plot,prob_binom)
......@@ -30,6 +31,7 @@ S3method(summary,cross_tabs)
S3method(summary,goodness)
S3method(summary,homo_variance_test)
S3method(summary,mda)
S3method(summary,missing)
S3method(summary,normality_test)
S3method(summary,outlier)
S3method(summary,prob_binom)
......@@ -54,6 +56,7 @@ export(get_single_norm)
export(goodness)
export(homo_variance_test)
export(mda)
export(missing)
export(normality_test)
export(outlier)
export(prob_binom)
......
############################################
## Missing Value Analysis
############################################
#' @export
missing <- function(dataset,
vars = NULL,
data_filter = "",
envir = parent.frame()) {
# 1. 基础参数处理
df_name <- if (is_string(dataset)) dataset else deparse(substitute(dataset))
# 2. 数据提取
# 2.1 获取完整数据集(不限制vars,保留所有字段)
full_dataset <- get_data(
dataset,
filt = data_filter, # 仅过滤行,不过滤列
na.rm = FALSE, # 保留缺失值
envir = envir
)
# 2.2 从完整数据集中筛选出用户选中的变量
if (!is.null(vars) && length(vars) > 0) {
# 校验选中变量是否存在于完整数据集
invalid_vars <- setdiff(vars, colnames(full_dataset))
if (length(invalid_vars) > 0) {
stop(paste(i18n$t("Variables not found in dataset:"), paste(invalid_vars, collapse = ", ")), call. = FALSE)
}
dataset <- full_dataset[, vars, drop = FALSE] # 选中变量的子集
} else {
# 未选变量时,默认分析所有变量
dataset <- full_dataset
vars <- colnames(dataset)
}
# 3. 数据校验(仅校验完整数据集非空)
if (nrow(full_dataset) == 0) stop(i18n$t("Dataset is empty."), call. = FALSE)
# 4. 统计计算
# 4.1 整体缺失统计
overall_stats <- tibble::tibble(
total_samples = nrow(dataset),
complete_samples = sum(stats::complete.cases(dataset)), # 无任何缺失的样本数
missing_samples = nrow(dataset) - sum(stats::complete.cases(dataset)), # 含缺失的样本数
missing_sample_pct = round((missing_samples / total_samples) * 100, 2)
)
# 4.2 变量级缺失统计
var_stats <- purrr::map_dfr(vars, function(var) {
var_data <- dataset[[var]]
n_missing <- sum(is.na(var_data))
tibble::tibble(
variable = var,
var_type = class(var_data)[1], # 变量类型
total_samples = length(var_data),
missing_count = n_missing,
missing_pct = round((n_missing / length(var_data)) * 100, 2),
valid_count = length(var_data) - n_missing,
valid_pct = round(((length(var_data) - n_missing) / length(var_data)) * 100, 2)
)
})
# 4.3 单个变量缺失样本的填充率详情
missing_sample_fill_rate <- purrr::map(vars, function(target_var) {
# 1. 找到目标变量(选中的)在完整数据集中的缺失行索引
missing_row_idx <- which(is.na(full_dataset[[target_var]]))
if (length(missing_row_idx) == 0) {
return(list(
target_variable = target_var,
missing_sample_count = 0,
fill_rate_details = tibble::tibble(variable = character(0), fill_count = integer(0), fill_pct = numeric(0))
))
}
# 2. 提取缺失样本的完整数据集
missing_samples_data <- full_dataset[missing_row_idx, , drop = FALSE]
# 3. 遍历完整数据集的所有字段计算填充率
fill_rate <- purrr::map_dfr(colnames(full_dataset), function(var) {
n_valid <- sum(!is.na(missing_samples_data[[var]]))
tibble::tibble(
variable = var,
fill_count = n_valid,
fill_pct = round((n_valid / nrow(missing_samples_data)) * 100, 2)
)
})
list(
target_variable = target_var,
missing_sample_count = length(missing_row_idx),
fill_rate_details = fill_rate
)
})
names(missing_sample_fill_rate) <- vars
# 5. 结果打包
out <- structure(
list(
df_name = df_name,
vars = vars,
data_filter = if (data_filter == "") "None" else data_filter,
overall_stats = overall_stats,
var_stats = var_stats,
missing_sample_fill_rate = missing_sample_fill_rate,
raw_data = dataset
),
class = "missing"
)
out
}
# ------------------------------
# Summary方法:展示缺失值统计结果
# ------------------------------
#' @export
summary.missing <- function(object, dec = 2, ...) {
# 1. 基础信息
cat(i18n$t("Missing Value Analysis Results\n"))
cat(i18n$t("Data :"), object$df_name, "\n")
cat(i18n$t("Variables :"), paste(object$vars, collapse = ", "), "\n")
cat(i18n$t("Filter :"), object$data_filter, "\n\n")
# 2. 整体缺失统计
cat("=== 1. Overall Missing Statistics ===\n")
overall_formatted <- object$overall_stats %>%
dplyr::mutate(
missing_sample_pct = paste0(missing_sample_pct, "%")
) %>%
dplyr::rename(
"Total Samples" = total_samples,
"Complete Samples" = complete_samples,
"Samples with Missing Values" = missing_samples,
"Missing Sample %" = missing_sample_pct
)
print(overall_formatted, row.names = FALSE, right = FALSE)
cat("\n")
# 3. 变量级缺失统计
cat("=== 2. Variable-wise Missing Statistics ===\n")
var_formatted <- object$var_stats %>%
dplyr::mutate(
missing_pct = paste0(missing_pct, "%"),
valid_pct = paste0(valid_pct, "%")
) %>%
dplyr::rename(
"Variable" = variable,
"Variable Type" = var_type,
"Total Samples" = total_samples,
"Missing Count" = missing_count,
"Missing %" = missing_pct,
"Valid Count" = valid_count,
"Valid %" = valid_pct
)
print(var_formatted, row.names = FALSE, right = FALSE)
cat("\n")
# 4. 单个变量缺失样本的填充率详情
cat("=== 3. Fill Rate Details of Samples with Missing Values (By Variable) ===\n")
for (var in object$vars) {
fill_rate_data <- object$missing_sample_fill_rate[[var]]
if (fill_rate_data$missing_sample_count == 0) {
cat(paste("•", var, ":", i18n$t("No missing values, no fill rate details.\n")))
next
}
cat(paste("• Target Variable:", var, "|", i18n$t("Missing Sample Count:"), fill_rate_data$missing_sample_count, "\n"))
fill_formatted <- fill_rate_data$fill_rate_details %>%
dplyr::mutate(fill_pct = paste0(fill_pct, "%")) %>%
dplyr::rename(
"Variable" = variable,
"Fill Count" = fill_count,
"Fill %" = fill_pct
)
print(fill_formatted, row.names = FALSE, right = FALSE)
cat("\n")
}
invisible(object)
}
# ------------------------------
# Plot方法:生成缺失值可视化图表
# ------------------------------
#' @export
plot.missing <- function(x,
plots = c("heatmap", "barplot"),
shiny = FALSE, ...) {
# 1. 基础校验
if (length(plots) == 0) {
return(ggplot2::ggplot() +
ggplot2::annotate("text", x = 1, y = 1, label = i18n$t("No plots selected")) +
ggplot2::theme_void())
}
plot_list <- list()
raw_data <- x$raw_data
# 2. 缺失热图
if ("heatmap" %in% plots) {
# 步骤1:数据预处理(先校验数据,再安全抽样)
if (nrow(raw_data) == 0) {
# 空数据时返回提示图
heatmap_data <- ggplot2::ggplot() +
ggplot2::annotate("text", x = 1, y = 1, label = i18n$t("No data to display")) +
ggplot2::theme_void()
plot_list[["heatmap"]] <- heatmap_data
next
}
# 步骤2:安全抽样(优先保留缺失样本,避免weight_by报错)
# 2.1 分离缺失样本和有效样本
missing_sample_idx <- which(!stats::complete.cases(raw_data)) # 有缺失值的样本索引
valid_sample_idx <- which(stats::complete.cases(raw_data)) # 无缺失值的样本索引
# 2.2 抽样逻辑:优先保留所有缺失样本,再随机补有效样本至3000
max_samples <- 3000
selected_idx <- c()
# 先加所有缺失样本
if (length(missing_sample_idx) > 0) {
selected_idx <- c(selected_idx, missing_sample_idx)
}
# 再补有效样本(不超过max_samples)
need_samples <- max_samples - length(selected_idx)
if (need_samples > 0 && length(valid_sample_idx) > 0) {
selected_valid <- sample(valid_sample_idx, size = min(need_samples, length(valid_sample_idx)))
selected_idx <- c(selected_idx, selected_valid)
}
# 无缺失样本时,直接随机抽有效样本
if (length(selected_idx) == 0) {
selected_idx <- sample(1:nrow(raw_data), size = min(max_samples, nrow(raw_data)))
}
# 2.3 提取抽样后的数据并整理
heatmap_prep <- raw_data[selected_idx, , drop = FALSE] %>%
tibble::rowid_to_column("sample_id") %>% # 新增样本ID
tidyr::pivot_longer(cols = -sample_id, names_to = "variable", values_to = "value") %>%
dplyr::mutate(is_missing = is.na(value)) # 标记缺失状态
# 步骤3:绘制优化热图(无报错+高对比)
heatmap_data <- ggplot2::ggplot(
heatmap_prep,
ggplot2::aes(x = variable, y = reorder(sample_id, -sample_id), fill = is_missing)
) +
# 核心1:矩形块+白色边框(区分度拉满)
ggplot2::geom_tile(color = "white", size = 0.2) +
# 核心2:高对比配色(缺失=亮红,有效=深灰)
ggplot2::scale_fill_manual(
values = c("FALSE" = "#2c3e50", "TRUE" = "#e74c3c"),
labels = c("FALSE" = i18n$t("Valid"), "TRUE" = i18n$t("Missing")),
name = i18n$t("Data Status"),
drop = FALSE # 强制显示两种状态,避免无缺失时图例消失
) +
# 核心3:尺寸适配+标签优化
ggplot2::theme_minimal() +
ggplot2::theme(
# 动态尺寸(避免压缩)
plot.width = ggplot2::unit(min(12, length(colnames(raw_data)) * 0.8), "in"),
plot.height = ggplot2::unit(min(8, nrow(heatmap_prep) * 0.006), "in"),
# 标签优化
axis.text.x = ggplot2::element_text(angle = 45, hjust = 1, size = 10, face = "bold"),
axis.text.y = ggplot2::element_text(size = 6),
axis.title = ggplot2::element_text(size = 11, face = "bold"),
# 图例优化(置顶+横向)
legend.position = "top",
legend.direction = "horizontal",
legend.key.width = ggplot2::unit(1.5, "cm"),
legend.title = ggplot2::element_text(size = 10),
legend.text = ggplot2::element_text(size = 9),
# 去除网格线
panel.grid = ggplot2::element_blank(),
# 标题样式
plot.title = ggplot2::element_text(hjust = 0.5, size = 14, face = "bold", margin = ggplot2::margin(b = 10)),
plot.subtitle = ggplot2::element_text(hjust = 0.5, size = 11, color = "#666666")
) +
# 标签设置
ggplot2::labs(
title = i18n$t("Missing Value Distribution Heatmap"),
subtitle = i18n$t("Red = Missing, Dark Gray = Valid | Max 3000 samples (missing samples prioritized)"),
x = i18n$t("Variables"),
y = i18n$t("Sample ID")
)
plot_list[["heatmap"]] <- heatmap_data
}
# 3. 缺失条形图
if ("barplot" %in% plots) {
barplot <- NULL
if (!"variable" %in% colnames(x$var_stats) || !"missing_pct" %in% colnames(x$var_stats)) {
barplot <- ggplot2::ggplot() +
ggplot2::annotate("text", x = 1, y = 1, label = paste("原始列名:", paste(colnames(x$var_stats), collapse = ", "))) +
ggplot2::theme_void()
} else {
# 提取数据
bar_data <- data.frame(
variable = x$var_stats$variable,
missing_pct = x$var_stats$missing_pct
)
bar_data$variable <- factor(bar_data$variable, levels = x$var_stats$variable)
# ========== 加调试代码:打印bar_data内容 ==========
cat("=== bar_data真实内容 ===\n")
print(bar_data)
cat("========================\n")
# 画图
barplot <- ggplot2::ggplot(bar_data, ggplot2::aes(x = variable, y = missing_pct, fill = variable)) +
ggplot2::geom_bar(stat = "identity", alpha = 0.8) +
ggplot2::scale_fill_brewer(palette = "Set2") +
ggplot2::scale_y_continuous(
limits = c(0, max(bar_data$missing_pct) * 1.2),
breaks = function(limits) seq(0, limits[2], by = max(limits[2]/10, 0.5)), # 自动分刻度
expand = c(0, 0)
) +
ggplot2::geom_text(
aes(label = round(missing_pct, 2)),
vjust = -0.3,
size = 2.5,
color = "black"
) +
ggplot2::labs(
x = "Variables",
y = "Missing %",
title = "Variable-wise Missing Percentage",
fill = NULL
) +
ggplot2::theme_minimal() +
ggplot2::theme(
plot.title = ggplot2::element_text(hjust = 0.5, size = 12),
axis.text.x = ggplot2::element_text(
angle = if (nrow(bar_data) > 8) 60 else 45,
hjust = 1,
size = if (nrow(bar_data) > 10) 7 else 8
),
axis.text.y = ggplot2::element_text(size = 8),
legend.position = "none",
panel.grid.major.y = ggplot2::element_line(color = "gray90"),
panel.grid.minor.y = ggplot2::element_blank(),
panel.grid.major.x = ggplot2::element_blank()
)
}
plot_list[["barplot"]] <- barplot
}
# 4. 组合图表
combined_plot <- patchwork::wrap_plots(plot_list[plots], ncol = 1, guides = "collect")
# 5. 输出
if (shiny) {
print(combined_plot)
return(invisible(combined_plot))
} else {
return(combined_plot)
}
}
############################################
## Missing Value Analysis - UI
############################################
## 1. 标签
missing_plots <- c("heatmap", "barplot")
names(missing_plots) <- c(
i18n$t("Missing Heatmap"),
i18n$t("Missing Barplot")
)
## 2. 函数形参
missing_args <- as.list(formals(missing))
missing_args <- missing_args[names(missing_args) %in% c("dataset", "vars", "data_filter")]
## 3. 输入收集
missing_inputs <- reactive({
req(input$dataset)
inputs <- list(
dataset = input$dataset,
vars = input$missing_vars, # 用户选择的变量
data_filter = if (input$show_filter) input$data_filter else "",
envir = r_data
)
# 校验参数完整性
for (arg in names(missing_args)) {
if (is.null(inputs[[arg]]) || length(inputs[[arg]]) == 0) {
inputs[[arg]] <- missing_args[[arg]]
}
}
inputs
})
## 4. 变量选择UI
output$ui_missing_vars <- renderUI({
req(input$dataset)
current_data <- get_data(input$dataset, envir = r_data)
all_vars <- colnames(current_data)
if (length(all_vars) == 0) {
return(div(class = "alert alert-warning", i18n$t("No variables in dataset. Please select another dataset.")))
}
# 显示变量类型(数值型/分类型)
var_types <- sapply(current_data[, all_vars, drop = FALSE], function(col) class(col)[1])
choices <- setNames(nm = paste0(all_vars, " {", var_types, "}"), object = all_vars)
selectizeInput(
inputId = "missing_vars",
label = i18n$t("Select variables to analyze:"),
choices = choices,
selected = state_multiple("missing_vars", character(0)),
multiple = TRUE,
options = list(placeholder = i18n$t("Select one or more variables"), plugins = list("remove_button", "drag_drop"))
)
})
## 5. 主UI(Summary + Plot标签页)
output$ui_missing <- renderUI({
req(input$dataset)
tagList(
wellPanel(
# Summary标签页:变量选择 + 统计结果
conditionalPanel(
condition = "input.tabs_missing == 'Summary'",
uiOutput("ui_missing_vars")
),
# Plot标签页:可视化类型选择
conditionalPanel(
condition = "input.tabs_missing == 'Plot'",
selectizeInput(
inputId = "missing_plots",
label = i18n$t("Select plots:"),
choices = missing_plots,
selected = state_multiple("missing_plots", missing_plots, "barplot"), # 默认选中条形图
multiple = TRUE,
options = list(placeholder = i18n$t("Select plot types"), plugins = list("remove_button", "drag_drop"))
)
)
),
# 帮助与报告(和离群值分析保持一致)
help_and_report(
modal_title = i18n$t("Missing Value Analysis"),
fun_name = "missing",
help_file = inclMD(file.path(getOption("radiant.path.basics"), "app/tools/help/missing.md"))
)
)
})
## 6. 图表尺寸计算
missing_plot_dims <- reactive({
req(.missing())
plot_count <- length(input$missing_plots)
var_count <- length(.missing()$vars)
# 基础高度
base_height_px <- if ("heatmap" %in% input$missing_plots) {
min(500 + (nrow(.missing()$raw_data) * 0.5), 1200) # 样本越多,热图越高
} else {
400
}
total_height_px <- base_height_px * plot_count
# 限制最大/最小高度
total_height_px <- min(total_height_px, 2000)
total_height_px <- max(total_height_px, 500)
list(
width = 800,
height = total_height_px
)
})
missing_plot_width <- function() missing_plot_dims()$width
missing_plot_height <- function() missing_plot_dims()$height
## 7. 输出面板
output$missing <- renderUI({
# 注册输出组件
register_print_output("summary_missing", ".summary_missing")
register_plot_output("plot_missing", ".plot_missing",
height_fun = "missing_plot_height")
# 标签页布局(Summary + Plot)
missing_panels <- tabsetPanel(
id = "tabs_missing",
tabPanel(
title = i18n$t("Summary"),
value = "Summary",
verbatimTextOutput("summary_missing", placeholder = TRUE)
),
tabPanel(
title = i18n$t("Plot"),
value = "Plot",
download_link("dlp_missing"), # 下载按钮
plotOutput("plot_missing", height = "100%"),
style = "margin-top: 10px;"
)
)
# 集成到Data Quality菜单下
stat_tab_panel(
menu = i18n$t("Basics > Data Quality"),
tool = i18n$t("Missing Value Analysis"),
tool_ui = "ui_missing",
output_panels = missing_panels
)
})
## 8. 可用性检验
missing_available <- reactive({
req(input$dataset)
current_data <- get_data(input$dataset, envir = r_data)
# 校验是否选择变量:未选则返回提示,阻止后续计算
if (not_available(input$missing_vars)) {
return(i18n$t("Please select at least one variable to analyze."))
}
# 校验变量是否存在
invalid_vars <- input$missing_vars[!input$missing_vars %in% colnames(current_data)]
if (length(invalid_vars) > 0) {
return(i18n$t(paste("Invalid variables: ", paste(invalid_vars, collapse = ", "), ". Please reselect.", sep = "")))
}
"available"
})
## 9. 计算核心
.missing <- reactive({
req(missing_available() == "available")
do.call(missing, missing_inputs())
})
## 10. Summary输出
.summary_missing <- reactive({
req(missing_available() == "available")
summary(.missing())
})
## 11. Plot输出
.plot_missing <- reactive({
req(missing_available() == "available")
validate(need(input$missing_plots, i18n$t("Please select at least one plot type first.")))
withProgress(message = i18n$t("Generating missing value plots..."), value = 0.5, {
p <- plot(.missing(), plots = input$missing_plots, shiny = TRUE)
setProgress(value = 1)
})
p
})
## 12. 下载功能
download_handler(
id = "dlp_missing",
fun = function(file) {
plot_obj <- .plot_missing()
width_in <- missing_plot_width() / 96
height_in <- missing_plot_height() / 96
ggsave(
filename = file,
plot = plot_obj,
width = width_in,
height = height_in,
device = "png",
dpi = 300,
limitsize = FALSE,
bg = "white"
)
},
fn = function() paste0(input$dataset, "_missing_value_analysis"),
type = "png",
caption = i18n$t("Save missing value plots")
)
## 13. 报告生成
missing_report <- function() {
req(missing_available() == "available")
figs <- length(input$missing_plots) > 0
update_report(
inp_main = clean_args(missing_inputs(), missing_args),
fun_name = "missing",
inp_out = if (figs) list("", list(plots = input$missing_plots)) else list(""),
outputs = if (figs) c("summary", "plot") else "summary",
figs = figs,
fig.width = missing_plot_width(),
fig.height = missing_plot_height()
)
}
## 14. 截图功能
observeEvent(input$missing_report, {
r_info[["latest_screenshot"]] <- NULL
missing_report()
})
observeEvent(input$missing_screenshot, {
r_info[["latest_screenshot"]] <- NULL
radiant_screenshot_modal("modal_missing_screenshot")
})
observeEvent(input$modal_missing_screenshot, {
missing_report()
removeModal()
})
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment