Commit 66193f60 authored by wuzekai's avatar wuzekai

update:更新了离群值分析功能

parent cfd0855b
...@@ -9,6 +9,7 @@ S3method(plot,goodness) ...@@ -9,6 +9,7 @@ S3method(plot,goodness)
S3method(plot,homo_variance_test) S3method(plot,homo_variance_test)
S3method(plot,mda) S3method(plot,mda)
S3method(plot,normality_test) S3method(plot,normality_test)
S3method(plot,outlier)
S3method(plot,prob_binom) S3method(plot,prob_binom)
S3method(plot,prob_chisq) S3method(plot,prob_chisq)
S3method(plot,prob_disc) S3method(plot,prob_disc)
...@@ -30,6 +31,7 @@ S3method(summary,goodness) ...@@ -30,6 +31,7 @@ S3method(summary,goodness)
S3method(summary,homo_variance_test) S3method(summary,homo_variance_test)
S3method(summary,mda) S3method(summary,mda)
S3method(summary,normality_test) S3method(summary,normality_test)
S3method(summary,outlier)
S3method(summary,prob_binom) S3method(summary,prob_binom)
S3method(summary,prob_chisq) S3method(summary,prob_chisq)
S3method(summary,prob_disc) S3method(summary,prob_disc)
...@@ -53,6 +55,7 @@ export(goodness) ...@@ -53,6 +55,7 @@ export(goodness)
export(homo_variance_test) export(homo_variance_test)
export(mda) export(mda)
export(normality_test) export(normality_test)
export(outlier)
export(prob_binom) export(prob_binom)
export(prob_chisq) export(prob_chisq)
export(prob_disc) export(prob_disc)
......
############################################
## Outlier Analysis (IQR/Z-score Method)
############################################
#' @export
outlier <- function(dataset,
vars,
method = c("iqr", "zscore"),
iqr_multiplier = 1.5,
z_threshold = 3,
data_filter = "",
envir = parent.frame()) {
# 1. 基础参数处理
method <- match.arg(method, choices = c("iqr", "zscore"))
df_name <- if (is_string(dataset)) dataset else deparse(substitute(dataset))
# 2. 数据提取:仅保留选择的变量+过滤数据
dataset <- get_data(
dataset,
vars = vars,
filt = data_filter,
na.rm = FALSE, # 保留缺失值,后续标记为非离群值
envir = envir
)
# 3. 数据校验
if (length(vars) == 0) stop("Please select at least one numeric variable.", call. = FALSE)
if (!all(vars %in% colnames(dataset))) {
stop(paste("Variables not found in dataset:", paste(setdiff(vars, colnames(dataset)), collapse = ", ")), call. = FALSE)
}
# 4. 离群值计算
outlier_results <- list()
for (var in vars) {
var_data <- dataset[[var]]
valid_data <- var_data[!is.na(var_data)] # 仅对非缺失值计算
if (length(valid_data) < 5) {
outlier_results[[var]] <- list(
overview = tibble::tibble(
var = var,
n_total = length(var_data),
n_valid = length(valid_data),
n_outlier = 0,
outlier_pct = 0,
lower_bound = NA_real_,
upper_bound = NA_real_
),
# 强制deviation为character(0)
details = tibble::tibble(
row_idx = integer(0),
value = numeric(0),
deviation = character(0) # 明确指定字符型空向量
)
)
next
}
# 调用辅助函数计算离群值
if (method == "iqr") {
res <- calc_outlier_iqr(valid_data, multiplier = iqr_multiplier)
} else {
res <- calc_outlier_zscore(valid_data, threshold = z_threshold)
}
# 匹配原始行索引
outlier_idx <- which(!is.na(var_data) & var_data %in% res$outlier_values)
# 当无离群值时,deviation仍为character(0)
deviation <- if (length(outlier_idx) == 0) {
character(0)
} else {
ifelse(var_data[outlier_idx] < res$lower_bound, "Below Lower Bound", "Above Upper Bound")
}
# 整理结果
outlier_results[[var]] <- list(
overview = tibble::tibble(
var = var,
n_total = length(var_data),
n_valid = length(valid_data),
n_outlier = length(res$outlier_values),
outlier_pct = round(length(res$outlier_values)/length(valid_data)*100, 2),
lower_bound = round(res$lower_bound, 3),
upper_bound = round(res$upper_bound, 3)
),
details = tibble::tibble(
row_idx = outlier_idx,
value = var_data[outlier_idx],
deviation = deviation
)
)
}
# 5. 结果打包
out <- structure(
list(
df_name = df_name,
vars = vars,
method = method,
params = list(
iqr_multiplier = if (method == "iqr") iqr_multiplier else NA,
z_threshold = if (method == "zscore") z_threshold else NA
),
data_filter = if (data_filter == "") "None" else data_filter,
results = outlier_results, # 每个变量的离群值结果
raw_data = dataset # 原始数据
),
class = "outlier"
)
out
}
# ------------------------------
# 辅助函数1:IQR法计算离群值
# ------------------------------
calc_outlier_iqr <- function(data, multiplier = 1.5) {
q1 <- stats::quantile(data, 0.25, na.rm = TRUE)
q3 <- stats::quantile(data, 0.75, na.rm = TRUE)
iqr_val <- q3 - q1
lower <- q1 - multiplier * iqr_val
upper <- q3 + multiplier * iqr_val
outlier_values <- data[data < lower | data > upper]
list(
lower_bound = lower,
upper_bound = upper,
outlier_values = outlier_values
)
}
# ------------------------------
# 辅助函数2:Z-score法计算离群值
# ------------------------------
calc_outlier_zscore <- function(data, threshold = 3) {
mean_val <- mean(data, na.rm = TRUE)
sd_val <- stats::sd(data, na.rm = TRUE)
z_scores <- (data - mean_val) / sd_val
lower <- mean_val - threshold * sd_val
upper <- mean_val + threshold * sd_val
outlier_values <- data[abs(z_scores) > threshold]
list(
lower_bound = lower,
upper_bound = upper,
outlier_values = outlier_values,
mean = mean_val,
sd = sd_val
)
}
# ------------------------------
# Summary方法:展示离群值概览
# ------------------------------
#' @export
summary.outlier <- function(object, dec = 3, ...) {
# 1. 基础信息
cat("Outlier Analysis Results\n")
cat("Data :", object$df_name, "\n")
cat("Variables :", paste(object$vars, collapse = ", "), "(numeric)\n")
cat("Method :", if (object$method == "iqr") paste("IQR Method (Multiplier =", object$params$iqr_multiplier, ")") else paste("Z-score Method (Threshold =", object$params$z_threshold, ")"), "\n")
cat("Filter :", object$data_filter, "\n\n")
# 2. 离群值概览表
cat("=== 1. Outlier Overview ===\n")
overview_df <- purrr::map_dfr(object$results, ~ .x$overview)
overview_formatted <- overview_df %>%
dplyr::mutate(
outlier_pct = paste0(outlier_pct, "%"),
lower_bound = as.character(round(lower_bound, dec)),
upper_bound = as.character(round(upper_bound, dec))
) %>%
dplyr::rename(
"Variable" = var,
"Total Samples" = n_total,
"Valid Samples" = n_valid,
"Outlier Count" = n_outlier,
"Outlier %" = outlier_pct,
"Lower Bound" = lower_bound,
"Upper Bound" = upper_bound
) %>%
as.data.frame(stringsAsFactors = FALSE)
print(overview_formatted, row.names = FALSE, right = FALSE)
cat("\n")
# 3. 离群值明细提示
cat("=== 2. Outlier Details (By Variable) ===\n")
for (var in object$vars) {
details <- object$results[[var]]$details
if (nrow(details) == 0) {
cat(paste("•", var, ": No outliers detected\n"))
} else {
cat(paste("•", var, ":", nrow(details), "outliers\n"))
details_formatted <- details %>%
dplyr::mutate(
value = round(value, dec)
) %>%
dplyr::rename(
"Row Index" = row_idx,
"Value" = value,
"Deviation" = deviation
)
print(details_formatted, row.names = FALSE, right = FALSE)
cat("\n")
}
}
# 4. 方法说明
cat("=== 3. Method Explanation ===\n")
if (object$method == "iqr") {
cat("• IQR Method: Outliers are values outside [Q1 - k×IQR, Q3 + k×IQR] (k =", object$params$iqr_multiplier, ")\n")
cat("• Q1 = 25th percentile, Q3 = 75th percentile, IQR = Q3 - Q1\n")
} else {
cat("• Z-score Method: Outliers are values with |Z-score| >", object$params$z_threshold, "\n")
cat("• Z-score = (Value - Mean) / Standard Deviation\n")
}
invisible(object)
}
# ------------------------------
# Plot方法:可视化离群值
# ------------------------------
#' @export
plot.outlier <- function(x,
plots = c("boxplot", "histogram", "scatter"),
shiny = FALSE, custom = FALSE, ...) {
# 1. 基础校验
if (length(plots) == 0) {
return(ggplot2::ggplot() +
ggplot2::annotate("text", x = 1, y = 1, label = i18n$t("No plots selected")) +
ggplot2::theme_void())
}
plot_list <- list()
vars <- x$vars
raw_data <- x$raw_data %>%
tidyr::pivot_longer(cols = all_of(vars), names_to = "variable", values_to = "value") # 长格式便于分面
# 2. 箱线图
if ("boxplot" %in% plots) {
raw_data <- raw_data %>%
dplyr::mutate(variable = as.factor(variable))
p <- ggplot2::ggplot(raw_data, ggplot2::aes(x = 1, y = value, fill = variable)) +
ggplot2::geom_boxplot(alpha = 0.7, outlier.color = "red", outlier.size = 2, show.legend = FALSE) +
ggplot2::scale_fill_brewer(palette = "Set2") +
ggplot2::facet_wrap(~variable, scales = "free_y", ncol = 2) +
ggplot2::labs(
x = "",
y = i18n$t("Value"),
title = i18n$t("Outlier Detection: Boxplot (Per Variable)")
) +
ggplot2::theme_minimal() +
ggplot2::theme(
axis.text.x = ggplot2::element_blank(),
strip.text = ggplot2::element_text(size = 11),
plot.title = ggplot2::element_text(hjust = 0.5, size = 12),
panel.spacing = ggplot2::unit(1, "cm")
)
plot_list[["boxplot"]] <- p
}
# 3. 直方图
if ("histogram" %in% plots) {
# 合并所有变量的界值
bound_data <- purrr::map_dfr(x$results, function(res) {
tibble::tibble(
variable = res$overview$var,
lower_bound = res$overview$lower_bound,
upper_bound = res$overview$upper_bound
)
})
p <- ggplot2::ggplot(raw_data, ggplot2::aes(x = value)) +
ggplot2::geom_histogram(fill = "#4287f5", alpha = 0.7, bins = 30) +
ggplot2::geom_vline(
data = bound_data,
ggplot2::aes(xintercept = lower_bound),
color = "red", linetype = "dashed", linewidth = 1
) +
ggplot2::geom_vline(
data = bound_data,
ggplot2::aes(xintercept = upper_bound),
color = "red", linetype = "dashed", linewidth = 1
) +
ggplot2::facet_wrap(~variable, scales = "free") +
ggplot2::labs(x = i18n$t("Value"), y = i18n$t("Count"),
title = i18n$t("Outlier Detection: Histogram (Red Dashed = Thresholds)")) +
ggplot2::theme_minimal() +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 12))
plot_list[["histogram"]] <- p
}
# 4. 散点图
if ("scatter" %in% plots) {
if (length(vars) >= 2) {
# 变量数≥2:正常生成散点图
outlier_row_idx <- purrr::map(x$results, ~ .x$details$row_idx)
outlier_row_idx <- unique(unlist(outlier_row_idx))
# 取前两个变量做散点图
var1 <- vars[1]
var2 <- vars[2]
# 构建散点图数据
scatter_data <- x$raw_data %>%
dplyr::mutate(
row_idx = dplyr::row_number(),
is_outlier = row_idx %in% outlier_row_idx
) %>%
dplyr::select(row_idx, all_of(c(var1, var2)), is_outlier)
p <- ggplot2::ggplot(scatter_data, ggplot2::aes(x = .data[[var1]], y = .data[[var2]], color = is_outlier)) +
ggplot2::geom_point(alpha = 0.7, size = 1.5) +
ggplot2::scale_color_manual(values = c("black", "red"), labels = c("Normal", "Outlier")) +
ggplot2::labs(x = var1, y = var2, color = i18n$t("Type"),
title = i18n$t("Outlier Detection: Scatter Plot")) +
ggplot2::theme_minimal() +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 12))
} else {
p <- ggplot2::ggplot() +
ggplot2::annotate("text",
x = 1, y = 1,
label = i18n$t("Scatter Plot requires at least 2 numeric variables.\nPlease select more variables."),
size = 4.5, color = "#666666") +
ggplot2::labs(title = i18n$t("Outlier Detection: Scatter Plot")) +
ggplot2::theme_minimal() +
ggplot2::theme(
plot.title = ggplot2::element_text(hjust = 0.5, size = 12),
axis.text = ggplot2::element_blank(),
axis.title = ggplot2::element_blank(),
panel.grid = ggplot2::element_blank()
)
}
plot_list[["scatter"]] <- p
}
# 5. 组合图表
combined_plot <- patchwork::wrap_plots(plot_list[plots], ncol = 1, guides = "collect")
# 6. 输出
if (shiny) {
print(combined_plot)
return(invisible(combined_plot))
} else {
return(combined_plot)
}
}
...@@ -31,6 +31,9 @@ options( ...@@ -31,6 +31,9 @@ options(
tags$head( tags$head(
tags$script(src = "www_basics/js/run_return.js") tags$script(src = "www_basics/js/run_return.js")
), ),
i18n$t("Data Quality"),
tabPanel(i18n$t("Missing Value Analysis"), uiOutput("missing")),
tabPanel(i18n$t("Outlier Analysis"), uiOutput("outlier")),
i18n$t("Probability"), i18n$t("Probability"),
tabPanel(i18n$t("Probability calculator"), uiOutput("prob_calc")), tabPanel(i18n$t("Probability calculator"), uiOutput("prob_calc")),
tabPanel(i18n$t("Central Limit Theorem"), uiOutput("clt")), tabPanel(i18n$t("Central Limit Theorem"), uiOutput("clt")),
......
############################################ ############################################
## Multigroup Difference Analysis (ANOVA/KW) - UI ## Multigroup Difference Analysis (ANOVA/KW) - UI
## 对齐单独检验的UI设计:简洁+严格校验+统一风格
############################################ ############################################
## 1. 翻译标签(对齐单独检验的i18n逻辑,保持术语一致) ## 1. 翻译标签
mda_norm_type <- c("overall", "by_group") mda_norm_type <- c("overall", "by_group")
names(mda_norm_type) <- c(i18n$t("Overall (Whole variable)"), names(mda_norm_type) <- c(i18n$t("Overall (Whole variable)"),
i18n$t("By Group (Each level separately)")) i18n$t("By Group (Each level separately)"))
......
############################################
## Outlier Analysis - UI
############################################
## 1. 翻译标签
outlier_methods <- c("iqr", "zscore")
names(outlier_methods) <- c(i18n$t("IQR Method (1.5×IQR)"),
i18n$t("Z-score Method (±3σ)"))
outlier_plots <- c("boxplot", "histogram", "scatter")
names(outlier_plots) <- c(i18n$t("Boxplot (Mark Outliers)"),
i18n$t("Histogram (With Thresholds)"),
i18n$t("Scatter Plot (Variable Pairs)"))
## 2. 函数形参
outlier_args <- as.list(formals(outlier))
outlier_args <- outlier_args[names(outlier_args) %in% c("dataset", "vars", "method", "iqr_multiplier", "z_threshold", "data_filter")]
## 3. 输入收集
outlier_inputs <- reactive({
req(input$dataset)
inputs <- list(
dataset = input$dataset,
vars = input$outlier_vars,
method = input$outlier_method,
iqr_multiplier = input$outlier_iqr_multiplier,
z_threshold = input$outlier_z_threshold,
data_filter = if (input$show_filter) input$data_filter else "",
envir = r_data
)
# 校验参数完整性
for (arg in names(outlier_args)) {
if (is.null(inputs[[arg]]) || length(inputs[[arg]]) == 0) {
inputs[[arg]] <- outlier_args[[arg]]
}
}
inputs
})
## 4. 变量选择UI
output$ui_outlier_vars <- renderUI({
req(input$dataset)
current_data <- get_data(input$dataset, envir = r_data)
is_num <- sapply(current_data, function(col) is.numeric(col) || is.ts(col))
num_vars <- names(is_num)[is_num]
if (length(num_vars) == 0) {
return(div(class = "alert alert-warning", i18n$t("No numeric variables in dataset. Please select another dataset.")))
}
# 提取变量类型并显示
var_types <- sapply(current_data[, num_vars, drop = FALSE], function(col) class(col)[1])
choices <- setNames(nm = paste0(num_vars, " {", var_types, "}"), object = num_vars)
selectizeInput(
inputId = "outlier_vars",
label = i18n$t("Select numeric variable:"),
choices = choices,
selected = state_multiple("outlier_vars", num_vars),
multiple = TRUE,
options = list(placeholder = i18n$t("Select one or more variables"), plugins = list("remove_button", "drag_drop"))
)
})
## 5. 方法参数调整UI
output$ui_outlier_params <- renderUI({
req(input$outlier_method)
tagList(
# IQR方法:调整倍数(默认1.5)
conditionalPanel(
condition = "input.outlier_method == 'iqr'",
numericInput(
inputId = "outlier_iqr_multiplier",
label = i18n$t("IQR Multiplier:"),
value = state_init("outlier_iqr_multiplier", 1.5),
min = 0.5, max = 5, step = 0.5
)
),
# Z-score方法:调整阈值(默认3)
conditionalPanel(
condition = "input.outlier_method == 'zscore'",
numericInput(
inputId = "outlier_z_threshold",
label = i18n$t("Z-score Threshold:"),
value = state_init("outlier_z_threshold", 3),
min = 1.5, max = 5, step = 0.5
)
)
)
})
## 6. 主UI
output$ui_outlier <- renderUI({
req(input$dataset)
tagList(
wellPanel(
# Summary标签页:变量选择+方法选择+参数调整
conditionalPanel(
condition = "input.tabs_outlier == 'Summary'",
uiOutput("ui_outlier_vars"),
radioButtons(
inputId = "outlier_method",
label = i18n$t("Select outlier detection method:"),
choices = outlier_methods,
selected = state_single("outlier_method", outlier_methods, "iqr"),
inline = FALSE
),
uiOutput("ui_outlier_params") # 动态参数面板
),
# Plot标签页:图表选择
conditionalPanel(
condition = "input.tabs_outlier == 'Plot'",
selectizeInput(
inputId = "outlier_plots",
label = i18n$t("Select plots:"),
choices = outlier_plots,
selected = state_multiple("outlier_plots", outlier_plots, "boxplot"),
multiple = TRUE,
options = list(placeholder = i18n$t("Select plot types"), plugins = list("remove_button", "drag_drop"))
)
)
),
# 帮助与报告
help_and_report(
modal_title = i18n$t("Outlier Analysis"),
fun_name = "outlier",
help_file = inclMD(file.path(getOption("radiant.path.basics"), "app/tools/help/outlier.md"))
)
)
})
## 7. 图表尺寸
outlier_plot_dims <- reactive({
req(.outlier())
plot_count <- length(input$outlier_plots)
var_count <- length(.outlier()$vars) # 选择的变量数
# 每个子图基础高度(像素)
base_height_px <- 300
total_height_px <- base_height_px * plot_count * ceiling(var_count / 2) # 2列布局
# 限制最大/最小高度
total_height_px <- min(total_height_px, 2500)
total_height_px <- max(total_height_px, 500)
list(
width = 800, # 宽屏适配多变量
height = total_height_px
)
})
outlier_plot_width <- function() outlier_plot_dims()$width
outlier_plot_height <- function() outlier_plot_dims()$height
## 8. 输出面板
output$outlier<- renderUI({
# 注册输出组件
register_print_output("summary_outlier", ".summary_outlier")
register_plot_output("plot_outlier", ".plot_outlier",
height_fun = "outlier_plot_height")
# 标签页布局
outlier_panels <- tabsetPanel(
id = "tabs_outlier",
tabPanel(
title = i18n$t("Summary"),
value = "Summary",
verbatimTextOutput("summary_outlier", placeholder = TRUE)
),
tabPanel(
title = i18n$t("Plot"),
value = "Plot",
download_link("dlp_outlier"), # 下载按钮
plotOutput("plot_outlier", height = "100%"),
style = "margin-top: 10px;"
)
)
stat_tab_panel(
menu = i18n$t("Basics > Data Quality"),
tool = i18n$t("Outlier Analysis"),
tool_ui = "ui_outlier",
output_panels = outlier_panels
)
})
## 9. 可用性检验
outlier_available <- reactive({
req(input$dataset)
current_data <- get_data(input$dataset, envir = r_data)
# 校验是否选择变量
if (not_available(input$outlier_vars)) {
return(i18n$t("Please select at least one numeric variable."))
}
# 校验变量是否存在且为数值型
invalid_vars <- input$outlier_vars[!input$outlier_vars %in% colnames(current_data)]
if (length(invalid_vars) > 0) {
return(i18n$t(paste("Invalid variables: ", paste(invalid_vars, collapse = ", "), ". Please reselect.", sep = "")))
}
# 校验变量是否为数值型
non_num_vars <- input$outlier_vars[!sapply(current_data[, input$outlier_vars, drop = FALSE], is.numeric)]
if (length(non_num_vars) > 0) {
return(i18n$t(paste("Non-numeric variables: ", paste(non_num_vars, collapse = ", "), ". Please select numeric variables.", sep = "")))
}
"available"
})
## 10. 计算核心
.outlier <- reactive({
req(outlier_available() == "available")
do.call(outlier, outlier_inputs())
})
## 11. Summary输出
.summary_outlier <- reactive({
req(outlier_available() == "available")
summary(.outlier())
})
## 12. Plot输出
.plot_outlier <- reactive({
req(outlier_available() == "available")
validate(need(input$outlier_plots, i18n$t("Please select at least one plot type first.")))
withProgress(message = i18n$t("Generating outlier plots..."), value = 0.5, {
p <- plot(.outlier(), plots = input$outlier_plots, shiny = TRUE)
setProgress(value = 1)
})
p
})
## 13. 下载与截图
download_handler(
id = "dlp_outlier",
fun = function(file) {
plot_obj <- .plot_outlier()
width_in <- outlier_plot_width() / 96
height_in <- outlier_plot_height() / 96
ggsave(
filename = file,
plot = plot_obj,
width = width_in,
height = height_in,
device = "png",
dpi = 300,
limitsize = FALSE,
bg = "white"
)
},
fn = function() paste0(input$dataset, "_outlier_analysis"),
type = "png",
caption = i18n$t("Save outlier plots")
)
## 14. 报告生成
outlier_report <- function() {
req(outlier_available() == "available")
figs <- length(input$outlier_plots) > 0
update_report(
inp_main = clean_args(outlier_inputs(), outlier_args),
fun_name = "outlier",
inp_out = if (figs) list("", list(plots = input$outlier_plots)) else list(""),
outputs = if (figs) c("summary", "plot") else "summary",
figs = figs,
fig.width = outlier_plot_width(),
fig.height = outlier_plot_height()
)
}
## 15. 截图功能
observeEvent(input$outlier_report, {
r_info[["latest_screenshot"]] <- NULL
outlier_report()
})
observeEvent(input$outlier_screenshot, {
r_info[["latest_screenshot"]] <- NULL
radiant_screenshot_modal("modal_outlier_screenshot")
})
observeEvent(input$modal_outlier_screenshot, {
outlier_report()
removeModal()
})
xxxxxmiss
\ No newline at end of file
xxxxxout
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment