############################################ ## Outlier Analysis (IQR/Z-score Method) ############################################ #' @export outlier <- function(dataset, vars, method = c("iqr", "zscore"), iqr_multiplier = 1.5, z_threshold = 3, data_filter = "", envir = parent.frame()) { # 1. 基础参数处理 method <- match.arg(method, choices = c("iqr", "zscore")) df_name <- if (is_string(dataset)) dataset else deparse(substitute(dataset)) # 2. 数据提取:仅保留选择的变量+过滤数据 dataset <- get_data( dataset, vars = vars, filt = data_filter, na.rm = FALSE, # 保留缺失值,后续标记为非离群值 envir = envir ) # 3. 数据校验 if (length(vars) == 0) stop("Please select at least one numeric variable.", call. = FALSE) if (!all(vars %in% colnames(dataset))) { stop(paste("Variables not found in dataset:", paste(setdiff(vars, colnames(dataset)), collapse = ", ")), call. = FALSE) } # 4. 离群值计算 outlier_results <- list() for (var in vars) { var_data <- dataset[[var]] valid_data <- var_data[!is.na(var_data)] # 仅对非缺失值计算 if (length(valid_data) < 5) { outlier_results[[var]] <- list( overview = tibble::tibble( var = var, n_total = length(var_data), n_valid = length(valid_data), n_outlier = 0, outlier_pct = 0, lower_bound = NA_real_, upper_bound = NA_real_ ), # 强制deviation为character(0) details = tibble::tibble( row_idx = integer(0), value = numeric(0), deviation = character(0) # 明确指定字符型空向量 ) ) next } # 调用辅助函数计算离群值 if (method == "iqr") { res <- calc_outlier_iqr(valid_data, multiplier = iqr_multiplier) } else { res <- calc_outlier_zscore(valid_data, threshold = z_threshold) } # 匹配原始行索引 outlier_idx <- which(!is.na(var_data) & var_data %in% res$outlier_values) # 当无离群值时,deviation仍为character(0) deviation <- if (length(outlier_idx) == 0) { character(0) } else { ifelse(var_data[outlier_idx] < res$lower_bound, "Below Lower Bound", "Above Upper Bound") } # 整理结果 outlier_results[[var]] <- list( overview = tibble::tibble( var = var, n_total = length(var_data), n_valid = length(valid_data), n_outlier = length(res$outlier_values), outlier_pct = round(length(res$outlier_values)/length(valid_data)*100, 2), lower_bound = round(res$lower_bound, 3), upper_bound = round(res$upper_bound, 3) ), details = tibble::tibble( row_idx = outlier_idx, value = var_data[outlier_idx], deviation = deviation ) ) } # 5. 结果打包 out <- structure( list( df_name = df_name, vars = vars, method = method, params = list( iqr_multiplier = if (method == "iqr") iqr_multiplier else NA, z_threshold = if (method == "zscore") z_threshold else NA ), data_filter = if (data_filter == "") "None" else data_filter, results = outlier_results, # 每个变量的离群值结果 raw_data = dataset # 原始数据 ), class = "outlier" ) out } # ------------------------------ # 辅助函数1:IQR法计算离群值 # ------------------------------ calc_outlier_iqr <- function(data, multiplier = 1.5) { q1 <- stats::quantile(data, 0.25, na.rm = TRUE) q3 <- stats::quantile(data, 0.75, na.rm = TRUE) iqr_val <- q3 - q1 lower <- q1 - multiplier * iqr_val upper <- q3 + multiplier * iqr_val outlier_values <- data[data < lower | data > upper] list( lower_bound = lower, upper_bound = upper, outlier_values = outlier_values ) } # ------------------------------ # 辅助函数2:Z-score法计算离群值 # ------------------------------ calc_outlier_zscore <- function(data, threshold = 3) { mean_val <- mean(data, na.rm = TRUE) sd_val <- stats::sd(data, na.rm = TRUE) z_scores <- (data - mean_val) / sd_val lower <- mean_val - threshold * sd_val upper <- mean_val + threshold * sd_val outlier_values <- data[abs(z_scores) > threshold] list( lower_bound = lower, upper_bound = upper, outlier_values = outlier_values, mean = mean_val, sd = sd_val ) } # ------------------------------ # Summary方法:展示离群值概览 # ------------------------------ #' @export summary.outlier <- function(object, dec = 3, ...) { # 1. 基础信息 cat("Outlier Analysis Results\n") cat("Data :", object$df_name, "\n") cat("Variables :", paste(object$vars, collapse = ", "), "(numeric)\n") cat("Method :", if (object$method == "iqr") paste("IQR Method (Multiplier =", object$params$iqr_multiplier, ")") else paste("Z-score Method (Threshold =", object$params$z_threshold, ")"), "\n") cat("Filter :", object$data_filter, "\n\n") # 2. 离群值概览表 cat("=== 1. Outlier Overview ===\n") overview_df <- purrr::map_dfr(object$results, ~ .x$overview) overview_formatted <- overview_df %>% dplyr::mutate( outlier_pct = paste0(outlier_pct, "%"), lower_bound = as.character(round(lower_bound, dec)), upper_bound = as.character(round(upper_bound, dec)) ) %>% dplyr::rename( "Variable" = var, "Total Samples" = n_total, "Valid Samples" = n_valid, "Outlier Count" = n_outlier, "Outlier %" = outlier_pct, "Lower Bound" = lower_bound, "Upper Bound" = upper_bound ) %>% as.data.frame(stringsAsFactors = FALSE) print(overview_formatted, row.names = FALSE, right = FALSE) cat("\n") # 3. 离群值明细提示 cat("=== 2. Outlier Details (By Variable) ===\n") for (var in object$vars) { details <- object$results[[var]]$details if (nrow(details) == 0) { cat(paste("•", var, ": No outliers detected\n")) } else { cat(paste("•", var, ":", nrow(details), "outliers\n")) details_formatted <- details %>% dplyr::mutate( value = round(value, dec) ) %>% dplyr::rename( "Row Index" = row_idx, "Value" = value, "Deviation" = deviation ) print(details_formatted, row.names = FALSE, right = FALSE) cat("\n") } } # 4. 方法说明 cat("=== 3. Method Explanation ===\n") if (object$method == "iqr") { cat("• IQR Method: Outliers are values outside [Q1 - k×IQR, Q3 + k×IQR] (k =", object$params$iqr_multiplier, ")\n") cat("• Q1 = 25th percentile, Q3 = 75th percentile, IQR = Q3 - Q1\n") } else { cat("• Z-score Method: Outliers are values with |Z-score| >", object$params$z_threshold, "\n") cat("• Z-score = (Value - Mean) / Standard Deviation\n") } invisible(object) } # ------------------------------ # Plot方法:可视化离群值 # ------------------------------ #' @export plot.outlier <- function(x, plots = c("boxplot", "histogram", "scatter"), shiny = FALSE, custom = FALSE, ...) { # 1. 基础校验 if (length(plots) == 0) { return(ggplot2::ggplot() + ggplot2::annotate("text", x = 1, y = 1, label = i18n$t("No plots selected")) + ggplot2::theme_void()) } plot_list <- list() vars <- x$vars raw_data <- x$raw_data %>% tidyr::pivot_longer(cols = all_of(vars), names_to = "variable", values_to = "value") # 长格式便于分面 # 2. 箱线图 if ("boxplot" %in% plots) { raw_data <- raw_data %>% dplyr::mutate(variable = as.factor(variable)) p <- ggplot2::ggplot(raw_data, ggplot2::aes(x = 1, y = value, fill = variable)) + ggplot2::geom_boxplot(alpha = 0.7, outlier.color = "red", outlier.size = 2, show.legend = FALSE) + ggplot2::scale_fill_brewer(palette = "Set2") + ggplot2::facet_wrap(~variable, scales = "free_y", ncol = 2) + ggplot2::labs( x = "", y = i18n$t("Value"), title = i18n$t("Outlier Detection: Boxplot (Per Variable)") ) + ggplot2::theme_minimal() + ggplot2::theme( axis.text.x = ggplot2::element_blank(), strip.text = ggplot2::element_text(size = 11), plot.title = ggplot2::element_text(hjust = 0.5, size = 12), panel.spacing = ggplot2::unit(1, "cm") ) plot_list[["boxplot"]] <- p } # 3. 直方图 if ("histogram" %in% plots) { # 合并所有变量的界值 bound_data <- purrr::map_dfr(x$results, function(res) { tibble::tibble( variable = res$overview$var, lower_bound = res$overview$lower_bound, upper_bound = res$overview$upper_bound ) }) p <- ggplot2::ggplot(raw_data, ggplot2::aes(x = value)) + ggplot2::geom_histogram(fill = "#4287f5", alpha = 0.7, bins = 30) + ggplot2::geom_vline( data = bound_data, ggplot2::aes(xintercept = lower_bound), color = "red", linetype = "dashed", linewidth = 1 ) + ggplot2::geom_vline( data = bound_data, ggplot2::aes(xintercept = upper_bound), color = "red", linetype = "dashed", linewidth = 1 ) + ggplot2::facet_wrap(~variable, scales = "free") + ggplot2::labs(x = i18n$t("Value"), y = i18n$t("Count"), title = i18n$t("Outlier Detection: Histogram (Red Dashed = Thresholds)")) + ggplot2::theme_minimal() + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 12)) plot_list[["histogram"]] <- p } # 4. 散点图 if ("scatter" %in% plots) { if (length(vars) >= 2) { # 变量数≥2:正常生成散点图 outlier_row_idx <- purrr::map(x$results, ~ .x$details$row_idx) outlier_row_idx <- unique(unlist(outlier_row_idx)) # 取前两个变量做散点图 var1 <- vars[1] var2 <- vars[2] # 构建散点图数据 scatter_data <- x$raw_data %>% dplyr::mutate( row_idx = dplyr::row_number(), is_outlier = row_idx %in% outlier_row_idx ) %>% dplyr::select(row_idx, all_of(c(var1, var2)), is_outlier) p <- ggplot2::ggplot(scatter_data, ggplot2::aes(x = .data[[var1]], y = .data[[var2]], color = is_outlier)) + ggplot2::geom_point(alpha = 0.7, size = 1.5) + ggplot2::scale_color_manual(values = c("black", "red"), labels = c("Normal", "Outlier")) + ggplot2::labs(x = var1, y = var2, color = i18n$t("Type"), title = i18n$t("Outlier Detection: Scatter Plot")) + ggplot2::theme_minimal() + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 12)) } else { p <- ggplot2::ggplot() + ggplot2::annotate("text", x = 1, y = 1, label = i18n$t("Scatter Plot requires at least 2 numeric variables.\nPlease select more variables."), size = 4.5, color = "#666666") + ggplot2::labs(title = i18n$t("Outlier Detection: Scatter Plot")) + ggplot2::theme_minimal() + ggplot2::theme( plot.title = ggplot2::element_text(hjust = 0.5, size = 12), axis.text = ggplot2::element_blank(), axis.title = ggplot2::element_blank(), panel.grid = ggplot2::element_blank() ) } plot_list[["scatter"]] <- p } # 5. 组合图表 combined_plot <- patchwork::wrap_plots(plot_list[plots], ncol = 1, guides = "collect") # 6. 输出 if (shiny) { print(combined_plot) return(invisible(combined_plot)) } else { return(combined_plot) } }