Commit d68fce7d authored by wuzekai's avatar wuzekai

update:更新了不同的代码生成大模型

parent ab43003a
# === 配置 ===
OLLAMA_URL <- "http://172.29.2.110:8139/api/generate"
MODEL_ID <- "qwen3-coder:30b"
OLLAMA_URL <- "http://180.169.131.147:8139/api/generate"
MODEL_ID <- "qwen3-coder:30b"
# === 单次对话 ===
#' @export
metrics_completion <- function(user_prompt) {
# 构建 Ollama 请求体
req <- httr2::request(OLLAMA_URL) %>%
httr2::req_headers(
"Content-Type" = "application/json"
) %>%
httr2::req_body_json(list(
model = MODEL_ID,
prompt = user_prompt,
stream = FALSE
))
resp <- httr2::req_perform(req)
body <- httr2::resp_body_json(resp)
Sys.unsetenv("http_proxy")
Sys.unsetenv("https_proxy")
# 构建请求体
req_body <- list(
model = MODEL_ID,
prompt = user_prompt,
stream = FALSE
)
# 使用 curl 包发送请求
h <- curl::new_handle()
curl::handle_setheaders(h, "Content-Type" = "application/json")
curl::handle_setopt(h, postfields = jsonlite::toJSON(req_body, auto_unbox = TRUE))
curl::handle_setopt(h, timeout = 60)
con <- curl::curl(OLLAMA_URL, handle = h)
result <- readLines(con, warn = FALSE)
close(con)
# 解析响应
body <- jsonlite::fromJSON(paste(result, collapse = ""))
# 解析 Ollama 响应
if (is.null(body$response) || trimws(body$response) == "")
stop("Ollama API 返回空内容:", jsonlite::toJSON(body))
stop("Ollama API 返回空内容:", paste(result, collapse = ""))
body$response # 返回 Ollama 生成的核心代码/结果
body$response
}
# === 构造发给模型的 Prompt ===
#' @export
build_metrics_prompt <- function(user_prompt, data_call) {
......@@ -43,14 +54,15 @@ build_metrics_prompt <- function(user_prompt, data_call) {
- 变量适配:严格匹配方法对变量类型的要求(如 t检验要求连续变量,卡方检验要求分类变量,回归分析因变量需连续),类型不匹配则触发“无法计算”。
- 缺失值处理:所有涉及数据计算的函数必须加 na.rm=TRUE(方法不支持则除外)。
3. 默认参数:用户未指定时,按科研规范设默认值并在结果中体现:
- 置信水平默认 0.95(conf.level=0.95
- 多重比较调整方法默认 p.adjust.method=none
- 假设检验默认双侧检验(alternative=two.sided
- 置信水平默认 0.95(conf.level='0.95'
- 多重比较调整方法默认(p.adjust.method='none')
- 假设检验默认双侧检验(alternative='two.sided'
〓 技术规范(确保结果结构化、可展示)〓
1. 数据集已读入为:%s(直接用 df$列名 引用变量,无需重复读入数据)。
1. 数据集已读入为:%s(直接用 df$列名 引用变量)。
2. 结果输出要求:
- 优先用 broom::tidy() 或 broom::glance() 将统计结果转为结构化数据框(包含统计量、p值、置信区间、自由度、显著性标记等核心指标),再用 print() 输出。
- 若 broom 包不支持该方法(如部分复杂模型),直接 print(统计函数结果),确保保留显著性标记(***、**、*)。
- 禁止使用broom::tidy()、data.frame()等函数将统计结果转为结构化数据框,必须保持原始统计结果
- 必须将统计结果的关键指标通过 cat() 函数逐行输出,确保每行一个指标,清晰可读。最后一行需要将统计的原始结果print出来。
- 输出必须包含以下上下文信息:数据集名称(来自进行统计计算的 dataset)、变量名称(如 age, charges等变量原名称)、检验类型、原假设、样本信息(n, 缺失值数量)、关键统计量(t, df, p, CI, 均值等)
3. 函数规范:统计函数必须写全名+完整括号(如 t.test()、lm()、chisq.test()),禁止省略括号或简写。
4. 变量校验:自动校验变量存在性和类型(如分组变量必须是因子/字符型,连续变量不能用于卡方检验),校验失败则输出“无法计算”。
用户请求:%s",
......@@ -74,9 +86,6 @@ metrics_generate <- function(prompt, dataset, envir = parent.frame()) {
type = "empty",
auto_run = FALSE))
}
r_code <- paste0(data_call, "\n", r_code)
has_gg <- grepl("ggplot\\(|geom_", r_code)
has_tbl <- grepl("data\\.frame\\(|tibble\\(|tbl_summary|tableOne|CreateTableOne|t.test|lm|cor.test|anova", r_code)
type <- if (has_gg) "plot" else if (has_tbl) "table" else "text"
......@@ -91,6 +100,11 @@ metrics_get_data_call <- function(dataset, envir) {
}
#' @export
metrics_run_code <- function(r_code, envir = parent.frame()) {
eval(parse(text = r_code), envir = envir)
output_lines <- capture.output({
eval(parse(text = r_code), envir = envir)
})
paste(output_lines, collapse = "\n")
}
......@@ -104,38 +104,82 @@ output$field_info_display <- renderUI({
# 获取并编码字段信息
get_field_info <- function() {
# 1. 校验数据集是否存在
if (is.null(input$dataset) || !exists("r_data")) {
return(NULL)
}
if (is.null(input$dataset) || !exists("r_data")) return(NULL)
# 2. 尝试获取数据集
df <- tryCatch({
get(input$dataset, envir = r_data)
}, error = function(e) NULL)
df <- tryCatch(get(input$dataset, envir = r_data), error = function(e) NULL)
if (is.null(df) || !is.data.frame(df) || nrow(df) == 0) return(NULL)
# 3. 数据集为空则返回NULL
if (is.null(df) || !is.data.frame(df) || nrow(df) == 0) {
return(NULL)
}
fields_summary <- list()
# 4. 构建字段名-类型映射
fields_list <- list()
for (col_name in names(df)) {
fields_list[[col_name]] <- class(df[[col_name]])[1]
for (col in names(df)) {
x <- df[[col]]
cls <- class(x)[1]
# 提取非 NA 的值
valid_x <- x[!is.na(x)]
if (cls %in% c("numeric", "double")) {
if (length(valid_x) == 0) {
fields_summary[[col]] <- list(type = cls)
} else {
fields_summary[[col]] <- list(
type = cls,
min = round(min(valid_x), 3),
max = round(max(valid_x), 3),
mean = round(mean(valid_x), 3)
)
}
} else if (cls == "integer") {
if (length(valid_x) == 0) {
fields_summary[[col]] <- list(type = cls, levels = list(), n_levels = 0L)
} else {
unique_vals <- sort(unique(valid_x))
n_unique <- length(unique_vals)
if (n_unique <= 10) {
levs_out <- unique_vals
} else {
levs_out <- c(unique_vals[1:10], "...")
}
fields_summary[[col]] <- list(
type = cls,
levels = levs_out,
n_levels = n_unique
)
}
} else {
# factor 或 character
if (length(valid_x) == 0) {
fields_summary[[col]] <- list(type = cls, levels = list(), n_levels = 0L)
} else {
if (is.factor(x)) {
# 只取实际出现的值
unique_vals <- sort(unique(as.character(valid_x)))
} else {
unique_vals <- sort(unique(as.character(valid_x)))
}
n_unique <- length(unique_vals)
if (n_unique <= 10) {
levs_out <- unique_vals
} else {
levs_out <- c(unique_vals[1:10], "...")
}
fields_summary[[col]] <- list(
type = cls,
levels = levs_out,
n_levels = n_unique
)
}
}
}
# 5. 构建JSON结构
json_struct <- list(
dataset_name = input$dataset,
fields = fields_list
)
# 6. JSON序列化+URL编码
URLencode(
jsonlite::toJSON(json_struct, auto_unbox = TRUE),
reserved = TRUE
dataset_name = input$dataset,
n_rows = nrow(df),
fields = fields_summary
)
URLencode(jsonlite::toJSON(json_struct, auto_unbox = TRUE, digits = 3), reserved = TRUE)
}
# 无数据时的UI
......
......@@ -2,182 +2,6 @@
library(shinyjs)
library(shinyAce)
# 新增:统计结果文本格式化函数
format_stat_text <- function(result, dataset_name) {
# 1. 处理“无法计算”的情况
if (is.character(result) && grepl("无法计算", result)) {
return(result)
}
# 2. 单样本t检验(htest类)
if (inherits(result, "htest") && grepl("One Sample t-test", result$method)) {
# 提取关键指标
var_name <- gsub("^x = df\\$|^df\\$", "", as.character(result$data.name))
mean_val <- round(mean(result$data, na.rm = TRUE), 3)
n_val <- length(na.omit(result$data))
n_missing <- sum(is.na(result$data))
sd_val <- round(sd(result$data, na.rm = TRUE), 3)
se_val <- round(result$stderr, 3)
me_val <- round(se_val * qt(0.975, df = result$parameter), 3) # 边际误差
diff_val <- round(result$estimate, 3)
t_val <- round(result$statistic, 3)
p_val <- if (result$p.value < 0.001) "< .001" else round(result$p.value, 4)
df_val <- result$parameter
ci_lower <- round(result$conf.int[1], 3)
ci_upper <- round(result$conf.int[2], 3)
conf_level <- round(attr(result$conf.int, "conf.level") * 100, 1)
# 显著性标记
signif <- if (p_val == "< .001") "***" else if (p_val <= 0.001) "***" else if (p_val <= 0.01) "**" else if (p_val <= 0.05) "*" else if (p_val <= 0.1) "." else " "
# 组织文本(完全匹配示例格式)
text <- sprintf(paste0(
"Single mean test\n",
"Data : %s \n",
"Variable : %s \n",
"Confidence: %.1f%% \n",
"Null hyp. : the mean of %s = %.1f \n",
"Alt. hyp. : the mean of %s is not equal to %.1f \n",
" mean n n_missing sd se me\n",
" %6.3f %5s %d %6.3f %5.3f %5.3f\n",
" diff se t.value p.value df 2.5%% 97.5%% \n",
" %6.3f %5.3f %6.3f %8s %5.0f %6.3f %6.3f %s\n",
"Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
),
dataset_name, var_name, conf_level, var_name, result$null.value,
var_name, result$null.value, mean_val, format(n_val, big.mark = ","),
n_missing, sd_val, se_val, me_val, diff_val, se_val, t_val, p_val,
df_val, ci_lower, ci_upper, signif
)
return(text)
}
# 3. 独立样本t检验(htest类)
if (inherits(result, "htest") && grepl("Welch Two Sample t-test|Two Sample t-test", result$method)) {
# 提取分组变量和数值变量
data_parts <- strsplit(result$data.name, " ~ ")[[1]]
var_name <- data_parts[1]
group_var <- data_parts[2]
group_vals <- levels(factor(df[[group_var]]))
# 分组统计
group1_data <- df[[var_name]][df[[group_var]] == group_vals[1]]
group2_data <- df[[var_name]][df[[group_var]] == group_vals[2]]
mean1 <- round(mean(group1_data, na.rm = TRUE), 3)
mean2 <- round(mean(group2_data, na.rm = TRUE), 3)
n1 <- length(na.omit(group1_data))
n2 <- length(na.omit(group2_data))
n_missing1 <- sum(is.na(group1_data))
n_missing2 <- sum(is.na(group2_data))
sd1 <- round(sd(group1_data, na.rm = TRUE), 3)
sd2 <- round(sd(group2_data, na.rm = TRUE), 3)
se1 <- round(sd1 / sqrt(n1), 3)
se2 <- round(sd2 / sqrt(n2), 3)
me1 <- round(se1 * qt(0.975, df = n1 - 1), 3)
me2 <- round(se2 * qt(0.975, df = n2 - 1), 3)
# 检验结果
diff_val <- round(result$estimate[1] - result$estimate[2], 3)
t_val <- round(result$statistic, 3)
p_val <- if (result$p.value < 0.001) "< .001" else round(result$p.value, 4)
df_val <- round(result$parameter, 0)
ci_lower <- round(result$conf.int[1], 3)
ci_upper <- round(result$conf.int[2], 3)
conf_level <- round(attr(result$conf.int, "conf.level") * 100, 1)
signif <- if (p_val == "< .001") "***" else if (p_val <= 0.001) "***" else if (p_val <= 0.01) "**" else if (p_val <= 0.05) "*" else if (p_val <= 0.1) "." else " "
# 组织文本
text <- sprintf(paste0(
"Pairwise mean comparisons (t-test)\n",
"Data : %s \n",
"Variables : %s, %s \n",
"Samples : independent \n",
"Confidence: %.1f%% \n",
"Adjustment: None \n",
" mean n n_missing sd se me\n",
" %6s %6.3f %5s %d %6.3f %5.3f %5.3f\n",
" %6s %6.3f %5s %d %6.3f %5.3f %5.3f\n",
" Null hyp. Alt. hyp. diff p.value \n",
" %s = %s %s not equal to %s %6.3f %8s %s\n",
"Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
),
dataset_name, group_var, var_name, conf_level,
group_vals[1], mean1, format(n1, big.mark = ","), n_missing1, sd1, se1, me1,
group_vals[2], mean2, format(n2, big.mark = ","), n_missing2, sd2, se2, me2,
group_vals[1], group_vals[2], group_vals[1], group_vals[2], diff_val, p_val, signif
)
return(text)
}
# 4. 线性回归(lm类)
if (inherits(result, "lm")) {
# 提取模型信息
model_formula <- as.character(result$call$formula)
var_dep <- model_formula[2]
vars_indep <- gsub("^ ~ ", "", model_formula[1])
# 系数表
coef_sum <- summary(result)$coefficients
coef_text <- " Estimate Std.Error t.value p.value Signif.\n"
for (i in 1:nrow(coef_sum)) {
term <- rownames(coef_sum)[i]
est <- round(coef_sum[i, 1], 4)
se <- round(coef_sum[i, 2], 4)
t_val <- round(coef_sum[i, 3], 3)
p_val <- if (coef_sum[i, 4] < 0.001) "< .001" else round(coef_sum[i, 4], 4)
signif <- if (p_val == "< .001") "***" else if (p_val <= 0.001) "***" else if (p_val <= 0.01) "**" else if (p_val <= 0.05) "*" else if (p_val <= 0.1) "." else " "
coef_text <- paste0(coef_text, sprintf(" %12s %8.4f %8.4f %6.3f %8s %3s\n", term, est, se, t_val, p_val, signif))
}
# 模型拟合度
r2 <- round(summary(result)$r.squared, 4)
r2_adj <- round(summary(result)$adj.r.squared, 4)
n_val <- nrow(result$model)
n_missing <- nrow(df) - n_val
# 组织文本
text <- sprintf(paste0(
"Linear Regression Model\n",
"Data : %s \n",
"Formula : %s ~ %s \n",
"Dependent : %s \n",
"Independent: %s \n",
"Sample : n = %s, n_missing = %d \n",
"Fit : R² = %.4f, Adjusted R² = %.4f \n",
"%s",
"Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
),
dataset_name, var_dep, vars_indep, var_dep, vars_indep,
format(n_val, big.mark = ","), n_missing, r2, r2_adj, coef_text
)
return(text)
}
# 5. 其他统计方法(卡方检验、方差分析等,按示例风格扩展)
if (inherits(result, "htest") && grepl("Chi-squared test", result$method)) {
# 卡方检验文本格式化(类似逻辑,提取列联表、卡方值、p值等)
data_parts <- strsplit(result$data.name, " ~ ")[[1]]
var1 <- data_parts[1]
var2 <- data_parts[2]
contingency <- table(df[[var1]], df[[var2]], useNA = "no")
chi_val <- round(result$statistic, 3)
df_val <- result$parameter
p_val <- if (result$p.value < 0.001) "< .001" else round(result$p.value, 4)
signif <- if (p_val == "< .001") "***" else if (p_val <= 0.001) "***" else if (p_val <= 0.01) "**" else if (p_val <= 0.05) "*" else if (p_val <= 0.1) "." else " "
text <- sprintf(paste0(
"Chi-squared Test of Independence\n",
"Data : %s \n",
"Variables : %s (row), %s (column) \n",
"Contingency Table:\n%s\n",
"Test Statistic: Chi-squared = %.3f, df = %.0f, p-value = %s %s\n",
"Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
),
dataset_name, var1, var2, capture.output(print(contingency)),
chi_val, df_val, p_val, signif
)
return(text)
}
# 6. 直接输出原始结果
return(capture.output(print(result)))
}
## ==================== 右下角浮框 ====================
ui_metrics_progress <- tags$div(
id = "metrics_progress_box",
......@@ -188,7 +12,7 @@ ui_metrics_progress <- tags$div(
background:#f5f5f5; color:#333;
border:1px solid #337ab7; border-radius:4px;
padding:10px 15px; box-shadow:0 2px 8px rgba(0,0,0,.25);",
tags$strong(i18n$t("AI calculating...")),
tags$strong(i18n$t("AI generating...")),
tags$div(class = "progress",
tags$div(class = "progress-bar progress-bar-striped active",
style = "width:100%"))
......@@ -341,12 +165,12 @@ observeEvent(input$metrics_run_code, {
# 分支2:所有统计结果转为结构化文本
else {
r_values$metrics_result_type <- "text"
# 调用格式化函数,生成示例风格的文本
formatted_text <- format_stat_text(result, dataset_name)
# 输出文本(保留换行和空格对齐)
output$metrics_result_text <- renderText(paste(formatted_text, collapse = "\n"))
if (is.null(result) || result == "") {
output$metrics_result_text <- renderText(i18n$t("No output generated."))
} else {
output$metrics_result_text <- renderText(result)
}
}
r_values$metrics_result_ready <- TRUE
}, error = function(e) {
r_values$metrics_result_type <- "error"
......
xxxxxx
\ No newline at end of file
> 大模型生成描述性统计(指标)
## 使用方法
以下是 `大模型生成描述性统计`的使用方法。
1.`数据对象`必须要有,且必须和数据集中的名字一致(大小写也一致)
2.`统计方法`必须要有,比如t检验、卡方检验等。
3.`生成代码`如果有误,或者想要修改,可以点击`编辑`按钮对R代码进行修改,保存后点击`运行`按钮即可。
## 示例
**1. 单变量推断**
-`age` 进行单样本 t 检验,原假设总体均值为 35。
- 检验 `bmi` 的平均值是否显著高于 25(单侧 t 检验)。
**2. 两组比较**
- 比较 `smoker` = "yes" 和 "no" 两组的 `charges` 均值差异(独立样本 t 检验)。
- 使用 Wilcoxon 秩和检验,比较男性与女性(`sex`)的 `bmi` 中位数是否不同。
**3.分类变量关联**
- 执行卡方独立性检验,判断 `sex``smoker` 是否相关。
- 分析 `region``smoker` 的列联表,进行卡方检验。
**4.多组比较**
-`charges``region` 分组进行单因素方差分析(one-way ANOVA)。
- 使用 Kruskal-Wallis 检验,比较不同 `children` 数量(0~5)对应的 `charges` 分布差异。
**5.相关与回归**
- 计算 `age``charges` 的 Pearson 相关系数并检验显著性。
- 拟合线性回归模型、以 `charges` 为因变量,`age``bmi``children``smoker` 为自变量。
- 在控制 `age``bmi` 的情况下,检验 `smoker``charges` 的偏回归系数是否显著(多元线性回归)。
**6.交互效应**
- 通过双因素方差分析,检验 `smoker``sex``charges` 是否存在交互作用。
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment