#' Collaborative Filtering #' #' @details See \url{https://radiant-rstats.github.io/docs/model/crs.html} for an example in Radiant #' #' @param dataset Dataset #' @param id String with name of the variable containing user ids #' @param prod String with name of the variable with product ids #' @param pred Products to predict for #' @param rate String with name of the variable with product ratings #' @param data_filter Expression entered in, e.g., Data > View to filter the dataset in Radiant. The expression should be a string (e.g., "training == 1") #' @param arr Expression to arrange (sort) the data on (e.g., "color, desc(price)") #' @param rows Rows to select from the specified dataset #' @param envir Environment to extract data from #' #' @return A data.frame with the original data and a new column with predicted ratings #' #' @seealso \code{\link{summary.crs}} to summarize results #' @seealso \code{\link{plot.crs}} to plot results if the actual ratings are available #' #' @examples #' crs(ratings, #' id = "Users", prod = "Movies", pred = c("M6", "M7", "M8", "M9", "M10"), #' rate = "Ratings", data_filter = "training == 1" #' ) %>% str() #' @importFrom dplyr distinct_at #' #' @export crs <- function(dataset, id, prod, pred, rate, data_filter = "", arr = "", rows = NULL, envir = parent.frame()) { vars <- c(id, prod, rate) df_name <- if (!is_string(dataset)) deparse(substitute(dataset)) else dataset uid <- get_data(dataset, id, filt = data_filter, arr = arr, rows = rows, na.rm = FALSE, envir = envir) %>% unique() dataset <- get_data(dataset, vars, na.rm = FALSE, envir = envir) ## creating a matrix layout ## will not be efficient for very large and sparse datasets ## improvement possible with dplyr or sparse matrix? ## make sure spread doesn't complain cn <- colnames(dataset) nr <- dplyr::distinct_at(dataset, .vars = base::setdiff(cn, rate), .keep_all = TRUE) %>% nrow() if (nr < nrow(dataset)) { return("Rows are not unique. Data not appropriate for collaborative filtering" %>% add_class("crs")) } dataset <- spread(dataset, !!prod, !!rate) %>% as.data.frame(stringsAsFactors = FALSE) idv <- select_at(dataset, .vars = id) uid <- seq_len(nrow(dataset))[idv[[1]] %in% uid[[1]]] dataset <- select_at(dataset, .vars = base::setdiff(colnames(dataset), id)) ## can use : for long sets of products to predict for if (any(grepl(":", pred))) { pred <- select( dataset[1, , drop = FALSE], !!!rlang::parse_exprs(paste0(pred, collapse = ";")) ) %>% colnames() } ## stop if insufficient overlap in ratings if (length(pred) >= (ncol(dataset) - 1)) { return("Cannot predict for all products. Ratings must overlap on at least two products." %>% add_class("crs")) } if (length(vars) < (ncol(dataset) - 1)) { vars <- evar <- colnames(dataset)[-1] } ## indices cn <- colnames(dataset) nind <- which(cn %in% pred) ind <- (seq_along(cn))[-nind] ## average scores and rankings avg <- dataset[uid, , drop = FALSE] %>% .[, nind, drop = FALSE] %>% summarise_all(mean, na.rm = TRUE) ravg <- avg ravg[1, ] <- min_rank(desc(as.numeric(avg))) ravg <- mutate_all(ravg, as.integer) ## actual scores and rankings (if available, else will be NA) act <- dataset[-uid, , drop = FALSE] %>% .[, nind, drop = FALSE] ract <- act if (nrow(act) == 0) { return("Invalid filter used. Users to predict for should not be in the training set." %>% add_class("crs")) } rank <- apply(act, 1, function(x) as.integer(min_rank(desc(x)))) %>% (function(x) if (length(pred) == 1) x else t(x)) ract[, pred] <- rank ract <- bind_cols(idv[-uid, , drop = FALSE], ract) act <- bind_cols(idv[-uid, , drop = FALSE], act) ## CF calculations per row ms <- apply(dataset[, -nind, drop = FALSE], 1, function(x) mean(x, na.rm = TRUE)) sds <- apply(dataset[, -nind, drop = FALSE], 1, function(x) sd(x, na.rm = TRUE)) ## to forego standardization # ms <- ms * 0 # sds <- sds/sds ## standardized ratings if (length(nind) < 2) { srate <- (dataset[uid, nind] - ms[uid]) / sds[uid] } else { srate <- sweep(dataset[uid, nind], 1, ms[uid], "-") %>% sweep(1, sds[uid], "/") } ## comfirmed to produce consistent results -- see cf-demo-missing-state.rda and cf-demo-missing.xlsx srate[is.na(srate)] <- 0 srate <- mutate_all(as.data.frame(srate, stringsAsFactors = FALSE), ~ ifelse(is.infinite(.), 0, .)) cors <- sshhr(cor(t(dataset[uid, ind]), t(dataset[-uid, ind]), use = "pairwise.complete.obs")) ## comfirmed to produce correct results -- see cf-demo-missing-state.rda and cf-demo-missing.xlsx cors[is.na(cors)] <- 0 dnom <- apply(cors, 2, function(x) sum(abs(x), na.rm = TRUE)) wts <- sweep(cors, 2, dnom, "/") cf <- (crossprod(wts, as.matrix(srate)) * sds[-uid] + ms[-uid]) %>% as.data.frame(stringsAsFactors = FALSE) %>% bind_cols(idv[-uid, , drop = FALSE], .) %>% set_colnames(c(id, pred)) ## Ranking based on CF rcf <- cf rank <- apply(select(cf, -1), 1, function(x) as.integer(min_rank(desc(x)))) %>% (function(x) if (length(pred) == 1) x else t(x)) rcf[, pred] <- rank recommendations <- inner_join( bind_cols( gather(act, "product", "rating", -1, factor_key = TRUE), select_at(gather(ract, "product", "ranking", -1, factor_key = TRUE), .vars = "ranking"), select_at(gather(cf, "product", "cf", -1, factor_key = TRUE), .vars = "cf"), select_at(gather(rcf, "product", "cf_rank", -1, factor_key = TRUE), .vars = "cf_rank") ), data.frame( product = names(avg) %>% factor(., levels = .), average = t(avg), avg_rank = t(ravg) ), by = "product" ) %>% arrange_at(.vars = c(id, "product")) %>% select_at(.vars = c(id, "product", "rating", "average", "cf", "ranking", "avg_rank", "cf_rank")) rm(dataset, ms, sds, srate, cors, dnom, wts, cn, ind, nind, nr, uid, idv, envir) as.list(environment()) %>% add_class("crs") } #' Summary method for Collaborative Filter #' #' @details See \url{https://radiant-rstats.github.io/docs/model/crs.html} for an example in Radiant #' #' @param object Return value from \code{\link{crs}} #' @param n Number of lines of recommendations to print. Use -1 to print all lines #' @param dec Number of decimals to show #' @param ... further arguments passed to or from other methods #' #' @seealso \code{\link{crs}} to generate the results #' @seealso \code{\link{plot.crs}} to plot results if the actual ratings are available #' #' @examples #' crs(ratings, #' id = "Users", prod = "Movies", pred = c("M6", "M7", "M8", "M9", "M10"), #' rate = "Ratings", data_filter = "training == 1" #' ) %>% summary() #' @export summary.crs <- function(object, n = 36, dec = 2, ...) { if (is.character(object)) { return(cat(object)) } cat("Collaborative filtering") cat("\nData :", object$df_name) if (!is.empty(object$data_filter)) { cat("\nFilter :", gsub("\\n", "", object$data_filter)) } if (!is.empty(object$arr)) { cat("\nArrange :", gsub("\\n", "", object$arr)) } if (!is.empty(object$rows)) { cat("\nFilter :", gsub("\\n", "", object$rows)) } cat("\nUser id :", object$id) cat("\nProduct id :", object$prod) cat("\nPredict for:", paste0(object$pred, collapse = ", "), "\n") if (nrow(object$recommendations) > n) { cat("Rows shown :", n, "out of", format_nr(nrow(object$recommendations), dec = 0), "\n") } if (nrow(object$act) > 0 && !any(is.na(object$act))) { cat("\nSummary:\n") ## From FZs do file output, calculate if actual ratings are available ## best based on highest average rating best <- which(object$ravg == 1) ar1 <- mean(object$ract[, best + 1] == 1) cat("\n- Average rating picks the best product", format_nr(ar1, dec = 1, perc = TRUE), "of the time") ## best based on cf best <- which(object$rcf == 1, arr.ind = TRUE) cf1 <- mean(object$ract[best] == 1) cat("\n- Collaborative filtering picks the best product", format_nr(cf1, dec = 1, perc = TRUE), "of the time") ## best based on highest average rating in top 3 best <- which(object$ravg == 1) ar3 <- mean(object$ract[, best + 1] < 4) cat("\n- Pick based on average rating is in the top 3 products", format_nr(ar3, dec = 1, perc = TRUE), "of the time") ## best based on cf in top 3 best <- which(object$rcf == 1, arr.ind = TRUE) cf3 <- mean(object$ract[best] < 4) cat("\n- Pick based on collaborative filtering is in the top 3 products", format_nr(cf3, dec = 1, perc = TRUE), "of the time") ## best 3 based on highest average rating contains best product best <- which(object$ravg < 4) inar3 <- mean(rowSums(object$ract[, best + 1, drop = FALSE] == 1) > 0) cat("\n- Top 3 based on average ratings contains the best product", format_nr(inar3, dec = 1, perc = TRUE), "of the time") ## best 3 based on cf contains best product best <- which(!object$rcf[, -1, drop = FALSE] < 4, arr.ind = TRUE) best[, "col"] <- best[, "col"] + 1 object$ract[best] <- NA incf3 <- mean(rowSums(object$ract == 1, na.rm = TRUE) > 0) cat("\n- Top 3 based on collaborative filtering contains the best product", format_nr(incf3, dec = 1, perc = TRUE), "of the time\n") } cat("\nRecommendations:\n\n") if (n == -1) { cat("\n") format_df(object$recommendations, dec = dec) %>% (function(x) { x[x == "NA"] <- "" x }) %>% print(row.names = FALSE) } else { head(object$recommendations, n) %>% format_df(dec = dec) %>% (function(x) { x[x == "NA"] <- "" x }) %>% print(row.names = FALSE) } } #' Plot method for the crs function #' #' @details Plot that compares actual to predicted ratings. See \url{https://radiant-rstats.github.io/docs/model/crs.html} for an example in Radiant #' #' @param x Return value from \code{\link{crs}} #' @param ... further arguments passed to or from other methods #' #' @seealso \code{\link{crs}} to generate results #' @seealso \code{\link{summary.crs}} to summarize results #' #' @export plot.crs <- function(x, ...) { if (is.character(x)) { return(x) } if (any(is.na(x$act)) || all(is.na(x$cf))) { return("Plotting for Collaborative Filter requires the actual ratings associated\nwith the predictions") } ## use quantile to avoid plotting extreme predictions lim <- quantile(x$recommendations[, c("rating", "cf")], probs = c(.025, .975), na.rm = TRUE) p <- visualize( x$recommendations, xvar = "cf", yvar = "rating", type = "scatter", facet_col = "product", check = "line", custom = TRUE ) + geom_segment(aes(x = 1, y = 1, xend = 5, yend = 5), color = "blue", linewidth = .05) + coord_cartesian(xlim = lim, ylim = lim) + labs( title = "Recommendations based on Collaborative Filtering", x = "Predicted ratings", y = "Actual ratings" ) + theme(legend.position = "none") sshhr(p) } #' Deprecated: Store method for the crs function #' #' @details Return recommendations See \url{https://radiant-rstats.github.io/docs/model/crs.html} for an example in Radiant #' #' @param dataset Dataset #' @param object Return value from \code{\link{crs}} #' @param name Name to assign to the dataset #' @param ... further arguments passed to or from other methods #' #' @export store.crs <- function(dataset, object, name, ...) { if (missing(name)) { object$recommendations } else { stop( paste0( "This function is deprecated. Use the code below instead:\n\n", name, " <- ", deparse(substitute(object)), "$recommendations\nregister(\"", name, ")" ), call. = FALSE ) } }