# Benchmark of grouped counts, grouped means and grouped weighted means
# across base R, dplyr, data.table, polars, collapse and duckdb, for
# data sizes from 1e5 to 1e8 rows.
#
# We run the code in a fresh session
library(tidyverse)
library(duckdb)
library(data.table)
library(collapse)
library(polars)

# Use the same number of threads for every multi-threaded backend.
polars_info() # 8 threads
setDTthreads(8)
con <- dbConnect(duckdb(config = list(threads = "8")))

set.seed(1)

N <- 10^(5:8)
m_queries <- 3

# One slot per (data size, query) combination, filled inside the loop.
results <- vector("list", length(N) * m_queries)

for (i in seq_along(N)) {
  n <- N[i]

  # Slots offset + 1 .. offset + m_queries belong to data size i.
  offset <- (i - 1) * m_queries

  # Create data
  y <- rexp(n)
  w <- runif(n)
  g <- factor(sample(LETTERS, n, replace = TRUE))

  df <- tibble(y = y, g = g, w = w)
  dt <- data.table(df)
  dfp <- as_polars_df(df)
  duckdb_register(con, name = "df", df = df, overwrite = TRUE)

  # Grouped counts
  # check = FALSE: the approaches return differently shaped objects
  # (plain vectors, tibbles, data.tables, ...), so bench::mark() must not
  # compare their results for equality.
  results[[offset + 1]] <- bench::mark(
    base = tabulate(g),
    dplyr = dplyr::count(df, g),
    data.table = dt[, .N, by = g],
    polars = dfp$get_column("g")$value_counts(),
    collapse = fcount(g),
    duckdb = dbGetQuery(con, "SELECT g, COUNT(*) N FROM df GROUP BY g"),
    check = FALSE,
    min_iterations = 3
  ) |>
    bind_cols(n = n, query = "counts")

  # Grouped means
  results[[offset + 2]] <- bench::mark(
    base = rowsum(y, g) / tabulate(g),
    dplyr = df |> group_by(g) |> summarize(mean(y)),
    data.table = dt[, mean(y), by = g],
    polars = dfp$select(c("g", "y"))$group_by("g")$mean(),
    collapse = fmean(y, g = g),
    duckdb = dbGetQuery(con, "SELECT g, AVG(y) AS mean FROM df GROUP BY g"),
    check = FALSE,
    min_iterations = 3
  ) |>
    bind_cols(n = n, query = "means")

  # Grouped weighted means: sum(w * y) / sum(w) per group
  results[[offset + 3]] <- bench::mark(
    base = {
      ws <- rowsum(data.frame(y = y * w, w), g)
      ws[, 1L] / ws[, 2L]
    },
    dplyr = df |> group_by(g) |> summarize(sum(w * y) / sum(w)),
    data.table = dt[, sum(w * y) / sum(w), by = g],
    polars = (
      dfp$
        with_columns(pl$col("y") * pl$col("w"))$
        group_by("g")$
        sum()$
        with_columns(pl$col("y") / pl$col("w"))$
        drop("w")
    ),
    collapse = fmean(y, g = g, w = w),
    duckdb = dbGetQuery(
      con,
      "SELECT g, SUM(y * w) / sum(w) as wmean FROM df GROUP BY g"
    ),
    check = FALSE,
    min_iterations = 3
  ) |>
    bind_cols(n = n, query = "weighted means")
}

# Stack all benchmark tables and express each approach's median time
# relative to the fastest approach within the same (n, query) cell.
results_df <- bind_rows(results) |>
  group_by(n, query) |>
  mutate(
    time = median,
    approach = as.character(expression),
    relative = as.numeric(time / min(time))
  ) |>
  ungroup()

# Relative timings, one panel per data size
ggplot(
  results_df,
  aes(y = relative, x = query, group = approach, color = approach)
) +
  geom_point() +
  geom_line() +
  facet_wrap("n", scales = "free_y") +
  labs(x = element_blank(), y = "Relative timings") +
  theme_gray(base_size = 14)

# Absolute timings, one panel per data size
ggplot(
  results_df,
  aes(y = time, x = query, group = approach, color = approach)
) +
  geom_point() +
  geom_line() +
  facet_wrap("n", scales = "free_y") +
  labs(x = element_blank(), y = "Absolute time in seconds") +
  theme_gray(base_size = 14)
Numărări și medii pe grupuri, rapide, în R
Pe același subiect