others - R - 使用自定义函数进行聚合(乘数取决于另一列的值)

我有一个表是这样的。


benchmark technqiue stat value



perlbench compression encoding_Zero 10


perlbench compression encoding_Repeated_Values 20


perlbench compression encoding_Base8_1 30


perlbench compression encoding_Base8_2 40


perlbench compression encoding_Base8_4 50


perlbench compression encoding_Base4_1 60


perlbench compression encoding_Base4_2 70


perlbench compression encoding_Base2_1 80


perlbench compression encoding_Uncompressed 90



我想按组计算的函数类似于:compressed_size = (10*1 + 20*8 + 30*16 + ... + 90*64)

时间:

可以创建一个函数,将各个值乘以对应的乘数并求和,然后在 aggregate 中使用它:


#' Weighted sum of a vector using position-based multipliers.
#'
#' The first element is multiplied by 1. With m = number of remaining
#' elements, the k-th remaining element is multiplied by k * m. For a
#' 9-element input the multipliers are therefore 1, 8, 16, ..., 64.
#'
#' @param x Numeric vector; the multipliers are assigned by position, so the
#'   order of `x` matters.
#' @return A single number, `sum(x * multipliers)`.
apply_fun <- function(x) {
  n_rest <- length(x[-1])
  multipliers <- c(1, seq_len(n_rest) * n_rest)
  sum(x * multipliers)
}



# Apply apply_fun to `value` within each benchmark/technqiue combination.
aggregate(value ~ benchmark + technqiue, data = df, FUN = apply_fun)

#   benchmark   technqiue value
# 1 perlbench compression 19210



该函数也可用于 dplyr 或 data.table:


library(dplyr)


# Same aggregation with dplyr: one summary row per benchmark/technqiue pair.
df %>%
  group_by(benchmark, technqiue) %>%
  summarise(total = apply_fun(value))



library(data.table)


# Same aggregation with data.table. setDT() converts df to a data.table by
# reference. `.()` is data.table's alias for list(); wrapping the expression
# in `.()` (rather than plain parentheses) is what names the result column
# `total` instead of the default `V1`.
setDT(df)[, .(total = apply_fun(value)), by = .(benchmark, technqiue)]



数据


# Example data: a single benchmark/technqiue group with nine stat rows.
# Factor levels for `stat` are given explicitly (alphabetical order) so the
# integer codes do not depend on the session's collation.
df <- data.frame(
  benchmark = factor(rep("perlbench", 9L)),
  technqiue = factor(rep("compression", 9L)),
  stat = factor(
    c(
      "encoding_Zero", "encoding_Repeated_Values", "encoding_Base8_1",
      "encoding_Base8_2", "encoding_Base8_4", "encoding_Base4_1",
      "encoding_Base4_2", "encoding_Base2_1", "encoding_Uncompressed"
    ),
    levels = c(
      "encoding_Base2_1", "encoding_Base4_1", "encoding_Base4_2",
      "encoding_Base8_1", "encoding_Base8_2", "encoding_Base8_4",
      "encoding_Repeated_Values", "encoding_Uncompressed", "encoding_Zero"
    )
  ),
  value = c(10L, 20L, 30L, 40L, 50L, 60L, 70L, 80L, 90L)
)



这看起来是 case_when 的一个适用场景:


library(dplyr)



# Map each stat to its multiplier, then sum value * multiplier within each
# benchmark/technqiue group.
# NOTE(review): the original snippet elided the middle cases with "[...]".
# The multipliers 24 to 56 below continue the 8-step pattern (1, 8, 16, ...,
# 64) from the question's formula, which also reproduces the 19210 total
# shown for apply_fun. Confirm against the real per-encoding sizes.
df_summary <- df %>%
  mutate(
    stat_multiplier = case_when(
      stat == "encoding_Zero" ~ 1,
      stat == "encoding_Repeated_Values" ~ 8,
      stat == "encoding_Base8_1" ~ 16,
      stat == "encoding_Base8_2" ~ 24,
      stat == "encoding_Base8_4" ~ 32,
      stat == "encoding_Base4_1" ~ 40,
      stat == "encoding_Base4_2" ~ 48,
      stat == "encoding_Base2_1" ~ 56,
      stat == "encoding_Uncompressed" ~ 64,
      # Fallback: any other stat keeps its value as-is instead of becoming NA.
      TRUE ~ 1
    )
  ) %>%
  # The grouping column is spelled `technqiue` in df; use that exact name so
  # group_by() can find it.
  group_by(benchmark, technqiue) %>%
  summarise(
    compressed_size = sum(value * stat_multiplier, na.rm = TRUE)
  )



...