tapply
允許您跨矢量組應用函數,並將結果簡化爲矩陣或向量(如果可以)。使用tcrossprod
乘以每個組的所有組合,並在一些合適的大數據也表現良好:
# setup
set.seed(47)
foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3))
foo2 <- data.frame(id = sample(1000, 40000, TRUE), x = rnorm(40000))
h_OP <- function(a, b, foo){
a.diff = a - foo$x
b.diff = b - foo$x
prod = a.diff %*% t(b.diff)
id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T),0,1)) + diag(nrow(foo))
return(sum(prod * id.indicator))
}
h3_AEBilgrau <- function(a, b, foo) {
a.diff <- a - foo$x
b.diff <- b - foo$x
ids <- unique(foo$id)
res <- 0
for (i in seq_along(ids)) {
indx <- which(foo$id == ids[i])
res <- res + sum(tcrossprod(a.diff[indx], b.diff[indx]))
}
return(res)
}
h_d.b <- function(a, b, foo){
sum(sapply(split(foo, foo$id), function(d) sum(outer(a-d$x, b-d$x))))
}
h_alistaire <- function(a, b, foo){
sum(tapply(foo$x, foo$id, function(x){sum(tcrossprod(a - x, b - x))}))
}
都返回同樣的事情,而不是在小數據不同:
h_OP(0, 1, foo)
#> [1] 891
h3_AEBilgrau(0, 1, foo)
#> [1] 891
h_d.b(0, 1, foo)
#> [1] 891
h_alistaire(0, 1, foo)
#> [1] 891
# small data test
microbenchmark::microbenchmark(
h_OP(0, 1, foo),
h3_AEBilgrau(0, 1, foo),
h_d.b(0, 1, foo),
h_alistaire(0, 1, foo)
)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> h_OP(0, 1, foo) 143.749 157.8895 189.5092 189.7235 214.3115 262.258 100 b
#> h3_AEBilgrau(0, 1, foo) 80.970 93.8195 112.0045 106.9285 125.9835 225.855 100 a
#> h_d.b(0, 1, foo) 355.084 381.0385 467.3812 437.5135 516.8630 2056.972 100 c
#> h_alistaire(0, 1, foo) 148.735 165.1360 194.7361 189.9140 216.7810 287.990 100 b
儘管數據量較大,但差異變得更加嚴峻。最初揚言要崩潰我的筆記本電腦,但這裏是最快的兩個標杆:
# on 1k groups, 40k rows
microbenchmark::microbenchmark(
h3_AEBilgrau(0, 1, foo2),
h_alistaire(0, 1, foo2)
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> h3_AEBilgrau(0, 1, foo2) 336.98199 403.04104 412.06778 410.52391 423.33008 443.8286 100 b
#> h_alistaire(0, 1, foo2) 14.00472 16.25852 18.07865 17.22296 18.09425 96.9157 100 a
另一種可能性是使用data.frame按組進行總結,再總結相應的列。在R基礎上,你可以用aggregate
來做到這一點,但dplyr和data.table是流行的,因爲使用更復雜的聚合可以使這種方法更簡單。
aggregate
比tapply
慢。 dplyr比aggregate
快,但仍然比較慢。 data.table,專爲速度而設計,幾乎與tapply
一樣快。
library(dplyr)
library(data.table)
h_aggregate <- function(a, b, foo){sum(aggregate(x ~ id, foo, function(x){sum(tcrossprod(a - x, b - x))})$x)}
tidy_h <- function(a, b, foo){foo %>% group_by(id) %>% summarise(x = sum(tcrossprod(a - x, b - x))) %>% select(x) %>% sum()}
h_dt <- function(a, b, foo){setDT(foo)[, .(x = sum(tcrossprod(a - x, b - x))), by = id][, sum(x)]}
microbenchmark::microbenchmark(
h_alistaire(1, 0, foo2),
h_aggregate(1, 0, foo2),
tidy_h(1, 0, foo2),
h_dt(1, 0, foo2)
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> h_alistaire(1, 0, foo2) 13.30518 15.52003 18.64940 16.48818 18.13686 62.35675 100 a
#> h_aggregate(1, 0, foo2) 93.08401 96.61465 107.14391 99.16724 107.51852 143.16473 100 c
#> tidy_h(1, 0, foo2) 39.47244 42.22901 45.05550 43.94508 45.90303 90.91765 100 b
#> h_dt(1, 0, foo2) 13.31817 15.09805 17.27085 16.46967 17.51346 56.34200 100 a
簡單,並且仍非常活潑:字中,h < - 功能(A,B,富){總和(tapply(FOO $ X,FOO $ ID,函數(X){總和(tcrossprod(一 - x,b - x))}))}' – alistaire