2017-02-27 48 views
9

假設我有一個數據幀,如下所示:在計算配對差異高效實現

> foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3)) 
> foo 
    x id 
1 1 1 
2 2 1 
3 3 2 
4 4 2 
5 5 2 
6 6 3 
7 7 3 
8 8 3 
9 9 3 

我想要一個非常有效的實現的H(A,B),計算總和所有(一個 - XI)*(B - xj)代表屬於同一個id類的xi,xj。例如,我的當前的實現是

h(a, b, foo){ 
    a.diff = a - foo$x 
    b.diff = b - foo$x 
    prod = a.diff%*%t(b.diff) 
    id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T),0,1)) + diag(nrow(foo)) 
    return(sum(prod*id.indicator)) 
} 

例如,對於(A,B)=(0,1),這裏是從每個步驟在功能

> a.diff 
[1] -1 -2 -3 -4 -5 -6 -7 -8 -9 
> b.diff 
[1] 0 -1 -2 -3 -4 -5 -6 -7 -8 
> prod 
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] 
[1,] 0 1 2 3 4 5 6 7 8 
[2,] 0 2 4 6 8 10 12 14 16 
[3,] 0 3 6 9 12 15 18 21 24 
[4,] 0 4 8 12 16 20 24 28 32 
[5,] 0 5 10 15 20 25 30 35 40 
[6,] 0 6 12 18 24 30 36 42 48 
[7,] 0 7 14 21 28 35 42 49 56 
[8,] 0 8 16 24 32 40 48 56 64 
[9,] 0 9 18 27 36 45 54 63 72 
> id.indicator 
    1 2 3 4 5 6 7 8 9 
1 1 1 0 0 0 0 0 0 0 
2 1 1 0 0 0 0 0 0 0 
3 0 0 1 1 1 0 0 0 0 
4 0 0 1 1 1 0 0 0 0 
5 0 0 1 1 1 0 0 0 0 
6 0 0 0 0 0 1 1 1 1 
7 0 0 0 0 0 1 1 1 1 
8 0 0 0 0 0 1 1 1 1 
9 0 0 0 0 0 1 1 1 1 

在現實輸出,可以有多達1000個id的簇,每個簇至少有40個,這使得這個方法效率太低,因爲id.indicator中的稀疏條目以及在不使用的塊外對角線上的額外計算。

+2

簡單,並且仍非常活潑:字中,h < - 功能(A,B,富){總和(tapply(FOO $ X,FOO $ ID,函數(X){總和(tcrossprod(一 - x,b - x))}))}' – alistaire

回答

3

tapply允許您跨矢量組應用函數,並將結果簡化爲矩陣或向量(如果可以)。使用tcrossprod乘以每個組的所有組合,並在一些合適的大數據也表現良好:

# setup 
set.seed(47) 
foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3)) 
foo2 <- data.frame(id = sample(1000, 40000, TRUE), x = rnorm(40000)) 

h_OP <- function(a, b, foo){ 
    a.diff = a - foo$x 
    b.diff = b - foo$x 
    prod = a.diff %*% t(b.diff) 
    id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T),0,1)) + diag(nrow(foo)) 
    return(sum(prod * id.indicator)) 
} 

h3_AEBilgrau <- function(a, b, foo) { 
    a.diff <- a - foo$x 
    b.diff <- b - foo$x 
    ids <- unique(foo$id) 
    res <- 0 
    for (i in seq_along(ids)) { 
    indx <- which(foo$id == ids[i]) 
    res <- res + sum(tcrossprod(a.diff[indx], b.diff[indx])) 
    } 
    return(res) 
} 

h_d.b <- function(a, b, foo){ 
    sum(sapply(split(foo, foo$id), function(d) sum(outer(a-d$x, b-d$x)))) 
} 

h_alistaire <- function(a, b, foo){ 
    sum(tapply(foo$x, foo$id, function(x){sum(tcrossprod(a - x, b - x))})) 
} 

都返回同樣的事情,而不是在小數據不同:

h_OP(0, 1, foo) 
#> [1] 891 
h3_AEBilgrau(0, 1, foo) 
#> [1] 891 
h_d.b(0, 1, foo) 
#> [1] 891 
h_alistaire(0, 1, foo) 
#> [1] 891 

# small data test 
microbenchmark::microbenchmark(
    h_OP(0, 1, foo), 
    h3_AEBilgrau(0, 1, foo), 
    h_d.b(0, 1, foo), 
    h_alistaire(0, 1, foo) 
) 
#> Unit: microseconds 
#>      expr  min  lq  mean median  uq  max neval cld 
#>   h_OP(0, 1, foo) 143.749 157.8895 189.5092 189.7235 214.3115 262.258 100 b 
#> h3_AEBilgrau(0, 1, foo) 80.970 93.8195 112.0045 106.9285 125.9835 225.855 100 a 
#>   h_d.b(0, 1, foo) 355.084 381.0385 467.3812 437.5135 516.8630 2056.972 100 c 
#> h_alistaire(0, 1, foo) 148.735 165.1360 194.7361 189.9140 216.7810 287.990 100 b 

儘管數據量較大,但差異變得更加嚴峻。最初揚言要崩潰我的筆記本電腦,但這裏是最快的兩個標杆:

# on 1k groups, 40k rows 
microbenchmark::microbenchmark(
    h3_AEBilgrau(0, 1, foo2), 
    h_alistaire(0, 1, foo2) 
) 
#> Unit: milliseconds 
#>      expr  min  lq  mean median  uq  max neval cld 
#> h3_AEBilgrau(0, 1, foo2) 336.98199 403.04104 412.06778 410.52391 423.33008 443.8286 100 b 
#> h_alistaire(0, 1, foo2) 14.00472 16.25852 18.07865 17.22296 18.09425 96.9157 100 a 

另一種可能性是使用data.frame按組進行總結,再總結相應的列。在R基礎上,你可以用aggregate來做到這一點,但dplyr和data.table是流行的,因爲使用更復雜的聚合可以使這種方法更簡單。

aggregatetapply慢。 dplyr比aggregate快,但仍然比較慢。 data.table,專爲速度而設計,幾乎與tapply一樣快。

library(dplyr) 
library(data.table) 

h_aggregate <- function(a, b, foo){sum(aggregate(x ~ id, foo, function(x){sum(tcrossprod(a - x, b - x))})$x)} 
tidy_h <- function(a, b, foo){foo %>% group_by(id) %>% summarise(x = sum(tcrossprod(a - x, b - x))) %>% select(x) %>% sum()} 
h_dt <- function(a, b, foo){setDT(foo)[, .(x = sum(tcrossprod(a - x, b - x))), by = id][, sum(x)]} 

microbenchmark::microbenchmark(
    h_alistaire(1, 0, foo2), 
    h_aggregate(1, 0, foo2), 
    tidy_h(1, 0, foo2), 
    h_dt(1, 0, foo2) 
) 
#> Unit: milliseconds 
#>      expr  min  lq  mean median  uq  max neval cld 
#> h_alistaire(1, 0, foo2) 13.30518 15.52003 18.64940 16.48818 18.13686 62.35675 100 a 
#> h_aggregate(1, 0, foo2) 93.08401 96.61465 107.14391 99.16724 107.51852 143.16473 100 c 
#>  tidy_h(1, 0, foo2) 39.47244 42.22901 45.05550 43.94508 45.90303 90.91765 100 b 
#>  h_dt(1, 0, foo2) 13.31817 15.09805 17.27085 16.46967 17.51346 56.34200 100 a 
3
sum(sapply(split(foo, foo$id), function(d) sum(outer(a-d$x, b-d$x)))) 
#[1] 891 

#TESTING 
foo = data.frame(x = sample(1:9,10000,replace = TRUE), 
         id = sample(1:3, 10000, replace = TRUE)) 
system.time(sum(sapply(split(foo, foo$id), function(d) sum(outer(a-d$x, b-d$x))))) 
# user system elapsed 
# 0.15 0.01 0.17 
6

我玩了一下。首先,您的實現:

foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3)) 

h <- function(a, b, foo){ 
    a.diff = a - foo$x 
    b.diff = b - foo$x 
    prod = a.diff%*%t(b.diff) 
    id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T),0,1)) + 
    diag(nrow(foo)) 
    return(sum(prod*id.indicator)) 
} 

h(a = 1, b = 0, foo = foo) 
#[1] 891 

接下來,我試圖用一個適當稀疏矩陣實現(通過Matrix包),該指數基體的變形和功能。我也使用tcrossprod,我經常發現它比a %*% t(b)快一點。

library("Matrix") 

h2 <- function(a, b, foo) { 
    a.diff <- a - foo$x 
    b.diff <- b - foo$x 
    prod <- tcrossprod(a.diff, b.diff) # the same as a.diff%*%t(b.diff) 
    id.indicator <- do.call(bdiag, lapply(table(foo$id), function(n) matrix(1,n,n))) 
    return(sum(prod*id.indicator)) 
} 

h2(a = 1, b = 0, foo = foo) 
#[1] 891 

請注意,此功能依賴foo$id被排序。

最後,我試着避免通過n矩陣創建完整的n。

h3 <- function(a, b, foo) { 
    a.diff <- a - foo$x 
    b.diff <- b - foo$x 
    ids <- unique(foo$id) 
    res <- 0 
    for (i in seq_along(ids)) { 
    indx <- which(foo$id == ids[i]) 
    res <- res + sum(tcrossprod(a.diff[indx], b.diff[indx])) 
    } 
    return(res) 
} 

h3(a = 1, b = 0, foo = foo) 
#[1] 891 
您例如

標杆:

library("microbenchmark") 
microbenchmark(h(a = 1, b = 0, foo = foo), 
       h2(a = 1, b = 0, foo = foo), 
       h3(a = 1, b = 0, foo = foo)) 
# Unit: microseconds 
#      expr  min  lq  mean median  uq  max neval 
# h(a = 1, b = 0, foo = foo) 248.569 261.9530 493.2326 279.3530 298.2825 21267.890 100 
# h2(a = 1, b = 0, foo = foo) 4793.546 4893.3550 5244.7925 5051.2915 5386.2855 8375.607 100 
# h3(a = 1, b = 0, foo = foo) 213.386 227.1535 243.1576 234.6105 248.3775 334.612 100 

現在,在這個例子中,h3是最快h2實在是太慢了。但我想這對於更大的例子來說都會更快。可能h3仍然會贏得更大的例子。雖然有更多的優化空間,h3應該更快,更高效的內存。所以,我認爲你應該去找一個h3的變體,它不會產生不必要的大矩陣。

+0

'tab'從哪裏來? – Raad

+0

@NBATrends糟糕,應該是'ids';來自原型的剩餘物。 –