慣例的 data.table 解決方案 -
# Session setup: keep strings as character vectors and load data.table.
options(stringsAsFactors = FALSE)
library(data.table)

# Simulated input: 100,000 rows, each with an ID drawn (with replacement) from
# the 52 upper- and lower-case letters and a standard-normal NrBlocks value.
# The fixed seed makes the draw reproducible.
set.seed(1234)
dTbl <- data.table(
  ID = sample(c(letters, LETTERS), 100000, replace = TRUE),
  NrBlocks = rnorm(100000)
)
# Key the table on ID.
setkey(dTbl, ID)

# One row per distinct ID holding the total of NrBlocks for that ID.
gTbl <- dTbl[, .(sumNrBlocks = sum(NrBlocks)), by = ID]
##
> head(gTbl)
ID sumNrBlocks
1: A 56.50234
2: B -13.61380
3: C 24.66750
4: D 65.18829
5: E 26.14085
6: F 41.64376
計時比較:
library(microbenchmark)
##
# Distinct ID values present in dTbl; f1() loops over these.
uniqueIDs <- unique(dTbl[["ID"]])
# Baseline implementation for the timing comparison: for every value in the
# global `uniqueIDs`, subset the global `dTbl` to the rows whose ID matches and
# sum their NrBlocks.
#
# Takes no arguments; reads `uniqueIDs` and `dTbl` from the enclosing
# environment.
#
# Returns an unnamed numeric vector with one total per element of `uniqueIDs`,
# in the same order. An empty `uniqueIDs` yields numeric(0).
f1 <- function() {
  # vapply() pins the result to one double per ID, and iterating over the
  # values themselves (rather than 1:length(...)) is safe for empty input.
  vapply(
    uniqueIDs,
    FUN = function(id) {
      # which() discards NA comparison results, so only definite matches count.
      sum(dTbl[which(dTbl$ID == id), ]$NrBlocks)
    },
    FUN.VALUE = numeric(1),
    USE.NAMES = FALSE
  )
}
##
# data.table implementation for the timing comparison: a single grouped
# aggregation over the global `dTbl`.
#
# Takes no arguments. Returns a data.table with one row per distinct ID and a
# `sumNrBlocks` column holding the sum of NrBlocks for that ID.
f2 <- function() {
  dTbl[, .(sumNrBlocks = sum(NrBlocks)), by = ID]
}
##
# Time both implementations, 100 evaluations each, then display the summary.
# The expressions are passed unnamed so the report labels them f1() and f2().
Res <- microbenchmark(f1(), f2(), times = 100L)
Res
> Res
Unit: milliseconds
expr min lq median uq max neval
f1() 139.054620 141.534227 144.213253 156.747569 193.278071 100
f2() 1.813652 1.911069 1.980874 2.140971 3.522545 100
多列:
# Work on an explicit copy: := adds columns by reference, so without copy()
# the new columns would also appear on dTbl.
dTbl2 <- copy(dTbl)
set.seed(1234)
# Draw one exponential value per row. .N is the table's row count, so the draw
# always matches the table size instead of repeating a hard-coded 100000.
dTbl2[, col3 := rexp(.N)]
# col4 is derived from col3, so it has to be added in a separate step.
dTbl2[, col4 := col3 * 2]

# Sum every non-grouping column (NrBlocks, col3, col4) within each ID.
gTbl2 <- dTbl2[, lapply(.SD, sum), by = list(ID)]
##
> head(gTbl2)
ID NrBlocks col3 col4
1: A 56.50234 1933.443 3866.886
2: B -13.61380 1904.282 3808.563
3: C 24.66750 1834.655 3669.310
4: D 65.18829 1884.364 3768.728
5: E 26.14085 1874.761 3749.523
6: F 41.64376 1977.219 3954.438
指定多列(使用 .SDcols)-
# Per-ID sums restricted to a chosen subset of columns. The columns are
# selected by name rather than by position (2 and 4), so the result does not
# silently change if columns are later added to or reordered in dTbl2.
gTbl2.2 <- dTbl2[
  ,
  lapply(.SD, sum),
  by = list(ID),
  .SDcols = c("NrBlocks", "col4")]
##
> head(gTbl2.2)
ID NrBlocks col4
1: A 56.50234 3866.886
2: B -13.61380 3808.563
3: C 24.66750 3669.310
4: D 65.18829 3768.728
5: E 26.14085 3749.523
6: F 41.64376 3954.438
聽起來像是直接用 aggregate 就能解決的問題:`aggregate(value ~ ID, data, sum)` – 2014-09-01 14:07:13
@RichardScriven,他要求一個快速解決方案:) – 2014-09-01 14:23:05
非常感謝,這正是我所期待的。爲什麼我之前從未發現 aggregate 函數?你讓我今天一整天心情都很好。這似乎比我以前的解決方案更快,但也許還有更快的變種? – 2014-09-01 14:24:16