分析的位顯示,大部分的時間在[<-.data.frame
花費。因此
縮放的問題來自於如何Ops.data.frame
和[<-.dataframe
工作,以及如何[<-.data.frame
份,[[<-
副本命名列表,。
在Ops.data.frame
相關的代碼(有我的評語)
# cn is the names of your data.frame
for (j in seq_along(cn)) {
left <- if (!lscalar)
e1[[j]]
else e1
right <- if (!rscalar)
e2[[j]]
else e2
value[[j]] <- eval(f)
}
# sometimes return a data.frame
if (.Generic %in% c("+", "-", "*", "/", "%%", "%/%")) {
names(value) <- cn
data.frame(value, row.names = rn, check.names = FALSE,
check.rows = FALSE)
} # sometimes return a matrix
else matrix(unlist(value, recursive = FALSE, use.names = FALSE),
nrow = nr, dimnames = list(rn, cn))
當你使用它Ops.data.frame
週期會通過你列在使用[[<-
每次更換循環。 這意味着隨着列數的增加,所需的時間將增加(因爲它會有一些保護性的內部複製,因爲它是一個data.frame被命名的列表) - 因此它將隨列數線性縮放
# for example only this part will scale with the number of columns
f.df.1 <- function(df , x = 0.5){
df <- df - x
return(df)
}
microbenchmark(f.df.1(df1),f.df.1(df2),f.df.1(df3), times = 10L)
# Unit: milliseconds
# expr min lq median uq max neval
# f.df.1(df1) 96.739646 97.143298 98.36253 172.937100 175.539239 10
# f.df.1(df2) 11.697373 11.955173 12.12206 12.304543 281.055865 10
# f.df.1(df3) 3.114089 3.149682 3.41174 3.575835 3.640467 10
[<-.data.frame
具有穿過列的類似環時i
是相同的尺寸,爲x
if(is.logical(i) && is.matrix(i) && all(dim(i) == dim(x))) {
nreplace <- sum(i, na.rm=TRUE)
if(!nreplace) return(x) # nothing to replace
## allow replication of length(value) > 1 in 1.8.0
N <- length(value)
if(N > 1L && N < nreplace && (nreplace %% N) == 0L)
value <- rep(value, length.out = nreplace)
if(N > 1L && (length(value) != nreplace))
stop("'value' is the wrong length")
n <- 0L
nv <- nrow(x)
for(v in seq_len(dim(i)[2L])) {
thisvar <- i[, v, drop = TRUE]
nv <- sum(thisvar, na.rm = TRUE)
if(nv) {
if(is.matrix(x[[v]]))
x[[v]][thisvar, ] <- if(N > 1L) value[n+seq_len(nv)] else value
else
x[[v]][thisvar] <- if(N > 1L) value[n+seq_len(nv)] else value
}
n <- n+nv
}
return(x)
f.df.2 <- function(df , x = 0.5){
df[df < 0 ] <- 0
return(df)
}
microbenchmark(f.df.2(df1), f.df.2(df2), f.df.2(df3), times = 10L)
# Unit: milliseconds
# expr min lq median uq max neval
# f.df.2(df1) 20.500873 20.575801 20.699469 20.993723 84.825607 10
# f.df.2(df2) 3.143228 3.149111 3.173265 3.353779 3.409068 10
# f.df.2(df3) 1.581727 1.634463 1.707337 1.876240 1.887746 10
[<-
data.frame(和012的邏輯矩陣)都將複製以及
如何改善。您可以使用lapply
或set
從data.table
包
library(data.table)
sdf <- function(df, x = 0.5){
# explicit copy so there are no changes to original
dd <- copy(df)
for(j in names(df)){
set(dd, j= j, value = dd[[j]] - 0.5)
# this is slow when (necessarily) done repeatedly perhaps this
# could come out of the loop and into a `lapply` or `vapply` statment
whi <- which(dd[[j]] < 0)
if(length(whi)){
set(dd, j= j, i = whi, value = 0.0)
}
}
return(dd)
}
microbenchmark(sdf(df1), sdf(df2), sdf(df3), times = 10L)
# Unit: milliseconds
# expr min lq median uq max neval
# sdf(df1) 87.471560 88.323686 89.880685 92.659141 153.218536 10
# sdf(df2) 6.235951 6.531192 6.630981 6.786801 7.230825 10
# sdf(df3) 2.631641 2.729612 2.775762 2.884807 2.970556 10
# a base R approach using lapply
ldf <- function(df, x = 0.5){
as.data.frame(lapply(df, function(xx,x){ xxx <- xx-x;replace(xxx, xxx<0,0)}, x=x))
}
# pretty good. Does well with large data.frames
microbenchmark(ldf(df1), ldf(df2), ldf(df3), times = 10L)
# Unit: milliseconds
# expr min lq median uq max neval
# ldf(df1) 84.380144 84.659572 85.987488 159.928249 161.720599 10
# ldf(df2) 11.507918 11.793418 11.948194 12.175975 86.186517 10
# ldf(df3) 4.237206 4.368717 4.449018 4.627336 5.081222 10
# they all produce the same
dd <- sdf(df1)
ddf1 <- f.df(df1)
ldf1 <- ldf(df1)
identical(dd,ddf1)
## [1] TRUE
identical(ddf1, ldf1)
## [1] TRUE
# sdf and ldf comparable with lots of columns
# see benchmarking below.
microbenchmark(sdf(df1), ldf(df1), f.df(df1), times = 10L)
# Unit: milliseconds
# expr min lq median uq max neval
# sdf(df1) 85.75355 86.47659 86.76647 87.88829 172.0589 10
# ldf(df1) 84.73023 85.27622 85.61528 172.02897 356.4318 10
# f.df(df1) 3689.83135 3730.20084 3768.44067 3905.69565 3949.3532 10
# sdf ~ twice as fast with smaller data.frames
microbenchmark(sdf(df2), ldf(df2), f.df(df2), times = 10L)
# Unit: milliseconds
# expr min lq median uq max neval
# sdf(df2) 6.46860 6.557955 6.603772 6.927785 7.019567 10
# ldf(df2) 12.26376 12.551905 12.576802 12.667775 12.982594 10
# f.df(df2) 268.42042 273.800762 278.435929 346.112355 503.551387 10
microbenchmark(sdf(df3), ldf(df3), f.df(df3), times = 10L)
# Unit: milliseconds
# expr min lq median uq max neval
# sdf(df3) 2.538830 2.911310 3.020998 3.120961 74.980466 10
# ldf(df3) 4.698771 5.202121 5.272721 5.407351 5.424124 10
# f.df(df3) 17.819254 18.039089 18.158069 19.692038 90.620645 10
# copying of larger objects is slower, repeated calls to which are slow.
microbenchmark(copy(df1), copy(df2), copy(df3), times = 10L)
# Unit: microseconds
# expr min lq median uq max neval
# copy(df1) 369.926 407.218 480.5710 527.229 618.698 10
# copy(df2) 165.402 224.626 279.5445 296.215 519.773 10
# copy(df3) 150.148 180.625 214.9140 276.035 467.972 10
你只是描述發生了什麼,不爲什麼。我的問題開始*爲什麼需要更長的時間才能對數據進行比較。框架具有相同數量的元素,但排列在更多列* !!每個示例具有相同的元素總數。 – 2013-04-25 16:36:17
你正在*操作*的數量混淆*元素的數量。在你的函數中,*操作的數量*是列數的函數。 – 2013-04-25 16:40:57
我很抱歉,但您必須解釋這一點。這裏我不是在列上應用函數。你是否在說如果我有一個10 *元素*的數據框架排列在2列(5行)上,並且我從每一列中減去一個數字,那麼我以某種方式計算兩倍的操作,就好像我用數據框所做的那樣10個元素排成一列(10行)? – 2013-04-25 16:44:45