4

Speed of vectorized operations depends on the number of data.frame columns

Why does a comparison on a data.frame with the same number of elements take longer when the elements are arranged in more columns, even though the operation is vectorized? Take this simple example, where we subtract 0.5 from every element and then check whether the result is < 0 (related to this question):

f.df <- function(df , x = 0.5){ 
    df <- df - x 
    df[ df < 0 ] <- 0 
    return(df) 
} 


df1 <- data.frame(matrix(runif(1e5) , nrow = 1e2)) 
df2 <- data.frame(matrix(runif(1e5) , nrow = 1e3)) 
df3 <- data.frame(matrix(runif(1e5) , nrow = 1e4)) 

require(microbenchmark) 
microbenchmark(f.df(df1) , f.df(df2) , f.df(df3) , times = 10L) 


#Unit: milliseconds 
#  expr  min   lq  median   uq  max neval 
# f.df(df1) 1562.66827 1568.21097 1595.07005 1674.91726 1680.90092 10 
# f.df(df2) 95.77452 98.12557 101.31215 190.46906 198.23927 10 
# f.df(df3) 16.25295 16.42373 16.74989 17.95621 18.69218 10 

Answers

7

A bit of profiling shows that most of the time is spent in [<-.data.frame.
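One way to reproduce that observation is with base R's sampling profiler (a minimal sketch; the exact labels in the output vary between R versions):

# profile a single slow call and summarise where the time goes
Rprof(tmp <- tempfile())
invisible(f.df(df1))
Rprof(NULL)
head(summaryRprof(tmp)$by.total)
# expect [<-.data.frame (and [<-) near the top of the by.total listing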

The scaling problem comes from how Ops.data.frame and [<-.data.frame work, and in particular from how [<-.data.frame copies, and how [[<- copies a named list.

The relevant code from Ops.data.frame (with my comments):

# cn is the names of your data.frame 
for (j in seq_along(cn)) { 
     left <- if (!lscalar) 
      e1[[j]] 
     else e1 
     right <- if (!rscalar) 
      e2[[j]] 
     else e2 
     value[[j]] <- eval(f) 
    } 
    # sometimes return a data.frame 
    if (.Generic %in% c("+", "-", "*", "/", "%%", "%/%")) { 
     names(value) <- cn 
     data.frame(value, row.names = rn, check.names = FALSE, 
      check.rows = FALSE) 
    } # sometimes return a matrix 
    else matrix(unlist(value, recursive = FALSE, use.names = FALSE), 
     nrow = nr, dimnames = list(rn, cn)) 

When you use it, Ops.data.frame cycles through your columns, replacing each one in turn with [[<-. That means the time required grows with the number of columns (there is some protective internal copying, since a data.frame is a named list), so the operation scales roughly linearly in the number of columns.
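Stripped down, the subtraction step amounts to something like the following (a simplified sketch modelled on the excerpt above, not the real source), which makes it clear that the number of loop iterations, and of [[<- assignments, equals the number of columns:

# simplified sketch of the per-column loop performed by `df - x`
minus_by_column <- function(df, x = 0.5){
    value <- vector("list", length(df))
    for (j in seq_along(df)) {
        value[[j]] <- df[[j]] - x      # one vectorized subtraction per column
    }
    names(value) <- names(df)
    data.frame(value, row.names = row.names(df),
               check.names = FALSE, check.rows = FALSE)
}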

# for example only this part will scale with the number of columns 
f.df.1 <- function(df , x = 0.5){ 
    df <- df - x 

    return(df) 
} 
microbenchmark(f.df.1(df1),f.df.1(df2),f.df.1(df3), times = 10L) 
# Unit: milliseconds 
#  expr  min  lq median   uq  max neval 
# f.df.1(df1) 96.739646 97.143298 98.36253 172.937100 175.539239 10 
# f.df.1(df2) 11.697373 11.955173 12.12206 12.304543 281.055865 10 
# f.df.1(df3) 3.114089 3.149682 3.41174 3.575835 3.640467 10 

[<-.data.frame has a similar loop over the columns when i is a logical matrix with the same dimensions as x:

if(is.logical(i) && is.matrix(i) && all(dim(i) == dim(x))) { 
      nreplace <- sum(i, na.rm=TRUE) 
      if(!nreplace) return(x) # nothing to replace 
      ## allow replication of length(value) > 1 in 1.8.0 
      N <- length(value) 
      if(N > 1L && N < nreplace && (nreplace %% N) == 0L) 
       value <- rep(value, length.out = nreplace) 
      if(N > 1L && (length(value) != nreplace)) 
       stop("'value' is the wrong length") 
      n <- 0L 
      nv <- nrow(x) 
      for(v in seq_len(dim(i)[2L])) { 
       thisvar <- i[, v, drop = TRUE] 
       nv <- sum(thisvar, na.rm = TRUE) 
       if(nv) { 
        if(is.matrix(x[[v]])) 
         x[[v]][thisvar, ] <- if(N > 1L) value[n+seq_len(nv)] else value 
        else 
         x[[v]][thisvar] <- if(N > 1L) value[n+seq_len(nv)] else value 
       } 
       n <- n+nv 
      } 
      return(x) 


f.df.2 <- function(df , x = 0.5){ 
    df[df < 0 ] <- 0 

    return(df) 
} 
microbenchmark(f.df.2(df1), f.df.2(df2), f.df.2(df3), times = 10L) 
# Unit: milliseconds 
#  expr  min  lq median  uq  max neval 
# f.df.2(df1) 20.500873 20.575801 20.699469 20.993723 84.825607 10 
# f.df.2(df2) 3.143228 3.149111 3.173265 3.353779 3.409068 10 
# f.df.2(df3) 1.581727 1.634463 1.707337 1.876240 1.887746 10 

[<-.data.frame (and building the logical matrix df < 0) will copy as well.
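You can see the copying in the replacement step directly with tracemem (a minimal sketch; it needs an R build with memory profiling enabled, which the standard CRAN binaries have, and the number of reported copies depends on the R version):

# every "tracemem[... -> ...]" line printed below is a copy of d being made
d <- data.frame(matrix(runif(1e4), nrow = 1e2))
tracemem(d)
d[d < 0.5] <- 0
untracemem(d)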


How can this be improved? You can use lapply, or set from the data.table package:

library(data.table) 
sdf <- function(df, x = 0.5){
    # explicit copy so there are no changes to the original
    dd <- copy(df)
    for(j in names(df)){
        set(dd, j = j, value = dd[[j]] - x)
        # this is slow when (necessarily) done repeatedly; perhaps it could
        # come out of the loop and into an `lapply` or `vapply` statement
        # (see the sketch after the benchmark below)
        whi <- which(dd[[j]] < 0)
        if(length(whi)){
            set(dd, j = j, i = whi, value = 0.0)
        }
    }
    return(dd)
}

microbenchmark(sdf(df1), sdf(df2), sdf(df3), times = 10L) 
# Unit: milliseconds 
# expr  min  lq median  uq  max neval 
# sdf(df1) 87.471560 88.323686 89.880685 92.659141 153.218536 10 
# sdf(df2) 6.235951 6.531192 6.630981 6.786801 7.230825 10 
# sdf(df3) 2.631641 2.729612 2.775762 2.884807 2.970556 10 
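Following the comment inside sdf, here is an untested sketch (sdf2 is just an illustrative name) that hoists the which() calls out of the loop with lapply; whether it actually helps will depend on the data:

# untested variant of sdf: compute all the which() indices up front
sdf2 <- function(df, x = 0.5){
    dd <- copy(df)
    # indices of the values that will fall below zero after subtracting x
    whis <- lapply(df, function(col) which(col < x))
    for(j in names(dd)){
        set(dd, j = j, value = dd[[j]] - x)
        if(length(whis[[j]])){
            set(dd, j = j, i = whis[[j]], value = 0.0)
        }
    }
    return(dd)
}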

# a base R approach using lapply 
ldf <- function(df, x = 0.5){
    as.data.frame(lapply(df, function(xx, x){
        xxx <- xx - x
        replace(xxx, xxx < 0, 0)
    }, x = x))
}

# pretty good. Does well with large data.frames 
microbenchmark(ldf(df1), ldf(df2), ldf(df3), times = 10L) 
# Unit: milliseconds 
# expr  min  lq median   uq  max neval 
# ldf(df1) 84.380144 84.659572 85.987488 159.928249 161.720599 10 
# ldf(df2) 11.507918 11.793418 11.948194 12.175975 86.186517 10 
# ldf(df3) 4.237206 4.368717 4.449018 4.627336 5.081222 10 

# they all produce the same 
dd <- sdf(df1) 
ddf1 <- f.df(df1) 
ldf1 <- ldf(df1) 
identical(dd,ddf1) 
## [1] TRUE 
identical(ddf1, ldf1) 
## [1] TRUE 

# sdf and ldf comparable with lots of columns 
# see benchmarking below. 
microbenchmark(sdf(df1), ldf(df1), f.df(df1), times = 10L) 
# Unit: milliseconds 
# expr  min   lq  median   uq  max neval 
# sdf(df1) 85.75355 86.47659 86.76647 87.88829 172.0589 10 
# ldf(df1) 84.73023 85.27622 85.61528 172.02897 356.4318 10 
# f.df(df1) 3689.83135 3730.20084 3768.44067 3905.69565 3949.3532 10 
# sdf ~ twice as fast with smaller data.frames 
microbenchmark(sdf(df2), ldf(df2), f.df(df2), times = 10L) 
# Unit: milliseconds 
# expr  min   lq  median   uq  max neval 
# sdf(df2) 6.46860 6.557955 6.603772 6.927785 7.019567 10 
# ldf(df2) 12.26376 12.551905 12.576802 12.667775 12.982594 10 
# f.df(df2) 268.42042 273.800762 278.435929 346.112355 503.551387 10 
microbenchmark(sdf(df3), ldf(df3), f.df(df3), times = 10L) 
# Unit: milliseconds 
# expr  min  lq median  uq  max neval 
# sdf(df3) 2.538830 2.911310 3.020998 3.120961 74.980466 10 
# ldf(df3) 4.698771 5.202121 5.272721 5.407351 5.424124 10 
# f.df(df3) 17.819254 18.039089 18.158069 19.692038 90.620645 10 

# copying of larger objects is slower, repeated calls to which are slow. 

microbenchmark(copy(df1), copy(df2), copy(df3), times = 10L) 
# Unit: microseconds 
# expr  min  lq median  uq  max neval 
# copy(df1) 369.926 407.218 480.5710 527.229 618.698 10 
# copy(df2) 165.402 224.626 279.5445 296.215 519.773 10 
# copy(df3) 150.148 180.625 214.9140 276.035 467.972 10 
0

When you compare df1, df2 and df3, you change the number of rows while keeping the total number of elements constant, so you are also changing the number of columns.

Each column of a data.frame is an element of a list. In your example, each data.frame has an order of magnitude more columns than the next, so it needs an order of magnitude more per-column operations, and hence more time.
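A quick check on the data.frames from the question makes this explicit:

# same total number of elements (1e5), very different numbers of columns
ncol(df1)   # 1000
ncol(df2)   # 100
ncol(df3)   # 10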

+0

You are just describing what happens, not why. My question starts with *why does a comparison on a data.frame with the same number of elements take longer when the elements are arranged in more columns*!! Every example has the same total number of elements. – 2013-04-25 16:36:17

+1

You are confusing the *number of elements* with the *number of operations*. In your function, the *number of operations* is a function of the number of columns. – 2013-04-25 16:40:57

+0

I'm sorry, but you will have to explain that. I am not applying a function over the columns here. Are you saying that if I have a data.frame of 10 *elements* arranged in 2 columns (5 rows) and I subtract a number from each one, I somehow perform twice as many operations as I would with a data.frame of 10 elements arranged in a single column (10 rows)? – 2013-04-25 16:44:45

7

data.frames are lists: each column can hold data of a different class. As you might imagine, when you run your code R has to handle each column separately, so the "vectorization" only happens on a per-column basis. For the same number of elements in a data.frame, the more columns there are, the longer the processing takes.
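To make the "a data.frame is a list" point concrete, a small illustration (the toy data.frame here is just for demonstration):

# a data.frame is a list of column vectors, and classes may differ per column
d <- data.frame(a = 1:3, b = c("x", "y", "z"), stringsAsFactors = FALSE)
is.list(d)                       # TRUE
length(d)                        # 2 -- one list element per column
vapply(d, class, character(1))   # "integer" "character"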

This is unlike a matrix (or, more generally, an array), which holds data of a single class only, so vectorization can happen over the whole object at once. There, for the same number of elements, the computation time is the same regardless of the number of columns, as you can see:

df1 <- matrix(runif(1e5) , nrow = 1e2) 
df2 <- matrix(runif(1e5) , nrow = 1e3) 
df3 <- matrix(runif(1e5) , nrow = 1e4) 

require(microbenchmark) 
microbenchmark(f.df(df1) , f.df(df2) , f.df(df3) , times = 10L) 

# Unit: milliseconds 
#  expr  min  lq median  uq  max neval 
# f.df(df1) 4.837330 5.218258 5.350093 5.587897 7.081086 10 
# f.df(df2) 5.158825 5.313685 5.510549 5.731780 5.880861 10 
# f.df(df3) 5.237361 5.344613 5.399209 5.481276 5.940132 10