優化for循環中的R

僞數據組：（從我的數據集差異ITEM_CODE是串在我的情況）優化for循環中的R

in_cluster <- data.frame(item_code = c(1:500)) 
in_cluster$cluster <- 
     sample(5, size = nrow(in_cluster), replace = TRUE) 
real_sales <- data.frame(item_code = numeric(0), sales = numeric(0)) 
real_sales <- 
    data.frame(
      item_code = sample(500, size = 100000, replace = TRUE), 
      sales = sample(500, size = 100000, replace = TRUE) 
    ) 

mean_trajectory <- data.frame(sales = c(1:52)) 
mean_trajectory$sales <- sample(500, size = 52, replace = TRUE) 
training_df <- data.frame(
     LTF_t_minus_1 = numeric(0), 
     LTF_t = numeric(0), 
     LTF_t_plus_1 = numeric(0), 
     RS_t_minus_1 = numeric(0), 
     RS_t = numeric(0), 
     STF_t_plus_1 = numeric(0) 
) 
training_df[nrow(training_df) + 1, ] <- 
     c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19) # week 0 

week = 2

我有R中一個簡單的函數中，所有我做的是：

system.time({ 
    for (r in 1:nrow(in_cluster)) { 
      item <- in_cluster[r,] 
      sale_row <- 
        dplyr::filter(real_sales, item_code == item$item_code) 
      if (nrow(sale_row) > 2) { 
        new_df <- data.frame(
          LTF_t_minus_1 = mean_trajectory$sales[[week - 1]], 
          LTF_t = mean_trajectory$sales[[week]], 
          LTF_t_plus_1 = mean_trajectory$sales[[week + 1]], 
          RS_t_minus_1 = sale_row$sales[[week - 1]], 
          RS_t = sale_row$sales[[week]], 
          STF_t_plus_1 = sale_row$sales[[week + 1]] 
        ) 
        training_df <- 
          bind_rows(training_df, new_df) 
      } 
    } 
})

我很新的R和發現這個很奇怪看多小時實在是又多久（421.59 seconds通過500行循環）它通過數據幀採取循環。

EDIT_IMPORTANT：不過，對於上面給出的僞數據組時採取了1.10 seconds得到輸出>可這是因爲具有ITEM_CODE字符串？是否需要很多時間來處理字符串item_code。（我沒有使用假人數據集的字符串，因爲我不知道該怎麼對item_code 500個獨特的字符串in_cluster，並且具有相同的字符串作爲real_salesitem_code）

我通過建議的方式少的其他文章閱讀來優化將R代碼，並使用用於bind_rows代替rbind或：使用bind_rows

training_df[nrow(training_df) + 1,] <- 
    c(mean_trajectory$sales[[week-1]], mean_trajectory$sales[[week]], mean_trajectory$sales[[week+1]], sale_row$sales[[week-1]], sale_row$sales[[week]], sale_row$sales[[week+1]])

似乎已經通過36秒通過500行數據幀的循環時改善了性能in_cluster

在這種情況下可以使用lapply嗎？我想下面的代碼，並得到了一個錯誤：

Error in filter_impl(.data, dots) : $ operator is invalid for atomic vectors

myfun <- function(item, sales, mean_trajectory, week) { 
sale_row<- filter(sales, item_code == item$item_code) 
data.frame(
    LTF_t_minus_1 = mean_trajectory$sales[[week-1]], 
    LTF_t = mean_trajectory$sales[[week]], 
    LTF_t_plus_1 = mean_trajectory$sales[[week+1]], 
    RS_t_minus_1 = sale_row$sales[[week-1]], 
    RS_t = sale_row$sales[[week]], 
    STF_t_plus_1 = sale_row$sales[[week+1]]) 
} 

system.time({ 
     lapply(in_cluster, myfun, sales= sales, mean_trajectory = mean_trajectory) %>% bind_rows() 
})

幫助lapply將不勝感激，但是我的主要目標是加快循環。

來源

2017-04-23 ro ko

請包括一個[可重現的示例]（http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example/5963610），這將使其他人更容易來幫你。 – Jaap

@Jaap當然，是在上面。不過謝謝。 –

好的，所以在你的代碼中有很多不好的做法。

你每行操作
你正在創建2（！），每行（很貴）新數據幀
你成長在一個循環對象）training_df <- bind_rows(training_df, new_df)保持在每個迭代增長，而運行一個相當昂貴的操作（bind_rows））
你一次又一次地運行相同的操作，當你可以只運行一次（爲什麼你運行mean_trajectory$sales[[week-1]]和每行AL，而mean_trajectory與循環無關？你可以之後分配）。
而這樣的例子不勝枚舉...

我建議的替代簡單data.table解決方案，將執行好得多。這個想法是首先在in_cluster和real_sales之間進行二進制連接（並且在連接時運行所有操作而不創建額外的數據幀並綁定它們）。然後，只運行一次所有mean_trajectory相關行。（我忽略了training_df[nrow(training_df) + 1, ] <- c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19)初始化，因爲它是無關緊要這裏，你可以在事後只用添加和rbind）

library(data.table) #v1.10.4 
## First step 
res <- 
    setDT(real_sales)[setDT(in_cluster), # binary join 
        if(.N > 2) .(RS_t_minus_1 = sales[week - 1], # The stuff you want to do 
           RS_t = sales[week],    # by condition 
           STF_t_plus_1 = sales[week + 1]), 
        on = "item_code", # The join key 
        by = .EACHI] # Do the operations per each join 

## Second step (run the `mean_trajectory` only once) 
res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1], 
      LTF_t = mean_trajectory$sales[week], 
      LTF_t_plus_1 = mean_trajectory$sales[week + 1])]

一些性能測試：

### Creating your data sets 
set.seed(123) 
N <- 1e5 
N2 <- 5e7 

in_cluster <- data.frame(item_code = c(1:N)) 

real_sales <- 
    data.frame(
    item_code = sample(N, size = N2, replace = TRUE), 
    sales = sample(N, size = N2, replace = TRUE) 
) 

mean_trajectory <- data.frame(sales = sample(N, size = 25, replace = TRUE)) 

training_df <- data.frame(
    LTF_t_minus_1 = numeric(0), 
    LTF_t = numeric(0), 
    LTF_t_plus_1 = numeric(0), 
    RS_t_minus_1 = numeric(0), 
    RS_t = numeric(0), 
    STF_t_plus_1 = numeric(0) 
) 
week = 2 

############################### 
################# Your solution 
system.time({ 
    for (r in 1:nrow(in_cluster)) { 
    item <- in_cluster[r,, drop = FALSE] 
    sale_row <- 
     dplyr::filter(real_sales, item_code == item$item_code) 
    if (nrow(sale_row) > 2) { 
     new_df <- data.frame(
     LTF_t_minus_1 = mean_trajectory$sales[[week - 1]], 
     LTF_t = mean_trajectory$sales[[week]], 
     LTF_t_plus_1 = mean_trajectory$sales[[week + 1]], 
     RS_t_minus_1 = sale_row$sales[[week - 1]], 
     RS_t = sale_row$sales[[week]], 
     STF_t_plus_1 = sale_row$sales[[week + 1]] 
    ) 
     training_df <- 
     bind_rows(training_df, new_df) 
    } 
    } 
}) 
### Ran forever- I've killed it after half an hour 


###################### 
########## My solution 
library(data.table) 
system.time({ 
res <- 
    setDT(real_sales)[setDT(in_cluster), 
        if(.N > 2) .(RS_t_minus_1 = sales[week - 1], 
           RS_t = sales[week], 
           STF_t_plus_1 = sales[week + 1]), 
        on = "item_code", 
        by = .EACHI] 
res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1], 
      LTF_t = mean_trajectory$sales[week], 
      LTF_t_plus_1 = mean_trajectory$sales[week + 1])] 
}) 

# user system elapsed 
# 2.42 0.05 2.47

所以對於50MM行data.table解決方案跑了約2秒，而你的解決方案無休止地運行，直到我殺死它（半小時後）。

來源

2017-04-23 20:20:15

優化for循環中的R

回答

相關問題