2017-01-03 169 views
0

我有兩個數據集:competitor_data —— 包含給定產品的競爭對手,以及競爭對手的價格和抓取該價格的日期。(標題:R 中嵌套 for 循環的替代方案)

product_price —— 每次價格變動的日期(及當時的價格)。

# Sample competitor prices: one row per product, crawl date and competitor.
# crawl_date is stored as a Date so it can be compared with product dates.
competitor_data <- data.frame(
  productId = rep(c("banana", "fig"), each = 6),
  crawl_date = as.Date(c(
    "2014-04-05", "2014-04-22", "2014-05-05",
    "2014-05-22", "2014-06-05", "2014-06-22",
    "2014-05-08", "2014-06-17", "2014-06-09",
    "2014-06-14", "2014-07-01", "2014-08-04"
  )),
  competitor = c(
    "amazon", "apple", "google", "facebook", "alibaba", "tencent",
    "ebay", "bestbuy", "gamespot", "louis vuitton", "gucci", "tesla"
  ),
  competitor_price = c(
    2.5, 2.35, 1.99, 2.01, 2.22, 2.52,
    5.32, 5.56, 5.01, 6.01, 5.86, 5.96
  ),
  stringsAsFactors = FALSE
)

# Sample price history: one row per product and price-change date.
product_price <- data.frame(
  productId = rep(c("banana", "fig"), each = 6),
  date = as.Date(c(
    "2014-05-05", "2014-06-22", "2014-07-05",
    "2014-08-31", "2014-05-03", "2014-02-22",
    "2014-05-21", "2014-06-19", "2014-03-09",
    "2014-06-22", "2014-07-03", "2014-09-08"
  )),
  price = c(
    2.12, 2.31, 2.29, 2.01, 2.04, 2.09,
    5.22, 5.36, 5.21, 5.91, 5.36, 5.56
  ),
  stringsAsFactors = FALSE
)

目的

  • 對於 product_price 中給定產品的每條記錄(日期),從 competitor_data 中找到對應 crawl_date 的價格。
  • 將 product_price$price 與最低的 competitor_data$competitor_price 進行比較。
  • 如果 product_price$price <= competitor_data$competitor_price,則在新列 price_leader 中標記 1,否則標記 0。

我目前使用下面這個嵌套 for 循環的腳本,但處理 5000 個不同的 productId 需要超過 24 小時:

# Flag, for every price change in `product_price`, whether our price was at or
# below the competitor price in effect at that time.
#
# The reference competitor price for a (productId, date) pair is:
#   * the price from the most recent crawl strictly before `date`, or
#   * when no crawl precedes `date`, the price from the product's first crawl.
# A product without any competitor rows gets NA for both new columns.
#
# Inputs : `product_price`   (productId, date, price)
#          `competitor_data` (productId, crawl_date, competitor_price)
# Output : `all_competitive_data`, the rows of `product_price` plus
#          `competitor_price` and `price_leader` (1 = leader, 0 = not).
unique_skus <- unique(product_price$productId)

start_time <- Sys.time()

# One result per product, bound together once after the loop. Growing a data
# frame with rbind() inside the loop copies all accumulated rows every time.
sku_results <- vector("list", length(unique_skus))

for (i in seq_along(unique_skus)) {
  sku <- unique_skus[i]
  step1 <- product_price[which(product_price$productId == sku), , drop = FALSE]

  # The competitor rows depend only on the product, so look them up once per
  # product instead of re-joining for every date.
  competitors <- competitor_data[
    which(competitor_data$productId == sku), , drop = FALSE
  ]

  # Rows with a missing date can never be matched, so they are left out.
  transact_dates <- unique(step1$date)
  transact_dates <- transact_dates[!is.na(transact_dates)]
  date_results <- vector("list", length(transact_dates))

  for (a in seq_along(transact_dates)) {
    step2 <- step1[which(step1$date == transact_dates[a]), , drop = FALSE]
    earlier <- competitors[
      which(competitors$crawl_date < transact_dates[a]), , drop = FALSE
    ]

    # order() is stable, so ties on crawl_date keep their original row order.
    if (nrow(earlier) == 0) {
      # Nothing crawled before this date: fall back to the earliest crawl.
      candidates <- competitors[
        order(competitors$crawl_date, decreasing = FALSE), , drop = FALSE
      ]
    } else {
      # Otherwise use the most recent crawl strictly before this date.
      candidates <- earlier[
        order(earlier$crawl_date, decreasing = TRUE), , drop = FALSE
      ]
    }

    competitor_price <- if (nrow(candidates) > 0) {
      candidates$competitor_price[1]
    } else {
      NA_real_
    }

    step2$competitor_price <- competitor_price
    step2$price_leader <- ifelse(step2$price <= step2$competitor_price, 1, 0)
    date_results[[a]] <- step2
  }

  # Start from a fresh per-product frame: carrying rows over from earlier
  # products would re-append them on every iteration.
  mid_step_data <- do.call(rbind, date_results)
  # `[<-` with list() stores a possible NULL without deleting the slot.
  sku_results[i] <- list(mid_step_data)
}

all_competitive_data <- do.call(rbind, sku_results)
if (is.null(all_competitive_data)) {
  all_competitive_data <- data.frame()
}
Sys.time() - start_time
# Drops rows that are exact duplicates in the input.
all_competitive_data <- unique(all_competitive_data)

有沒有一種方法可以使用dplyr快速完成此操作?

+0

爲什麼不通過產品編號和日期合併這兩個數據集,然後比較這兩個價格列 – rawr

+0

因爲 crawl_date 不一定與 date 對應。請查看我代碼中的 if 語句。 – BlackHat

+0

所以你是要選取最接近的那個日期的價格;那麼在合併之後,使用「末次觀測值結轉」(last observation carried forward)函數來填補 NA 即可 – rawr

回答

3
# Sample competitor prices, one row per product / crawl date / competitor.
competitor_data <- data.frame(
  productId = c(
    "banana", "banana", "banana", "banana", "banana", "banana",
    "fig", "fig", "fig", "fig", "fig", "fig"
  ),
  crawl_date = c(
    "2014-04-05", "2014-04-22", "2014-05-05",
    "2014-05-22", "2014-06-05", "2014-06-22",
    "2014-05-08", "2014-06-17", "2014-06-09",
    "2014-06-14", "2014-07-01", "2014-08-04"
  ),
  competitor = c(
    "amazon", "apple", "google", "facebook", "alibaba", "tencent",
    "ebay", "bestbuy", "gamespot", "louis vuitton", "gucci", "tesla"
  ),
  competitor_price = c(
    2.5, 2.35, 1.99, 2.01, 2.22, 2.52,
    5.32, 5.56, 5.01, 6.01, 5.86, 5.96
  ),
  stringsAsFactors = FALSE
)
# Parse the ISO date strings so they compare chronologically.
competitor_data[["crawl_date"]] <- as.Date(competitor_data[["crawl_date"]])

# Sample price history, one row per product and price-change date.
product_price <- data.frame(
  productId = c(
    "banana", "banana", "banana", "banana", "banana", "banana",
    "fig", "fig", "fig", "fig", "fig", "fig"
  ),
  date = c(
    "2014-05-05", "2014-06-22", "2014-07-05",
    "2014-08-31", "2014-05-03", "2014-02-22",
    "2014-05-21", "2014-06-19", "2014-03-09",
    "2014-06-22", "2014-07-03", "2014-09-08"
  ),
  price = c(
    2.12, 2.31, 2.29, 2.01, 2.04, 2.09,
    5.22, 5.36, 5.21, 5.91, 5.36, 5.56
  ),
  stringsAsFactors = FALSE
)
product_price[["date"]] <- as.Date(product_price[["date"]])

使用下面這個函數,對向量中的 NA 先向前、再向後進行填補:

## fill in NAs 
# Fill NAs in a vector: carry the last observation forward, then fill any
# remaining leading NAs backward from the first observation.
#
# `lead` is prepended before filling and the same number of elements is then
# dropped from the END of the result. With the default `lead = NA` each
# position therefore receives the last value observed strictly before it;
# positions with nothing before them take the first observed value.
# Pass `lead = NULL` for a plain forward-then-backward fill.
#
# ...   values to fill, combined with c().
# lead  padding prepended to the values (default NA); NULL disables the shift.
#
# Returns a vector as long as the combined `...` input. An all-NA input
# comes back as all NA rather than being shortened.
#
# Example: f(NA, 1, NA, 2, NA, NA, lead = NULL) gives 1 1 1 2 2 2
f <- function(..., lead = NA) {
  x <- c(lead, c(...))
  n <- length(x)
  observed <- !is.na(x)

  # Index of the latest observed element at or before each position
  # (0 where nothing has been observed yet).
  last_seen <- cummax(seq_len(n) * observed)

  # Leading gaps take the first observed element; this is NA when the whole
  # vector is missing, which leaves those positions NA.
  last_seen[last_seen == 0L] <- which(observed)[1]

  filled <- x[last_seen]
  head(filled, if (is.null(lead)) n else -length(lead))
}

按產品和日期合併兩者。對每個產品,我們先在競爭對手價格向量前面墊一個額外的 NA,這樣在填補 NA 時,實際使用的就是前一次(嚴格早於該日期)的價格。

然後把價格與競爭對手的價格進行比較。最後一步只是做一些清理,以證明結果與原來的循環相同。

# Build one timeline per product holding both our price changes and the
# competitor crawls: a full outer join on product and date. With all = TRUE
# merge() returns the rows sorted by the key columns, so each product's rows
# are in date order.
dd <- merge(product_price, competitor_data,
            by.x = c("productId", "date"),
            by.y = c("productId", "crawl_date"), all = TRUE)

# Fill competitor_price within each product using f(). ave() writes every
# group's result back onto that group's own rows, so this works whatever the
# group sizes are; sapply() would simplify equally sized groups into a matrix
# and the column assignment would then fail.
dd$competitor_price <- ave(dd$competitor_price, dd$productId, FUN = f)

# 1 when our price is at or below the reference competitor price, else 0.
dd$price_leader <- +(dd$price <= dd$competitor_price)

# Rows that came only from competitor_data have no price and therefore an NA
# flag; drop them. The competitor name column is dropped by name.
(res1 <- `rownames<-`(
  dd[!is.na(dd$price_leader), names(dd) != "competitor"], NULL
))

# productId  date price competitor_price price_leader 
# 1  banana 2014-02-22 2.09    2.50   1 
# 2  banana 2014-05-03 2.04    2.35   1 
# 3  banana 2014-05-05 2.12    2.35   1 
# 4  banana 2014-06-22 2.31    2.22   0 
# 5  banana 2014-07-05 2.29    2.52   1 
# 6  banana 2014-08-31 2.01    2.52   1 
# 7  fig 2014-03-09 5.21    5.32   1 
# 8  fig 2014-05-21 5.22    5.32   1 
# 9  fig 2014-06-19 5.36    5.56   1 
# 10  fig 2014-06-22 5.91    5.56   0 
# 11  fig 2014-07-03 5.36    5.86   1 
# 12  fig 2014-09-08 5.56    5.96   1 

# Sort the loop-based result by product and date and reset its row names so
# it can be compared with res1 row by row.
res0 <- all_competitive_data[
  with(all_competitive_data, order(productId, date)),
]
rownames(res0) <- NULL

all.equal(res0, res1)
# [1] TRUE

您可以把這些步驟改寫成 dplyr 或 data.table 的語法;這兩個我都不常用,不過改寫起來應該很直接:

library('dplyr') 
# Full outer join of our price changes with the competitor crawls (our `date`
# is matched against their `crawl_date`), then sort each product by date so
# that f() sees the observations in chronological order.
dd <- product_price %>%
  full_join(
    competitor_data,
    by = c("productId" = "productId", "date" = "crawl_date")
  ) %>%
  arrange(productId, date)

# Per product: fill the competitor price with f(), flag price leadership, then
# keep only the rows that have a flag and drop the competitor name.
dd %>%
  group_by(productId) %>%
  mutate(competitor_price = f(competitor_price)) %>%
  mutate(price_leader = as.integer(price <= competitor_price)) %>%
  filter(!is.na(price_leader)) %>%
  select(-competitor)

# Source: local data frame [12 x 5] 
# Groups: productId [2] 
# 
#  productId  date price competitor_price price_leader 
#   <chr>  <date> <dbl>   <dbl>  <int> 
# 1  banana 2014-02-22 2.09    2.50   1 
# 2  banana 2014-05-03 2.04    2.35   1 
# 3  banana 2014-05-05 2.12    2.35   1 
# 4  banana 2014-06-22 2.31    2.22   0 
# 5  banana 2014-07-05 2.29    2.52   1 
# 6  banana 2014-08-31 2.01    2.52   1 
# 7  fig 2014-03-09 5.21    5.32   1 
# 8  fig 2014-05-21 5.22    5.32   1 
# 9  fig 2014-06-19 5.36    5.56   1 
# 10  fig 2014-06-22 5.91    5.56   0 
# 11  fig 2014-07-03 5.36    5.86   1 
# 12  fig 2014-09-08 5.56    5.96   1 
0

以下解決方案使用 dplyr 的 join 來進行匹配。 (注:我把「crawl_date」改名爲「date」,這樣 dplyr 的 join 會自動選擇匹配的列;也可以使用類似下面的寫法來明確指定匹配的列

# Explicit key mapping for the join: left-hand column = right-hand column.
by = c('productId' = 'productId', 'date' = 'crawl_date')

作爲 join 的參數)

# Sample competitor prices, one row per product / crawl date / competitor.
# The crawl date column is named `date` here (not `crawl_date`) so that it
# shares its name with the date column of product_price.
competitor_data <- data.frame(
  productId = c(
    "banana", "banana", "banana", "banana", "banana", "banana",
    "fig", "fig", "fig", "fig", "fig", "fig"
  ),
  date = c(
    "2014-04-05", "2014-04-22", "2014-05-05",
    "2014-05-22", "2014-06-05", "2014-06-22",
    "2014-05-08", "2014-06-17", "2014-06-09",
    "2014-06-14", "2014-07-01", "2014-08-04"
  ),
  # "gamespot" had stray markdown ("**strong text**") pasted into the literal.
  competitor = c(
    "amazon", "apple", "google", "facebook", "alibaba", "tencent",
    "ebay", "bestbuy", "gamespot", "louis vuitton", "gucci", "tesla"
  ),
  competitor_price = c(
    2.5, 2.35, 1.99, 2.01, 2.22, 2.52,
    5.32, 5.56, 5.01, 6.01, 5.86, 5.96
  ),
  stringsAsFactors = FALSE
)

# Parse the ISO date strings so they compare chronologically.
competitor_data$date <- as.Date(competitor_data$date)

# Sample price history, one row per product and price-change date.
product_price <- data.frame(
  productId = c(
    "banana", "banana", "banana", "banana", "banana", "banana",
    "fig", "fig", "fig", "fig", "fig", "fig"
  ),
  date = c(
    "2014-05-05", "2014-06-22", "2014-07-05",
    "2014-08-31", "2014-05-03", "2014-02-22",
    "2014-05-21", "2014-06-19", "2014-03-09",
    "2014-06-22", "2014-07-03", "2014-09-08"
  ),
  price = c(
    2.12, 2.31, 2.29, 2.01, 2.04, 2.09,
    5.22, 5.36, 5.21, 5.91, 5.36, 5.56
  ),
  stringsAsFactors = FALSE
)

product_price$date <- as.Date(product_price$date)

# library() stops with an error when dplyr is missing; require() would only
# return FALSE and let the script fail later at the first dplyr call.
library(dplyr)

# Exact-match left join on product and date: a price change picks up a
# competitor price only when a crawl exists on that very date. The keys are
# spelled out instead of relying on the natural join over shared column names.
joined <- product_price %>%
  left_join(competitor_data, by = c("productId", "date"))

# 1 when our price is at or below the competitor's, 0 otherwise; NA where no
# crawl matched the date.
joined$leader <- as.integer(joined$price <= joined$competitor_price)

joined

所得數據幀是

productId  date price competitor competitor_price leader 
1  banana 2014-05-05 2.12  google    1.99  0 
2  banana 2014-06-22 2.31 tencent    2.52  1 
3  banana 2014-07-05 2.29  <NA>    NA  NA 
4  banana 2014-08-31 2.01  <NA>    NA  NA 
5  banana 2014-05-03 2.04  <NA>    NA  NA 
6  banana 2014-02-22 2.09  <NA>    NA  NA 
7  fig 2014-05-21 5.22  <NA>    NA  NA 
8  fig 2014-06-19 5.36  <NA>    NA  NA 
9  fig 2014-03-09 5.21  <NA>    NA  NA 
10  fig 2014-06-22 5.91  <NA>    NA  NA 
11  fig 2014-07-03 5.36  <NA>    NA  NA 
12  fig 2014-09-08 5.56  <NA>    NA  NA 
> 
+0

這裏缺少的正是我 if 語句中的邏輯。date 和 crawl_date 不一定相同。對於給定的 date,我們採用最接近的 crawl_date(即早於該 date 的 crawl_date)。請看我的 if 語句,邏輯都寫在裏面。 – BlackHat