這看起來非常類似於正常的整形操作,除了一些特殊情況需要幾行代碼實現基地R.
首先,參考和比較的目的,這裏就是極簡reshape()
調用產生:
df <- data.frame(Destination=c('A','A','B','B','C','C'),User=c('a','b','c','a','b','d'),User_Price=c(5L,4L,6L,5L,4L,7L),stringsAsFactors=F);
reshape(df,dir='w',idvar='User',timevar='Destination');
## User User_Price.A User_Price.B User_Price.C
## 1 a 5 5 NA
## 2 b 4 NA 4
## 3 c NA 6 NA
## 6 d NA NA 7
顯然,還有之前我們可以在需要的到達必須解決的幾個問題輸出:
- 我們必須從多個加寬列計算所需的單數
User_Price
列。
- 我們必須以0
- 更換NA價格我們必須更換非NA價格與1
- 我們必須修復的列名省略
User_Price.
前綴。
下面是一個完整的解決方案,使用df
從上面:
res <- reshape(df,dir='w',idvar='User',timevar='Destination');
pre <- '^User_Price\\.';
cis <- grep(pre,names(res));
res$User_Price <- do.call(pmax,c(res[cis],na.rm=T));
names(res)[cis] <- sub(pre,'',names(res)[cis]);
nas <- is.na(res[cis]);
res[cis][nas] <- 0;
res[cis][!nas] <- 1;
res;
User A B C User_Price
1 a 1 1 0 5
2 b 1 0 1 4
3 c 0 1 0 6
6 d 0 0 1 7
標杆
library(microbenchmark);
library(dplyr);
library(tidyr);
library(data.table);
bgoldst <- function(df) { res <- reshape(df,dir='w',idvar='User',timevar='Destination'); pre <- '^User_Price\\.'; cis <- grep(pre,names(res)); res$User_Price <- do.call(pmax,c(res[cis],na.rm=T)); names(res)[cis] <- sub(pre,'',names(res)[cis]); nas <- is.na(res[cis]); res[cis][nas] <- 0; res[cis][!nas] <- 1; res; };
thelatemail <- function(df) { x <- table(df[,c('User','Destination')]); data.frame(User=rownames(x),User_Price=df[match(rownames(x),df$User),'User_Price'],unclass(x)); };
jazzurro <- function(foo) { count(foo, User, User_Price, Destination) %>% spread(key = Destination, value = n, fill = 0); };
akrun <- function(foo) dcast(setDT(foo),User + User_Price ~ Destination, length, value.var="Destination");
## OP's test case
df <- data.frame(Destination=c('A','A','B','B','C','C'),User=c('a','b','c','a','b','d'),User_Price=c(5L,4L,6L,5L,4L,7L));
dt <- as.data.table(df);
ex <- bgoldst(df); o <- names(ex); us <- ex$User;
all.equal(ex,thelatemail(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,jazzurro(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F);
## [1] TRUE
microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt));
## Unit: microseconds
## expr min lq mean median uq max neval
## bgoldst(df) 1767.488 1897.281 2021.7741 1943.894 2035.6260 5227.196 100
## thelatemail(df) 473.412 536.063 574.4233 578.186 608.1225 738.129 100
## jazzurro(df) 2707.468 2914.666 3145.7258 3032.270 3160.3515 5677.514 100
## akrun(dt) 4403.964 4721.069 5026.5023 4875.238 5028.1230 7703.303 100
## scale test
set.seed(1L);
ND <- 1e3L; NU <- 1e3L; NR <- 1e4L;
dests <- sample(make.unique(rep(LETTERS,len=ND)),NR,T);
us <- make.unique(rep(letters,len=NU));
users <- ave(dests,dests,FUN=function(x) sample(us,length(x)));
prices <- ave(seq_along(users),users,FUN=function(x) rep(sample(1:9,1L),len=length(x)));
df <- data.frame(Destination=dests,User=users,User_Price=prices);
dt <- as.data.table(df);
ex <- bgoldst(df); o <- names(ex); us <- ex$User;
all.equal(ex,thelatemail(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,jazzurro(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F);
## [1] TRUE
microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt),times=10L);
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst(df) 1381.46461 1418.13922 1445.20568 1437.82683 1474.79075 1538.37153 10
## thelatemail(df) 31.84727 37.56498 57.47417 44.54106 82.39749 92.63933 10
## jazzurro(df) 79.18924 91.20755 117.20360 126.22693 136.13885 168.26623 10
## akrun(dt) 52.06625 59.02158 79.59568 70.09136 106.93019 130.31208 10
## scale test 2
set.seed(1L);
ND <- 1e4L; NU <- 1e4L; NR <- 1e6L;
dests <- sample(make.unique(rep(LETTERS,len=ND)),NR,T);
us <- make.unique(rep(letters,len=NU));
users <- ave(dests,dests,FUN=function(x) sample(us,length(x)));
prices <- ave(seq_along(users),users,FUN=function(x) rep(sample(1:9,1L),len=length(x)));
df <- data.frame(Destination=dests,User=users,User_Price=prices);
dt <- as.data.table(df);
ex <- bgoldst(df); o <- names(ex); us <- ex$User;
all.equal(ex,thelatemail(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,jazzurro(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F);
## [1] TRUE
microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt),times=1L);
## Unit: seconds
## expr min lq mean median uq max neval
## bgoldst(df) 485.849043 485.849043 485.849043 485.849043 485.849043 485.849043 1
## thelatemail(df) 3.377981 3.377981 3.377981 3.377981 3.377981 3.377981 1
## jazzurro(df) 12.858542 12.858542 12.858542 12.858542 12.858542 12.858542 1
## akrun(dt) 4.132785 4.132785 4.132785 4.132785 4.132785 4.132785 1
本質'表(富[C( 「用戶」, 「目的地」))'如果你只是希望以此作爲一個彙總表。 – thelatemail
相關問題 - 附帶相關鏈接:http://stackoverflow.com/questions/11659128/how-to-use-cast-or-another-function-to-create-a-binary-table-in- r/11659636 – thelatemail