2016-05-30 88 views
0

我有一個數據幀如下:轉換一個數據幀到矩陣守信列屬性

Destination User  User_Price 
    A   a   5 
    A   b   4 
    B   c   6 
    B   a   5 
    C   b   4 
    C   d   7 

我打算將它轉換成顯示了目的地的用戶已經達到一個矩陣如下所示:

dplyrtidyr
User User_Price A B C  
    a   5   1 1 0 
    b   4   1 0 1 
    c   6   0 1 0 
    d   7   0 0 1 
+2

本質'表(富[C( 「用戶」, 「目的地」))'如果你只是希望以此作爲一個彙總表。 – thelatemail

+0

相關問題 - 附帶相關鏈接:http://stackoverflow.com/questions/11659128/how-to-use-cast-or-another-function-to-create-a-binary-table-in- r/11659636 – thelatemail

回答

6

一種方法是:

library(dplyr) 
library(tidyr) 

count(foo, User, User_Price, Destination) %>% 
spread(key = Destination, value = n, fill = 0) 

# User User_Price  A  B  C 
# (fctr)  (int) (dbl) (dbl) (dbl) 
#1  a   5  1  1  0 
#2  b   4  1  0  1 
#3  c   6  0  1  0 
#4  d   7  0  0  1 

如果你需要一個矩陣,你可以將這個結果(數據幀)轉換爲矩陣。

DATA

foo <- structure(list(Destination = structure(c(1L, 1L, 2L, 2L, 3L, 
3L), .Label = c("A", "B", "C"), class = "factor"), User = structure(c(1L, 
2L, 3L, 1L, 2L, 4L), .Label = c("a", "b", "c", "d"), class = "factor"), 
User_Price = c(5L, 4L, 6L, 5L, 4L, 7L)), .Names = c("Destination", 
"User", "User_Price"), class = "data.frame", row.names = c(NA, 
-6L)) 
0

另一種方式來達到同樣的使用dcast。

一個< - dcast(FOO,用戶+ User_Price〜目的地,填充= 0)

後來更改值對於目的地列

2

下面是一個選項,使用data.table

library(data.table) 
dcast(setDT(foo),User + User_Price ~ Destination, length, value.var="Destination") 
# User User_Price A B C 
#1: a   5 1 1 0 
#2: b   4 1 0 1 
#3: c   6 0 1 0 
#4: d   7 0 0 1 
1

這看起來非常類似於正常的整形操作,除了一些特殊情況需要幾行代碼實現基地R.

首先,參考和比較的目的,這裏就是極簡reshape()調用產生:

df <- data.frame(Destination=c('A','A','B','B','C','C'),User=c('a','b','c','a','b','d'),User_Price=c(5L,4L,6L,5L,4L,7L),stringsAsFactors=F); 
reshape(df,dir='w',idvar='User',timevar='Destination'); 
## User User_Price.A User_Price.B User_Price.C 
## 1 a   5   5   NA 
## 2 b   4   NA   4 
## 3 c   NA   6   NA 
## 6 d   NA   NA   7 

顯然,還有之前我們可以在需要的到達必須解決的幾個問題輸出:

  • 我們必須從多個加寬列計算所需的單數User_Price列。
  • 我們必須以0
  • 更換NA價格我們必須更換非NA價格與1
  • 我們必須修復的列名省略User_Price.前綴。

下面是一個完整的解決方案,使用df從上面:

res <- reshape(df,dir='w',idvar='User',timevar='Destination'); 
pre <- '^User_Price\\.'; 
cis <- grep(pre,names(res)); 
res$User_Price <- do.call(pmax,c(res[cis],na.rm=T)); 
names(res)[cis] <- sub(pre,'',names(res)[cis]); 
nas <- is.na(res[cis]); 
res[cis][nas] <- 0; 
res[cis][!nas] <- 1; 
res; 
    User A B C User_Price 
1 a 1 1 0   5 
2 b 1 0 1   4 
3 c 0 1 0   6 
6 d 0 0 1   7 

標杆

library(microbenchmark); 
library(dplyr); 
library(tidyr); 
library(data.table); 

bgoldst <- function(df) { res <- reshape(df,dir='w',idvar='User',timevar='Destination'); pre <- '^User_Price\\.'; cis <- grep(pre,names(res)); res$User_Price <- do.call(pmax,c(res[cis],na.rm=T)); names(res)[cis] <- sub(pre,'',names(res)[cis]); nas <- is.na(res[cis]); res[cis][nas] <- 0; res[cis][!nas] <- 1; res; }; 
thelatemail <- function(df) { x <- table(df[,c('User','Destination')]); data.frame(User=rownames(x),User_Price=df[match(rownames(x),df$User),'User_Price'],unclass(x)); }; 
jazzurro <- function(foo) { count(foo, User, User_Price, Destination) %>% spread(key = Destination, value = n, fill = 0); }; 
akrun <- function(foo) dcast(setDT(foo),User + User_Price ~ Destination, length, value.var="Destination"); 

## OP's test case 
df <- data.frame(Destination=c('A','A','B','B','C','C'),User=c('a','b','c','a','b','d'),User_Price=c(5L,4L,6L,5L,4L,7L)); 
dt <- as.data.table(df); 

ex <- bgoldst(df); o <- names(ex); us <- ex$User; 
all.equal(ex,thelatemail(df)[us,o],check.attributes=F); 
## [1] TRUE 
all.equal(ex,jazzurro(df)[us,o],check.attributes=F); 
## [1] TRUE 
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F); 
## [1] TRUE 

microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt)); 
## Unit: microseconds 
##    expr  min  lq  mean median  uq  max neval 
##  bgoldst(df) 1767.488 1897.281 2021.7741 1943.894 2035.6260 5227.196 100 
## thelatemail(df) 473.412 536.063 574.4233 578.186 608.1225 738.129 100 
##  jazzurro(df) 2707.468 2914.666 3145.7258 3032.270 3160.3515 5677.514 100 
##  akrun(dt) 4403.964 4721.069 5026.5023 4875.238 5028.1230 7703.303 100 

## scale test 
set.seed(1L); 
ND <- 1e3L; NU <- 1e3L; NR <- 1e4L; 
dests <- sample(make.unique(rep(LETTERS,len=ND)),NR,T); 
us <- make.unique(rep(letters,len=NU)); 
users <- ave(dests,dests,FUN=function(x) sample(us,length(x))); 
prices <- ave(seq_along(users),users,FUN=function(x) rep(sample(1:9,1L),len=length(x))); 
df <- data.frame(Destination=dests,User=users,User_Price=prices); 
dt <- as.data.table(df); 

ex <- bgoldst(df); o <- names(ex); us <- ex$User; 
all.equal(ex,thelatemail(df)[us,o],check.attributes=F); 
## [1] TRUE 
all.equal(ex,jazzurro(df)[us,o],check.attributes=F); 
## [1] TRUE 
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F); 
## [1] TRUE 

microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt),times=10L); 
## Unit: milliseconds 
##    expr  min   lq  mean  median   uq  max neval 
##  bgoldst(df) 1381.46461 1418.13922 1445.20568 1437.82683 1474.79075 1538.37153 10 
## thelatemail(df) 31.84727 37.56498 57.47417 44.54106 82.39749 92.63933 10 
##  jazzurro(df) 79.18924 91.20755 117.20360 126.22693 136.13885 168.26623 10 
##  akrun(dt) 52.06625 59.02158 79.59568 70.09136 106.93019 130.31208 10 

## scale test 2 
set.seed(1L); 
ND <- 1e4L; NU <- 1e4L; NR <- 1e6L; 
dests <- sample(make.unique(rep(LETTERS,len=ND)),NR,T); 
us <- make.unique(rep(letters,len=NU)); 
users <- ave(dests,dests,FUN=function(x) sample(us,length(x))); 
prices <- ave(seq_along(users),users,FUN=function(x) rep(sample(1:9,1L),len=length(x))); 
df <- data.frame(Destination=dests,User=users,User_Price=prices); 
dt <- as.data.table(df); 

ex <- bgoldst(df); o <- names(ex); us <- ex$User; 
all.equal(ex,thelatemail(df)[us,o],check.attributes=F); 
## [1] TRUE 
all.equal(ex,jazzurro(df)[us,o],check.attributes=F); 
## [1] TRUE 
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F); 
## [1] TRUE 

microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt),times=1L); 
## Unit: seconds 
##    expr  min   lq  mean  median   uq  max neval 
##  bgoldst(df) 485.849043 485.849043 485.849043 485.849043 485.849043 485.849043  1 
## thelatemail(df) 3.377981 3.377981 3.377981 3.377981 3.377981 3.377981  1 
##  jazzurro(df) 12.858542 12.858542 12.858542 12.858542 12.858542 12.858542  1 
##  akrun(dt) 4.132785 4.132785 4.132785 4.132785 4.132785 4.132785  1 
+0

我不確定當行數增加到大約1e7左右時,'table'的工作效率很高。 – akrun

+0

@akrun我添加了1e6行的基準。我嘗試了1e7,但得到錯誤:表中的錯誤(df [,c(「User」,「Destination」)]):嘗試在運行'thelatemail()'時創建一個大於等於2^31個元素的表。 – bgoldst