2017-07-16 170 views
1

我有下述 R data.table,想知道如何依據一個不在該 data.table 中的欄位,按行擴展這個 data.table:

library(data.table) 
DT <- fread('unique_point biased data_points team groupID                           
up1   FALSE  3    1  xy28352                             
up1   TRUE  4    22  xy28352                             
up2   FALSE  1    4  xy28352                             
up2   TRUE  0    3  xy28352                             
up3   FALSE  12   5  xy28352                             
up3   TRUE  35   7  xy28352') 

輸出如下:

DT 
    unique_point biased data_points team groupID                           
1: up1   FALSE  3    1  xy28352                             
2: up1   TRUE  4    22  xy28352                             
3: up2   FALSE  1    4  xy28352                             
4: up2   TRUE  0    3  xy28352                             
5: up3   FALSE  12   5  xy28352                             
6: up3   TRUE  35   7  xy28352 
.... 

目前,每個 unique_point 有兩行,biased 分別為 TRUE 和 FALSE。我想擴展 DT,使每個 unique_point 有 6 行,格式如下:

unique_point biased type data_points team groupID                           
1: up1   FALSE  A  3    1  xy28352                             
2: up1   TRUE  A  4    22  xy28352                             
3: up1   FALSE  B  0    1  xy28352                             
4: up1   TRUE  B  0    22  xy28352                             
5: up1   FALSE  C  0    1  xy28352                             
6: up1   TRUE  C  0    22  xy28352 
7: up2   FALSE  A  1    4  xy28352 
... 

也就是說,對於每個 unique_point,type 為 A、B、C 時各有一組 biased 為 FALSE/TRUE 的行。

我開始用下面的代碼:

> DT2 <- DT[, .SD[CJ(type=c("A", "B", "C"), biased = biased, unique = TRUE), 
       on = .(biased, type)], by = .(unique_point)][]  

我收到以下錯誤(錯誤訊息取自我的實際代碼,其中的 variants、fused 應分別對應上面的 type、biased):

Error in `[.data.table`(.SD, CJ(variants = c("SNP", "INS", "DEL"), fused = fused, :                                                                     
    Column(s) [variants] not found in x 

所以,我用了下面的技巧:在 DT 中創建一個名爲 type 的新列,使其至少包含三個不同的值:

DT$type[2] = "A" 
DT$type[4] = "B" 
DT$type[6] = "C" 

這樣一來,上面的代碼就可以運行了。

不使用這個「技巧」,按 type 類別 A、B、C 擴展 DT 的正確方法是什麼?我現在的做法並不標準,可能會讓閱讀代碼的第三方感到困惑。我的目標也包括瞭解爲什麼我最初的嘗試不起作用。

編輯:其實,我認爲結果的維度是錯的,我的解決方案有錯誤。

+0

我假設你正在尋找一個 `data.table` 專用的解決方案? – CPak

+0

@ChiPak 是的,不過我也可以使用 `data.frame` – ShanZhengYang

回答

1

我會嘗試:

# Expand DT so that every (unique_point, biased) row appears once per type.
# CJ() builds the cross product of the three type labels with the distinct
# biased / unique_point values; joining DT onto that grid with `on =` repeats
# each original row for every type. nomatch = 0 drops grid combinations that
# have no matching row in DT.
DT2 <- DT[CJ(type = LETTERS[1:3], biased = biased, unique_point = unique_point, unique = TRUE),
    on = .(unique_point, biased), nomatch = 0]

# The join copies data_points onto every type. Only the original rows
# (type "A") should keep their value; the rows added for B and C are 0 in the
# requested output.
DT2[type != "A", data_points := 0L]

# Sort rows and arrange columns to match the requested layout (both by reference).
setorder(DT2, unique_point, type, biased)
setcolorder(DT2, c("unique_point", "biased", "type", "data_points", "team", "groupID"))

#  unique_point biased type data_points team groupID
# 1:   up1 FALSE A   3 1 xy28352
# 2:   up1 TRUE A   4 22 xy28352
# 3:   up1 FALSE B   0 1 xy28352
# 4:   up1 TRUE B   0 22 xy28352
# 5:   up1 FALSE C   0 1 xy28352
# 6:   up1 TRUE C   0 22 xy28352
# 7:   up2 FALSE A   1 4 xy28352
# 8:   up2 TRUE A   0 3 xy28352
# 9:   up2 FALSE B   0 4 xy28352
# 10:   up2 TRUE B   0 3 xy28352
# 11:   up2 FALSE C   0 4 xy28352
# 12:   up2 TRUE C   0 3 xy28352
# 13:   up3 FALSE A   12 5 xy28352
# 14:   up3 TRUE A   35 7 xy28352
# 15:   up3 FALSE B   0 5 xy28352
# 16:   up3 TRUE B   0 7 xy28352
# 17:   up3 FALSE C   0 5 xy28352
# 18:   up3 TRUE C   0 7 xy28352
1

可以考慮使用一個輔助的數據框或 data.table(abc_DT),與主表進行交叉連接(cross join)。此外,使用條件函數 ifelse 填充擴展行的 data_points 列。

data.table

# Helper table: one row per type label, plus the value (0) that the newly
# created rows receive in data_points.
abc_DT <- data.table(type = c("A", "B", "C"), data_points_ = 0)

# CROSS JOIN
# The two tables share no real key, so both get a constant column k = 1 and
# are joined on it; allow.cartesian = TRUE permits the many-to-many result.
main_keyed <- DT[, c(k = 1, .SD)]
setkey(main_keyed, k)
types_keyed <- abc_DT[, c(k = 1, .SD)]
DT2 <- main_keyed[types_keyed, allow.cartesian = TRUE]
DT2[, k := NULL]

# RE-ORDER ROWS
DT2 <- DT2[order(unique_point, type, biased)]

# CONDITIONAL ASSIGNMENT
# Type "A" rows keep their own data_points; all other rows take the helper value.
DT2[, data_points := ifelse(type == "A", data_points, data_points_)]
DT2[, data_points_ := NULL]

# RE-ORDER COLS
setcolorder(DT2, c("unique_point", "biased", "type", "data_points", "team", "groupID"))
DT2
#  unique_point biased type data_points team groupID 
# 1:   up1 FALSE A   3 1 xy28352 
# 2:   up1 TRUE A   4 22 xy28352 
# 3:   up1 FALSE B   0 1 xy28352 
# 4:   up1 TRUE B   0 22 xy28352 
# 5:   up1 FALSE C   0 1 xy28352 
# 6:   up1 TRUE C   0 22 xy28352 
# 7:   up2 FALSE A   1 4 xy28352 
# ... 

基礎R

# Base-R version of the expansion.
# `df` is the question's table as a plain data.frame. It is built here so the
# snippet runs on its own (when DT is already loaded, as.data.frame(DT) gives
# the same columns).
df <- read.table(header = TRUE, stringsAsFactors = FALSE, text = "
unique_point biased data_points team groupID
up1 FALSE 3 1 xy28352
up1 TRUE 4 22 xy28352
up2 FALSE 1 4 xy28352
up2 TRUE 0 3 xy28352
up3 FALSE 12 5 xy28352
up3 TRUE 35 7 xy28352")

# Helper table: one row per type label, plus the value (0) that the newly
# created rows receive in data_points.
abc_df <- data.frame(type = LETTERS[1:3], data_points_ = 0,
                     stringsAsFactors = FALSE)

# CROSS JOIN
# by = NULL asks merge() for the Cartesian product explicitly instead of
# relying on the two tables happening to share no column names.
df2 <- merge(df, abc_df, by = NULL)
# RE-ORDER ROWS
df2 <- df2[order(df2$unique_point, df2$type, df2$biased), ]
row.names(df2) <- NULL

# CONDITIONAL ASSIGNMENT
# Type "A" rows keep their own data_points; all other rows take the helper value.
df2$data_points <- ifelse(df2$type == "A", df2$data_points, df2$data_points_)
# SUBSET AND RE-ORDER COLS
df2 <- df2[c("unique_point", "biased", "type", "data_points", "team", "groupID")]
df2
# unique_point biased type data_points team groupID 
# 1   up1 FALSE A   3 1 xy28352 
# 2   up1 TRUE A   4 22 xy28352 
# 3   up1 FALSE B   0 1 xy28352 
# 4   up1 TRUE B   0 22 xy28352 
# 5   up1 FALSE C   0 1 xy28352 
# 6   up1 TRUE C   0 22 xy28352 
# 7   up2 FALSE A   1 4 xy28352 
# ...