2017-07-27 39 views
0

我有一個名爲 df 的數據,如下所示(df 中沒有重複行)。問題標題:R:按照一定的模式轉換數據表

a_id   b_id 

111111   18 
111111   17 
222222   18 
333333   14 
444444   13 
555555   18 
555555   24 
222222   13 
222222   17 
333333   17 

我想將它轉換爲如下的數據 df_2:

a_one  a_two  b_list number_of_b 
222222 444444  13  1 
111111 222222  17,18 2 
111111 333333  17  1 
111111 222222  17  1 
222222 333333  17  1 
111111 555555  18  1 
222222 555555  18  1 

如果a_id共享相同的b_id,則它們在df_2上成爲一對;

df_2的b_list是相應的b_id;

number_of_b 是 b_list 的長度。

我有一個Python代碼

# Turn (a_id, b_id) rows into one row per pair of a_ids that share a b_id.
# NOTE(review): assumes `df` is an existing DataFrame with columns "a_id" and
# "b_id" -- it is not defined in this snippet.
import pandas as pd 
from itertools import combinations 
# Per b_id, list every 2-combination of its a_ids; apply(pd.Series) spreads each
# list over columns and stack() folds those columns back into a single Series.
df = df.groupby("b_id").apply(lambda x: list(combinations(x["a_id"], 2))).apply(pd.Series).stack() 
# Split each (a, b) tuple into columns 0 and 1, then group on that pair and
# collect the b_id values belonging to it.
df = df.apply(pd.Series).reset_index().groupby([0,1])["b_id"].apply(lambda x:x.values).reset_index() 
df.columns = ["a_one", "a_two", "b_list"] 
# number_of_b is the number of b_ids collected for the pair.
df["number_of_b"] = df.b_list.apply(len) 

誰能幫我在 R 中實現它?

+0

你可以用 dcast 來做這件事 – akrun

+0

你能給我更具體的指導,謝謝 – kkjoe

回答

0

我的做法是有點長,但會給你預期的結果。 這裏是我的方法:

library(data.table)

# Sample input: one row per (a_id, b_id) combination.
mydf <- data.table(
  a_id = c(111111L, 111111L, 222222L, 333333L, 444444L,
           555555L, 555555L, 222222L, 222222L, 333333L),
  b_id = c(18L, 17L, 18L, 14L, 13L, 18L, 24L, 13L, 17L, 17L)
)

# Self-join on b_id so every a_id is matched with every a_id that shares the
# same b_id (a_id2 comes from the joining copy), then drop the rows that pair
# an a_id with itself.
mydf <- mydf[
  mydf,
  .(a_id, a_id2 = i.a_id, b_id),
  on = "b_id",
  allow.cartesian = TRUE
][a_id != a_id2]

# Locate, for each element of `string`, its first position in `values`.
# Positions that fall before the corresponding `current_index` are replaced
# by 0; elements with no match stay NA.
get_index <- function(string, values, current_index) {
  positions <- match(string, values)
  before_current <- positions < current_index
  positions[before_current] <- 0
  positions
}

# Build a key for every row and for its mirror image (a_id and a_id2 swapped)
# so each row can be matched against its reversed twin.
mydf[, c("first", "reverse") := .(
  paste0(a_id, ", ", a_id2, ", ", b_id),
  paste0(a_id2, ", ", a_id, ", ", b_id)
)]
# duplicate_index becomes 0 for rows whose mirror image sits earlier in the
# table; keeping only those rows retains exactly one row per pair and b_id.
mydf[, duplicate_index := get_index(first, reverse, .I)]
# Which orientation of a pair survives depends on the input row order, so the
# pair is normalised (smaller id first) before grouping. Without this the same
# pair could be split into two groups when it shares several b_ids.
mydf[
  duplicate_index == 0,
  .(b_list = list(b_id), number_of_b = .N),
  by = .(a_id = pmin(a_id, a_id2), a_id2 = pmax(a_id, a_id2))
]
# a_id a_id2 b_list number_of_b 
# 1: 111111 222222 18,17   2 
# 2: 111111 555555  18   1 
# 3: 222222 555555  18   1 
# 4: 222222 444444  13   1 
# 5: 111111 333333  17   1 
# 6: 222222 333333  17   1 
0

使用 base R ...

# Gather the sorted a_ids that share each b_id.
df2 <- tapply(df$a_id, df$b_id, sort)
# Drop b_ids held by fewer than two a_ids: they cannot form a pair.
# vapply() guarantees a logical result, unlike sapply().
df2 <- df2[vapply(df2, function(ids) length(ids) > 1, logical(1))]
# Enumerate every a_id pair within each b_id as an "a_one a_two" string;
# stack() yields one row per pair with the b_id in column `ind`.
df2 <- stack(lapply(df2, function(ids) combn(ids, 2, FUN = paste, collapse = " ")))
# Gather the b_ids of each pair into one comma-separated string.
df2 <- as.data.frame(tapply(df2$ind, df2$values, paste, collapse = ","))
names(df2) <- "b_list"
# The row names hold the "a_one a_two" keys; split them back into two columns.
df2[, c("a_one", "a_two")] <- do.call(rbind, strsplit(rownames(df2), " "))
# Count the b_ids per pair; lengths() returns a plain unnamed integer vector.
df2$number_of_b <- lengths(strsplit(df2$b_list, ","))
rownames(df2) <- NULL
# Reorder columns by name rather than by position.
df2 <- df2[, c("a_one", "a_two", "b_list", "number_of_b")]

df2 
    a_one a_two b_list number_of_b 
1 111111 222222 17,18   2 
2 111111 333333  17   1 
3 111111 555555  18   1 
4 222222 333333  17   1 
5 222222 444444  13   1 
6 222222 555555  18   1 
0

與自身合併,對列排序並刪除重複項,然後彙總:

library(dplyr)

# Self-merge on b_id to pair every a_id with every a_id sharing that b_id,
# put the smaller id first so (x, y) and (y, x) collapse to the same row,
# drop self-pairs and duplicates, then summarise the b_ids per pair.
merge(df1, df1, by = "b_id") %>%
  transmute(
    a_one = pmin(a_id.x, a_id.y),
    a_two = pmax(a_id.x, a_id.y),
    b_id
  ) %>%
  filter(a_one != a_two) %>%
  distinct() %>%
  group_by(a_one, a_two) %>%
  summarise(
    b_list = paste(b_id, collapse = ","),
    number_of_b = n()
  ) %>%
  # summarise() only peels off the last grouping level; drop the remaining
  # a_one grouping so later verbs do not run per group by accident.
  ungroup()

# # A tibble: 6 x 4 
# a_one a_two b_list number_of_b 
# <int> <int> <chr>  <int> 
# 1 111111 222222 17,18   2 
# 2 111111 333333  17   1 
# 3 111111 555555  18   1 
# 4 222222 333333  17   1 
# 5 222222 444444  13   1 
# 6 222222 555555  18   1 
相關問題