2015-04-06 84 views
2

我有一個 data.frame,想要分割其中的行。它看起來像這樣:

# Example input: some cells pack several values into one string, joined by ";".
df <- data.frame(
  col1 = c("a", "b", "c", "d"),
  col2 = c("1", "1;2;3", "5", "3;2;5;5;3"),
  col3 = c("0", "1;1;0", "0", "0;0;1;1;0")
)

#   col1      col2      col3
# 1    a         1         0
# 2    b     1;2;3     1;1;0
# 3    c         5         0
# 4    d 3;2;5;5;3 0;0;1;1;0

換言之,某些行的部分列含有以「;」連接的多個值。在讀取 data.frame 之前,我不知道哪些列會包含連接值,但我知道,對於所有含有連接值的行,這些列都是相同的。我還知道,對於含有連接值的行,各列中連接值的數目相同(第 2 行在 col2 和 col3 中都有 3 個值,第 4 行在這兩列中都有 5 個值)。

我想要創建一個新的 data.frame,把這些連接值拆分到不同的行中。對於這些行,不含連接值的列應按連接值的數量重複其值。

產生的data.frame將是:

# Desired output: one row per ";"-separated value, with col1 repeated so that
# every piece keeps the label of the row it came from.
df <- data.frame(
  col1 = rep(c("a", "b", "c", "d"), times = c(1, 3, 1, 5)),
  col2 = c("1", "1", "2", "3", "5", "3", "2", "5", "5", "3"),
  col3 = c("0", "1", "1", "0", "0", "0", "0", "1", "1", "0")
)

#    col1 col2 col3
# 1     a    1    0
# 2     b    1    1
# 3     b    2    1
# 4     b    3    0
# 5     c    5    0
# 6     d    3    0
# 7     d    2    0
# 8     d    5    1
# 9     d    5    1
# 10    d    3    0

回答

2

這裏有一個選項

## Example input: some cells hold several values joined by ";".
## stringsAsFactors = FALSE keeps the columns as plain character vectors
## regardless of the R version's default.
df <- data.frame(
  col1 = c("a", "b", "c", "d"),
  col2 = c("1", "1;2;3", "5", "3;2;5;5;3"),
  col3 = c("0", "1;1;0", "0", "0;0;1;1;0"),
  stringsAsFactors = FALSE
)

## Expected result, used below to check the output.
df2 <- data.frame(
  col1 = c("a", "b", "b", "b", "c", "d", "d", "d", "d", "d"),
  col2 = c("1", "1", "2", "3", "5", "3", "2", "5", "5", "3"),
  col3 = c("0", "1", "1", "0", "0", "0", "0", "1", "1", "0"),
  stringsAsFactors = FALSE
)

## Split every cell of every column on a literal ";". Each column becomes a
## list holding one character vector per input row. No column is singled out,
## so it does not matter which columns carry the joined values, and values may
## be of any length or content (multi-digit numbers, letters, ...).
pieces <- lapply(df, function(x) strsplit(as.character(x), ";", fixed = TRUE))

## Number of output rows each input row expands to: the largest piece count
## found in that row across all columns.
n_out <- do.call(pmax, unname(lapply(pieces, lengths)))

## Every cell must hold either a single value (to be repeated) or exactly as
## many values as the widest cell of its row; anything else would be recycled
## silently and misalign the columns.
consistent <- vapply(
  pieces,
  function(col) all(lengths(col) <= 1L | lengths(col) == n_out),
  logical(1)
)
stopifnot(all(consistent))

## Repeat single values up to the row's piece count, leave split values as
## they are, then flatten each column and rebuild the data frame.
expanded <- lapply(pieces, function(col) {
  unlist(Map(rep_len, col, n_out), use.names = FALSE)
})
result <- data.frame(expanded, stringsAsFactors = FALSE, check.names = FALSE)

stopifnot(isTRUE(all.equal(result, df2)))
result

#    col1 col2 col3
# 1     a    1    0
# 2     b    1    1
# 3     b    2    1
# 4     b    3    0
# 5     c    5    0
# 6     d    3    0
# 7     d    2    0
# 8     d    5    1
# 9     d    5    1
# 10    d    3    0
0

雖然不如 rawr 的答案那麼精巧,但也許更容易看清發生了什麼

# Example input: some cells hold several values joined by ";".
df1 <- data.frame(
  col1 = c("a", "b", "c", "d"),
  col2 = c("1", "1;2;3", "5", "3;2;5;5;3"),
  col3 = c("0", "1;1;0", "0", "0;0;1;1;0"),
  stringsAsFactors = FALSE
)

df1_rows <- nrow(df1)

# Split each column on a literal ";": one character vector per input row.
col1_split <- strsplit(df1$col1, ";", fixed = TRUE)
col2_split <- strsplit(df1$col2, ";", fixed = TRUE)
col3_split <- strsplit(df1$col3, ";", fixed = TRUE)

# Build one small data frame per input row. data.frame() recycles the single
# col1 value to the length of the split col2/col3 values.
# seq_len() yields an empty sequence for a zero-row input, whereas 1:0 would
# iterate over c(1, 0) and fail on the first subscript.
row_frames <- lapply(seq_len(df1_rows), function(n) {
  data.frame(
    col1 = col1_split[[n]],
    col2 = col2_split[[n]],
    col3 = col3_split[[n]],
    stringsAsFactors = FALSE
  )
})

# Empty frame with the target columns; seeding the bind with it means df2 has
# the right shape even when there are no rows to bind.
empty_frame <- data.frame(
  col1 = character(),
  col2 = character(),
  col3 = character(),
  stringsAsFactors = FALSE
)

# Bind everything in a single rbind() call instead of growing df2 inside a
# loop, which copies the accumulated frame on every iteration.
df2 <- do.call(rbind, c(list(empty_frame), row_frames))

其結果為:

> df2 
    col1 col2 col3 
1  a 1 0 
2  b 1 1 
3  b 2 1 
4  b 3 0 
5  c 5 0 
6  d 3 0 
7  d 2 0 
8  d 5 1 
9  d 5 1 
10 d 3 0 
1

這正是我編寫的「splitstackshape」包所針對的那類數據。您可以使用 cSplit,如下所示:

# cSplit() comes from the splitstackshape package.
library(splitstackshape)

# Split col2 and col3 on ";" and stack the pieces into rows ("long" direction).
cSplit(
  df,
  splitCols = c("col2", "col3"),
  sep = ";",
  direction = "long"
)
#     col1 col2 col3
#  1:    a    1    0
#  2:    b    1    1
#  3:    b    2    1
#  4:    b    3    0
#  5:    c    5    0
#  6:    d    3    0
#  7:    d    2    0
#  8:    d    5    1
#  9:    d    5    1
# 10:    d    3    0