2015-10-15 66 views
3

在這種情況下,我們的大數據集如下所示(問題標題:使用另一個數據集中兩列的字符串對大數據集取子集):

# Example "big" data set: a data.frame with 32 rows and the columns listed in
# .Names (Car, Name, disp, hp, drat, wt, qsec, vs, am, gear, carb).
# NOTE(review): the expression is not assigned to a name here; the snippets
# further down presumably refer to it as `df` -- assign it before running them.
structure(list(Car = c("Mazda RX4", "Maserati Bora", "Leticia", 
         "Hornet 4 Drive", "Hornet Sportabout", "Alex", "Duster 360", 
         "Merc 240D", "Merc 230", "Merc 280", "Merc 280C", "Merc 450SE", 
         "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", "Lincoln Continental", 
         "Chrysler Imperial", "Fiat 128", "Honda Civic", "Toyota Corolla", 
         "Toyota Corona", "Datsun 710", "AMC Javelin", "Camaro Z28", 
         "Datsun 710", "Fiat X1-9", "Mazda RX4", "Lotus Europa", 
         "Ford Pantera L", "Ferrari Dino", "Mazda RX4 Wag", "Volvo 142E" 
), Name = c("Mark", "Random", "Datsun 710", "Trevor", "Joanna", 
      "Valiant", "Random", "Random", "Random", "Random", "Random", 
      "Random", "Random", "Random", "Random", "Random", "Random", "Random", 
      "Random", "Trevor", "Random", "Random", "Random", "Random", "Random", 
      "Random", "Mazda RX4", "Random", "Alex", "Random", "John", "Random" 
), disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8, 
      167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7, 
      71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145, 301, 
      121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 
         180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150, 150, 245, 
         175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9, 3.9, 3.85, 
                     3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 3.07, 3.07, 3.07, 
                     2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76, 3.15, 3.73, 3.08, 
                     4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11), wt = c(2.62, 2.875, 
                                  2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 3.15, 3.44, 3.44, 4.07, 
                                  3.73, 3.78, 5.25, 5.424, 5.345, 2.2, 1.615, 1.835, 2.465, 3.52, 
                                  3.435, 3.84, 3.845, 1.935, 2.14, 1.513, 3.17, 2.77, 3.57, 2.78 
                     ), qsec = c(16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 
                        20, 22.9, 18.3, 18.9, 17.4, 17.6, 18, 17.98, 17.82, 17.42, 19.47, 
                        18.52, 19.9, 20.01, 16.87, 17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 
                        14.5, 15.5, 14.6, 18.6), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 
                                1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 
                                1), am = c(1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                                   1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 
                                                 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 
                                                 3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4, 2, 
                                                          2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1, 2, 2, 4, 
                                                          6, 8, 2)), .Names = c("Car", "Name", "disp", "hp", "drat", "wt", 
                                                                "qsec", "vs", "am", "gear", "carb"), row.names = c(NA, -32L), class = "data.frame") 

我想通過提取其中的某些行來對這個數據集取子集。要提取的行存儲在另一個數據框中:

# Console transcript: dput() output of `list_save`, a 6-row data.frame with the
# character columns Car and Name that identify the rows to extract.
> dput(list_save) 
structure(list(Car = c("Mazda RX4", "Mazda RX4 Wag", "Datsun 710", 
"Hornet 4 Drive", "Hornet Sportabout", "Valiant"), Name = c("Mark", 
"John", "Leticia", "Trevor", "Joanna", "Alex")), .Names = c("Car", 
"Name"), class = "data.frame", row.names = c(NA, -6L)) 

請留意 list_save:其中某些字符串在 df 中可能出現在另一列(即 Car 與 Name 對調),但這些行同樣必須被提取出來。

所需的輸出應該是這樣的:

   Car  Name disp hp drat wt qsec vs am gear carb 
1   Mazda RX4  Mark 160 110 3.90 2.620 16.46 0 1 4 4 
2  Mazda RX4 Wag  John 301 335 3.54 3.570 14.60 0 1 5 8 
3   Leticia Datsun 710 108 93 3.85 2.320 18.61 1 1 4 1 
4 Hornet 4 Drive  Trevor 258 110 3.08 3.215 19.44 1 0 3 1 
5 Hornet Sportabout  Joanna 360 175 3.15 3.440 17.02 0 0 3 2 
6    Alex Valiant 225 105 2.76 3.460 20.22 1 0 3 1 

我想找到一個類似下面這樣的函數:

# Attempt from the question: select rows of df whose first two columns appear
# in list_save.
# NOTE(review): `%in%` applied to data frames appears to compare them
# column-wise as lists rather than row by row, so this likely does not pick
# the wanted rows -- confirm against the `match` documentation.
test <- df[df[,1:2] %in% list_save, ] 

回答

2

我會用 data.table 做兩次二分連接(binary join):一次讓 Car 和 Name 各自與同名列匹配,一次讓它們互相交叉匹配,然後把結果合併起來。這裡需要使用 CRAN 上的最新版本(v 1.9.6+)。

library(data.table) # v 1.9.6+ 

# Turn df into a data.table in place so that `on =` joins can be used.
setDT(df)

# Key mappings for the two joins: columns matched to their namesakes, and
# columns matched crosswise (df's Name against list_save's Car and vice versa).
direct_keys <- c("Car", "Name")
crossed_keys <- c(Name = "Car", Car = "Name")

# One result row per list_save row; rows without a direct match are the ones
# picked up by the is.na(disp) step below.
res <- df[list_save, on = direct_keys]

# Only the rows of df that match list_save with Car and Name swapped.
res2 <- df[list_save, on = crossed_keys, nomatch = 0L]

# Overwrite the unmatched rows of res with the crossed matches.
# NOTE(review): this presumes the NA rows of res line up one-to-one, in order,
# with the rows of res2 -- confirm before using on real data.
res[is.na(disp), (names(res)) := res2]
#     Car  Name disp hp drat wt qsec vs am gear carb 
# 1:   Mazda RX4  Mark 160 110 3.90 2.620 16.46 0 1 4 4 
# 2:  Mazda RX4 Wag  John 301 335 3.54 3.570 14.60 0 1 5 8 
# 3:   Leticia Datsun 710 108 93 3.85 2.320 18.61 1 1 4 1 
# 4: Hornet 4 Drive  Trevor 258 110 3.08 3.215 19.44 1 0 3 1 
# 5: Hornet Sportabout  Joanna 360 175 3.15 3.440 17.02 0 0 3 2 
# 6:    Alex Valiant 225 105 2.76 3.460 20.22 1 0 3 1 

另外,更安全的做法是只用 rbind 合併兩次連接中匹配到的結果,但這樣一來就會失去原有的行順序:

# Safer variant: keep only the rows that really match in each join and stack
# the two results (the row order of list_save is not preserved).
setDT(df)

# Key mappings: same-named columns, then Car/Name matched crosswise.
direct_keys <- c("Car", "Name")
crossed_keys <- c(Name = "Car", Car = "Name")

res <- df[list_save, on = direct_keys, nomatch = 0L]
res2 <- df[list_save, on = crossed_keys, nomatch = 0L]

# Direct matches first, crossed matches after them.
rbind(res, res2)
#     Car  Name disp hp drat wt qsec vs am gear carb 
# 1:   Mazda RX4  Mark 160 110 3.90 2.620 16.46 0 1 4 4 
# 2:  Mazda RX4 Wag  John 301 335 3.54 3.570 14.60 0 1 5 8 
# 3: Hornet 4 Drive  Trevor 258 110 3.08 3.215 19.44 1 0 3 1 
# 4: Hornet Sportabout  Joanna 360 175 3.15 3.440 17.02 0 0 3 2 
# 5:   Leticia Datsun 710 108 93 3.85 2.320 18.61 1 1 4 1 
# 6:    Alex Valiant 225 105 2.76 3.460 20.22 1 0 3 1 
1
# Keep the rows of df whose (column 1, column 2) *pair* occurs as a row of
# list_save. Testing each column with its own `%in%` would also accept
# cross-combinations (a Car from one list_save row together with a Name from
# another), so both columns are pasted into a single key and matched once.
# `[[` is used so the columns come back as plain vectors.
# Assumes the values never contain a tab character (used as key separator).
sub_df <- df[
  paste(df[[1]], df[[2]], sep = "\t") %in%
    paste(list_save[[1]], list_save[[2]], sep = "\t"),
]

不過,你是有意把 Alex 放在 Car 列、把 Valiant 放在 Name 列的嗎?我這樣問是因為上面的代碼假設這些是錯誤的數據。如果不是這種情況,請使用:

EDITED

# Keep the rows of df whose (column 1, column 2) pair occurs as a row of
# list_save in either orientation: as (Car, Name) or swapped as (Name, Car).
# Each pair is pasted into one key so that only genuine row pairs match;
# separate `%in%` tests per column would also accept cross-combinations of
# values taken from different list_save rows.
# `[[` is used so the columns come back as plain vectors.
# Assumes the values never contain a tab character (used as key separator).
sub_df <- df[
  paste(df[[1]], df[[2]], sep = "\t") %in% c(
    paste(list_save[[1]], list_save[[2]], sep = "\t"),
    paste(list_save[[2]], list_save[[1]], sep = "\t")
  ),
]
+0

我認爲代碼有問題。如果我將它用於原始數據,它將提取包含兩列中任何字符串的每一行。 –

+0

看不到您的原始數據,所以很難判斷;這段代碼在示例數據上是可行的。 – amwill04

+0

現在請再試一次。我已經編輯了主數據集,它與原始數據相似。 –