2017-09-29 16 views
4
library(tidyverse) 

# Per-species summary tables for the four numeric iris measurements.
# `iris` is the built-in dataset, so no local copy is made first.

# Mean of every non-grouping column within each species.
means <- iris %>%
    group_by(Species) %>%
    summarise_all(mean)

# Standard deviation of every non-grouping column within each species.
# stats::sd is spelled out because this table is itself named `sd`: when the
# script is re-run in the same session, a bare `sd` passed as a value would
# resolve to the table left over from the previous run.
sd <- iris %>%
    group_by(Species) %>%
    summarise_all(stats::sd)

# Lower bound (mean - sd). Column 1 of both tables is Species, columns 2:5 are
# the measurements.
bottom <- means[, 2:5] - sd[, 2:5]
# Take the labels from the summary table instead of retyping them, so they
# always follow its row order.
bottom$Species <- as.character(means$Species)
print(bottom)
    Sepal.Length Sepal.Width Petal.Length Petal.Width Species 
1  4.653510 3.048936  1.288336 0.1406144  setosa 
2  5.419829 2.456202  3.790089 1.1282473 versicolor 
3  5.952120 2.651503  5.000105 1.7513499 virginica 

# Upper bound (mean + sd) of each measurement per species; columns 2:5 of the
# two summary tables are added element-wise.
top <- means[, 2:5] + sd[, 2:5]
# Take the labels from the summary table instead of retyping them, so they
# cannot drift out of step with its row order.
top$Species <- as.character(means$Species)
print(top)
    Sepal.Length Sepal.Width Petal.Length Petal.Width Species 
1  5.358490 3.807064  1.635664 0.3513856  setosa 
2  6.452171 3.083798  4.729911 1.5237527 versicolor 
3  7.223880 3.296497  6.103895 2.3006501 virginica 

如何取得 iris 中的資料列,使其 Sepal.Length、Sepal.Width、Petal.Length 和 Petal.Width 的值都介於 top 與 bottom 兩個資料框中對應物種的值之間?(條件式非等值聯接)

例如,對於 setosa,我只想要 Sepal.Length > 4.65 & Sepal.Length < 5.35、且 Sepal.Width 介於 3.04 與 3.80 之間的資料列,其餘欄位依此類推。理想情況下,最終結果只包含 4 個數值欄位和 Species 欄位。

謝謝。

回答

4

如果你可以從一開始就直接過濾、跳過彙總(summarise)步驟,這會容易得多:

# Filter the raw rows species by species: a row is kept only when every
# numeric column lies strictly inside (mean - sd, mean + sd) of its own group.
iris %>%
    group_by(Species) %>%
    filter_if(
        is.numeric,
        all_vars(mean(.) - sd(.) < . & . < mean(.) + sd(.))
    )

# A tibble: 54 x 5 
# Groups: Species [3] 
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species 
#   <dbl>  <dbl>  <dbl>  <dbl> <fctr> 
# 1   5.1   3.5   1.4   0.2 setosa 
# 2   4.7   3.2   1.3   0.2 setosa 
# 3   5.0   3.6   1.4   0.2 setosa 
# 4   5.0   3.4   1.5   0.2 setosa 
# 5   4.8   3.4   1.6   0.2 setosa 
# 6   5.1   3.5   1.4   0.3 setosa 
# 7   5.1   3.8   1.5   0.3 setosa 
# 8   5.2   3.5   1.5   0.2 setosa 
# 9   5.2   3.4   1.4   0.2 setosa 
#10   4.7   3.2   1.6   0.2 setosa 
# ... with 44 more rows 

不確定你是否能省略彙總步驟,姑且貼出來作為一個選項。


或者使用between

# Per-species filter written with between(): every numeric column must fall
# within [mean - sd, mean + sd] of its group. between() includes both bounds.
iris %>%
    group_by(Species) %>%
    filter_if(
        is.numeric,
        all_vars(between(., left = mean(.) - sd(.), right = mean(.) + sd(.)))
    )
2

下面是使用非等值聯接(non-equi join)的解決方案,它建立在 @Frank(現已刪除)的做法之上:

library(data.table) 

# Tag every observation with its row number, then reshape from wide to long so
# each measurement becomes one (rn, Species, variable, value) row.
DT <- melt(data.table(iris)[, rn := .I], id.vars = c("rn", "Species"))

# Lower and upper bound for each Species/variable pair. The upper bound is
# derived from the lower one (lb + 2 * sd), i.e. mean + sd.
mDT <- DT[, {
    s <- sd(value)
    lb <- mean(value) - s
    list(lb = lb, ub = lb + 2 * s)
}, by = .(Species, variable)]

# Row numbers of the observations that fulfil the conditions.
# Inner step (non-equi join): positions of the DT rows whose value lies
# strictly between the bounds of their own Species/variable pair; nomatch = 0L
# skips pairs without any hit instead of returning NA positions for them.
# Outer step: an observation passes only when all uniqueN(mDT$variable) of its
# measurements were hit (avoids hardcoding the number of variables).
selected_rn <- DT[
    DT[mDT, on = .(Species, variable, value > lb, value < ub),
       which = TRUE, nomatch = 0L]
][, .N, by = rn][N == uniqueN(mDT$variable), rn]

head(iris[sort(selected_rn), ])
Sepal.Length Sepal.Width Petal.Length Petal.Width Species 
1   5.1   3.5   1.4   0.2 setosa 
3   4.7   3.2   1.3   0.2 setosa 
5   5.0   3.6   1.4   0.2 setosa 
8   5.0   3.4   1.5   0.2 setosa 
12   4.8   3.4   1.6   0.2 setosa 
18   5.1   3.5   1.4   0.3 setosa