2017-09-16 49 views
3

使用 dplyr 對數據進行彙總時,我經常需要(用 Rmisc 包的 CI 函數)計算置信區間,並希望把返回的向量展平成多個列:

# Per-group summary: mean and 95% confidence bounds of var1-var3 (each value
# pulled out of its own CI() call), plus the sum of var4. The repeated CI()
# calls are deliberate here: this is the verbose baseline being discussed.
summary <- data %>%
  group_by(group1, group2) %>%
  summarize(
    var1.mean = CI(var1, ci = 0.95)["mean"],
    var1.lower = CI(var1, ci = 0.95)["lower"],
    var1.upper = CI(var1, ci = 0.95)["upper"],

    var2.mean = CI(var2, ci = 0.95)["mean"],
    var2.lower = CI(var2, ci = 0.95)["lower"],
    # Fixed: this column was named var3.upper, which duplicated the var3
    # column below and left no var2.upper in the result.
    var2.upper = CI(var2, ci = 0.95)["upper"],

    var3.mean = CI(var3, ci = 0.95)["mean"],
    var3.lower = CI(var3, ci = 0.95)["lower"],
    var3.upper = CI(var3, ci = 0.95)["upper"],

    var4 = sum(var4)
  )

這種寫法既冗長又低效(每個變量都要調用三次 CI)。理想情況下,我希望可以寫成這樣:

# Wish-list syntax from the question: a single CI() call per variable, whose
# named result would be expanded into <var>.lower / <var>.mean / <var>.upper
# columns next to the plain sum of var4.
# NOTE(review): illustrative only; presumably this does not produce the
# flattened columns as written, since CI() returns several values per group.
summary <- data %>% 
    group_by(group1, group2) %>% 
    summarize(
    var1 = CI(var1, ci=0.95), 
    var2 = CI(var2, ci=0.95), 
    var3 = CI(var3, ci=0.95), 
    var4 = sum(var4) 
) 

對於上面的代碼,由於 CI 返回一個帶有以下名稱的向量:

  • "lower"
  • "upper"
  • "mean"

我希望我能得到與列的數據幀看起來像:

  • "group1"
  • "group2"
  • "var1.lower"
  • "var1.mean"
  • "var1.upper"
  • "var2.lower"
  • ...
  • "var3.upper"
  • "var4"

有什麼辦法可以實現嗎?R 中有沒有辦法把列「展平」?類似 do.call,但用法像 JS 或 Python 中的展開(rest/spread)語法那樣?

也許可以用 quasiquotation 做到,但這已經開始超出我的 R 水平了……

我以前在 plyr 中用過 this gist,但它在 dplyr 下已經不能用了;與其重新實現一遍,我希望有比侵入庫內部更好的辦法。

回答

3

如果先把輸出格式化爲 data.frame,我們就可以使用 tidyr::unnest

數據

library(Rmisc) 
library(dplyr) 
library(tidyr) 
# Reproducible toy data: two two-level grouping columns and four value columns
# that are each a random permutation of 1..10.
set.seed(1)
data <- data.frame(
  # `replace = TRUE` is spelled out: `T` is an ordinary variable that can be
  # reassigned, and the positional form hid which argument it was.
  group1 = sample(c("A", "B"), 10, replace = TRUE),
  group2 = sample(c("A", "B"), 10, replace = TRUE),
  var1 = sample(10),
  var2 = sample(10),
  var3 = sample(10),
  var4 = sample(10)
)

通用的解決方案

# Generic approach: store each CI as a one-row data.frame inside a list-column,
# then let unnest() expand those into <var>.<stat> columns.
data %>%
  group_by(group1, group2) %>%
  dplyr::summarise(
    var1 = list(data.frame(t(CI(var1, ci = 0.95)))),
    var2 = list(data.frame(t(CI(var2, ci = 0.95)))),
    var3 = list(data.frame(t(CI(var3, ci = 0.95)))),
    var4 = sum(var4)
  ) %>%
  unnest(var1, var2, var3, .sep = ".")

結果

# A tibble: 4 x 12 
# Groups: group1 [2] 
# group1 group2 var4 var1.upper var1.mean var1.lower var2.upper var2.mean var2.lower var3.upper var3.mean var3.lower 
# <fctr> <fctr> <int>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl> 
# 1  A  A 13 56.824819  6.0 -44.824819 11.85310 5.500000 -0.8531024 26.55931 7.500000 -11.559307 
# 2  A  B 11 38.265512  6.5 -25.265512 50.97172 6.500000 -37.9717166 25.55931 6.500000 -12.559307 
# 3  B  A 11 12.956686  4.0 -4.956686 13.65205 5.666667 -2.3187188 15.07146 5.666667 -3.738127 
# 4  B  B 20 8.484138  6.0 3.515862 14.70619 4.666667 -5.3728564 11.31872 3.333333 -4.652052 

或者與自定義CI功能(相同的輸出)

# Compute the confidence interval of `x` with CI() and return it as a one-row
# data.frame wrapped in a list, so it can be stored in a list-column.
#   x  : vector passed through to CI()
#   ci : confidence level passed through to CI() (default 0.95)
CI2 <- function(x, ci = 0.95) {
  interval <- CI(x, ci)
  one_row <- data.frame(t(interval))
  list(one_row)
}

# Same pipeline as the generic solution, with the list/data.frame wrapping
# delegated to CI2().
data %>%
  group_by(group1, group2) %>%
  dplyr::summarise(
    var1 = CI2(var1, ci = 0.95),
    var2 = CI2(var2, ci = 0.95),
    var3 = CI2(var3, ci = 0.95),
    var4 = sum(var4)
  ) %>%
  unnest(var1, var2, var3, .sep = ".")

或使用轉換器功能(相同的輸出)

這種方式也可以用於任何其他返回向量的函數

# Turn a vector into a one-row data.frame (one column per element) and wrap it
# in a list so that it fits in a single list-column cell.
#   v : vector to convert; its names, if any, become the column names
vec2rowdf <- function(v) {
  row_df <- data.frame(t(v))
  list(row_df)
}
# Same pipeline again, piping each CI() result through the vec2rowdf()
# converter before it is stored in the list-column.
data %>%
  group_by(group1, group2) %>%
  dplyr::summarise(
    var1 = CI(var1, ci = 0.95) %>% vec2rowdf(),
    var2 = CI(var2, ci = 0.95) %>% vec2rowdf(),
    var3 = CI(var3, ci = 0.95) %>% vec2rowdf(),
    var4 = sum(var4)
  ) %>%
  unnest(var1, var2, var3, .sep = ".")
1

「展平」可以由 tidyr 的 unnest 完成。彙總時先生成一個列表列,再用 unnest 展開,這樣每組的 upper/mean/lower 三個值會以長格式顯示爲 3 行:

# Summarise var1-var3 per group, wrapping each CI() result in a list, then
# unnest the three list-columns so the CI values of a group are spread over
# several rows (long format).
# NOTE(review): funs() and the `.id` argument of unnest() depend on the
# dplyr/tidyr versions in use; confirm they are still accepted.
res <- data %>% 
    group_by(group1, group2) %>% 
    summarize_at(vars(var1, var2, var3), funs(list(CI(., ci=0.95)))) %>% 
    unnest(var1, var2, var3, .id = 'name') 

然後,需要用 gather 和 spread 重新整理這個 tibble,得到所需的輸出:

# Reshape the long result into one row per group with <var>.<stat> columns.
res %>% 
    group_by(group1, group2) %>% 
    # Label the rows of each group with the statistic they hold.
    # assumes every group has exactly three rows, ordered upper, mean, lower
    # (TODO confirm against what CI() returns)
    mutate(q = c('upper', 'mean', 'lower')) %>% 
    ungroup %>% 
    # Stack var1-var3 into key/value pairs, build the "<var>.<stat>" key,
    # drop the helper column, then spread the keys back out as columns.
    gather(var, val, var1, var2, var3) %>% 
    mutate(var = paste(var, q, sep = '.')) %>% 
    select(-q) %>% 
    spread(var, val) 

這種寫法很容易改成在 summarize_* 中使用其他函數(如果要對不同的列應用不同的函數,則需要把 summarize_at 換成 summarize,並爲 var1、var2、var3 分別顯式寫出表達式)。

+0

Thanks @konvas。我現在無法測試,但我提供了更多信息,我會盡快查看。 –

+0

@QuentinRoy沒有probs,我已經更新了我認爲會給你輸出所需的答案 – konvas

+0

任何想法如何使用這種方法保持'var4'? –

1

這是另一種可能的方法:先對 var1–var4 做彙總,再逐行對結果使用 unlist:

library(dplyr) 

# Step 1: per-group summary that keeps the CI values of var1-var3 as
# list-column entries next to the plain sum of var4.
summary <- dat %>%
  group_by(group1, group2) %>%
  summarize(
    var1 = list(Rmisc::CI(var1)),
    var2 = list(Rmisc::CI(var2)),
    var3 = list(Rmisc::CI(var3)),
    var4 = sum(var4)
  ) %>%
  ungroup()

# Step 2: flatten the list-columns row by row. `FALSE` is spelled out because
# `F` is an ordinary variable that can be reassigned.
summary <- t(apply(summary, 1, unlist)) %>% # unlist by row (this results in a character matrix)
  as.data.frame(stringsAsFactors = FALSE) %>% # convert matrix back to data frame
  mutate_at(
    vars(var1.upper:var4), # convert appropriate columns back to numeric
    as.numeric
  )

> str(summary) 
'data.frame': 9 obs. of 12 variables: 
$ group1 : chr "A" "A" "A" "B" ... 
$ group2 : chr "d" "e" "f" "d" ... 
$ var1.upper: num 5.77 6.25 5.94 6.49 6.26 ... 
$ var1.mean : num 4.55 4.8 4.66 5.27 4.94 ... 
$ var1.lower: num 3.32 3.35 3.38 4.04 3.62 ... 
$ var2.upper: num 20.5 20.3 20.3 20.1 20.3 ... 
$ var2.mean : num 20.2 19.9 19.9 19.7 19.9 ... 
$ var2.lower: num 19.8 19.5 19.5 19.3 19.5 ... 
$ var3.upper: num 140 155 120 113 141 ... 
$ var3.mean : num 113.9 125.3 94.7 86.3 111.6 ... 
$ var3.lower: num 88.1 95.6 69.9 59.8 82.7 ... 
$ var4  : num 240042 205052 215986 231008 229010 ... 

樣本數據:

# Reproducible sample data: n rows, two three-level grouping columns and four
# normally distributed value columns.
set.seed(123)
n <- 2000 # `<-` for assignment rather than `=`
dat <- data.frame(
  # `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
  group1 = sample(LETTERS[1:3], n, replace = TRUE),
  group2 = sample(letters[4:6], n, replace = TRUE),
  var1 = rnorm(n, mean = 5, sd = 10),
  var2 = rnorm(n, mean = 20, sd = 3),
  var3 = rnorm(n, mean = 100, sd = 200),
  var4 = rnorm(n, mean = 1000, sd = 5)
)
+0

謝謝!這樣可行。如此簡單的事情如此複雜和難以實現.. –

1

下面是一些使用 tidyverse 工具的更簡潔的解決方案

# Reproducible toy data: two two-level grouping columns and four value columns
# that are each a random permutation of 1..10.
set.seed(1)
data <- data.frame(
  # `replace = TRUE` is spelled out: `T` is an ordinary variable that can be
  # reassigned, and the positional form hid which argument it was.
  group1 = sample(c("A", "B"), 10, replace = TRUE),
  group2 = sample(c("A", "B"), 10, replace = TRUE),
  var1 = sample(10),
  var2 = sample(10),
  var3 = sample(10),
  var4 = sample(10)
)

library(dplyr, warn.conflicts = F) 

第一種解決方案:創建函數列表,並應用於列的一個子集

使用 summarise_at 的功能

# Named set of summary functions: each one extracts a single element of the
# 95% confidence interval; the names become the suffixes of the result columns.
summarise_fun <- funs(
  mean = Rmisc::CI(., ci = 0.95)["mean"],
  lower = Rmisc::CI(., ci = 0.95)["lower"],
  upper = Rmisc::CI(., ci = 0.95)["upper"]
)

# Apply every function above to var1, var2 and var3 within each group.
summary_CI <- data %>%
  group_by(group1, group2) %>%
  summarise_at(vars(num_range("var", 1:3)), summarise_fun)
summary_CI
#> # A tibble: 4 x 11 
#> # Groups: group1 [?] 
#> group1 group2 var1_mean var2_mean var3_mean var1_lower var2_lower 
#> <fctr> <fctr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl> 
#> 1  A  A  6.0 5.500000 7.500000 -44.824819 -0.8531024 
#> 2  A  B  6.5 6.500000 6.500000 -25.265512 -37.9717166 
#> 3  B  A  4.0 5.666667 5.666667 -4.956686 -2.3187188 
#> 4  B  B  6.0 4.666667 3.333333 3.515862 -5.3728564 
#> # ... with 4 more variables: var3_lower <dbl>, var1_upper <dbl>, 
#> # var2_upper <dbl>, var3_upper <dbl> 

之後,可以把 var4 的彙總結果連接(join)上去

# Attach the per-group sum of var4 to the CI summary, matching rows on the two
# grouping columns.
inner_join(
  summary_CI,
  data %>%
    group_by(group1, group2) %>%
    summarise_at("var4", sum),
  by = c("group1", "group2")
)
#> # A tibble: 4 x 12 
#> # Groups: group1 [?] 
#> group1 group2 var1_mean var2_mean var3_mean var1_lower var2_lower 
#> <fctr> <fctr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl> 
#> 1  A  A  6.0 5.500000 7.500000 -44.824819 -0.8531024 
#> 2  A  B  6.5 6.500000 6.500000 -25.265512 -37.9717166 
#> 3  B  A  4.0 5.666667 5.666667 -4.956686 -2.3187188 
#> 4  B  B  6.0 4.666667 3.333333 3.515862 -5.3728564 
#> # ... with 5 more variables: var3_lower <dbl>, var1_upper <dbl>, 
#> # var2_upper <dbl>, var3_upper <dbl>, var4 <int> 

第二種解決方案:使用 purrr 的函數式編程,更簡潔

基本上,它在一個代碼塊中完成了解決方案 1 的工作:對由 .vars 和 .funs 參數組成的列表逐一調用 summarise_at。結果是一個 data.frame 列表,可以用 join 把它們合併在一起。

library(purrr) 
# Two parallel lists drive the computation: the first pairs the var1-var3
# selection with summarise_fun, the second pairs "var4" with sum.
lst(.vars = lst(vars(num_range("var", 1:3)), "var4"), 
    .fun = lst(summarise_fun, sum)) %>% 
     # One grouped summarise_at() call per pair; .x is the column selection
     # and .y the function(s) to apply to it.
     pmap(~ data %>% 
      group_by(group1, group2) %>% 
      summarise_at(.x, .y)) %>% 
     # Merge the per-pair summaries into one table on the grouping columns.
     reduce(inner_join, by = c("group1", "group2")) 
#> # A tibble: 4 x 12 
#> # Groups: group1 [?] 
#> group1 group2 var1_mean var2_mean var3_mean var1_lower var2_lower 
#> <fctr> <fctr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl> 
#> 1  A  A  6.0 5.500000 7.500000 -44.824819 -0.8531024 
#> 2  A  B  6.5 6.500000 6.500000 -25.265512 -37.9717166 
#> 3  B  A  4.0 5.666667 5.666667 -4.956686 -2.3187188 
#> 4  B  B  6.0 4.666667 3.333333 3.515862 -5.3728564 
#> # ... with 5 more variables: var3_lower <dbl>, var1_upper <dbl>, 
#> # var2_upper <dbl>, var3_upper <dbl>, var4 <int>