2016-04-30 66 views
0

我正在尋找一種方法來爲函數內的不同變量使用不同的比例。在R中使用矢量來更改函數的輸出

這是問題的一個後續從A simpler way to achieve a frequency count with mean, sum, length and sd in R

鑑於

# create the summary function 
    summaryStatistics <- function(x,levels) { 
     xx <- na.omit(x) 
     c(table(factor(x, levels=levels), useNA='always', exclude=NULL), 
      sum=sum(xx), 
      length=length(x), 
      mean=mean(xx), 
      standard.deviation=sqrt(var(xx)), 
      var=(var(xx)), 
      median=median(xx), 
      min=min(xx), 
      max=max(xx), 
      quantile=quantile(xx), 
      skew=sum((xx-mean(xx))^3/sqrt(var(xx))^3)/length(x) , 
      kurtosis=sum((xx-mean(xx))^4/sqrt(var(xx))^4)/length(x) - 3 
     ) 
    } 

    # create the test data frame 
    Id <- c(1,2,3,4,5,6,7,8,9,10) 
    ClassA <- c(1,NA,3,1,1,2,1,4,5,3) 
    ClassB <- c(2,1,1,3,3,2,1,1,3,3) 
    R <- c(1,2,3,NA,9,2,4,5,6,7) 
    S <- c(3,7,NA,9,5,8,7,NA,7,6) 
    df <- data.frame(Id,ClassA,ClassB,R,S) 

    ClassAAnswers <- c(1:5,NA) 
    ClassBAnswers <- c(1:5,NA) 
    RAnswers <- c(0:10,NA); 
    SAnswers <- c(0:20,NA); 

    # create the result 
    result <- setNames(
     nm=c('answer','question','value'), 
     as.data.frame(
      as.table(
       simplify2array(
        lapply(
         df[c('R', 'S')], 
         summaryStatistics, 
         RAnswers 
        ) 
       ) 
      ) 
     ) 
    ) 

    # change the order to question, answer, value 
    result <- result[, c(2, 1, 3)] 

    # add the filter 
    result <- cbind(filter='None',result) 

    # return the result 
    result 

我得到

 filter question    answer  value 
    1 None  R     0 0.0000000 
    2 None  R     1 1.0000000 
    3 None  R     2 2.0000000 
    4 None  R     3 1.0000000 
    5 None  R     4 1.0000000 
    6 None  R     5 1.0000000 
    7 None  R     6 1.0000000 
    8 None  R     7 1.0000000 
    9 None  R     8 0.0000000 
    10 None  R     9 1.0000000 
    11 None  R     10 0.0000000 
    12 None  R    <NA> 1.0000000 
    13 None  R    sum 39.0000000 
    14 None  R    length 10.0000000 
    15 None  R    mean 4.3333333 
    16 None  R standard.deviation 2.6457513 
    17 None  R    var 7.0000000 
    18 None  R    median 4.0000000 
    19 None  R    min 1.0000000 
    20 None  R    max 9.0000000 
    21 None  R  quantile.0% 1.0000000 
    22 None  R  quantile.25% 2.0000000 
    23 None  R  quantile.50% 4.0000000 
    24 None  R  quantile.75% 6.0000000 
    25 None  R  quantile.100% 9.0000000 
    26 None  R    skew 0.3275692 
    27 None  R   kurtosis -1.5333333 
    28 None  S     0 0.0000000 
    29 None  S     1 0.0000000 
    30 None  S     2 0.0000000 
    31 None  S     3 1.0000000 
    32 None  S     4 0.0000000 
    33 None  S     5 1.0000000 
    34 None  S     6 1.0000000 
    35 None  S     7 3.0000000 
    36 None  S     8 1.0000000 
    37 None  S     9 1.0000000 
    38 None  S     10 0.0000000 
    39 None  S    <NA> 2.0000000 
    40 None  S    sum 52.0000000 
    41 None  S    length 10.0000000 
    42 None  S    mean 6.5000000 
    43 None  S standard.deviation 1.8516402 
    44 None  S    var 3.4285714 
    45 None  S    median 7.0000000 
    46 None  S    min 3.0000000 
    47 None  S    max 9.0000000 
    48 None  S  quantile.0% 3.0000000 
    49 None  S  quantile.25% 5.7500000 
    50 None  S  quantile.50% 7.0000000 
    51 None  S  quantile.75% 7.2500000 
    52 None  S  quantile.100% 9.0000000 
    53 None  S    skew -0.4252986 
    54 None  S   kurtosis -1.3028646 

凡S中的答案是從0到10

的比例關鍵我認爲是樂觀的。

lapply(df[c('R', 'S')], summaryStatistics, c(0:20)) 

產生的結果通過20對R縮放0和S

lapply(df[c('R', 'S')], summaryStatistics, c(0:10)) 

產生的結果通過10對R縮放0和S

lapply(df[c('R', 'S')], summaryStatistics, c(0:20,0:10)) 

給出的結果在第一標尺和在第二個比例中沒有一個警告。

警告消息:

1:在levels<-*tmp*,值=如果(NL == nL)的as.character(標籤)其他paste0(標籤,:在因素 複製水平棄用

2:在levels<-*tmp*,值=如果(NL == nL)的as.character(標籤)其他paste0(標籤,:在因素 複製水平棄用

3:在levels<-*tmp*,VA略=如果(NL == nL)的as.character(標籤)其他paste0(標籤,:在因素 複製水平棄用

4:在levels<-*tmp*,值=如果(NL == nL)的作爲。字符(標籤)else paste0(標籤,: 重複級別因素已棄用

如何更改彙總函數,以便可以傳遞R的縮放比例和S的縮放比例並獲取一組縮放結果爲每個變量?

回答

0

我結束使用另一個函數對彙總功能進行多次調用。

Extracting a vector from a list for a R function新的代碼是

# create the summary function 
    summaryStatistics <- function(x,levels) { 
     xx <- na.omit(x) 
     c(table(factor(x, levels=levels), useNA='always', exclude=NULL), 
      sum=sum(xx), 
      length=length(x), 
      mean=mean(xx), 
      standard.deviation=sqrt(var(xx)), 
      var=(var(xx)), 
      median=median(xx), 
      min=min(xx), 
      max=max(xx), 
      quantile=quantile(xx), 
      skew=sum((xx-mean(xx))^3/sqrt(var(xx))^3)/length(x) , 
      kurtosis=sum((xx-mean(xx))^4/sqrt(var(xx))^4)/length(x) - 3 
     ) 
    } 

    # create the function that steps through the summary function 
    extractSummaryDataframe <- function(questions.dataframe, answers.list, filter) { 

     result <- data.frame(
      answer=factor(), 
      question=factor(), 
      value=double() 
     ) ; 
     listIndex <- 0 ; 
     for (name in names(questions.dataframe)){ 
      listIndex <- listIndex + 1 ; 

      result <- rbind(result, 
       setNames(
         nm=c('answer','question','value'), 
         as.data.frame(
          as.table(
           simplify2array(
            lapply(
             questions.dataframe[c(name)], 
             summaryStatistics, 
             answers.list[[listIndex]] 
            ) 
           ) 
          ) 
         ) 
       ) 
      )   
     } 

     result <- result[, c(2, 1, 3)] ; 
     result <- cbind(filter=filter,result) ; 
     result 
    } 

    # create the test data frame 
    Id <- c(1,2,3,4,5,6,7,8,9,10) 
    ClassA <- c(1,NA,3,1,1,2,1,4,5,3) 
    ClassB <- c(2,1,1,3,3,2,1,1,3,3) 
    R <- c(1,2,3,NA,9,2,4,5,6,7) 
    S <- c(3,7,NA,9,5,8,7,NA,7,6) 
    W <- c(4,5,6,7,2,4,5,6,7,8) 
    df <- data.frame(Id,ClassA,ClassB,R,S,W) 

    ClassAAnswers <- c(1:5,NA) 
    ClassBAnswers <- c(1:5,NA) 

    RAnswers <- c(0:10,NA); 
    SAnswers <- c(0:20,NA); 
    WAnswers <- c(0:30,NA); 
    answers.list <- list(RAnswers,SAnswers,WAnswers); 

    RSW.df <- df[c('R','S','W')]; 

    # create the result 
    result <- extractSummaryDataframe(RSW.df, answers.list, 'None') ; 

    # return the result 
    result 

返回

 filter question    answer  value 
    1  None  R     0 0.0000000 
    2  None  R     1 1.0000000 
    3  None  R     2 2.0000000 
    4  None  R     3 1.0000000 
    5  None  R     4 1.0000000 
    6  None  R     5 1.0000000 
    7  None  R     6 1.0000000 
    8  None  R     7 1.0000000 
    9  None  R     8 0.0000000 
    10 None  R     9 1.0000000 
    11 None  R     10 0.0000000 
    12 None  R    <NA> 1.0000000 
    13 None  R    sum 39.0000000 
    14 None  R    length 10.0000000 
    15 None  R    mean 4.3333333 
    16 None  R standard.deviation 2.6457513 
    17 None  R    var 7.0000000 
    18 None  R    median 4.0000000 
    19 None  R    min 1.0000000 
    20 None  R    max 9.0000000 
    21 None  R  quantile.0% 1.0000000 
    22 None  R  quantile.25% 2.0000000 
    23 None  R  quantile.50% 4.0000000 
    24 None  R  quantile.75% 6.0000000 
    25 None  R  quantile.100% 9.0000000 
    26 None  R    skew 0.3275692 
    27 None  R   kurtosis -1.5333333 
    28 None  S     0 0.0000000 
    29 None  S     1 0.0000000 
    30 None  S     2 0.0000000 
    31 None  S     3 1.0000000 
    32 None  S     4 0.0000000 
    33 None  S     5 1.0000000 
    34 None  S     6 1.0000000 
    35 None  S     7 3.0000000 
    36 None  S     8 1.0000000 
    37 None  S     9 1.0000000 
    38 None  S     10 0.0000000 
    39 None  S     11 0.0000000 
    40 None  S     12 0.0000000 
    41 None  S     13 0.0000000 
    42 None  S     14 0.0000000 
    43 None  S     15 0.0000000 
    44 None  S     16 0.0000000 
    45 None  S     17 0.0000000 
    46 None  S     18 0.0000000 
    47 None  S     19 0.0000000 
    48 None  S     20 0.0000000 
    49 None  S    <NA> 2.0000000 
    50 None  S    sum 52.0000000 
    51 None  S    length 10.0000000 
    52 None  S    mean 6.5000000 
    53 None  S standard.deviation 1.8516402 
    54 None  S    var 3.4285714 
    55 None  S    median 7.0000000 
    56 None  S    min 3.0000000 
    57 None  S    max 9.0000000 
    58 None  S  quantile.0% 3.0000000 
    59 None  S  quantile.25% 5.7500000 
    60 None  S  quantile.50% 7.0000000 
    61 None  S  quantile.75% 7.2500000 
    62 None  S  quantile.100% 9.0000000 
    63 None  S    skew -0.4252986 
    64 None  S   kurtosis -1.3028646 
    65 None  W     0 0.0000000 
    66 None  W     1 0.0000000 
    67 None  W     2 1.0000000 
    68 None  W     3 0.0000000 
    69 None  W     4 2.0000000 
    70 None  W     5 2.0000000 
    71 None  W     6 2.0000000 
    72 None  W     7 2.0000000 
    73 None  W     8 1.0000000 
    74 None  W     9 0.0000000 
    75 None  W     10 0.0000000 
    76 None  W     11 0.0000000 
    77 None  W     12 0.0000000 
    78 None  W     13 0.0000000 
    79 None  W     14 0.0000000 
    80 None  W     15 0.0000000 
    81 None  W     16 0.0000000 
    82 None  W     17 0.0000000 
    83 None  W     18 0.0000000 
    84 None  W     19 0.0000000 
    85 None  W     20 0.0000000 
    86 None  W     21 0.0000000 
    87 None  W     22 0.0000000 
    88 None  W     23 0.0000000 
    89 None  W     24 0.0000000 
    90 None  W     25 0.0000000 
    91 None  W     26 0.0000000 
    92 None  W     27 0.0000000 
    93 None  W     28 0.0000000 
    94 None  W     29 0.0000000 
    95 None  W     30 0.0000000 
    96 None  W    <NA> 0.0000000 
    97 None  W    sum 54.0000000 
    98 None  W    length 10.0000000 
    99 None  W    mean 5.4000000 
    100 None  W standard.deviation 1.7763883 
    101 None  W    var 3.1555556 
    102 None  W    median 5.5000000 
    103 None  W    min 2.0000000 
    104 None  W    max 8.0000000 
    105 None  W  quantile.0% 2.0000000 
    106 None  W  quantile.25% 4.2500000 
    107 None  W  quantile.50% 5.5000000 
    108 None  W  quantile.75% 6.7500000 
    109 None  W  quantile.100% 8.0000000 
    110 None  W    skew -0.3339582 
    111 None  W   kurtosis -0.9871315 

這正是我一直在尋找:-)。

0

我制定了一個解決方法,我爲每對問題和可能的答案調用匯總函數。

創建彙總函數

summaryStatistics <- function(x,levels) { 
     xx <- na.omit(x) 
     c(table(factor(x, levels=levels), useNA='always', exclude=NULL), 
      sum=sum(xx), 
      length=length(x), 
      mean=mean(xx), 
      standard.deviation=sqrt(var(xx)), 
      var=(var(xx)), 
      median=median(xx), 
      min=min(xx), 
      max=max(xx), 
      quantile=quantile(xx), 
      skew=sum((xx-mean(xx))^3/sqrt(var(xx))^3)/length(x) , 
      kurtosis=sum((xx-mean(xx))^4/sqrt(var(xx))^4)/length(x) - 3 
     ) 
    } 

    # create the test data frame 
    Id <- c(1,2,3,4,5,6,7,8,9,10) 
    ClassA <- c(1,NA,3,1,1,2,1,4,5,3) 
    ClassB <- c(2,1,1,3,3,2,1,1,3,3) 
    R <- c(1,2,3,NA,9,2,4,5,6,7) 
    S <- c(3,7,NA,9,5,8,7,NA,7,6) 
    W <- c(4,5,6,7,2,4,5,6,7,8) 
    df <- data.frame(Id,ClassA,ClassB,R,S,W) 

    ClassAAnswers <- c(1:5,NA) 
    ClassBAnswers <- c(1:5,NA) 

    RAnswers <- c(0:10,NA); 
    SAnswers <- c(0:20,NA); 
    WAnswers <- c(0:30,NA); 

    # create the result 
    result <- setNames(
     nm=c('answer','question','value'), 
     as.data.frame(
      as.table(
       simplify2array(
        lapply(
         df[c('R')], 
         summaryStatistics, 
         RAnswers 
        ) 
       ) 
      ) 
     ) 
    ) 

    result <- rbind(result, 
     setNames(
       nm=c('answer','question','value'), 
       as.data.frame(
        as.table(
         simplify2array(
          lapply(
           df[c('S')], 
           summaryStatistics, 
           SAnswers 
          ) 
         ) 
        ) 
       ) 
     ) 
    ) 

    result <- rbind(result, 
     setNames(
       nm=c('answer','question','value'), 
       as.data.frame(
        as.table(
         simplify2array(
          lapply(
           df[c('W')], 
           summaryStatistics, 
           WAnswers 
          ) 
         ) 
        ) 
       ) 
     ) 
    ) 

    # change the order to question, answer, value 
    result <- result[, c(2, 1, 3)] 

    # add the filter 
    result <- cbind(filter='None',result) 

    # return the result 
    result 

它返回

 filter question    answer  value 
    1  None  R     0 0.0000000 
    2  None  R     1 1.0000000 
    3  None  R     2 2.0000000 
    4  None  R     3 1.0000000 
    5  None  R     4 1.0000000 
    6  None  R     5 1.0000000 
    7  None  R     6 1.0000000 
    8  None  R     7 1.0000000 
    9  None  R     8 0.0000000 
    10 None  R     9 1.0000000 
    11 None  R     10 0.0000000 
    12 None  R    <NA> 1.0000000 
    13 None  R    sum 39.0000000 
    14 None  R    length 10.0000000 
    15 None  R    mean 4.3333333 
    16 None  R standard.deviation 2.6457513 
    17 None  R    var 7.0000000 
    18 None  R    median 4.0000000 
    19 None  R    min 1.0000000 
    20 None  R    max 9.0000000 
    21 None  R  quantile.0% 1.0000000 
    22 None  R  quantile.25% 2.0000000 
    23 None  R  quantile.50% 4.0000000 
    24 None  R  quantile.75% 6.0000000 
    25 None  R  quantile.100% 9.0000000 
    26 None  R    skew 0.3275692 
    27 None  R   kurtosis -1.5333333 
    28 None  S     0 0.0000000 
    29 None  S     1 0.0000000 
    30 None  S     2 0.0000000 
    31 None  S     3 1.0000000 
    32 None  S     4 0.0000000 
    33 None  S     5 1.0000000 
    34 None  S     6 1.0000000 
    35 None  S     7 3.0000000 
    36 None  S     8 1.0000000 
    37 None  S     9 1.0000000 
    38 None  S     10 0.0000000 
    39 None  S     11 0.0000000 
    40 None  S     12 0.0000000 
    41 None  S     13 0.0000000 
    42 None  S     14 0.0000000 
    43 None  S     15 0.0000000 
    44 None  S     16 0.0000000 
    45 None  S     17 0.0000000 
    46 None  S     18 0.0000000 
    47 None  S     19 0.0000000 
    48 None  S     20 0.0000000 
    49 None  S    <NA> 2.0000000 
    50 None  S    sum 52.0000000 
    51 None  S    length 10.0000000 
    52 None  S    mean 6.5000000 
    53 None  S standard.deviation 1.8516402 
    54 None  S    var 3.4285714 
    55 None  S    median 7.0000000 
    56 None  S    min 3.0000000 
    57 None  S    max 9.0000000 
    58 None  S  quantile.0% 3.0000000 
    59 None  S  quantile.25% 5.7500000 
    60 None  S  quantile.50% 7.0000000 
    61 None  S  quantile.75% 7.2500000 
    62 None  S  quantile.100% 9.0000000 
    63 None  S    skew -0.4252986 
    64 None  S   kurtosis -1.3028646 
    65 None  W     0 0.0000000 
    66 None  W     1 0.0000000 
    67 None  W     2 1.0000000 
    68 None  W     3 0.0000000 
    69 None  W     4 2.0000000 
    70 None  W     5 2.0000000 
    71 None  W     6 2.0000000 
    72 None  W     7 2.0000000 
    73 None  W     8 1.0000000 
    74 None  W     9 0.0000000 
    75 None  W     10 0.0000000 
    76 None  W     11 0.0000000 
    77 None  W     12 0.0000000 
    78 None  W     13 0.0000000 
    79 None  W     14 0.0000000 
    80 None  W     15 0.0000000 
    81 None  W     16 0.0000000 
    82 None  W     17 0.0000000 
    83 None  W     18 0.0000000 
    84 None  W     19 0.0000000 
    85 None  W     20 0.0000000 
    86 None  W     21 0.0000000 
    87 None  W     22 0.0000000 
    88 None  W     23 0.0000000 
    89 None  W     24 0.0000000 
    90 None  W     25 0.0000000 
    91 None  W     26 0.0000000 
    92 None  W     27 0.0000000 
    93 None  W     28 0.0000000 
    94 None  W     29 0.0000000 
    95 None  W     30 0.0000000 
    96 None  W    <NA> 0.0000000 
    97 None  W    sum 54.0000000 
    98 None  W    length 10.0000000 
    99 None  W    mean 5.4000000 
    100 None  W standard.deviation 1.7763883 
    101 None  W    var 3.1555556 
    102 None  W    median 5.5000000 
    103 None  W    min 2.0000000 
    104 None  W    max 8.0000000 
    105 None  W  quantile.0% 2.0000000 
    106 None  W  quantile.25% 4.2500000 
    107 None  W  quantile.50% 5.5000000 
    108 None  W  quantile.75% 6.7500000 
    109 None  W  quantile.100% 8.0000000 
    110 None  W    skew -0.3339582 
    111 None  W   kurtosis -0.9871315 

這正是我要找的。

1

這裏是在調整原有的功能非常粗略的嘗試:

summaryStatistics <- function(df, a, b, levels1, levels2) { 
    x <- df[,a] 
    y <- df[,b] 
    xx <- na.omit(x) 
    yy <- na.omit(y) 
    levels2 <- levels2[levels2 != 0] 
    answer1 <- c(levels1, "<NA>", "sum", "length", "mean", "standard.deviation", "var", "median", "min", "max", "quantile.0", "quantile.25", "quantile.50", "quantile.75", "quantile.100", "skew", "kurtosis") 
    value1 <- c(as.numeric(table(factor(x, levels1))), nrow(df[is.na(x)==T,]), sum(xx), length(x), mean(xx), sqrt(var(xx)), (var(xx)), median(xx), min(xx), max(xx), as.numeric(quantile(xx)), sum((xx-mean(xx))^3/sqrt(var(xx))^3)/length(x), sum((xx-mean(xx))^4/sqrt(var(xx))^4)/length(x) - 3) 
    answer2 <- c(levels2, "<NA>", "sum", "length", "mean", "standard.deviation", "var", "median", "min", "max", "quantile.0", "quantile.25", "quantile.50", "quantile.75", "quantile.100", "skew", "kurtosis") 
    value2 <- c(as.numeric(table(factor(y, levels2))), nrow(df[is.na(y)==T,]), sum(yy), length(y), mean(yy), sqrt(var(yy)), (var(yy)), median(yy), min(yy), max(yy), as.numeric(quantile(yy)), sum((yy-mean(yy))^3/sqrt(var(yy))^3)/length(y), sum((yy-mean(yy))^4/sqrt(var(yy))^4)/length(y) - 3) 
    answer <- c(answer1, answer2) 
    question <- c(rep(a, length(answer1)), rep(b, length(answer2))) 
    value <- c(value1, value2) 
    result <- data.frame(answer, question, value) 
    return(result) 
    } 

用法是:

summaryStatistics(df, 'R', 'S', c(0:10), c(0:20)) 

這是醜陋的,但這樣的最終結果:)

+0

最終結果被設計成被抽回到mysql表中。它不適合閱讀。它旨在便於提取到Excel工作簿中。 –

+0

完成工作是主要的。你的方法只有兩個注意事項:1)要求你的功能在低位運行表明你的原始功能是不完整的。 2)特別是如果你的目的是進一步的數據處理,你可能會發現[這篇文章由哈德利](http://vita.had.co.nz/papers/tidy-data.pdf)有趣的 – Simon

+0

謝謝@Simon。我是新來的R.讚賞。 –