2009-09-09 100 views
5

作爲開發新的分析化學方法的一部分,我需要計算來自某些數據的運行方差之內和之間的差異。我也需要使用R語言的這些數據的置信區間計算方差和置信區間內的R值

我假設我需要使用anova或其他東西?

我的數據是一樣

> variance 
    Run Rep Value 
1 1 1 9.85 
2 1 2 9.95 
3 1 3 10.00 
4 2 1 9.90 
5 2 2 8.80 
6 2 3 9.50 
7 3 1 11.20 
8 3 2 11.10 
9 3 3 9.80 
10 4 1 9.70 
11 4 2 10.10 
12 4 3 10.00 

回答

1

我一直在尋找類似的問題。我發現參考Burdick和Graybill的計算置信區間(Burdick,R.和Graybill,F.1992,Confidence Intervals on variance components,CRC Press)

使用一些代碼我一直在嘗試獲取這些值



> kiaraov = aov(Value~Run+Error(Run),data=kiar) 

> summary(kiaraov) 

Error: Run 
    Df Sum Sq Mean Sq 
Run 3 2.57583 0.85861 

Error: Within 
      Df Sum Sq Mean Sq F value Pr(>F) 
Residuals 8 1.93833 0.24229    
> confint = 95 

> a = (1-(confint/100))/2 

> grandmean = as.vector(kiaraov$"(Intercept)"[[1]][1]) # Grand Mean (I think) 

> within = summary(kiaraov)$"Error: Within"[[1]]$"Mean Sq" # S2^2Mean Square Value for Within Run 

> dfRun = summary(kiaraov)$"Error: Run"[[1]]$"Df" 

> dfWithin = summary(kiaraov)$"Error: Within"[[1]]$"Df" 

> Run = summary(kiaraov)$"Error: Run"[[1]]$"Mean Sq" # S1^2Mean Square for between Run 

> between = (Run-within)/((dfWithin/(dfRun+1))+1) # (S1^2-S2^2)/J 

> total = between+within 

> between # Between Run Variance 
[1] 0.2054398 

> within # Within Run Variance 
[1] 0.2422917 

> total # Total Variance 
[1] 0.4477315 

> betweenCV = sqrt(between)/grandmean * 100 # Between Run CV% 

> withinCV = sqrt(within)/grandmean * 100 # Within Run CV% 

> totalCV = sqrt(total)/grandmean * 100 # Total CV% 

> #within confidence intervals 

> withinLCB = within/qf(1-a,8,Inf) # Within LCB 

> withinUCB = within/qf(a,8,Inf) # Within UCB 

> #Between Confidence Intervals 

> n1 = dfRun 

> n2 = dfWithin 

> G1 = 1-(1/qf(1-a,n1,Inf)) # According to Burdick and Graybill this should be a 

> G2 = 1-(1/qf(1-a,n2,Inf)) 

> H1 = (1/qf(a,n1,Inf))-1 # and this should be 1-a, but my results don't agree 

> H2 = (1/qf(a,n2,Inf))-1 

> G12 = ((qf(1-a,n1,n2)-1)^2-(G1^2*qf(1-a,n1,n2)^2)-(H2^2))/qf(1-a,n1,n2) # again, should be a, not 1-a 

> H12 = ((1-qf(a,n1,n2))^2-H1^2*qf(a,n1,n2)^2-G2^2)/qf(a,n1,n2) # again, should be 1-a, not a 

> Vu = H1^2*Run^2+G2^2*within^2+H12*Run*within 

> Vl = G1^2*Run^2+H2^2*within^2+G12*within*Run 

> betweenLCB = (Run-within-sqrt(Vl))/J # Betwen LCB 

> betweenUCB = (Run-within+sqrt(Vu))/J # Between UCB 

> #Total Confidence Intervals 

> y = (Run+(J-1)*within)/J 

> totalLCB = y-(sqrt(G1^2*Run^2+G2^2*(J-1)^2*within^2)/J) # Total LCB 

> totalUCB = y+(sqrt(H1^2*Run^2+H2^2*(J-1)^2*within^2)/J) # Total UCB 

> result = data.frame(Name=c("within", "between", "total"),CV=c(withinCV,betweenCV,totalCV),LCB=c(sqrt(withinLCB)/grandmean*100,sqrt(betweenLCB)/grandmean*100,sqrt(totalLCB)/grandmean*100),UCB=c(sqrt(withinUCB)/grandmean*100,sqrt(betweenUCB)/grandmean*100,sqrt(totalUCB)/grandmean*100)) 

> result 
    Name  CV  LCB  UCB 
1 within 4.926418 3.327584 9.43789 
2 between 4.536327  NaN 19.73568 
3 total 6.696855 4.846030 20.42647 

此處運行CV之間的置信區間較低,小於零,因此報告爲NaN。

我很想有一個更好的方式來做到這一點。如果我得到時間,我可能會嘗試創建一個功能來做到這一點。

Paul。

-

編輯:我沒有最終編寫一個函數,這裏是(買者自負)

#' avar Function 
#' 
#' Calculate thewithin, between and total %CV of a dataset by ANOVA, and the 
#' associated confidence intervals 
#' 
#' @param dataf - The data frame to use, in long format 
#' @param afactor Character string representing the column in dataf that contains the factor 
#' @param aresponse Charactyer string representing the column in dataf that contains the response value 
#' @param aconfidence What Confidence limits to use, default = 95% 
#' @param digits Significant Digits to report to, default = 3 
#' @param debug Boolean, Should debug messages be displayed, default=FALSE 
#' @returnType dataframe containing the Mean, Within, Between and Total %CV and LCB and UCB for each 
#' @return 
#' @author Paul Hurley 
#' @export 
#' @examples 
#' #Using the BGBottles data from Burdick and Graybill Page 62 
#' assayvar(dataf=BGBottles, afactor="Machine", aresponse="weight") 
avar<-function(dataf, afactor, aresponse, aconfidence=95, digits=3, debug=FALSE){ 
    dataf<-subset(dataf,!is.na(with(dataf,get(aresponse)))) 
    nmissing<-function(x) sum(!is.na(x)) 
    n<-nrow(subset(dataf,is.numeric(with(dataf,get(aresponse))))) 
    datadesc<-ddply(dataf, afactor, colwise(nmissing,aresponse)) 
    I<-nrow(datadesc) 
    if(debug){print(datadesc)} 
    if(min(datadesc[,2])==max(datadesc[,2])){ 
     balance<-TRUE 
     J<-min(datadesc[,2]) 
     if(debug){message(paste("Dataset is balanced, J=",J,"I is ",I,sep=""))} 
    } else { 
     balance<-FALSE 
     Jh<-I/(sum(1/datadesc[,2], na.rm = TRUE)) 
     J<-Jh 
     m<-min(datadesc[,2]) 
     M<-max(datadesc[,2]) 
     if(debug){message(paste("Dataset is unbalanced, like me, I is ",I,sep=""))} 
     if(debug){message(paste("Jh is ",Jh, ", m is ",m, ", M is ",M, sep=""))} 
    } 
    if(debug){message(paste("Call afactor=",afactor,", aresponse=",aresponse,sep=""))} 
    formulatext<-paste(as.character(aresponse)," ~ 1 + Error(",as.character(afactor),")",sep="") 
    if(debug){message(paste("formula text is ",formulatext,sep=""))} 
    aovformula<-formula(formulatext) 
    if(debug){message(paste("Formula is ",as.character(aovformula),sep=""))} 
    assayaov<-aov(formula=aovformula,data=dataf) 
    if(debug){ 
     print(assayaov) 
     print(summary(assayaov)) 
    } 
    a<-1-((1-(aconfidence/100))/2) 
    if(debug){message(paste("confidence is ",aconfidence,", alpha is ",a,sep=""))} 
    grandmean<-as.vector(assayaov$"(Intercept)"[[1]][1]) # Grand Mean (I think) 
    if(debug){message(paste("n is",n,sep=""))} 

    #This line commented out, seems to choke with an aov object built from an external formula 
    #grandmean<-as.vector(model.tables(assayaov,type="means")[[1]]$`Grand mean`) # Grand Mean (I think) 
    within<-summary(assayaov)[[2]][[1]]$"Mean Sq" # d2e, S2^2 Mean Square Value for Within Machine = 0.1819 
    dfRun<-summary(assayaov)[[1]][[1]]$"Df" # DF for within = 3 
    dfWithin<-summary(assayaov)[[2]][[1]]$"Df" # DF for within = 8 
    Run<-summary(assayaov)[[1]][[1]]$"Mean Sq" # S1^2Mean Square for Machine 
    if(debug){message(paste("mean square for Run ?",Run,sep=""))} 
    #Was between<-(Run-within)/((dfWithin/(dfRun+1))+1) but my comment suggests this should be just J, so I'll use J ! 
    between<-(Run-within)/J # d2a (S1^2-S2^2)/J 
    if(debug){message(paste("S1^2 mean square machine is ",Run,", S2^2 mean square within is ",within))} 
    total<-between+within 
    between # Between Run Variance 
    within # Within Run Variance 
    total # Total Variance 
    if(debug){message(paste("between is ",between,", within is ",within,", Total is ",total,sep=""))} 

    betweenCV<-sqrt(between)/grandmean * 100 # Between Run CV% 
    withinCV<-sqrt(within)/grandmean * 100 # Within Run CV% 
    totalCV<-sqrt(total)/grandmean * 100 # Total CV% 
    n1<-dfRun 
    n2<-dfWithin 
    if(debug){message(paste("n1 is ",n1,", n2 is ",n2,sep=""))} 
    #within confidence intervals 
    if(balance){ 
     withinLCB<-within/qf(a,n2,Inf) # Within LCB 
     withinUCB<-within/qf(1-a,n2,Inf) # Within UCB 
    } else { 
     withinLCB<-within/qf(a,n2,Inf) # Within LCB 
     withinUCB<-within/qf(1-a,n2,Inf) # Within UCB 
    } 
#Mean Confidence Intervals 
    if(debug){message(paste(grandmean,"+/-(sqrt(",Run,"/",n,")*qt(",a,",df=",I-1,"))",sep=""))} 
    meanLCB<-grandmean+(sqrt(Run/n)*qt(1-a,df=I-1)) # wrong 
    meanUCB<-grandmean-(sqrt(Run/n)*qt(1-a,df=I-1)) # wrong 
    if(debug){message(paste("Grandmean is ",grandmean,", meanLCB = ",meanLCB,", meanUCB = ",meanUCB,aresponse,sep=""))} 
    if(debug){print(summary(assayaov))} 
#Between Confidence Intervals 
    G1<-1-(1/qf(a,n1,Inf)) 
    G2<-1-(1/qf(a,n2,Inf)) 
    H1<-(1/qf(1-a,n1,Inf))-1 
    H2<-(1/qf(1-a,n2,Inf))-1 
    G12<-((qf(a,n1,n2)-1)^2-(G1^2*qf(a,n1,n2)^2)-(H2^2))/qf(a,n1,n2) 
    H12<-((1-qf(1-a,n1,n2))^2-H1^2*qf(1-a,n1,n2)^2-G2^2)/qf(1-a,n1,n2) 
    if(debug){message(paste("G1 is ",G1,", G2 is ",G2,sep="")) 
     message(paste("H1 is ",H1,", H2 is ",H2,sep="")) 
     message(paste("G12 is ",G12,", H12 is ",H12,sep="")) 
    } 
    if(balance){ 
     Vu<-H1^2*Run^2+G2^2*within^2+H12*Run*within 
     Vl<-G1^2*Run^2+H2^2*within^2+G12*within*Run 
     betweenLCB<-(Run-within-sqrt(Vl))/J # Betwen LCB 
     betweenUCB<-(Run-within+sqrt(Vu))/J # Between UCB 
    } else { 
     #Burdick and Graybill seem to suggest calculating anova of mean values to find n1S12u/Jh 
     meandataf<-ddply(.data=dataf,.variable=afactor, .fun=function(df){mean(with(df, get(aresponse)), na.rm=TRUE)}) 
     meandataaov<-aov(formula(paste("V1~",afactor,sep="")), data=meandataf) 
     sumsquare<-summary(meandataaov)[[1]]$`Sum Sq` 
     #so maybe S12u is just that bit ? 
     Runu<-(sumsquare*Jh)/n1 
     if(debug){message(paste("n1S12u/Jh is ",sumsquare,", so S12u is ",Runu,sep=""))} 
     Vu<-H1^2*Runu^2+G2^2*within^2+H12*Runu*within 
     Vl<-G1^2*Runu^2+H2^2*within^2+G12*within*Runu 
     betweenLCB<-(Runu-within-sqrt(Vl))/Jh # Betwen LCB 
     betweenUCB<-(Runu-within+sqrt(Vu))/Jh # Between UCB 
     if(debug){message(paste("betweenLCB is ",betweenLCB,", between UCB is ",betweenUCB,sep=""))} 
    } 
#Total Confidence Intervals 
    if(balance){ 
     y<-(Run+(J-1)*within)/J 
     if(debug){message(paste("y is ",y,sep=""))} 
     totalLCB<-y-(sqrt(G1^2*Run^2+G2^2*(J-1)^2*within^2)/J) # Total LCB 
     totalUCB<-y+(sqrt(H1^2*Run^2+H2^2*(J-1)^2*within^2)/J) # Total UCB 
    } else { 
     y<-(Runu+(Jh-1)*within)/Jh 
     if(debug){message(paste("y is ",y,sep=""))} 
     totalLCB<-y-(sqrt(G1^2*Runu^2+G2^2*(Jh-1)^2*within^2)/Jh) # Total LCB 
     totalUCB<-y+(sqrt(H1^2*Runu^2+H2^2*(Jh-1)^2*within^2)/Jh) # Total UCB 
    } 
    if(debug){message(paste("totalLCB is ",totalLCB,", total UCB is ",totalUCB,sep=""))} 
# result<-data.frame(Name=c("within", "between", "total"),CV=c(withinCV,betweenCV,totalCV), 
#   LCB=c(sqrt(withinLCB)/grandmean*100,sqrt(betweenLCB)/grandmean*100,sqrt(totalLCB)/grandmean*100), 
#   UCB=c(sqrt(withinUCB)/grandmean*100,sqrt(betweenUCB)/grandmean*100,sqrt(totalUCB)/grandmean*100)) 
    result<-data.frame(Mean=grandmean,MeanLCB=meanLCB, MeanUCB=meanUCB, Within=withinCV,WithinLCB=sqrt(withinLCB)/grandmean*100, WithinUCB=sqrt(withinUCB)/grandmean*100, 
      Between=betweenCV, BetweenLCB=sqrt(betweenLCB)/grandmean*100, BetweenUCB=sqrt(betweenUCB)/grandmean*100, 
      Total=totalCV, TotalLCB=sqrt(totalLCB)/grandmean*100, TotalUCB=sqrt(totalUCB)/grandmean*100) 
    if(!digits=="NA"){ 
     result$Mean<-signif(result$Mean,digits=digits) 
     result$MeanLCB<-signif(result$MeanLCB,digits=digits) 
     result$MeanUCB<-signif(result$MeanUCB,digits=digits) 
     result$Within<-signif(result$Within,digits=digits) 
     result$WithinLCB<-signif(result$WithinLCB,digits=digits) 
     result$WithinUCB<-signif(result$WithinUCB,digits=digits) 
     result$Between<-signif(result$Between,digits=digits) 
     result$BetweenLCB<-signif(result$BetweenLCB,digits=digits) 
     result$BetweenUCB<-signif(result$BetweenUCB,digits=digits) 
     result$Total<-signif(result$Total,digits=digits) 
     result$TotalLCB<-signif(result$TotalLCB,digits=digits) 
     result$TotalUCB<-signif(result$TotalUCB,digits=digits) 
    } 
    return(result) 
} 

assayvar<-function(adata, aresponse, afactor, anominal, aconfidence=95, digits=3, debug=FALSE){ 
    result<-ddply(adata,anominal,function(df){ 
       resul<-avar(dataf=df,afactor=afactor,aresponse=aresponse,aconfidence=aconfidence, digits=digits, debug=debug) 
       resul$n<-nrow(subset(df, !is.na(with(df, get(aresponse))))) 
       return(resul) 
      }) 
    return(result) 
} 
3

如果你想將一個函數(如var)對面的一個因素,如RunRep,您可以使用tapply

> with(variance, tapply(Value, Run, var)) 
      1   2   3   4 
0.005833333 0.310000000 0.610000000 0.043333333 
> with(variance, tapply(Value, Rep, var)) 
      1   2   3 
0.48562500 0.88729167 0.05583333 
+0

不錯!在我看來,這是典雅的代碼。 – kpierce8 2009-09-10 19:19:50

6

你有四組三觀察:

> run1 = c(9.85, 9.95, 10.00) 
> run2 = c(9.90, 8.80, 9.50) 
> run3 = c(11.20, 11.10, 9.80) 
> run4 = c(9.70, 10.10, 10.00) 
> runs = c(run1, run2, run3, run4) 
> runs 
[1] 9.85 9.95 10.00 9.90 8.80 9.50 11.20 11.10 9.80 9.70 10.10 10.00 

使一些標籤:

> n = rep(3, 4) 
> group = rep(1:4, n) 
> group 
[1] 1 1 1 2 2 2 3 3 3 4 4 4 

計算中運行統計:

> withinRunStats = function(x) c(sum = sum(x), mean = mean(x), var = var(x), n = length(x)) 
> tapply(runs, group, withinRunStats) 
$`1` 
     sum   mean   var   n 
29.800000000 9.933333333 0.005833333 3.000000000 

$`2` 
    sum mean var  n 
28.20 9.40 0.31 3.00 

$`3` 
    sum mean var  n 
32.10 10.70 0.61 3.00 

$`4` 
     sum  mean   var   n 
29.80000000 9.93333333 0.04333333 3.00000000 

你可以在這裏做一些ANOVA:

> data = data.frame(y = runs, group = factor(group)) 
> data 
     y group 
1 9.85  1 
2 9.95  1 
3 10.00  1 
4 9.90  2 
5 8.80  2 
6 9.50  2 
7 11.20  3 
8 11.10  3 
9 9.80  3 
10 9.70  4 
11 10.10  4 
12 10.00  4 

> fit = lm(runs ~ group, data) 
> fit 

Call: 
lm(formula = runs ~ group, data = data) 

Coefficients: 
(Intercept)  group2  group3  group4 
    9.933e+00 -5.333e-01 7.667e-01 -2.448e-15 

> anova(fit) 
Analysis of Variance Table 

Response: runs 
      Df Sum Sq Mean Sq F value Pr(>F) 
group  3 2.57583 0.85861 3.5437 0.06769 . 
Residuals 8 1.93833 0.24229     
--- 
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 

> degreesOfFreedom = anova(fit)[, "Df"] 
> names(degreesOfFreedom) = c("treatment", "error") 
> degreesOfFreedom 
treatment  error 
     3   8 

錯誤或組內方差:

> anova(fit)["Residuals", "Mean Sq"] 
[1] 0.2422917 

處理t或組間差異:

> anova(fit)["group", "Mean Sq"] 
[1] 0.8586111 

這應該會給你足夠的置信區間。

3

我會採取裂縫在這個時候我有更多的時間,但同時,這裏的dput()爲Kiar的數據結構:

structure(list(Run = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4), Rep = c(1, 
2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3), Value = c(9.85, 9.95, 10, 9.9, 
8.8, 9.5, 11.2, 11.1, 9.8, 9.7, 10.1, 10)), .Names = c("Run", 
"Rep", "Value"), row.names = c(NA, -12L), class = "data.frame") 

...如果你想帶一個快速射擊它。