2014-10-19 82 views
2

我爲決策樹構建了偏差(deviance)隨分裂數(splits)變化的圖,如下所示。訓練和測試數據的偏差圖

data(mtcars)
    # Keep the response (mpg) and the first three predictors (cyl, disp, hp).
    cars <- mtcars[, 1:4]
    # Use 75% of the rows (rounded down) for training.
    smp_size <- floor(0.75 * nrow(cars))
    # Fix the RNG seed so the split is reproducible.
    set.seed(100)
    # Sample row indices from `cars` itself. The original code referenced an
    # undefined `LoanData` object here, so the split failed before any model
    # could be fitted.
    train_ind <- sample(seq_len(nrow(cars)), size = smp_size)
    train <- cars[train_ind, ]
    test <- cars[-train_ind, ]

    # Decision tree model
    library(tree)
    # mindev, mincut and minsize are forwarded to tree.control().
    # NOTE(review): the model is fitted on all of `cars`, not on a training
    # subset, so the train/test split created earlier is not used here.
    car_tree <- tree(
      mpg ~ .,
      data = cars,
      mindev = 0.003,
      mincut = 2,
      minsize = 6
    )

    # Check deviance: 10-fold cross-validation over the pruning sequence,
    # then plot the cross-validated deviance against the tree size.
    cv_tree <- cv.tree(car_tree, FUN = prune.tree, K = 10)
    plot(
      cv_tree$size, cv_tree$dev,
      type = "b",
      xlab = "splits",
      ylab = "deviance",
      main = "deviance by splits"
    )

我想在此圖上疊加測試數據的偏差,以便觀察在第幾次分割之後偏差開始再次增加。你能否告訴我該如何做到這一點?

回答

0

要查看不同深度樹的準確性,需要修剪樹,預測訓練和測試結果,並評估訓練和測試結果的準確性。

以下是數據,以及從中提取的訓練和測試子集

# Load the built-in mtcars data and work on a copy named `cars`.
data(mtcars)
cars <- mtcars

# Reserve 75% of the rows (rounded down) for training.
smp_size <- floor(nrow(cars) * 0.75)

# Fix the RNG seed so the same rows are drawn on every run.
set.seed(100)
train_ind <- sample.int(nrow(cars), size = smp_size)

# Drawn rows form the training set; all remaining rows form the test set.
train <- cars[train_ind, ]
test <- cars[-train_ind, ]

這裏是一個輔助函數來確定給定模型訓練和測試數據的準確性。您可能希望修改此以包含其他驗證估計值。

#' Score a fitted model on the training and test sets at a given tree depth
#'
#' Predicts the response for both data sets and summarises each fit with the
#' correlation between observed and predicted values and the standard
#' deviation of the prediction errors.
#'
#' @param tr Fitted model object; passed to `predict()` with each data set.
#' @param train Data frame of training observations, including the response.
#' @param test Data frame of held-out observations, including the response.
#' @param dpth Depth label stored in the `depth` column of the result.
#' @param rst Optional data frame of earlier results to which the new row is
#'   appended. `NULL` (the default) starts a new table.
#' @param response Name of the response column in `train` and `test`.
#'   Defaults to `"mpg"`, the column this helper was originally written for.
#' @return `rst` with one extra row holding `cor.train`, `cor.test`,
#'   `sd.train`, `sd.test` and `depth`.
compare <- function(tr, train, test, dpth, rst = NULL, response = "mpg") {
  stopifnot(
    is.character(response), length(response) == 1L,
    response %in% names(train), response %in% names(test)
  )
  obs_train <- train[[response]]
  obs_test <- test[[response]]
  est_train <- predict(tr, train)
  est_test <- predict(tr, test)

  # sd() measures the spread of the errors around their own mean, so a
  # constant bias in the predictions does not show up in sd.train / sd.test.
  row <- data.frame(
    cor.train = cor(obs_train, est_train),
    cor.test = cor(obs_test, est_test),
    sd.train = sd(est_train - obs_train),
    sd.test = sd(est_test - obs_test),
    depth = dpth
  )
  rbind(rst, row)
}

創建樹

# Decision tree model
library(tree)
# Fit a regression tree for mpg on the training rows only. mindev, mincut and
# minsize are forwarded to tree.control().
car_tree <- tree(
  mpg ~ .,
  data = train,
  mindev = 0.003,
  mincut = 2,
  minsize = 6
)

打印樹,確定深度(3),和最深分割節點(4:7)

# Auto-print the fitted tree so its node numbers can be read off; those
# numbers are what the later snip.tree() calls refer to.
car_tree 
# Depth is 3 and the 3rd level nodes are 4:7 
# (as read from the printed tree by the author for this seed and split)

獲取深度爲3時的結果

# Start the results table with the scores of the full (depth 3) tree.
rslts <- compare(car_tree, train, test, dpth = 3)

現在修剪這棵樹,並打印它。請注意,接下來的深度爲2,而最深的節點是2:3

# Turn nodes 4 to 7 into terminal nodes, then print the snipped tree.
car_tree_sn_1 <- snip.tree(car_tree, nodes = 4:7)
car_tree_sn_1
# Depth is 2 and the 2nd level nodes are 2:3

獲取深度爲2時的結果

# Append the scores of the depth 2 tree to the results collected so far.
rslts <- compare(car_tree_sn_1, train, test, dpth = 2, rst = rslts)

現在再次修剪樹,並打印出來。請注意,接下來的深度爲1,並且沒有劃分節點

# Turn nodes 2 and 3 of the original tree into terminal nodes, then print it.
car_tree_sn_2 <- snip.tree(car_tree, nodes = 2:3)
car_tree_sn_2
# Depth is 1: only the root split remains, with nodes 2 and 3 as leaves

獲取深度爲1時的結果

# Append the scores of the depth 1 tree to the results collected so far.
rslts <- compare(car_tree_sn_2, train, test, dpth = 1, rst = rslts)

按深度繪製準確性估計的結果

# Training-set correlation against tree depth, on a log-scaled y axis
# limited to [0.5, 1].
with(
  rslts,
  plot(
    depth, cor.train,
    type = "b",
    xlab = "splits",
    ylab = "Correlation Coefficient",
    main = "Correlation by splits",
    log = "y",
    ylim = c(0.5, 1)
  )
)
# Overlay the test-set correlation in red on the same axes.
with(rslts, lines(depth, cor.test, type = "b", col = "red"))

enter image description here

# Standard deviation of the training-set prediction errors against tree
# depth, on a log-scaled y axis limited to [0.5, 5].
# The title previously read "Correlation by splits", copied from the
# correlation plot; it now names the quantity actually drawn.
plot(
  rslts$depth, rslts$sd.train,
  type = "b",
  xlab = "splits",
  ylab = "Standard Deviation",
  main = "Standard Deviation by splits",
  log = "y",
  ylim = c(0.5, 5)
)
# Overlay the test-set error spread in red on the same axes.
lines(rslts$depth, rslts$sd.test, type = "b", col = "red")

enter image description here

還有其他的樹模型。這是一個rpart的例子。

# Regression Tree Example
# Uses the first six mtcars columns: mpg plus cyl, disp, hp, drat and wt.
data(mtcars)
cars <- mtcars[, 1:6]
library(rpart)

# grow tree
# `control` is given as a plain list so that only minsplit is overridden and
# every other rpart.control() setting keeps its default value.
fit <- rpart(
  mpg ~ .,
  data = cars,
  control = list(minsplit = 1)
)

# display the results
printcp(fit)
# visualize cross-validation results
plotcp(fit)
# detailed summary of splits
summary(fit)

# create additional plots
# rsq.rpart() draws two plots, so show them side by side on one page. The
# previous graphics layout is restored afterwards; otherwise every later plot
# would be squeezed into one half of a 1 x 2 page.
old_par <- par(mfrow = c(1, 2))
rsq.rpart(fit) # R-squared and relative error against the number of splits
par(old_par)

# plot tree
library(rpart.plot)
# rpart.plot rendering of the fitted tree; see ?prp for the meaning of the
# `extra` and `branch.type` codes used here.
prp(fit, extra = 101, branch.type = 3)
# Base rpart rendering with uniform vertical spacing, followed by text labels
# on all nodes including the observation counts.
plot(
  fit,
  uniform = TRUE,
  main = "Regression Tree for Mileage "
)
text(fit, use.n = TRUE, all = TRUE, cex = 0.8)

參考:cart tree、prp plotting、model prediction

+0

我想用我在腳本中生成的「測試」數據繪製一張交叉驗證圖。rpart可以做到嗎? – 2014-10-20 02:12:01

+0

感謝您的詳細解釋。 – 2014-10-20 13:34:44

+0

上述方法是介紹性的,意在說明一個過程。抽樣方式可以改進:與其隨機抽樣,不如根據分佈選擇樣本。K折(caret::createFolds)方法也是對上述方法的改進。祝你好運。 – 2014-10-20 19:36:35