2016-11-15 72 views
1

我想弄清楚如何從 xgboost 模型的樹手動計算出概率,使其與 predict 函數返回的結果一致。換句話說:如何從 xgboost 模型手動構建預測?

首先,我建立模型

library(xgboost) 
#install.packages("ModelMetrics") 
library(ModelMetrics) 

set.seed(100)

# - Extreme gbm
# Response vector, coerced to integer before being passed as `label`.
y <- as.integer(testDF$y)

# Predictors: every column except the response. A logical mask is used
# instead of `-which(...)`, which would select zero columns if nothing
# matched; `drop = FALSE` keeps a data frame (and therefore its names)
# even when only one predictor column remains.
x <- testDF[, !(names(testDF) %in% "y"), drop = FALSE]
var.names <- names(x)
x <- as.matrix(x)
# Rebuild as a plain numeric matrix. This also discards the dimnames, so
# the model is fitted on unnamed columns.
x <- matrix(as.numeric(x), nrow(x), ncol(x))

nround <- 10

# NOTE(review): `param` is not defined in this snippet; it must already
# exist in the session. The argument is spelled `params` in full rather
# than relying on partial argument matching.
XX <- xgboost(
  params = param, data = x, label = y, nrounds = nround, missing = NA
)

然後我寫了一些代碼,爲每個葉節點構建出通向該葉節點的全部規則:

# Flatten the fitted booster into one row per tree node.
# `filter()` is used with dplyr semantics and the loop uses foreach's
# `%do%`, so both packages must be attached before this runs.
baseTree <- xgb.model.dt.tree(model = XX)

Leafs <- filter(baseTree, Feature == "Leaf")
Branches <- filter(baseTree, Feature != "Leaf")

# Translate the feature identifier of each split into a column name.
# Assumes `Feature` holds 0-based column indices stored as text -- TODO
# confirm for the installed xgboost version.
Branches$Feature <- var.names[as.numeric(Branches$Feature) + 1]

# Holds, per leaf, the most recently generated condition (kept because the
# original script exposed this variable).
FullRules <- rep(NA_character_, nrow(Leafs))

# For every leaf, walk upwards from the leaf to the root (Node 0) and join
# the condition of each traversed split with " & ". The result is the leaf
# table plus a `RuleText` column containing an R expression as a string.
# `seq_len()` makes the loop a no-op when there are no leaves.
AllRules <- foreach(i = seq_len(nrow(Leafs)), .combine = "rbind") %do% {
  theLeaf <- Leafs[i, ]
  theNode <- theLeaf$Node
  theID <- theLeaf$ID

  RuleText <- ""
  while (theNode != 0) {
    # The parent is the split whose Yes / No / Missing child is this node.
    FF <- filter(Branches, Yes == theID | No == theID | Missing == theID)
    if (nrow(FF) != 1L) {
      stop(
        "Expected exactly one parent for node ", theID,
        " but found ", nrow(FF),
        call. = FALSE
      )
    }

    isYes <- FF$Yes == theID
    isNo <- FF$No == theID
    isMissing <- FF$Missing == theID

    # Same precedence as the original chained ifelse() calls: the "No"
    # branch wins over "Yes", and a pure missing-value branch is used only
    # when the node is neither the Yes nor the No child.
    rule <- if (isNo) {
      if (isMissing) {
        paste0("(", FF$Feature, " >= ", FF$Split, " | is.na(", FF$Feature, "))")
      } else {
        paste0(FF$Feature, " >= ", FF$Split)
      }
    } else if (isYes) {
      if (isMissing) {
        paste0("(", FF$Feature, " < ", FF$Split, " | is.na(", FF$Feature, "))")
      } else {
        paste0(FF$Feature, " < ", FF$Split)
      }
    } else {
      paste0("is.na(", FF$Feature, ")")
    }
    FullRules[i] <- rule

    RuleText <- if (nzchar(RuleText)) paste0(RuleText, " & ", rule) else rule

    # Continue from the parent.
    theNode <- FF$Node
    theID <- FF$ID
  }

  # A leaf that is itself the root has no conditions; use an expression
  # that is always satisfied instead of an empty, unparsable string.
  if (!nzchar(RuleText)) {
    RuleText <- "TRUE"
  }

  data.frame(
    Leafs[i, ],
    RuleText
  )
}

現在我挑出 1 行數據,嘗試復現它的預測概率。在這個例子中,結果是匹配的。循環會遍歷所有規則,並對該客戶滿足的每一條規則標記 TRUE。然後我篩選出這些行,把對應的葉值相加,得到對數幾率(log-odds)的估計值,再把它轉換爲概率。

# Score a single observation (row 25 of testDF) by hand and compare the
# result with predict().
TT <- testDF[25,] 

# Build one row per leaf rule: `Tree` is the rule text handed to
# transmute_() as a string (presumably evaluated against TT's columns to a
# TRUE/FALSE flag -- confirm against the dplyr version in use), and
# `Quality` is the value stored for that leaf in AllRules.
ff <- foreach(i = 1:nrow(AllRules), .combine = 'rbind') %do% { 
    TT %>% transmute_(
    Tree = as.character(AllRules$RuleText[i]) 
    , Quality = AllRules$Quality[i]) 
} 


# Reference value from the fitted model for the same row.
predict(XX, as.matrix(TT[,var.names])) 
#[1] 0.05571342 

# Keep the rows whose rule holds, sum their values and apply the logistic
# transform.
filter(ff, Tree) %>% 
    summarise(
    # NOTE(review): sqrt(Quality^2) equals abs(Quality), so the sign of each
    # leaf value is discarded before summing -- confirm this is intended.
    Q1 = sum(sqrt(Quality^2)) 
    # ,Q2 = sum(sqrt(Quality^2)) 
    , Prob1 = exp(Q1)/(1+exp(Q1)) 
    , Prob2 = 1-Prob1 
    ) 
#  Q1  Prob1  Prob2 
#1 2.830209 0.9442866 0.0557134 

但在下面這個例子中,手動計算的結果與 predict 函數的輸出不匹配……

# Repeat the manual scoring for a different observation (row 17 of testDF).
TT <- testDF[17,] 

# One row per leaf rule: `Tree` is the rule text handed to transmute_() as
# a string (presumably evaluated against TT's columns to a TRUE/FALSE flag
# -- confirm against the dplyr version in use), and `Quality` is the value
# stored for that leaf in AllRules.
ff <- foreach(i = 1:nrow(AllRules), .combine = 'rbind') %do% { 
    TT %>% transmute_(
    Tree = as.character(AllRules$RuleText[i]) 
    , Quality = AllRules$Quality[i]) 
} 


# Reference value from the fitted model for the same row.
predict(XX, as.matrix(TT[,var.names])) 
#[1] 0.1386877 

# Keep the rows whose rule holds, sum their values and apply the logistic
# transform.
filter(ff, Tree) %>% 
    summarise(
    # NOTE(review): sqrt(Quality^2) equals abs(Quality), so the sign of each
    # leaf value is discarded before summing -- confirm this is intended.
    Q1 = sum(sqrt(Quality^2)) 
    # ,Q2 = sum(sqrt(Quality^2)) 
    , Prob1 = exp(Q1)/(1+exp(Q1)) 
    , Prob2 = 1-Prob1 
    ) 
#  Q1 Prob1 Prob2 
#1 1.967608 0.877354 0.122646 

回答

1

要生成預測,只需把該樣本在每個 booster(每棵樹)中落入的葉節點的值相加即可:

# Sum the signed values of the leaves whose rule the row satisfies, then
# map that sum to a probability with the logistic function.
# NOTE(review): this presumes the model's initial margin is 0 (e.g. a
# base_score of 0.5 with a logistic objective) -- confirm against the
# parameters the model was trained with.
filter(ff, Tree) %>%
  summarise(
    Q1 = sum(Quality),
    # plogis(q) computes 1 / (1 + exp(-q)); unlike exp(q) / (1 + exp(q)) it
    # does not turn into NaN when exp(q) overflows.
    Prob1 = stats::plogis(Q1),
    Prob2 = 1 - Prob1
  )