1
我想弄清楚如何根據xgboost模型中的各棵樹手動計算概率,使其與predict
函數返回的結果一致。換句話說:如何從xgboost模型手動構建預測?
首先,我建立模型:
library(xgboost)
#install.packages("ModelMetrics")
library(ModelMetrics)
library(dplyr)
library(foreach)
set.seed(100)

# - Extreme gbm
# Response vector handed to xgboost as `label`.
# NOTE(review): if testDF$y is a factor, as.integer() yields the level codes
# (1, 2, ...) rather than 0/1 -- confirm the encoding suits the objective.
y <- as.integer(testDF$y)

# Predictors are every column except the response. A logical mask replaces
# -which(...): when nothing matches, -which() selects zero columns instead of
# all of them. drop = FALSE keeps a data frame (and its names) even when only
# one predictor is left.
x <- testDF[, !(names(testDF) %in% "y"), drop = FALSE]
var.names <- names(x)

# Coerce to a plain numeric matrix. Rebuilding it with matrix() discards the
# dimnames, so the model is trained without feature names; var.names keeps the
# column order so features can be mapped back by position afterwards.
x <- as.matrix(x)
x <- matrix(as.numeric(x), nrow = nrow(x), ncol = ncol(x))

nround <- 10

# `param` is not defined in this snippet -- presumably a list of booster
# parameters created elsewhere; verify before running.
# The argument is spelled `params` in full instead of relying on partial
# matching of `param`.
XX <- xgboost(
  params = param,
  data = x,
  label = y,
  nrounds = nround,
  missing = NA
)
然後我寫了一些代碼,爲每個葉節點構建從根節點到達該葉節點所需滿足的全部規則:
# Flatten the fitted booster into a single table of tree nodes.
baseTree <- xgb.model.dt.tree(model = XX)

# Rows labelled "Leaf" in the Feature column are terminal nodes; every other
# row is a split.
Leafs <- baseTree[baseTree$Feature == "Leaf", ]
Branches <- baseTree[baseTree$Feature != "Leaf", ]

# Translate split features back to column names.
# Assumes Feature holds zero-based column indices stored as text, hence the
# "+ 1" -- TODO confirm for the installed xgboost version.
Branches$Feature <- var.names[as.numeric(Branches$Feature) + 1]

# Build the R condition (as text) that sends an observation from `parent`, a
# one-row slice of Branches, to its child node `child_id`.
# Returns a length-one character string, or NA when `child_id` is not a child
# of `parent`.
split_condition <- function(parent, child_id) {
  via_yes <- parent$Yes == child_id
  via_no <- parent$No == child_id
  via_missing <- parent$Missing == child_id

  if (via_yes) {
    condition <- paste0(parent$Feature, " < ", parent$Split)
  } else if (via_no) {
    condition <- paste0(parent$Feature, " >= ", parent$Split)
  } else if (via_missing) {
    # Fallback for a child that is reached only through the missing branch.
    return(paste0("is.na(", parent$Feature, ")"))
  } else {
    return(NA_character_)
  }

  if (via_missing) {
    # Missing values follow the same branch, so accept them explicitly.
    condition <- paste0("(", condition, " | is.na(", parent$Feature, "))")
  }
  condition
}

# FullRules[i] ends up holding the last condition visited for leaf i (the
# split nearest the root), exactly as the original loop left it.
FullRules <- rep(NA_character_, nrow(Leafs))

# Collect one data frame per leaf and bind once at the end instead of growing
# the result with repeated rbind() calls.
rule_rows <- vector("list", nrow(Leafs))

for (i in seq_len(nrow(Leafs))) {
  theLeaf <- Leafs[i, ]
  theNode <- theLeaf$Node
  theID <- theLeaf$ID
  path_conditions <- character(0)

  # Climb from the leaf towards the root (Node 0), recording at every step
  # the condition that leads from the parent down to the current node.
  while (theNode != 0) {
    is_parent <- Branches$Yes == theID |
      Branches$No == theID |
      Branches$Missing == theID
    FF <- Branches[is_parent, ]
    if (nrow(FF) != 1L) {
      stop(
        "Expected exactly one parent for node ", theID,
        " but found ", nrow(FF),
        call. = FALSE
      )
    }

    FullRules[i] <- split_condition(FF, theID)
    path_conditions <- c(path_conditions, FullRules[i])

    theNode <- FF$Node
    theID <- FF$ID
  }

  # A tree that consists of a single leaf has no conditions at all. Use the
  # always-true rule "TRUE" rather than an empty string, which could not be
  # evaluated as an expression later on.
  RuleText <- if (length(path_conditions) > 0) {
    paste(path_conditions, collapse = " & ")
  } else {
    "TRUE"
  }

  rule_rows[[i]] <- data.frame(theLeaf, RuleText = RuleText)
}

# One row per leaf: the leaf's own columns plus the rule text that reaches it.
AllRules <- do.call(rbind, rule_rows)
現在我挑選出1行,嘗試復現它的預測概率。在這個例子中,結果是一致的。下面的循環會遍歷所有規則,並對該客戶滿足的每一條規則標記TRUE。接着,我篩選出這些行,把對應的葉節點值相加,得到對數幾率(log-odds)的估計值,最後再把它轉換爲概率。
# Rebuild the predicted probability of one observation by hand.
TT <- testDF[25, ]

# Evaluate every leaf rule against that single row. The rules are R
# expressions stored as text and generated by this script itself, so they are
# parsed and evaluated with the row's columns in scope. This replaces the
# deprecated dplyr::transmute_() string interface.
ff <- data.frame(
  Tree = vapply(
    as.character(AllRules$RuleText),
    function(rule) {
      # A rule without conditions (single-leaf tree) holds for every row.
      if (!nzchar(rule)) {
        return(TRUE)
      }
      as.logical(eval(parse(text = rule), envir = TT))
    },
    logical(1),
    USE.NAMES = FALSE
  ),
  Quality = AllRules$Quality
)

predict(XX, as.matrix(TT[, var.names]))
#[1] 0.05571342

# The raw score is the SIGNED sum of the values of the leaves the row falls
# into. The earlier sum(sqrt(Quality^2)) added absolute values, which only
# reproduces predict() when every satisfied leaf is negative.
# Assumes a logistic objective with no extra intercept (base_score = 0.5);
# `param` is not visible here -- TODO confirm.
filter(ff, Tree) %>%
  summarise(
    Q1 = sum(Quality),
    Prob1 = plogis(Q1),
    Prob2 = 1 - Prob1
  )
# Prob1 is now the value to compare with predict().
# Output recorded with the old absolute-value formula, kept for reference
# (there Prob2, not Prob1, matched predict()) -- TODO re-run to refresh:
# Q1 Prob1 Prob2
#1 2.830209 0.9442866 0.0557134
但在下面這個例子中,手動計算的結果與predict函數的輸出不一致……
# Rebuild the predicted probability of a second observation by hand.
TT <- testDF[17, ]

# Evaluate every leaf rule against that single row. The rules are R
# expressions stored as text and generated by this script itself, so they are
# parsed and evaluated with the row's columns in scope. This replaces the
# deprecated dplyr::transmute_() string interface.
ff <- data.frame(
  Tree = vapply(
    as.character(AllRules$RuleText),
    function(rule) {
      # A rule without conditions (single-leaf tree) holds for every row.
      if (!nzchar(rule)) {
        return(TRUE)
      }
      as.logical(eval(parse(text = rule), envir = TT))
    },
    logical(1),
    USE.NAMES = FALSE
  ),
  Quality = AllRules$Quality
)

predict(XX, as.matrix(TT[, var.names]))
#[1] 0.1386877

# The raw score is the SIGNED sum of the values of the leaves the row falls
# into. The earlier sum(sqrt(Quality^2)) added absolute values, so any
# satisfied leaf with a positive value pushed the total in the wrong
# direction and neither probability matched predict().
# Assumes a logistic objective with no extra intercept (base_score = 0.5);
# `param` is not visible here -- TODO confirm.
filter(ff, Tree) %>%
  summarise(
    Q1 = sum(Quality),
    Prob1 = plogis(Q1),
    Prob2 = 1 - Prob1
  )
# Prob1 is the value to compare with predict().
# Output recorded with the old absolute-value formula, kept for reference
# (neither column matched predict()) -- TODO re-run to refresh:
# Q1 Prob1 Prob2
#1 1.967608 0.877354 0.122646