我的決策樹分類器得到的準確率爲1.0,決策樹輸出中只有一個節點,混淆矩陣中也只有一個元素。隨機森林也有類似的問題。標題:決策樹sklearn:預測準確率100%
import warnings

import numpy
import pandas
import sklearn.metrics
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import train_test_split
# Load the survey extract in a single pass (low_memory=False).
data = pandas.read_csv('nesarc_pds.csv', low_memory=False)
# Coerce every analysis column to a numeric dtype; entries that cannot be
# parsed as numbers become NaN (errors='coerce').
numeric_columns = [
    'CONSUMER',
    'S2AQ16A',
    'S2DQ3C1',
    'S2DQ3C2',
    'S2DQ4C1',
    'S2DQ4C2',
    'S2DQ1',
    'S2DQ2',
    'SEX',
]
for column_name in numeric_columns:
    data[column_name] = pandas.to_numeric(data[column_name], errors='coerce')
# Keep only respondents whose age when they started drinking (S2AQ16A)
# lies between 10 and 30, inclusive. Rows with a missing age are dropped too.
# NOTE(review): restricting to rows that report a drinking-onset age
# presumably removes everyone who never drank, which would leave the DRSTS
# target with a single class -- confirm against the codebook.
onset_in_range = data['S2AQ16A'].between(10, 30)
sub1 = data[onset_in_range]
# Take an explicit copy so the column assignments below operate on an
# independent frame.
sub2 = sub1.copy()
# Recode the placeholder values used for missing data to NaN.
missing_codes = {
    'S2AQ16A': 99,  # no effect here: the 10-30 filter already excludes 99
    'S2DQ3C1': 99,
    'S2DQ3C2': 9,
    'S2DQ4C1': 99,
    'S2DQ4C2': 9,
    'S2DQ1': 9,
    'S2DQ2': 9,
}
for column_name, missing_code in missing_codes.items():
    sub2[column_name] = sub2[column_name].replace(missing_code, numpy.nan)
# Secondary variable: number of siblings, the sum of the two count columns
# (NaN whenever either count is missing).
sub2['SIBNO'] = sub2['S2DQ3C1'] + sub2['S2DQ4C1']
#defining new variable for sibling drinking status by combining data of brothers and sisters
def SIBSTS(row):
    """Return the combined sibling drinking status for one survey row.

    Args:
        row: Mapping-like record (for example a DataFrame row) that exposes
            the 'S2DQ3C2' and 'S2DQ4C2' codes.

    Returns:
        1 if either code equals 1, 0 if both codes equal 2, and numpy.nan
        for every other combination (including missing codes).
    """
    # Read both codes up front so a missing key fails the same way no matter
    # which branch would have been taken.
    first_code = row['S2DQ3C2']
    second_code = row['S2DQ4C2']
    if first_code == 1 or second_code == 1:
        return 1
    if first_code == 2 and second_code == 2:
        return 0
    # Any other mix (e.g. one code 2 and the other missing) is undetermined.
    return numpy.nan
# Evaluate SIBSTS once per row (axis=1) and store the result as a new column.
sub2['SIBSTS'] = sub2.apply(SIBSTS, axis=1)
#defining new variable for parent drinking status
def PRSTS(row):
    """Return the combined parental drinking status for one survey row.

    Args:
        row: Mapping-like record (for example a DataFrame row) that exposes
            the 'S2DQ1' and 'S2DQ2' codes.

    Returns:
        1 if either code equals 1, 0 if both codes equal 2, and numpy.nan
        for every other combination (including missing codes).
    """
    # Read both codes up front so a missing key fails the same way no matter
    # which branch would have been taken.
    first_code = row['S2DQ1']
    second_code = row['S2DQ2']
    if first_code == 1 or second_code == 1:
        return 1
    if first_code == 2 and second_code == 2:
        return 0
    # Any other mix (e.g. one code 2 and the other missing) is undetermined.
    return numpy.nan
# Evaluate PRSTS once per row (axis=1) and store the result as a new column.
sub2['PRSTS'] = sub2.apply(PRSTS, axis=1)
#recoding values for 'CONSUMER' into a new variable, DRSTS
# Codes 1 and 2 collapse to 1, code 3 becomes 0; any other value maps to NaN.
# NOTE(review): presumably 1/2 are current/former drinkers and 3 is a
# lifetime abstainer -- confirm against the codebook.
recode1 = {1: 1, 2: 1, 3: 0}
sub2['DRSTS'] = sub2['CONSUMER'].map(recode1)
#recoding new values for SEX variable
# Code 1 stays 1, code 2 becomes 0; any other value maps to NaN.
recode2 = {1: 1, 2: 0}
sub2['GEN'] = sub2['SEX'].map(recode2)
# Drop incomplete rows, but only look at the columns the model actually uses.
# A bare dropna() would also discard rows for missing values in any of the
# unrelated columns of the survey file.
model_columns = ['S2AQ16A', 'SIBNO', 'SIBSTS', 'PRSTS', 'GEN', 'DRSTS']
data_clean = sub2.dropna(subset=model_columns)
# These bare expressions only display a result in an interactive session.
data_clean.dtypes
data_clean.describe()
#Modeling and Prediction
# Predictor columns and the binary target produced by the recoding steps.
predictors = data_clean[['S2AQ16A','SIBNO','SIBSTS','PRSTS','GEN']]
targets = data_clean['DRSTS']
# With fewer than two classes in the target the tree collapses to a single
# leaf, the confusion matrix has a single cell and accuracy is trivially 1.0.
# Surface that condition instead of reporting a meaningless perfect score.
if targets.nunique() < 2:
    warnings.warn(
        "Target 'DRSTS' contains fewer than two classes after cleaning; "
        "accuracy and the confusion matrix will be meaningless. "
        "Check the row filter applied before modeling."
    )
#Split into training and testing sets
# 60% of the rows are used for training and 40% for testing.
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.4)
print(pred_train.shape)
print(pred_test.shape)
print(tar_train.shape)
print(tar_test.shape)
#Build model on training data
classifier = DecisionTreeClassifier()
classifier = classifier.fit(pred_train, tar_train)
# Evaluate on the held-out rows only.
predictions = classifier.predict(pred_test)
print(sklearn.metrics.confusion_matrix(tar_test, predictions))
print(sklearn.metrics.accuracy_score(tar_test, predictions))
#Displaying the decision tree
from sklearn import tree
import io
from IPython.display import Image
import pydotplus
# export_graphviz emits text, which a bytes buffer rejects on Python 3.
# Passing out_file=None makes it return the DOT source as a string instead.
dot_data = tree.export_graphviz(classifier, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
# Build a PNG rendering of the tree for inline display.
Image(graph.create_png())
# Also save the rendering as a PDF in the working directory.
graph.write_pdf("iris.pdf")
輸出:代碼中使用的數據集 - nesarc_pds
感謝您的幫助@Darshan。我早些時候使用過你提到的代碼,但結果是一樣的,所以我這樣做是爲了檢查目的,但在發佈之前忘了糾正它。現在我編輯了這個。我正面臨與隨機森林相同的問題。如果您想嘗試這樣做,我已經分享了數據集的鏈接。 –
如果是這樣,那麼它不應該是這樣。 – Darshan
我會盡力找到原因,並會讓你知道。 – Darshan