2017-04-26 118 views
-1
import numpy as np 
import pandas as pd 
import tensorflow as tf 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split 


# Reproducible random seed: seeds NumPy's global RNG (used by
# np.random.choice below) and is also passed explicitly to train_test_split
# so the splits do not depend on how much global RNG state was consumed.
seed = 1
np.random.seed(seed)

# Load the data. 'Class' is the label column: rows with Class == 1 are
# treated as fraud and rows with Class == 0 as normal (see below).
df = pd.read_csv('creditcard.csv')


# Exploring the data (kept for reference, disabled):

# print(df.head())
# print(df.describe())
# print(df.isnull().sum())


# count_class = pd.value_counts(df['Class'])
# count_class.plot(kind = 'bar')
# plt.title('Fraud class histogram')
# plt.xlabel('class')
# plt.ylabel('Frequency')
# plt.show()

# print('Clearly the data is totally unbalanced!')

# Standardize the 'Amount' column into 'normAmount'. StandardScaler expects a
# 2-D input, hence the reshape to a single column.
df['normAmount'] = StandardScaler().fit_transform(
    df['Amount'].values.reshape(-1, 1))

# Drop the columns that are not used as features; the raw 'Amount' is
# replaced by 'normAmount'.
df = df.drop(
    ['Time', 'V28', 'V27', 'V26', 'V25', 'V24', 'V23', 'V22', 'V20', 'V15',
     'V13', 'V8', 'Amount'],
    axis=1)

# Features are every column except 'Class'; the label is kept as a one-column
# DataFrame (not a Series) so it stays 2-D.
X = df.loc[:, df.columns != 'Class']
Y = df.loc[:, df.columns == 'Class']

# Index labels and count of the minority (fraud) class.
fraud_indices = np.array(df[df.Class == 1].index)
number_record_fraud = len(fraud_indices)

# Index labels of the normal class.
normal_indices = np.array(df[df.Class == 0].index)

# Randomly pick as many normal rows as there are fraud rows, without
# replacement, so the under-sampled set is balanced 50/50.
random_normal_indices = np.random.choice(
    normal_indices, number_record_fraud, replace=False)

# Under-sampled data. The arrays above hold index *labels*, so select with
# .loc; .iloc would only give the same rows while the index happens to be the
# default 0..n-1 range.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
under_sample_data = df.loc[under_sample_indices, :]

X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
Y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Split both the full data and the under-sampled data into 70% train / 30%
# test.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=seed)
(X_train_undersample, X_test_undersample,
 Y_train_undersample, Y_test_undersample) = train_test_split(
    X_undersample, Y_undersample, test_size=0.3, random_state=seed)

# Hyper-parameters
learning_rate = 0.05
training_epoch = 10
batch_size = 43
display_step = 1

# Number of input features, taken from the training data instead of being
# hard-coded so the graph stays in sync with the preprocessing.
n_features = X_train_undersample.shape[1]

# tf graph inputs: a batch of feature rows and the matching 0/1 labels
x = tf.placeholder(tf.float32, [None, n_features])
y = tf.placeholder(tf.float32, [None, 1])

# Model weights (single-output logistic regression)
w = tf.Variable(tf.zeros([n_features, 1]))
b = tf.Variable(tf.zeros([1]))

# Construct the model. With a single output unit the activation must be a
# sigmoid: softmax over one logit is always exactly 1.0, which makes the
# cross-entropy identically 0 and leaves nothing to learn.
logits = tf.matmul(x, w) + b
pred = tf.nn.sigmoid(logits)

# Binary cross-entropy, computed from the logits for numerical stability
# (avoids log(0) when the sigmoid saturates).
cost = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits))

# Adam optimizer
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

# Accuracy: threshold the predicted probability at 0.5 and compare with the
# label. (argmax over a single column is always 0, so comparing argmaxes
# would report 100% regardless of the model.) Built once, outside the
# training loop, so the graph does not grow every epoch.
predicted_class = tf.cast(tf.greater(pred, 0.5), tf.float32)
correct_prediction = tf.equal(predicted_class, y)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Initializing variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)

    # Ceiling division so a trailing partial batch is still used; integer
    # division keeps range() valid on both Python 2 and 3.
    n_train = len(X_train_undersample)
    total_batch = (n_train + batch_size - 1) // batch_size

    # Training cycle
    for epoch in range(training_epoch):
        avg_cost = 0.0
        # Loop over all the batches of the under-sampled training set (the
        # same set total_batch is derived from).
        for batch in range(total_batch):
            start = batch * batch_size
            stop = start + batch_size
            batch_xs = X_train_undersample.iloc[start:stop]
            batch_ys = Y_train_undersample.iloc[start:stop]
            # Run one optimizer step and fetch the batch cost
            _, c = sess.run([optimizer, cost],
                            feed_dict={x: batch_xs, y: batch_ys})
            avg_cost += c / total_batch

        # Display log per epoch step. Single-argument print(...) calls
        # behave the same on Python 2 and Python 3.
        if (epoch + 1) % display_step == 0:
            print("epoch: %d avg_cost: %s" % (epoch + 1, avg_cost))
            print("test_set_accuracy: %s" % (
                accuracy.eval({x: X_test_undersample,
                               y: Y_test_undersample}) * 100))
            print("whole_set_accuracy: %s" % (
                accuracy.eval({x: X, y: Y}) * 100))
            print("")

    print('optimization finished.')

是什麼原因導致算法過度擬合?爲了找出原因,我已經嘗試過以下做法:

  • 試圖改變訓練集的長度。
  • 丟棄了一些不需要的字段。
  • 嘗試加入驗證步驟(validation block)。

數據集:link

+2

歡迎來到[so]。我編輯了你的帖子,使其更好的可讀性(格式),並刪除不屬於帖子的東西。請查看[遊覽],然後點擊我的頭像(或任何其他人上次編輯您的帖子)上方的「編輯......前」鏈接查看編輯歷史記錄,以便查看刪除/更改的內容(並希望從中學習)。我不知道你的主題,但更好的帖子總是增加了回答的機會。 – Anthon

+0

Thanks @Anthon。欣賞它。 –

+1

你的網絡似乎很小? 0.05的學習率可能偏高,你是否試圖繪製你的訓練/驗證損失來看看它的曲線?爲什麼你的批量大小是43? – TheLaurens

回答

0

過度擬合可能有多種原因,也有多種方式來調試和解決。單從代碼很難判斷,因爲這也取決於數據,但以下是一些常見的原因以及修正方法:

  • 數據集太小:添加更多數據是修復過度擬合的常見做法。
  • 模型太複雜:如果你有很多特徵,或者複雜的校正功能,可以嘗試使用特徵選擇來降低複雜度。
  • 添加正則化:我在你的代碼中看不到正則化,嘗試添加它。
相關問題