I want to run lasso optimization on data with 950 samples and roughly 5000 features. The lasso objective is $\frac{1}{2 \, n_{\text{samples}}} \|y - Xw\|_2^2 + \alpha \|w\|_1$. When I start the minimization from an initial point, I get a completely different $w$, which is strange: lasso is convex, so the initialization should not affect the result. Below are the results of lasso with and without initialization. tol is the tolerance; convergence is declared when the change in $w$ drops below the tolerance. Why do different initial points lead to different results in a (convex) lasso optimization?
tol=0.00000001
##### lasso model errors #####
gene: 5478 matrix error: 0.069611732213
with initialization: alpha: 1e-20 promotion: -3.58847815733e-13
coef: [-0.00214732 -0.00509795 0.00272167 -0.00651548 -0.00164646 -0.00115342
0.00553346 0.01047653 0.00139832]
without initialization: alpha: 1e-20 promotion: -19.0735249749
coef: [-0.03650629 0.08992003 -0.01287155 0.03203973 0.1567577 -0.03708655
-0.13710957 -0.01252736 -0.21710334]
with initialization: alpha: 1e-15 promotion: 1.06179081478e-10
coef: [-0.00214732 -0.00509795 0.00272167 -0.00651548 -0.00164646 -0.00115342
0.00553346 0.01047653 0.00139832]
without initialization: alpha: 1e-15 promotion: -19.0735249463
coef: [-0.03650629 0.08992003 -0.01287155 0.03203973 0.1567577 -0.03708655
-0.13710957 -0.01252736 -0.21710334]
Warning (from warnings module):
File "/usr/local/lib/python2.7/site-packages/sklearn/linear_model/coordinate_descent.py", line 491
ConvergenceWarning)
ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
with initialization: alpha: 1e-10 promotion: 0.775144987537
coef: [-0.00185139 -0.0048819 0.00218349 -0.00622618 -0.00145647 -0.00115857
0.0055919 0.01072924 0.00043773]
without initialization: alpha: 1e-10 promotion: -17.8649603301
coef: [-0.03581581 0.0892119 -0..03151441 0.15606195 -0.03734093
-0.13604286 -0.01247732 -0.21233529]
with initialization: alpha: 1e-08 promotion: -5.87121366314
coef: [-0. 0. -0. -0.01064477 0. -0.00116167
-0. 0.01114746 0. ]
without initialization: alpha: 1e-08 promotion: 4.05593555389
coef: [ 0. 0.04505117 0.00668611 0. 0.07731668 -0.03537848
-0.03151995 0. -0.00310122]
max promote:
4.05593555389
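Given these numbers, one way to sanity-check the convexity argument is to evaluate the lasso objective at both returned coefficient vectors: if the two values (almost) coincide, both runs found near-minimizers of the same convex problem even though the vectors differ. A minimal sketch on synthetic data (the helper lasso_objective and all variable names below are mine, not sklearn's): with p >> n and a tiny alpha, the fit term can be driven to ~0 by many different w, so the minimizer is far from unique in practice.

import numpy as np

def lasso_objective(X, y, w, alpha):
    # (1 / (2 * n_samples)) * ||y - Xw||_2^2 + alpha * ||w||_1
    n = X.shape[0]
    r = y - X.dot(w)
    return r.dot(r) / (2.0 * n) + alpha * np.abs(w).sum()

rng = np.random.RandomState(0)
n, p = 50, 500                      # p >> n, as in the question
X = rng.randn(n, p)
y = X.dot(rng.randn(p))

w_a = np.linalg.pinv(X).dot(y)      # minimum-norm solution, fits y exactly
null_dir = rng.randn(p)
null_dir -= np.linalg.pinv(X).dot(X.dot(null_dir))  # keep only the null-space component
w_b = w_a + null_dir                # a very different w with the same fit

alpha = 1e-20
print(lasso_objective(X, y, w_a, alpha))  # ~0
print(lasso_objective(X, y, w_b, alpha))  # also ~0, although w_a != w_b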
For the implementation I used the Lasso class from the Python package sklearn.linear_model. I also tried other data, and on the new data the results likewise change with the initialization. I find this strange, but I cannot analyze it or find an explanation.
Below is the part of my code that concerns the lasso. My data is gene expression. I tested the code on both normalized and non-normalized data; in both cases the result differs with the initial point.
alpha_lasso = [1e-20, 1e-15, 1e-10, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1, 5, 20]
lassoreg = Lasso(alpha=alpha_lasso[i], warm_start=True, tol=1e-8, max_iter=100000)
lassoreg.coef_ = mybeta[:, j - c]  # hand-set initial point for coordinate descent
lassoreg.fit(train[:, predictors], train[:, y])
y_train_pred = lassoreg.predict(A)  # A = train[:, predictors]
y_test_pred = lassoreg.predict(C)   # C = test[:, predictors]
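For reference, here is a minimal self-contained version of this with/without-initialization comparison (synthetic data via make_regression; all names below are mine). With warm_start=True, sklearn's fit() starts coordinate descent from the current value of coef_ when one is set, while a fresh estimator starts from zeros:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso

X, y = make_regression(n_samples=95, n_features=500, random_state=0)

# with initialization: fit() resumes from the hand-set coef_
m1 = Lasso(alpha=1e-10, warm_start=True, tol=1e-8, max_iter=100000)
m1.coef_ = np.ones(X.shape[1])
m1.fit(X, y)

# without initialization: coordinate descent starts from zeros
m2 = Lasso(alpha=1e-10, tol=1e-8, max_iter=100000)
m2.fit(X, y)

print(np.abs(m1.coef_ - m2.coef_).max())  # often noticeably nonzero when p >> n and alpha is tiny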
And here is my full code:
import pandas as pd
import random
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import os
from GEOparse.GEOTypes import (GSE, GSM, GPL, GDS,
GDSSubset, GEODatabase,
DataIncompatibilityException,
NoMetadataException,
)
import GEOparse as GEO
import copy
import sys
import math
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoLars
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.linear_model import coordinate_descent
from sklearn.linear_model import lasso_path, enet_path
from sklearn.base import BaseEstimator, RegressorMixin
from copy import deepcopy
miss_percent = 0.1
alpha_lasso = [1e-20,1e-15, 1e-10, 1e-8, 1e-7,1e-6,1e-5,1e-4, 1e-3,1e-2, 1, 5 ,20]
mins=[]
maxs=[]
mean_err=[]
alphas=[]
mins1=[]
maxs1=[]
mean_err1=[]
alphas1=[]
#mnist = input_data.read_data_sets('../../MNIST_data', one_hot=True)
def getdata(percent):
    gsd = GEO.get_GEO(geo="GDS4971")
    ngsd = gsd.table.replace('null', np.NaN)
    ngsd = ngsd.dropna(axis=0, how='any')
    ngsd = ngsd.transpose()
    dataarray = ngsd.values
    data = np.delete(dataarray, [0, 1], 0)
    x = data.astype(np.float)
    r_df = x.shape[0]
    c_df = x.shape[1]
    r = int(r_df - math.sqrt((1 - percent) * r_df))
    c = int(c_df - math.sqrt((1 - percent) * c_df))
    train = x[0:r, :]
    test = x[r:r_df, :]
    return x, train, test, r_df, c_df, r, c
genedata,train,test,r_df,c_df,r,c = getdata(miss_percent)
predictors = range(0,c)
promotion =[[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
promotion = np.asmatrix(promotion)
#error of ax-b
error_aw_b = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
error_aw_b = np.asmatrix(error_aw_b)
#error of cw-x
error_cw_x = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
error_cw_x = np.asmatrix(error_cw_x)
#error of lasso function
error_lasso = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
error_lasso = np.asmatrix(error_lasso)
promotion1 = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df - c)]
promotion1 = np.asmatrix(promotion1)
#error of ax-b
error_aw_b1 = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df - c)]
error_aw_b1 = np.asmatrix(error_aw_b1)
#error of cw-x
error_cw_x1 = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df - c)]
error_cw_x1 = np.asmatrix(error_cw_x1)
#error of lasso function
error_lasso1 = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df - c)]
error_lasso1 = np.asmatrix(error_lasso1)
mybeta = np.zeros((c, c_df - c))  # any initialization; zeros chosen here as an example
###################### LASSO #####################
print("##### lasso model errors #####")
for j in range(c, c + 1):
    mean_err = []
    print("\n")
    y = j
    # errorC is computed elsewhere in my code (not shown)
    eachMeanError = math.sqrt((np.power(errorC[:, j - c], 2)).sum() / (r_df - r))
    print("gene: " + str(j) + " matrix error: " + str(eachMeanError))
    for i in range(0, 4):  # range(len(alpha_lasso))
        lassoreg = Lasso(alpha=alpha_lasso[i], warm_start=True, tol=1e-8, max_iter=100000)
        lassoreg.coef_ = mybeta[:, j - c]  # hand-set initial point
        lassoreg.fit(train[:, predictors], train[:, y])
        y_train_pred = lassoreg.predict(A)  # A = train[:, predictors]
        y_test_pred = lassoreg.predict(C)   # C = test[:, predictors]
        # lasso objective: (1 / (2 * n_samples)) * ||y - Xw||_2^2 + alpha * ||w||_1
        y_lasso_func = (1.0 / (2 * r)) * sum((y_train_pred - train[:, y]) ** 2) + alpha_lasso[i] * sum(abs(lassoreg.coef_))
        ################## RMS ##################
        error_aw_b[j - c, i] = math.sqrt(sum((y_train_pred - train[:, y]) ** 2) / r)
        error_lasso[j - c, i] = y_lasso_func
        error_cw_x[j - c, i] = math.sqrt(sum((y_test_pred - test[:, y]) ** 2) / (r_df - r))
        mins.extend([error_cw_x.min()])
        maxs.extend([error_cw_x.max()])
        promotion[j - c, i] = ((eachMeanError - error_cw_x[j - c, i]) / eachMeanError) * 100
        print("alpha: " + str(alpha_lasso[i]) + " error_aw_b: " + str(error_aw_b[j - c, i]) + " error_cw_x: " + str(error_cw_x[j - c, i]) + " error_lasso: " + str(error_lasso[j - c, i]) + " promotion: " + str(promotion[j - c, i]))
        print("coef: " + str(lassoreg.coef_[1:10]))
        # same alpha, but without the hand-set initial point
        lassoreg1 = Lasso(alpha=alpha_lasso[i], tol=1e-8, max_iter=100000)
        lassoreg1.fit(train[:, predictors], train[:, y])
        y_train_pred1 = lassoreg1.predict(A)
        y_test_pred1 = lassoreg1.predict(C)
        y_lasso_func1 = (1.0 / (2 * r)) * sum((y_train_pred1 - train[:, y]) ** 2) + alpha_lasso[i] * sum(abs(lassoreg1.coef_))
        ################## RMS ##################
        error_aw_b1[j - c, i] = math.sqrt(sum((y_train_pred1 - train[:, y]) ** 2) / r)
        error_lasso1[j - c, i] = y_lasso_func1
        error_cw_x1[j - c, i] = math.sqrt(sum((y_test_pred1 - test[:, y]) ** 2) / (r_df - r))
        mins1.extend([error_cw_x1.min()])
        maxs1.extend([error_cw_x1.max()])
        promotion1[j - c, i] = ((eachMeanError - error_cw_x1[j - c, i]) / eachMeanError) * 100
        print("alpha: " + str(alpha_lasso[i]) + " error_aw_b: " + str(error_aw_b1[j - c, i]) + " error_cw_x: " + str(error_cw_x1[j - c, i]) + " error_lasso: " + str(error_lasso1[j - c, i]) + " promotion: " + str(promotion1[j - c, i]))
        print("coef: " + str(lassoreg1.coef_[1:10]))
        print("\n")
    print("max promote:")
    print(promotion[j - c, :].max())
f = open('analyse_col', 'wb')
np.save(f, [promotion, alphas, error_cw_x, mins, maxs])
f.close()
plt.plot(promotion[:, j - c])
plt.ylabel('coef for ')
plt.xlabel('each gene')
plt.show()
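To tell apart the converged and non-converged cases, one can compare the iteration count sklearn reports after fit() against the budget (n_iter_ is a standard attribute of a fitted Lasso estimator). A small check that could be added after each fit in the loop above:

if lassoreg.n_iter_ >= lassoreg.max_iter:
    print("alpha " + str(alpha_lasso[i]) + ": did NOT converge within max_iter")
if lassoreg1.n_iter_ >= lassoreg1.max_iter:
    print("alpha " + str(alpha_lasso[i]) + ": (no init) did NOT converge within max_iter")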
Thank you very much, your answer and the paper were very useful. I know that in some cases it does not converge, but in some cases it does. The number of iterations is 100000, and convergence means the change in the coefficients is smaller than 0.00000001. In the cases that do converge, the solution should be close to the optimum, so we should not see a large difference. I think the problem is the conditioning you mentioned in the first part of your answer. – shadi
The problem is probably not the dimensionality (p > n) and not the algorithm, but bad scaling. You probably did not preprocess your problem, or you are using some very strange parameters. A simple norm(new_x - old_x) – sascha
I did test the lasso on both normalized and non-normalized data; my data is gene expression. Non-normalized, the values range between 6 and 14; normalized, between 0 and 1. I have added the code to the post. What do you mean by preprocessing? Do you mean normalization? – shadi
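In this context, "preprocessing" usually means standardizing each feature to zero mean and unit variance, with the statistics computed on the training split only. A sketch of what that could look like for the code above, using sklearn's StandardScaler (variable names follow the post; exact integration is up to the reader):

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

scaler = StandardScaler()
X_train = scaler.fit_transform(train[:, predictors])  # statistics from training data only
X_test = scaler.transform(test[:, predictors])        # reuse the same statistics

lassoreg = Lasso(alpha=alpha_lasso[i], tol=1e-8, max_iter=100000)
lassoreg.fit(X_train, train[:, y])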