0

我試圖為一個多標籤分類問題創建 Python 的多項 Logistic 迴歸分類器,但遇到 ValueError:bad input shape (326L, 559L)

traindf = pickle.load(open(self.dataLocation+"train"+self.fname+".pkl","rb")) 

    X, y = traindf[self.predX], traindf[self.predY] 
    from sklearn.preprocessing import MultiLabelBinarizer 
    y=MultiLabelBinarizer().fit_transform(y) 

    Xtrain, Xvalidate , ytrain, yvalidate = train_test_split(X, y, train_size=self.splitFactor) 



    pip = Pipeline([ 
('vect', TfidfVectorizer(
         analyzer='word', 
         binary=False, 
         decode_error='ignore', 
         dtype=<type 'numpy.int64'>, 
         encoding=u'utf-8', 
         input=u'content', 
         lowercase=True, 
         max_df=0.25, 
         max_features=None, 
         min_df=1, 
         ngram_range=(1, 1), 
         norm=u'l2', 
         preprocessor=None, 
         smooth_idf=True, 
         stop_words='english', 
         strip_accents=None, 
         sublinear_tf=True, 
         token_pattern=u'(?u)\\b\\w\\w+\\b', 
         tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'), 
         use_idf=True, vocabulary=None)), 
('clf', LogisticRegression(
         C=10, 
         class_weight=None, 
         dual=False, 
         fit_intercept=True, 
         intercept_scaling=1, 
         max_iter=100, 
         multi_class='multinomial', 
         n_jobs=1, 
         penalty='l2', 
         random_state=None, 
         solver='lbfgs', 
         tol=0.0001, 
         verbose=0, 
         warm_start=False)) 
       ]) 

parameters = {} 

    gridSearchTS = GridSearchCV(pip,parameters,n_jobs=3, verbose=1, scoring='accuracy') 
    gridSearchTS.fit(Xtrain, ytrain) 

    predictions = gridSearchTS.predict(Xvalidate) 

    print ('Accuracy:', accuracy_score(yvalidate, predictions)) 
    print ('Confusion Matrix:', confusion_matrix(yvalidate, predictions)) 
    print ('Classification Report:', classification_report(yvalidate, predictions)) 

    testdf = pickle.load(open(self.dataLocation+"test"+self.fname+".pkl","rb")) 

    predictions=gridSearchTS.predict(testdf[self.predX]) 

    testdf[self.predY] = predictions 

    print(testdf.info()) 

    testdf.to_csv(self.resLocation+self.prefix+self.fname+".csv") 

以上是一個 LogisticRegression 分類器,但我得到以下錯誤:

ValueError: bad input shape (326L, 559L) 

完整的 stack trace 如下:

gridSearchTS.fit(Xtrain, ytrain) 
    File "X:Anaconda2\lib\site-packages\sklearn\grid_search.py", line 804, in fit 
    return self._fit(X, y, ParameterGrid(self.param_grid)) 
    File "X:Anaconda2\lib\site-packages\sklearn\grid_search.py", line 553, in _fit 
    for parameters in parameter_iterable 
    File "X:Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 812, in __call__ 
    self.retrieve() 
    File "X:Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 762, in retrieve 
    raise exception 
sklearn.externals.joblib.my_exceptions.JoblibValueError: JoblibValueError 
___________________________________________________________________________ 
Multiprocessing exception: 
........................................................................... 
X:\myScript.py in getUniTags(self=<predict.RbcSolver.Predictor object>, multiNomial=True) 
    75   Xtrain, Xvalidate , ytrain, yvalidate = train_test_split(X, y, train_size=self.splitFactor) 
    76    
    77   parameters = {} 
    78  
    79   gridSearchTS = GridSearchCV(self.pipClassifier,parameters,n_jobs=3, verbose=1, scoring='accuracy') 
---> 80   gridSearchTS.fit(Xtrain, ytrain) 
     gridSearchTS.fit = <bound method GridSearchCV.fit of GridSearchCV(c...obs', refit=True, scoring='accuracy', verbose=1)> 
     Xtrain = 123  <some text here> 
Name: Content, dtype: object 
     ytrain = array([[0, 0, 0, ..., 0, 0, 0], 
     [0, 0, 0,..., ..., 0, 0, 0], 
     [0, 0, 0, ..., 0, 0, 0]]) 
    81   
    82   predictions = gridSearchTS.predict(Xvalidate) 
    83  
    84   print ('Accuracy:', accuracy_score(yvalidate, predictions)) 

........................................................................... 
X:Anaconda2\lib\site-packages\sklearn\grid_search.py in fit(self=GridSearchCV(cv=None, error_score='raise', 
    ...jobs', refit=True, scoring='accuracy', verbose=1), X = 123 <some text here> 
Name: Content, dtype: object, y=array([[0, 0, 0, ..., 0, 0, 0], 
     [0, 0, 0,..., ..., 0, 0, 0], 
     [0, 0, 0, ..., 0, 0, 0]])) 
    799   y : array-like, shape = [n_samples] or [n_samples, n_output], optional 
    800    Target relative to X for classification or regression; 
    801    None for unsupervised learning. 
    802 
    803   """ 
--> 804   return self._fit(X, y, ParameterGrid(self.param_grid)) 
     self._fit = <bound method GridSearchCV._fit of GridSearchCV(...obs', refit=True, scoring='accuracy', verbose=1)> 
     X = 161 <some text here> 
Name: Content, dtype: object 
     y = array([[0, 0, 0, ..., 0, 0, 0], 
     [0, 0, 0,..., ..., 0, 0, 0], 
     [0, 0, 0, ..., 0, 0, 0]]) 
     self.param_grid = {} 
    805 
    806 
    807 class RandomizedSearchCV(BaseSearchCV): 
    808  """Randomized search on hyper parameters. 

........................................................................... 
X:Anaconda2\lib\site-packages\sklearn\grid_search.py in _fit(self=GridSearchCV(cv=None, error_score='raise', 
    ...jobs', refit=True, scoring='accuracy', verbose=1), X = 123 <some text here> 
Name: Content, dtype: object, y=array([[0, 0, 0, ..., 0, 0, 0], 
     [0, 0, 0,..., ..., 0, 0, 0], 
     [0, 0, 0, ..., 0, 0, 0]]), parameter_iterable=<sklearn.grid_search.ParameterGrid object>) 
    548  )(
    549    delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, 
    550          train, test, self.verbose, parameters, 
    551          self.fit_params, return_parameters=True, 
    552          error_score=self.error_score) 
--> 553     for parameters in parameter_iterable 
     parameters = undefined 
     parameter_iterable = <sklearn.grid_search.ParameterGrid object> 
    554     for train, test in cv) 
    555 
    556   # Out is a list of triplet: score, estimator, n_test_samples 
    557   n_fits = len(out) 

........................................................................... 
X:Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=Parallel(n_jobs=3), iterable=<generator object <genexpr>>) 
    807    if pre_dispatch == "all" or n_jobs == 1: 
    808     # The iterable was consumed all at once by the above for loop. 
    809     # No need to wait for async callbacks to trigger to 
    810     # consumption. 
    811     self._iterating = False 
--> 812    self.retrieve() 
     self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=3)> 
    813    # Make sure that we get a last message telling us we are done 
    814    elapsed_time = time.time() - self._start_time 
    815    self._print('Done %3i out of %3i | elapsed: %s finished', 
    816       (len(self._output), len(self._output), 

--------------------------------------------------------------------------- 
Sub-process traceback: 
--------------------------------------------------------------------------- 
ValueError           
PID: 5360Python 2.7.11: X:Anaconda2\python.exe 
........................................................................... 
X:Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>) 
    67  def __init__(self, iterator_slice): 
    68   self.items = list(iterator_slice) 
    69   self._size = len(self.items) 
    70 
    71  def __call__(self): 
---> 72   return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    73 
    74  def __len__(self): 
    75   return self._size 
    76 

........................................................................... 
X:Anaconda2\lib\site-packages\sklearn\cross_validation.pyc in _fit_and_score(estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyze...  tol=0.0001, verbose=0, warm_start=False))]), X = 123 <some text here> 
Name: Content, dtype: object, y=memmap([[0, 0, 0, ..., 0, 0, 0], 
     [0, 0, 0..., ..., 0, 0, 0], 
     [0, 0, 0, ..., 0, 0, 0]]), scorer=make_scorer(accuracy_score), train=array([163, 164, 165, 166, 167, 168, 169, 170, 1...79, 480, 481, 482, 483, 484, 485, 486, 487, 488]), test=array([ 0, 1, 2, 3, 4, 5, 6, 7, ..., 155, 
     156, 157, 158, 159, 160, 161, 162]), verbose=1, parameters={}, fit_params={}, return_train_score=False, return_parameters=True, error_score='raise') 
    1526 
    1527  try: 
    1528   if y_train is None: 
    1529    estimator.fit(X_train, **fit_params) 
    1530   else: 
-> 1531    estimator.fit(X_train, y_train, **fit_params) 
    1532 
    1533  except Exception as e: 
    1534   if error_score == 'raise': 
    1535    raise 

........................................................................... 
X:Anaconda2\lib\site-packages\sklearn\pipeline.pyc in fit(self=Pipeline(steps=[('vect', TfidfVectorizer(analyze...  tol=0.0001, verbose=0, warm_start=False))]), X=29  research weeks feb rel sep hvlo diff clos...rd loihi diff aoo... 
Name: Content, dtype: object, y=memmap([[0, 0, 0, ..., 0, 0, 0], 
     [0, 0, 0..., ..., 0, 0, 0], 
     [0, 0, 0, ..., 0, 0, 0]]), **fit_params={}) 
    160   y : iterable, default=None 
    161    Training targets. Must fulfill label requirements for all steps of 
    162    the pipeline. 
    163   """ 
    164   Xt, fit_params = self._pre_transform(X, y, **fit_params) 
--> 165   self.steps[-1][-1].fit(Xt, y, **fit_params) 
    166   return self 
    167 
    168  def fit_transform(self, X, y=None, **fit_params): 
    169   """Fit all the transforms one after the other and transform the 

........................................................................... 
X:Anaconda2\lib\site-packages\sklearn\linear_model\logistic.pyc in fit(self=LogisticRegression(C=10, class_weight=None, dual...   tol=0.0001, verbose=0, warm_start=False), X=<326x17576 sparse matrix of type '<type 'numpy.f... stored elements in Compressed Sparse Row format>, y=memmap([[0, 0, 0, ..., 0, 0, 0], 
     [0, 0, 0..., ..., 0, 0, 0], 
     [0, 0, 0, ..., 0, 0, 0]]), sample_weight=None) 
    1137   if not isinstance(self.tol, numbers.Number) or self.tol < 0: 
    1138    raise ValueError("Tolerance for stopping criteria must be " 
    1139        "positive; got (tol=%r)" % self.tol) 
    1140 
    1141   X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, 
-> 1142       order="C") 
    1143   check_classification_targets(y) 
    1144   self.classes_ = np.unique(y) 
    1145   n_samples, n_features = X.shape 
    1146 

........................................................................... 
X:Anaconda2\lib\site-packages\sklearn\utils\validation.pyc in check_X_y(X=<326x17576 sparse matrix of type '<type 'numpy.f... stored elements in Compressed Sparse Row format>, y=memmap([[0, 0, 0, ..., 0, 0, 0], 
     [0, 0, 0..., ..., 0, 0, 0], 
     [0, 0, 0, ..., 0, 0, 0]]), accept_sparse='csr', dtype=<type 'numpy.float64'>, order='C', copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, warn_on_dtype=False, estimator=None) 
    510      ensure_min_features, warn_on_dtype, estimator) 
    511  if multi_output: 
    512   y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, 
    513       dtype=None) 
    514  else: 
--> 515   y = column_or_1d(y, warn=True) 
    516   _assert_all_finite(y) 
    517  if y_numeric and y.dtype.kind == 'O': 
    518   y = y.astype(np.float64) 
    519 

........................................................................... 
X:Anaconda2\lib\site-packages\sklearn\utils\validation.pyc in column_or_1d(y=memmap([[0, 0, 0, ..., 0, 0, 0], 
     [0, 0, 0..., ..., 0, 0, 0], 
     [0, 0, 0, ..., 0, 0, 0]]), warn=True) 
    546       " expected. Please change the shape of y to " 
    547       "(n_samples,), for example using ravel().", 
    548       DataConversionWarning, stacklevel=2) 
    549   return np.ravel(y) 
    550 
--> 551  raise ValueError("bad input shape {0}".format(shape)) 
    552 
    553 
    554 def check_random_state(seed): 
    555  """Turn seed into a np.random.RandomState instance 

ValueError: bad input shape (326L, 559L) 
___________________________________________________________________________ 

我應該如何轉換/格式化我的X維度?

+0

你可以顯示'train_test_split'中發生了什麼嗎? – pneumatics

+0

你是想知道每個變量被賦予了什麼值嗎? – AbtPst

+0

我懷疑問題出在 'train_test_split' 中。輸出結果表明 'Xtrain' 和 'ytrain' 的長度不一樣,而它們正是由該函數產生的。 – pneumatics

回答

1

LogisticRegression.fit 的文檔寫道:

y : array-like, shape (n_samples,)

所以 y 必須是一維陣列,但你的 MultiLabelBinarizer 的輸出是一個二維的 0-1 矩陣。看起來它的形狀是 (326, 559),也就是 326 行(樣本)和 559 個不同的類別。y 的格式在 Multilabel 文檔中有說明。你必須將 LogisticRegression 模型放入一個多標籤分類器中,例如該頁面下方說明的 one-vs-rest(即用 OneVsRestClassifier 包裝 LogisticRegression)。另外還有一個 multilabel 範例可供參考。