
Sklearn classifier and Flask problem

I have been trying to self-host, with Apache, a sklearn classifier that I put together. I ended up saving the model serialized with joblib and then loading it in a Flask application. This application works perfectly when run with Flask's built-in development server, but when I set it up on a Debian 9 Apache server I get a 500 error. Digging into Apache's error.log, I find:

AttributeError: module '__main__' has no attribute 'tokenize' 

Now, this is interesting to me because, when I used the tokenizer I wrote myself, the web application gave me no problems when I ran it locally. Furthermore, the saved model I am using was trained on the web server, so slightly different library versions should not be an issue.

The code for my web application is:

import re 
import sys 

from flask import Flask, request, render_template 
from nltk import word_tokenize 
from nltk.stem.wordnet import WordNetLemmatizer 
from sklearn.externals import joblib 

app = Flask(__name__) 



def tokenize(text): 
    # text = text.translate(str.maketrans('','',string.punctuation)) 
    text = re.sub(r'\W+', ' ', text) 
    tokens = word_tokenize(text) 
    lemas = [] 
    for item in tokens: 
        lemas.append(WordNetLemmatizer().lemmatize(item))
    return lemas 

@app.route('/') 
def home(): 
    return render_template('home.html') 

@app.route('/analyze', methods=['POST', 'GET'])
def analyze():
    if request.method == 'POST':
        result = request.form
        input_text = result['input_text']

        clf = joblib.load("model.pkl.z")
        parameters = clf.named_steps['clf'].get_params()
        predicted = clf.predict([input_text])
        # print(predicted)
        certainty = clf.decision_function([input_text])

        # Is it bonkers?
        if predicted[0]:
            verdict = "Not too nuts!"
        else:
            verdict = "Bonkers!"

        return render_template('result.html', prediction=[input_text, verdict, float(certainty), parameters])

if __name__ == '__main__': 
    #app.debug = True 
    app.run() 

With the .wsgi file being:

import sys 
sys.path.append('/var/www/mysite') 

from conspiracydetector import app as application 

In addition, I trained the model with this code:

import logging 
import pprint # Pretty stuff 
import re 
import sys # For command line arguments 
from time import time # to show progress 

import numpy as np 
from nltk import word_tokenize 
from nltk.stem.wordnet import WordNetLemmatizer 
from sklearn import metrics 
from sklearn.datasets import load_files 
from sklearn.externals import joblib # In order to save 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline 
from sklearn.svm import LinearSVC 

# Tokenizer that does stemming and strips punctuation 
def tokenize(text): 
    # text = text.translate(str.maketrans('','',string.punctuation)) 
    text = re.sub(r'\W+', ' ', text) 
    tokens = word_tokenize(text) 
    lemas = [] 
    for item in tokens: 
        lemas.append(WordNetLemmatizer().lemmatize(item))
    return lemas 

if __name__ == "__main__": 
    # NOTE: we put the following in a 'if __name__ == "__main__"' protected 
    # block to be able to use a multi-core grid search that also works under 
    # Windows, see: http://docs.python.org/library/multiprocessing.html#windows 
    # The multiprocessing module is used as the backend of joblib.Parallel 
    # that is used when n_jobs != 1 in GridSearchCV 

    # Display progress logs on stdout 
    print("Initializing...") 
    # Command line arguments 
    save = sys.argv[1] 
    training_directory = sys.argv[2] 

    logging.basicConfig(level=logging.INFO, 
                        format='%(asctime)s %(levelname)s %(message)s')

    dataset = load_files(training_directory, shuffle=False) 
    print("n_samples: %d" % len(dataset.data)) 

    # split the dataset in training and test set: 
    print("Splitting the dataset in training and test set...") 
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=None)

    # Build a vectorizer/classifier pipeline that filters out tokens 
    # that are too rare or too frequent 
    # Also remove stop words 
    print("Loading list of stop words...") 
    with open('stopwords.txt', 'r') as f: 
        words = [line.strip() for line in f]

    print("Stop words list loaded...") 
    print("Setting up pipeline...") 
    pipeline = Pipeline([
        # ('vect', TfidfVectorizer(stop_words=words, min_df=0.001, max_df=0.5, ngram_range=(1,1))),
        ('vect',
         TfidfVectorizer(tokenizer=tokenize, stop_words=words, min_df=0.001, max_df=0.5, ngram_range=(1, 1))),
        ('clf', LinearSVC(C=5000)),
    ])

    print("Pipeline:", [name for name, _ in pipeline.steps]) 

    # Build a grid search to find out whether unigrams or bigrams are 
    # more useful. 
    # Fit the pipeline on the training set using grid search for the parameters 
    print("Initializing grid search...") 

    # uncommenting more parameters will give better exploring power but will 
    # increase processing time in a combinatorial way 
    parameters = {
        # 'vect__ngram_range': [(1, 1), (1, 2)],
        # 'vect__min_df': (0.0005, 0.001),
        # 'vect__max_df': (0.25, 0.5),
        # 'clf__C': (10, 15, 20),
    }
    print("Parameters:") 
    pprint.pprint(parameters) 
    grid_search = GridSearchCV(
        pipeline,
        parameters,
        n_jobs=-1,
        verbose=True)

    print("Training and performing grid search...\n") 
    t0 = time() 
    grid_search.fit(docs_train, y_train) 
    print("\nDone in %0.3fs!\n" % (time() - t0)) 

    # Print the mean and std for each candidate along with the parameter 
    # settings for all the candidates explored by grid search. 
    n_candidates = len(grid_search.cv_results_['params']) 
    for i in range(n_candidates):
        print(i, 'params - %s; mean - %0.2f; std - %0.2f'
              % (grid_search.cv_results_['params'][i],
                 grid_search.cv_results_['mean_test_score'][i],
                 grid_search.cv_results_['std_test_score'][i]))

    # Predict the outcome on the testing set and store it in a variable 
    # named y_predicted 
    print("\nRunning against testing set...\n") 
    y_predicted = grid_search.predict(docs_test) 

    # Save model 
    print("\nSaving model to", save, "...") 
    joblib.dump(grid_search.best_estimator_, save) 
    print("Model Saved! \nPrepare for some awesome stats!") 

I must admit that I am pretty stumped. After tinkering, searching, and making sure my server is configured correctly, I feel that perhaps someone here might be able to help. Any help is appreciated, and if there is any information I need to provide, please let me know and I will be happy to do so.

In addition, I am running:

  • Python 3.5.3 with NLTK and sklearn.

Is there any more information I need to add? –

Answer


I solved this problem, although not perfectly, by removing my custom tokenizer and falling back to one of sklearn's.
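
Concretely, the fallback amounts to building the vectorizer without the tokenizer argument, essentially the commented-out line in the training script above. A minimal sketch of that pipeline:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Stop word list, loaded the same way as in the training script.
with open('stopwords.txt', 'r') as f:
    words = [line.strip() for line in f]

# Fallback pipeline: TfidfVectorizer's default tokenization replaces the
# custom tokenize() function, so nothing defined in __main__ ends up
# inside the pickled model.
pipeline = Pipeline([
    ('vect', TfidfVectorizer(stop_words=words, min_df=0.001, max_df=0.5, ngram_range=(1, 1))),
    ('clf', LinearSVC(C=5000)),
])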

However, I am still in the dark about how to integrate my own tokenizer.
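
One approach that is commonly suggested for this situation (sketched here as an untested idea, using a hypothetical module name) is to move the tokenizer into its own module, say textutils.py, placed somewhere importable such as /var/www/mysite, so that pickle records the function as textutils.tokenize rather than __main__.tokenize:

# textutils.py -- hypothetical shared module, imported by BOTH the training
# script and the Flask app, so the pickled pipeline references
# textutils.tokenize rather than __main__.tokenize.
import re

from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer


def tokenize(text):
    # Same logic as the tokenizer above: strip punctuation, split into
    # words, and lemmatize each token.
    text = re.sub(r'\W+', ' ', text)
    return [WordNetLemmatizer().lemmatize(item) for item in word_tokenize(text)]

The training script would then do from textutils import tokenize and pass tokenizer=tokenize to TfidfVectorizer, and the web app would import the same module before calling joblib.load. Since pickle stores functions by module and name, a model trained this way should load under mod_wsgi, where the application code never runs as __main__. A quicker but uglier alternative sometimes seen is to assign the function onto sys.modules['__main__'] in the .wsgi file before the model is loaded.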