2017-04-03 53 views
1

我對 TensorFlow/機器學習還相當陌生,因此遇到了一些困難。我有一個 CSV 格式的數據集(見 here),想像這個示例(見 here)那樣用 pandas 來讀取它。這段代碼原本用於另一個數據集並且可以正常運行,我對它做了修改和擴展,但我想我在這裏遺漏了某些重要的東西。基本上,我想做的就是預測給定數據集的「overall」(總體)評分。下面是我的代碼和我得到的回溯信息。標題:預期爲二進制或 unicode 字符串,卻得到 nan - TensorFlow / pandas

import pandas as pd 
import tensorflow as tf 
import tempfile 


# Field names for the review CSV files, in the order they are handed to
# pd.read_csv(names=...).
COLUMNS = [
    "reviewerID",
    "asin",
    "reviewerName",
    "helpful_0",
    "helpful_1",
    "reviewText",
    "overall",
    "summary",
    "unixReviewTime",
]

# Columns that input_fn() wraps in tf.SparseTensor.
# NOTE(review): "asin" and "overall" appear in COLUMNS but in neither of the
# two lists below, so input_fn() never emits them as features.
CATEGORICAL_COLUMNS = [
    "reviewerID",
    "reviewerName",
    "reviewText",
    "summary",
]

# Columns that input_fn() wraps in tf.constant.
CONTINUOUS_COLUMNS = [
    "helpful_0",
    "helpful_1",
    "unixReviewTime",
]

# Load the training and test splits. Both calls name the columns from COLUMNS
# and skip the first row of the file (presumably the file's own header row --
# confirm against the CSV). Only the training load passes low_memory=False.
df_train = pd.read_csv(
    'Digital_Music_5.csv',
    names=COLUMNS,
    skiprows=1,
    skipinitialspace=True,
    low_memory=False,
)
df_test = pd.read_csv(
    'Digital_Music_5_test.csv',
    names=COLUMNS,
    skiprows=1,
    skipinitialspace=True,
)

# Name of the column that input_fn() reads the training target from.
LABEL_COLUMN = "label"


# The target is the "overall" rating, copied into the label column of each
# frame. The test labels must come from df_test itself: assigning
# df_train["overall"] would align on the row index and attach training
# ratings to the test rows (and NaN wherever the indexes do not match).
df_train[LABEL_COLUMN] = df_train["overall"]
df_test[LABEL_COLUMN] = df_test["overall"]

print(df_train)


def input_fn(df): 
    """Build the (features, label) pair for the estimator from a DataFrame.

    Args:
        df: DataFrame that has every name in CONTINUOUS_COLUMNS,
            CATEGORICAL_COLUMNS and LABEL_COLUMN as a column.

    Returns:
        A tuple ``(feature_cols, label)``. ``feature_cols`` maps each column
        name to a constant Tensor (continuous columns) or to a SparseTensor
        with one value per row (categorical columns); ``label`` is a constant
        Tensor built from ``df[LABEL_COLUMN]``.

    Raises:
        TypeError: if a categorical column contains a missing value (NaN).
            This is the failure in the traceback that accompanies this
            script: "Expected binary or unicode string, got nan".
    """
    # Creates a dictionary mapping from each continuous feature column name (k) 
    # to the values of that column stored in a constant Tensor. 
    continuous_cols = {k: tf.constant(df[k].values) 
         for k in CONTINUOUS_COLUMNS} 
    # Creates a dictionary mapping from each categorical feature column name 
    # (k) to the values of that column stored in a tf.SparseTensor. 
    # Each row i contributes exactly one entry at index [i, 0], so the sparse
    # tensor has shape [number of rows, 1].
    # NOTE: df[k].values is passed through unchanged, so any NaN in a
    # categorical column reaches TensorFlow's string conversion and raises.
    categorical_cols = {k: tf.SparseTensor(
     indices=[[i, 0] for i in range(df[k].size)], 
     values=df[k].values, 
     dense_shape=[df[k].size, 1],) for k in CATEGORICAL_COLUMNS} 
    # Merges the two dictionaries into one. 
    feature_cols = dict(continuous_cols) 
    feature_cols.update(categorical_cols) 
    # Converts the label column into a constant Tensor. 
    label = tf.constant(df[LABEL_COLUMN].values) 
    # Returns the feature columns and the label. 
    return feature_cols, label 


def train_input_fn(): 
    """Return the (features, label) pair built from the training frame.

    Zero-argument wrapper so it can be passed as ``input_fn=`` to ``m.fit``.
    """
    return input_fn(df_train) 


def eval_input_fn(): 
    """Return the (features, label) pair built from the test frame.

    Zero-argument wrapper intended for the (currently commented-out)
    ``m.evaluate(input_fn=eval_input_fn, ...)`` call.
    """
    return input_fn(df_test) 


# Real-valued feature columns.
# NOTE(review): input_fn() only emits tensors for the names listed in
# CATEGORICAL_COLUMNS and CONTINUOUS_COLUMNS, and "asin" is in neither, so no
# "asin" feature is ever produced for this column -- confirm before using it.
asin = tf.contrib.layers.real_valued_column("asin")
helpful_0 = tf.contrib.layers.real_valued_column("helpful_0")
helpful_1 = tf.contrib.layers.real_valued_column("helpful_1")
unixReviewTime = tf.contrib.layers.real_valued_column("unixReviewTime")

# String-valued feature columns, each hashed into 100000 buckets.
reviewerID = tf.contrib.layers.sparse_column_with_hash_bucket(
    "reviewerID", hash_bucket_size=100000)
reviewerName = tf.contrib.layers.sparse_column_with_hash_bucket(
    "reviewerName", hash_bucket_size=100000)
reviewText = tf.contrib.layers.sparse_column_with_hash_bucket(
    "reviewText", hash_bucket_size=100000)
summary = tf.contrib.layers.sparse_column_with_hash_bucket(
    "summary", hash_bucket_size=100000)

# Crossed-column experiments, currently disabled:
# reviewText_x_summary = tf.contrib.layers.crossed_column([reviewText, summary], hash_bucket_size=100000)
# reviewerID_x_reviewerName = tf.contrib.layers.crossed_column([reviewerID, reviewerName], hash_bucket_size=100000)
# reviewText_x_reviewerID_x_reviewerName = tf.contrib.layers.crossed_column([reviewText, reviewerID, reviewerName], hash_bucket_size=100000)


# Checkpoints and summaries are written to a fresh temporary directory.
model_dir = tempfile.mkdtemp()

# "asin" is intentionally not passed as a feature column: input_fn() only
# emits tensors for CATEGORICAL_COLUMNS and CONTINUOUS_COLUMNS, and "asin" is
# in neither list, so the estimator would fail when looking the feature up.
# To use it, add "asin" to CATEGORICAL_COLUMNS and declare it with
# sparse_column_with_hash_bucket (it looks like a string identifier, not a
# number -- confirm against the data).
# NOTE(review): LinearClassifier defaults to two classes; if "overall" holds
# more than two distinct ratings, n_classes and 0-based integer labels will
# be needed -- confirm against the data.
m = tf.contrib.learn.LinearClassifier(
    feature_columns=[reviewText, reviewerName, summary,
                     helpful_0, helpful_1, unixReviewTime],
    optimizer=tf.train.FtrlOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=1.0,
        l2_regularization_strength=1.0),
    model_dir=model_dir)

m.fit(input_fn=train_input_fn, steps=200)
# results = m.evaluate(input_fn=eval_input_fn, steps=1)
# for key in sorted(results):
#  print("{}: {}".format(key, results[key]))

回溯:

Traceback (most recent call last): 
    File "amazon_reviews.py", line 78, in <module> 
    m.fit(input_fn=train_input_fn, steps=200) 
    File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 280, in new_func 
    return func(*args, **kwargs) 
    File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 426, in fit 
    loss = self._train_model(input_fn=input_fn, hooks=hooks) 
    File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 932, in _train_model 
    features, labels = input_fn() 
    File "amazon_reviews.py", line 47, in train_input_fn 
    return input_fn(df_train) 
    File "amazon_reviews.py", line 36, in input_fn 
    dense_shape=[df[k].size, 1],) for k in CATEGORICAL_COLUMNS} 
    File "amazon_reviews.py", line 36, in <dictcomp> 
    dense_shape=[df[k].size, 1],) for k in CATEGORICAL_COLUMNS} 
    File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/sparse_tensor.py", line 125, in __init__ 
    values, name="values", as_ref=True) 
    File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 702, in internal_convert_to_tensor 
    ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref) 
    File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/constant_op.py", line 110, in _constant_tensor_conversion_function 
    return constant(v, dtype=dtype, name=name) 
    File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/constant_op.py", line 99, in constant 
    tensor_util.make_tensor_proto(value, dtype=dtype, shape=shape, verify_shape=verify_shape)) 
    File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/tensor_util.py", line 451, in make_tensor_proto 
    append_fn(tensor_proto, proto_values) 
    File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/tensor_util.py", line 109, in SlowAppendObjectArrayToTensorProto 
    tensor_proto.string_val.extend([compat.as_bytes(x) for x in proto_values]) 
    File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/tensor_util.py", line 109, in <listcomp> 
    tensor_proto.string_val.extend([compat.as_bytes(x) for x in proto_values]) 
    File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/util/compat.py", line 65, in as_bytes 
    (bytes_or_text,)) 
TypeError: Expected binary or unicode string, got nan 

回答

2

你的輸入 DataFrame 中有空的評論者姓名(reviewerName)和評論文本(reviewText),pd.read_csv() 會把這些空單元格讀成 NaN,而 TensorFlow 在這裏需要的是字符串,不是 NaN。

可以用下面的命令找出含有空單元格的行:

# Select every row of df_train that has at least one missing (null/NaN) cell.
df_train[df_train.isnull().any(axis=1)] 

你可以用下面的語句把這些 NaN 直接轉換成空字符串:

# Replace NaN with '' only in the string-valued columns that input_fn() feeds
# to tf.SparseTensor; filling the whole frame would also put '' into numeric
# columns. df_test gets the same treatment because eval_input_fn() reads it.
for frame in (df_train, df_test):
    frame[CATEGORICAL_COLUMNS] = frame[CATEGORICAL_COLUMNS].fillna('')

或者在讀取時就讓 pd.read_csv() 直接生成空字符串而不是 NaN,例如傳入 na_values=[]:

# keep_default_na=False is what actually stops empty cells from becoming NaN:
# na_values=[] on its own only *adds* to pandas' built-in NaN markers (which
# include the empty string), so it changes nothing by itself.
# Caveat: with no NaN markers left, an empty cell in a numeric column is kept
# as '' as well, which would turn that column into strings.
df_train = pd.read_csv('Digital_Music_5.csv', names=COLUMNS,
                       skipinitialspace=True, low_memory=False,
                       skiprows=1, keep_default_na=False, na_values=[])
+0

謝謝!這真是幫了我大忙! :D –

相關問題