How to create tf.feature_columns from data with no header (CSV file)?

I was working through multi-class_classification_of_handwritten_digits (link) on Google Colab.

Then I tried to rewrite the code in my own way to feed and train a DNN. Since the CSV file has no header, I cannot create my feature columns, so I cannot train my model.

Could you please help me figure out how it is done in the link, or how it needs to be done in my code? Thanks in advance.
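(For context: with header=None, pandas assigns integer column names 0-784, while feature column keys must be strings. A minimal illustration of the mismatch, assuming TF 1.x; the local file name is just a stand-in:)

import pandas as pd
import tensorflow as tf

df = pd.read_csv("mnist_train_small.csv", header=None)
print(df.columns[:3])  # integer column names (0, 1, 2), not strings

# tf.feature_column.numeric_column(df.columns[1])  # fails: keys must be strings
col = tf.feature_column.numeric_column(str(df.columns[1]))  # works once cast to str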

import pandas as pd
import seaborn as sns
import tensorflow as tf

mnist_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/mnist_train_small.csv",header=None)

mnist_df.columns

hand_df = mnist_df[0]
hand_df.head()

matrix_df = mnist_df.drop([0],axis=1)
matrix_df.head()

mnist_df = mnist_df.head(10000)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(matrix_df, hand_df, test_size=0.3, random_state=101)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

matrix_df = pd.DataFrame(data=scaler.fit_transform(matrix_df),
                         columns=matrix_df.columns,
                         index=matrix_df.index)

input_func = tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train,
                                                 batch_size=10,
                                                 num_epochs=1000,
                                                 shuffle=True)

my_optimizer = tf.train.AdagradOptimizer(learning_rate=0.03)

my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)

# feat_cols is never defined -- this is the part I cannot create
model = tf.estimator.LinearClassifier(feature_columns=feat_cols,
                                      n_classes=10,
                                      optimizer=my_optimizer,
                                      config=tf.estimator.RunConfig(keep_checkpoint_max=1))

model.train(input_fn=input_func,steps=1000)

The sample code already splits the dataset into training and validation sets.

And I don't think this has anything to do with the header in the CSV.

training_targets, training_examples = parse_labels_and_features(mnist_dataframe[:7500])

validation_targets, validation_examples = parse_labels_and_features(mnist_dataframe[7500:10000])
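For reference, parse_labels_and_features in the Colab does roughly the following (a sketch from the exercise; check the notebook for the exact helper):

def parse_labels_and_features(dataset):
    """Extracts labels and features from a raw MNIST DataFrame.

    Column 0 holds the digit label; columns 1-784 hold the pixel values.
    """
    labels = dataset[0]
    # DataFrame.loc index ranges are inclusive at both ends.
    features = dataset.loc[:, 1:784]
    # Scale the pixel values to [0, 1] by dividing out the max value, 255.
    features = features / 255
    return labels, features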

So here is just the training part of the code:

import pandas as pd
import tensorflow as tf
from tensorflow.python.data import Dataset
import numpy as np


mnist_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/mnist_train_small.csv",sep=",",header=None)

mnist_df = mnist_df.head(10000)


dataset = mnist_df[:7500]
labels = dataset[0]
print ( labels.shape )

# DataFrame.loc index ranges are inclusive at both ends.
features = dataset.loc[:, 1:784]
print ( features.shape )
# Scale the data to [0, 1] by dividing out the max value, 255.
features = features / 255


def create_training_input_fn(features, labels, batch_size, num_epochs=None, shuffle=True):
    """A custom input_fn for sending MNIST data to the estimator for training.

    Args:
      features: The training features.
      labels: The training labels.
      batch_size: Batch size to use during training.

    Returns:
      A function that returns batches of training features and labels during
      training.
    """

    def _input_fn(num_epochs=None, shuffle=True):
        # Input pipelines are reset with each call to .train(). To ensure model
        # gets a good sampling of data, even when number of steps is small, we
        # shuffle all the data before creating the Dataset object
        idx = np.random.permutation(features.index)
        raw_features = {"pixels": features.reindex(idx)}
        raw_targets = np.array(labels[idx])

        ds = Dataset.from_tensor_slices((raw_features, raw_targets))  # warning: 2GB limit
        ds = ds.batch(batch_size).repeat(num_epochs)

        if shuffle:
            ds = ds.shuffle(10000)

        # Return the next batch of data.
        feature_batch, label_batch = ds.make_one_shot_iterator().get_next()
        return feature_batch, label_batch

    return _input_fn



my_optimizer = tf.train.AdagradOptimizer(learning_rate=0.03)

my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)

model = tf.estimator.LinearClassifier(feature_columns=set([tf.feature_column.numeric_column('pixels', shape=784)]),
                                      n_classes=10,
                                      optimizer=my_optimizer,
                                      config=tf.estimator.RunConfig(keep_checkpoint_max=1))

model.train(input_fn=create_training_input_fn(features, labels, batch_size=10),steps=1000)

Likewise, there is a function that prepares the validation set for predictions. You can use this pattern as-is.
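In the Colab it looks roughly like this (a sketch following the same pattern as the training input_fn above):

def create_predict_input_fn(features, labels, batch_size):
    """A custom input_fn for sending MNIST data to the estimator for predictions."""

    def _input_fn():
        raw_features = {"pixels": features.values}
        raw_targets = np.array(labels)

        ds = Dataset.from_tensor_slices((raw_features, raw_targets))  # warning: 2GB limit
        ds = ds.batch(batch_size)

        # Return the next batch of data.
        feature_batch, label_batch = ds.make_one_shot_iterator().get_next()
        return feature_batch, label_batch

    return _input_fn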

But if you split the DataFrame with train_test_split, you can try this:

X_train, X_test = train_test_split(mnist_df, test_size=0.2)

You also have to repeat the following process for X_test to get the validation features and labels (see the mirrored block after this one).

X_train_labels = X_train[0]
print ( X_train_labels.shape )

# DataFrame.loc index ranges are inclusive at both ends.
X_train_features = X_train.loc[:, 1:784]
print ( X_train_features.shape )
# Scale the data to [0, 1] by dividing out the max value, 255.
X_train_features = X_train_features / 255
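And mirrored for X_test (the same steps applied to the test split):

X_test_labels = X_test[0]
print ( X_test_labels.shape )

# DataFrame.loc index ranges are inclusive at both ends.
X_test_features = X_test.loc[:, 1:784]
print ( X_test_features.shape )
# Scale the data to [0, 1] by dividing out the max value, 255.
X_test_features = X_test_features / 255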

Instead of trying to find a way to use the data without any column names, it occurred to me :) to name all my columns, append them to cols=[], and from there it was easy to assign feature_columns = cols.

Here is my complete working code for my own question.

Thanks.

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf 

from sklearn import metrics
from tensorflow.python.data import Dataset

mnist_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/mnist_train_small.csv",header=None)

# Keep only the first 10,000 rows; doing this before splitting off labels and
# features so the limit actually applies to hand_df and matrix_df too.
mnist_df = mnist_df.head(10000)

mnist_df.describe()

mnist_df.columns

hand_df = mnist_df[0]

matrix_df = mnist_df.drop([0],axis=1)

matrix_df.head()

hand_df.head()

# Create column names a1 ... a784 in order to name the columns
cols = []
for i in range(1, 785):
    cols.append('a{}'.format(i))

matrix_df.columns = cols

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Scale the pixel values to [0, 1] *before* splitting, so that X_train and
# X_test are actually scaled.
matrix_df = pd.DataFrame(data=scaler.fit_transform(matrix_df),
                         columns=matrix_df.columns,
                         index=matrix_df.index)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(matrix_df, hand_df, test_size=0.3, random_state=101)

# Replace each name in cols with a numeric feature column of the same name;
# with string keys, assigning feature_columns no longer raises an error.
cols = [tf.feature_column.numeric_column(name) for name in cols]

matrix_df.head()

input_func = tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train,
                                                 batch_size=10,num_epochs=1000,
                                                 shuffle=True)

my_optimizer = tf.train.AdagradOptimizer(learning_rate=0.03)

my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)

model = tf.estimator.DNNClassifier(feature_columns=cols,
                                   hidden_units=[32,64],
                                   n_classes=10,
                                   optimizer=my_optimizer,
                                   config=tf.estimator.RunConfig(keep_checkpoint_max=1))

model.train(input_fn=input_func,steps=1000)

predict_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test,
                                                         batch_size=50,
                                                         num_epochs=1,
                                                         shuffle=False)

pred_gen = model.predict(predict_input_func)

predictions = list(pred_gen)

predictions[0]
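Each element of predictions is a dict with keys such as 'class_ids' and 'probabilities'. Since sklearn.metrics is already imported, accuracy on the test split can be checked with something like this (a small sketch; y_test comes from the split above):

# Pull the predicted digit out of each prediction dict and score against y_test.
final_preds = [int(pred['class_ids'][0]) for pred in predictions]
print(metrics.accuracy_score(y_test, final_preds))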