为什么 Skorch 在每个时代都显示 NAN?

Why Skorch show NAN in the every epoch?

我想基于 Skorch 的数据集 class 创建自己的数据集 class,因为我想区分分类列和连续列。这些分类列将通过模型中的嵌入层传递。结果很奇怪,因为它显示 NAN 像这样:

  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1           nan           nan  0.2187
      2           nan           nan  0.1719
      3           nan           nan  0.1719
      4           nan           nan  0.1562
      5           nan           nan  0.1406

你能帮我解决一下吗?我正在使用来自这个 kaggle 的数据: Here

from skorch import NeuralNetRegressor
from skorch.dataset import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


class TabularDataset(Dataset):
    def __init__(self, data, cat_cols=None, output_col=None):
        self.n = data.shape[0]

        if output_col:
            self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
        else:
            self.y = np.zeros((self.n, 1))

        self.cat_cols = cat_cols if cat_cols else []
        self.cont_cols = [col for col in data.columns
                          if col not in self.cat_cols + [output_col]]

        if self.cont_cols:
            self.cont_X = data[self.cont_cols].astype(np.float32).values
        else:
            self.cont_X = np.zeros((self.n, 1))

        if self.cat_cols:
            self.cat_X = data[self.cat_cols].astype(np.int64).values
        else:
            self.cat_X = np.zeros((self.n, 1))

    def __len__(self):
        # Denotes the total number of sampoes
        return self.n

    def __getitem__(self, idx):
        # generates one sample of data
        return [self.cont_X[idx], self.cat_X[idx]], self.y[idx]


class FeedForwardNN(nn.Module):

    def __init__(self, emb_dims, no_of_cont, lin_layer_sizes,
                 output_size, emb_dropout, lin_layer_dropouts):

        """
        Parameters
        ----------
        emb_dims: List of two element tuples
          This list will contain a two element tuple for each
          categorical feature. The first element of a tuple will
          denote the number of unique values of the categorical
          feature. The second element will denote the embedding
          dimension to be used for that feature.
        no_of_cont: Integer
          The number of continuous features in the data.
        lin_layer_sizes: List of integers.
          The size of each linear layer. The length will be equal
          to the total number
          of linear layers in the network.
        output_size: Integer
          The size of the final output.
        emb_dropout: Float
          The dropout to be used after the embedding layers.
        lin_layer_dropouts: List of floats
          The dropouts to be used after each linear layer.
        """

        super().__init__()

        # Embedding layers
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y)
                                         for x, y in emb_dims])

        no_of_embs = sum([y for x, y in emb_dims])
        self.no_of_embs = no_of_embs
        self.no_of_cont = no_of_cont

        # Linear Layers
        first_lin_layer = nn.Linear(self.no_of_embs + self.no_of_cont,
                                    lin_layer_sizes[0])

        self.lin_layers = \
            nn.ModuleList([first_lin_layer] + \
                          [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1])
                           for i in range(len(lin_layer_sizes) - 1)])

        for lin_layer in self.lin_layers:
            nn.init.kaiming_normal_(lin_layer.weight.data)

        # Output Layer
        self.output_layer = nn.Linear(lin_layer_sizes[-1],
                                      output_size)
        nn.init.kaiming_normal_(self.output_layer.weight.data)

        # Batch Norm Layers
        self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
        self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size)
                                        for size in lin_layer_sizes])

        # Dropout Layers
        self.emb_dropout_layer = nn.Dropout(emb_dropout)
        self.droput_layers = nn.ModuleList([nn.Dropout(size)
                                            for size in lin_layer_dropouts])

    def forward(self, X):
        cont_data = X[0]
        cat_data = X[1]
        if self.no_of_embs != 0:
            x = [emb_layer(cat_data[:, i])
                 for i, emb_layer in enumerate(self.emb_layers)]
            x = torch.cat(x, 1)
            x = self.emb_dropout_layer(x)

        if self.no_of_cont != 0:
            normalized_cont_data = self.first_bn_layer(cont_data)

            if self.no_of_embs != 0:
                x = torch.cat([x, normalized_cont_data], 1)
            else:
                x = normalized_cont_data

        for lin_layer, dropout_layer, bn_layer in \
                zip(self.lin_layers, self.droput_layers, self.bn_layers):
            x = F.relu(lin_layer(x))
            x = bn_layer(x)
            x = dropout_layer(x)

        x = self.output_layer(x)

        return x


# Read data
data = pd.read_csv("data/train.csv", usecols=["SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea",
                                              "Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF"]).dropna()

categorical_features = ["MSSubClass", "MSZoning", "Street", "LotShape", "YearBuilt"]
output_feature = "SalePrice"

# Label Encode Categorial Features
label_encoders = {}
for cat_col in categorical_features:
    label_encoders[cat_col] = LabelEncoder()
    data[cat_col] = label_encoders[cat_col].fit_transform(data[cat_col])

# feed Forward NN
cat_dims = [int(data[col].nunique()) for col in categorical_features]

emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]


net = FeedForwardNN(emb_dims, no_of_cont=4, lin_layer_sizes=[50, 100],
                    output_size=1, emb_dropout=0.04,
                    lin_layer_dropouts=[0.001, 0.01])

# Fit
ds = TabularDataset(data=data, cat_cols=categorical_features,
                    output_col=output_feature)
X = data.drop(['SalePrice'], axis=1)
y = data['SalePrice'].values.reshape(-1, 1)
net = NeuralNetRegressor(
    net,
    max_epochs=5,
    lr=0.1,
    dataset=ds
)
net.fit(X, y)

问题不在于烤焦,而在于您的数据。你必须缩放你的输入,在这种情况下,尤其是目标,以避免巨大的损失和梯度爆炸。作为开始,我建议使用 sklearn.preprocessing.StandardScaler:

from sklearn.preprocessing import StandardScaler

class TabularDataset(Dataset):
    def __init__(self, data, cat_cols=None, output_col=None):
        self.n = data.shape[0]
        # [...]
        if output_col:
            scaler_y = StandardScaler()
            self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)

            scaler_y.fit(self.y)
            self.y = scaler_y.transform(self.y)
        # [...]
        if self.cont_cols:
            scaler_X_cont = StandardScaler()
            self.cont_X = data[self.cont_cols].astype(np.float32).values
            scaler_X_cont.fit(self.cont_X)
            self.cont_X = scaler_X_cont.transform(self.cont_X)
        # [...]

附带说明一下,如果您拥有提供实际数据的数据集,则不需要 Xy,只需将其传递给 net.fit(使用使用分层 CV 分割的例外):

net = NeuralNetRegressor(
    net,
    max_epochs=5,
    lr=0.00001,
)
net.fit(ds, y=None)

对于遇到分类问题的类似问题的任何人,Skorch 默认使用的损失函数(准则)是 NLLLoss,它会为您计算对数 (doc and related issue)。因此,预计 Softmax(在多个 类 的情况下)层作为架构中的最后一步出现,以便能够产生概率。

您可以:

  1. 添加一个 Softmax 层来生成概率并保留默认的 NLLLoss;
  2. 将默认损失更改为 CrossEntropyLoss:
    net = NeuralNetClassifier(
        ...
        criterion=torch.nn.CrossEntropyLoss
    )