基于 Python 中较小的数据集生成较大的合成数据集
Generate larger synthetic dataset based on a smaller dataset in Python
我有一个包含 21000 行(数据样本)和 102 列(特征)的数据集。我想根据当前数据集生成一个更大的合成数据集,比如 100000 行,这样我就可以将它用于机器学习目的。
我一直在参考@Prashant 对此 post https://stats.stackexchange.com/questions/215938/generate-synthetic-data-to-match-sample-data 的回答,但我无法让它为我的数据生成更大的合成数据集。
import numpy as np
from random import randrange, choice
from sklearn.neighbors import NearestNeighbors
import pandas as pd
#referring to https://stats.stackexchange.com/questions/215938/generate-synthetic-data-to-match-sample-data
df = pd.read_pickle('df_saved.pkl')
df = df.iloc[:,:-1] # this gives me df, the final Dataframe which I would like to generate a larger dataset based on. This is the smaller Dataframe with 21000x102 dimensions.
def SMOTE(T, N, k):
# """
# Returns (N/100) * n_minority_samples synthetic minority samples.
#
# Parameters
# ----------
# T : array-like, shape = [n_minority_samples, n_features]
# Holds the minority samples
# N : percetange of new synthetic samples:
# n_synthetic_samples = N/100 * n_minority_samples. Can be < 100.
# k : int. Number of nearest neighbours.
#
# Returns
# -------
# S : array, shape = [(N/100) * n_minority_samples, n_features]
# """
n_minority_samples, n_features = T.shape
if N < 100:
#create synthetic samples only for a subset of T.
#TODO: select random minortiy samples
N = 100
pass
if (N % 100) != 0:
raise ValueError("N must be < 100 or multiple of 100")
N = N/100
n_synthetic_samples = N * n_minority_samples
n_synthetic_samples = int(n_synthetic_samples)
n_features = int(n_features)
S = np.zeros(shape=(n_synthetic_samples, n_features))
#Learn nearest neighbours
neigh = NearestNeighbors(n_neighbors = k)
neigh.fit(T)
#Calculate synthetic samples
for i in range(n_minority_samples):
nn = neigh.kneighbors(T[i], return_distance=False)
for n in range(N):
nn_index = choice(nn[0])
#NOTE: nn includes T[i], we don't want to select it
while nn_index == i:
nn_index = choice(nn[0])
dif = T[nn_index] - T[i]
gap = np.random.random()
S[n + i * N, :] = T[i,:] + gap * dif[:]
return S
df = df.to_numpy()
new_data = SMOTE(df,50,10) # this is where I call the function and expect new_data to be generated with larger number of samples than original df.
我得到的错误回溯如下:-
Traceback (most recent call last):
File "MyScript.py", line 66, in <module>
new_data = SMOTE(df,50,10)
File "MyScript.py", line 52, in SMOTE
nn = neigh.kneighbors(T[i], return_distance=False)
File "/trinity/clustervision/CentOS/7/apps/anaconda/4.3.31/3.6-VE/lib/python3.5/site-packages/sklearn/neighbors/base.py", line 393, in kneighbors
X = check_array(X, accept_sparse='csr')
File "/trinity/clustervision/CentOS/7/apps/anaconda/4.3.31/3.6-VE/lib/python3.5/site-packages/sklearn/utils/validation.py", line 547, in check_array
"if it contains a single sample.".format(array))
ValueError: Expected 2D array, got 1D array instead:
我知道这个错误(预期的二维数组,得到的是一维数组)发生在行 nn = neigh.kneighbors(T[i], return_distance=False)
上。准确地说,当我调用该函数时,T 是形状为 (21000x102) 的 numpy
数组,我的数据从 Pandas 数据帧转换为 numpy
数组。我知道这个问题可能有一些类似的重复,但是 none 回答了我的问题。在这方面的任何帮助将不胜感激。
那么 T[i] 给它的是一个形状为 (102, ) 的数组。
函数需要的是一个形状为 (1, 102) 的数组。
你可以通过调用 reshape 得到它:
nn = neigh.kneighbors(T[i].reshape(1, -1), return_distance=False)
如果你不熟悉 np.reshape,1 表示第一个维度应该是大小 1,-1 表示第二个维度应该是 numpy 可以广播到的任何大小;在这种情况下,原来的 102.
可能对你有用
SMOTE and other advanced over_sampling techniques
这个包 imblearn 有类似 API 的 sklearn 和很多过采样技术。
我遇到了同样的问题。我研究了一段时间但找不到合适的解决方案,然后我尝试将自己的解决方案应用于这个问题。它对我有帮助,我希望它对所有遇到同样问题的人都有用。
columns = df.columns.to_numpy()
iteration_count = 30
new_df = pd.DataFrame(columns=columns)
for i in range(iteration_count):
for k in df.iterrows():
data_obj = {}
for j in range(columns.size):
random_index = np.random.randint(0,13, dtype='int')
data_obj[columns[j]] = df.loc[random_index][columns[j]]
new_df = new_df.append(data_obj, ignore_index=True)
df = df.append(new_df, ignore_index=True)
我有一个包含 21000 行(数据样本)和 102 列(特征)的数据集。我想根据当前数据集生成一个更大的合成数据集,比如 100000 行,这样我就可以将它用于机器学习目的。
我一直在参考@Prashant 对此 post https://stats.stackexchange.com/questions/215938/generate-synthetic-data-to-match-sample-data 的回答,但我无法让它为我的数据生成更大的合成数据集。
import numpy as np
from random import randrange, choice
from sklearn.neighbors import NearestNeighbors
import pandas as pd
#referring to https://stats.stackexchange.com/questions/215938/generate-synthetic-data-to-match-sample-data
df = pd.read_pickle('df_saved.pkl')
df = df.iloc[:,:-1] # this gives me df, the final Dataframe which I would like to generate a larger dataset based on. This is the smaller Dataframe with 21000x102 dimensions.
def SMOTE(T, N, k):
# """
# Returns (N/100) * n_minority_samples synthetic minority samples.
#
# Parameters
# ----------
# T : array-like, shape = [n_minority_samples, n_features]
# Holds the minority samples
# N : percetange of new synthetic samples:
# n_synthetic_samples = N/100 * n_minority_samples. Can be < 100.
# k : int. Number of nearest neighbours.
#
# Returns
# -------
# S : array, shape = [(N/100) * n_minority_samples, n_features]
# """
n_minority_samples, n_features = T.shape
if N < 100:
#create synthetic samples only for a subset of T.
#TODO: select random minortiy samples
N = 100
pass
if (N % 100) != 0:
raise ValueError("N must be < 100 or multiple of 100")
N = N/100
n_synthetic_samples = N * n_minority_samples
n_synthetic_samples = int(n_synthetic_samples)
n_features = int(n_features)
S = np.zeros(shape=(n_synthetic_samples, n_features))
#Learn nearest neighbours
neigh = NearestNeighbors(n_neighbors = k)
neigh.fit(T)
#Calculate synthetic samples
for i in range(n_minority_samples):
nn = neigh.kneighbors(T[i], return_distance=False)
for n in range(N):
nn_index = choice(nn[0])
#NOTE: nn includes T[i], we don't want to select it
while nn_index == i:
nn_index = choice(nn[0])
dif = T[nn_index] - T[i]
gap = np.random.random()
S[n + i * N, :] = T[i,:] + gap * dif[:]
return S
df = df.to_numpy()
new_data = SMOTE(df,50,10) # this is where I call the function and expect new_data to be generated with larger number of samples than original df.
我得到的错误回溯如下:-
Traceback (most recent call last):
File "MyScript.py", line 66, in <module>
new_data = SMOTE(df,50,10)
File "MyScript.py", line 52, in SMOTE
nn = neigh.kneighbors(T[i], return_distance=False)
File "/trinity/clustervision/CentOS/7/apps/anaconda/4.3.31/3.6-VE/lib/python3.5/site-packages/sklearn/neighbors/base.py", line 393, in kneighbors
X = check_array(X, accept_sparse='csr')
File "/trinity/clustervision/CentOS/7/apps/anaconda/4.3.31/3.6-VE/lib/python3.5/site-packages/sklearn/utils/validation.py", line 547, in check_array
"if it contains a single sample.".format(array))
ValueError: Expected 2D array, got 1D array instead:
我知道这个错误(预期的二维数组,得到的是一维数组)发生在行 nn = neigh.kneighbors(T[i], return_distance=False)
上。准确地说,当我调用该函数时,T 是形状为 (21000x102) 的 numpy
数组,我的数据从 Pandas 数据帧转换为 numpy
数组。我知道这个问题可能有一些类似的重复,但是 none 回答了我的问题。在这方面的任何帮助将不胜感激。
那么 T[i] 给它的是一个形状为 (102, ) 的数组。
函数需要的是一个形状为 (1, 102) 的数组。
你可以通过调用 reshape 得到它:
nn = neigh.kneighbors(T[i].reshape(1, -1), return_distance=False)
如果你不熟悉 np.reshape,1 表示第一个维度应该是大小 1,-1 表示第二个维度应该是 numpy 可以广播到的任何大小;在这种情况下,原来的 102.
可能对你有用
SMOTE and other advanced over_sampling techniques
这个包 imblearn 有类似 API 的 sklearn 和很多过采样技术。
我遇到了同样的问题。我研究了一段时间但找不到合适的解决方案,然后我尝试将自己的解决方案应用于这个问题。它对我有帮助,我希望它对所有遇到同样问题的人都有用。
columns = df.columns.to_numpy()
iteration_count = 30
new_df = pd.DataFrame(columns=columns)
for i in range(iteration_count):
for k in df.iterrows():
data_obj = {}
for j in range(columns.size):
random_index = np.random.randint(0,13, dtype='int')
data_obj[columns[j]] = df.loc[random_index][columns[j]]
new_df = new_df.append(data_obj, ignore_index=True)
df = df.append(new_df, ignore_index=True)