大数据帧上的二维高斯过采样
2D Gaussian oversampling over large dataframe
我目前有以下格式的数据框:
step tag_id x_pos y_pos
1 1 5 3
1 2 3 4
2 1 2 2
2 3 1 6
.........................
.........................
N 1 5 7
对于 df 中的每一行,我的目标是添加额外的 m 行,从 x 和 y 值(独立)的高斯分布过采样。因此,N = 100 和 m = 10 的 df 将导致 df 长度为 1010,包括原始值和过采样值。
我为此编写的代码有效,但在大型数据集 (N > 100k) 上速度极慢。我确信有许多操作(创建新数组/dfs、使用 itertuples 等)会影响性能;我将不胜感激任何关于如何提高性能的帮助,以便我可以在整个数据集上生成更高的 m 值。例如:输入数据来自 pandas 数据框,但多元正态函数对 numpy 数组进行操作。有没有一种更自然的方法可以通过 pandas 实现这一点,而无需在 numpy 数组和数据帧之间进行复制?谢谢!
可重现的例子:
import pandas as pd
import numpy as np
import random
def gaussianOversample2(row, n):
sigma = 2
mean_x = float(getattr(row,'x_pos'))
mean_y = float(getattr(row,'y_pos'))
step = getattr(row, 'step')
tag_id = getattr(row, 'tag_id')
sigma = np.array([1,1])
cov = np.diag(sigma ** 2)
x,y = np.random.multivariate_normal([mean_x, mean_y], cov, n).T
x = np.concatenate(([mean_x], x))
y = np.concatenate(([mean_y], y))
steps = np.empty(n+1)
tags = np.empty(n+1)
steps.fill(step)
tags.fill(tag_id)
return x,y, steps, tags
def oversampleDf(df, n):
oversampled_arr = np.empty((0,4), float)
# with input df with step, tag_id, x_pos, y_pos
data = pd.DataFrame(columns = df.columns)
count = 0
for row in df.itertuples(index=False):
count = count + 1
temp = np.zeros((len(row), n+1))
oversample_x, oversample_y, steps, tags = gaussianOversample2(row, n)
temp[0] = steps
temp[1] = tags
temp[2] = oversample_x
temp[3] = oversample_y
temp = pd.DataFrame(temp.T, columns = df.columns)
data = data.append(temp)
if count % 1000 == 0:
print("Row: ", count)
return data
df = pd.DataFrame([[1, 1, 5, 3],[1, 2, 3, 4],[2, 1, 2, 2],[2, 3, 1, 6], columns = ['step', 'tag_id', 'x_pos', 'y_pos']])
res = oversampleDf(df, 20)
"""
# Result should be:
step tag_id x_pos y_pos
0 1.0 1.0 5.000000 3.000000
1 1.0 1.0 3.423492 3.886602
2 1.0 1.0 5.404581 2.177559
3 1.0 1.0 4.023274 2.883737
4 1.0 1.0 3.390710 3.038782
.. ... ... ... ...
16 2.0 3.0 1.894151 5.510321
17 2.0 3.0 1.110932 5.281578
18 2.0 3.0 1.623538 4.529825
19 2.0 3.0 -0.576756 7.476872
20 2.0 3.0 -0.866123 5.898048
"""
这是我自己找到的解决方案;它更像是一种解决方法,而不是使用更快方法的技术。我改为写入一个 csv 文件,然后在完成后读入,如下所示:
def gaussianOversample3(row, n):
mean_x = float(getattr(row,'x_pos'))
mean_y = float(getattr(row,'y_pos'))
step = getattr(row, 'step')
tag_id = getattr(row, 'tag_id')
sigma = np.array([1,1])
cov = np.diag(sigma ** 2)
x,y = np.random.multivariate_normal([mean_x, mean_y], cov, n).T
x = np.concatenate(([mean_x], x))
y = np.concatenate(([mean_y], y))
steps = np.empty(n+1)
tags = np.empty(n+1)
steps.fill(step)
tags.fill(tag_id)
pd.DataFrame(data = np.column_stack((steps,tags,x,y))).to_csv("oversample.csv", mode = 'a', header = False)
def oversampleDf2(df, n):
filename = "oversample.csv"
d = pd.DataFrame(list())
d.to_csv(filename)
#count = 0
for row in df.itertuples(index=False):
#count = count + 1
gaussianOversample3(row, n)
#if count % 10000 == 0:
# print("Row: ", count)
由于读取文件的方式,我必须执行以下操作:
oversampleDf2(defensive_df2, num_oversamples)
oversampled_df = pd.read_csv("oversample_10.csv", sep= ' ')
oversampled_df.columns = ['col']
oversampled_df = oversampled_df.col.str.split(",",expand=True)
oversampled_df.columns = ['temp', 'step', 'tag_id', 'x_pos', 'y_pos']
oversampled_df = oversampled_df.drop(['temp'], axis = 1)
oversampled_df = oversampled_df.astype(float)
我目前有以下格式的数据框:
step tag_id x_pos y_pos
1 1 5 3
1 2 3 4
2 1 2 2
2 3 1 6
.........................
.........................
N 1 5 7
对于 df 中的每一行,我的目标是添加额外的 m 行,从 x 和 y 值(独立)的高斯分布过采样。因此,N = 100 和 m = 10 的 df 将导致 df 长度为 1010,包括原始值和过采样值。
我为此编写的代码有效,但在大型数据集 (N > 100k) 上速度极慢。我确信有许多操作(创建新数组/dfs、使用 itertuples 等)会影响性能;我将不胜感激任何关于如何提高性能的帮助,以便我可以在整个数据集上生成更高的 m 值。例如:输入数据来自 pandas 数据框,但多元正态函数对 numpy 数组进行操作。有没有一种更自然的方法可以通过 pandas 实现这一点,而无需在 numpy 数组和数据帧之间进行复制?谢谢!
可重现的例子:
import pandas as pd
import numpy as np
import random
def gaussianOversample2(row, n):
sigma = 2
mean_x = float(getattr(row,'x_pos'))
mean_y = float(getattr(row,'y_pos'))
step = getattr(row, 'step')
tag_id = getattr(row, 'tag_id')
sigma = np.array([1,1])
cov = np.diag(sigma ** 2)
x,y = np.random.multivariate_normal([mean_x, mean_y], cov, n).T
x = np.concatenate(([mean_x], x))
y = np.concatenate(([mean_y], y))
steps = np.empty(n+1)
tags = np.empty(n+1)
steps.fill(step)
tags.fill(tag_id)
return x,y, steps, tags
def oversampleDf(df, n):
oversampled_arr = np.empty((0,4), float)
# with input df with step, tag_id, x_pos, y_pos
data = pd.DataFrame(columns = df.columns)
count = 0
for row in df.itertuples(index=False):
count = count + 1
temp = np.zeros((len(row), n+1))
oversample_x, oversample_y, steps, tags = gaussianOversample2(row, n)
temp[0] = steps
temp[1] = tags
temp[2] = oversample_x
temp[3] = oversample_y
temp = pd.DataFrame(temp.T, columns = df.columns)
data = data.append(temp)
if count % 1000 == 0:
print("Row: ", count)
return data
df = pd.DataFrame([[1, 1, 5, 3],[1, 2, 3, 4],[2, 1, 2, 2],[2, 3, 1, 6], columns = ['step', 'tag_id', 'x_pos', 'y_pos']])
res = oversampleDf(df, 20)
"""
# Result should be:
step tag_id x_pos y_pos
0 1.0 1.0 5.000000 3.000000
1 1.0 1.0 3.423492 3.886602
2 1.0 1.0 5.404581 2.177559
3 1.0 1.0 4.023274 2.883737
4 1.0 1.0 3.390710 3.038782
.. ... ... ... ...
16 2.0 3.0 1.894151 5.510321
17 2.0 3.0 1.110932 5.281578
18 2.0 3.0 1.623538 4.529825
19 2.0 3.0 -0.576756 7.476872
20 2.0 3.0 -0.866123 5.898048
"""
这是我自己找到的解决方案;它更像是一种解决方法,而不是使用更快方法的技术。我改为写入一个 csv 文件,然后在完成后读入,如下所示:
def gaussianOversample3(row, n):
mean_x = float(getattr(row,'x_pos'))
mean_y = float(getattr(row,'y_pos'))
step = getattr(row, 'step')
tag_id = getattr(row, 'tag_id')
sigma = np.array([1,1])
cov = np.diag(sigma ** 2)
x,y = np.random.multivariate_normal([mean_x, mean_y], cov, n).T
x = np.concatenate(([mean_x], x))
y = np.concatenate(([mean_y], y))
steps = np.empty(n+1)
tags = np.empty(n+1)
steps.fill(step)
tags.fill(tag_id)
pd.DataFrame(data = np.column_stack((steps,tags,x,y))).to_csv("oversample.csv", mode = 'a', header = False)
def oversampleDf2(df, n):
filename = "oversample.csv"
d = pd.DataFrame(list())
d.to_csv(filename)
#count = 0
for row in df.itertuples(index=False):
#count = count + 1
gaussianOversample3(row, n)
#if count % 10000 == 0:
# print("Row: ", count)
由于读取文件的方式,我必须执行以下操作:
oversampleDf2(defensive_df2, num_oversamples)
oversampled_df = pd.read_csv("oversample_10.csv", sep= ' ')
oversampled_df.columns = ['col']
oversampled_df = oversampled_df.col.str.split(",",expand=True)
oversampled_df.columns = ['temp', 'step', 'tag_id', 'x_pos', 'y_pos']
oversampled_df = oversampled_df.drop(['temp'], axis = 1)
oversampled_df = oversampled_df.astype(float)