大数据帧上的二维高斯过采样

Question

我目前有以下格式的数据框：

step  tag_id  x_pos   y_pos
1     1         5      3
1     2         3      4
2     1         2      2
2     3         1      6
.........................
.........................
N     1         5      7

对于 df 中的每一行，我的目标是额外添加 m 行，这些行是从分别以该行 x 和 y 值为均值的（相互独立的）高斯分布中采样得到的。因此，N = 100、m = 10 的 df 最终长度为 1100（每个原始行加上它的 10 个过采样行），其中既包含原始值，也包含过采样值。

我为此编写的代码有效，但在大型数据集 (N > 100k) 上速度极慢。我确信有许多操作（创建新数组/dfs、使用 itertuples 等）会影响性能；我将不胜感激任何关于如何提高性能的帮助，以便我可以在整个数据集上生成更高的 m 值。例如：输入数据来自 pandas 数据框，但多元正态函数对 numpy 数组进行操作。有没有一种更自然的方法可以通过 pandas 实现这一点，而无需在 numpy 数组和数据帧之间进行复制？谢谢！

可重现的例子：

import pandas as pd
import numpy as np
import random


def gaussianOversample2(row, n, sigma=(1, 1)):
    """Draw ``n`` Gaussian samples around one row's (x_pos, y_pos) point.

    Args:
        row: Object exposing ``step``, ``tag_id``, ``x_pos`` and ``y_pos``
            attributes (for example a tuple from ``DataFrame.itertuples``).
        n: Number of extra samples to draw.
        sigma: Standard deviation of the x and y distributions, either a
            scalar applied to both axes or an ``(sx, sy)`` pair. Defaults to
            ``(1, 1)``, the value that used to be hard-coded.

    Returns:
        Tuple ``(x, y, steps, tags)`` of float arrays of length ``n + 1``.
        Element 0 of ``x`` and ``y`` is the row's own position; the remaining
        ``n`` elements are the samples. ``steps`` and ``tags`` repeat the
        row's ``step`` and ``tag_id``.
    """
    mean_x = float(row.x_pos)
    mean_y = float(row.y_pos)

    # Diagonal covariance: x and y are sampled independently of each other.
    std = np.broadcast_to(np.asarray(sigma, dtype=float), (2,))
    cov = np.diag(std ** 2)
    samples = np.random.multivariate_normal([mean_x, mean_y], cov, n)

    # Keep the original observation as the first entry of each column.
    x = np.concatenate(([mean_x], samples[:, 0]))
    y = np.concatenate(([mean_y], samples[:, 1]))

    steps = np.full(n + 1, row.step, dtype=float)
    tags = np.full(n + 1, row.tag_id, dtype=float)

    return x, y, steps, tags
    
    
def oversampleDf(df, n):
    """Return ``df`` with ``n`` Gaussian-oversampled rows added per input row.

    Args:
        df: Input frame whose first four columns are, in order, ``step``,
            ``tag_id``, ``x_pos`` and ``y_pos``.
        n: Number of oversampled rows to generate for every input row.

    Returns:
        A float DataFrame with ``len(df) * (n + 1)`` rows and the columns of
        ``df``. Each input row is followed by its ``n`` samples, and the index
        restarts at 0 for every input row (0..n), as the append-based version
        produced.
    """
    block = n + 1
    n_rows = len(df)

    # Fill one preallocated array instead of growing a DataFrame in the loop:
    # appending re-copies every accumulated row on each iteration, and
    # DataFrame.append no longer exists in pandas >= 2.0.
    out = np.zeros((n_rows * block, len(df.columns)))

    for count, row in enumerate(df.itertuples(index=False), start=1):
        oversample_x, oversample_y, steps, tags = gaussianOversample2(row, n)
        chunk = out[(count - 1) * block:count * block]  # view into ``out``
        chunk[:, 0] = steps
        chunk[:, 1] = tags
        chunk[:, 2] = oversample_x
        chunk[:, 3] = oversample_y

        if count % 1000 == 0:
            print("Row: ", count)

    index = np.tile(np.arange(block), n_rows)
    return pd.DataFrame(out, columns=df.columns, index=index)

# Small reproducible input: one row per (step, tag_id) observation.
df = pd.DataFrame(
    [[1, 1, 5, 3], [1, 2, 3, 4], [2, 1, 2, 2], [2, 3, 1, 6]],
    columns=['step', 'tag_id', 'x_pos', 'y_pos'],
)

# 20 oversampled rows per input row -> 4 * 21 = 84 rows in total.
res = oversampleDf(df, 20)

"""
# Result should be:
    step  tag_id     x_pos     y_pos
0    1.0     1.0  5.000000  3.000000
1    1.0     1.0  3.423492  3.886602
2    1.0     1.0  5.404581  2.177559
3    1.0     1.0  4.023274  2.883737
4    1.0     1.0  3.390710  3.038782
..   ...     ...       ...       ...
16   2.0     3.0  1.894151  5.510321
17   2.0     3.0  1.110932  5.281578
18   2.0     3.0  1.623538  4.529825
19   2.0     3.0 -0.576756  7.476872
20   2.0     3.0 -0.866123  5.898048
"""

Answer 1

这是我自己找到的解决方案；它更像是一种变通办法，而不是真正更快的技术方案。我改为把每一行的结果追加写入一个 csv 文件，全部写完后再把文件读回来，如下所示：

def gaussianOversample3(row, n):
    """Append one row and ``n`` Gaussian samples of it to ``oversample.csv``.

    ``row`` must expose ``step``, ``tag_id``, ``x_pos`` and ``y_pos``
    attributes. The ``n + 1`` records (original position first, then the
    samples drawn with unit variance on each axis) are appended to the file
    without a header; the frame's 0..n index is written as the first field.
    Returns ``None``.
    """
    center = [float(row.x_pos), float(row.y_pos)]
    unit_cov = np.diag(np.array([1, 1]) ** 2)
    draws = np.random.multivariate_normal(center, unit_cov, n)

    # Original observation first, followed by the sampled positions.
    xs = np.concatenate(([center[0]], draws[:, 0]))
    ys = np.concatenate(([center[1]], draws[:, 1]))

    step_col = np.full(n + 1, row.step, dtype=float)
    tag_col = np.full(n + 1, row.tag_id, dtype=float)

    table = pd.DataFrame(data=np.column_stack((step_col, tag_col, xs, ys)))
    table.to_csv("oversample.csv", mode='a', header=False)
 
def oversampleDf2(df, n):
    """Write every row of ``df`` plus ``n`` samples of it to ``oversample.csv``.

    The file is first overwritten with an empty DataFrame, then each row is
    handed to ``gaussianOversample3``, which appends its records. Returns
    ``None``; the result lives only in the file.
    """
    filename = "oversample.csv"
    # Start from a fresh file so earlier runs do not leak into this one.
    pd.DataFrame([]).to_csv(filename)

    for record in df.itertuples(index=False):
        gaussianOversample3(record, n)

由于读取文件的方式，我必须执行以下操作：

oversampleDf2(defensive_df2, num_oversamples)

# Read back the file oversampleDf2 just wrote ("oversample.csv"). Its first
# line comes from the empty DataFrame used to create the file, so it is
# skipped; every following line is "<index>,<step>,<tag_id>,<x_pos>,<y_pos>".
oversampled_df = pd.read_csv(
    "oversample.csv",
    skiprows=1,
    header=None,
    names=['temp', 'step', 'tag_id', 'x_pos', 'y_pos'],
)
# 'temp' holds the per-chunk index written by to_csv; it carries no data.
oversampled_df = oversampled_df.drop(['temp'], axis = 1)
oversampled_df = oversampled_df.astype(float)

大数据帧上的二维高斯过采样

2D Gaussian oversampling over large dataframe

python

dataframe

pandas

oversampling