使用 Pandas 将列复制到文件时遇到问题
Trouble copying a column to a file using Pandas
代码的目标只是对两列执行独热编码(OHE),并将其余列按原始文件中的样子原样写出。但是,如图所示,Dur 列在写入第二个文件时不知何故出现了异常("bugging"),写出的小数位数比原始值多。我不想限制该字段的长度,因为原始文件非常大,其中各行的该字段可能有长有短,限制长度可能会使以后的分析复杂化。
Image of the problem
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
def opendataset():
    """Load the capture that is going to be encoded.

    Returns:
        pandas.DataFrame: the contents of 'originalfiletest.binetflow'
        parsed as CSV with pandas' default settings.
    """
    # The path is relative, so it is resolved against the current working
    # directory of the process.
    return pd.read_csv('originalfiletest.binetflow')
def writefile():
    """Write the encoded result to 'newfiletest.binetflow'.

    Reads the module-level ``df`` and writes only the listed columns, in the
    listed order, without the row index.
    """
    df.to_csv(
        'newfiletest.binetflow',
        columns=['Dur', 'Proto', 'State', 'TotBytes',
                 'average_packet_size', 'average_bits_psecond'],
        index=False,
    )
def writebackupproto():
    """Write the Proto2/Proto column pair to 'fieldprotobackup.binetflow'.

    Reads the module-level ``df``; the row index is not written.
    """
    df.to_csv(
        'fieldprotobackup.binetflow',
        columns=['Proto2', 'Proto'],
        index=False,
    )
def writebackupstate():
    """Write the State2/State column pair to 'fieldstatebackup.binetflow'.

    Reads the module-level ``df``; the row index is not written.
    """
    df.to_csv(
        'fieldstatebackup.binetflow',
        columns=['State2', 'State'],
        index=False,
    )
# Load the input; the writer functions above read this module-level frame.
df = opendataset()
# Copy the untouched State/Proto values into side columns before the
# originals are overwritten below; the backup files pair them up.
df['State2'] = df['State']
df['Proto2'] = df['Proto']
le = LabelEncoder()
# NOTE: this binds a second name to the same DataFrame, it is not a copy.
# Every assignment made through dfle below is therefore visible through df.
dfle = df
# Replace State with the integer labels produced by the LabelEncoder.
dfle.State = le.fit_transform(dfle.State)
# Double brackets select a one-column DataFrame, so .values is 2-D.
X = dfle[['State']].values
Y = dfle[['Proto']].values
ohe = OneHotEncoder()
# The same encoder object is refit for each column; toarray() turns each
# result into a dense array.
OnehotX = ohe.fit_transform(X).toarray()
OnehotY = ohe.fit_transform(Y).toarray()
dx = pd.DataFrame(data=OnehotX)
dy = pd.DataFrame(data=OnehotY)
# Collapse every one-hot row into a single string of digits by casting each
# cell to int, then to str, and joining them with no separator.
# dx.columns[0:] selects all columns.
dfle['State'] = (dx[dx.columns[0:]].apply(lambda x:''.join(x.dropna().astype(int).astype(str)), axis=1))
dfle['Proto'] = (dy[dy.columns[0:]].apply(lambda y:''.join(y.dropna().astype(int).astype(str)), axis=1))
# Write the main output first, then the two lookup files.
writefile()
writebackupproto()
writebackupstate()
看起来唯一的问题是您的数值没有被截断。您可以使用 pandas.Series.apply 方法配合一个"截断 lambda"("truncating lambda")来获得预期的结果。
注意:'%.6f' 实际上是把数值四舍五入到 6 位小数,并且结果是字符串而不是浮点数。
# Format every Dur value with six decimal places ('%.6f' rounds rather than
# truncates); after this the column holds strings, not floats.
df.Dur = df.Dur.apply(lambda n: '%.6f' % n)
一个工作示例可能是截断 pi
from math import pi

# Ten copies of pi, kept at full float precision.
non_truncated = pd.Series([pi] * 10)
# Format each element with two decimal places; the result is a Series of
# strings and non_truncated itself is left unchanged.
non_truncated.apply(lambda value: '%.2f' % value)
你会得到一个被截断的 Series:
0 3.14
1 3.14
2 3.14
3 3.14
4 3.14
5 3.14
6 3.14
7 3.14
8 3.14
9 3.14
dtype: object
代码的目标只是对两列执行独热编码(OHE),并将其余列按原始文件中的样子原样写出。但是,如图所示,Dur 列在写入第二个文件时不知何故出现了异常("bugging"),写出的小数位数比原始值多。我不想限制该字段的长度,因为原始文件非常大,其中各行的该字段可能有长有短,限制长度可能会使以后的分析复杂化。
Image of the problem
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
def opendataset():
    """Load the capture that is going to be encoded.

    Returns:
        pandas.DataFrame: the contents of 'originalfiletest.binetflow'
        parsed as CSV with pandas' default settings.
    """
    # The path is relative, so it is resolved against the current working
    # directory of the process.
    return pd.read_csv('originalfiletest.binetflow')
def writefile():
    """Write the encoded result to 'newfiletest.binetflow'.

    Reads the module-level ``df`` and writes only the listed columns, in the
    listed order, without the row index.
    """
    df.to_csv(
        'newfiletest.binetflow',
        columns=['Dur', 'Proto', 'State', 'TotBytes',
                 'average_packet_size', 'average_bits_psecond'],
        index=False,
    )
def writebackupproto():
    """Write the Proto2/Proto column pair to 'fieldprotobackup.binetflow'.

    Reads the module-level ``df``; the row index is not written.
    """
    df.to_csv(
        'fieldprotobackup.binetflow',
        columns=['Proto2', 'Proto'],
        index=False,
    )
def writebackupstate():
    """Write the State2/State column pair to 'fieldstatebackup.binetflow'.

    Reads the module-level ``df``; the row index is not written.
    """
    df.to_csv(
        'fieldstatebackup.binetflow',
        columns=['State2', 'State'],
        index=False,
    )
# Load the input; the writer functions above read this module-level frame.
df = opendataset()
# Copy the untouched State/Proto values into side columns before the
# originals are overwritten below; the backup files pair them up.
df['State2'] = df['State']
df['Proto2'] = df['Proto']
le = LabelEncoder()
# NOTE: this binds a second name to the same DataFrame, it is not a copy.
# Every assignment made through dfle below is therefore visible through df.
dfle = df
# Replace State with the integer labels produced by the LabelEncoder.
dfle.State = le.fit_transform(dfle.State)
# Double brackets select a one-column DataFrame, so .values is 2-D.
X = dfle[['State']].values
Y = dfle[['Proto']].values
ohe = OneHotEncoder()
# The same encoder object is refit for each column; toarray() turns each
# result into a dense array.
OnehotX = ohe.fit_transform(X).toarray()
OnehotY = ohe.fit_transform(Y).toarray()
dx = pd.DataFrame(data=OnehotX)
dy = pd.DataFrame(data=OnehotY)
# Collapse every one-hot row into a single string of digits by casting each
# cell to int, then to str, and joining them with no separator.
# dx.columns[0:] selects all columns.
dfle['State'] = (dx[dx.columns[0:]].apply(lambda x:''.join(x.dropna().astype(int).astype(str)), axis=1))
dfle['Proto'] = (dy[dy.columns[0:]].apply(lambda y:''.join(y.dropna().astype(int).astype(str)), axis=1))
# Write the main output first, then the two lookup files.
writefile()
writebackupproto()
writebackupstate()
看起来唯一的问题是您的数值没有被截断。您可以使用 pandas.Series.apply 方法配合一个"截断 lambda"("truncating lambda")来获得预期的结果。
注意:'%.6f' 实际上是把数值四舍五入到 6 位小数,并且结果是字符串而不是浮点数。
# Format every Dur value with six decimal places ('%.6f' rounds rather than
# truncates); after this the column holds strings, not floats.
df.Dur = df.Dur.apply(lambda n: '%.6f' % n)
一个工作示例可能是截断 pi
from math import pi

# Ten copies of pi, kept at full float precision.
non_truncated = pd.Series([pi] * 10)
# Format each element with two decimal places; the result is a Series of
# strings and non_truncated itself is left unchanged.
non_truncated.apply(lambda value: '%.2f' % value)
你会得到一个被截断的 Series:
0 3.14
1 3.14
2 3.14
3 3.14
4 3.14
5 3.14
6 3.14
7 3.14
8 3.14
9 3.14
dtype: object