获取 pandas 中每一列的方差
Getting the variance of each column in pandas
我想计算保存在训练和测试文件中的特征的方差:
col1 Feature0 Feature1 Feature2 Feature3 Feature4 Feature5 Feature6 Feature7 Feature8 Feature9
col2 26658 40253.5 3.22115e+09 0.0277727 5.95939 266.56 734.248 307.364 0.000566779 0.000520574
col3 2658 4053.5 3.25e+09 0.0277 5.95939 266.56 734.248 307.364 0.000566779 0.000520574
....
为此,我写了以下内容:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from matplotlib import pyplot as plt
# Read the training and test feature tables from CSV files.
training_file = 'Training.csv'
testing_file = 'Test.csv'
Training_Frame = pd.read_csv(training_file)
# Testing_Frame is loaded here but not used anywhere else in this snippet.
Testing_Frame = pd.read_csv(testing_file)
# Bare expression: the shape is only displayed in an interactive session.
Training_Frame.shape
# With the feature values loaded, rescale them.
# NOTE: despite the name `stdsc`, the scaler created here is MinMaxScaler.
stdsc = preprocessing.MinMaxScaler()
# Fit the scaler on every column except the last two and transform them.
np_scaled_train = stdsc.fit_transform(Training_Frame.iloc[:,:-2])
# NOTE(review): VarianceThreshold is not among the imports shown with this
# snippet; it presumably needs
# `from sklearn.feature_selection import VarianceThreshold` to run.
sel = VarianceThreshold(threshold=(.2 * (1 - .2)))
# The return value of fit_transform is discarded, so nothing below uses the
# selector's output.
sel.fit_transform(np_scaled_train)
# The frame is built from np_scaled_train (the scaled matrix), not from the
# selector, so the CSV written below contains only the scaled data.
pd_scaled_train = pd.DataFrame(data=np_scaled_train)
pd_scaled_train.to_csv('variance_result.csv',header=False, index=False)
这显然行不通:variance_result.csv 中的结果只是归一化后的训练矩阵。
所以我的问题是如何获得方差低于 20% 的列(特征)的索引。
提前致谢 !
更新
我已经通过这种方式解决了方差问题:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.feature_selection import VarianceThreshold
# Read the training and test feature tables from CSV files.
training_file = 'Training.csv'
testing_file = 'Test.csv'
Training_Frame = pd.read_csv(training_file)
# Testing_Frame is loaded here but not used anywhere else in this snippet.
Testing_Frame = pd.read_csv(testing_file)
# Bare expression: the shape is only displayed in an interactive session.
Training_Frame.shape
# With the feature values loaded, rescale them.
# NOTE: despite the name `stdsc`, the scaler created here is MinMaxScaler.
stdsc = preprocessing.MinMaxScaler()
# Fit the scaler on every column except the last two and transform them.
np_scaled_train = stdsc.fit_transform(Training_Frame.iloc[:,:-2])
pd_scaled_train = pd.DataFrame(data=np_scaled_train)
# Apply np.var along axis=0, i.e. once per column, giving one variance value
# per scaled feature.
variance =pd_scaled_train.apply(np.var,axis=0)
# Save the scaled matrix itself, without header row or index column.
pd_scaled_train.to_csv('variance_result.csv',header=False, index=False)
# Pair each variance with the name of the column it came from: the second
# positional argument (the frame's index) is the original column names minus
# the last two, matching the columns that were scaled above.
temp_df = pd.DataFrame(variance.values,Training_Frame.columns.values[:-2])
# Transpose so the feature names become the columns, then write the result.
temp_df.T.to_csv('Training_features_variance.csv',index=False)
现在,我仍然不知道除了运行一个循环之外,如何从 variance 中获取方差大于 0.2 的特征索引!
只需将阈值设置为 0.0,然后使用 VarianceThreshold 对象的 variances_
属性来获取所有特征的方差,然后您就可以确定哪些特征的方差较小。
from sklearn.feature_selection import VarianceThreshold
# Toy data: three rows (samples) with four columns (features).
X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
# No threshold argument is passed; the accompanying text refers to this as
# setting the threshold to 0.0.
selector = VarianceThreshold()
# Fit on X; the transformed return value is not used here.
selector.fit_transform(X)
# After fitting, variances_ holds one variance per feature (column) of X.
selector.variances_
# Displayed result: the features at positions 0 and 3 have zero variance.
#Output: array([ 0. , 0.22222222, 2.88888889, 0. ])
我想计算保存在训练和测试文件中的特征的方差:
col1 Feature0 Feature1 Feature2 Feature3 Feature4 Feature5 Feature6 Feature7 Feature8 Feature9
col2 26658 40253.5 3.22115e+09 0.0277727 5.95939 266.56 734.248 307.364 0.000566779 0.000520574
col3 2658 4053.5 3.25e+09 0.0277 5.95939 266.56 734.248 307.364 0.000566779 0.000520574
....
为此,我写了以下内容:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from matplotlib import pyplot as plt
# Read the training and test feature tables from CSV files.
training_file = 'Training.csv'
testing_file = 'Test.csv'
Training_Frame = pd.read_csv(training_file)
# Testing_Frame is loaded here but not used anywhere else in this snippet.
Testing_Frame = pd.read_csv(testing_file)
# Bare expression: the shape is only displayed in an interactive session.
Training_Frame.shape
# With the feature values loaded, rescale them.
# NOTE: despite the name `stdsc`, the scaler created here is MinMaxScaler.
stdsc = preprocessing.MinMaxScaler()
# Fit the scaler on every column except the last two and transform them.
np_scaled_train = stdsc.fit_transform(Training_Frame.iloc[:,:-2])
# NOTE(review): VarianceThreshold is not among the imports shown with this
# snippet; it presumably needs
# `from sklearn.feature_selection import VarianceThreshold` to run.
sel = VarianceThreshold(threshold=(.2 * (1 - .2)))
# The return value of fit_transform is discarded, so nothing below uses the
# selector's output.
sel.fit_transform(np_scaled_train)
# The frame is built from np_scaled_train (the scaled matrix), not from the
# selector, so the CSV written below contains only the scaled data.
pd_scaled_train = pd.DataFrame(data=np_scaled_train)
pd_scaled_train.to_csv('variance_result.csv',header=False, index=False)
这显然行不通:variance_result.csv 中的结果只是归一化后的训练矩阵。
所以我的问题是如何获得方差低于 20% 的列(特征)的索引。
提前致谢 !
更新
我已经通过这种方式解决了方差问题:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.feature_selection import VarianceThreshold
# Read the training and test feature tables from CSV files.
training_file = 'Training.csv'
testing_file = 'Test.csv'
Training_Frame = pd.read_csv(training_file)
# Testing_Frame is loaded here but not used anywhere else in this snippet.
Testing_Frame = pd.read_csv(testing_file)
# Bare expression: the shape is only displayed in an interactive session.
Training_Frame.shape
# With the feature values loaded, rescale them.
# NOTE: despite the name `stdsc`, the scaler created here is MinMaxScaler.
stdsc = preprocessing.MinMaxScaler()
# Fit the scaler on every column except the last two and transform them.
np_scaled_train = stdsc.fit_transform(Training_Frame.iloc[:,:-2])
pd_scaled_train = pd.DataFrame(data=np_scaled_train)
# Apply np.var along axis=0, i.e. once per column, giving one variance value
# per scaled feature.
variance =pd_scaled_train.apply(np.var,axis=0)
# Save the scaled matrix itself, without header row or index column.
pd_scaled_train.to_csv('variance_result.csv',header=False, index=False)
# Pair each variance with the name of the column it came from: the second
# positional argument (the frame's index) is the original column names minus
# the last two, matching the columns that were scaled above.
temp_df = pd.DataFrame(variance.values,Training_Frame.columns.values[:-2])
# Transpose so the feature names become the columns, then write the result.
temp_df.T.to_csv('Training_features_variance.csv',index=False)
现在,我仍然不知道除了运行一个循环之外,如何从 variance 中获取方差大于 0.2 的特征索引!
只需将阈值设置为 0.0,然后使用 VarianceThreshold 对象的 variances_
属性来获取所有特征的方差,然后您就可以确定哪些特征的方差较小。
from sklearn.feature_selection import VarianceThreshold
# Toy data: three rows (samples) with four columns (features).
X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
# No threshold argument is passed; the accompanying text refers to this as
# setting the threshold to 0.0.
selector = VarianceThreshold()
# Fit on X; the transformed return value is not used here.
selector.fit_transform(X)
# After fitting, variances_ holds one variance per feature (column) of X.
selector.variances_
# Displayed result: the features at positions 0 and 3 have zero variance.
#Output: array([ 0. , 0.22222222, 2.88888889, 0. ])