如何构建循环,在 pandas 中为不同分段运行 PCA 和 K 均值
How to build a loop for running PCA and K-means for different segments in pandas
我想创建一个循环,为不同国家运行 PCA 和 K 均值。虽然我可以为每个国家分别计算,然后合并或连接结果,但我希望以一种简单的方式在一次迭代中为所有国家/地区完成此操作,从而节省涉及的代码行数和计算量。
下面是美国客户的代码。
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Demo input: one row per customer, labelled by country
# (16 Canada rows, 15 Japan rows, 21 US rows).
data = {'country': ["Canada"] * 16 + ["Japan"] * 15 + ["US"] * 21,
        'Inv_count': [91064, 49190, 120584, 27348, 107865, 58944, 34278, 47936, 19229, 18716, 34958, 68670, 86187, 111691, 4583, 99969, 58868, 137728, 61817, 89282, 109959, 94515, 64524, 39598, 34791, 113512, 89053, 113504, 97454, 74338, 127219, 134734, 2776, 74876, 93921, 96265, 5354, 114885, 58670, 103276, 27973, 13052, 60989, 27008, 52628, 139029, 57010, 21174, 137186, 64757, 32629, 47240],
        'debit_count': [5, 8, 16, 2, 17, 2, 16, 28, 1, 7, 1, 2, 9, 5, 9, 129, 4, 5, 19, 12, 7, 8, 2, 2, 5, 1, 1, 34, 8, 27, 11, 11, 1, 12, 8, 4, 1, 8, 55, 1, 67, 1, 251, 44, 1, 1, 2, 733, 51, 33, 12, 12],
        'credit_count': [48, 699, 290, 570, 307, 632, 161, 740, 203, 268, 391, 647, 542, 372, 129, 756, 89, 454, 907, 110, 962, 539, 371, 997, 463, 910, 50, 422, 662, 545, 318, 909, 209, 635, 614, 298, 978, 884, 829, 342, 987, 98, 201, 843, 799, 781, 738, 568, 663, 489, 493, 337],
        'Earlycount': [48, 699, 290, 570, 307, 632, 161, 740, 203, 268, 391, 647, 542, 372, 129, 756, 89, 454, 907, 110, 962, 539, 371, 997, 463, 910, 50, 422, 662, 545, 318, 909, 209, 635, 614, 298, 978, 884, 829, 342, 987, 98, 201, 843, 799, 781, 738, 568, 663, 489, 493, 337]}
df = pd.DataFrame(data)

# Restrict to the US rows and keep only their numeric columns.
us_rows = df[df['country'] == 'US']
us_features = us_rows.select_dtypes(exclude=[object])

# Project the numeric features onto the first two principal components.
principalComponents = PCA(n_components=2).fit_transform(us_features)
principalDf = pd.DataFrame(data=principalComponents, columns=['p1', 'p2'])

# Cluster the projected points into 3 groups; random_state pins the result.
np.random.seed(131)
y_predicted = KMeans(n_clusters=3, random_state=3425).fit_predict(principalDf)
- 构建您的代码,使计算返回单个国家/地区的数据框
- 然后只需简单地使用 groupby 和 apply
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Build the demo DataFrame in one expression: one row per customer,
# labelled by country (16 Canada rows, 15 Japan rows, 21 US rows).
df = pd.DataFrame({
    'country': ["Canada"] * 16 + ["Japan"] * 15 + ["US"] * 21,
    'Inv_count': [91064, 49190, 120584, 27348, 107865, 58944, 34278, 47936, 19229, 18716, 34958, 68670, 86187, 111691, 4583, 99969, 58868, 137728, 61817, 89282, 109959, 94515, 64524, 39598, 34791, 113512, 89053, 113504, 97454, 74338, 127219, 134734, 2776, 74876, 93921, 96265, 5354, 114885, 58670, 103276, 27973, 13052, 60989, 27008, 52628, 139029, 57010, 21174, 137186, 64757, 32629, 47240],
    'debit_count': [5, 8, 16, 2, 17, 2, 16, 28, 1, 7, 1, 2, 9, 5, 9, 129, 4, 5, 19, 12, 7, 8, 2, 2, 5, 1, 1, 34, 8, 27, 11, 11, 1, 12, 8, 4, 1, 8, 55, 1, 67, 1, 251, 44, 1, 1, 2, 733, 51, 33, 12, 12],
    'credit_count': [48, 699, 290, 570, 307, 632, 161, 740, 203, 268, 391, 647, 542, 372, 129, 756, 89, 454, 907, 110, 962, 539, 371, 997, 463, 910, 50, 422, 662, 545, 318, 909, 209, 635, 614, 298, 978, 884, 829, 342, 987, 98, 201, 843, 799, 781, 738, 568, 663, 489, 493, 337],
    'Earlycount': [48, 699, 290, 570, 307, 632, 161, 740, 203, 268, 391, 647, 542, 372, 129, 756, 89, 454, 907, 110, 962, 539, 371, 997, 463, 910, 50, 422, 662, 545, 318, 909, 209, 635, 614, 298, 978, 884, 829, 342, 987, 98, 201, 843, 799, 781, 738, 568, 663, 489, 493, 337],
})
def pcakmeanscalc(country_df):
    """Run 2-component PCA and 3-cluster K-means on one country's rows.

    Parameters
    ----------
    country_df : pd.DataFrame
        Rows belonging to a single country; non-numeric (object) columns
        are dropped before fitting.

    Returns
    -------
    pd.DataFrame
        A single-row frame whose cells hold the arrays of first and second
        principal components ('p1', 'p2') and the K-means labels ('kmeans').
    """
    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(country_df.select_dtypes(exclude=[object]))
    # KMeans is fully determined by random_state, so the original
    # np.random.seed(131) call was redundant and only mutated global
    # RNG state as a side effect; it has been removed.
    km = KMeans(n_clusters=3, random_state=3425)
    y_predicted = km.fit_predict(principalComponents)
    return pd.DataFrame([{"p1": principalComponents[:, 0],
                          "p2": principalComponents[:, 1],
                          "kmeans": y_predicted}])

# One result row per country; droplevel(1) removes the dummy inner index.
df.groupby("country").apply(pcakmeanscalc).droplevel(1)
输出
p1 p2 kmeans
country
Canada [29737.58741546159, -12135.271593139332, 59257... [565.2987589576035, -405.69927102369314, 258.6... [1, 2, 1, 0, 1, 2, 0, 2, 0, 0, 0, 2, 1, 1, 0, 1]
Japan [28208.93539830322, -50650.59428312244, 25260.... [633.5574877384774, 49.929437144969306, -525.7... [1, 0, 1, 2, 0, 2, 1, 1, 1, 0, 2, 0, 2, 2, 0]
US [69714.07771102455, -62245.354747757076, 9855.... [311.8510307588487, -450.68732626950384, 27.36... [1, 2, 0, 1, 1, 2, 1, 0, 1, 2, 2, 0, 2, 0, 1, ...
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

np.random.seed(100)

# define the input data frame: one row per customer, labelled by country
df = pd.DataFrame({
    'country': ['Canada'] * 16 + ['Japan'] * 15 + ['US'] * 21,
    'inv_count': [91064, 49190, 120584, 27348, 107865, 58944, 34278, 47936, 19229, 18716, 34958, 68670, 86187, 111691, 4583, 99969, 58868, 137728, 61817, 89282, 109959, 94515, 64524, 39598, 34791, 113512, 89053, 113504, 97454, 74338, 127219, 134734, 2776, 74876, 93921, 96265, 5354, 114885, 58670, 103276, 27973, 13052, 60989, 27008, 52628, 139029, 57010, 21174, 137186, 64757, 32629, 47240],
    'debit_count': [5, 8, 16, 2, 17, 2, 16, 28, 1, 7, 1, 2, 9, 5, 9, 129, 4, 5, 19, 12, 7, 8, 2, 2, 5, 1, 1, 34, 8, 27, 11, 11, 1, 12, 8, 4, 1, 8, 55, 1, 67, 1, 251, 44, 1, 1, 2, 733, 51, 33, 12, 12],
    'credit_count': [48, 699, 290, 570, 307, 632, 161, 740, 203, 268, 391, 647, 542, 372, 129, 756, 89, 454, 907, 110, 962, 539, 371, 997, 463, 910, 50, 422, 662, 545, 318, 909, 209, 635, 614, 298, 978, 884, 829, 342, 987, 98, 201, 843, 799, 781, 738, 568, 663, 489, 493, 337],
    'early_count': [48, 699, 290, 570, 307, 632, 161, 740, 203, 268, 391, 647, 542, 372, 129, 756, 89, 454, 907, 110, 962, 539, 371, 997, 463, 910, 50, 422, 662, 545, 318, 909, 209, 635, 614, 298, 978, 884, 829, 342, 987, 98, 201, 843, 799, 781, 738, 568, 663, 489, 493, 337]
})

# numeric columns fed to the PCA (everything except the country label)
feature_cols = ['inv_count', 'debit_count', 'credit_count', 'early_count']

# add 2 columns for storing the PCA values
df['pca_1'] = np.nan
df['pca_2'] = np.nan
# add 1 column for storing the cluster labels
df['cluster'] = np.nan

# loop across the different countries
for country in df['country'].unique():
    mask = df['country'] == country
    # Select only the feature columns for this country. Unlike the original
    # drop(..., inplace=True) on a slice of df, this never mutates a view,
    # so there is no SettingWithCopyWarning to silence with
    # pd.set_option('chained_assignment', None).
    features = df.loc[mask, feature_cols]
    # extract the principal components
    pca = PCA(n_components=2).fit_transform(features)
    # extract the cluster labels (deterministic via random_state)
    clusters = KMeans(n_clusters=3, random_state=123).fit_predict(pca)
    # save the results back into the original frame
    df.loc[mask, 'pca_1'] = pca[:, 0]
    df.loc[mask, 'pca_2'] = pca[:, 1]
    df.loc[mask, 'cluster'] = clusters

df.head()
#    country  inv_count  debit_count  ...         pca_1       pca_2  cluster
# 0   Canada      91064            5  ...  29737.587415  565.298759      0.0
# 1   Canada      49190            8  ... -12135.271593 -405.699271      2.0
# 2   Canada     120584           16  ...  59257.979929  258.659878      0.0
# 3   Canada      27348            2  ... -33977.476225 -249.638903      1.0
# 4   Canada     107865           17  ...  46539.018792  219.183397      0.0
df.tail()
#    country  inv_count  debit_count  ...         pca_1       pca_2  cluster
# 47      US      21174          733  ... -43847.163578    7.510766      0.0
# 48      US     137186           51  ...  72165.447415  -41.127238      1.0
# 49      US      64757           33  ...   -263.822921 -162.163828      2.0
# 50      US      32629           12  ... -32391.736467 -100.762044      0.0
# 51      US      47240           12  ... -17781.137860 -346.435270      2.0
我想创建一个循环,为不同国家运行 PCA 和 K 均值。虽然我可以为每个国家分别计算,然后合并或连接结果,但我希望以一种简单的方式在一次迭代中为所有国家/地区完成此操作,从而节省涉及的代码行数和计算量。
下面是美国客户的代码。
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Demo input: one row per customer, labelled by country
# (16 Canada rows, 15 Japan rows, 21 US rows).
data = {'country': ["Canada"] * 16 + ["Japan"] * 15 + ["US"] * 21,
        'Inv_count': [91064, 49190, 120584, 27348, 107865, 58944, 34278, 47936, 19229, 18716, 34958, 68670, 86187, 111691, 4583, 99969, 58868, 137728, 61817, 89282, 109959, 94515, 64524, 39598, 34791, 113512, 89053, 113504, 97454, 74338, 127219, 134734, 2776, 74876, 93921, 96265, 5354, 114885, 58670, 103276, 27973, 13052, 60989, 27008, 52628, 139029, 57010, 21174, 137186, 64757, 32629, 47240],
        'debit_count': [5, 8, 16, 2, 17, 2, 16, 28, 1, 7, 1, 2, 9, 5, 9, 129, 4, 5, 19, 12, 7, 8, 2, 2, 5, 1, 1, 34, 8, 27, 11, 11, 1, 12, 8, 4, 1, 8, 55, 1, 67, 1, 251, 44, 1, 1, 2, 733, 51, 33, 12, 12],
        'credit_count': [48, 699, 290, 570, 307, 632, 161, 740, 203, 268, 391, 647, 542, 372, 129, 756, 89, 454, 907, 110, 962, 539, 371, 997, 463, 910, 50, 422, 662, 545, 318, 909, 209, 635, 614, 298, 978, 884, 829, 342, 987, 98, 201, 843, 799, 781, 738, 568, 663, 489, 493, 337],
        'Earlycount': [48, 699, 290, 570, 307, 632, 161, 740, 203, 268, 391, 647, 542, 372, 129, 756, 89, 454, 907, 110, 962, 539, 371, 997, 463, 910, 50, 422, 662, 545, 318, 909, 209, 635, 614, 298, 978, 884, 829, 342, 987, 98, 201, 843, 799, 781, 738, 568, 663, 489, 493, 337]}
df = pd.DataFrame(data)

# Restrict to the US rows and keep only their numeric columns.
us_rows = df[df['country'] == 'US']
us_features = us_rows.select_dtypes(exclude=[object])

# Project the numeric features onto the first two principal components.
principalComponents = PCA(n_components=2).fit_transform(us_features)
principalDf = pd.DataFrame(data=principalComponents, columns=['p1', 'p2'])

# Cluster the projected points into 3 groups; random_state pins the result.
np.random.seed(131)
y_predicted = KMeans(n_clusters=3, random_state=3425).fit_predict(principalDf)
- 构建您的代码,使计算返回单个国家/地区的数据框
- 然后只需简单地使用 groupby 和 apply
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Build the demo DataFrame in one expression: one row per customer,
# labelled by country (16 Canada rows, 15 Japan rows, 21 US rows).
df = pd.DataFrame({
    'country': ["Canada"] * 16 + ["Japan"] * 15 + ["US"] * 21,
    'Inv_count': [91064, 49190, 120584, 27348, 107865, 58944, 34278, 47936, 19229, 18716, 34958, 68670, 86187, 111691, 4583, 99969, 58868, 137728, 61817, 89282, 109959, 94515, 64524, 39598, 34791, 113512, 89053, 113504, 97454, 74338, 127219, 134734, 2776, 74876, 93921, 96265, 5354, 114885, 58670, 103276, 27973, 13052, 60989, 27008, 52628, 139029, 57010, 21174, 137186, 64757, 32629, 47240],
    'debit_count': [5, 8, 16, 2, 17, 2, 16, 28, 1, 7, 1, 2, 9, 5, 9, 129, 4, 5, 19, 12, 7, 8, 2, 2, 5, 1, 1, 34, 8, 27, 11, 11, 1, 12, 8, 4, 1, 8, 55, 1, 67, 1, 251, 44, 1, 1, 2, 733, 51, 33, 12, 12],
    'credit_count': [48, 699, 290, 570, 307, 632, 161, 740, 203, 268, 391, 647, 542, 372, 129, 756, 89, 454, 907, 110, 962, 539, 371, 997, 463, 910, 50, 422, 662, 545, 318, 909, 209, 635, 614, 298, 978, 884, 829, 342, 987, 98, 201, 843, 799, 781, 738, 568, 663, 489, 493, 337],
    'Earlycount': [48, 699, 290, 570, 307, 632, 161, 740, 203, 268, 391, 647, 542, 372, 129, 756, 89, 454, 907, 110, 962, 539, 371, 997, 463, 910, 50, 422, 662, 545, 318, 909, 209, 635, 614, 298, 978, 884, 829, 342, 987, 98, 201, 843, 799, 781, 738, 568, 663, 489, 493, 337],
})
def pcakmeanscalc(country_df):
    """Run 2-component PCA and 3-cluster K-means on one country's rows.

    Parameters
    ----------
    country_df : pd.DataFrame
        Rows belonging to a single country; non-numeric (object) columns
        are dropped before fitting.

    Returns
    -------
    pd.DataFrame
        A single-row frame whose cells hold the arrays of first and second
        principal components ('p1', 'p2') and the K-means labels ('kmeans').
    """
    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(country_df.select_dtypes(exclude=[object]))
    # KMeans is fully determined by random_state, so the original
    # np.random.seed(131) call was redundant and only mutated global
    # RNG state as a side effect; it has been removed.
    km = KMeans(n_clusters=3, random_state=3425)
    y_predicted = km.fit_predict(principalComponents)
    return pd.DataFrame([{"p1": principalComponents[:, 0],
                          "p2": principalComponents[:, 1],
                          "kmeans": y_predicted}])

# One result row per country; droplevel(1) removes the dummy inner index.
df.groupby("country").apply(pcakmeanscalc).droplevel(1)
输出
p1 p2 kmeans
country
Canada [29737.58741546159, -12135.271593139332, 59257... [565.2987589576035, -405.69927102369314, 258.6... [1, 2, 1, 0, 1, 2, 0, 2, 0, 0, 0, 2, 1, 1, 0, 1]
Japan [28208.93539830322, -50650.59428312244, 25260.... [633.5574877384774, 49.929437144969306, -525.7... [1, 0, 1, 2, 0, 2, 1, 1, 1, 0, 2, 0, 2, 2, 0]
US [69714.07771102455, -62245.354747757076, 9855.... [311.8510307588487, -450.68732626950384, 27.36... [1, 2, 0, 1, 1, 2, 1, 0, 1, 2, 2, 0, 2, 0, 1, ...
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

np.random.seed(100)

# define the input data frame: one row per customer, labelled by country
df = pd.DataFrame({
    'country': ['Canada'] * 16 + ['Japan'] * 15 + ['US'] * 21,
    'inv_count': [91064, 49190, 120584, 27348, 107865, 58944, 34278, 47936, 19229, 18716, 34958, 68670, 86187, 111691, 4583, 99969, 58868, 137728, 61817, 89282, 109959, 94515, 64524, 39598, 34791, 113512, 89053, 113504, 97454, 74338, 127219, 134734, 2776, 74876, 93921, 96265, 5354, 114885, 58670, 103276, 27973, 13052, 60989, 27008, 52628, 139029, 57010, 21174, 137186, 64757, 32629, 47240],
    'debit_count': [5, 8, 16, 2, 17, 2, 16, 28, 1, 7, 1, 2, 9, 5, 9, 129, 4, 5, 19, 12, 7, 8, 2, 2, 5, 1, 1, 34, 8, 27, 11, 11, 1, 12, 8, 4, 1, 8, 55, 1, 67, 1, 251, 44, 1, 1, 2, 733, 51, 33, 12, 12],
    'credit_count': [48, 699, 290, 570, 307, 632, 161, 740, 203, 268, 391, 647, 542, 372, 129, 756, 89, 454, 907, 110, 962, 539, 371, 997, 463, 910, 50, 422, 662, 545, 318, 909, 209, 635, 614, 298, 978, 884, 829, 342, 987, 98, 201, 843, 799, 781, 738, 568, 663, 489, 493, 337],
    'early_count': [48, 699, 290, 570, 307, 632, 161, 740, 203, 268, 391, 647, 542, 372, 129, 756, 89, 454, 907, 110, 962, 539, 371, 997, 463, 910, 50, 422, 662, 545, 318, 909, 209, 635, 614, 298, 978, 884, 829, 342, 987, 98, 201, 843, 799, 781, 738, 568, 663, 489, 493, 337]
})

# numeric columns fed to the PCA (everything except the country label)
feature_cols = ['inv_count', 'debit_count', 'credit_count', 'early_count']

# add 2 columns for storing the PCA values
df['pca_1'] = np.nan
df['pca_2'] = np.nan
# add 1 column for storing the cluster labels
df['cluster'] = np.nan

# loop across the different countries
for country in df['country'].unique():
    mask = df['country'] == country
    # Select only the feature columns for this country. Unlike the original
    # drop(..., inplace=True) on a slice of df, this never mutates a view,
    # so there is no SettingWithCopyWarning to silence with
    # pd.set_option('chained_assignment', None).
    features = df.loc[mask, feature_cols]
    # extract the principal components
    pca = PCA(n_components=2).fit_transform(features)
    # extract the cluster labels (deterministic via random_state)
    clusters = KMeans(n_clusters=3, random_state=123).fit_predict(pca)
    # save the results back into the original frame
    df.loc[mask, 'pca_1'] = pca[:, 0]
    df.loc[mask, 'pca_2'] = pca[:, 1]
    df.loc[mask, 'cluster'] = clusters

df.head()
#    country  inv_count  debit_count  ...         pca_1       pca_2  cluster
# 0   Canada      91064            5  ...  29737.587415  565.298759      0.0
# 1   Canada      49190            8  ... -12135.271593 -405.699271      2.0
# 2   Canada     120584           16  ...  59257.979929  258.659878      0.0
# 3   Canada      27348            2  ... -33977.476225 -249.638903      1.0
# 4   Canada     107865           17  ...  46539.018792  219.183397      0.0
df.tail()
#    country  inv_count  debit_count  ...         pca_1       pca_2  cluster
# 47      US      21174          733  ... -43847.163578    7.510766      0.0
# 48      US     137186           51  ...  72165.447415  -41.127238      1.0
# 49      US      64757           33  ...   -263.822921 -162.163828      2.0
# 50      US      32629           12  ... -32391.736467 -100.762044      0.0
# 51      US      47240           12  ... -17781.137860 -346.435270      2.0