如何构建循环,在 pandas 中为不同分组运行 PCA 和 K 均值

How to build a loop for running PCA and K-means for different segments in pandas

我想创建一个循环,为不同国家分别运行 PCA 和 K 均值。目前我只能为每个国家单独计算,然后再合并或连接结果。我希望以一种简单的方式,在一次迭代中为所有国家/地区完成此操作,以便节省其中涉及的代码行数和计算量。

下面是美国客户的代码。

import pandas as pd
import numpy as np

# Initialise the raw data: 16 Canadian, 15 Japanese and 21 US customers.
countries = ["Canada"] * 16 + ["Japan"] * 15 + ["US"] * 21
credit_counts = [48, 699, 290, 570, 307, 632, 161, 740, 203, 268, 391, 647,
                 542, 372, 129, 756, 89, 454, 907, 110, 962, 539, 371, 997,
                 463, 910, 50, 422, 662, 545, 318, 909, 209, 635, 614, 298,
                 978, 884, 829, 342, 987, 98, 201, 843, 799, 781, 738, 568,
                 663, 489, 493, 337]
data = {
    'country': countries,
    'Inv_count': [91064, 49190, 120584, 27348, 107865, 58944, 34278, 47936,
                  19229, 18716, 34958, 68670, 86187, 111691, 4583, 99969,
                  58868, 137728, 61817, 89282, 109959, 94515, 64524, 39598,
                  34791, 113512, 89053, 113504, 97454, 74338, 127219, 134734,
                  2776, 74876, 93921, 96265, 5354, 114885, 58670, 103276,
                  27973, 13052, 60989, 27008, 52628, 139029, 57010, 21174,
                  137186, 64757, 32629, 47240],
    'debit_count': [5, 8, 16, 2, 17, 2, 16, 28, 1, 7, 1, 2, 9, 5, 9, 129,
                    4, 5, 19, 12, 7, 8, 2, 2, 5, 1, 1, 34, 8, 27, 11, 11,
                    1, 12, 8, 4, 1, 8, 55, 1, 67, 1, 251, 44, 1, 1, 2, 733,
                    51, 33, 12, 12],
    'credit_count': credit_counts,
    # Earlycount duplicates credit_count in the original data set.
    'Earlycount': list(credit_counts),
}

# Create the DataFrame.
df = pd.DataFrame(data)

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Keep only the US customers and drop the non-numeric 'country' column.
us_numeric = df[df['country'] == 'US'].select_dtypes(exclude=[object])

# Project the numeric features onto the first two principal components.
principalDf = pd.DataFrame(data=PCA(n_components=2).fit_transform(us_numeric),
                           columns=['p1', 'p2'])

# Cluster the component scores into three groups (seeded for repeatability).
np.random.seed(131)
y_predicted = KMeans(n_clusters=3, random_state=3425).fit_predict(principalDf)
  • 构建您的代码,使您的计算返回单个国家/地区的数据框
  • 然后只需简单地使用 groupby 加 apply 即可
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Initialise the input data: 16 Canada, 15 Japan and 21 US customers.
country_labels = ["Canada"] * 16 + ["Japan"] * 15 + ["US"] * 21
shared_counts = [48, 699, 290, 570, 307, 632, 161, 740, 203, 268, 391, 647,
                 542, 372, 129, 756, 89, 454, 907, 110, 962, 539, 371, 997,
                 463, 910, 50, 422, 662, 545, 318, 909, 209, 635, 614, 298,
                 978, 884, 829, 342, 987, 98, 201, 843, 799, 781, 738, 568,
                 663, 489, 493, 337]
data = {'country': country_labels,
        'Inv_count': [91064, 49190, 120584, 27348, 107865, 58944, 34278,
                      47936, 19229, 18716, 34958, 68670, 86187, 111691,
                      4583, 99969, 58868, 137728, 61817, 89282, 109959,
                      94515, 64524, 39598, 34791, 113512, 89053, 113504,
                      97454, 74338, 127219, 134734, 2776, 74876, 93921,
                      96265, 5354, 114885, 58670, 103276, 27973, 13052,
                      60989, 27008, 52628, 139029, 57010, 21174, 137186,
                      64757, 32629, 47240],
        'debit_count': [5, 8, 16, 2, 17, 2, 16, 28, 1, 7, 1, 2, 9, 5, 9,
                        129, 4, 5, 19, 12, 7, 8, 2, 2, 5, 1, 1, 34, 8, 27,
                        11, 11, 1, 12, 8, 4, 1, 8, 55, 1, 67, 1, 251, 44,
                        1, 1, 2, 733, 51, 33, 12, 12],
        # credit_count and Earlycount carry identical values in this data.
        'credit_count': shared_counts,
        'Earlycount': list(shared_counts)}

# Build the DataFrame from the column dictionary.
df = pd.DataFrame(data)

def pcakmeanscalc(country_df):
    """Run 2-component PCA and 3-cluster K-means on one country's rows.

    Returns a single-row DataFrame whose cells hold the per-customer
    arrays: columns "p1"/"p2" (PCA scores) and "kmeans" (cluster labels).
    """
    numeric = country_df.select_dtypes(exclude=[object])
    components = PCA(n_components=2).fit_transform(numeric)
    comp_frame = pd.DataFrame(data=components, columns=['p1', 'p2'])

    np.random.seed(131)
    labels = KMeans(n_clusters=3, random_state=3425).fit_predict(comp_frame)

    # One row per country; each cell stores the whole per-customer array.
    return pd.DataFrame([{"p1": components[:, 0],
                          "p2": components[:, 1],
                          "kmeans": labels}])


# Apply per country, then drop the inner (per-group positional) index level.
df.groupby("country").apply(pcakmeanscalc).droplevel(1)

输出

    p1  p2  kmeans
country         
Canada  [29737.58741546159, -12135.271593139332, 59257...   [565.2987589576035, -405.69927102369314, 258.6...   [1, 2, 1, 0, 1, 2, 0, 2, 0, 0, 0, 2, 1, 1, 0, 1]
Japan   [28208.93539830322, -50650.59428312244, 25260....   [633.5574877384774, 49.929437144969306, -525.7...   [1, 0, 1, 2, 0, 2, 1, 1, 1, 0, 2, 0, 2, 2, 0]
US  [69714.07771102455, -62245.354747757076, 9855....   [311.8510307588487, -450.68732626950384, 27.36...   [1, 2, 0, 1, 1, 2, 1, 0, 1, 2, 2, 0, 2, 0, 1, ...
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Silence pandas' SettingWithCopyWarning raised by the in-place edits below.
# Use the full option name 'mode.chained_assignment': abbreviated names such
# as 'chained_assignment' only resolve while they happen to match exactly one
# registered option, so they can break across pandas versions.
pd.set_option('mode.chained_assignment', None)
np.random.seed(100)

# Define the input frame: 16 Canada, 15 Japan and 21 US customers.
labels = ['Canada'] * 16 + ['Japan'] * 15 + ['US'] * 21
counts = [48, 699, 290, 570, 307, 632, 161, 740, 203, 268, 391, 647, 542,
          372, 129, 756, 89, 454, 907, 110, 962, 539, 371, 997, 463, 910,
          50, 422, 662, 545, 318, 909, 209, 635, 614, 298, 978, 884, 829,
          342, 987, 98, 201, 843, 799, 781, 738, 568, 663, 489, 493, 337]
df = pd.DataFrame({
    'country': labels,
    'inv_count': [91064, 49190, 120584, 27348, 107865, 58944, 34278, 47936,
                  19229, 18716, 34958, 68670, 86187, 111691, 4583, 99969,
                  58868, 137728, 61817, 89282, 109959, 94515, 64524, 39598,
                  34791, 113512, 89053, 113504, 97454, 74338, 127219, 134734,
                  2776, 74876, 93921, 96265, 5354, 114885, 58670, 103276,
                  27973, 13052, 60989, 27008, 52628, 139029, 57010, 21174,
                  137186, 64757, 32629, 47240],
    'debit_count': [5, 8, 16, 2, 17, 2, 16, 28, 1, 7, 1, 2, 9, 5, 9, 129,
                    4, 5, 19, 12, 7, 8, 2, 2, 5, 1, 1, 34, 8, 27, 11, 11,
                    1, 12, 8, 4, 1, 8, 55, 1, 67, 1, 251, 44, 1, 1, 2, 733,
                    51, 33, 12, 12],
    # credit_count and early_count are identical in the source data.
    'credit_count': counts,
    'early_count': list(counts),
})

# Add 2 columns for storing the PCA values.
df['pca_1'] = np.nan
df['pca_2'] = np.nan

# Add 1 column for storing the cluster labels.
df['cluster'] = np.nan

# Loop across the different countries.
for country in df['country'].unique():

    # Compute the row mask once per country instead of re-evaluating the
    # boolean comparison for every subsequent read and write below.
    mask = df['country'] == country

    # Build this country's feature matrix with a non-inplace method chain.
    # The original code called drop(..., inplace=True) on the slice
    # df.loc[...], which is the chained-assignment pattern that the
    # (suppressed) SettingWithCopyWarning exists to flag; chaining
    # non-inplace drop/reset_index produces a fresh frame and is safe.
    data = (df.loc[mask]
              .drop(labels=['country', 'pca_1', 'pca_2', 'cluster'], axis=1)
              .reset_index(drop=True))

    # Extract the first two principal components.
    pca = PCA(n_components=2).fit_transform(data)

    # Extract the cluster labels from the component scores.
    clusters = KMeans(n_clusters=3, random_state=123).fit_predict(pca)

    # Save the results back into the master frame.
    df.loc[mask, 'pca_1'] = pca[:, 0]
    df.loc[mask, 'pca_2'] = pca[:, 1]
    df.loc[mask, 'cluster'] = clusters

df.head()
#   country  inv_count  debit_count  ...         pca_1       pca_2  cluster
# 0  Canada      91064            5  ...  29737.587415  565.298759      0.0
# 1  Canada      49190            8  ... -12135.271593 -405.699271      2.0
# 2  Canada     120584           16  ...  59257.979929  258.659878      0.0
# 3  Canada      27348            2  ... -33977.476225 -249.638903      1.0
# 4  Canada     107865           17  ...  46539.018792  219.183397      0.0

df.tail()
#    country  inv_count  debit_count  ...         pca_1       pca_2  cluster
# 47      US      21174          733  ... -43847.163578    7.510766      0.0
# 48      US     137186           51  ...  72165.447415  -41.127238      1.0
# 49      US      64757           33  ...   -263.822921 -162.163828      2.0
# 50      US      32629           12  ... -32391.736467 -100.762044      0.0
# 51      US      47240           12  ... -17781.137860 -346.435270      2.0