处理 kmodes 中的缺失值(NaNs)
Dealing with Na's (NaNs) in kmodes
我正在处理一个高维数据集:食谱(约 18 万条)× 成分(约 8000 种)。我根据食谱中是否包含某种成分,把每个值编码为二进制。但在使用 KModes 时,如果我用 0 替换 NaN('''data = data.replace(np.nan, 0)'''),最终会得到一个非常密集的簇(由全 0 的取值主导),而其他每个簇中只有一个样本(因为相似度同时基于 1 和 0 计算)。所以问题是:怎样才能让 KModes 不把这些 NaN 考虑在内?
# Cluster the table with k-modes, asking for 20 clusters and "Cao" initialization.
# NOTE(review): `data` is not defined in this snippet; presumably it is the
# binary recipe x ingredient table described above - confirm.
from kmodes.kmodes import KModes
km_cao = KModes(n_clusters=20, init = "Cao", n_init = 1, verbose=1)
# Fit on `data` and keep the result (presumably one cluster label per row).
fitClusters_cao = km_cao.fit_predict(data)
# Bare expression: only displays the value when run in a notebook/REPL.
fitClusters_cao
示例:
import pandas as pd
import numpy as np
# Toy data in long format: one row per (recipe, ingredient) pair.
data_as_dict = {'recipe_id': {0: 424415, 1: 424415, 2: 424415, 3: 424415, 4: 424415, 5: 146223, 6: 146223, 7: 146223, 8: 146223, 9: 146223, 10: 146223, 11: 146223, 12: 146223, 13: 146223, 14: 146223, 15: 146223, 16: 146223, 17: 312329, 18: 312329, 19: 312329}, 'ingredient_ids': {0: 389, 1: 7655, 2: 6270, 3: 1527, 4: 3406, 5: 2683, 6: 4969, 7: 800, 8: 5298, 9: 840, 10: 2499, 11: 6632, 12: 7022, 13: 1511, 14: 3248, 15: 4964, 16: 6270, 17: 1257, 18: 7655, 19: 6270}}
df = pd.DataFrame.from_dict(data_as_dict)

# Number of times each ingredient occurs over all recipes. `transform` keeps
# the result aligned with the original rows, so it can be stored as a column
# (a plain groupby().count() has one row per ingredient and would not align).
df['counts'] = df.groupby('ingredient_ids')['ingredient_ids'].transform('count')

# Presence flag for every (recipe, ingredient) pair. `assign` returns a new
# frame, so no value is ever set on a slice of `df`.
data_exploded = df[['recipe_id', 'ingredient_ids']].assign(count=1)

# Wide recipe x ingredient table: 1 where the recipe contains the ingredient;
# combinations that never occur come out of the pivot as NaN and become 0.
data_exploded = data_exploded.pivot_table(values='count', index='recipe_id', columns='ingredient_ids')
data_exploded = data_exploded.fillna(0)
# Cluster the recipe x ingredient table built above with k-modes ("Cao" init).
# NOTE(review): the toy example above contains only 3 recipes, fewer than
# n_clusters=20 - kmodes may reject this; lower n_clusters for the toy data
# or run on the full dataset. Confirm against the kmodes documentation.
from kmodes.kmodes import KModes
km_cao = KModes(n_clusters=20, init = "Cao", n_init = 1, verbose=1)
# Fit on data_exploded and keep the result (presumably one label per recipe).
fitClusters_cao = km_cao.fit_predict(data_exploded)
# Bare expression: only displays the value when run in a notebook/REPL.
fitClusters_cao
解决办法是把所有值都转换为字符串(kmodes 显然可以处理字符串)。具体做法是:在调用 pivot_table() 时设置 fill_value = '';如果使用的是二进制数据,还要把 1(和 0)也转换为字符串。
'''
# Assumes data_exploded is still the long-format frame with the columns
# recipe_id, ingredient_ids and counts (i.e. before any pivoting) - confirm.
# Mark every (recipe, ingredient) pair with the *string* '1' so that kmodes
# sees categorical values rather than numbers.
data_exploded['count'] = '1'
data_exploded = data_exploded.drop('counts', axis=1)

# fill_value='' leaves absent ingredients as empty strings, so no NaN -> 0
# replacement is needed afterwards. aggfunc='first' keeps a single '1' even
# when a (recipe, ingredient) pair is listed twice; 'sum' would concatenate
# the strings into '11' and create a spurious extra category.
data_exploded = data_exploded.pivot_table(index='recipe_id', columns='ingredient_ids', values='count', fill_value='', aggfunc='first')
data_exploded

from kmodes.kmodes import KModes
km_cao = KModes(n_clusters=25, init="Cao", n_init=1, verbose=1)
fitClusters_cao = km_cao.fit_predict(data_exploded)
fitClusters_cao
'''
我正在处理一个高维数据集:食谱(约 18 万条)× 成分(约 8000 种)。我根据食谱中是否包含某种成分,把每个值编码为二进制。但在使用 KModes 时,如果我用 0 替换 NaN('''data = data.replace(np.nan, 0)'''),最终会得到一个非常密集的簇(由全 0 的取值主导),而其他每个簇中只有一个样本(因为相似度同时基于 1 和 0 计算)。所以问题是:怎样才能让 KModes 不把这些 NaN 考虑在内?
# Cluster the table with k-modes, asking for 20 clusters and "Cao" initialization.
# NOTE(review): `data` is not defined in this snippet; presumably it is the
# binary recipe x ingredient table described above - confirm.
from kmodes.kmodes import KModes
km_cao = KModes(n_clusters=20, init = "Cao", n_init = 1, verbose=1)
# Fit on `data` and keep the result (presumably one cluster label per row).
fitClusters_cao = km_cao.fit_predict(data)
# Bare expression: only displays the value when run in a notebook/REPL.
fitClusters_cao
示例:
import pandas as pd
import numpy as np
# Toy data in long format: one row per (recipe, ingredient) pair.
data_as_dict = {'recipe_id': {0: 424415, 1: 424415, 2: 424415, 3: 424415, 4: 424415, 5: 146223, 6: 146223, 7: 146223, 8: 146223, 9: 146223, 10: 146223, 11: 146223, 12: 146223, 13: 146223, 14: 146223, 15: 146223, 16: 146223, 17: 312329, 18: 312329, 19: 312329}, 'ingredient_ids': {0: 389, 1: 7655, 2: 6270, 3: 1527, 4: 3406, 5: 2683, 6: 4969, 7: 800, 8: 5298, 9: 840, 10: 2499, 11: 6632, 12: 7022, 13: 1511, 14: 3248, 15: 4964, 16: 6270, 17: 1257, 18: 7655, 19: 6270}}
df = pd.DataFrame.from_dict(data_as_dict)

# Number of times each ingredient occurs over all recipes. `transform` keeps
# the result aligned with the original rows, so it can be stored as a column
# (a plain groupby().count() has one row per ingredient and would not align).
df['counts'] = df.groupby('ingredient_ids')['ingredient_ids'].transform('count')

# Presence flag for every (recipe, ingredient) pair. `assign` returns a new
# frame, so no value is ever set on a slice of `df`.
data_exploded = df[['recipe_id', 'ingredient_ids']].assign(count=1)

# Wide recipe x ingredient table: 1 where the recipe contains the ingredient;
# combinations that never occur come out of the pivot as NaN and become 0.
data_exploded = data_exploded.pivot_table(values='count', index='recipe_id', columns='ingredient_ids')
data_exploded = data_exploded.fillna(0)
# Cluster the recipe x ingredient table built above with k-modes ("Cao" init).
# NOTE(review): the toy example above contains only 3 recipes, fewer than
# n_clusters=20 - kmodes may reject this; lower n_clusters for the toy data
# or run on the full dataset. Confirm against the kmodes documentation.
from kmodes.kmodes import KModes
km_cao = KModes(n_clusters=20, init = "Cao", n_init = 1, verbose=1)
# Fit on data_exploded and keep the result (presumably one label per recipe).
fitClusters_cao = km_cao.fit_predict(data_exploded)
# Bare expression: only displays the value when run in a notebook/REPL.
fitClusters_cao
解决办法是把所有值都转换为字符串(kmodes 显然可以处理字符串)。具体做法是:在调用 pivot_table() 时设置 fill_value = '';如果使用的是二进制数据,还要把 1(和 0)也转换为字符串。
'''
# Assumes data_exploded is still the long-format frame with the columns
# recipe_id, ingredient_ids and counts (i.e. before any pivoting) - confirm.
# Mark every (recipe, ingredient) pair with the *string* '1' so that kmodes
# sees categorical values rather than numbers.
data_exploded['count'] = '1'
data_exploded = data_exploded.drop('counts', axis=1)

# fill_value='' leaves absent ingredients as empty strings, so no NaN -> 0
# replacement is needed afterwards. aggfunc='first' keeps a single '1' even
# when a (recipe, ingredient) pair is listed twice; 'sum' would concatenate
# the strings into '11' and create a spurious extra category.
data_exploded = data_exploded.pivot_table(index='recipe_id', columns='ingredient_ids', values='count', fill_value='', aggfunc='first')
data_exploded

from kmodes.kmodes import KModes
km_cao = KModes(n_clusters=25, init="Cao", n_init=1, verbose=1)
fitClusters_cao = km_cao.fit_predict(data_exploded)
fitClusters_cao
'''