如何对命名元组列表中的数据进行分组
How to group data from a list of namedtuples
在python中,我在内存中的namedtuple列表中有以下数据:
from collections import namedtuple
perfvalues=[]
perfitem = namedtuple('perfitem', 'cluster host database diskgroup disk read_bytes_per_sec write_bytes_per_sec avg_ms_per_read avg_ms_per_write')
item1=perfitem('cluster1', 'host1', 'database1', 'dg_data1', 'disk1', 650000, 500000, 1.2, 0.9)
item2=perfitem('cluster1', 'host1', 'database1', 'dg_data1', 'disk2', 630000, 480000, 1.1, 1)
item3=perfitem('cluster1', 'host1', 'database1', 'dg_data2', 'disk1', 730000, 250000, 0.4, 0.7)
item4=perfitem('cluster1', 'host1', 'database2', 'dg_data1', 'disk1', 320000, 400000, 1, 0.4)
item5=perfitem('cluster1', 'host1', 'database2', 'dg_data2', 'disk1', 550000, 300000, 0.8, 0.8)
item6=perfitem('cluster1', 'host2', 'database3', 'dg_data1', 'disk1', 420000, 310000, 1.2, 0.7)
item7=perfitem('cluster1', 'host2', 'database3', 'dg_data2', 'disk1', 880000, 280000, 0.4, 0.6)
item8=perfitem('cluster1', 'host2', 'database4', 'dg_data5', 'disk1', 440000, 600000, 1, 0.5)
item9=perfitem('cluster1', 'host2', 'database4', 'dg_data2', 'disk1', 490000, 450000, 1, 0.4)
item10=perfitem('cluster1', 'host2', 'database4', 'dg_data2', 'disk2', 410000, 450000, 1, 0.4)
perfvalues.append(item1)
perfvalues.append(item2)
perfvalues.append(item3)
perfvalues.append(item4)
perfvalues.append(item5)
perfvalues.append(item6)
perfvalues.append(item7)
perfvalues.append(item8)
perfvalues.append(item9)
print(perfvalues)
我想按以下方式对数据进行分组:
- 集群
- 集群和主机
- 集群、主机和数据库
- 集群、主机、数据库和磁盘组
我不需要磁盘详细信息。
我想在每个组中:
- 将 read_bytes_per_sec 和 write_bytes_per_sec
的值相加
- 计算 avg_ms_per_read 和 avg_ms_per_write
的平均值
因此,我想要一个包含以下内容的列表:
group='per_diskgroup', cluster='cluster1', host='host1', db='database1', dg='dg_data1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host1', db='database1', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host1', db='database2', dg='dg_data1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host1', db='database2', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database3', dg='dg_data1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database3', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database4', dg='dg_data5', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database4', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host1', db='database1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host1', db='database2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host2', db='database3', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host2', db='database4', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_host', cluster='cluster1', host='host1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_host', cluster='cluster1', host='host2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_cluster', cluster='cluster1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
我猜结果可能是字典列表或命名元组列表。
最终列表中的顺序无关紧要。
我正在使用 python 3.8.
有什么想法吗?
谢谢
这些是 pandas 数据框,如果您需要 python 字典版本,您可以通过将 .to_dict()
添加到 per_diskgroup
、per_database
来获得它, per_host
、per_cluster
。例如:per_database.to_dict()
In [2]: l = ["cluster1,host1,database1,dg_data1,disk1,650000,500000,1.2,0.9",
...: "cluster1,host1,database1,dg_data1,disk2,630000,480000,1.1,1",
...: "cluster1,host1,database1,dg_data2,disk1,730000,250000,0.4,0.7",
...: "cluster1,host1,database2,dg_data1,disk1,320000,400000,1,0.4",
...: "cluster1,host1,database2,dg_data2,disk1,550000,300000,0.8,0.8",
...: "cluster1,host2,database3,dg_data1,disk1,420000,310000,1.2,0.7",
...: "cluster1,host2,database3,dg_data2,disk1,880000,280000,0.4,0.6",
...: "cluster1,host2,database4,dg_data5,disk1,440000,600000,1,0.5",
...: "cluster1,host2,database4,dg_data2,disk1,490000,450000,1,0.4",
...: "cluster1,host2,database4,dg_data2,disk2,410000,450000,1,0.4"]
...: l = [i.split(",") for i in l]
...: df = pd.DataFrame(l, columns="cluster host database diskgroup disk read_bytes_per_sec write_bytes_per_sec avg_ms_per_read avg_ms_per_write
...: ".split())
...: cols = ['read_bytes_per_sec', 'write_bytes_per_sec', 'avg_ms_per_read','avg_ms_per_write']
...: df[cols] = df[cols].astype(float)
...: df
Out[2]:
cluster host database diskgroup disk read_bytes_per_sec write_bytes_per_sec avg_ms_per_read avg_ms_per_write
0 cluster1 host1 database1 dg_data1 disk1 650000.0 500000.0 1.2 0.9
1 cluster1 host1 database1 dg_data1 disk2 630000.0 480000.0 1.1 1.0
2 cluster1 host1 database1 dg_data2 disk1 730000.0 250000.0 0.4 0.7
3 cluster1 host1 database2 dg_data1 disk1 320000.0 400000.0 1.0 0.4
4 cluster1 host1 database2 dg_data2 disk1 550000.0 300000.0 0.8 0.8
5 cluster1 host2 database3 dg_data1 disk1 420000.0 310000.0 1.2 0.7
6 cluster1 host2 database3 dg_data2 disk1 880000.0 280000.0 0.4 0.6
7 cluster1 host2 database4 dg_data5 disk1 440000.0 600000.0 1.0 0.5
8 cluster1 host2 database4 dg_data2 disk1 490000.0 450000.0 1.0 0.4
9 cluster1 host2 database4 dg_data2 disk2 410000.0 450000.0 1.0 0.4
In [3]: aggregation = {"read_bytes_per_sec" : sum, "avg_ms_per_read" : np.mean, "avg_ms_per_write" : np.mean}
...: per_diskgroup = df.groupby(['disk', 'cluster', 'host', 'database', 'diskgroup']).agg(aggregation)
...: per_database = df.groupby (['cluster', 'host', 'database']).agg(aggregation)
...: per_host = df.groupby (['host', 'cluster']).agg(aggregation)
...: per_cluster = df.groupby (['cluster']).agg(aggregation)
In [4]: per_diskgroup
Out[4]:
read_bytes_per_sec avg_ms_per_read avg_ms_per_write
disk cluster host database diskgroup
disk1 cluster1 host1 database1 dg_data1 650000.0 1.2 0.9
dg_data2 730000.0 0.4 0.7
database2 dg_data1 320000.0 1.0 0.4
dg_data2 550000.0 0.8 0.8
host2 database3 dg_data1 420000.0 1.2 0.7
dg_data2 880000.0 0.4 0.6
database4 dg_data2 490000.0 1.0 0.4
dg_data5 440000.0 1.0 0.5
disk2 cluster1 host1 database1 dg_data1 630000.0 1.1 1.0
host2 database4 dg_data2 410000.0 1.0 0.4
In [5]: per_database
Out[5]:
read_bytes_per_sec avg_ms_per_read avg_ms_per_write
cluster host database
cluster1 host1 database1 2010000.0 0.9 0.866667
database2 870000.0 0.9 0.600000
host2 database3 1300000.0 0.8 0.650000
database4 1340000.0 1.0 0.433333
In [6]: per_host
Out[6]:
read_bytes_per_sec avg_ms_per_read avg_ms_per_write
host cluster
host1 cluster1 2880000.0 0.90 0.76
host2 cluster1 2640000.0 0.92 0.52
In [7]: per_cluster
Out[7]:
read_bytes_per_sec avg_ms_per_read avg_ms_per_write
cluster
cluster1 5520000.0 0.91 0.64
在python中,我在内存中的namedtuple列表中有以下数据:
from collections import namedtuple
perfvalues=[]
perfitem = namedtuple('perfitem', 'cluster host database diskgroup disk read_bytes_per_sec write_bytes_per_sec avg_ms_per_read avg_ms_per_write')
item1=perfitem('cluster1', 'host1', 'database1', 'dg_data1', 'disk1', 650000, 500000, 1.2, 0.9)
item2=perfitem('cluster1', 'host1', 'database1', 'dg_data1', 'disk2', 630000, 480000, 1.1, 1)
item3=perfitem('cluster1', 'host1', 'database1', 'dg_data2', 'disk1', 730000, 250000, 0.4, 0.7)
item4=perfitem('cluster1', 'host1', 'database2', 'dg_data1', 'disk1', 320000, 400000, 1, 0.4)
item5=perfitem('cluster1', 'host1', 'database2', 'dg_data2', 'disk1', 550000, 300000, 0.8, 0.8)
item6=perfitem('cluster1', 'host2', 'database3', 'dg_data1', 'disk1', 420000, 310000, 1.2, 0.7)
item7=perfitem('cluster1', 'host2', 'database3', 'dg_data2', 'disk1', 880000, 280000, 0.4, 0.6)
item8=perfitem('cluster1', 'host2', 'database4', 'dg_data5', 'disk1', 440000, 600000, 1, 0.5)
item9=perfitem('cluster1', 'host2', 'database4', 'dg_data2', 'disk1', 490000, 450000, 1, 0.4)
item10=perfitem('cluster1', 'host2', 'database4', 'dg_data2', 'disk2', 410000, 450000, 1, 0.4)
perfvalues.append(item1)
perfvalues.append(item2)
perfvalues.append(item3)
perfvalues.append(item4)
perfvalues.append(item5)
perfvalues.append(item6)
perfvalues.append(item7)
perfvalues.append(item8)
perfvalues.append(item9)
print(perfvalues)
我想按以下方式对数据进行分组:
- 集群
- 集群和主机
- 集群、主机和数据库
- 集群、主机、数据库和磁盘组
我不需要磁盘详细信息。
我想在每个组中:
- 将 read_bytes_per_sec 和 write_bytes_per_sec 的值相加
- 计算 avg_ms_per_read 和 avg_ms_per_write 的平均值
因此,我想要一个包含以下内容的列表:
group='per_diskgroup', cluster='cluster1', host='host1', db='database1', dg='dg_data1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host1', db='database1', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host1', db='database2', dg='dg_data1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host1', db='database2', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database3', dg='dg_data1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database3', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database4', dg='dg_data5', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database4', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host1', db='database1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host1', db='database2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host2', db='database3', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host2', db='database4', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_host', cluster='cluster1', host='host1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_host', cluster='cluster1', host='host2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_cluster', cluster='cluster1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
我猜结果可能是字典列表或命名元组列表。 最终列表中的顺序无关紧要。 我正在使用 python 3.8.
有什么想法吗? 谢谢
这些是 pandas 数据框,如果您需要 python 字典版本,您可以通过将 .to_dict()
添加到 per_diskgroup
、per_database
来获得它, per_host
、per_cluster
。例如:per_database.to_dict()
In [2]: l = ["cluster1,host1,database1,dg_data1,disk1,650000,500000,1.2,0.9",
...: "cluster1,host1,database1,dg_data1,disk2,630000,480000,1.1,1",
...: "cluster1,host1,database1,dg_data2,disk1,730000,250000,0.4,0.7",
...: "cluster1,host1,database2,dg_data1,disk1,320000,400000,1,0.4",
...: "cluster1,host1,database2,dg_data2,disk1,550000,300000,0.8,0.8",
...: "cluster1,host2,database3,dg_data1,disk1,420000,310000,1.2,0.7",
...: "cluster1,host2,database3,dg_data2,disk1,880000,280000,0.4,0.6",
...: "cluster1,host2,database4,dg_data5,disk1,440000,600000,1,0.5",
...: "cluster1,host2,database4,dg_data2,disk1,490000,450000,1,0.4",
...: "cluster1,host2,database4,dg_data2,disk2,410000,450000,1,0.4"]
...: l = [i.split(",") for i in l]
...: df = pd.DataFrame(l, columns="cluster host database diskgroup disk read_bytes_per_sec write_bytes_per_sec avg_ms_per_read avg_ms_per_write
...: ".split())
...: cols = ['read_bytes_per_sec', 'write_bytes_per_sec', 'avg_ms_per_read','avg_ms_per_write']
...: df[cols] = df[cols].astype(float)
...: df
Out[2]:
cluster host database diskgroup disk read_bytes_per_sec write_bytes_per_sec avg_ms_per_read avg_ms_per_write
0 cluster1 host1 database1 dg_data1 disk1 650000.0 500000.0 1.2 0.9
1 cluster1 host1 database1 dg_data1 disk2 630000.0 480000.0 1.1 1.0
2 cluster1 host1 database1 dg_data2 disk1 730000.0 250000.0 0.4 0.7
3 cluster1 host1 database2 dg_data1 disk1 320000.0 400000.0 1.0 0.4
4 cluster1 host1 database2 dg_data2 disk1 550000.0 300000.0 0.8 0.8
5 cluster1 host2 database3 dg_data1 disk1 420000.0 310000.0 1.2 0.7
6 cluster1 host2 database3 dg_data2 disk1 880000.0 280000.0 0.4 0.6
7 cluster1 host2 database4 dg_data5 disk1 440000.0 600000.0 1.0 0.5
8 cluster1 host2 database4 dg_data2 disk1 490000.0 450000.0 1.0 0.4
9 cluster1 host2 database4 dg_data2 disk2 410000.0 450000.0 1.0 0.4
In [3]: aggregation = {"read_bytes_per_sec" : sum, "avg_ms_per_read" : np.mean, "avg_ms_per_write" : np.mean}
...: per_diskgroup = df.groupby(['disk', 'cluster', 'host', 'database', 'diskgroup']).agg(aggregation)
...: per_database = df.groupby (['cluster', 'host', 'database']).agg(aggregation)
...: per_host = df.groupby (['host', 'cluster']).agg(aggregation)
...: per_cluster = df.groupby (['cluster']).agg(aggregation)
In [4]: per_diskgroup
Out[4]:
read_bytes_per_sec avg_ms_per_read avg_ms_per_write
disk cluster host database diskgroup
disk1 cluster1 host1 database1 dg_data1 650000.0 1.2 0.9
dg_data2 730000.0 0.4 0.7
database2 dg_data1 320000.0 1.0 0.4
dg_data2 550000.0 0.8 0.8
host2 database3 dg_data1 420000.0 1.2 0.7
dg_data2 880000.0 0.4 0.6
database4 dg_data2 490000.0 1.0 0.4
dg_data5 440000.0 1.0 0.5
disk2 cluster1 host1 database1 dg_data1 630000.0 1.1 1.0
host2 database4 dg_data2 410000.0 1.0 0.4
In [5]: per_database
Out[5]:
read_bytes_per_sec avg_ms_per_read avg_ms_per_write
cluster host database
cluster1 host1 database1 2010000.0 0.9 0.866667
database2 870000.0 0.9 0.600000
host2 database3 1300000.0 0.8 0.650000
database4 1340000.0 1.0 0.433333
In [6]: per_host
Out[6]:
read_bytes_per_sec avg_ms_per_read avg_ms_per_write
host cluster
host1 cluster1 2880000.0 0.90 0.76
host2 cluster1 2640000.0 0.92 0.52
In [7]: per_cluster
Out[7]:
read_bytes_per_sec avg_ms_per_read avg_ms_per_write
cluster
cluster1 5520000.0 0.91 0.64