尝试一个函数来获取不同范围内的数据,对其进行计数,然后 return 一个计数列表
Trying a function to get data in separate ranges, tally it, and return a list of the tallies
我一直在尝试编写一个函数,从字典中获取一个值,检查它的范围,然后在它的范围内计算它。返回统计值列表。
所以给定以下字典:
# Sample records keyed by person id; every field, salary included, is a string.
data = {
    'P1': {
        'age': 'eighty two',
        'salary': '96.0',
        'suburb': 'Toorak',
        'language': 'English',
    },
    'P2': {
        'age': '49',
        'salary': '14.0',
        'suburb': 'St. Kilda',
        'language': 'Chinese',
    },
    'P3': {
        'age': '54',
        'salary': '75.0',
        'suburb': 'Neverland',
        'language': 'Italian',
    },
}
以及函数代码:
import math


def wealth_distribution(data, n_bins, max_salary):
    """Attempt to tally salaries into ``n_bins`` ranges (known-buggy version).

    This is the code from the question with its indentation restored so that
    it runs and reproduces the output the question reports. The logic defects
    are deliberately left in place because they are what is being asked about.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            (a numeric string, or ``None`` to skip the record).
        n_bins: Number of salary ranges wanted.
        max_salary: Upper end of the salary scale.

    Returns:
        A list of running counts, not the intended per-bin tallies. For the
        sample data with ``n_bins=5`` and ``max_salary=100`` this is
        ``[0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6]``.
    """
    count = 0
    sal_list = []
    bin_list = []
    bin_width = int(max_salary/n_bins)
    for bins in range(0, max_salary+1, bin_width):
        bin_list.append(bins)
        # BUG: this loop is nested inside the bin-building loop, so every
        # record is re-scanned once per bin edge.
        for val in data.values():
            if val['salary'] == None:
                continue
            for n in bin_list:
                # BUG: n is already a bin edge, so scaling it by bin_width
                # again tests the wrong range.
                if math.floor(n*bin_width)<=float(val['salary'])<math.floor((n+1)*bin_width):
                    count+= 1
            # BUG: a single running total is appended per record instead of
            # keeping one counter per bin.
            sal_list.append(count)
    return sal_list
给定 n_bins = 5 且 max_salary = 100,所需输出为 [1,0,0,1,1]。
但是函数返回的是 [0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6]。
你的代码有几个问题,我已经一并修复。其中一个主要问题是 if 语句中的数学运算写错了,我也做了修正。这个解决方案不是最高效的,但可以工作。
def wealth_distribution(data, n_bins, max_salary):
    """Count how many salaries fall into each of ``n_bins`` equal-width ranges.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            (a numeric string; ``None`` or an empty value skips the record).
        n_bins: Number of bins; must be positive.
        max_salary: Upper end of the salary scale.

    Returns:
        A list of exactly ``n_bins`` counts. Bin ``i`` covers the half-open
        range ``[i * w, (i + 1) * w)`` with ``w = max_salary / n_bins``; a
        salary equal to ``max_salary`` is counted in the last bin. Salaries
        outside ``[0, max_salary]`` are not counted.
    """
    bin_width = max_salary / n_bins
    # n_bins + 1 edges delimit exactly n_bins ranges. Building the edges with
    # range(0, max_salary + 1, bin_width) added a spurious extra bin.
    edges = [i * bin_width for i in range(n_bins + 1)]
    sal_list = [0] * n_bins
    for val in data.values():
        if not val['salary']:
            continue
        salary = float(val['salary'])
        if salary == max_salary:
            # The top of the scale belongs to the last bin.
            sal_list[-1] += 1
            continue
        for index, (low, high) in enumerate(zip(edges, edges[1:])):
            if low <= salary < high:
                sal_list[index] += 1
                break  # A salary belongs to at most one bin.
    return sal_list
# Run the function on the sample data: 5 bins over salaries up to 100.
print(wealth_distribution(data, 5, 100))
首先,您似乎有一个缩进错误:for val in data.values(): 不应嵌套在 for bins in range(0, max_salary+1, bin_width): 中,这就是为什么你会得到一个更长的值列表。
其次,您的逻辑在几个地方有偏差:您使用了一个只在函数开头置零一次的 count 变量;for n in bin_list: 循环遍历的是 bin_list 中的值(即各个 bin 的边界),但随后又把 n 乘以 bin_width,这没有意义。您可以改用 range(n_bins) 来遍历 bin_list 的索引,如下所示:
import math


def wealth_distribution(data, n_bins, max_salary):
    """Count how many salaries fall into each of ``n_bins`` equal-width ranges.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            (a numeric string, or ``None`` to skip the record).
        n_bins: Number of bins.
        max_salary: Upper end of the salary scale.

    Returns:
        A list of ``n_bins`` counts. Bin ``i`` covers the half-open range
        ``[i * bin_width, (i + 1) * bin_width)`` where ``bin_width`` is
        ``int(max_salary / n_bins)``.
    """
    sal_list = [0] * n_bins
    # bin_list is built but never read by the counting loop below.
    bin_list = []
    bin_width = int(max_salary/n_bins)
    for bins in range(0, max_salary+1, bin_width):
        bin_list.append(bins)
    for val in data.values():
        if val['salary'] is None:
            continue
        for i in range(n_bins):
            if math.floor(i*bin_width) <= float(val['salary']) < math.floor((i+1)*bin_width):
                sal_list[i] += 1
    return sal_list
但仔细观察,bin_list 实际上在这里没有任何作用。函数可以简化为:
def wealth_distribution(data, n_bins, max_salary):
    """Count how many salaries fall into each of ``n_bins`` equal-width ranges.

    The bin index is computed directly from the salary instead of scanning
    the bins.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            (a numeric string, or ``None`` to skip the record).
        n_bins: Number of bins; must be positive.
        max_salary: Upper end of the salary scale.

    Returns:
        A list of ``n_bins`` counts. Salaries at or above ``max_salary`` are
        counted in the last bin and salaries below zero in the first bin.
    """
    sal_list = [0] * n_bins
    bin_width = max_salary / n_bins
    last_bin = n_bins - 1
    for val in data.values():
        if val['salary'] is None:
            continue
        bin_index = int(float(val["salary"]) / bin_width)
        # Clamp to the valid range. Without the lower bound a negative salary
        # yields a negative index, which Python resolves from the END of the
        # list and would silently tally in the wrong bin.
        bin_index = min(max(bin_index, 0), last_bin)
        sal_list[bin_index] += 1
    return sal_list
上面的函数直接计算 bin 索引,而不是遍历各个 bin 或索引。我还删除了 math.floor 调用,因为它们似乎没有必要,而且可能引入小的舍入误差,导致某些薪水无法归入任何 bin。
您可以使用 collections.Counter 进一步简化:
from collections import Counter
def wealth_distribution(data, n_bins, max_salary):
    """Count how many salaries fall into each of ``n_bins`` equal-width ranges.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            (a numeric string, or ``None`` to skip the record).
        n_bins: Number of bins; must be positive.
        max_salary: Upper end of the salary scale.

    Returns:
        A list of ``n_bins`` counts. Salaries at or above ``max_salary`` are
        counted in the last bin; salaries below zero map to a negative bin
        index and are therefore not reported.
    """
    bin_width = max_salary / n_bins
    bins = Counter(
        min(int(float(val["salary"]) // bin_width), n_bins - 1)
        for val in data.values()
        # float(None) would raise TypeError, so skip missing salaries.
        if val["salary"] is not None
    )
    # A Counter returns 0 for absent keys, so empty bins need no special case.
    return [bins[i] for i in range(n_bins)]
numpy 中有一个 histogram 函数也可以执行您想要的操作,并且还会额外返回 bin 边界数组。
import numpy as np
# Convert every record's salary string to a float.
salaries = list(map(float, (record["salary"] for record in data.values())))
# Unpack the two values np.histogram returns; the second is named bin_list.
sal_list, bin_list = np.histogram(salaries, bins=5, range=(0, 100))
如果您想使用 pandas(可能对同一数据的其他操作也有用):
import pandas as pd
def wealth_distribution(data, n_bins, max_salary):
    """Count how many salaries fall into each of ``n_bins`` equal-width ranges.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            holding a numeric string.
        n_bins: Number of bins; must be positive.
        max_salary: Upper end of the salary scale.

    Returns:
        The ``.values`` array of the per-bin counts, one entry per bin index
        in ``range(n_bins)``. Salaries at or above ``max_salary`` are counted
        in the last bin.
    """
    # Transpose so that each person becomes a row and 'salary' a column.
    df = pd.DataFrame(data).transpose()
    bin_width = max_salary / n_bins
    # Floor-divide to get the bin index, then cap it at the last bin.
    df["salary_bin"] = (pd.to_numeric(df["salary"]) // bin_width).clip(upper=n_bins-1)
    counts = df["salary_bin"].value_counts()
    # Reindex so that bins nobody fell into show up as 0 and the order is fixed.
    return counts.reindex(range(n_bins), fill_value=0).values
我不确定您的代码究竟有什么问题,只是它看起来不必要地复杂。
以下是我的做法:
from math import floor
def wealth_distribution(data, n_bins, max_salary):
    """Count how many salaries fall into each of ``n_bins`` equal-width ranges.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            (a numeric string, or ``None`` to skip the record).
        n_bins: Number of bins; must be positive.
        max_salary: Upper end of the salary scale.

    Returns:
        A list of ``n_bins`` counts. Bin ``i`` covers the half-open range
        ``[i * bin_width, (i + 1) * bin_width)`` with
        ``bin_width = max_salary // n_bins``; salaries outside every range
        are not counted.
    """
    sal_list = [0 for _ in range(n_bins)]  # Pre-allocate salary counts.
    bin_width = max_salary // n_bins
    for item in data.values():
        raw_salary = item['salary']
        # Check for a missing salary BEFORE converting: float(None) raises
        # TypeError, so a check made after the conversion can never trigger.
        if raw_salary is None:
            continue
        salary = float(raw_salary)
        for i in range(n_bins):
            low = i * bin_width
            high = low + bin_width
            if low <= salary < high:
                sal_list[i] += 1
                break  # A salary belongs to at most one bin.
    return sal_list
# Sample records keyed by person id; every field, salary included, is a string.
data = {
    'P1': {
        'age': 'eighty two',
        'salary': '96.0',
        'suburb': 'Toorak',
        'language': 'English',
    },
    'P2': {
        'age': '49',
        'salary': '14.0',
        'suburb': 'St. Kilda',
        'language': 'Chinese',
    },
    'P3': {
        'age': '54',
        'salary': '75.0',
        'suburb': 'Neverland',
        'language': 'Italian',
    },
}

# Tally the sample salaries into 5 bins over a scale that tops out at 100.
sal_list = wealth_distribution(data, 5, 100)
print(sal_list)  # -> [1, 0, 0, 1, 1]
import pandas as pd
from pandas import DataFrame
def wealth_distribution(data, n_bins, max_salary):
    """Count how many salaries fall into each of ``n_bins`` equal-width ranges.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            (a numeric string; missing values are skipped).
        n_bins: Number of bins; must be positive.
        max_salary: Upper end of the salary scale.

    Returns:
        A list of ``n_bins`` counts. Bin ``i`` starts at
        ``i * max_salary / n_bins`` (inclusive). Salaries at or above
        ``max_salary`` are counted in the last bin; salaries below zero are
        not counted.
    """
    sal_list = [0] * n_bins
    if not data:
        # An empty frame has no 'salary' row to look up.
        return sal_list
    bin_width = max_salary / n_bins
    # Lower edge of each bin.
    bin_list = [i * bin_width for i in range(n_bins)]
    df = pd.DataFrame(data)
    # Each column of df is one person, so iterate over the whole 'salary' row
    # rather than over len(df), which counts attributes, not people.
    for raw_salary in df.loc['salary']:
        if pd.isna(raw_salary):
            continue
        salary = float(raw_salary)
        # Scan from the highest bin down; the first lower edge that the
        # salary reaches identifies its bin. '>=' keeps a salary that sits
        # exactly on an edge in the bin that starts there.
        for i in range(n_bins - 1, -1, -1):
            if salary >= bin_list[i]:
                sal_list[i] += 1
                break
    return sal_list
我一直在尝试编写一个函数,从字典中获取一个值,检查它的范围,然后在它的范围内计算它。返回统计值列表。
所以给定以下字典:
# Sample records keyed by person id; every field, salary included, is a string.
data = {
    'P1': {
        'age': 'eighty two',
        'salary': '96.0',
        'suburb': 'Toorak',
        'language': 'English',
    },
    'P2': {
        'age': '49',
        'salary': '14.0',
        'suburb': 'St. Kilda',
        'language': 'Chinese',
    },
    'P3': {
        'age': '54',
        'salary': '75.0',
        'suburb': 'Neverland',
        'language': 'Italian',
    },
}
以及函数代码:
import math


def wealth_distribution(data, n_bins, max_salary):
    """Attempt to tally salaries into ``n_bins`` ranges (known-buggy version).

    This is the code from the question with its indentation restored so that
    it runs and reproduces the output the question reports. The logic defects
    are deliberately left in place because they are what is being asked about.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            (a numeric string, or ``None`` to skip the record).
        n_bins: Number of salary ranges wanted.
        max_salary: Upper end of the salary scale.

    Returns:
        A list of running counts, not the intended per-bin tallies. For the
        sample data with ``n_bins=5`` and ``max_salary=100`` this is
        ``[0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6]``.
    """
    count = 0
    sal_list = []
    bin_list = []
    bin_width = int(max_salary/n_bins)
    for bins in range(0, max_salary+1, bin_width):
        bin_list.append(bins)
        # BUG: this loop is nested inside the bin-building loop, so every
        # record is re-scanned once per bin edge.
        for val in data.values():
            if val['salary'] == None:
                continue
            for n in bin_list:
                # BUG: n is already a bin edge, so scaling it by bin_width
                # again tests the wrong range.
                if math.floor(n*bin_width)<=float(val['salary'])<math.floor((n+1)*bin_width):
                    count+= 1
            # BUG: a single running total is appended per record instead of
            # keeping one counter per bin.
            sal_list.append(count)
    return sal_list
给定 n_bins = 5 且 max_salary = 100,所需输出为 [1,0,0,1,1]。
但是函数返回的是 [0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6]。
你的代码有几个问题,我已经一并修复。其中一个主要问题是 if 语句中的数学运算写错了,我也做了修正。这个解决方案不是最高效的,但可以工作。
def wealth_distribution(data, n_bins, max_salary):
    """Count how many salaries fall into each of ``n_bins`` equal-width ranges.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            (a numeric string; ``None`` or an empty value skips the record).
        n_bins: Number of bins; must be positive.
        max_salary: Upper end of the salary scale.

    Returns:
        A list of exactly ``n_bins`` counts. Bin ``i`` covers the half-open
        range ``[i * w, (i + 1) * w)`` with ``w = max_salary / n_bins``; a
        salary equal to ``max_salary`` is counted in the last bin. Salaries
        outside ``[0, max_salary]`` are not counted.
    """
    bin_width = max_salary / n_bins
    # n_bins + 1 edges delimit exactly n_bins ranges. Building the edges with
    # range(0, max_salary + 1, bin_width) added a spurious extra bin.
    edges = [i * bin_width for i in range(n_bins + 1)]
    sal_list = [0] * n_bins
    for val in data.values():
        if not val['salary']:
            continue
        salary = float(val['salary'])
        if salary == max_salary:
            # The top of the scale belongs to the last bin.
            sal_list[-1] += 1
            continue
        for index, (low, high) in enumerate(zip(edges, edges[1:])):
            if low <= salary < high:
                sal_list[index] += 1
                break  # A salary belongs to at most one bin.
    return sal_list
# Run the function on the sample data: 5 bins over salaries up to 100.
print(wealth_distribution(data, 5, 100))
首先,您似乎有一个缩进错误:for val in data.values(): 不应嵌套在 for bins in range(0, max_salary+1, bin_width): 中,这就是为什么你会得到一个更长的值列表。
其次,您的逻辑在几个地方有偏差:您使用了一个只在函数开头置零一次的 count 变量;for n in bin_list: 循环遍历的是 bin_list 中的值(即各个 bin 的边界),但随后又把 n 乘以 bin_width,这没有意义。您可以改用 range(n_bins) 来遍历 bin_list 的索引,如下所示:
import math


def wealth_distribution(data, n_bins, max_salary):
    """Count how many salaries fall into each of ``n_bins`` equal-width ranges.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            (a numeric string, or ``None`` to skip the record).
        n_bins: Number of bins.
        max_salary: Upper end of the salary scale.

    Returns:
        A list of ``n_bins`` counts. Bin ``i`` covers the half-open range
        ``[i * bin_width, (i + 1) * bin_width)`` where ``bin_width`` is
        ``int(max_salary / n_bins)``.
    """
    sal_list = [0] * n_bins
    # bin_list is built but never read by the counting loop below.
    bin_list = []
    bin_width = int(max_salary/n_bins)
    for bins in range(0, max_salary+1, bin_width):
        bin_list.append(bins)
    for val in data.values():
        if val['salary'] is None:
            continue
        for i in range(n_bins):
            if math.floor(i*bin_width) <= float(val['salary']) < math.floor((i+1)*bin_width):
                sal_list[i] += 1
    return sal_list
但仔细观察,bin_list 实际上在这里没有任何作用。函数可以简化为:
def wealth_distribution(data, n_bins, max_salary):
    """Count how many salaries fall into each of ``n_bins`` equal-width ranges.

    The bin index is computed directly from the salary instead of scanning
    the bins.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            (a numeric string, or ``None`` to skip the record).
        n_bins: Number of bins; must be positive.
        max_salary: Upper end of the salary scale.

    Returns:
        A list of ``n_bins`` counts. Salaries at or above ``max_salary`` are
        counted in the last bin and salaries below zero in the first bin.
    """
    sal_list = [0] * n_bins
    bin_width = max_salary / n_bins
    last_bin = n_bins - 1
    for val in data.values():
        if val['salary'] is None:
            continue
        bin_index = int(float(val["salary"]) / bin_width)
        # Clamp to the valid range. Without the lower bound a negative salary
        # yields a negative index, which Python resolves from the END of the
        # list and would silently tally in the wrong bin.
        bin_index = min(max(bin_index, 0), last_bin)
        sal_list[bin_index] += 1
    return sal_list
上面的函数直接计算 bin 索引,而不是遍历各个 bin 或索引。我还删除了 math.floor 调用,因为它们似乎没有必要,而且可能引入小的舍入误差,导致某些薪水无法归入任何 bin。
您可以使用 collections.Counter 进一步简化:
from collections import Counter
def wealth_distribution(data, n_bins, max_salary):
    """Count how many salaries fall into each of ``n_bins`` equal-width ranges.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            (a numeric string, or ``None`` to skip the record).
        n_bins: Number of bins; must be positive.
        max_salary: Upper end of the salary scale.

    Returns:
        A list of ``n_bins`` counts. Salaries at or above ``max_salary`` are
        counted in the last bin; salaries below zero map to a negative bin
        index and are therefore not reported.
    """
    bin_width = max_salary / n_bins
    bins = Counter(
        min(int(float(val["salary"]) // bin_width), n_bins - 1)
        for val in data.values()
        # float(None) would raise TypeError, so skip missing salaries.
        if val["salary"] is not None
    )
    # A Counter returns 0 for absent keys, so empty bins need no special case.
    return [bins[i] for i in range(n_bins)]
numpy 中有一个 histogram 函数也可以执行您想要的操作,并且还会额外返回 bin 边界数组。
import numpy as np
# Convert every record's salary string to a float.
salaries = list(map(float, (record["salary"] for record in data.values())))
# Unpack the two values np.histogram returns; the second is named bin_list.
sal_list, bin_list = np.histogram(salaries, bins=5, range=(0, 100))
如果您想使用 pandas(可能对同一数据的其他操作也有用):
import pandas as pd
def wealth_distribution(data, n_bins, max_salary):
    """Count how many salaries fall into each of ``n_bins`` equal-width ranges.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            holding a numeric string.
        n_bins: Number of bins; must be positive.
        max_salary: Upper end of the salary scale.

    Returns:
        The ``.values`` array of the per-bin counts, one entry per bin index
        in ``range(n_bins)``. Salaries at or above ``max_salary`` are counted
        in the last bin.
    """
    # Transpose so that each person becomes a row and 'salary' a column.
    df = pd.DataFrame(data).transpose()
    bin_width = max_salary / n_bins
    # Floor-divide to get the bin index, then cap it at the last bin.
    df["salary_bin"] = (pd.to_numeric(df["salary"]) // bin_width).clip(upper=n_bins-1)
    counts = df["salary_bin"].value_counts()
    # Reindex so that bins nobody fell into show up as 0 and the order is fixed.
    return counts.reindex(range(n_bins), fill_value=0).values
我不确定您的代码究竟有什么问题,只是它看起来不必要地复杂。
以下是我的做法:
from math import floor
def wealth_distribution(data, n_bins, max_salary):
    """Count how many salaries fall into each of ``n_bins`` equal-width ranges.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            (a numeric string, or ``None`` to skip the record).
        n_bins: Number of bins; must be positive.
        max_salary: Upper end of the salary scale.

    Returns:
        A list of ``n_bins`` counts. Bin ``i`` covers the half-open range
        ``[i * bin_width, (i + 1) * bin_width)`` with
        ``bin_width = max_salary // n_bins``; salaries outside every range
        are not counted.
    """
    sal_list = [0 for _ in range(n_bins)]  # Pre-allocate salary counts.
    bin_width = max_salary // n_bins
    for item in data.values():
        raw_salary = item['salary']
        # Check for a missing salary BEFORE converting: float(None) raises
        # TypeError, so a check made after the conversion can never trigger.
        if raw_salary is None:
            continue
        salary = float(raw_salary)
        for i in range(n_bins):
            low = i * bin_width
            high = low + bin_width
            if low <= salary < high:
                sal_list[i] += 1
                break  # A salary belongs to at most one bin.
    return sal_list
# Sample records keyed by person id; every field, salary included, is a string.
data = {
    'P1': {
        'age': 'eighty two',
        'salary': '96.0',
        'suburb': 'Toorak',
        'language': 'English',
    },
    'P2': {
        'age': '49',
        'salary': '14.0',
        'suburb': 'St. Kilda',
        'language': 'Chinese',
    },
    'P3': {
        'age': '54',
        'salary': '75.0',
        'suburb': 'Neverland',
        'language': 'Italian',
    },
}

# Tally the sample salaries into 5 bins over a scale that tops out at 100.
sal_list = wealth_distribution(data, 5, 100)
print(sal_list)  # -> [1, 0, 0, 1, 1]
import pandas as pd
from pandas import DataFrame
def wealth_distribution(data, n_bins, max_salary):
    """Count how many salaries fall into each of ``n_bins`` equal-width ranges.

    Args:
        data: Mapping of person id to a record dict with a ``'salary'`` entry
            (a numeric string; missing values are skipped).
        n_bins: Number of bins; must be positive.
        max_salary: Upper end of the salary scale.

    Returns:
        A list of ``n_bins`` counts. Bin ``i`` starts at
        ``i * max_salary / n_bins`` (inclusive). Salaries at or above
        ``max_salary`` are counted in the last bin; salaries below zero are
        not counted.
    """
    sal_list = [0] * n_bins
    if not data:
        # An empty frame has no 'salary' row to look up.
        return sal_list
    bin_width = max_salary / n_bins
    # Lower edge of each bin.
    bin_list = [i * bin_width for i in range(n_bins)]
    df = pd.DataFrame(data)
    # Each column of df is one person, so iterate over the whole 'salary' row
    # rather than over len(df), which counts attributes, not people.
    for raw_salary in df.loc['salary']:
        if pd.isna(raw_salary):
            continue
        salary = float(raw_salary)
        # Scan from the highest bin down; the first lower edge that the
        # salary reaches identifies its bin. '>=' keeps a salary that sits
        # exactly on an edge in the bin that starts there.
        for i in range(n_bins - 1, -1, -1):
            if salary >= bin_list[i]:
                sal_list[i] += 1
                break
    return sal_list