按唯一属性值过滤数据类实例
Filtering dataclass instances by unique attribute value
我有以下形式的数据类实例列表:
dataclass_list = [DataEntry(company="Microsoft", users=["Jane Doe", "John Doe"]), DataEntry(company="Google", users=["Bob Whoever"]), DataEntry(company="Microsoft", users=[])]
现在我想过滤该列表并通过特定键(在本例中为公司)仅获取唯一实例。
想要的列表:
new_list = [DataEntry(company="Microsoft", users=["Jane Doe", "John Doe"]), DataEntry(company="Google", users=["Bob Whoever"])]
最初的想法是使用类似 Python 的 set() 或 filter() 这样的函数,但这两者在这里都行不通。
目前我可以正常运行的解决方案:
# Pair every instance with the attribute value used as the uniqueness key.
tup_list = [(dataclass, dataclass.company) for dataclass in dataclass_list]
new_list = []
check_list = []  # lower-cased company names that have already been kept
for tup in tup_list:
    # Compare case-insensitively; only the first instance per company is kept.
    if tup[1].lower() not in check_list:
        new_list.append(tup[0])
        check_list.append(tup[1].lower())
这给了我想要的输出,但我想知道是否有更 pythonic 或优雅的解决方案?
这是另一个解决方案,您是否觉得更优雅取决于您:
# Map each company name to the first instance seen with that name.
unique = {}
for dc in dataclass_list:
    # Later duplicates are skipped, so the first occurrence wins.
    if dc.company not in unique:
        unique[dc.company] = dc
# Dicts preserve insertion order, so new_list follows first-occurrence order.
new_list = list(unique.values())
在你的 DataEntry 数据类中,你需要重写 __eq__(...) 和 __hash__(...) 函数,在其中指定计算对象散列值时使用哪个属性,以及两个对象在什么情况下被视为相等。
下面是一个简短的示例:class Company 默认使用 name 属性来判断两个对象是否相等。我还对您的用例做了扩展,增加了一个选项,可以在构造对象时指定用于判断唯一性的属性。请注意,所有要相互比较的对象必须使用相同的 comparison_attr。
import pprint
class Company:
    """Company whose equality and hash depend on a single chosen attribute.

    Args:
        name: Company name.
        location: Company location.
        comparison_attr: Name of the attribute (``"name"`` or ``"location"``)
            used by ``__eq__`` and ``__hash__``. All objects that are compared
            with each other should use the same value.
    """

    def __init__(self, name, location, comparison_attr="name") -> None:
        # By default we use the attribute `name` for comparison
        self.name = name
        self.location = location
        self.__comparison_attr = comparison_attr

    def __hash__(self) -> int:
        """Return the hash of the attribute selected for comparison."""
        return hash(getattr(self, self.__comparison_attr))

    def __eq__(self, other: object) -> bool:
        """Return True when both objects agree on the comparison attribute.

        The attribute name is taken from ``self``. Objects that do not have
        that attribute yield ``NotImplemented`` instead of raising, so a
        comparison with an unrelated type simply evaluates to ``False``.
        """
        attr = self.__comparison_attr
        try:
            other_value = getattr(other, attr)
        except AttributeError:
            return NotImplemented
        return getattr(self, attr) == other_value

    def __repr__(self) -> str:
        """Return a short ``name=..., location=...`` description."""
        return f"name={self.name}, location={self.location}"
# Run the same data twice: once de-duplicated by name, once by location.
for attribute_name in ["name", "location"]:
    companies = [
        Company("Google", "Palo Alto", comparison_attr=attribute_name),
        Company("Google", "Berlin", comparison_attr=attribute_name),
        Company("Microsoft", "Berlin", comparison_attr=attribute_name),
        Company("Microsoft", "San Francisco", comparison_attr=attribute_name),
        Company("IBM", "Palo Alto", comparison_attr=attribute_name),
    ]
    print(f"Attribute considered for uniqueness: {attribute_name}")
    # set() keeps one object per distinct value of the comparison attribute;
    # the iteration order of a set is not guaranteed.
    pprint.pprint(set(companies))
输出:
Attribute considered for uniqueness: name
{name=Microsoft, location=Berlin,
name=Google, location=Palo Alto,
name=IBM, location=Palo Alto}
Attribute considered for uniqueness: location
{name=Microsoft, location=San Francisco,
name=Google, location=Berlin,
name=Google, location=Palo Alto}
我有以下形式的数据类实例列表:
dataclass_list = [DataEntry(company="Microsoft", users=["Jane Doe", "John Doe"]), DataEntry(company="Google", users=["Bob Whoever"]), DataEntry(company="Microsoft", users=[])]
现在我想过滤该列表并通过特定键(在本例中为公司)仅获取唯一实例。
想要的列表:
new_list = [DataEntry(company="Microsoft", users=["Jane Doe", "John Doe"]), DataEntry(company="Google", users=["Bob Whoever"])]
最初的想法是使用类似 Python 的 set() 或 filter() 这样的函数,但这两者在这里都行不通。
目前我可以正常运行的解决方案:
# Pair every instance with the attribute value used as the uniqueness key.
tup_list = [(dataclass, dataclass.company) for dataclass in dataclass_list]
new_list = []
check_list = []  # lower-cased company names that have already been kept
for tup in tup_list:
    # Compare case-insensitively; only the first instance per company is kept.
    if tup[1].lower() not in check_list:
        new_list.append(tup[0])
        check_list.append(tup[1].lower())
这给了我想要的输出,但我想知道是否有更 pythonic 或优雅的解决方案?
这是另一个解决方案,您是否觉得更优雅取决于您:
# Map each company name to the first instance seen with that name.
unique = {}
for dc in dataclass_list:
    # Later duplicates are skipped, so the first occurrence wins.
    if dc.company not in unique:
        unique[dc.company] = dc
# Dicts preserve insertion order, so new_list follows first-occurrence order.
new_list = list(unique.values())
在你的 DataEntry 数据类中,你需要重写 __eq__(...) 和 __hash__(...) 函数,在其中指定计算对象散列值时使用哪个属性,以及两个对象在什么情况下被视为相等。
下面是一个简短的示例:class Company 默认使用 name 属性来判断两个对象是否相等。我还对您的用例做了扩展,增加了一个选项,可以在构造对象时指定用于判断唯一性的属性。请注意,所有要相互比较的对象必须使用相同的 comparison_attr。
import pprint
class Company:
    """Company whose equality and hash depend on a single chosen attribute.

    Args:
        name: Company name.
        location: Company location.
        comparison_attr: Name of the attribute (``"name"`` or ``"location"``)
            used by ``__eq__`` and ``__hash__``. All objects that are compared
            with each other should use the same value.
    """

    def __init__(self, name, location, comparison_attr="name") -> None:
        # By default we use the attribute `name` for comparison
        self.name = name
        self.location = location
        self.__comparison_attr = comparison_attr

    def __hash__(self) -> int:
        """Return the hash of the attribute selected for comparison."""
        return hash(getattr(self, self.__comparison_attr))

    def __eq__(self, other: object) -> bool:
        """Return True when both objects agree on the comparison attribute.

        The attribute name is taken from ``self``. Objects that do not have
        that attribute yield ``NotImplemented`` instead of raising, so a
        comparison with an unrelated type simply evaluates to ``False``.
        """
        attr = self.__comparison_attr
        try:
            other_value = getattr(other, attr)
        except AttributeError:
            return NotImplemented
        return getattr(self, attr) == other_value

    def __repr__(self) -> str:
        """Return a short ``name=..., location=...`` description."""
        return f"name={self.name}, location={self.location}"
# Run the same data twice: once de-duplicated by name, once by location.
for attribute_name in ["name", "location"]:
    companies = [
        Company("Google", "Palo Alto", comparison_attr=attribute_name),
        Company("Google", "Berlin", comparison_attr=attribute_name),
        Company("Microsoft", "Berlin", comparison_attr=attribute_name),
        Company("Microsoft", "San Francisco", comparison_attr=attribute_name),
        Company("IBM", "Palo Alto", comparison_attr=attribute_name),
    ]
    print(f"Attribute considered for uniqueness: {attribute_name}")
    # set() keeps one object per distinct value of the comparison attribute;
    # the iteration order of a set is not guaranteed.
    pprint.pprint(set(companies))
输出:
Attribute considered for uniqueness: name
{name=Microsoft, location=Berlin,
name=Google, location=Palo Alto,
name=IBM, location=Palo Alto}
Attribute considered for uniqueness: location
{name=Microsoft, location=San Francisco,
name=Google, location=Berlin,
name=Google, location=Palo Alto}