Python: 重命名列表中的重复项而不排序列表
Python: Rename duplicates in list with progressive numbers without sorting list
给出这样的列表:
mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
我想通过附加数字重命名重复项以获得以下结果:
mylist = ["name1", "state", "name2", "city", "name3", "zip1", "zip2"]
我不想改变原来列表的顺序。为此 related Stack Overflow question 建议的解决方案对列表进行排序,我不想这样做。
你可以使用哈希表来解决这个问题。定义字典 D,键是字符串,值是 (first_time_index_in_the_list, times_of_appearance)。每次遇到一个词,就查字典并更新它的出现次数:如果 times_of_appearance 变为 2,就用 first_time_index_in_the_list 找到第一次出现的元素并给它追加 '1',同时把 times_of_appearance 追加到当前元素;如果大于 2,则只把 times_of_appearance 追加到当前元素。
我使用 map 和 lambda 的解决方案:
# print() as a function and list() around the lazy map() so this also runs on Python 3.
# x is an (index, value) pair; a value that occurs more than once gets
# "number of equal items before it + 1" appended.
print(list(map(
    lambda x: x[1] + str(mylist[:x[0]].count(x[1]) + 1) if mylist.count(x[1]) > 1 else x[1],
    enumerate(mylist),
)))
更传统的形式
# Loop form of the count-based renaming; builds a new list and leaves mylist untouched.
newlist = []
for i, v in enumerate(mylist):
    totalcount = mylist.count(v)   # occurrences in the whole list
    count = mylist[:i].count(v)    # occurrences strictly before position i
    # Only values that occur more than once receive a 1-based suffix.
    newlist.append(v + str(count + 1) if totalcount > 1 else v)
最后一个
# The same count-based renaming as a single list comprehension.
[
    item + str(mylist[:pos].count(item) + 1) if mylist.count(item) > 1 else item
    for pos, item in enumerate(mylist)
]
任何对每个元素都调用 count 的方法都会导致 O(n^2),因为 count 本身是 O(n)。你可以这样做:
# not modifying original list
from collections import Counter

mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
# For every value that occurs more than once: the suffix still to be handed out.
counts = {k: v for k, v in Counter(mylist).items() if v > 1}
newlist = mylist[:]
# Walk backwards so the last occurrence gets the highest number and the
# first occurrence ends up with 1.
for i in reversed(range(len(mylist))):
    item = mylist[i]
    if counts.get(item):
        newlist[i] += str(counts[item])
        counts[item] -= 1
print(newlist)
# ['name1', 'state', 'name2', 'city', 'name3', 'zip1', 'zip2']
# modifying original list
from collections import Counter

mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
# For every value that occurs more than once: the suffix still to be handed out.
counts = {k: v for k, v in Counter(mylist).items() if v > 1}
# Walk backwards so the last occurrence gets the highest number and the
# first occurrence ends up with 1.
for i in reversed(range(len(mylist))):
    item = mylist[i]
    if counts.get(item):
        mylist[i] += str(counts[item])
        counts[item] -= 1
print(mylist)
# ['name1', 'state', 'name2', 'city', 'name3', 'zip1', 'zip2']
这应该是 O(n)。
其他已提供的答案:
对每个元素调用 mylist.index(s),会导致 O(n^2):
from collections import Counter

mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
counts = Counter(mylist)
for s, num in counts.items():
    if num > 1:
        for suffix in range(1, num + 1):
            # index() finds the first occurrence that has not been renamed yet;
            # it rescans the list on every call, which makes this quadratic.
            mylist[mylist.index(s)] = s + str(suffix)
对每个元素调用 count(x[1]),会导致 O(n^2);
而且它还对每个元素多次与列表切片一起使用:
# print() as a function and list() around the lazy map() so this also runs on Python 3.
# Both count() calls and the slice are repeated for every element.
print(list(map(
    lambda x: x[1] + str(mylist[:x[0]].count(x[1]) + 1) if mylist.count(x[1]) > 1 else x[1],
    enumerate(mylist),
)))
基准:
不那么花哨的东西。
from collections import defaultdict

mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
finalList = []
dictCount = defaultdict(int)    # occurrences seen so far during the second pass
anotherDict = defaultdict(int)  # total occurrences of each value
# First pass: count everything so we know which values are duplicated.
for t in mylist:
    anotherDict[t] += 1
# Second pass: duplicated values get their running occurrence number appended.
for m in mylist:
    dictCount[m] += 1
    if anotherDict[m] > 1:
        finalList.append(str(m) + str(dictCount[m]))
    else:
        finalList.append(m)
print(finalList)
这是一个非常简单的 O(n) 解决方案。只需遍历列表,并存储每个元素第一次出现的索引。如果之前已经见过这个元素,就使用先前存储的数据追加出现次数。
这种方法解决了这个问题,只需要再创建一个字典用于回顾。避免进行前瞻,这样我们就不会创建临时列表切片。
mylist = ["name", "state", "name", "city", "city", "name", "zip", "zip", "name"]
dups = {}
for i, val in enumerate(mylist):
    if val not in dups:
        # Store index of first occurrence and occurrence value
        dups[val] = [i, 1]
        continue
    first_index, seen = dups[val]
    # Special case for first occurrence: it only receives its "1" once a
    # second occurrence shows that the value is duplicated.
    if seen == 1:
        mylist[first_index] += str(seen)
    # Increment occurrence value, index value doesn't matter anymore
    dups[val][1] = seen + 1
    # Use stored occurrence value
    mylist[i] += str(seen + 1)
print(mylist)
# ['name1', 'state', 'name2', 'city1', 'city2', 'name3', 'zip1', 'zip2', 'name4']
这就是我要做的。编辑:我把它写成一个更通用的效用函数,因为人们似乎喜欢这个答案。
from collections import Counter  # Counter counts the number of occurrences of each item
from itertools import tee, count

mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
check = ["name1", "state", "name2", "city", "name3", "zip1", "zip2"]
copy = mylist[:]  # so we will only mutate the copy in case of failure


def uniquify(seq, suffs=None):
    """Make all the items unique by adding a suffix (1, 2, etc).

    `seq` is mutable sequence of strings; it is modified in place and
    nothing is returned. Items that occur only once are left unchanged.
    `suffs` is an optional alternative suffix iterable. When omitted, a
    fresh ``count(1)`` is created for this call. If the iterable runs out
    before every duplicate has a suffix, ``next()`` raises StopIteration.
    """
    if suffs is None:
        # Created per call: a count() object used as the default value would be
        # shared by all calls, so later calls would continue its numbering.
        suffs = count(1)
    not_unique = [k for k, v in Counter(seq).items() if v > 1]  # so we have: ['name', 'zip']
    # suffix generator dict - e.g., {'name': <my_gen>, 'zip': <my_gen>}
    # tee() gives every duplicated value its own independent copy of the suffixes.
    suff_gens = dict(zip(not_unique, tee(suffs, len(not_unique))))
    for idx, s in enumerate(seq):
        gen = suff_gens.get(s)
        if gen is None:
            # s was unique
            continue
        seq[idx] += str(next(gen))


uniquify(copy)
assert copy == check  # raise an error if we failed
mylist = copy  # success
如果你想在每个计数前添加一个下划线,你可以这样做:
>>> mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
>>> uniquify(mylist, (f'_{x!s}' for x in range(1, 100)))
>>> mylist
['name_1', 'state', 'name_2', 'city', 'name_3', 'zip_1', 'zip_2']
...或者如果您想改用字母:
>>> mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
>>> import string
>>> uniquify(mylist, (f'_{x!s}' for x in string.ascii_lowercase))
>>> mylist
['name_a', 'state', 'name_b', 'city', 'name_c', 'zip_a', 'zip_b']
注意:这不是最快的算法;更快的做法请参阅本页中其他 O(n) 的答案。上述函数的优点是易于理解和阅读,除非您有一个非常大的列表,否则您不会看到太大的性能差异。
编辑:这是我最初写成一行的答案,但它没有保留顺序,并且使用了 .index 方法,这是非常次优的(如其他答案中所解释的)。另请参阅下面保留了顺序的 'two-liner' 答案。
# One-line version: the suffixes are right, but the items come out grouped by
# value (one group per Counter entry), so the input order is not preserved.
[s + str(suffix) if num>1 else s for s,num in Counter(mylist).items() for suffix in range(1, num+1)]
# Produces: ['zip1', 'zip2', 'city', 'state', 'name1', 'name2', 'name3']
# NOTE: the order of the groups follows the Counter's iteration order. On
# Python 3.7+ (insertion-ordered dicts) the result is
# ['name1', 'name2', 'name3', 'state', 'city', 'zip1', 'zip2'] instead.
Rick Teachey 的 "two-liner" 的列表推导式版本:
from collections import Counter
from itertools import count

m = ["name", "state", "name", "city", "name", "zip", "zip"]
# One independent 1-based counter per duplicated value; unique values get no entry.
d = {a: count(1) for a, b in Counter(m).items() if b > 1}
# Keep the result in a name instead of discarding the expression value.
renamed = [i + str(next(d[i])) if i in d else i for i in m]
# renamed == ['name1', 'state', 'name2', 'city', 'name3', 'zip1', 'zip2']
注意原始列表中已存在的更新值
如果起始列表已经包含一个项目"name2"
...
mylist = ["name", "state", "name", "city", "name", "zip", "zip", "name2"]
...那么在函数运行时,mylist[2] 不应该被更新为 "name2",否则会产生一个新的重复项;相反,函数应该跳到下一个可用的名称 "name3"。
mylist_updated = ['name1', 'state', 'name3', 'city', 'name4', 'zip1', 'zip2', 'name2']
这是一个替代解决方案(可能会被缩短和优化),其中包括一个递归函数,用于检查原始列表中的这些现有项目。
mylist = ["name", "state", "name", "city", "name", "zip", "zip", "name2"]


def fix_dups(mylist, sep='', start=1, update_first=True):
    """Rename duplicated items of ``mylist`` in place by appending a number.

    A number is skipped when the resulting name is already in use, either as
    an original item of the list or as a name generated earlier in this call.

    Args:
        mylist: mutable sequence of hashable items; renamed items become
            ``str(item) + sep + str(number)``.
        sep: text placed between the item and its number.
        start: number assigned to the first occurrence of a duplicated item.
        update_first: when False, the first occurrence keeps its original
            value (it still consumes ``start``), so later occurrences begin
            at ``start + 1``.

    Returns:
        The same ``mylist`` object, after modification.
    """
    # build dictionary containing val: [occurrences, suffix]
    mylist_dups = {}
    for val in mylist:
        if val in mylist_dups:
            mylist_dups[val][0] += 1
        else:
            mylist_dups[val] = [1, start - 1]

    # Names that must not be produced: every original value, plus each
    # generated name as soon as it has been handed out.
    taken = set(mylist_dups)

    def update_val(val, num):
        """Return ``(name, num)`` for the first number >= num whose name is free."""
        # A loop rather than recursion, so a long run of taken names cannot
        # exhaust the recursion limit.
        while True:
            temp_val = sep.join([str(val), str(num)])
            if temp_val not in taken:
                return temp_val, num
            num += 1

    # update list
    for i, val in enumerate(mylist):
        entry = mylist_dups[val]
        if entry[0] > 1:
            entry[1] += 1
            if update_first or entry[1] > start:
                new_val, entry[1] = update_val(val, entry[1])
                taken.add(new_val)
                mylist[i] = new_val
    return mylist


mylist_updated = fix_dups(mylist, sep='', start=1, update_first=True)
print(mylist_updated)
#['name1', 'state', 'name3', 'city', 'name4', 'zip1', 'zip2', 'name2']
如果您不想更改第一个出现的地方。
# Keep the first occurrence of each duplicated value unchanged and number the
# later ones with a '_' separator; 'name_2' is already in the list, so the
# third "name" becomes 'name_3'.
mylist = ["name", "state", "name", "city", "name", "zip", "zip", "name_2"]
mylist_updated = fix_dups(mylist, sep='_', start=0, update_first=False)
print(mylist_updated)
#['name', 'state', 'name_1', 'city', 'name_3', 'zip', 'zip_1', 'name_2']
给出这样的列表:
mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
我想通过附加数字重命名重复项以获得以下结果:
mylist = ["name1", "state", "name2", "city", "name3", "zip1", "zip2"]
我不想改变原来列表的顺序。为此 related Stack Overflow question 建议的解决方案对列表进行排序,我不想这样做。
你可以使用哈希表来解决这个问题。定义字典 D,键是字符串,值是 (first_time_index_in_the_list, times_of_appearance)。每次遇到一个词,就查字典并更新它的出现次数:如果 times_of_appearance 变为 2,就用 first_time_index_in_the_list 找到第一次出现的元素并给它追加 '1',同时把 times_of_appearance 追加到当前元素;如果大于 2,则只把 times_of_appearance 追加到当前元素。
我使用 map 和 lambda 的解决方案:
# print() as a function and list() around the lazy map() so this also runs on Python 3.
# x is an (index, value) pair; a value that occurs more than once gets
# "number of equal items before it + 1" appended.
print(list(map(
    lambda x: x[1] + str(mylist[:x[0]].count(x[1]) + 1) if mylist.count(x[1]) > 1 else x[1],
    enumerate(mylist),
)))
更传统的形式
# Loop form of the count-based renaming; builds a new list and leaves mylist untouched.
newlist = []
for i, v in enumerate(mylist):
    totalcount = mylist.count(v)   # occurrences in the whole list
    count = mylist[:i].count(v)    # occurrences strictly before position i
    # Only values that occur more than once receive a 1-based suffix.
    newlist.append(v + str(count + 1) if totalcount > 1 else v)
最后一个
# The same count-based renaming as a single list comprehension.
[
    item + str(mylist[:pos].count(item) + 1) if mylist.count(item) > 1 else item
    for pos, item in enumerate(mylist)
]
任何对每个元素都调用 count 的方法都会导致 O(n^2),因为 count 本身是 O(n)。你可以这样做:
# not modifying original list
from collections import Counter

mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
# For every value that occurs more than once: the suffix still to be handed out.
counts = {k: v for k, v in Counter(mylist).items() if v > 1}
newlist = mylist[:]
# Walk backwards so the last occurrence gets the highest number and the
# first occurrence ends up with 1.
for i in reversed(range(len(mylist))):
    item = mylist[i]
    if counts.get(item):
        newlist[i] += str(counts[item])
        counts[item] -= 1
print(newlist)
# ['name1', 'state', 'name2', 'city', 'name3', 'zip1', 'zip2']
# modifying original list
from collections import Counter

mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
# For every value that occurs more than once: the suffix still to be handed out.
counts = {k: v for k, v in Counter(mylist).items() if v > 1}
# Walk backwards so the last occurrence gets the highest number and the
# first occurrence ends up with 1.
for i in reversed(range(len(mylist))):
    item = mylist[i]
    if counts.get(item):
        mylist[i] += str(counts[item])
        counts[item] -= 1
print(mylist)
# ['name1', 'state', 'name2', 'city', 'name3', 'zip1', 'zip2']
这应该是 O(n)。
其他已提供的答案:
对每个元素调用 mylist.index(s),会导致 O(n^2):
from collections import Counter

mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
counts = Counter(mylist)
for s, num in counts.items():
    if num > 1:
        for suffix in range(1, num + 1):
            # index() finds the first occurrence that has not been renamed yet;
            # it rescans the list on every call, which makes this quadratic.
            mylist[mylist.index(s)] = s + str(suffix)
对每个元素调用 count(x[1]),会导致 O(n^2);
而且它还对每个元素多次与列表切片一起使用:
# print() as a function and list() around the lazy map() so this also runs on Python 3.
# Both count() calls and the slice are repeated for every element.
print(list(map(
    lambda x: x[1] + str(mylist[:x[0]].count(x[1]) + 1) if mylist.count(x[1]) > 1 else x[1],
    enumerate(mylist),
)))
基准:
不那么花哨的东西。
from collections import defaultdict

mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
finalList = []
dictCount = defaultdict(int)    # occurrences seen so far during the second pass
anotherDict = defaultdict(int)  # total occurrences of each value
# First pass: count everything so we know which values are duplicated.
for t in mylist:
    anotherDict[t] += 1
# Second pass: duplicated values get their running occurrence number appended.
for m in mylist:
    dictCount[m] += 1
    if anotherDict[m] > 1:
        finalList.append(str(m) + str(dictCount[m]))
    else:
        finalList.append(m)
print(finalList)
这是一个非常简单的 O(n) 解决方案。只需遍历列表,并存储每个元素第一次出现的索引。如果之前已经见过这个元素,就使用先前存储的数据追加出现次数。
这种方法解决了这个问题,只需要再创建一个字典用于回顾。避免进行前瞻,这样我们就不会创建临时列表切片。
mylist = ["name", "state", "name", "city", "city", "name", "zip", "zip", "name"]
dups = {}
for i, val in enumerate(mylist):
    if val not in dups:
        # Store index of first occurrence and occurrence value
        dups[val] = [i, 1]
        continue
    first_index, seen = dups[val]
    # Special case for first occurrence: it only receives its "1" once a
    # second occurrence shows that the value is duplicated.
    if seen == 1:
        mylist[first_index] += str(seen)
    # Increment occurrence value, index value doesn't matter anymore
    dups[val][1] = seen + 1
    # Use stored occurrence value
    mylist[i] += str(seen + 1)
print(mylist)
# ['name1', 'state', 'name2', 'city1', 'city2', 'name3', 'zip1', 'zip2', 'name4']
这就是我要做的。编辑:我把它写成一个更通用的效用函数,因为人们似乎喜欢这个答案。
from collections import Counter  # Counter counts the number of occurrences of each item
from itertools import tee, count

mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
check = ["name1", "state", "name2", "city", "name3", "zip1", "zip2"]
copy = mylist[:]  # so we will only mutate the copy in case of failure


def uniquify(seq, suffs=None):
    """Make all the items unique by adding a suffix (1, 2, etc).

    `seq` is mutable sequence of strings; it is modified in place and
    nothing is returned. Items that occur only once are left unchanged.
    `suffs` is an optional alternative suffix iterable. When omitted, a
    fresh ``count(1)`` is created for this call. If the iterable runs out
    before every duplicate has a suffix, ``next()`` raises StopIteration.
    """
    if suffs is None:
        # Created per call: a count() object used as the default value would be
        # shared by all calls, so later calls would continue its numbering.
        suffs = count(1)
    not_unique = [k for k, v in Counter(seq).items() if v > 1]  # so we have: ['name', 'zip']
    # suffix generator dict - e.g., {'name': <my_gen>, 'zip': <my_gen>}
    # tee() gives every duplicated value its own independent copy of the suffixes.
    suff_gens = dict(zip(not_unique, tee(suffs, len(not_unique))))
    for idx, s in enumerate(seq):
        gen = suff_gens.get(s)
        if gen is None:
            # s was unique
            continue
        seq[idx] += str(next(gen))


uniquify(copy)
assert copy == check  # raise an error if we failed
mylist = copy  # success
如果你想在每个计数前添加一个下划线,你可以这样做:
>>> mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
>>> uniquify(mylist, (f'_{x!s}' for x in range(1, 100)))
>>> mylist
['name_1', 'state', 'name_2', 'city', 'name_3', 'zip_1', 'zip_2']
...或者如果您想改用字母:
>>> mylist = ["name", "state", "name", "city", "name", "zip", "zip"]
>>> import string
>>> uniquify(mylist, (f'_{x!s}' for x in string.ascii_lowercase))
>>> mylist
['name_a', 'state', 'name_b', 'city', 'name_c', 'zip_a', 'zip_b']
注意:这不是最快的算法;更快的做法请参阅本页中其他 O(n) 的答案。
编辑:这是我最初写成一行的答案,但它没有保留顺序,并且使用了 .index 方法,这是非常次优的(如其他答案中所解释的)。
# One-line version: the suffixes are right, but the items come out grouped by
# value (one group per Counter entry), so the input order is not preserved.
[s + str(suffix) if num>1 else s for s,num in Counter(mylist).items() for suffix in range(1, num+1)]
# Produces: ['zip1', 'zip2', 'city', 'state', 'name1', 'name2', 'name3']
# NOTE: the order of the groups follows the Counter's iteration order. On
# Python 3.7+ (insertion-ordered dicts) the result is
# ['name1', 'name2', 'name3', 'state', 'city', 'zip1', 'zip2'] instead.
Rick Teachey 的 "two-liner" 的列表推导式版本:
from collections import Counter
from itertools import count

m = ["name", "state", "name", "city", "name", "zip", "zip"]
# One independent 1-based counter per duplicated value; unique values get no entry.
d = {a: count(1) for a, b in Counter(m).items() if b > 1}
# Keep the result in a name instead of discarding the expression value.
renamed = [i + str(next(d[i])) if i in d else i for i in m]
# renamed == ['name1', 'state', 'name2', 'city', 'name3', 'zip1', 'zip2']
注意原始列表中已存在的更新值
如果起始列表已经包含一个项目"name2"
...
mylist = ["name", "state", "name", "city", "name", "zip", "zip", "name2"]
...那么在函数运行时,mylist[2] 不应该被更新为 "name2",否则会产生一个新的重复项;相反,函数应该跳到下一个可用的名称 "name3"。
mylist_updated = ['name1', 'state', 'name3', 'city', 'name4', 'zip1', 'zip2', 'name2']
这是一个替代解决方案(可能会被缩短和优化),其中包括一个递归函数,用于检查原始列表中的这些现有项目。
mylist = ["name", "state", "name", "city", "name", "zip", "zip", "name2"]


def fix_dups(mylist, sep='', start=1, update_first=True):
    """Rename duplicated items of ``mylist`` in place by appending a number.

    A number is skipped when the resulting name is already in use, either as
    an original item of the list or as a name generated earlier in this call.

    Args:
        mylist: mutable sequence of hashable items; renamed items become
            ``str(item) + sep + str(number)``.
        sep: text placed between the item and its number.
        start: number assigned to the first occurrence of a duplicated item.
        update_first: when False, the first occurrence keeps its original
            value (it still consumes ``start``), so later occurrences begin
            at ``start + 1``.

    Returns:
        The same ``mylist`` object, after modification.
    """
    # build dictionary containing val: [occurrences, suffix]
    mylist_dups = {}
    for val in mylist:
        if val in mylist_dups:
            mylist_dups[val][0] += 1
        else:
            mylist_dups[val] = [1, start - 1]

    # Names that must not be produced: every original value, plus each
    # generated name as soon as it has been handed out.
    taken = set(mylist_dups)

    def update_val(val, num):
        """Return ``(name, num)`` for the first number >= num whose name is free."""
        # A loop rather than recursion, so a long run of taken names cannot
        # exhaust the recursion limit.
        while True:
            temp_val = sep.join([str(val), str(num)])
            if temp_val not in taken:
                return temp_val, num
            num += 1

    # update list
    for i, val in enumerate(mylist):
        entry = mylist_dups[val]
        if entry[0] > 1:
            entry[1] += 1
            if update_first or entry[1] > start:
                new_val, entry[1] = update_val(val, entry[1])
                taken.add(new_val)
                mylist[i] = new_val
    return mylist


mylist_updated = fix_dups(mylist, sep='', start=1, update_first=True)
print(mylist_updated)
#['name1', 'state', 'name3', 'city', 'name4', 'zip1', 'zip2', 'name2']
如果您不想更改第一个出现的地方。
# Keep the first occurrence of each duplicated value unchanged and number the
# later ones with a '_' separator; 'name_2' is already in the list, so the
# third "name" becomes 'name_3'.
mylist = ["name", "state", "name", "city", "name", "zip", "zip", "name_2"]
mylist_updated = fix_dups(mylist, sep='_', start=0, update_first=False)
print(mylist_updated)
#['name', 'state', 'name_1', 'city', 'name_3', 'zip', 'zip_1', 'name_2']