使用 python 删除 excel 中具有特定列组合的重复行
Remove duplicate rows with certain column combination in excel using python
我有一个 Python 程序可以读取 Excel 文档。对于某些列的组合,我只想保留第一次出现的那一行。
例如:
A | B
-------------
1. 200 | 201
2. 200 | 202
3. 200 | 201
4. 200 | 203
5. 201 | 201
6. 201 | 202
.............
我想去掉(跳过)出现重复的第 3 行,然后把去重后的结果写入 CSV 文件。
下面是我目前一直在尝试的函数,但它不起作用。
def validateExcel(filename):
    """Print the first two cells of every data row (asker's original attempt).

    NOTE: this is the non-working version under discussion; its logic is
    intentionally left as posted. ``mylist`` is re-created for every row, so
    it never holds more than the current row's two cells and nothing is
    compared across rows.

    NOTE(review): the source lost its indentation; the nesting of the
    print/set lines below is a reconstruction -- confirm against the
    original post.

    Args:
        filename: Path of the workbook, passed to ``xlrd.open_workbook``.
    """
    xls = xlrd.open_workbook(filename)
    setcount = 0
    column = 0
    count = 0
    # sheetcount = 0
    for sheet in xls.sheets():
        header = ""
        # sheetcount = sheetcount + 1
        number_of_rows = sheet.nrows
        number_of_columns = sheet.ncols
        sheetname = sheet.name
        mylist = []
        # Start at row 1; row 0 is skipped (presumably a header row).
        for row in range(1, number_of_rows):
            mylist = []  # reset on every row: earlier rows are forgotten
            for col in range(0, 2):
                mylist.append(sheet.cell_value(row, col))
            # print(x) with a single argument works on Python 2 and 3 alike.
            print(mylist)
            myset = set(mylist)
            print(myset)
mylist = []
被赋值了两次:循环每处理一行都会把列表重新清空,因此无法把各行的值累积起来。应该写成这样:
# Create the list once, before the loop, so that it accumulates every row.
mylist = []
for row in range(1, number_of_rows):
    # Store the cells of columns 0 and 1 as one hashable tuple per row.
    mylist.append((sheet.cell_value(row, 0), sheet.cell_value(row, 1)))
# A set keeps a single copy of each tuple; it does not preserve row order.
myset = set(mylist)
注意:set 是无序的,不会保留各行原来的先后顺序。
如果希望结果保持原有顺序,还需要另行处理(原回答此处附有一个参考链接,链接已缺失)。
这应该将行(在本例中称为子列表)追加到您的 mylist
列表中(如果尚未将其放入其中)。这应该按照它们在 xlsx 文件中找到的顺序为您提供一个去除重复的行列表。如果可以,可能值得查看 pandas 库。如果没有,这应该有所帮助:
def validateExcel(filename):
    """Return the de-duplicated data rows of a workbook, in file order.

    Every sheet is scanned from row 1 onwards (row 0 is skipped, presumably
    because it is a header row). A row is kept only the first time its full
    list of cell values is seen. The unique rows of each sheet are printed.

    Args:
        filename: Path of the workbook, passed to ``xlrd.open_workbook``.

    Returns:
        The unique rows of the last sheet processed, each row being a list
        of cell values; an empty list if the workbook has no sheets.
        NOTE(review): the source lost its indentation, so whether the
        original returned after the first or the last sheet cannot be told
        from here -- the two agree for single-sheet workbooks.
    """
    xls = xlrd.open_workbook(filename)
    mylist = []  # stays empty when the workbook has no sheets
    for sheet in xls.sheets():
        number_of_columns = sheet.ncols
        mylist = []
        seen = set()  # rows already kept, as tuples, for O(1) membership tests
        for row in range(1, sheet.nrows):
            sublist = [sheet.cell_value(row, col) for col in range(number_of_columns)]
            key = tuple(sublist)
            if key not in seen:
                seen.add(key)
                mylist.append(sublist)
        # print(x) with a single argument works on Python 2 and 3 alike.
        print(mylist)
    return mylist
编辑:
如果您有一个 xlsx
文件包含多个 sheet,您可以使用字典存储以 sheet 名称作为键的去重行数据,然后将该字典传递给 csv 写入函数:
def validateExcel(filename):
    """Collect the de-duplicated data rows of every sheet in a workbook.

    Each sheet is scanned from row 1 onwards (row 0 is skipped, presumably
    because it is a header row). A row is kept only the first time its full
    list of cell values is seen. The unique rows of each sheet are printed.

    Args:
        filename: Path of the workbook, passed to ``xlrd.open_workbook``.

    Returns:
        A dict mapping each sheet name to its list of unique rows (each row
        a list of cell values) in first-occurrence order. A sheet with an
        empty name is stored under its zero-based position, as a string.
    """
    outputDict = {}
    xls = xlrd.open_workbook(filename)
    for sheetCount, sheet in enumerate(xls.sheets()):
        # Fall back to the sheet's position when it has no name.
        sheetname = sheet.name or str(sheetCount)
        number_of_columns = sheet.ncols
        # Store the list under the same key that is used to fill it.
        unique_rows = outputDict[sheetname] = []
        seen = set()  # rows already kept, as tuples, for O(1) membership tests
        for row in range(1, sheet.nrows):
            sublist = [sheet.cell_value(row, col) for col in range(number_of_columns)]
            key = tuple(sublist)
            if key not in seen:
                seen.add(key)
                unique_rows.append(sublist)
        # print(x) with a single argument works on Python 2 and 3 alike.
        print(unique_rows)
    return outputDict
# Goes through the generated dictionary and writes each entry to a CSV file.
def writeToFiles(generatedDictionary):
    """Write every entry of *generatedDictionary* to its own CSV file.

    Args:
        generatedDictionary: Mapping of a name to a list of rows, each row
            being a sequence of cell values. The entry stored under ``key``
            is written to the file ``key + ".csv"``; an existing file of
            that name is overwritten.
    """
    import csv
    import sys

    for key, rows in generatedDictionary.items():
        path = key + ".csv"
        # The csv module needs a binary file on Python 2 and a text file
        # opened with newline="" on Python 3; anything else can produce
        # blank lines between rows on Windows.
        if sys.version_info[0] >= 3:
            csvFile = open(path, "w", newline="")
        else:
            csvFile = open(path, "wb")
        with csvFile:
            writer = csv.writer(csvFile)
            writer.writerows(rows)
如果你可以使用 pandas,像这样的东西可以工作:
import pandas as pd
# pd.read_excel() returns a DataFrame, which has neither a sheet list nor a
# parse() method; pd.ExcelFile exposes both.
workbook = pd.ExcelFile(filename)
for name in workbook.sheet_names:
    sheetDataFrame = workbook.parse(name)
    # drop_duplicates() keeps the first occurrence of each repeated row.
    filtered = sheetDataFrame.drop_duplicates()
    # index=False keeps the pandas row index out of the CSV file.
    filtered.to_csv(name + ".csv", index=False)
下面的代码在 Python 2.7 下对我有效:
def validateExcel(filename):
    """Return the unique (column 0, column 1) pairs of a workbook, in order.

    Every sheet is scanned from row 1 onwards (row 0 is skipped, presumably
    because it is a header row). Only the first occurrence of each pair is
    kept, so the result follows the order of the rows in the sheet.

    Args:
        filename: Path of the workbook, passed to ``xlrd.open_workbook``.

    Returns:
        A list of ``(value_in_column_0, value_in_column_1)`` tuples for the
        last sheet processed; an empty list if the workbook has no sheets.
        NOTE(review): the source lost its indentation, so whether the
        original returned after the first or the last sheet cannot be told
        from here -- the two agree for single-sheet workbooks.
    """
    xls = xlrd.open_workbook(filename)
    myset = []  # stays empty when the workbook has no sheets
    for sheet in xls.sheets():
        myset = []
        seen = set()
        # One pass with a "seen" set gives the same first-occurrence order
        # as sorted(set(mylist), key=mylist.index) without the repeated
        # list.index() scans.
        for row in range(1, sheet.nrows):
            pair = (sheet.cell_value(row, 0), sheet.cell_value(row, 1))
            if pair not in seen:
                seen.add(pair)
                myset.append(pair)
    return myset
这是我的解决方案。删除重复项并创建一个没有重复项的新文件。
import xlsxwriter
import xlrd
def remove_duplicates(source='Original.xlsx', target='Removed_Duplicates.xlsx'):
    """Copy a workbook to a new file, dropping repeated rows on every sheet.

    Each sheet of *source* is re-created under the same name in *target*.
    Rows are compared on all of their cell values; the first occurrence of
    a row is written and later identical rows are skipped. Row 0 is
    included, so a header row is copied like any other row.

    Args:
        source: Path of the workbook to read with ``xlrd``.
        target: Path of the workbook to create with ``xlsxwriter``.
    """
    read_file = xlrd.open_workbook(source)
    write_file = xlsxwriter.Workbook(target)
    for sheet in read_file.sheets():
        no_cols = sheet.ncols
        gen_sheets = write_file.add_worksheet(sheet.name)
        seen = set()  # rows already written, as tuples, for O(1) lookups
        r = 0  # next free row in the output sheet
        for row in range(sheet.nrows):
            line = tuple(sheet.cell(row, col).value for col in range(no_cols))
            if line in seen:
                continue
            seen.add(line)
            for col, value in enumerate(line):
                gen_sheets.write(r, col, value)
            # Advance only after a row was written so the output has no gaps.
            r += 1
    write_file.close()
我们可以使用 python pandas 包
要安装这个包:pip install pandas
参考:https://pandas.pydata.org/docs/getting_started/install.html
使用不带任何参数的 drop_duplicates()
import pandas as pd
data = pd.read_excel('your_excel_path_goes_here.xlsx')
#print(data)
# drop_duplicates() returns a new DataFrame and leaves `data` untouched, so
# the result has to be assigned for the de-duplication to take effect.
data = data.drop_duplicates()
按列名去重:
通过 subset 参数指定列名来使用 drop_duplicates()
import pandas as pd
data = pd.read_excel('your_excel_path_goes_here.xlsx')
#print(data)
# Rows count as duplicates when they agree on the columns listed in `subset`.
# drop_duplicates() returns a new DataFrame and leaves `data` untouched, so
# the result has to be assigned for the de-duplication to take effect.
data = data.drop_duplicates(subset=["YOUR_COLUMN_NAME_GOES_HERE"], keep="last")
keep="first"(默认值)表示:对每一组重复行,保留第一次出现的那一行,删除其余的重复行。
keep="last" 表示:对每一组重复行,保留最后一次出现的那一行,删除其余的重复行。
如果想把 Excel sheet 中出现重复的行全部删除(一行也不保留),可以指定 keep=False
我有一个 Python 程序可以读取 Excel 文档。对于某些列的组合,我只想保留第一次出现的那一行。 例如:
A | B
-------------
1. 200 | 201
2. 200 | 202
3. 200 | 201
4. 200 | 203
5. 201 | 201
6. 201 | 202
.............
我想去掉(跳过)出现重复的第 3 行,然后把去重后的结果写入 CSV 文件。 下面是我目前一直在尝试的函数,但它不起作用。
def validateExcel(filename):
    """Print the first two cells of every data row (asker's original attempt).

    NOTE: this is the non-working version under discussion; its logic is
    intentionally left as posted. ``mylist`` is re-created for every row, so
    it never holds more than the current row's two cells and nothing is
    compared across rows.

    NOTE(review): the source lost its indentation; the nesting of the
    print/set lines below is a reconstruction -- confirm against the
    original post.

    Args:
        filename: Path of the workbook, passed to ``xlrd.open_workbook``.
    """
    xls = xlrd.open_workbook(filename)
    setcount = 0
    column = 0
    count = 0
    # sheetcount = 0
    for sheet in xls.sheets():
        header = ""
        # sheetcount = sheetcount + 1
        number_of_rows = sheet.nrows
        number_of_columns = sheet.ncols
        sheetname = sheet.name
        mylist = []
        # Start at row 1; row 0 is skipped (presumably a header row).
        for row in range(1, number_of_rows):
            mylist = []  # reset on every row: earlier rows are forgotten
            for col in range(0, 2):
                mylist.append(sheet.cell_value(row, col))
            # print(x) with a single argument works on Python 2 and 3 alike.
            print(mylist)
            myset = set(mylist)
            print(myset)
mylist = []
被赋值了两次:循环每处理一行都会把列表重新清空,因此无法把各行的值累积起来。应该写成这样:
# Create the list once, before the loop, so that it accumulates every row.
mylist = []
for row in range(1, number_of_rows):
    # Store the cells of columns 0 and 1 as one hashable tuple per row.
    mylist.append((sheet.cell_value(row, 0), sheet.cell_value(row, 1)))
# A set keeps a single copy of each tuple; it does not preserve row order.
myset = set(mylist)
注意:set 是无序的,不会保留各行原来的先后顺序。
如果希望结果保持原有顺序,还需要另行处理(原回答此处附有一个参考链接,链接已缺失)。
这应该将行(在本例中称为子列表)追加到您的 mylist
列表中(如果尚未将其放入其中)。这应该按照它们在 xlsx 文件中找到的顺序为您提供一个去除重复的行列表。如果可以,可能值得查看 pandas 库。如果没有,这应该有所帮助:
def validateExcel(filename):
    """Return the de-duplicated data rows of a workbook, in file order.

    Every sheet is scanned from row 1 onwards (row 0 is skipped, presumably
    because it is a header row). A row is kept only the first time its full
    list of cell values is seen. The unique rows of each sheet are printed.

    Args:
        filename: Path of the workbook, passed to ``xlrd.open_workbook``.

    Returns:
        The unique rows of the last sheet processed, each row being a list
        of cell values; an empty list if the workbook has no sheets.
        NOTE(review): the source lost its indentation, so whether the
        original returned after the first or the last sheet cannot be told
        from here -- the two agree for single-sheet workbooks.
    """
    xls = xlrd.open_workbook(filename)
    mylist = []  # stays empty when the workbook has no sheets
    for sheet in xls.sheets():
        number_of_columns = sheet.ncols
        mylist = []
        seen = set()  # rows already kept, as tuples, for O(1) membership tests
        for row in range(1, sheet.nrows):
            sublist = [sheet.cell_value(row, col) for col in range(number_of_columns)]
            key = tuple(sublist)
            if key not in seen:
                seen.add(key)
                mylist.append(sublist)
        # print(x) with a single argument works on Python 2 and 3 alike.
        print(mylist)
    return mylist
编辑:
如果您有一个 xlsx
文件包含多个 sheet,您可以使用字典存储以 sheet 名称作为键的去重行数据,然后将该字典传递给 csv 写入函数:
def validateExcel(filename):
    """Collect the de-duplicated data rows of every sheet in a workbook.

    Each sheet is scanned from row 1 onwards (row 0 is skipped, presumably
    because it is a header row). A row is kept only the first time its full
    list of cell values is seen. The unique rows of each sheet are printed.

    Args:
        filename: Path of the workbook, passed to ``xlrd.open_workbook``.

    Returns:
        A dict mapping each sheet name to its list of unique rows (each row
        a list of cell values) in first-occurrence order. A sheet with an
        empty name is stored under its zero-based position, as a string.
    """
    outputDict = {}
    xls = xlrd.open_workbook(filename)
    for sheetCount, sheet in enumerate(xls.sheets()):
        # Fall back to the sheet's position when it has no name.
        sheetname = sheet.name or str(sheetCount)
        number_of_columns = sheet.ncols
        # Store the list under the same key that is used to fill it.
        unique_rows = outputDict[sheetname] = []
        seen = set()  # rows already kept, as tuples, for O(1) membership tests
        for row in range(1, sheet.nrows):
            sublist = [sheet.cell_value(row, col) for col in range(number_of_columns)]
            key = tuple(sublist)
            if key not in seen:
                seen.add(key)
                unique_rows.append(sublist)
        # print(x) with a single argument works on Python 2 and 3 alike.
        print(unique_rows)
    return outputDict
# Goes through the generated dictionary and writes each entry to a CSV file.
def writeToFiles(generatedDictionary):
    """Write every entry of *generatedDictionary* to its own CSV file.

    Args:
        generatedDictionary: Mapping of a name to a list of rows, each row
            being a sequence of cell values. The entry stored under ``key``
            is written to the file ``key + ".csv"``; an existing file of
            that name is overwritten.
    """
    import csv
    import sys

    for key, rows in generatedDictionary.items():
        path = key + ".csv"
        # The csv module needs a binary file on Python 2 and a text file
        # opened with newline="" on Python 3; anything else can produce
        # blank lines between rows on Windows.
        if sys.version_info[0] >= 3:
            csvFile = open(path, "w", newline="")
        else:
            csvFile = open(path, "wb")
        with csvFile:
            writer = csv.writer(csvFile)
            writer.writerows(rows)
如果你可以使用 pandas,像这样的东西可以工作:
import pandas as pd
# pd.read_excel() returns a DataFrame, which has neither a sheet list nor a
# parse() method; pd.ExcelFile exposes both.
workbook = pd.ExcelFile(filename)
for name in workbook.sheet_names:
    sheetDataFrame = workbook.parse(name)
    # drop_duplicates() keeps the first occurrence of each repeated row.
    filtered = sheetDataFrame.drop_duplicates()
    # index=False keeps the pandas row index out of the CSV file.
    filtered.to_csv(name + ".csv", index=False)
下面的代码在 Python 2.7 下对我有效:
def validateExcel(filename):
    """Return the unique (column 0, column 1) pairs of a workbook, in order.

    Every sheet is scanned from row 1 onwards (row 0 is skipped, presumably
    because it is a header row). Only the first occurrence of each pair is
    kept, so the result follows the order of the rows in the sheet.

    Args:
        filename: Path of the workbook, passed to ``xlrd.open_workbook``.

    Returns:
        A list of ``(value_in_column_0, value_in_column_1)`` tuples for the
        last sheet processed; an empty list if the workbook has no sheets.
        NOTE(review): the source lost its indentation, so whether the
        original returned after the first or the last sheet cannot be told
        from here -- the two agree for single-sheet workbooks.
    """
    xls = xlrd.open_workbook(filename)
    myset = []  # stays empty when the workbook has no sheets
    for sheet in xls.sheets():
        myset = []
        seen = set()
        # One pass with a "seen" set gives the same first-occurrence order
        # as sorted(set(mylist), key=mylist.index) without the repeated
        # list.index() scans.
        for row in range(1, sheet.nrows):
            pair = (sheet.cell_value(row, 0), sheet.cell_value(row, 1))
            if pair not in seen:
                seen.add(pair)
                myset.append(pair)
    return myset
这是我的解决方案。删除重复项并创建一个没有重复项的新文件。
import xlsxwriter
import xlrd
def remove_duplicates(source='Original.xlsx', target='Removed_Duplicates.xlsx'):
    """Copy a workbook to a new file, dropping repeated rows on every sheet.

    Each sheet of *source* is re-created under the same name in *target*.
    Rows are compared on all of their cell values; the first occurrence of
    a row is written and later identical rows are skipped. Row 0 is
    included, so a header row is copied like any other row.

    Args:
        source: Path of the workbook to read with ``xlrd``.
        target: Path of the workbook to create with ``xlsxwriter``.
    """
    read_file = xlrd.open_workbook(source)
    write_file = xlsxwriter.Workbook(target)
    for sheet in read_file.sheets():
        no_cols = sheet.ncols
        gen_sheets = write_file.add_worksheet(sheet.name)
        seen = set()  # rows already written, as tuples, for O(1) lookups
        r = 0  # next free row in the output sheet
        for row in range(sheet.nrows):
            line = tuple(sheet.cell(row, col).value for col in range(no_cols))
            if line in seen:
                continue
            seen.add(line)
            for col, value in enumerate(line):
                gen_sheets.write(r, col, value)
            # Advance only after a row was written so the output has no gaps.
            r += 1
    write_file.close()
我们可以使用 python pandas 包
要安装这个包:pip install pandas
参考:https://pandas.pydata.org/docs/getting_started/install.html
使用不带任何参数的 drop_duplicates()
import pandas as pd
data = pd.read_excel('your_excel_path_goes_here.xlsx')
#print(data)
# drop_duplicates() returns a new DataFrame and leaves `data` untouched, so
# the result has to be assigned for the de-duplication to take effect.
data = data.drop_duplicates()
按列名去重:
通过 subset 参数指定列名来使用 drop_duplicates()
import pandas as pd
data = pd.read_excel('your_excel_path_goes_here.xlsx')
#print(data)
# Rows count as duplicates when they agree on the columns listed in `subset`.
# drop_duplicates() returns a new DataFrame and leaves `data` untouched, so
# the result has to be assigned for the de-duplication to take effect.
data = data.drop_duplicates(subset=["YOUR_COLUMN_NAME_GOES_HERE"], keep="last")
keep="first"(默认值)表示:对每一组重复行,保留第一次出现的那一行,删除其余的重复行。
keep="last" 表示:对每一组重复行,保留最后一次出现的那一行,删除其余的重复行。
如果想把 Excel sheet 中出现重复的行全部删除(一行也不保留),可以指定 keep=False