将 Python 输出写为 xlsx
Writing Python output as xlsx
我想对给定路径中可用的所有文件(库)执行相同的功能(给出输出 A、B、C、D)。我正在尝试将输出 (A、B、C、D) 写入 xlsx 中 sheet 的四个不同列中。此外,xlsx 的 sheet 名称应与路径中可用的相关文件相同。
我写了下面的代码:
def create_xlsx_file(xlsx_name, file_path):
    """Create an empty workbook, then add one sheet per ``.txt`` file found.

    Args:
        xlsx_name: Path of the xlsx file to create. A relative path is
            resolved against the working directory in effect when this
            function is called.
        file_path: Directory that holds the ``.txt`` library files.

    Side effects:
        Changes the process working directory to ``file_path``; ``main`` is
        then called with the bare file name of each library.
    """
    # Pin the output location before changing directory. Otherwise a relative
    # name would be created in the old directory but looked up in the new one.
    xlsx_name = os.path.abspath(xlsx_name)

    # Write an empty workbook so that main() has an existing file to load.
    workbook = xlsxwriter.Workbook(xlsx_name)
    workbook.close()

    libraries = os.listdir(file_path)
    # os.chdir() returns None, so its result is deliberately not stored.
    os.chdir(file_path)
    for library in libraries:
        # One sheet per text library, named after the file.
        if library.endswith('.txt'):
            main(library, xlsx_name)
def main(library, xlsx_name):
    """Copy the first four space-separated fields of each line into a new sheet.

    Args:
        library: Name (path) of the text file to read. It is also used as
            the title of the worksheet that is created.
        xlsx_name: Path of an existing xlsx workbook. It is loaded, extended
            with the new sheet and saved back to the same path.

    Raises:
        IndexError: If a line has fewer than four space-separated fields.
    """
    workbook = openpyxl.load_workbook(xlsx_name)
    # Insert the new sheet at position 0, titled after the library file.
    sheet = workbook.create_sheet(library, 0)

    # Header row.
    headers = ('value_A', 'value_B', 'value_C', 'value_D')
    for column, header in enumerate(headers, start=1):
        sheet.cell(column=column, row=1, value=header)

    # The file handle gets its own name: rebinding ``library`` to the open
    # file would break every later use of it as the sheet title.
    with open(library, 'r') as source:
        # Data starts on row 2. Tracking the row number here avoids asking
        # the sheet for max_row again for every single cell.
        for row, line in enumerate(source, start=2):
            # Drop the line terminator so it does not end up in the 4th cell.
            fields = line.rstrip('\n').split(' ')
            values = (fields[0], fields[1], fields[2], fields[3])
            for column, value in enumerate(values, start=1):
                sheet.cell(column=column, row=row, value=value)

    # A single save at the end persists both the header and the data rows.
    workbook.save(xlsx_name)
    print(f'library {library} has been written at {os.getcwd()}')
这段代码对我来说绝对没问题,但写 xlsx 文件太慢了,因为我的路径有数百个 .txt 库,每个库都有超过数百万行。
我可以将输出(A,B,C,D)保存为.txt格式,然后可以手动编写xlsx文件,但是很费力。
有什么办法可以加快这个过程吗?或任何其他快速 xlsx 编写器可用?
任何帮助将不胜感激。
谢谢
好吧,如果我没理解错的话,你有一个 TXT 文件,其中每行只有 4 个单词。
That is why you are doing:
A=line.split(' ')[0]
B=line.split(' ')[1]
C=line.split(' ')[2]
D=line.split(' ')[3]
如果这是真的,您可以将每个 TXT 文件作为 data-frame 并使用 column/row 拆分技术并将值分配给列。
然后您可以根据需要将结果保存在 CSV 文件中。
这种方式比正常循环快得多。即使一行中的单词数量不均匀,您仍然可以使用此方法并仅对前 4 列进行子集化,这也应该可以解决您的问题。
根据我的经验 pandas 库处理信息的速度非常快,并且具有导出 xlsx 格式数据的功能。
您可以创建一个空的 DataFrame
data = pd.DataFrame()
将每一行的值保存在一个 pd.Series 中。例如:
row = pd.Series(data=[A,B,C,D], index = ['value_A', 'value_B', 'value_C', 'value_D'])
index 参数是列的名称,data 参数是每行的值。
将每一行添加到 DataFrame
# DataFrame.append was removed in pandas 2.0; concatenate the Series as a
# one-row frame instead (its index becomes the column names).
data = pd.concat([data, row.to_frame().T], ignore_index=True)
并且只需将 DataFrame 导出到 xlsx
data.to_excel("output.xlsx")
参考资料
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_excel.html
不要忘记安装 pandas 库并将其导入到您的程序中
import pandas as pd
希望对你有用。
从您的代码看来,您似乎不需要 xlsx 的任何格式或功能,如果您只需要集中数据,一个简单的 csv 只需对您的代码进行微小的更改就可以了
import csv
def create_xlsx_file(xlsx_name, file_path):
    """Write a CSV header, then append the rows of every ``.txt`` file found.

    Args:
        xlsx_name: Path of the output file. Despite the parameter name, the
            file is written in CSV format.
        file_path: Directory that holds the ``.txt`` library files.
    """
    # newline='' lets the csv module control the line endings itself.
    with open(xlsx_name, 'w', encoding='UTF8', newline='') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(['value_A', 'value_B', 'value_C', 'value_D'])

    # The output file is closed again before main() re-opens it for appending.
    for library in os.listdir(file_path):
        if library.endswith('.txt'):
            # Pass the full path: the working directory is not changed here,
            # so a bare file name would only resolve when run from file_path.
            main(os.path.join(file_path, library), xlsx_name)
def main(library, xlsx_name):
    """Append the first four space-separated fields of each line to a CSV file.

    Args:
        library: Path of the text file to read.
        xlsx_name: Path of the CSV file to append to (created if missing).

    Raises:
        IndexError: If a line has fewer than four space-separated fields.
    """
    # newline='' lets the csv module control the line endings itself.
    with open(xlsx_name, 'a', encoding='UTF8', newline='') as output_file, \
            open(library, 'r', encoding='UTF8') as input_file:
        writer = csv.writer(output_file)
        # Stream line by line rather than loading the whole file, and split
        # each line only once.
        for line in input_file:
            fields = line.rstrip('\n').split(' ')
            writer.writerow([fields[0], fields[1], fields[2], fields[3]])
    print(f'library {library} has been written at {os.getcwd()}')
我找到了一种将我的数据保存到 excel 中的更快方法:
由于我的输出是 for 循环的结果,因此首先将输出(A、B、C、D)保存到字典中,然后使用 pandas 保存到 excel 中:
def create_xlsx_file(xlsx_name, file_path):
    """Create an empty workbook, then add one sheet per ``.txt`` file found.

    Args:
        xlsx_name: Path of the xlsx file to create. A relative path is
            resolved against the working directory in effect when this
            function is called.
        file_path: Directory that holds the ``.txt`` library files.

    Side effects:
        Changes the process working directory to ``file_path``; ``main`` is
        then called with the bare file name of each library.
    """
    # Pin the output location before changing directory. Otherwise a relative
    # name would be created in the old directory but looked up in the new one.
    xlsx_name = os.path.abspath(xlsx_name)

    # Write an empty workbook so that main() has an existing file to extend.
    workbook = xlsxwriter.Workbook(xlsx_name)
    workbook.close()

    libraries = os.listdir(file_path)
    # os.chdir() returns None, so its result is deliberately not stored.
    os.chdir(file_path)
    for library in libraries:
        # One sheet per text library, named after the file.
        if library.endswith('.txt'):
            main(library, xlsx_name)
def main(library, xlsx_name):
    """Write the first four space-separated fields of each line to a sheet.

    The rows are collected in memory and written in one go through pandas.

    Args:
        library: Name (path) of the text file to read. It is also used as
            the name of the sheet that is written.
        xlsx_name: Path of an existing xlsx workbook to add the sheet to.

    Raises:
        IndexError: If a line has fewer than four space-separated fields.
    """
    columns = ['label_A', 'label_B', 'label_C', 'label_D']

    # The file handle gets its own name: rebinding ``library`` to the open
    # file would pass a file object, not the file name, as the sheet name.
    rows = []
    with open(library, 'r') as source:
        for line in source:
            # Drop the line terminator so it does not end up in the 4th cell.
            fields = line.rstrip('\n').split(' ')
            rows.append((fields[0], fields[1], fields[2], fields[3]))

    df = pd.DataFrame(rows, columns=columns)

    # Open the workbook in append mode: DataFrame.to_excel(path) would
    # rewrite the whole file and drop the sheets written by earlier calls.
    # NOTE(review): if_sheet_exists needs pandas >= 1.3 - confirm the
    # installed version.
    with pd.ExcelWriter(xlsx_name, engine='openpyxl', mode='a',
                        if_sheet_exists='replace') as writer:
        # pandas writes the header row itself; index=False keeps the sheet
        # to exactly the four data columns.
        df.to_excel(writer, sheet_name=library, index=False)

    print(f'library {library} has been written at {os.getcwd()}')
我想对给定路径中可用的所有文件(库)执行相同的功能(给出输出 A、B、C、D)。我正在尝试将输出 (A、B、C、D) 写入 xlsx 中 sheet 的四个不同列中。此外,xlsx 的 sheet 名称应与路径中可用的相关文件相同。
我写了下面的代码:
def create_xlsx_file(xlsx_name, file_path):
    """Create an empty workbook, then add one sheet per ``.txt`` file found.

    Args:
        xlsx_name: Path of the xlsx file to create. A relative path is
            resolved against the working directory in effect when this
            function is called.
        file_path: Directory that holds the ``.txt`` library files.

    Side effects:
        Changes the process working directory to ``file_path``; ``main`` is
        then called with the bare file name of each library.
    """
    # Pin the output location before changing directory. Otherwise a relative
    # name would be created in the old directory but looked up in the new one.
    xlsx_name = os.path.abspath(xlsx_name)

    # Write an empty workbook so that main() has an existing file to load.
    workbook = xlsxwriter.Workbook(xlsx_name)
    workbook.close()

    libraries = os.listdir(file_path)
    # os.chdir() returns None, so its result is deliberately not stored.
    os.chdir(file_path)
    for library in libraries:
        # One sheet per text library, named after the file.
        if library.endswith('.txt'):
            main(library, xlsx_name)
def main(library, xlsx_name):
    """Copy the first four space-separated fields of each line into a new sheet.

    Args:
        library: Name (path) of the text file to read. It is also used as
            the title of the worksheet that is created.
        xlsx_name: Path of an existing xlsx workbook. It is loaded, extended
            with the new sheet and saved back to the same path.

    Raises:
        IndexError: If a line has fewer than four space-separated fields.
    """
    workbook = openpyxl.load_workbook(xlsx_name)
    # Insert the new sheet at position 0, titled after the library file.
    sheet = workbook.create_sheet(library, 0)

    # Header row.
    headers = ('value_A', 'value_B', 'value_C', 'value_D')
    for column, header in enumerate(headers, start=1):
        sheet.cell(column=column, row=1, value=header)

    # The file handle gets its own name: rebinding ``library`` to the open
    # file would break every later use of it as the sheet title.
    with open(library, 'r') as source:
        # Data starts on row 2. Tracking the row number here avoids asking
        # the sheet for max_row again for every single cell.
        for row, line in enumerate(source, start=2):
            # Drop the line terminator so it does not end up in the 4th cell.
            fields = line.rstrip('\n').split(' ')
            values = (fields[0], fields[1], fields[2], fields[3])
            for column, value in enumerate(values, start=1):
                sheet.cell(column=column, row=row, value=value)

    # A single save at the end persists both the header and the data rows.
    workbook.save(xlsx_name)
    print(f'library {library} has been written at {os.getcwd()}')
这段代码对我来说绝对没问题,但写 xlsx 文件太慢了,因为我的路径有数百个 .txt 库,每个库都有超过数百万行。
我可以将输出(A,B,C,D)保存为.txt格式,然后可以手动编写xlsx文件,但是很费力。
有什么办法可以加快这个过程吗?或任何其他快速 xlsx 编写器可用? 任何帮助将不胜感激。 谢谢
好吧,如果我没理解错的话,你有一个 TXT 文件,其中每行只有 4 个单词。
That is why you are doing:
A=line.split(' ')[0]
B=line.split(' ')[1]
C=line.split(' ')[2]
D=line.split(' ')[3]
如果这是真的,您可以将每个 TXT 文件作为 data-frame 并使用 column/row 拆分技术并将值分配给列。
然后您可以根据需要将结果保存在 CSV 文件中。 这种方式比正常循环快得多。即使一行中的单词数量不均匀,您仍然可以使用此方法并仅对前 4 列进行子集化,这也应该可以解决您的问题。
根据我的经验 pandas 库处理信息的速度非常快,并且具有导出 xlsx 格式数据的功能。
您可以创建一个空的 DataFrame
data = pd.DataFrame()
将每一行的值保存在一个 pd.Series 中。例如:
row = pd.Series(data=[A,B,C,D], index = ['value_A', 'value_B', 'value_C', 'value_D'])
index 参数是列的名称,data 参数是每行的值。
将每一行添加到 DataFrame
# DataFrame.append was removed in pandas 2.0; concatenate the Series as a
# one-row frame instead (its index becomes the column names).
data = pd.concat([data, row.to_frame().T], ignore_index=True)
并且只需将 DataFrame 导出到 xlsx
data.to_excel("output.xlsx")
参考资料 https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_excel.html
不要忘记安装 pandas 库并将其导入到您的程序中
import pandas as pd
希望对你有用。
从您的代码看来,您似乎不需要 xlsx 的任何格式或功能,如果您只需要集中数据,一个简单的 csv 只需对您的代码进行微小的更改就可以了
import csv
def create_xlsx_file(xlsx_name, file_path):
    """Write a CSV header, then append the rows of every ``.txt`` file found.

    Args:
        xlsx_name: Path of the output file. Despite the parameter name, the
            file is written in CSV format.
        file_path: Directory that holds the ``.txt`` library files.
    """
    # newline='' lets the csv module control the line endings itself.
    with open(xlsx_name, 'w', encoding='UTF8', newline='') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(['value_A', 'value_B', 'value_C', 'value_D'])

    # The output file is closed again before main() re-opens it for appending.
    for library in os.listdir(file_path):
        if library.endswith('.txt'):
            # Pass the full path: the working directory is not changed here,
            # so a bare file name would only resolve when run from file_path.
            main(os.path.join(file_path, library), xlsx_name)
def main(library, xlsx_name):
    """Append the first four space-separated fields of each line to a CSV file.

    Args:
        library: Path of the text file to read.
        xlsx_name: Path of the CSV file to append to (created if missing).

    Raises:
        IndexError: If a line has fewer than four space-separated fields.
    """
    # newline='' lets the csv module control the line endings itself.
    with open(xlsx_name, 'a', encoding='UTF8', newline='') as output_file, \
            open(library, 'r', encoding='UTF8') as input_file:
        writer = csv.writer(output_file)
        # Stream line by line rather than loading the whole file, and split
        # each line only once.
        for line in input_file:
            fields = line.rstrip('\n').split(' ')
            writer.writerow([fields[0], fields[1], fields[2], fields[3]])
    print(f'library {library} has been written at {os.getcwd()}')
我找到了一种将我的数据保存到 excel 中的更快方法: 由于我的输出是 for 循环的结果,因此首先将输出(A、B、C、D)保存到字典中,然后使用 pandas.
# The sentence above continues on this line in the original text:
# "... and then save it to Excel using pandas."
def create_xlsx_file(xlsx_name, file_path):
    """Create an empty workbook, then add one sheet per ``.txt`` file found.

    Args:
        xlsx_name: Path of the xlsx file to create. A relative path is
            resolved against the working directory in effect when this
            function is called.
        file_path: Directory that holds the ``.txt`` library files.

    Side effects:
        Changes the process working directory to ``file_path``; ``main`` is
        then called with the bare file name of each library.
    """
    # Pin the output location before changing directory. Otherwise a relative
    # name would be created in the old directory but looked up in the new one.
    xlsx_name = os.path.abspath(xlsx_name)

    # Write an empty workbook so that main() has an existing file to extend.
    workbook = xlsxwriter.Workbook(xlsx_name)
    workbook.close()

    libraries = os.listdir(file_path)
    # os.chdir() returns None, so its result is deliberately not stored.
    os.chdir(file_path)
    for library in libraries:
        # One sheet per text library, named after the file.
        if library.endswith('.txt'):
            main(library, xlsx_name)
def main(library, xlsx_name):
    """Write the first four space-separated fields of each line to a sheet.

    The rows are collected in memory and written in one go through pandas.

    Args:
        library: Name (path) of the text file to read. It is also used as
            the name of the sheet that is written.
        xlsx_name: Path of an existing xlsx workbook to add the sheet to.

    Raises:
        IndexError: If a line has fewer than four space-separated fields.
    """
    columns = ['label_A', 'label_B', 'label_C', 'label_D']

    # The file handle gets its own name: rebinding ``library`` to the open
    # file would pass a file object, not the file name, as the sheet name.
    rows = []
    with open(library, 'r') as source:
        for line in source:
            # Drop the line terminator so it does not end up in the 4th cell.
            fields = line.rstrip('\n').split(' ')
            rows.append((fields[0], fields[1], fields[2], fields[3]))

    df = pd.DataFrame(rows, columns=columns)

    # Open the workbook in append mode: DataFrame.to_excel(path) would
    # rewrite the whole file and drop the sheets written by earlier calls.
    # NOTE(review): if_sheet_exists needs pandas >= 1.3 - confirm the
    # installed version.
    with pd.ExcelWriter(xlsx_name, engine='openpyxl', mode='a',
                        if_sheet_exists='replace') as writer:
        # pandas writes the header row itself; index=False keeps the sheet
        # to exactly the four data columns.
        df.to_excel(writer, sheet_name=library, index=False)

    print(f'library {library} has been written at {os.getcwd()}')