在 apache NIFI 中将 xls 文件转换为 csv 文件

Convert xls files to csv files in apache NIFI

大家好,我正在尝试在 Apache NiFi 数据流中将 .xls 文件转换为 .csv。我尝试了很多解决方案,但都没有结果,我甚至尝试过自己编写这样的脚本(script)。

提前致谢

将您的 ExecuteStreamCommand 配置为

并尝试在您的 Python 脚本中使用以下代码:

import csv
import os
import sys
from io import StringIO, BytesIO
import pandas as pd
import xlrd
from pandas import ExcelFile

def clean_cell(value):
    """Return one worksheet cell as single-line, ASCII-only text for the CSV.

    Missing values (NaN, None, NaT) become an empty string.  Text is reduced
    to ASCII by dropping every character that cannot be encoded, and embedded
    line feeds are removed so each record stays on one physical line.
    """
    if pd.isna(value):
        return ''
    if hasattr(value, 'encode'):
        # Deliberately lossy: non-ASCII characters are dropped, not replaced.
        value = value.encode('ascii', 'ignore').decode('ascii')
    return str(value).replace('\n', '')


def write_csv(frame, stream):
    """Write *frame* to *stream* as fully quoted CSV with CRLF line endings.

    The first record holds the column labels (converted with ``str`` so
    numeric labels work too); every following record is one DataFrame row.
    The frame's index is not written.
    """
    # csv.writer doubles embedded quote characters, so values that contain
    # '"' or ',' cannot break the record layout.
    writer = csv.writer(stream, quoting=csv.QUOTE_ALL, lineterminator='\r\n')
    writer.writerow([str(label) for label in frame.columns])
    # read_excel already consumed the worksheet's header row into
    # frame.columns, so every row produced by iterrows() is data and none
    # of them may be skipped.
    for _, row in frame.iterrows():
        writer.writerow([clean_cell(cell) for cell in row])


def main():
    """Read an .xls workbook from stdin and write sheet 'Sheet1' to stdout."""
    # An .xls file is binary: read raw bytes.  The getattr fallback covers
    # interpreters whose sys.stdin has no ``buffer`` attribute.
    raw_workbook = getattr(sys.stdin, 'buffer', sys.stdin).read()

    # xlrd diagnostics are sent to the null device so they cannot end up in
    # the CSV written to stdout; the handle is closed once parsing is done.
    with open(os.devnull, 'w') as quiet_log:
        workbook = xlrd.open_workbook(file_contents=raw_workbook,
                                      logfile=quiet_log)
        # NOTE(review): index_col=0 turns the first worksheet column into the
        # DataFrame index, so that column appears in neither the header nor
        # the data records.  Drop the argument if it must be exported.
        frame = pd.read_excel(workbook, sheet_name='Sheet1', index_col=0,
                              engine='xlrd')

    write_csv(frame, sys.stdout)


if __name__ == '__main__':
    main()

你试过 ConvertExcelToCSVProcessor 吗?如果试过但它不起作用,您能分享相关的错误信息、日志等吗?

    import csv
import os
import io
import sys
from io import StringIO, BytesIO
import pandas as pd
import xlrd
from pandas import ExcelFile
import petl as etl

def clean_cell(value):
    """Return one worksheet cell as single-line, ASCII-only text for the CSV.

    Missing values (NaN, None, NaT) become an empty string.  Text is reduced
    to ASCII by dropping every character that cannot be encoded, and embedded
    line feeds are removed so each record stays on one physical line.
    """
    if pd.isna(value):
        return ''
    if hasattr(value, 'encode'):
        # Deliberately lossy: non-ASCII characters are dropped, not replaced.
        value = value.encode('ascii', 'ignore').decode('ascii')
    return str(value).replace('\n', '')


def write_csv(frame, stream):
    """Write *frame* to *stream* as fully quoted CSV with CRLF line endings.

    The first record holds the column labels (converted with ``str`` so
    numeric labels work too); every following record is one DataFrame row.
    The frame's index is not written.
    """
    # csv.writer doubles embedded quote characters, so values that contain
    # '"' or ',' cannot break the record layout.
    writer = csv.writer(stream, quoting=csv.QUOTE_ALL, lineterminator='\r\n')
    writer.writerow([str(label) for label in frame.columns])
    # read_excel already consumed the worksheet's header row into
    # frame.columns, so every row produced by iterrows() is data and none
    # of them may be skipped.
    for _, row in frame.iterrows():
        writer.writerow([clean_cell(cell) for cell in row])


def main():
    """Read an .xls workbook from stdin and write sheet 'Sheet1' to stdout."""
    # stdin is a pipe and cannot seek, while the Excel reader needs random
    # access; buffer the whole workbook in memory first.
    workbook_bytes = BytesIO(sys.stdin.buffer.read())

    # NOTE(review): index_col=0 turns the first worksheet column into the
    # DataFrame index, so that column appears in neither the header nor the
    # data records.  Drop the argument if it must be exported.
    frame = pd.read_excel(workbook_bytes, sheet_name='Sheet1', index_col=0,
                          engine='xlrd')

    write_csv(frame, sys.stdout)


if __name__ == '__main__':
    main()