Python JSON to CSV - bad encoding, UnicodeDecodeError: 'charmap' codec can't decode byte
Python JSON to CSV - bad encoding, UnicodeDecodeError: 'charmap' codec can't decode byte
我在将嵌套 JSON 转换为 CSV 时遇到问题。为此,我使用 https://github.com/vinay20045/json-to-csv(分叉一点以支持 python 3.4),这里是完整的 json-to-csv.py 文件。
如果我设置
,转换工作正常
#Base Condition
else:
reduced_item[str(key)] = (str(value)).encode('utf8','ignore')
和
fp = open(json_file_path, 'r', encoding='utf-8')
但是当我将 csv 导入 MS Excel 时,我看到了错误的西里尔字符,例如 \xe0\xf1,英文文本是可以的。
尝试设置 encode('cp1251','ignore') 但后来出现错误
UnicodeDecodeError:'charmap' 编解码器无法解码位置 Y 中的字节 X:字符映射到(如此处 UnicodeDecodeError: 'charmap' codec can't decode byte X in position Y: character maps to <undefined>)
import sys
import json
import csv
##
# This function converts an item like
# {
# "item_1":"value_11",
# "item_2":"value_12",
# "item_3":"value_13",
# "item_4":["sub_value_14", "sub_value_15"],
# "item_5":{
# "sub_item_1":"sub_item_value_11",
# "sub_item_2":["sub_item_value_12", "sub_item_value_13"]
# }
# }
# To
# {
# "node_item_1":"value_11",
# "node_item_2":"value_12",
# "node_item_3":"value_13",
# "node_item_4_0":"sub_value_14",
# "node_item_4_1":"sub_value_15",
# "node_item_5_sub_item_1":"sub_item_value_11",
# "node_item_5_sub_item_2_0":"sub_item_value_12",
# "node_item_5_sub_item_2_0":"sub_item_value_13"
# }
##
def reduce_item(key, value):
global reduced_item
#Reduction Condition 1
if type(value) is list:
i=0
for sub_item in value:
reduce_item(key+'_'+str(i), sub_item)
i=i+1
#Reduction Condition 2
elif type(value) is dict:
sub_keys = value.keys()
for sub_key in sub_keys:
reduce_item(key+'_'+str(sub_key), value[sub_key])
#Base Condition
else:
reduced_item[str(key)] = (str(value)).encode('cp1251','ignore')
if __name__ == "__main__":
if len(sys.argv) != 4:
print("\nUsage: python json_to_csv.py <node_name> <json_in_file_path> <csv_out_file_path>\n")
else:
#Reading arguments
node = sys.argv[1]
json_file_path = sys.argv[2]
csv_file_path = sys.argv[3]
fp = open(json_file_path, 'r', encoding='cp1251')
json_value = fp.read()
raw_data = json.loads(json_value)
processed_data = []
header = []
for item in raw_data[node]:
reduced_item = {}
reduce_item(node, item)
header += reduced_item.keys()
processed_data.append(reduced_item)
header = list(set(header))
header.sort()
with open(csv_file_path, 'wt+') as f:#wb+ for python 2.7
writer = csv.DictWriter(f, header, quoting=csv.QUOTE_ALL, delimiter=',')
writer.writeheader()
for row in processed_data:
writer.writerow(row)
print("Just completed writing csv file with %d columns" % len(header))
如何正确转换西里尔字母以及我想跳过坏字符?
您需要知道您要打开哪个文件的 cyrylic 编码。
例如,在 python3:
中就足够了
with open(args.input_file, 'r', encoding="cp866") as input_file:
data = input_file.read()
structure = json.loads(data)
在python3中数据变量自动为utf-8。在 python2 中,向 json 提供输入可能有问题。
也尝试在python解释行中打印出来,看看符号是否正确。没有输入文件很难判断是否一切正确。另外您确定这是 python 而不是 excel 相关的问题吗?您是否尝试过在 notepad++ 或类似的编辑器编码中打开?
使用编码最重要的事情是检查输入和输出是否正确。我建议看这里。
也许您可以使用 chardet 来检测文件的编码。
import chardet
File='arq.GeoJson'
enc=chardet.detect(open(File,'rb').read())['encoding']
with open(File,'r', encoding = enc) as f:
data=json.load(f)
f.close()
这避免了 'to kick' 编码。
我在将嵌套 JSON 转换为 CSV 时遇到问题。为此,我使用 https://github.com/vinay20045/json-to-csv(分叉一点以支持 python 3.4),这里是完整的 json-to-csv.py 文件。 如果我设置
,转换工作正常 #Base Condition
else:
reduced_item[str(key)] = (str(value)).encode('utf8','ignore')
和
fp = open(json_file_path, 'r', encoding='utf-8')
但是当我将 csv 导入 MS Excel 时,我看到了错误的西里尔字符,例如 \xe0\xf1,英文文本是可以的。 尝试设置 encode('cp1251','ignore') 但后来出现错误 UnicodeDecodeError:'charmap' 编解码器无法解码位置 Y 中的字节 X:字符映射到(如此处 UnicodeDecodeError: 'charmap' codec can't decode byte X in position Y: character maps to <undefined>)
import sys
import json
import csv
##
# This function converts an item like
# {
# "item_1":"value_11",
# "item_2":"value_12",
# "item_3":"value_13",
# "item_4":["sub_value_14", "sub_value_15"],
# "item_5":{
# "sub_item_1":"sub_item_value_11",
# "sub_item_2":["sub_item_value_12", "sub_item_value_13"]
# }
# }
# To
# {
# "node_item_1":"value_11",
# "node_item_2":"value_12",
# "node_item_3":"value_13",
# "node_item_4_0":"sub_value_14",
# "node_item_4_1":"sub_value_15",
# "node_item_5_sub_item_1":"sub_item_value_11",
# "node_item_5_sub_item_2_0":"sub_item_value_12",
# "node_item_5_sub_item_2_0":"sub_item_value_13"
# }
##
def reduce_item(key, value):
global reduced_item
#Reduction Condition 1
if type(value) is list:
i=0
for sub_item in value:
reduce_item(key+'_'+str(i), sub_item)
i=i+1
#Reduction Condition 2
elif type(value) is dict:
sub_keys = value.keys()
for sub_key in sub_keys:
reduce_item(key+'_'+str(sub_key), value[sub_key])
#Base Condition
else:
reduced_item[str(key)] = (str(value)).encode('cp1251','ignore')
if __name__ == "__main__":
if len(sys.argv) != 4:
print("\nUsage: python json_to_csv.py <node_name> <json_in_file_path> <csv_out_file_path>\n")
else:
#Reading arguments
node = sys.argv[1]
json_file_path = sys.argv[2]
csv_file_path = sys.argv[3]
fp = open(json_file_path, 'r', encoding='cp1251')
json_value = fp.read()
raw_data = json.loads(json_value)
processed_data = []
header = []
for item in raw_data[node]:
reduced_item = {}
reduce_item(node, item)
header += reduced_item.keys()
processed_data.append(reduced_item)
header = list(set(header))
header.sort()
with open(csv_file_path, 'wt+') as f:#wb+ for python 2.7
writer = csv.DictWriter(f, header, quoting=csv.QUOTE_ALL, delimiter=',')
writer.writeheader()
for row in processed_data:
writer.writerow(row)
print("Just completed writing csv file with %d columns" % len(header))
如何正确转换西里尔字母以及我想跳过坏字符?
您需要知道您要打开哪个文件的 cyrylic 编码。 例如,在 python3:
中就足够了with open(args.input_file, 'r', encoding="cp866") as input_file:
data = input_file.read()
structure = json.loads(data)
在python3中数据变量自动为utf-8。在 python2 中,向 json 提供输入可能有问题。
也尝试在python解释行中打印出来,看看符号是否正确。没有输入文件很难判断是否一切正确。另外您确定这是 python 而不是 excel 相关的问题吗?您是否尝试过在 notepad++ 或类似的编辑器编码中打开?
使用编码最重要的事情是检查输入和输出是否正确。我建议看这里。
也许您可以使用 chardet 来检测文件的编码。
import chardet
File='arq.GeoJson'
enc=chardet.detect(open(File,'rb').read())['encoding']
with open(File,'r', encoding = enc) as f:
data=json.load(f)
f.close()
这避免了 'to kick' 编码。