如何在 Python 中解析文本文件并将其转换为 JSON
How to parse text file in Python and convert to JSON
我有一个大文件,格式如下:
"string in quotes"
string
string
string
number
|-
...这会重复一段时间。我正在尝试将其转换为 JSON,因此每个块都是这样的:
"name": "string in quotes"
"description": "string"
"info": "string"
"author": "string"
"year": number
这是我目前拥有的:
import shutil
import os
import urllib
# Asker's unfinished attempt: copy 'unformatted.txt' into 'formatted.json'.
# The context manager closes both files even if an error occurs part-way.
with open('unformatted.txt', 'r') as myFile, open("formatted.json", "w") as newFile:
    newFile.write('{' + '\n' + 'list: {' + '\n')
    for line in myFile:
        # Placeholder from the question: write() still needs an argument here.
        newFile.write()  # this is where I'm not sure what to write
    newFile.write('}' + '\n' + '}')
我想也许可以对行号做取模运算,但不确定这是否是正确的方法。
我认为这可以解决问题。
import itertools
import json
# izip_longest was renamed to zip_longest in Python 3; support both.
zip_longest = getattr(itertools, 'zip_longest', None) or itertools.izip_longest

with open('unformatted.txt', 'r') as f_in, open('formatted.json', 'w') as f_out:
    # Passing the same file iterator six times makes zip_longest pull six
    # consecutive lines per step: five fields plus the "|-" separator line.
    # fillvalue='' pads a short final group with strings instead of None.
    for name, desc, info, author, yr, _separator in zip_longest(*[f_in] * 6, fillvalue=''):
        if not name.strip():
            # Blank or padded trailing group: nothing to convert.
            continue
        record = {
            # The source line already carries its own quotes; drop them so
            # json.dumps does not emit escaped, doubled quotes.
            "name": name.strip().strip('"'),
            "description": desc.strip(),
            "info": info.strip(),
            "author": author.strip(),
            # Raises ValueError if the year line is missing or not an integer.
            "year": int(yr.strip()),
        }
        # One JSON object per line, so the output can be parsed back line by line.
        f_out.write(json.dumps(record) + '\n')
您可以使用 itertools.groupby 将各个部分分组,然后用 json.dump
将这些字典写入您的 JSON 文件:
from itertools import groupby
import json
# Field names, in the order the values appear within each record.
names = ["name", "description", "info", "author", "year"]

with open("test.csv") as src, open("out.json", "w") as dst:
    stripped = (line.rstrip() for line in src)
    # groupby splits the stream into alternating runs of separator lines
    # (those starting with "|-") and the record lines between them.
    for is_separator, chunk in groupby(stripped, key=lambda text: text.startswith("|-")):
        if is_separator:
            continue
        # Pair each value with its field name and write one JSON object per line.
        json.dump(dict(zip(names, chunk)), dst)
        dst.write("\n")
输入:
"string in quotes"
string
string
string
number
|-
"other string in quotes"
string2
string2
string2
number2
输出:
{"author": "string", "name": "\"string in quotes\"", "description": "string", "info": "string", "year": "number"}
{"author": "string2", "name": "\"other string in quotes\"", "description": "string2", "info": "string2", "year": "number2"}
要访问只需遍历文件并加载:
In [6]: with open("out.json") as out:
   ...:     for line in out:
   ...:         print(json.loads(line))
   ...:
{'name': '"string in quotes"', 'info': 'string', 'author': 'string', 'year': 'number', 'description': 'string'}
{'name': '"other string in quotes"', 'info': 'string2', 'author': 'string2', 'year': 'number2', 'description': 'string2'}
这是一个完成基本工作的粗略示例。
它先用一个生成器把输入按每 6 行分成一批,再用另一个生成器为每个值配上对应的键。
import json
def read(path='input.txt'):
    """Return the lines of a text file with surrounding whitespace removed.

    Args:
        path: File to read. Defaults to 'input.txt', the path the original
            hard-coded, so existing ``read()`` calls behave as before.

    Returns:
        A list with one stripped string per line; blank lines are kept as
        empty strings.
    """
    with open(path, 'r') as f:
        # Iterating the file object avoids the intermediate readlines() list.
        return [line.strip() for line in f]
def batch(content, n=1):
    """Yield consecutive slices of ``content`` holding at most ``n`` items each.

    Args:
        content: A sliceable sequence (list, tuple, str, ...).
        n: Maximum size of each slice; must be at least 1.

    Yields:
        Slices of ``content`` in order. The last one may be shorter than ``n``.

    Raises:
        ValueError: If ``n`` is less than 1 (raised when iteration starts).
    """
    if n < 1:
        raise ValueError('n must be a positive integer')
    # Slicing already clamps at the end of the sequence, so no min() is needed.
    for start in range(0, len(content), n):
        yield content[start:start + n]
def emit(batched):
    """Yield ``(field_name, value)`` pairs for one batch of record lines.

    Args:
        batched: Sequence of values in the order name, description, info,
            author, year. Anything after the fifth item is ignored.

    Yields:
        Two-item tuples that can be passed straight to ``dict()``. A batch
        with fewer than five items yields only the fields it has, rather
        than raising IndexError.
    """
    fields = ('name', 'description', 'info', 'author', 'year')
    # zip stops at the shorter input, which drops extra trailing items and
    # tolerates a short final batch.
    for pair in zip(fields, batched):
        yield pair
# Load every stripped line, then cut the list into groups of six lines.
content = read()
batched = batch(content, 6)

# Turn each group into a dict keyed by field name.
res = []
for group in batched:
    res.append(dict(emit(group)))
print(res)

# Serialise the whole list as one indented JSON document.
with open('output.json', 'w') as out_file:
    json.dump(res, out_file, indent=4)
更新
使用这种方法,您可以轻松地挂钩格式化函数,因此 year 和 name 值将是正确的。
像这样扩展 emit 函数:
def emit(batched):
    """Yield ``(field_name, value)`` pairs with per-field formatting applied.

    Args:
        batched: Sequence of values in the order name, description, info,
            author, year. Anything after the fifth item is ignored.

    Yields:
        Two-item tuples that can be passed straight to ``dict()``. The name
        loses its enclosing double quotes and the year becomes an int when
        it parses as one. A batch with fewer than five items yields only
        the fields it has, rather than raising IndexError.
    """
    def _quotes(q):
        # Remove only the enclosing pair, so quotes inside the name survive.
        if len(q) >= 2 and q.startswith('"') and q.endswith('"'):
            return q[1:-1]
        return q

    def _pass(p):
        return p

    def _num(n):
        try:
            return int(n)
        except ValueError:
            # Leave non-numeric years untouched instead of failing the record.
            return n

    formatters = (
        ('name', _quotes),
        ('description', _pass),
        ('info', _pass),
        ('author', _pass),
        ('year', _num),
    )
    # zip stops at the shorter input, which drops extra trailing items and
    # tolerates a short final batch.
    for (name, func), value in zip(formatters, batched):
        yield name, func(value)
我有一个大文件,格式如下:
"string in quotes"
string
string
string
number
|-
...这会重复一段时间。我正在尝试将其转换为 JSON,因此每个块都是这样的:
"name": "string in quotes"
"description": "string"
"info": "string"
"author": "string"
"year": number
这是我目前拥有的:
import shutil
import os
import urllib
# Asker's unfinished attempt: copy 'unformatted.txt' into 'formatted.json'.
# The context manager closes both files even if an error occurs part-way.
with open('unformatted.txt', 'r') as myFile, open("formatted.json", "w") as newFile:
    newFile.write('{' + '\n' + 'list: {' + '\n')
    for line in myFile:
        # Placeholder from the question: write() still needs an argument here.
        newFile.write()  # this is where I'm not sure what to write
    newFile.write('}' + '\n' + '}')
我想也许可以对行号做取模运算,但不确定这是否是正确的方法。
我认为这可以解决问题。
import itertools
import json
# izip_longest was renamed to zip_longest in Python 3; support both.
zip_longest = getattr(itertools, 'zip_longest', None) or itertools.izip_longest

with open('unformatted.txt', 'r') as f_in, open('formatted.json', 'w') as f_out:
    # Passing the same file iterator six times makes zip_longest pull six
    # consecutive lines per step: five fields plus the "|-" separator line.
    # fillvalue='' pads a short final group with strings instead of None.
    for name, desc, info, author, yr, _separator in zip_longest(*[f_in] * 6, fillvalue=''):
        if not name.strip():
            # Blank or padded trailing group: nothing to convert.
            continue
        record = {
            # The source line already carries its own quotes; drop them so
            # json.dumps does not emit escaped, doubled quotes.
            "name": name.strip().strip('"'),
            "description": desc.strip(),
            "info": info.strip(),
            "author": author.strip(),
            # Raises ValueError if the year line is missing or not an integer.
            "year": int(yr.strip()),
        }
        # One JSON object per line, so the output can be parsed back line by line.
        f_out.write(json.dumps(record) + '\n')
您可以使用 itertools.groupby 将各个部分分组,然后用 json.dump
将这些字典写入您的 JSON 文件:
from itertools import groupby
import json
# Field names, in the order the values appear within each record.
names = ["name", "description", "info", "author", "year"]

with open("test.csv") as src, open("out.json", "w") as dst:
    stripped = (line.rstrip() for line in src)
    # groupby splits the stream into alternating runs of separator lines
    # (those starting with "|-") and the record lines between them.
    for is_separator, chunk in groupby(stripped, key=lambda text: text.startswith("|-")):
        if is_separator:
            continue
        # Pair each value with its field name and write one JSON object per line.
        json.dump(dict(zip(names, chunk)), dst)
        dst.write("\n")
输入:
"string in quotes"
string
string
string
number
|-
"other string in quotes"
string2
string2
string2
number2
输出:
{"author": "string", "name": "\"string in quotes\"", "description": "string", "info": "string", "year": "number"}
{"author": "string2", "name": "\"other string in quotes\"", "description": "string2", "info": "string2", "year": "number2"}
要访问只需遍历文件并加载:
In [6]: with open("out.json") as out:
   ...:     for line in out:
   ...:         print(json.loads(line))
   ...:
{'name': '"string in quotes"', 'info': 'string', 'author': 'string', 'year': 'number', 'description': 'string'}
{'name': '"other string in quotes"', 'info': 'string2', 'author': 'string2', 'year': 'number2', 'description': 'string2'}
这是一个完成基本工作的粗略示例。
它先用一个生成器把输入按每 6 行分成一批,再用另一个生成器为每个值配上对应的键。
import json
def read(path='input.txt'):
    """Return the lines of a text file with surrounding whitespace removed.

    Args:
        path: File to read. Defaults to 'input.txt', the path the original
            hard-coded, so existing ``read()`` calls behave as before.

    Returns:
        A list with one stripped string per line; blank lines are kept as
        empty strings.
    """
    with open(path, 'r') as f:
        # Iterating the file object avoids the intermediate readlines() list.
        return [line.strip() for line in f]
def batch(content, n=1):
    """Yield consecutive slices of ``content`` holding at most ``n`` items each.

    Args:
        content: A sliceable sequence (list, tuple, str, ...).
        n: Maximum size of each slice; must be at least 1.

    Yields:
        Slices of ``content`` in order. The last one may be shorter than ``n``.

    Raises:
        ValueError: If ``n`` is less than 1 (raised when iteration starts).
    """
    if n < 1:
        raise ValueError('n must be a positive integer')
    # Slicing already clamps at the end of the sequence, so no min() is needed.
    for start in range(0, len(content), n):
        yield content[start:start + n]
def emit(batched):
    """Yield ``(field_name, value)`` pairs for one batch of record lines.

    Args:
        batched: Sequence of values in the order name, description, info,
            author, year. Anything after the fifth item is ignored.

    Yields:
        Two-item tuples that can be passed straight to ``dict()``. A batch
        with fewer than five items yields only the fields it has, rather
        than raising IndexError.
    """
    fields = ('name', 'description', 'info', 'author', 'year')
    # zip stops at the shorter input, which drops extra trailing items and
    # tolerates a short final batch.
    for pair in zip(fields, batched):
        yield pair
# Load every stripped line, then cut the list into groups of six lines.
content = read()
batched = batch(content, 6)

# Turn each group into a dict keyed by field name.
res = []
for group in batched:
    res.append(dict(emit(group)))
print(res)

# Serialise the whole list as one indented JSON document.
with open('output.json', 'w') as out_file:
    json.dump(res, out_file, indent=4)
更新
使用这种方法,您可以轻松地挂钩格式化函数,因此 year 和 name 值将是正确的。
像这样扩展 emit 函数:
def emit(batched):
    """Yield ``(field_name, value)`` pairs with per-field formatting applied.

    Args:
        batched: Sequence of values in the order name, description, info,
            author, year. Anything after the fifth item is ignored.

    Yields:
        Two-item tuples that can be passed straight to ``dict()``. The name
        loses its enclosing double quotes and the year becomes an int when
        it parses as one. A batch with fewer than five items yields only
        the fields it has, rather than raising IndexError.
    """
    def _quotes(q):
        # Remove only the enclosing pair, so quotes inside the name survive.
        if len(q) >= 2 and q.startswith('"') and q.endswith('"'):
            return q[1:-1]
        return q

    def _pass(p):
        return p

    def _num(n):
        try:
            return int(n)
        except ValueError:
            # Leave non-numeric years untouched instead of failing the record.
            return n

    formatters = (
        ('name', _quotes),
        ('description', _pass),
        ('info', _pass),
        ('author', _pass),
        ('year', _num),
    )
    # zip stops at the shorter input, which drops extra trailing items and
    # tolerates a short final batch.
    for (name, func), value in zip(formatters, batched):
        yield name, func(value)