解析文本文件 python 并转换为 pandas 数据帧
Parse text file in Python and convert to pandas dataframe
我正在尝试解析文本文件,将其转换为 pandas 数据帧。
文件(包括空行):
HEADING1
value 1

HEADING2
value 2

HEADING1,
value 11

HEADING2
value 12
应该转换成dataframe
:
HEADING1, HEADING2
value 1, value 2
value 11, value 12
我试过下面的代码。但是,我不确定使用 converters
是否可行?
df = pd.read_table(textfile, header=None, skip_blank_lines=True, delimiter='\n',
# converters= 'what should I use?',
names= 'HEADING1, HEADING2'.split() )
您自己解析文本并在 '\n\n'
上拆分
# Read the whole file inside a context manager so the handle is closed
# as soon as the text has been loaded.
with open(textfile) as fh:
    raw = fh.read()
# split file by `'\n\n'` to get rows
# split again by `'\n'` to get columns
# `zip` to get convenient lists of headers and values
cols, vals = zip(*[chunk.split('\n') for chunk in raw.split('\n\n')])
# construct a `pd.Series`
# note: the index taken from the `cols` list will not be unique
s = pd.Series(vals, cols)
# we'll need to enumerate the duplicated index values so that we can unstack
# we do this by creating a `pd.MultiIndex` with `cumcount` then the header values
s.index = [s.groupby(level=0).cumcount(), s.index]
# finally, `unstack`
s.unstack()
HEADING1 HEADING2
0 value 1 value 2
1 value 11 value 12
细分
列表推导式(list comprehension)
[line.split('\n') for line in StringIO(txt).read().split('\n\n')]
[['HEADING1', 'value 1'],
['HEADING2', 'value 2'],
['HEADING1', 'value 11'],
['HEADING2', 'value 12']]
和zip
list(zip(*[line.split('\n') for line in StringIO(txt).read().split('\n\n')]))
[('HEADING1', 'HEADING2', 'HEADING1', 'HEADING2'),
('value 1', 'value 2', 'value 11', 'value 12')]
设置cols
和vals
cols, vals = zip(*[line.split('\n') for line in StringIO(txt).read().split('\n\n')])
print(cols)
print()
print(vals)
('HEADING1', 'HEADING2', 'HEADING1', 'HEADING2')
('value 1', 'value 2', 'value 11', 'value 12')
制作系列
s = pd.Series(vals, cols)
s
HEADING1 value 1
HEADING2 value 2
HEADING1 value 11
HEADING2 value 12
dtype: object
枚举索引值
s.index = [s.groupby(level=0).cumcount(), s.index]
s
0 HEADING1 value 1
HEADING2 value 2
1 HEADING1 value 11
HEADING2 value 12
dtype: object
unstack
s.unstack()
HEADING1 HEADING2
0 value 1 value 2
1 value 11 value 12
完整演示
import pandas as pd
from io import StringIO

# Sample input: each record is a heading line followed by a value line, and
# records are separated by one blank line. The blank lines are required,
# because the parsing below splits records on '\n\n'.
txt = """HEADING1
value 1

HEADING2
value 2

HEADING1
value 11

HEADING2
value 12"""

# Split on blank lines to get [heading, value] pairs, then transpose with
# `zip` into one tuple of headings and one tuple of values.
cols, vals = zip(*[line.split('\n') for line in StringIO(txt).read().split('\n\n')])
# The index built from `cols` contains duplicate labels.
s = pd.Series(vals, cols)
# Number each repeated heading with `cumcount` so that (occurrence, heading)
# forms a unique two-level index that can be unstacked.
s.index = [s.groupby(level=0).cumcount(), s.index]
# Move the heading level into columns: one row per occurrence.
s.unstack()
HEADING1 HEADING2
0 value 1 value 2
1 value 11 value 12
使用defaultdict
from collections import defaultdict
from io import StringIO
import pandas as pd

# Sample input: heading/value pairs separated by one blank line. The blank
# lines are required, because records are split on '\n\n' below.
txt = """HEADING1
value 1

HEADING2
value 2

HEADING1
value 11

HEADING2
value 12"""

# Collect the values of each heading in order of appearance.
d = defaultdict(list)
# A plain loop is used because the body exists only for its side effect
# (appending); a list comprehension would build a throwaway list of None.
for record in StringIO(txt).read().split('\n\n'):
    k, v = record.split('\n')
    d[k].append(v)
# Each heading becomes a column; the n-th value of each heading forms row n.
pd.DataFrame(d)
HEADING1 HEADING2
0 value 1 value 2
1 value 11 value 12
我正在尝试解析文本文件,将其转换为 pandas 数据帧。 文件(包括空行):
HEADING1
value 1

HEADING2
value 2

HEADING1,
value 11

HEADING2
value 12
应该转换成dataframe
:
HEADING1, HEADING2
value 1, value 2
value 11, value 12
我试过下面的代码。但是,我不确定使用 converters
是否可行?
df = pd.read_table(textfile, header=None, skip_blank_lines=True, delimiter='\n',
# converters= 'what should I use?',
names= 'HEADING1, HEADING2'.split() )
您自己解析文本并在 '\n\n' 上拆分
# Read the whole file inside a context manager so the handle is closed
# as soon as the text has been loaded.
with open(textfile) as fh:
    raw = fh.read()
# split file by `'\n\n'` to get rows
# split again by `'\n'` to get columns
# `zip` to get convenient lists of headers and values
cols, vals = zip(*[chunk.split('\n') for chunk in raw.split('\n\n')])
# construct a `pd.Series`
# note: the index taken from the `cols` list will not be unique
s = pd.Series(vals, cols)
# we'll need to enumerate the duplicated index values so that we can unstack
# we do this by creating a `pd.MultiIndex` with `cumcount` then the header values
s.index = [s.groupby(level=0).cumcount(), s.index]
# finally, `unstack`
s.unstack()
HEADING1 HEADING2
0 value 1 value 2
1 value 11 value 12
细分
列表推导式(list comprehension)
[line.split('\n') for line in StringIO(txt).read().split('\n\n')]
[['HEADING1', 'value 1'],
['HEADING2', 'value 2'],
['HEADING1', 'value 11'],
['HEADING2', 'value 12']]
和zip
list(zip(*[line.split('\n') for line in StringIO(txt).read().split('\n\n')]))
[('HEADING1', 'HEADING2', 'HEADING1', 'HEADING2'),
('value 1', 'value 2', 'value 11', 'value 12')]
设置cols
和vals
cols, vals = zip(*[line.split('\n') for line in StringIO(txt).read().split('\n\n')])
print(cols)
print()
print(vals)
('HEADING1', 'HEADING2', 'HEADING1', 'HEADING2')
('value 1', 'value 2', 'value 11', 'value 12')
制作系列
s = pd.Series(vals, cols)
s
HEADING1 value 1
HEADING2 value 2
HEADING1 value 11
HEADING2 value 12
dtype: object
枚举索引值
s.index = [s.groupby(level=0).cumcount(), s.index]
s
0 HEADING1 value 1
HEADING2 value 2
1 HEADING1 value 11
HEADING2 value 12
dtype: object
unstack
s.unstack()
HEADING1 HEADING2
0 value 1 value 2
1 value 11 value 12
完整演示
import pandas as pd
from io import StringIO

# Sample input: each record is a heading line followed by a value line, and
# records are separated by one blank line. The blank lines are required,
# because the parsing below splits records on '\n\n'.
txt = """HEADING1
value 1

HEADING2
value 2

HEADING1
value 11

HEADING2
value 12"""

# Split on blank lines to get [heading, value] pairs, then transpose with
# `zip` into one tuple of headings and one tuple of values.
cols, vals = zip(*[line.split('\n') for line in StringIO(txt).read().split('\n\n')])
# The index built from `cols` contains duplicate labels.
s = pd.Series(vals, cols)
# Number each repeated heading with `cumcount` so that (occurrence, heading)
# forms a unique two-level index that can be unstacked.
s.index = [s.groupby(level=0).cumcount(), s.index]
# Move the heading level into columns: one row per occurrence.
s.unstack()
HEADING1 HEADING2
0 value 1 value 2
1 value 11 value 12
使用defaultdict
from collections import defaultdict
from io import StringIO
import pandas as pd

# Sample input: heading/value pairs separated by one blank line. The blank
# lines are required, because records are split on '\n\n' below.
txt = """HEADING1
value 1

HEADING2
value 2

HEADING1
value 11

HEADING2
value 12"""

# Collect the values of each heading in order of appearance.
d = defaultdict(list)
# A plain loop is used because the body exists only for its side effect
# (appending); a list comprehension would build a throwaway list of None.
for record in StringIO(txt).read().split('\n\n'):
    k, v = record.split('\n')
    d[k].append(v)
# Each heading becomes a column; the n-th value of each heading forms row n.
pd.DataFrame(d)
HEADING1 HEADING2
0 value 1 value 2
1 value 11 value 12