使用 pd.read_clipboard 复制 MultiIndex 数据帧?
Copying MultiIndex dataframes with pd.read_clipboard?
给定如下这样一个带 MultiIndex 的 DataFrame:
          C
A   B
1.1 111  20
    222  31
3.3 222  24
    333  65
5.5 333  22
6.6 777  74
如何使用 pd.read_clipboard 读取它?我试过这样写:
df = pd.read_clipboard(index_col=[0, 1])
但是它抛出一个错误:
ParserError: Error tokenizing data. C error: Expected 2 fields in line 3, saw 3
我该如何解决这个问题?
更新:现在该函数直接解析剪贴板中的内容,即无需事先把数据保存下来。
def read_clipboard_mi(index_names_row=None, **kwargs):
    """Read a printed MultiIndex DataFrame from the clipboard.

    The clipboard text is parsed as fixed-width columns with
    ``pandas.read_fwf``. Empty cells in the leading index columns are
    forward-filled and those columns are then set as the index.

    Parameters
    ----------
    index_names_row : int, optional
        Zero-based number of the clipboard line holding the index level
        names (e.g. ``1`` when the names are on the second line). That line
        is split on whitespace to obtain the names and is skipped while
        parsing; any ``skiprows`` given in ``kwargs`` is replaced. Falsy
        values (``None``, ``0``) mean "no index-names line": the index is
        then built from the leading columns, as many as there are columns
        whose label contains ``Unnamed:``.
    **kwargs
        Forwarded to ``pandas.read_fwf``. ``encoding`` is consumed here and
        must be utf-8 (or ``None``).

    Returns
    -------
    DataFrame
        The parsed frame with the detected index columns set as its index.
        If no index columns can be detected the frame is returned as parsed.

    Raises
    ------
    NotImplementedError
        If ``encoding`` is something other than utf-8.
    TypeError
        If ``index_names_row`` is truthy but not an ``int``.
    IndexError
        If the index-names line is out of range or holds more names than
        there are parsed columns.
    """
    encoding = kwargs.pop('encoding', 'utf-8')
    # only utf-8 is valid for passed value because that's what clipboard
    # supports
    if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
        raise NotImplementedError(
            'reading from clipboard only supports utf-8 encoding')

    # Validate before touching the clipboard so bad arguments fail fast.
    if index_names_row and not isinstance(index_names_row, int):
        raise TypeError('[index_names_row] must be of [int] data type')

    # Local imports keep the snippet self-contained when pasted on its own.
    from io import StringIO

    from pandas import read_fwf
    from pandas.io.clipboard import clipboard_get

    data = clipboard_get()
    # Some clipboard backends hand back bytes; decode so the text can be
    # split into lines and wrapped in StringIO below.
    if isinstance(data, bytes):
        data = data.decode(encoding or 'utf-8')

    index_names = None
    if index_names_row:
        index_names = data.splitlines()[index_names_row].split()
        # The names line is not data, so keep the parser from reading it.
        kwargs['skiprows'] = [index_names_row]

    df = read_fwf(StringIO(data), **kwargs)

    if index_names:
        if len(index_names) > len(df.columns):
            raise IndexError(
                'index-names line has more names than parsed columns')
        idx_cols = df.columns[:len(index_names)].tolist()
    else:
        # Column labels may be non-strings (e.g. with header=None), so test
        # them individually instead of going through the .str accessor.
        unnamed_cols = [col for col in df.columns
                        if isinstance(col, str) and 'Unnamed:' in col]
        if not unnamed_cols:
            # Nothing looks like an index column: return the plain frame.
            return df
        idx_cols = df.columns[:len(unnamed_cols)].tolist()
        index_names = [None] * len(idx_cols)

    # Fill the empty index cells downwards from the last non-empty label.
    df[idx_cols] = df[idx_cols].ffill()
    return df.set_index(idx_cols).rename_axis(index_names)
测试没有索引名称的多索引 DF:
In [231]: read_clipboard_mi()
Out[231]:
          C
1.1 111  20
    222  31
3.3 222  24
    333  65
5.5 333  22
6.6 777  74
使用索引名称测试多索引 DF:
In [232]: read_clipboard_mi(index_names_row=1)
Out[232]:
          C
A   B
1.1 111  20
    222  31
3.3 222  24
    333  65
5.5 333  22
6.6 777  74
注意:
- 它没有经过很好的测试
- 不支持多级列
- 见第 1 点 ;-)
注意 2:请随意使用此代码,或在 pandas 的 GitHub 仓库上提交 pull request。
给定如下这样一个带 MultiIndex 的 DataFrame:
          C
A   B
1.1 111  20
    222  31
3.3 222  24
    333  65
5.5 333  22
6.6 777  74
如何使用 pd.read_clipboard 读取它?我试过这样写:
df = pd.read_clipboard(index_col=[0, 1])
但是它抛出一个错误:
ParserError: Error tokenizing data. C error: Expected 2 fields in line 3, saw 3
我该如何解决这个问题?
更新:现在该函数直接解析剪贴板中的内容,即无需事先把数据保存下来。
def read_clipboard_mi(index_names_row=None, **kwargs):
    """Read a printed MultiIndex DataFrame from the clipboard.

    The clipboard text is parsed as fixed-width columns with
    ``pandas.read_fwf``. Empty cells in the leading index columns are
    forward-filled and those columns are then set as the index.

    Parameters
    ----------
    index_names_row : int, optional
        Zero-based number of the clipboard line holding the index level
        names (e.g. ``1`` when the names are on the second line). That line
        is split on whitespace to obtain the names and is skipped while
        parsing; any ``skiprows`` given in ``kwargs`` is replaced. Falsy
        values (``None``, ``0``) mean "no index-names line": the index is
        then built from the leading columns, as many as there are columns
        whose label contains ``Unnamed:``.
    **kwargs
        Forwarded to ``pandas.read_fwf``. ``encoding`` is consumed here and
        must be utf-8 (or ``None``).

    Returns
    -------
    DataFrame
        The parsed frame with the detected index columns set as its index.
        If no index columns can be detected the frame is returned as parsed.

    Raises
    ------
    NotImplementedError
        If ``encoding`` is something other than utf-8.
    TypeError
        If ``index_names_row`` is truthy but not an ``int``.
    IndexError
        If the index-names line is out of range or holds more names than
        there are parsed columns.
    """
    encoding = kwargs.pop('encoding', 'utf-8')
    # only utf-8 is valid for passed value because that's what clipboard
    # supports
    if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
        raise NotImplementedError(
            'reading from clipboard only supports utf-8 encoding')

    # Validate before touching the clipboard so bad arguments fail fast.
    if index_names_row and not isinstance(index_names_row, int):
        raise TypeError('[index_names_row] must be of [int] data type')

    # Local imports keep the snippet self-contained when pasted on its own.
    from io import StringIO

    from pandas import read_fwf
    from pandas.io.clipboard import clipboard_get

    data = clipboard_get()
    # Some clipboard backends hand back bytes; decode so the text can be
    # split into lines and wrapped in StringIO below.
    if isinstance(data, bytes):
        data = data.decode(encoding or 'utf-8')

    index_names = None
    if index_names_row:
        index_names = data.splitlines()[index_names_row].split()
        # The names line is not data, so keep the parser from reading it.
        kwargs['skiprows'] = [index_names_row]

    df = read_fwf(StringIO(data), **kwargs)

    if index_names:
        if len(index_names) > len(df.columns):
            raise IndexError(
                'index-names line has more names than parsed columns')
        idx_cols = df.columns[:len(index_names)].tolist()
    else:
        # Column labels may be non-strings (e.g. with header=None), so test
        # them individually instead of going through the .str accessor.
        unnamed_cols = [col for col in df.columns
                        if isinstance(col, str) and 'Unnamed:' in col]
        if not unnamed_cols:
            # Nothing looks like an index column: return the plain frame.
            return df
        idx_cols = df.columns[:len(unnamed_cols)].tolist()
        index_names = [None] * len(idx_cols)

    # Fill the empty index cells downwards from the last non-empty label.
    df[idx_cols] = df[idx_cols].ffill()
    return df.set_index(idx_cols).rename_axis(index_names)
测试没有索引名称的多索引 DF:
In [231]: read_clipboard_mi()
Out[231]:
          C
1.1 111  20
    222  31
3.3 222  24
    333  65
5.5 333  22
6.6 777  74
使用索引名称测试多索引 DF:
In [232]: read_clipboard_mi(index_names_row=1)
Out[232]:
          C
A   B
1.1 111  20
    222  31
3.3 222  24
    333  65
5.5 333  22
6.6 777  74
注意:
- 它没有经过很好的测试
- 不支持多级列
- 见第 1 点 ;-)
注意 2:请随意使用此代码,或在 pandas 的 GitHub 仓库上提交 pull request。