减去值并添加新列 pandas
Sub Value and Add new column pandas
我正在尝试从某个路径中读取一些文件,作为我之前问题的扩展。Jianxun 给出的答案绝对是有道理的,但我遇到了 KeyError(键错误)。我对 pandas 还很陌生,无法自行修复这个错误。
注意:我使用 Python 2.7 和 Pandas 0.16
File_1.csv
Ids,12:00:00
2341,9865
7352,8969
File_2.csv
Ids,12:45:00
1234,9865
8435,8969
Master.csv
Ids,00:00:00,00:30:00,00:45:00
1234,1000,500,100
8435,5243,300,200
2341,563,400,400
7352,345,500,600
程序:
import pandas as pd
import numpy as np
from StringIO import StringIO
# paths of the input CSV files
# NOTE(review): both path literals end with a trailing space.
csv_file1 = 'Path/Transition_Data/Test_1.csv '
csv_file2 = 'Path/Transition_Data/Test_2.csv '
csv_file_all = [csv_file1, csv_file2]
# read csv into df using list comprehension
# NOTE(review): StringIO(csv_file) wraps the path text itself, so read_csv
# parses the path string as CSV content instead of opening the file. The
# resulting frames have no 'Ids' column, which is why set_index('Ids')
# below raises KeyError: 'Ids'. Passing csv_file directly to pd.read_csv
# reads the file.
df_all = [pd.read_csv(StringIO(csv_file)) for csv_file in csv_file_all]
# processing
# =====================================================
# concat along axis=0, outer join on axis=1
merged = pd.concat(df_all, axis=0, ignore_index=True, join='outer').set_index('Ids')
# custom function to handle/merge duplicates on Ids (axis=0)
def apply_func(group):
    """Collapse the rows of one group into a single row.

    Args:
        group: DataFrame containing the rows that share one ``Ids`` value.

    Returns:
        The last row of ``group`` (as a Series) after forward-filling, so
        every column carries its latest non-NaN value within the group, or
        NaN when the column is entirely NaN.
    """
    # Forward-fill before taking the last row so a NaN there is replaced by
    # the most recent value above it; .ffill() == fillna(method='ffill').
    return group.ffill().iloc[-1]
# remove Ids duplicates
merged_unique = merged.groupby(level='Ids').apply(apply_func)
# do the subtraction
master_csv_file = 'Path/Data_repository/Master1_Test.csv'
# NOTE(review): `io` is not imported in the lines shown (only
# `from StringIO import StringIO` is), and wrapping the path in StringIO
# makes read_csv parse the path text rather than the file it names.
df_master = pd.read_csv(io.StringIO(master_csv_file), index_col=['Ids']).sort_index()
# select matching records and horizontal concat
df_matched = pd.concat([df_master,merged_unique.reindex(df_master.index)], axis=1)
# subtract the first column from every other column, row by row (broadcasting)
df_matched.iloc[:, 1:] = df_matched.iloc[:, 1:].sub(df_matched.iloc[:, 0], axis=0)
错误:
Traceback (most recent call last):
File "distribute_count.py", line 18, in <module>
merged = pd.concat(df_all, axis=0, ignore_index=True, join='outer').set_index('Ids')
File "/usr/lib/pymodules/python2.7/pandas/core/frame.py", line 2583, in set_index
level = frame[col].values
File "/usr/lib/pymodules/python2.7/pandas/core/frame.py", line 1787, in __getitem__
return self._getitem_column(key)
File "/usr/lib/pymodules/python2.7/pandas/core/frame.py", line 1794, in _getitem_column
return self._get_item_cache(key)
File "/usr/lib/pymodules/python2.7/pandas/core/generic.py", line 1079, in _get_item_cache
values = self._data.get(item)
File "/usr/lib/pymodules/python2.7/pandas/core/internals.py", line 2843, in get
loc = self.items.get_loc(item)
File "/usr/lib/pymodules/python2.7/pandas/core/index.py", line 1437, in get_loc
return self._engine.get_loc(_values_from_object(key))
File "index.pyx", line 134, in pandas.index.IndexEngine.get_loc (pandas/index.c:3786)
File "index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas/index.c:3664)
File "hashtable.pyx", line 697, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:11943)
File "hashtable.pyx", line 705, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:11896)
KeyError: 'Ids'
import pandas as pd
import numpy as np
# paths of the CSV files to read
csv_file1 = '/home/Jian/Downloads/stack_flow_bundle/Transition_Data/Test_1.csv'
csv_file2 = '/home/Jian/Downloads/stack_flow_bundle/Transition_Data/Test_2.csv'
master_csv_file = '/home/Jian/Downloads/stack_flow_bundle/Data_repository/master_lac_Test.csv'
csv_file_all = [csv_file1, csv_file2]
# read csv into df using list comprehension
# each path is passed straight to read_csv (no StringIO buffer involved)
df_all = [pd.read_csv(csv_file) for csv_file in csv_file_all]
# processing
# =====================================================
# concat along axis=0, outer join on axis=1
merged = pd.concat(df_all, axis=0, ignore_index=True, join='outer').set_index('Ids')
# custom function to handle/merge duplicates on Ids (axis=0)
def apply_func(group):
    """Collapse the rows of one group into a single row.

    Args:
        group: DataFrame containing the rows that share one ``Ids`` value.

    Returns:
        The last row of ``group`` (as a Series) after forward-filling, so
        every column carries its latest non-NaN value within the group, or
        NaN when the column is entirely NaN.
    """
    # Forward-fill before taking the last row so a NaN there is replaced by
    # the most recent value above it; .ffill() == fillna(method='ffill').
    return group.ffill().iloc[-1]
# remove Ids duplicates: one row per Ids value
merged_unique = merged.groupby(level='Ids').apply(apply_func)
# do the subtraction
# load the master table with Ids as the index, sorted by Ids
df_master = pd.read_csv(master_csv_file, index_col=['Ids']).sort_index()
# select matching records and horizontal concat
df_matched = pd.concat([df_master,merged_unique.reindex(df_master.index)], axis=1)
# subtract the first column from every other column, row by row (broadcasting)
df_matched.iloc[:, 1:] = df_matched.iloc[:, 1:].sub(df_matched.iloc[:, 0], axis=0)
print(df_matched)
00:00:00 00:30:00 00:45:00 12:00:00 12:45:00
Ids
1234 1000 -500 -900 NaN 8865
2341 563 -163 -163 9302 NaN
7352 345 155 255 8624 NaN
8435 5243 -4943 -5043 NaN 3726
我正在尝试从某个路径中读取一些文件,作为我之前问题的扩展。
注意:我使用 Python 2.7 和 Pandas 0.16
File_1.csv
Ids,12:00:00
2341,9865
7352,8969
File_2.csv
Ids,12:45:00
1234,9865
8435,8969
Master.csv
Ids,00:00:00,00:30:00,00:45:00
1234,1000,500,100
8435,5243,300,200
2341,563,400,400
7352,345,500,600
程序:
import pandas as pd
import numpy as np
from StringIO import StringIO
# paths of the input CSV files
# NOTE(review): both path literals end with a trailing space.
csv_file1 = 'Path/Transition_Data/Test_1.csv '
csv_file2 = 'Path/Transition_Data/Test_2.csv '
csv_file_all = [csv_file1, csv_file2]
# read csv into df using list comprehension
# NOTE(review): StringIO(csv_file) wraps the path text itself, so read_csv
# parses the path string as CSV content instead of opening the file. The
# resulting frames have no 'Ids' column, which is why set_index('Ids')
# below raises KeyError: 'Ids'. Passing csv_file directly to pd.read_csv
# reads the file.
df_all = [pd.read_csv(StringIO(csv_file)) for csv_file in csv_file_all]
# processing
# =====================================================
# concat along axis=0, outer join on axis=1
merged = pd.concat(df_all, axis=0, ignore_index=True, join='outer').set_index('Ids')
# custom function to handle/merge duplicates on Ids (axis=0)
def apply_func(group):
    """Collapse the rows of one group into a single row.

    Args:
        group: DataFrame containing the rows that share one ``Ids`` value.

    Returns:
        The last row of ``group`` (as a Series) after forward-filling, so
        every column carries its latest non-NaN value within the group, or
        NaN when the column is entirely NaN.
    """
    # Forward-fill before taking the last row so a NaN there is replaced by
    # the most recent value above it; .ffill() == fillna(method='ffill').
    return group.ffill().iloc[-1]
# remove Ids duplicates
merged_unique = merged.groupby(level='Ids').apply(apply_func)
# do the subtraction
master_csv_file = 'Path/Data_repository/Master1_Test.csv'
# NOTE(review): `io` is not imported in the lines shown (only
# `from StringIO import StringIO` is), and wrapping the path in StringIO
# makes read_csv parse the path text rather than the file it names.
df_master = pd.read_csv(io.StringIO(master_csv_file), index_col=['Ids']).sort_index()
# select matching records and horizontal concat
df_matched = pd.concat([df_master,merged_unique.reindex(df_master.index)], axis=1)
# subtract the first column from every other column, row by row (broadcasting)
df_matched.iloc[:, 1:] = df_matched.iloc[:, 1:].sub(df_matched.iloc[:, 0], axis=0)
错误:
Traceback (most recent call last):
File "distribute_count.py", line 18, in <module>
merged = pd.concat(df_all, axis=0, ignore_index=True, join='outer').set_index('Ids')
File "/usr/lib/pymodules/python2.7/pandas/core/frame.py", line 2583, in set_index
level = frame[col].values
File "/usr/lib/pymodules/python2.7/pandas/core/frame.py", line 1787, in __getitem__
return self._getitem_column(key)
File "/usr/lib/pymodules/python2.7/pandas/core/frame.py", line 1794, in _getitem_column
return self._get_item_cache(key)
File "/usr/lib/pymodules/python2.7/pandas/core/generic.py", line 1079, in _get_item_cache
values = self._data.get(item)
File "/usr/lib/pymodules/python2.7/pandas/core/internals.py", line 2843, in get
loc = self.items.get_loc(item)
File "/usr/lib/pymodules/python2.7/pandas/core/index.py", line 1437, in get_loc
return self._engine.get_loc(_values_from_object(key))
File "index.pyx", line 134, in pandas.index.IndexEngine.get_loc (pandas/index.c:3786)
File "index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas/index.c:3664)
File "hashtable.pyx", line 697, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:11943)
File "hashtable.pyx", line 705, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:11896)
KeyError: 'Ids'
import pandas as pd
import numpy as np
# paths of the CSV files to read
csv_file1 = '/home/Jian/Downloads/stack_flow_bundle/Transition_Data/Test_1.csv'
csv_file2 = '/home/Jian/Downloads/stack_flow_bundle/Transition_Data/Test_2.csv'
master_csv_file = '/home/Jian/Downloads/stack_flow_bundle/Data_repository/master_lac_Test.csv'
csv_file_all = [csv_file1, csv_file2]
# read csv into df using list comprehension
# each path is passed straight to read_csv (no StringIO buffer involved)
df_all = [pd.read_csv(csv_file) for csv_file in csv_file_all]
# processing
# =====================================================
# concat along axis=0, outer join on axis=1
merged = pd.concat(df_all, axis=0, ignore_index=True, join='outer').set_index('Ids')
# custom function to handle/merge duplicates on Ids (axis=0)
def apply_func(group):
    """Collapse the rows of one group into a single row.

    Args:
        group: DataFrame containing the rows that share one ``Ids`` value.

    Returns:
        The last row of ``group`` (as a Series) after forward-filling, so
        every column carries its latest non-NaN value within the group, or
        NaN when the column is entirely NaN.
    """
    # Forward-fill before taking the last row so a NaN there is replaced by
    # the most recent value above it; .ffill() == fillna(method='ffill').
    return group.ffill().iloc[-1]
# remove Ids duplicates: one row per Ids value
merged_unique = merged.groupby(level='Ids').apply(apply_func)
# do the subtraction
# load the master table with Ids as the index, sorted by Ids
df_master = pd.read_csv(master_csv_file, index_col=['Ids']).sort_index()
# select matching records and horizontal concat
df_matched = pd.concat([df_master,merged_unique.reindex(df_master.index)], axis=1)
# subtract the first column from every other column, row by row (broadcasting)
df_matched.iloc[:, 1:] = df_matched.iloc[:, 1:].sub(df_matched.iloc[:, 0], axis=0)
print(df_matched)
00:00:00 00:30:00 00:45:00 12:00:00 12:45:00
Ids
1234 1000 -500 -900 NaN 8865
2341 563 -163 -163 9302 NaN
7352 345 155 255 8624 NaN
8435 5243 -4943 -5043 NaN 3726