如何根据另一列的条件创建 pandas 列?
How to create pandas column based on condition of another column?
我想根据 clin["days_to_death"] 的值,在 clin 数据框中创建一个新列 SURV,其中:
- 'lts':值为 NA,或大于等于 2*365
- 'non-lts':不满足上述条件(即小于 2*365)
我下面的代码会把所有值都标记为 'lts',即使该值小于 2*365。
clin 数据框:
# Imported here so that this data snippet runs on its own: the literal below
# uses the bare name `nan`.
import pandas as pd
from numpy import nan

# Sample clinical table, one row per record. days_to_death, age and the
# Karnofsky score are stored as strings; missing entries are NaN.
clin = pd.DataFrame([
    [1, '45', '44', 1, nan],
    [1, '121', '68', 0, nan],
    [1, '466', '47', 0, '90'],
    [1, '357', '54', 1, '80'],
    [1, '108', '72', 1, '60'],
    [1, '254', '51', 0, '80'],
    [1, '138', '78', 1, '80'],
    [0, nan, '67', 0, '60'],
    [0, nan, '61', 0, '80'],
    [0, nan, '60', 0, '100'],
    [0, nan, '23', 1, '80'],
    [0, nan, '45', 1, '80'],
    [1, '83', '75', 1, '60'],
    [1, '114', '58', 0, nan],
    [0, nan, '45', 1, '100'],
    [0, nan, '63', 0, '40'],
    [1, '159', '64', 1, '80'],
    [0, nan, '64', 0, '40'],
    [0, nan, '65', 0, '80'],
    [0, nan, '53', 1, nan],
    [0, nan, '58', 1, nan],
    [0, nan, '76', 0, '100'],
    [0, nan, '60', 0, nan],
    [0, nan, '21', 1, '90'],
    [1, '57', '78', 1, nan],
    [1, '95', '79', 0, nan],
    [1, '78', '53', 1, '60'],
    [1, '444', '64', 0, '70'],
    [0, nan, '30', 1, '100'],
    [1, '454', '60', 1, nan],
    [1, '98', '56', 1, '80'],
    [1, '62', '59', 0, '80'],
    [1, '460', '49', 0, '100'],
    [1, '364', '70', 0, '60'],
    [1, '29', '49', 0, nan],
    [1, '88', '60', 0, '40'],
    [1, '485', '60', 1, '60'],
    [1, '42', '52', 0, '80'],
    [1, '975', '58', 1, '80'],
    [0, nan, '57', 1, '80'],
    [0, nan, '36', 0, nan],
    [1, '202', '47', 1, '40'],
    [1, '523', '52', 1, '100'],
    [1, '244', '76', 1, '80'],
    [1, '575', '62', 0, '60'],
    [1, '144', '58', 0, nan],
    [1, '368', '72', 1, '60'],
    [1, '54', '83', 0, nan],
    [1, '684', '53', 1, '80'],
    [1, '428', '56', 0, '80'],
    [1, '511', '77', 1, '80'],
    [1, '455', '56', 0, '80'],
    [0, nan, '39', 0, nan],
    [0, nan, '40', 0, '100'],
    [0, nan, '74', 0, '60'],
    [1, '270', '72', 1, nan],
    [1, '577', '78', 1, nan],
    [0, nan, '47', 0, nan],
    [1, '593', '69', 1, nan],
    [1, '36', '72', 0, nan],
    [1, '585', '66', 0, nan],
    [1, '460', '59', 1, nan],
    [1, '379', '73', 1, nan],
    [0, nan, '61', 0, nan],
    [0, nan, '69', 0, nan],
    [0, nan, '54', 1, '70'],
    [1, '105', '67', 0, '100'],
    [0, nan, '73', 1, '70'],
    [0, nan, '51', 0, '90'],
    [0, nan, '58', 0, '100'],
    [0, nan, '77', 0, '100'],
    [0, nan, '55', 1, '80'],
    [1, '146', '76', 0, '100'],
    [1, '138', '68', 0, '80'],
    [1, '535', '58', 0, '80'],
    [1, '94', '85', 1, '80'],
    [1, '111', '76', 1, '80'],
    [1, '279', '70', 1, '80'],
    [1, '1458', '50', 1, '80'],
    [1, '77', '66', 1, '80'],
    [1, '1121', '52', 1, '80'],
    [1, '508', '57', 0, '80'],
    [1, '100', '74', 0, '80'],
    [1, '82', '78', 0, '60'],
    [1, '519', '63', 0, '80'],
    [1, '254', '64', 1, '80'],
    [1, '638', '60', 0, '80'],
    [1, '147', '66', 0, '60'],
    [1, '153', '74', 0, '80'],
    [1, '727', '54', 0, '60'],
    [1, '1048', '58', 0, '80'],
    [1, '567', '44', 1, '80'],
    [0, nan, '49', 1, '80'],
    [1, '180', '68', 0, nan],
    [1, '191', '72', 1, nan],
    [0, nan, '51', 0, nan],
    [1, '625', '55', 0, nan],
    [1, '1448', '63', 1, '60'],
    [1, '375', '68', 0, nan],
    [1, '399', '65', 0, '100'],
    [1, '317', '62', 0, '80'],
    [1, '225', '72', 1, nan],
    [1, '360', '47', 0, '100'],
    [1, '603', '31', 0, '100'],
    [1, '717', '39', 1, nan],
    [1, '414', '81', 1, nan],
    [0, nan, '49', 1, '100'],
    [1, '164', '58', 0, '80'],
    [1, '3667', '64', 1, nan],
    [1, '224', '76', 1, nan],
    [1, '24', '61', 0, '40'],
    [1, '1537', '21', 0, nan],
    [1, '666', '48', 1, nan],
    [1, '141', '51', 0, '80'],
    [0, nan, '43', 0, '80'],
    [0, nan, '59', 0, '80'],
    [0, nan, '74', 0, '60'],
    [0, nan, '65', 1, '60'],
    [0, nan, '57', 0, '80'],
    [0, nan, '65', 1, '80'],
    [1, '454', '72', 0, '80'],
    [1, '343', '71', 0, '70'],
    [1, '544', '52', 0, '70'],
    [0, nan, '66', 1, nan],
    [0, nan, '42', 0, '100'],
    [0, nan, '72', 0, '80'],
    [1, '713', '53', 0, '70'],
    [1, '335', '62', 1, '90'],
    [0, nan, '52', 0, '80'],
    [1, '157', '63', 0, nan],
    [0, nan, '47', 1, '70'],
    [1, '388', '67', 0, '90'],
    [1, '165', '60', 0, nan],
    [1, '346', '57', 0, '80'],
    [1, '165', '71', 1, '0'],
    [1, '114', '73', 0, '0'],
    [1, '49', '64', 0, '40'],
    [0, nan, '33', 0, nan],
    [0, nan, '50', 1, nan],
], columns=['vital_status', 'days_to_death', 'age_at_initial_pathologic_diagnosis',
            'gender', 'karnofsky_performance_score'])
# The posted dump passed index="bcr_patient_barcode", but pandas rejects a
# scalar string as an index. The barcode values themselves are not in the dump,
# so the default integer index is kept and only labelled.
# NOTE(review): presumably "bcr_patient_barcode" was the index name of the
# original table -- confirm.
clin.index.name = "bcr_patient_barcode"
我的尝试:
import numpy as np
import pandas as pd
# Survival info
def survival(clin):
    """Return the survival label for one row of the clinical table.

    This is the attempt the question asks about. Its lost indentation is
    restored so the snippet is valid Python; the logic is left exactly as
    posted, so it still shows the reported problem of labelling every row
    "lts".

    Args:
        clin: a single row, as passed by ``DataFrame.apply(..., axis=1)``;
            only ``clin["days_to_death"]`` is read.

    Returns:
        "lts" or "non-lts".
    """
    # Why everything becomes "lts": np.where(cond, nan, 1) does not test for
    # missing values. It yields nan wherever cond is truthy, and both a day
    # count such as '45' and NaN are truthy. The resulting nan is itself
    # truthy, so this first branch is always taken.
    if np.where(clin["days_to_death"], np.nan, 1):
        val = "lts"
    # NOTE(review): never reached in practice; on a plain string value
    # `.astype` would not exist -- confirm before relying on this branch.
    elif clin["days_to_death"].astype(int) >= 2*365:
        val = "lts"
    else:
        val = "non-lts"
    return val
# Call survival() once per row (axis=1) and store the returned labels in a
# new SURV column.
clin['SURV'] = clin.apply(survival, axis=1)
然后我想删除除 days_to_death 列之外所有列中的 NA。
要按需求创建 clin['SURV'] 列,可以使用 numpy.select,如下所示:
# Parse the string day counts as floats; missing entries stay NaN.
vals = clin['days_to_death'].astype(np.float32)
# NaN fails both comparisons, so rows with no days_to_death match neither
# condition and receive the default passed to np.select below.
condlist = [vals>=2*365, vals<2*365]
# Labels spelled as the question requests them: 'lts' (lower-case L) and
# 'non-lts'. The posted answer had 'Its' / 'nonIts'.
choicelist = ['lts', 'non-lts']
# Default 'lts' covers the NaN rows.
clin['SURV'] = np.select(condlist, choicelist, 'lts')
要删除包含 np.nan 的行,可以使用 dropna(subset=[...]),如下所示:
# Drop rows that have a missing value in any of the listed columns.
# days_to_death is left out of the subset, so a NaN there does not remove
# the row.
clin = clin.dropna(subset=['vital_status', 'age_at_initial_pathologic_diagnosis','gender', 'karnofsky_performance_score'])
也可以使用 .loc:
# Rows count as 'lts' when days_to_death is missing or at least two years.
is_lts = clin["days_to_death"].isna() | (clin["days_to_death"].map(float) >= 2*365)
clin.loc[is_lts, 'SURV'] = 'lts'
# Every row still without a label gets 'non-lts'.
unlabelled = clin["SURV"].isna()
clin.loc[unlabelled, 'SURV'] = 'non-lts'
然后删除含 NaN 的行:
# Drop rows that have a missing value in any of the listed columns.
# days_to_death is left out of the subset, so a NaN there does not remove
# the row.
clin = clin.dropna(subset=['vital_status', 'age_at_initial_pathologic_diagnosis', 'gender', 'karnofsky_performance_score'])
我想根据 clin["days_to_death"] 的值,在 clin 数据框中创建一个新列 SURV,其中:
- 'lts':值为 NA,或大于等于 2*365
- 'non-lts':不满足上述条件(即小于 2*365)
我下面的代码会把所有值都标记为 'lts',即使该值小于 2*365。
clin 数据框:
# Imported here so that this data snippet runs on its own: the literal below
# uses the bare name `nan`.
import pandas as pd
from numpy import nan

# Sample clinical table, one row per record. days_to_death, age and the
# Karnofsky score are stored as strings; missing entries are NaN.
clin = pd.DataFrame([
    [1, '45', '44', 1, nan],
    [1, '121', '68', 0, nan],
    [1, '466', '47', 0, '90'],
    [1, '357', '54', 1, '80'],
    [1, '108', '72', 1, '60'],
    [1, '254', '51', 0, '80'],
    [1, '138', '78', 1, '80'],
    [0, nan, '67', 0, '60'],
    [0, nan, '61', 0, '80'],
    [0, nan, '60', 0, '100'],
    [0, nan, '23', 1, '80'],
    [0, nan, '45', 1, '80'],
    [1, '83', '75', 1, '60'],
    [1, '114', '58', 0, nan],
    [0, nan, '45', 1, '100'],
    [0, nan, '63', 0, '40'],
    [1, '159', '64', 1, '80'],
    [0, nan, '64', 0, '40'],
    [0, nan, '65', 0, '80'],
    [0, nan, '53', 1, nan],
    [0, nan, '58', 1, nan],
    [0, nan, '76', 0, '100'],
    [0, nan, '60', 0, nan],
    [0, nan, '21', 1, '90'],
    [1, '57', '78', 1, nan],
    [1, '95', '79', 0, nan],
    [1, '78', '53', 1, '60'],
    [1, '444', '64', 0, '70'],
    [0, nan, '30', 1, '100'],
    [1, '454', '60', 1, nan],
    [1, '98', '56', 1, '80'],
    [1, '62', '59', 0, '80'],
    [1, '460', '49', 0, '100'],
    [1, '364', '70', 0, '60'],
    [1, '29', '49', 0, nan],
    [1, '88', '60', 0, '40'],
    [1, '485', '60', 1, '60'],
    [1, '42', '52', 0, '80'],
    [1, '975', '58', 1, '80'],
    [0, nan, '57', 1, '80'],
    [0, nan, '36', 0, nan],
    [1, '202', '47', 1, '40'],
    [1, '523', '52', 1, '100'],
    [1, '244', '76', 1, '80'],
    [1, '575', '62', 0, '60'],
    [1, '144', '58', 0, nan],
    [1, '368', '72', 1, '60'],
    [1, '54', '83', 0, nan],
    [1, '684', '53', 1, '80'],
    [1, '428', '56', 0, '80'],
    [1, '511', '77', 1, '80'],
    [1, '455', '56', 0, '80'],
    [0, nan, '39', 0, nan],
    [0, nan, '40', 0, '100'],
    [0, nan, '74', 0, '60'],
    [1, '270', '72', 1, nan],
    [1, '577', '78', 1, nan],
    [0, nan, '47', 0, nan],
    [1, '593', '69', 1, nan],
    [1, '36', '72', 0, nan],
    [1, '585', '66', 0, nan],
    [1, '460', '59', 1, nan],
    [1, '379', '73', 1, nan],
    [0, nan, '61', 0, nan],
    [0, nan, '69', 0, nan],
    [0, nan, '54', 1, '70'],
    [1, '105', '67', 0, '100'],
    [0, nan, '73', 1, '70'],
    [0, nan, '51', 0, '90'],
    [0, nan, '58', 0, '100'],
    [0, nan, '77', 0, '100'],
    [0, nan, '55', 1, '80'],
    [1, '146', '76', 0, '100'],
    [1, '138', '68', 0, '80'],
    [1, '535', '58', 0, '80'],
    [1, '94', '85', 1, '80'],
    [1, '111', '76', 1, '80'],
    [1, '279', '70', 1, '80'],
    [1, '1458', '50', 1, '80'],
    [1, '77', '66', 1, '80'],
    [1, '1121', '52', 1, '80'],
    [1, '508', '57', 0, '80'],
    [1, '100', '74', 0, '80'],
    [1, '82', '78', 0, '60'],
    [1, '519', '63', 0, '80'],
    [1, '254', '64', 1, '80'],
    [1, '638', '60', 0, '80'],
    [1, '147', '66', 0, '60'],
    [1, '153', '74', 0, '80'],
    [1, '727', '54', 0, '60'],
    [1, '1048', '58', 0, '80'],
    [1, '567', '44', 1, '80'],
    [0, nan, '49', 1, '80'],
    [1, '180', '68', 0, nan],
    [1, '191', '72', 1, nan],
    [0, nan, '51', 0, nan],
    [1, '625', '55', 0, nan],
    [1, '1448', '63', 1, '60'],
    [1, '375', '68', 0, nan],
    [1, '399', '65', 0, '100'],
    [1, '317', '62', 0, '80'],
    [1, '225', '72', 1, nan],
    [1, '360', '47', 0, '100'],
    [1, '603', '31', 0, '100'],
    [1, '717', '39', 1, nan],
    [1, '414', '81', 1, nan],
    [0, nan, '49', 1, '100'],
    [1, '164', '58', 0, '80'],
    [1, '3667', '64', 1, nan],
    [1, '224', '76', 1, nan],
    [1, '24', '61', 0, '40'],
    [1, '1537', '21', 0, nan],
    [1, '666', '48', 1, nan],
    [1, '141', '51', 0, '80'],
    [0, nan, '43', 0, '80'],
    [0, nan, '59', 0, '80'],
    [0, nan, '74', 0, '60'],
    [0, nan, '65', 1, '60'],
    [0, nan, '57', 0, '80'],
    [0, nan, '65', 1, '80'],
    [1, '454', '72', 0, '80'],
    [1, '343', '71', 0, '70'],
    [1, '544', '52', 0, '70'],
    [0, nan, '66', 1, nan],
    [0, nan, '42', 0, '100'],
    [0, nan, '72', 0, '80'],
    [1, '713', '53', 0, '70'],
    [1, '335', '62', 1, '90'],
    [0, nan, '52', 0, '80'],
    [1, '157', '63', 0, nan],
    [0, nan, '47', 1, '70'],
    [1, '388', '67', 0, '90'],
    [1, '165', '60', 0, nan],
    [1, '346', '57', 0, '80'],
    [1, '165', '71', 1, '0'],
    [1, '114', '73', 0, '0'],
    [1, '49', '64', 0, '40'],
    [0, nan, '33', 0, nan],
    [0, nan, '50', 1, nan],
], columns=['vital_status', 'days_to_death', 'age_at_initial_pathologic_diagnosis',
            'gender', 'karnofsky_performance_score'])
# The posted dump passed index="bcr_patient_barcode", but pandas rejects a
# scalar string as an index. The barcode values themselves are not in the dump,
# so the default integer index is kept and only labelled.
# NOTE(review): presumably "bcr_patient_barcode" was the index name of the
# original table -- confirm.
clin.index.name = "bcr_patient_barcode"
我的尝试:
import numpy as np
import pandas as pd
# Survival info
def survival(clin):
    """Return the survival label for one row of the clinical table.

    This is the attempt the question asks about. Its lost indentation is
    restored so the snippet is valid Python; the logic is left exactly as
    posted, so it still shows the reported problem of labelling every row
    "lts".

    Args:
        clin: a single row, as passed by ``DataFrame.apply(..., axis=1)``;
            only ``clin["days_to_death"]`` is read.

    Returns:
        "lts" or "non-lts".
    """
    # Why everything becomes "lts": np.where(cond, nan, 1) does not test for
    # missing values. It yields nan wherever cond is truthy, and both a day
    # count such as '45' and NaN are truthy. The resulting nan is itself
    # truthy, so this first branch is always taken.
    if np.where(clin["days_to_death"], np.nan, 1):
        val = "lts"
    # NOTE(review): never reached in practice; on a plain string value
    # `.astype` would not exist -- confirm before relying on this branch.
    elif clin["days_to_death"].astype(int) >= 2*365:
        val = "lts"
    else:
        val = "non-lts"
    return val
# Call survival() once per row (axis=1) and store the returned labels in a
# new SURV column.
clin['SURV'] = clin.apply(survival, axis=1)
然后我想删除除 days_to_death 列之外所有列中的 NA。
要按需求创建 clin['SURV'] 列,可以使用 numpy.select,如下所示:
# Parse the string day counts as floats; missing entries stay NaN.
vals = clin['days_to_death'].astype(np.float32)
# NaN fails both comparisons, so rows with no days_to_death match neither
# condition and receive the default passed to np.select below.
condlist = [vals>=2*365, vals<2*365]
# Labels spelled as the question requests them: 'lts' (lower-case L) and
# 'non-lts'. The posted answer had 'Its' / 'nonIts'.
choicelist = ['lts', 'non-lts']
# Default 'lts' covers the NaN rows.
clin['SURV'] = np.select(condlist, choicelist, 'lts')
要删除包含 np.nan 的行,可以使用 dropna(subset=[...]),如下所示:
# Drop rows that have a missing value in any of the listed columns.
# days_to_death is left out of the subset, so a NaN there does not remove
# the row.
clin = clin.dropna(subset=['vital_status', 'age_at_initial_pathologic_diagnosis','gender', 'karnofsky_performance_score'])
也可以使用 .loc:
# Rows count as 'lts' when days_to_death is missing or at least two years.
is_lts = clin["days_to_death"].isna() | (clin["days_to_death"].map(float) >= 2*365)
clin.loc[is_lts, 'SURV'] = 'lts'
# Every row still without a label gets 'non-lts'.
unlabelled = clin["SURV"].isna()
clin.loc[unlabelled, 'SURV'] = 'non-lts'
然后删除含 NaN 的行:
# Drop rows that have a missing value in any of the listed columns.
# days_to_death is left out of the subset, so a NaN there does not remove
# the row.
clin = clin.dropna(subset=['vital_status', 'age_at_initial_pathologic_diagnosis', 'gender', 'karnofsky_performance_score'])