cuDF 中与 pd.Series.str.slice() 和 pd.Series.apply() 等效的写法
Equivalent of pd.Series.str.slice() and pd.Series.apply() in cuDF
我想将以下在 pandas 中运行的代码转换为能在 cuDF 中运行的代码。
所处理 Series 的 .head() 示例数据已插入到第三个代码单元格的原始(OG)代码中——应该可以直接复制粘贴运行。
pandas 中的原始代码
def _format_block_number(digits):
    """Insert a decimal point after the 4th character, round, pad to width 4."""
    rounded = round(float(digits[:4] + '.' + digits[4:] + '0'))
    return str(rounded).ljust(4, '0')


# Both source columns are float at this point; stringify
# rawcensustractandblock so it can be sliced by character position.
s_rawcensustractandblock = df_train['rawcensustractandblock'].apply(str)

# Tract number: characters 4 through 10 of the stringified value.
df_train['census_tractnumber'] = s_rawcensustractandblock.str.slice(4, 11)

# Block number: everything from character 11 onward, normalised to 4 characters.
df_train['block_number'] = (
    s_rawcensustractandblock.str.slice(start=11).apply(_format_block_number)
)
待处理的数据
# Sample values taken from df_train['rawcensustractandblock'].head()
data = pd.Series(
    [
        60371066.461001,
        60590524.222024,
        60374638.00300401,
        60372963.002002,
        60590423.381006,
    ]
)
已调整为从上述示例数据开始的代码
下面是使用上面提供的示例数据(而不是整个 DataFrame)时代码的样子。
根据尝试转换时遇到的错误,问题出在 Series 层面,因此只要把下面的单元格转换为可在 cuDF 中执行的代码,应该就能解决问题。
import pandas as pd
def _format_block_number(digits):
    """Insert a decimal point after the 4th character, round, pad to width 4."""
    rounded = round(float(digits[:4] + '.' + digits[4:] + '0'))
    return str(rounded).ljust(4, '0')


# Sample values taken from df_train['rawcensustractandblock'].head()
data = pd.Series(
    [
        60371066.461001,
        60590524.222024,
        60374638.00300401,
        60372963.002002,
        60590423.381006,
    ]
)

# Series-level equivalent of the first DataFrame line: stringify each float.
s_rawcensustractandblock = data.apply(str)

# Tract number: characters 4 through 10 of the stringified value.
census_tractnumber = s_rawcensustractandblock.str.slice(4, 11)

# Block number: everything from character 11 onward, normalised to 4 characters.
block_number = s_rawcensustractandblock.str.slice(start=11).apply(_format_block_number)
预期变化(输出)
df_train['census_tractnumber'].head()
# out
0 1066.46
1 0524.22
2 4638.00
3 2963.00
4 0423.38
Name: census_tractnumber, dtype: object
df_train['block_number'].head()
0 1001
1 2024
2 3004
3 2002
4 1006
Name: block_number, dtype: object
for 循环解决方案
pandas(原始代码)
import pandas as pd
# Sample values taken from df_train.rawcensustractandblock.head()
pd_data = pd.Series(
    [
        60371066.461001,
        60590524.222024,
        60374638.00300401,
        60372963.002002,
        60590423.381006,
    ]
)


def _pad_block_number(tail):
    """Turn the digits after character 11 into a 4-character block number."""
    # A decimal point goes after the 4th digit so any further digits only
    # affect rounding; the appended '0' keeps the text parseable when
    # nothing follows the point.
    as_decimal = tail[:4] + '.' + tail[4:] + '0'
    whole = round(float(as_decimal))
    return str(whole).ljust(4, '0')


# Work on a Series (not the full DataFrame): stringify each float.
pd_raw_block = pd_data.apply(str)

# Tract number: characters 4 through 10.
pd_tractnumber = pd_raw_block.str.slice(4, 11)

# Block number: characters from 11 onward, rounded and right-padded with '0'.
pd_block_tail = pd_raw_block.str.slice(11)
pd_block_number = pd_block_tail.apply(_pad_block_number)

# To inspect the results:
# print(list(pd_tractnumber))
# print(list(pd_block_number))
cuDF(解法代码)
import cudf
# Sample values taken from df_train.rawcensustractandblock.head()
cudf_data = cudf.Series([60371066.461001, 60590524.222024, 60374638.00300401,
                         60372963.002002, 60590423.381006])


def _block_number_from_text(text):
    """Return the 4-character block number encoded from character 11 of text."""
    digits = text[11:]
    # A decimal point goes after the 4th digit so any further digits only
    # affect rounding; the appended '0' keeps the text parseable when
    # nothing follows the point.
    rounded = round(float(digits[:4] + '.' + digits[4:] + '0'))
    return str(rounded).ljust(4, '0')


# One string per row, processed on the host in plain Python.
# NOTE(review): assumes values_to_string() returns a list of str, as the
# original index-and-assign loops required -- confirm for your cuDF version.
cudf_row_text = cudf_data.values_to_string()

# Tract number: characters 4 through 10 of each stringified value.
cudf_tractnumber = [text[4:11] for text in cudf_row_text]

# Block number: characters from 11 onward, rounded and right-padded with '0'.
cudf_block_number = [_block_number_from_text(text) for text in cudf_row_text]

# To inspect the results:
# print(cudf_tractnumber)
# print(cudf_block_number)
您想做的几乎所有操作都可以用 cuDF 的字符串方法(通过 nvStrings)完成。在 cuDF 中将这些浮点数转换为字符串会损失一些精度(尽管在上面的示例中可能无关紧要),因此在这个示例中我事先(在 pandas 中)完成了转换。如果可能,我建议一开始就将 rawcensustractandblock 创建为字符串列而不是浮点列。
import cudf
import pandas as pd
# Stringify with pandas first, then hand the string Series to cuDF.
gdata = cudf.from_pandas(pd_data.astype('str'))

# Tract number: characters 4 through 10.
tractnumber = gdata.str.slice(4, 11)

# Block number: take everything from character 11 onward, split it after the
# 4th character and rejoin the two halves around a decimal point.
block_text = gdata.str.slice(11)
block_head = block_text.str.slice(0, 4)
block_tail = block_text.str.slice(4)
blocknumber = block_head.str.cat(block_tail, sep='.')

# Round to a whole number, then render as text right-padded with '0' to width 4.
blocknumber = blocknumber.astype('float').round(0).astype('int')
blocknumber = blocknumber.astype('str').str.ljust(4, '0')
tractnumber
0 1066.46
1 0524.22
2 4638.00
3 2963.00
4 0423.38
dtype: object
blocknumber
0 1001
1 2024
2 3004
3 2002
4 1006
dtype: object
我想将以下在 pandas 中运行的代码转换为能在 cuDF 中运行的代码。
所处理 Series 的 .head() 示例数据已插入到第三个代码单元格的原始(OG)代码中——应该可以直接复制粘贴运行。
pandas 中的原始代码
def _format_block_number(digits):
    """Insert a decimal point after the 4th character, round, pad to width 4."""
    rounded = round(float(digits[:4] + '.' + digits[4:] + '0'))
    return str(rounded).ljust(4, '0')


# Both source columns are float at this point; stringify
# rawcensustractandblock so it can be sliced by character position.
s_rawcensustractandblock = df_train['rawcensustractandblock'].apply(str)

# Tract number: characters 4 through 10 of the stringified value.
df_train['census_tractnumber'] = s_rawcensustractandblock.str.slice(4, 11)

# Block number: everything from character 11 onward, normalised to 4 characters.
df_train['block_number'] = (
    s_rawcensustractandblock.str.slice(start=11).apply(_format_block_number)
)
待处理的数据
# Sample values taken from df_train['rawcensustractandblock'].head()
data = pd.Series(
    [
        60371066.461001,
        60590524.222024,
        60374638.00300401,
        60372963.002002,
        60590423.381006,
    ]
)
已调整为从上述示例数据开始的代码
下面是使用上面提供的示例数据(而不是整个 DataFrame)时代码的样子。
根据尝试转换时遇到的错误,问题出在 Series 层面,因此只要把下面的单元格转换为可在 cuDF 中执行的代码,应该就能解决问题。
import pandas as pd
def _format_block_number(digits):
    """Insert a decimal point after the 4th character, round, pad to width 4."""
    rounded = round(float(digits[:4] + '.' + digits[4:] + '0'))
    return str(rounded).ljust(4, '0')


# Sample values taken from df_train['rawcensustractandblock'].head()
data = pd.Series(
    [
        60371066.461001,
        60590524.222024,
        60374638.00300401,
        60372963.002002,
        60590423.381006,
    ]
)

# Series-level equivalent of the first DataFrame line: stringify each float.
s_rawcensustractandblock = data.apply(str)

# Tract number: characters 4 through 10 of the stringified value.
census_tractnumber = s_rawcensustractandblock.str.slice(4, 11)

# Block number: everything from character 11 onward, normalised to 4 characters.
block_number = s_rawcensustractandblock.str.slice(start=11).apply(_format_block_number)
预期变化(输出)
df_train['census_tractnumber'].head()
# out
0 1066.46
1 0524.22
2 4638.00
3 2963.00
4 0423.38
Name: census_tractnumber, dtype: object
df_train['block_number'].head()
0 1001
1 2024
2 3004
3 2002
4 1006
Name: block_number, dtype: object
for 循环解决方案
pandas(原始代码)
import pandas as pd
# Sample values taken from df_train.rawcensustractandblock.head()
pd_data = pd.Series(
    [
        60371066.461001,
        60590524.222024,
        60374638.00300401,
        60372963.002002,
        60590423.381006,
    ]
)


def _pad_block_number(tail):
    """Turn the digits after character 11 into a 4-character block number."""
    # A decimal point goes after the 4th digit so any further digits only
    # affect rounding; the appended '0' keeps the text parseable when
    # nothing follows the point.
    as_decimal = tail[:4] + '.' + tail[4:] + '0'
    whole = round(float(as_decimal))
    return str(whole).ljust(4, '0')


# Work on a Series (not the full DataFrame): stringify each float.
pd_raw_block = pd_data.apply(str)

# Tract number: characters 4 through 10.
pd_tractnumber = pd_raw_block.str.slice(4, 11)

# Block number: characters from 11 onward, rounded and right-padded with '0'.
pd_block_tail = pd_raw_block.str.slice(11)
pd_block_number = pd_block_tail.apply(_pad_block_number)

# To inspect the results:
# print(list(pd_tractnumber))
# print(list(pd_block_number))
cuDF(解法代码)
import cudf
# Sample values taken from df_train.rawcensustractandblock.head()
cudf_data = cudf.Series([60371066.461001, 60590524.222024, 60374638.00300401,
                         60372963.002002, 60590423.381006])


def _block_number_from_text(text):
    """Return the 4-character block number encoded from character 11 of text."""
    digits = text[11:]
    # A decimal point goes after the 4th digit so any further digits only
    # affect rounding; the appended '0' keeps the text parseable when
    # nothing follows the point.
    rounded = round(float(digits[:4] + '.' + digits[4:] + '0'))
    return str(rounded).ljust(4, '0')


# One string per row, processed on the host in plain Python.
# NOTE(review): assumes values_to_string() returns a list of str, as the
# original index-and-assign loops required -- confirm for your cuDF version.
cudf_row_text = cudf_data.values_to_string()

# Tract number: characters 4 through 10 of each stringified value.
cudf_tractnumber = [text[4:11] for text in cudf_row_text]

# Block number: characters from 11 onward, rounded and right-padded with '0'.
cudf_block_number = [_block_number_from_text(text) for text in cudf_row_text]

# To inspect the results:
# print(cudf_tractnumber)
# print(cudf_block_number)
您想做的几乎所有操作都可以用 cuDF 的字符串方法(通过 nvStrings)完成。在 cuDF 中将这些浮点数转换为字符串会损失一些精度(尽管在上面的示例中可能无关紧要),因此在这个示例中我事先(在 pandas 中)完成了转换。如果可能,我建议一开始就将 rawcensustractandblock 创建为字符串列而不是浮点列。
import cudf
import pandas as pd
# Stringify with pandas first, then hand the string Series to cuDF.
gdata = cudf.from_pandas(pd_data.astype('str'))

# Tract number: characters 4 through 10.
tractnumber = gdata.str.slice(4, 11)

# Block number: take everything from character 11 onward, split it after the
# 4th character and rejoin the two halves around a decimal point.
block_text = gdata.str.slice(11)
block_head = block_text.str.slice(0, 4)
block_tail = block_text.str.slice(4)
blocknumber = block_head.str.cat(block_tail, sep='.')

# Round to a whole number, then render as text right-padded with '0' to width 4.
blocknumber = blocknumber.astype('float').round(0).astype('int')
blocknumber = blocknumber.astype('str').str.ljust(4, '0')
tractnumber
0 1066.46
1 0524.22
2 4638.00
3 2963.00
4 0423.38
dtype: object
blocknumber
0 1001
1 2024
2 3004
3 2002
4 1006
dtype: object