通过匹配 python 中的其他列值来拆分列值
split a column value by matching other column values in python
我有一个如下所示的数据框
# Single-row demo frame: column "text" holds the sentence to split,
# columns a/b/c hold the words to split it on.
dummy = [["new york is a cool city in usa but i like london","cool","new","like"]]
df_dummy = pd.DataFrame(dummy,columns=["text","a","b","c"])
现在我想根据其他列的值在 text 列的字符串中进行匹配,并据此拆分该字符串。
我试过下面的代码,但不能超出这个范围。
# Drop the "text" column; the remaining columns hold the words to split on.
idx_ = [0]
cols_dummy = df_dummy.columns.values
cols_dummy = np.delete(cols_dummy, idx_, axis=0)
t_text = df_dummy.text.values[0]
# Build ONE alternation pattern over all match words. A capturing group makes
# re.split keep the matched words in the result, and a single split replaces
# the original per-column loop (which could only ever split on one word at a
# time and printed the same line len(tmp) times). re.escape guards against
# regex metacharacters in the cell values.
match_ = "(" + "|".join(re.escape(df_dummy[c].values[0]) for c in cols_dummy) + ")"
tmp = [piece.strip() for piece in re.split(match_, t_text) if piece.strip()]
print(match_, tmp)
预期输出:
["new", "york is a", "cool", "city in usa but i", "like", "london"]
不是最漂亮的写法,但足以完成任务。希望它能启发你实现类似或更好的方案。
# Sample sentence and the words to split it on.
t = "new york is a cool city in usa but i like london"
words = "cool new like".split()
def get_indicies(s, words):
    """Return sorted (start, end) spans of the first occurrence of each word in s.

    Raises ValueError (via str.index) if any word is absent from s.
    """
    spans = []
    for w in words:
        begin = s.index(w)
        spans.append((begin, begin + len(w)))
    return sorted(spans)
def compose(s, indicies):
    """Cut s into pieces alternating between gap text and matched spans.

    indicies is a sorted list of (start, end) pairs; the text between
    consecutive spans is kept as its own element, and the trailing tail
    (possibly empty) is always appended.
    """
    pieces = []
    cursor = 0  # next unconsumed position in s
    for begin, end in indicies:
        if begin != cursor:
            pieces.append(s[cursor:begin])
        pieces.append(s[begin:end])
        cursor = end
    pieces.append(s[cursor:])
    return pieces
def split_by_word(s, words, *, strip=False):
    """Split *s* around the first occurrence of each word in *words*.

    Matched words become their own list elements and the text between them
    is preserved. With strip=True, surrounding whitespace is removed from
    every piece and empty pieces are dropped, which yields the questioner's
    expected output. Default strip=False keeps the original behavior.
    """
    indices = get_indicies(s, words)
    parts = compose(s, indices)
    if strip:
        return [p.strip() for p in parts if p.strip()]
    return parts
# Demo run; note the whitespace kept around the in-between segments.
print(split_by_word(t, words))
# output (Notice the whitespace around some string here)
['new', ' york is a ', 'cool', ' city in usa but i ', 'like', ' london']
我有一个如下所示的数据框
# Single-row demo frame: column "text" holds the sentence to split,
# columns a/b/c hold the words to split it on.
dummy = [["new york is a cool city in usa but i like london","cool","new","like"]]
df_dummy = pd.DataFrame(dummy,columns=["text","a","b","c"])
现在我想根据其他列的值在 text 列的字符串中进行匹配,并据此拆分该字符串。
我试过下面的代码,但不能超出这个范围。
# Drop the "text" column; the remaining columns hold the words to split on.
idx_ = [0]
cols_dummy = df_dummy.columns.values
cols_dummy = np.delete(cols_dummy, idx_, axis=0)
t_text = df_dummy.text.values[0]
# Build ONE alternation pattern over all match words. A capturing group makes
# re.split keep the matched words in the result, and a single split replaces
# the original per-column loop (which could only ever split on one word at a
# time and printed the same line len(tmp) times). re.escape guards against
# regex metacharacters in the cell values.
match_ = "(" + "|".join(re.escape(df_dummy[c].values[0]) for c in cols_dummy) + ")"
tmp = [piece.strip() for piece in re.split(match_, t_text) if piece.strip()]
print(match_, tmp)
预期输出:
["new", "york is a", "cool", "city in usa but i", "like", "london"]
不是最漂亮的写法,但足以完成任务。希望它能启发你实现类似或更好的方案。
# Sample sentence and the words to split it on.
t = "new york is a cool city in usa but i like london"
words = "cool new like".split()
def get_indicies(s, words):
    """Return sorted (start, end) spans of the first occurrence of each word in s.

    Raises ValueError (via str.index) if any word is absent from s.
    """
    spans = []
    for w in words:
        begin = s.index(w)
        spans.append((begin, begin + len(w)))
    return sorted(spans)
def compose(s, indicies):
    """Cut s into pieces alternating between gap text and matched spans.

    indicies is a sorted list of (start, end) pairs; the text between
    consecutive spans is kept as its own element, and the trailing tail
    (possibly empty) is always appended.
    """
    pieces = []
    cursor = 0  # next unconsumed position in s
    for begin, end in indicies:
        if begin != cursor:
            pieces.append(s[cursor:begin])
        pieces.append(s[begin:end])
        cursor = end
    pieces.append(s[cursor:])
    return pieces
def split_by_word(s, words, *, strip=False):
    """Split *s* around the first occurrence of each word in *words*.

    Matched words become their own list elements and the text between them
    is preserved. With strip=True, surrounding whitespace is removed from
    every piece and empty pieces are dropped, which yields the questioner's
    expected output. Default strip=False keeps the original behavior.
    """
    indices = get_indicies(s, words)
    parts = compose(s, indices)
    if strip:
        return [p.strip() for p in parts if p.strip()]
    return parts
# Demo run; note the whitespace kept around the in-between segments.
print(split_by_word(t, words))
# output (Notice the whitespace around some string here)
['new', ' york is a ', 'cool', ' city in usa but i ', 'like', ' london']