如何在 pandas 数据帧上迭代带有字符串的函数
How to iterate a function with strings over a pandas dataframe
我想计算我的数据框(df)与基准词表(base)之间的 Jaccard 相似度。问题是我需要对 500 多行逐行计算,而我要么收到错误消息“要解压的值太多”或“'Series' 对象没有属性 'iterrows'”,要么函数把 base 与整个数据帧作为一个整体进行比较。
选项 A:
# Clean the sentences: keep only ASCII letters and spaces, collapse runs
# of spaces, and trim the ends.
# NOTE: the original pattern '[^A-z ]' also matched the characters between
# 'Z' and 'a' in ASCII ([ \ ] ^ _ `), which is almost never intended --
# use the explicit ranges A-Za-z, and pass regex=True explicitly (newer
# pandas no longer treats str.replace patterns as regexes by default).
sentences = pd.Series(df.sentence)
sentences = (
    sentences.str.replace('[^A-Za-z ]', '', regex=True)
    .str.replace(' +', ' ', regex=True)
    .str.strip()
)
splitwords = [nltk.word_tokenize(str(sentence)) for sentence in sentences]
print(splitwords)
sentence = df.sentence

def Jaccard_Similarity(base, sentence):
    """Return a list of (score, a, b) tuples, one per row of `sentence`.

    `base` is an iterable of words; `sentence` is a pandas Series whose
    rows are strings (or already-tokenized lists of words).

    NOTE: a Series has no .iterrows() -- iterate with .items() instead.
    The original also called df.sentence() (a Series is not callable) and
    returned inside the loop, which stopped after the first row; results
    are accumulated here so every row gets a score.
    """
    a = set(base)
    results = []
    for i, row in sentence.items():
        # Accept either a raw string or a pre-tokenized list of words.
        b = set(str(row).split()) if isinstance(row, str) else set(row)
        c = a.intersection(b)
        # Jaccard = |a ∩ b| / |a ∪ b| = |c| / (|a| + |b| - |c|)
        results.append((float(len(c)) / (len(a) + len(b) - len(c)), a, b))
    return results

Jaccard_Similarity(base, sentence)
备选方案 B:
# Tokenize each sentence into a list of words, one list per row.
# Keep them in a new column instead of overwriting the whole DataFrame.
df['tokens'] = df.apply(lambda row: nltk.word_tokenize(row['sentence']), axis=1)
print(df)

def Jaccard_Similarity(base, df):
    """Return a list of (score, a, b) tuples, one per row of df['tokens'].

    NOTE: the original called df.iterrows(df) (iterrows takes no
    argument), named its first parameter `bas` while the body read the
    global `base`, and returned inside the loop; all three are fixed
    here, and every row's result is collected.
    """
    a = set(base)
    results = []
    for _, row in df.iterrows():
        b = set(row['tokens'])
        c = a.intersection(b)
        # Jaccard = |a ∩ b| / |a ∪ b| = |c| / (|a| + |b| - |c|)
        results.append((float(len(c)) / (len(a) + len(b) - len(c)), a, b))
    return results

Jaccard_Similarity(base, df)
数据:
# Reference word list and the sentences to score against it.
base = ['Tom', 'eats', 'apple']
# NOTE: the original tuple-of-lists ending in a bare `columns=` keyword is
# not valid Python -- construct the DataFrame explicitly instead.
df = pd.DataFrame(
    ["Tom eats an apple",
     "Tom eats a pineapple",
     "Eva eats an apple",
     "Eva eats a pineapple"],
    columns=['sentence'],
)
编辑:
# Lower-case and tokenize both sides into word sets.
# NOTE: `base` is a list and `df` a DataFrame at this point, so neither
# has a .lower() method -- lower-case the individual words instead.
# (Assumes df has a 'sentence' column of strings -- TODO confirm.)
base = {word.lower() for word in base}
df = {word.lower() for sent in df['sentence'] for word in str(sent).split()}
def Jaccard_Similarity(base, df):
    """Return the Jaccard similarity |base ∩ df| / |base ∪ df| of two sets."""
    overlap = len(base.intersection(df))
    total = len(base.union(df))
    return float(overlap) / total
试试这个——解释部分还需要一些整理,我稍后会补充说明。
import nltk
from nltk.corpus import stopwords  # to remove stopwords
import pandas as pd  # NOTE: was missing from the original snippet
import numpy as np   # hoisted from mid-script to the top of the file

# Reference token list, lower-cased for case-insensitive comparison.
base = ['Tom', 'eats', 'apple']
base = [item.lower() for item in base]

stop_words = set(stopwords.words('english'))

list1 = [["Tom eats an apple"],
         ["Tom eats a pineapple"],
         ["Eva eats an apple"],
         ["Eva eats a pineapple"]]
df = pd.DataFrame(list1, columns=['sentence'])

# Tokenize, lower-case and drop stopwords: one token list per row.
# Use a separate variable instead of repeatedly rebinding `df`.
tokens = df.sentence.apply(nltk.word_tokenize)
tokens = tokens.apply(
    lambda x: [item.lower() for item in x if item.lower() not in stop_words]
)

# Jaccard = |a ∩ b| / |a ∪ b| = |c| / (|a| + |b| - |c|), computed per row.
b = tokens.apply(set)
a = set(base)
c = b.apply(lambda x: a.intersection(x))
len_a_b = b.apply(lambda x: len(x) + len(a))
len_c = c.apply(lambda x: len(x))

dict1 = {'length': len_c / (len_a_b - len_c), 'b': b, 'c': c}
df = pd.DataFrame(dict1)
# `a` is the same set for every row. NOTE: np.NAN was removed in
# NumPy 2.0 -- the lower-case np.nan is the supported spelling.
df['a'] = np.nan
df['a'] = df.a.apply(lambda x: a)
print(df)
输出-
length b c a
0 1.0 {apple, eats, tom} {apple, eats, tom} {apple, eats, tom}
1 0.5 {eats, tom, pineapple} {eats, tom} {apple, eats, tom}
2 0.5 {apple, eats, eva} {apple, eats} {apple, eats, tom}
3 0.2 {eats, pineapple, eva} {eats} {apple, eats, tom}
我想计算我的数据框(df)与基准词表(base)之间的 Jaccard 相似度。问题是我需要对 500 多行逐行计算,而我要么收到错误消息“要解压的值太多”或“'Series' 对象没有属性 'iterrows'”,要么函数把 base 与整个数据帧作为一个整体进行比较。
选项 A:
# Clean the sentences: keep only ASCII letters and spaces, collapse runs
# of spaces, and trim the ends.
# NOTE: the original pattern '[^A-z ]' also matched the characters between
# 'Z' and 'a' in ASCII ([ \ ] ^ _ `), which is almost never intended --
# use the explicit ranges A-Za-z, and pass regex=True explicitly (newer
# pandas no longer treats str.replace patterns as regexes by default).
sentences = pd.Series(df.sentence)
sentences = (
    sentences.str.replace('[^A-Za-z ]', '', regex=True)
    .str.replace(' +', ' ', regex=True)
    .str.strip()
)
splitwords = [nltk.word_tokenize(str(sentence)) for sentence in sentences]
print(splitwords)
sentence = df.sentence

def Jaccard_Similarity(base, sentence):
    """Return a list of (score, a, b) tuples, one per row of `sentence`.

    `base` is an iterable of words; `sentence` is a pandas Series whose
    rows are strings (or already-tokenized lists of words).

    NOTE: a Series has no .iterrows() -- iterate with .items() instead.
    The original also called df.sentence() (a Series is not callable) and
    returned inside the loop, which stopped after the first row; results
    are accumulated here so every row gets a score.
    """
    a = set(base)
    results = []
    for i, row in sentence.items():
        # Accept either a raw string or a pre-tokenized list of words.
        b = set(str(row).split()) if isinstance(row, str) else set(row)
        c = a.intersection(b)
        # Jaccard = |a ∩ b| / |a ∪ b| = |c| / (|a| + |b| - |c|)
        results.append((float(len(c)) / (len(a) + len(b) - len(c)), a, b))
    return results

Jaccard_Similarity(base, sentence)
备选方案 B:
# Tokenize each sentence into a list of words, one list per row.
# Keep them in a new column instead of overwriting the whole DataFrame.
df['tokens'] = df.apply(lambda row: nltk.word_tokenize(row['sentence']), axis=1)
print(df)

def Jaccard_Similarity(base, df):
    """Return a list of (score, a, b) tuples, one per row of df['tokens'].

    NOTE: the original called df.iterrows(df) (iterrows takes no
    argument), named its first parameter `bas` while the body read the
    global `base`, and returned inside the loop; all three are fixed
    here, and every row's result is collected.
    """
    a = set(base)
    results = []
    for _, row in df.iterrows():
        b = set(row['tokens'])
        c = a.intersection(b)
        # Jaccard = |a ∩ b| / |a ∪ b| = |c| / (|a| + |b| - |c|)
        results.append((float(len(c)) / (len(a) + len(b) - len(c)), a, b))
    return results

Jaccard_Similarity(base, df)
数据:
# Reference word list and the sentences to score against it.
base = ['Tom', 'eats', 'apple']
# NOTE: the original tuple-of-lists ending in a bare `columns=` keyword is
# not valid Python -- construct the DataFrame explicitly instead.
df = pd.DataFrame(
    ["Tom eats an apple",
     "Tom eats a pineapple",
     "Eva eats an apple",
     "Eva eats a pineapple"],
    columns=['sentence'],
)
编辑:
# Lower-case and tokenize both sides into word sets.
# NOTE: `base` is a list and `df` a DataFrame at this point, so neither
# has a .lower() method -- lower-case the individual words instead.
# (Assumes df has a 'sentence' column of strings -- TODO confirm.)
base = {word.lower() for word in base}
df = {word.lower() for sent in df['sentence'] for word in str(sent).split()}
def Jaccard_Similarity(base, df):
    """Return the Jaccard similarity |base ∩ df| / |base ∪ df| of two sets."""
    overlap = len(base.intersection(df))
    total = len(base.union(df))
    return float(overlap) / total
试试这个——解释部分还需要一些整理,我稍后会补充说明。
import nltk
from nltk.corpus import stopwords  # to remove stopwords
import pandas as pd  # NOTE: was missing from the original snippet
import numpy as np   # hoisted from mid-script to the top of the file

# Reference token list, lower-cased for case-insensitive comparison.
base = ['Tom', 'eats', 'apple']
base = [item.lower() for item in base]

stop_words = set(stopwords.words('english'))

list1 = [["Tom eats an apple"],
         ["Tom eats a pineapple"],
         ["Eva eats an apple"],
         ["Eva eats a pineapple"]]
df = pd.DataFrame(list1, columns=['sentence'])

# Tokenize, lower-case and drop stopwords: one token list per row.
# Use a separate variable instead of repeatedly rebinding `df`.
tokens = df.sentence.apply(nltk.word_tokenize)
tokens = tokens.apply(
    lambda x: [item.lower() for item in x if item.lower() not in stop_words]
)

# Jaccard = |a ∩ b| / |a ∪ b| = |c| / (|a| + |b| - |c|), computed per row.
b = tokens.apply(set)
a = set(base)
c = b.apply(lambda x: a.intersection(x))
len_a_b = b.apply(lambda x: len(x) + len(a))
len_c = c.apply(lambda x: len(x))

dict1 = {'length': len_c / (len_a_b - len_c), 'b': b, 'c': c}
df = pd.DataFrame(dict1)
# `a` is the same set for every row. NOTE: np.NAN was removed in
# NumPy 2.0 -- the lower-case np.nan is the supported spelling.
df['a'] = np.nan
df['a'] = df.a.apply(lambda x: a)
print(df)
输出-
length b c a
0 1.0 {apple, eats, tom} {apple, eats, tom} {apple, eats, tom}
1 0.5 {eats, tom, pineapple} {eats, tom} {apple, eats, tom}
2 0.5 {apple, eats, eva} {apple, eats} {apple, eats, tom}
3 0.2 {eats, pineapple, eva} {eats} {apple, eats, tom}