How to get a list of words after cleaning the data with stemming
Currently I only get one line back. How can I get all the words? I have a column of words, and the problem is in the stemming step: it returns a single line instead of all the words.
My goal is to clean the data and print all the words separated by commas.
Input: word1,word2,word3,word4,word5 in each row of the column df['Tag'].
The output should be one long list containing all the values: word1, word2, word3, word4, word5, word6, word7, ...
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import re
import spacy
import pytextrank
def Clean_stop_words(data):
    #print(stopwords.words('english'))
    stop_words = stopwords.words('english')
    new_data = ""
    for word in data:
        np.char.lower(word)
        if word not in stop_words:
            new_data = data + " , " + word
            print(new_data)
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        new_data = np.char.replace(new_text, i, ' ')
    #print(data)
    stemmer = PorterStemmer()
    new_data = stemmer.stem(word)
    #print(new_data)

Clean_stop_words(df["Tag"])
#print(data)
Thanks in advance.
Note:
I decided to clean the special characters with a regex; feel free to change that approach if you prefer.
Also, note pandas' apply function, which takes each row and runs the Clean_stop_words function on it.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
import re

l = ["'word1,wording,w#ord,he##llo,sleeping,don't"]
df = pd.DataFrame(l, columns=['Tag'])

def Clean_stop_words(data):
    stemmer = PorterStemmer()
    stop_words = stopwords.words('english')
    cleaned = []
    # each row holds comma-separated words, so split on the commas
    for word in data.split(','):
        # strip special characters with the regex, then lowercase
        word = re.sub('[^A-Za-z0-9]+', '', word).lower()
        if word and word not in stop_words:
            # stem() returns a new string, so keep its result
            cleaned.append(stemmer.stem(word))
    return " , ".join(cleaned)

df['Tag'] = df['Tag'].apply(Clean_stop_words)
print(df['Tag'])
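For the sample row above this should print something like word1 , word , word , hello , sleep , dont. If you also want one long list of words across all rows, as the question describes, a minimal sketch building on the cleaned column could look like this (the name all_words is just illustrative):

# Hypothetical follow-up: flatten every cleaned row into a single list of words.
all_words = [w for row in df['Tag'] for w in row.split(' , ') if w]
print(all_words)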