Python WordCloud 未删除停用词
Python WordCloud not removing Stopwords
我正在尝试编写一个词云(WordCloud)程序,它可以自动从职位描述中提取单词并生成词云。按照文档,如果设置 stopwords=None,WordCloud 应该使用其内置的停用词列表来删除停用词,但我的程序并没有删除。我认为这可能与我使用 BeautifulSoup 提取职位描述的方式有关。我需要帮助:要么是我应该用 BeautifulSoup 以不同的方式提取单词,要么是我没有正确使用停用词。
import requests
# pip install bs4
from bs4 import BeautifulSoup
# pip install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Goes to a job description
url = "https://career.benteler.jobs/job/Paderborn-Head-of-Finance-&-Controlling-North-America-NW/604307901/?locale=en_US"
# A timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() stops us from building a cloud out of an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser')

# Collect the text of every <span> on the page, each followed by a space.
# NOTE: this takes ALL spans (menus, footers, ...), not only the job
# description, so unrelated page text ends up in the cloud as well.
combinedWords = ''.join(span.text + ' ' for span in soup.find_all('span'))

# creates wordcloud
# NOTE(review): per the wordcloud docs, stopwords=None selects the library's
# built-in STOPWORDS list - confirm against the installed version.
resumeCloud = WordCloud(stopwords=None, background_color='white', max_words=75, max_font_size=75, random_state=1).generate(combinedWords)
plt.figure(figsize=(8, 4))
plt.imshow(resumeCloud)
plt.axis('off')
plt.show()
主要问题是所有代码都写在一个代码块中。请尝试将逻辑拆分为多个函数,并单独测试每一部分。另外,请求没有检查错误(例如服务器可能不可用),不过目前这应该不是问题。
BeautifulSoup 正在提取页面上的所有 span 元素,这意味着结果中会包含菜单和页脚的文字。如果您只想要职位描述,那么应该选择 class 名称为 jobdescription 的 span 元素。之后可以调用 text 来去除 HTML 标签。我不确定您是否还需要删除其他内容,例如逗号和句号。
我没有任何使用 wordcloud 的经验。不过,下面的代码确实返回了一些看起来合理的结果。
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def get_job_html(url, timeout=10):
    """Download a job-posting page and return its HTML as text.

    Args:
        url: Address of the job posting to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        The response body decoded as text.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx or
            5xx status code.
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # check for 4xx & 5xx errors
    return response.text
def extract_combined_words(html):
    """Return the job-description text of a posting with newlines flattened.

    Only the first ``<span class="jobdescription">`` element is used, so
    menus, footers and other page chrome are left out.

    Args:
        html: HTML source of the job-posting page.

    Returns:
        The text of the job-description span, with every newline replaced
        by a single space.

    Raises:
        ValueError: If the page contains no ``<span class="jobdescription">``
            element.
    """
    soup = BeautifulSoup(html, 'html.parser')
    description_span = soup.find("span", {"class": "jobdescription"})
    if description_span is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on ``None.text``.
        raise ValueError('no <span class="jobdescription"> element found in the page')
    # .text drops the markup and keeps only the text content.
    job_description = description_span.text.replace('\n', ' ')
    print(job_description)  # TODO - Check this is the results you expect?
    return job_description
def create_resume_cloud(combinedWords):
    """Generate a word cloud from the given text.

    Args:
        combinedWords: Text whose words should be rendered in the cloud.

    Returns:
        Whatever ``WordCloud.generate`` returns for the given text.
    """
    # NOTE(review): per the wordcloud docs, stopwords=None selects the
    # library's built-in STOPWORDS list, and collocations (two-word phrases)
    # are enabled by default. If stopwords still show up inside phrases, try
    # collocations=False - confirm against the installed version.
    cloud_options = {
        'stopwords': None,
        'background_color': 'white',
        'max_words': 75,
        'max_font_size': 75,
        'random_state': 1,  # fixed seed
    }
    cloud = WordCloud(**cloud_options)
    return cloud.generate(combinedWords)
def plot_resume_cloud(resumeCloud):
    """Display the given word cloud in a new matplotlib figure.

    Args:
        resumeCloud: The word cloud image to draw.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    figure_size = (8, 4)
    plt.figure(figsize=figure_size)
    plt.imshow(resumeCloud)
    plt.axis('off')  # hide the axes; only the cloud itself is of interest
    plt.show()
def run(url):
    """Fetch a job posting, build its word cloud and display it.

    Args:
        url: Address of the job posting to visualise.

    Returns:
        The return value of ``plot_resume_cloud``.
    """
    page_html = get_job_html(url)
    description_text = extract_combined_words(page_html)
    cloud = create_resume_cloud(description_text)
    # plot_resume_cloud has no return statement, so this evaluates to None.
    plot_result = plot_resume_cloud(cloud)
    return plot_result  # TODO - not sure how the results gets consumed
# Script entry point: build and show the word cloud for one sample posting.
if __name__ == '__main__':
    job_posting_url = "https://career.benteler.jobs/job/Paderborn-Head-of-Finance-&-Controlling-North-America-NW/604307901/?locale=en_US"
    run(job_posting_url)
我正在尝试编写一个词云(WordCloud)程序,它可以自动从职位描述中提取单词并生成词云。按照文档,如果设置 stopwords=None,WordCloud 应该使用其内置的停用词列表来删除停用词,但我的程序并没有删除。我认为这可能与我使用 BeautifulSoup 提取职位描述的方式有关。我需要帮助:要么是我应该用 BeautifulSoup 以不同的方式提取单词,要么是我没有正确使用停用词。
import requests
# pip install bs4
from bs4 import BeautifulSoup
# pip install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Goes to a job description
url = "https://career.benteler.jobs/job/Paderborn-Head-of-Finance-&-Controlling-North-America-NW/604307901/?locale=en_US"
# A timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() stops us from building a cloud out of an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser')

# Collect the text of every <span> on the page, each followed by a space.
# NOTE: this takes ALL spans (menus, footers, ...), not only the job
# description, so unrelated page text ends up in the cloud as well.
combinedWords = ''.join(span.text + ' ' for span in soup.find_all('span'))

# creates wordcloud
# NOTE(review): per the wordcloud docs, stopwords=None selects the library's
# built-in STOPWORDS list - confirm against the installed version.
resumeCloud = WordCloud(stopwords=None, background_color='white', max_words=75, max_font_size=75, random_state=1).generate(combinedWords)
plt.figure(figsize=(8, 4))
plt.imshow(resumeCloud)
plt.axis('off')
plt.show()
主要问题是所有代码都写在一个代码块中。请尝试将逻辑拆分为多个函数,并单独测试每一部分。另外,请求没有检查错误(例如服务器可能不可用),不过目前这应该不是问题。
BeautifulSoup 正在提取页面上的所有 span 元素,这意味着结果中会包含菜单和页脚的文字。如果您只想要职位描述,那么应该选择 class 名称为 jobdescription 的 span 元素。之后可以调用 text 来去除 HTML 标签。我不确定您是否还需要删除其他内容,例如逗号和句号。
我没有任何使用 wordcloud 的经验。不过,下面的代码确实返回了一些看起来合理的结果。
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def get_job_html(url, timeout=10):
    """Download a job-posting page and return its HTML as text.

    Args:
        url: Address of the job posting to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        The response body decoded as text.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx or
            5xx status code.
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # check for 4xx & 5xx errors
    return response.text
def extract_combined_words(html):
    """Return the job-description text of a posting with newlines flattened.

    Only the first ``<span class="jobdescription">`` element is used, so
    menus, footers and other page chrome are left out.

    Args:
        html: HTML source of the job-posting page.

    Returns:
        The text of the job-description span, with every newline replaced
        by a single space.

    Raises:
        ValueError: If the page contains no ``<span class="jobdescription">``
            element.
    """
    soup = BeautifulSoup(html, 'html.parser')
    description_span = soup.find("span", {"class": "jobdescription"})
    if description_span is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on ``None.text``.
        raise ValueError('no <span class="jobdescription"> element found in the page')
    # .text drops the markup and keeps only the text content.
    job_description = description_span.text.replace('\n', ' ')
    print(job_description)  # TODO - Check this is the results you expect?
    return job_description
def create_resume_cloud(combinedWords):
    """Generate a word cloud from the given text.

    Args:
        combinedWords: Text whose words should be rendered in the cloud.

    Returns:
        Whatever ``WordCloud.generate`` returns for the given text.
    """
    # NOTE(review): per the wordcloud docs, stopwords=None selects the
    # library's built-in STOPWORDS list, and collocations (two-word phrases)
    # are enabled by default. If stopwords still show up inside phrases, try
    # collocations=False - confirm against the installed version.
    cloud_options = {
        'stopwords': None,
        'background_color': 'white',
        'max_words': 75,
        'max_font_size': 75,
        'random_state': 1,  # fixed seed
    }
    cloud = WordCloud(**cloud_options)
    return cloud.generate(combinedWords)
def plot_resume_cloud(resumeCloud):
    """Display the given word cloud in a new matplotlib figure.

    Args:
        resumeCloud: The word cloud image to draw.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    figure_size = (8, 4)
    plt.figure(figsize=figure_size)
    plt.imshow(resumeCloud)
    plt.axis('off')  # hide the axes; only the cloud itself is of interest
    plt.show()
def run(url):
    """Fetch a job posting, build its word cloud and display it.

    Args:
        url: Address of the job posting to visualise.

    Returns:
        The return value of ``plot_resume_cloud``.
    """
    page_html = get_job_html(url)
    description_text = extract_combined_words(page_html)
    cloud = create_resume_cloud(description_text)
    # plot_resume_cloud has no return statement, so this evaluates to None.
    plot_result = plot_resume_cloud(cloud)
    return plot_result  # TODO - not sure how the results gets consumed
# Script entry point: build and show the word cloud for one sample posting.
if __name__ == '__main__':
    job_posting_url = "https://career.benteler.jobs/job/Paderborn-Head-of-Finance-&-Controlling-North-America-NW/604307901/?locale=en_US"
    run(job_posting_url)