Creating a timed loop inside a Discord bot script to reload a web page (web scraper bot)
I am currently designing a Discord bot that scrapes a web page that is constantly updating with patches related to the PBE server. I have the bot successfully running through Heroku right now. The problem I've run into is that I want to create an automatic (timed loop) refresh that reloads the website I requested. Currently, it only loads one instance of the website, and if that website changes/updates, none of my content will update, because I am still using the "old" request of the website.

Is there a way for me to embed the code inside a function so that I can create a timed loop, or do I just need to create one around my website request? What would that look like? Thanks!
from bs4 import BeautifulSoup
from urllib.request import urlopen
from discord.ext import commands
import discord

# what I want the commands to start with
bot = commands.Bot(command_prefix='!')

# instantiating discord client
token = "************************************"
client = discord.Client()

# begin the scraping of passed in web page
URL = "*********************************"
page = urlopen(URL)
soup = BeautifulSoup(page, 'html.parser')
# using soup to find all header tags with the news-title class and storing them in pbe_titles
pbe_titles = soup.find_all('h1', attrs={'class': 'news-title'})

linksAndTitles = []
counter = 0

# finding tags that start with 'a' as in a href and appending those titles/links
for tag in pbe_titles:
    for anchor in tag.find_all('a'):
        linksAndTitles.append(tag.text.strip())
        linksAndTitles.append(anchor['href'])

# counts number of lines stored inside linksAndTitles list
for i in linksAndTitles:
    counter = counter + 1
print(counter)

# separates list by line so that it looks nice when printing
allPatches = '\n'.join(str(line) for line in linksAndTitles[:counter])
# stores the first two lines in list which is the current pbe patch title and link
currPatch = '\n'.join(str(line) for line in linksAndTitles[:2])

# command that allows user to type in exactly what patch they want to see information for based off date
@bot.command(name='patch')
async def pbe_patch(ctx, *, arg):
    if any(item.startswith(arg) for item in linksAndTitles):
        await ctx.send(arg + " exists!")
    else:
        await ctx.send('The date you entered: ' + '"' + arg + '"' + ' does not have a patch associated with it or that patch expired.')

# command that displays the current, most up to date, patch
@bot.command(name='current')
async def current_patch(ctx):
    response = currPatch
    await ctx.send(response)

bot.run(token)
I have played around with a while True: loop, but whenever I nest anything inside it I can't access the code anywhere else.
discord has a special tasks decorator for running some code periodically:
from discord.ext import tasks

@tasks.loop(seconds=5.0)
async def scrape():
    ...  # your scraping code

# ... your commands ...

scrape.start()
bot.run(token)
It will repeat the function scrape every 5 seconds.

Documentation: tasks
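As a side note (my addition, not part of the original answer): if the first scrape should only run once the bot is connected, discord.ext.tasks also provides a before_loop hook that delays the first iteration; a minimal sketch:

@scrape.before_loop
async def before_scrape():
    # wait until the bot has logged in and is ready before the first scrape
    await bot.wait_until_ready()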
On Linux I would eventually use the standard cron service to run some script periodically. That script could scrape the data and save it in a file or database, and the discord bot could read it from that file or database. But cron checks its task list every 1 minute, so it can't run a task more often than that.
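To illustrate that split, here is a minimal sketch; the file names scraper.py and patches.json are assumptions for the example, not part of the original answer:

# scraper.py -- run by cron, e.g. with a crontab entry such as:
#   * * * * * /usr/bin/python3 /path/to/scraper.py
import json
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "http://books.toscrape.com/"
soup = BeautifulSoup(urlopen(url), 'html.parser')

# collect title/link pairs the same way the bot code below does
data = [{'title': tag.text.strip(), 'link': url + anchor['href']}
        for tag in soup.find_all('h3')
        for anchor in tag.find_all('a')]

# write the results where the bot can pick them up with json.load()
with open('/path/to/patches.json', 'w') as f:
    json.dump(data, f)

The bot would then json.load() this file inside its commands (or in a short tasks.loop) instead of scraping the page itself.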
EDIT:

Minimal working code.

I used the page http://books.toscrape.com, which was created for learning scraping.
I changed a few elements. When you have bot, there is no need to create client, because bot is a special kind of client.
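That relationship can be checked directly; a one-line illustration (my addition, not from the original answer):

import discord
from discord.ext import commands

print(issubclass(commands.Bot, discord.Client))  # True: Bot extends Client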
I keep each title and link as a dictionary,

{
    'title': tag.text.strip(),
    'link': url + anchor['href'],
}

so later it is easier to create text like

title: A Light in the ...
link: http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html
import os

import discord
from discord.ext import commands, tasks
from bs4 import BeautifulSoup
from urllib.request import urlopen

# default values at start (before `scrape` assigns new values),
# because some function may try to use these variables before `scrape` creates them
links_and_titles = []   # PEP8: `lower_case_names`
counter = 0
items = []

bot = commands.Bot(command_prefix='!')

@tasks.loop(seconds=5)
async def scrape():
    global links_and_titles
    global counter
    global items

    url = "http://books.toscrape.com/"
    page = urlopen(url)
    soup = BeautifulSoup(page, 'html.parser')
    #pbe_titles = soup.find_all('h1', attrs={'class': 'news-title'})
    pbe_titles = soup.find_all('h3')

    # remove previous content
    links_and_titles = []

    for tag in pbe_titles:
        for anchor in tag.find_all('a'):
            links_and_titles.append({
                'title': tag.text.strip(),
                'link': url + anchor['href'],
            })

    counter = len(links_and_titles)
    print('counter:', counter)

    items = [f"title: {x['title']}\nlink: {x['link']}" for x in links_and_titles]

@bot.command(name='patch')
async def pbe_patch(ctx, *, arg=None):
    if arg is None:
        await ctx.send('Use: !patch date')
    elif any(item['title'].startswith(arg) for item in links_and_titles):
        await ctx.send(arg + " exists!")
    else:
        await ctx.send(f'The date you entered: "{arg}" does not have a patch associated with it or that patch expired.')

@bot.command(name='current')
async def current_patch(ctx, *, number: int = 1):
    # annotating `number` as int makes discord.py convert the argument instead of passing a string
    if items:
        responses = items[:number]
        text = '\n----\n'.join(responses)
        await ctx.send(text)
    else:
        await ctx.send('no patches')

scrape.start()

token = os.getenv('DISCORD_TOKEN')
bot.run(token)
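One caveat worth noting (my addition, not from the original answer): urlopen is a blocking call, so a slow response inside the tasks.loop body can stall the whole event loop and make commands unresponsive. A sketch of one way around it, assuming Python 3.9+ for asyncio.to_thread:

import asyncio

def fetch(url):
    # blocking download, kept out of the event loop
    return urlopen(url).read()

@tasks.loop(seconds=5)
async def scrape():
    # run the blocking fetch in a worker thread so the bot stays responsive
    html = await asyncio.to_thread(fetch, "http://books.toscrape.com/")
    soup = BeautifulSoup(html, 'html.parser')
    # ... parse links_and_titles as above ...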