只获取列表中最后一个循环的数据
Only get the last loop's data in list
我正在尝试通过制作一个小项目来学习 python/beautifulsoup 和 Django。对于这个项目,我试图从一个网站上抓取食谱,然后在页面上展示随机选择的一个食谱。为此,我编写了一段代码,当我只抓取第一页(35 个食谱)时,它可以完美运行。但是:我也想从第 2 页和第 3 页获取食谱。我想我应该为此写一个循环,但我似乎做不对。
该循环可以正常抓取网站,但只有最后一次循环的数据被存储在为食谱条目创建的列表中。我该如何修改此代码,使信息被追加到列表中而不是被覆盖?
该代码对列表中的前 35 个项目有效(每个页面上有 35 个食谱),但对索引更大的项目无效。
from django.shortcuts import render
import requests
import re
from bs4 import BeautifulSoup
import random
# Create your views here.
# Django view from the question: scrapes recipe cards from the ah.nl recipe
# search pages and renders one randomly selected recipe.
# NOTE(review): indentation was lost in this listing, so the extent of each
# loop below is inferred from the surrounding description - confirm against
# the original post before relying on these notes.
def recipe(request):
#Create soup
# range(0,2) yields page=0 and page=1, i.e. two requests.
for page in range(0,2):
webpage_response = requests.get(f"https://www.ah.nl/allerhande/recepten-zoeken?page={page}" )
webpage = webpage_response.content
soup = BeautifulSoup(webpage, "html.parser")
# Anchors and images are matched by class-name prefix via regular expressions.
recipe_links = soup.find_all('a', attrs={'class' : re.compile('^display-card_root__.*')})
recipe_pictures = soup.find_all('img', attrs={'class' : re.compile('^card-image-set_imageSet__.*')})
# Prep-time text is the first <li> of every matching <ul>; this name is
# assigned from the current soup rather than appended to.
recipe_prep_time = [ul.find('li').text
for ul in soup.find_all('ul',
attrs={'class': re.compile('^recipe-card-properties_root')})]
#Set up lists
# NOTE(review): the question reports that only the last page's data is kept;
# that is consistent with these lists being re-created on every pass of the
# page loop - presumably they should be created once, before the loop.
links = []
titles = []
pictures = []
#create prefix for link
prefix = "https://ah.nl"
#scrape page for recipe
# Both links and titles are read from the same anchor elements.
for link in recipe_links:
links.append(prefix + link.get('href'))
for title in recipe_links:
titles.append(title.get('aria-label'))
for img in recipe_pictures:
pictures.append(img.get('data-srcset'))
#create random int to select a recipe
# random.randint includes both endpoints, so any index 0..105 may be drawn.
# NOTE(review): this raises IndexError unless at least 106 items were
# collected - verify against the number of cards actually scraped.
nummer = random.randint(0,105)
#select correct link for image
# Splits the srcset string on single spaces; element 16 is used below.
pic_url = pictures[nummer].split(' ')
#create context
context = {
"titles" : titles[nummer],
"pictures" : pic_url[16],
"preptime" : recipe_prep_time[nummer],
"link" : links[nummer]
}
#render page
return render(request, "randomRecipe/recipe.html", context)
好主意 - 我自己总是无法决定何时提供如此好和压倒性的问题。
正如 @Barmar 已经提到的那样,使用更结构化的方法存储抓取的信息会更精简 - 例如使用一个列表 data,其中包含结构与 context 类似的字典。
您还可以更具体地选择(select)您的元素:
# Build one dict per recipe card; cards are the <a> elements selected by
# their data-testhook attribute. `soup` and `prefix` come from the
# surrounding code.
data = [
    {
        'title' : card.span.text,
        'picture' : card.img.get('data-srcset').split()[1],
        'preptime' : card.li.text,
        'link' : prefix+card['href']
    }
    for card in soup.select('a[data-testhook="recipe-card"]')
]
例子
from django.shortcuts import render
import requests
import re
from bs4 import BeautifulSoup
import random
# Create your views here.
def recipe(request):
    """Render one randomly chosen recipe scraped from the ah.nl recipe search.

    Requests each search result page, collects one dict per recipe card
    (keys ``title``, ``picture``, ``preptime``, ``link``) into a single list
    shared by all pages, and hands one randomly selected dict to the
    template as its context.

    Args:
        request: The incoming Django request, forwarded to ``render``.

    Returns:
        The response produced by ``render`` for
        ``randomRecipe/recipe.html``. If no recipe card was found the
        template is rendered with an empty context.
    """
    #create prefix for link
    prefix = "https://ah.nl"
    #Create soup
    # The list is created once, before the page loop, so every page adds to it.
    data = []
    for page in range(0,2):
        # A timeout keeps a stalled upstream from blocking this view forever.
        webpage_response = requests.get(
            f"https://www.ah.nl/allerhande/recepten-zoeken?page={page}",
            timeout=10,
        )
        webpage = webpage_response.content
        soup = BeautifulSoup(webpage, "html.parser")
        for e in soup.select('a[data-testhook="recipe-card"]'):
            data.append({
                'title' : e.span.text,
                'picture' : e.img.get('data-srcset').split()[1],
                'preptime' : e.li.text,
                'link' : prefix+e['href']
            })
    #select a random recipe
    # random.randint(0, len(data)) includes len(data) itself and could index
    # one past the end; random.choice always picks an existing element.
    context = random.choice(data) if data else {}
    #render page
    return render(request, "randomRecipe/recipe.html", context)
上下文
{'title': 'Noedels met sticky sriracha-aubergine, cashewnoten en garnalen',
'picture': 'https://static.ah.nl/static/recepten/img_RAM_PRD159203_220x162_JPG.jpg',
'preptime': '45 min',
'link': 'https://ah.nl/allerhande/recept/R-R1196327/noedels-met-sticky-sriracha-aubergine-cashewnoten-en-garnalen'}
我正在尝试通过制作一个小项目来学习 python/beautifulsoup 和 Django。对于这个项目,我试图从一个网站上抓取食谱,然后在页面上展示随机选择的一个食谱。为此,我编写了一段代码,当我只抓取第一页(35 个食谱)时,它可以完美运行。但是:我也想从第 2 页和第 3 页获取食谱。我想我应该为此写一个循环,但我似乎做不对。 该循环可以正常抓取网站,但只有最后一次循环的数据被存储在为食谱条目创建的列表中。我该如何修改此代码,使信息被追加到列表中而不是被覆盖? 该代码对列表中的前 35 个项目有效(每个页面上有 35 个食谱),但对索引更大的项目无效。
from django.shortcuts import render
import requests
import re
from bs4 import BeautifulSoup
import random
# Create your views here.
# Django view from the question: scrapes recipe cards from the ah.nl recipe
# search pages and renders one randomly selected recipe.
# NOTE(review): indentation was lost in this listing, so the extent of each
# loop below is inferred from the surrounding description - confirm against
# the original post before relying on these notes.
def recipe(request):
#Create soup
# range(0,2) yields page=0 and page=1, i.e. two requests.
for page in range(0,2):
webpage_response = requests.get(f"https://www.ah.nl/allerhande/recepten-zoeken?page={page}" )
webpage = webpage_response.content
soup = BeautifulSoup(webpage, "html.parser")
# Anchors and images are matched by class-name prefix via regular expressions.
recipe_links = soup.find_all('a', attrs={'class' : re.compile('^display-card_root__.*')})
recipe_pictures = soup.find_all('img', attrs={'class' : re.compile('^card-image-set_imageSet__.*')})
# Prep-time text is the first <li> of every matching <ul>; this name is
# assigned from the current soup rather than appended to.
recipe_prep_time = [ul.find('li').text
for ul in soup.find_all('ul',
attrs={'class': re.compile('^recipe-card-properties_root')})]
#Set up lists
# NOTE(review): the question reports that only the last page's data is kept;
# that is consistent with these lists being re-created on every pass of the
# page loop - presumably they should be created once, before the loop.
links = []
titles = []
pictures = []
#create prefix for link
prefix = "https://ah.nl"
#scrape page for recipe
# Both links and titles are read from the same anchor elements.
for link in recipe_links:
links.append(prefix + link.get('href'))
for title in recipe_links:
titles.append(title.get('aria-label'))
for img in recipe_pictures:
pictures.append(img.get('data-srcset'))
#create random int to select a recipe
# random.randint includes both endpoints, so any index 0..105 may be drawn.
# NOTE(review): this raises IndexError unless at least 106 items were
# collected - verify against the number of cards actually scraped.
nummer = random.randint(0,105)
#select correct link for image
# Splits the srcset string on single spaces; element 16 is used below.
pic_url = pictures[nummer].split(' ')
#create context
context = {
"titles" : titles[nummer],
"pictures" : pic_url[16],
"preptime" : recipe_prep_time[nummer],
"link" : links[nummer]
}
#render page
return render(request, "randomRecipe/recipe.html", context)
好主意 - 我自己总是无法决定何时提供如此好和压倒性的问题。
正如 @Barmar 已经提到的那样,使用更结构化的方法存储抓取的信息会更精简 - 例如使用一个列表 data,其中包含结构与 context 类似的字典。
您还可以更具体地选择(select)您的元素:
# Build one dict per recipe card; cards are the <a> elements selected by
# their data-testhook attribute. `soup` and `prefix` come from the
# surrounding code.
data = [
    {
        'title' : card.span.text,
        'picture' : card.img.get('data-srcset').split()[1],
        'preptime' : card.li.text,
        'link' : prefix+card['href']
    }
    for card in soup.select('a[data-testhook="recipe-card"]')
]
例子
from django.shortcuts import render
import requests
import re
from bs4 import BeautifulSoup
import random
# Create your views here.
def recipe(request):
    """Render one randomly chosen recipe scraped from the ah.nl recipe search.

    Requests each search result page, collects one dict per recipe card
    (keys ``title``, ``picture``, ``preptime``, ``link``) into a single list
    shared by all pages, and hands one randomly selected dict to the
    template as its context.

    Args:
        request: The incoming Django request, forwarded to ``render``.

    Returns:
        The response produced by ``render`` for
        ``randomRecipe/recipe.html``. If no recipe card was found the
        template is rendered with an empty context.
    """
    #create prefix for link
    prefix = "https://ah.nl"
    #Create soup
    # The list is created once, before the page loop, so every page adds to it.
    data = []
    for page in range(0,2):
        # A timeout keeps a stalled upstream from blocking this view forever.
        webpage_response = requests.get(
            f"https://www.ah.nl/allerhande/recepten-zoeken?page={page}",
            timeout=10,
        )
        webpage = webpage_response.content
        soup = BeautifulSoup(webpage, "html.parser")
        for e in soup.select('a[data-testhook="recipe-card"]'):
            data.append({
                'title' : e.span.text,
                'picture' : e.img.get('data-srcset').split()[1],
                'preptime' : e.li.text,
                'link' : prefix+e['href']
            })
    #select a random recipe
    # random.randint(0, len(data)) includes len(data) itself and could index
    # one past the end; random.choice always picks an existing element.
    context = random.choice(data) if data else {}
    #render page
    return render(request, "randomRecipe/recipe.html", context)
上下文
{'title': 'Noedels met sticky sriracha-aubergine, cashewnoten en garnalen',
'picture': 'https://static.ah.nl/static/recepten/img_RAM_PRD159203_220x162_JPG.jpg',
'preptime': '45 min',
'link': 'https://ah.nl/allerhande/recept/R-R1196327/noedels-met-sticky-sriracha-aubergine-cashewnoten-en-garnalen'}