requests_html infinite scrolling on a div instead of the entire page
Hi, I am trying to get all the product links from the web page below. The page loads new products as you scroll down, so I tried to collect every product link by scrolling to the bottom of the page. Following this post, I used the scrolldown option of requests_html, but it only picks up the product links that are visible without scrolling. The problem is that it scrolls down the entire page rather than the product frame. As you can see in the image below, products are only loaded once you scroll to the bottom of the product frame.
I also tried seleniumwire (see the code below), but it does the same thing: it scrolls to the bottom of the page without any products being loaded. How can I scroll only the product div?
import time

import requests
from bs4 import BeautifulSoup
from seleniumwire import webdriver  # seleniumwire's webdriver wraps selenium's, so import only this one
baseurl = "https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002"
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3729.169 Safari/537.36 '
}
driver = webdriver.Chrome(executable_path="/src/resources/chromedriver")
driver.implicitly_wait(30)
product_links = []
try:
    SCROLL_PAUSE_TIME = 2

    def interceptor(request):
        del request.headers['Referer']  # Delete the header first
        request.headers['Referer'] = baseurl  # header values must be strings, not the header dict

    # Set the interceptor on the driver
    driver.request_interceptor = interceptor
    # All requests will now send baseurl as the Referer
    driver.get(baseurl)

    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    # r = requests.get(driver.page_source, headers=header)
    print(driver.page_source)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # product_list = soup.find_all('div', class_='col-item productInfoDiv ')
    #
    # for itemprop in product_list:
    #     for link in itemprop.find_all('a', href=True):
    #         product_links.append("{}{}".format(baseurl, link['href']))
    #
    # product_links_uniq = set(product_links)
    #
    # print(product_links_uniq)
finally:
    driver.quit()
from requests_html import HTMLSession
baseurl = "https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002"
session = HTMLSession()
page = session.get(baseurl)
page.html.render(scrolldown=50, sleep=3)
html = page.html  # use the rendered DOM; page.text is the pre-render response body
# noticeName = html.find('a[href]')
all_links = html.links
for ln in all_links:
    print(ln)
print(len(all_links))
filtered_links = [link for link in all_links if link.startswith("/product")]
print(len(filtered_links))
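One way to scroll only the product container (rather than the window) is to run JavaScript against that element from Selenium. Below is a minimal sketch, not from the original post: it assumes the products sit inside a scrollable element, and the .productListDiv selector is only a placeholder that would need to be replaced with the page's real container.

import time

from selenium import webdriver
from selenium.webdriver.common.by import By

SCROLL_PAUSE_TIME = 2
baseurl = "https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002"

driver = webdriver.Chrome()
try:
    driver.get(baseurl)

    # Placeholder selector: point this at the scrollable div that actually holds the products
    container = driver.find_element(By.CSS_SELECTOR, ".productListDiv")

    last_height = driver.execute_script("return arguments[0].scrollHeight", container)
    while True:
        # Scroll the container element itself instead of the window
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", container)
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = driver.execute_script("return arguments[0].scrollHeight", container)
        if new_height == last_height:
            break
        last_height = new_height

    # Collect product links from inside the container once scrolling stops growing it
    links = [a.get_attribute("href")
             for a in container.find_elements(By.CSS_SELECTOR, "a[href*='/product']")]
    print(len(links))
finally:
    driver.quit()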
You can mimic the POST request the page makes and keep requesting batches of 20 results, extracting the links each time, until the stated total number of results has been collected.
import requests
import math
from bs4 import BeautifulSoup as bs
def add_product_links(soup):
    product_links.extend(['https://www.medplusmart.com' + i['href']
                          for i in soup.select('.productInfoDiv > div:nth-child(1) > [href^="/product"]')])
product_links = []
n = 0
results_per_page = 20
page = 1
data = {
    'sortField': '',
    'startIndex': n,
    'productCategoryId': 'MART_20002',
    'startPrice': '',
    'endPrice': '',
    'minPrice': '0',
    'maxPrice': '2650',
    'excludeNoStock': 'N',
    'pCatName': 'personal-care_10102',
    'catName': 'skin-care_20002',
    'productIdString': '',
    'Brand Search': ''
}
with requests.Session() as s:
    s.headers = {"User-Agent": "Safari/537.36"}
    r = s.get('https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002')
    soup = bs(r.content, 'lxml')

    data['productIdString'] = soup.select_one('#productIdString')['value']
    num_results = int(soup.select_one('#totalProductFound')['value'])
    num_pages = math.ceil(num_results / results_per_page)
    add_product_links(soup)

    s.headers.update({'x-kl-ajax-request': 'Ajax_Request'})

    while True:
        if page > num_pages:
            break
        data['startIndex'] = n
        r = s.post('https://www.medplusmart.com/loadMoreProduct.mart', data=data)
        soup = bs(r.content, 'lxml')
        add_product_links(soup)
        n += results_per_page
        page += 1

print(len(product_links))
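Since the first POST is sent with startIndex 0, which the initial GET page may already cover, deduplicating the collected links before counting can be useful; a small follow-up, not part of the original answer:

# Drop duplicates while keeping the original order
unique_links = list(dict.fromkeys(product_links))
print(len(unique_links))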