Python 从网站抓取
Python Scraping from website
我已经尝试为 https://www.waug.com/area/?idx=15 编写网络抓取工具:
#!/usr/bin/env python3
#_*_coding:utf8_*_
import requests
from bs4 import BeautifulSoup
url = requests.get('https://www.abcd.com/area/?abc=15')
html = url.text
soup = BeautifulSoup(html, 'html.parser')
count = 1
names = soup.select('#good_{} > div > div.class_name > div > div'.format(count))
prices = soup.select('#good_{} > div > div.class_name > div.class_name'.format(count))
for name in names:
while count < 45:
print(name.text)
count = count + 1
for price in prices:
while count < 45:
print(price.text)
count = count + 1
输出只有第一个项目名称的 45 倍,没有价格。我怎样才能得到所有的商品名称和价格?我想在同一行获取项目名称和价格。 (为了以防万一,我更改了 url 和一些 class 名称)
为了确保获得正确标题的正确名称,我会得到整个 "item-good" class.
然后使用 for 循环可以确保我获得的标题与其价格相符。
下面是一个如何使用 BeautifulSoup 解析网站的示例:
#!/usr/bin/env python3
#_*_coding:utf8_*_
import requests
from bs4 import BeautifulSoup
url = requests.get('https://www.waug.com/area/?idx=15')
html = url.text
soup = BeautifulSoup(html, 'html.parser')
count = 1
items = soup.findAll("div", {"class": "item-good"})
for item in items:
item_title = item.find("div", {"class": "good-title-text"})
item_price = item.find("div", {"class": "price-selling"})
print item_title.text + " " + item_price.text
# If you get encoding errors delete the row above and uncomment the one below
#print item_title.text.encode("utf-8") + " " + item_price.text.encode("utf-8")
根据 OP 的要求,这还不够,因为有一个 "more" 按钮可以推入网页以检索所有结果。
这可以使用 Selenium Webdriver 来完成。
=== 重要说明 ===
为了完成这项工作,您还需要在脚本文件夹中复制 "chromedriver" 文件。
您可以从 this Google website 下载它。
这是脚本:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
browser = webdriver.Chrome()
browser.get('https://www.waug.com/area/?idx=15')
for number in range(10):
try:
WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.ID, "more_good")))
more_button = browser.find_element_by_id('more_good')
more_button.click()
time.sleep(10)
except:
print "Scrolling is now complete!"
source = browser.page_source
# This source variable should be used as input for BeautifulSoup
print source
现在需要合并两个已解释的灵魂以获得最终请求的结果。
请记住,这只是一个快速的'n'dirty hack,需要适当的错误处理和完善,但它应该足以让你开始:
#!/usr/bin/env python3
#_*_coding:utf8_*_
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
browser = webdriver.Chrome()
browser.get('https://www.waug.com/area/?idx=15')
def is_page_load_complete():
close_button = browser.find_element_by_id('close_good');
return close_button.is_displayed();
while(True):
WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.ID, "more_good")))
time.sleep(10)
more_button = browser.find_element_by_id('more_good')
if (more_button.is_displayed()):
more_button.click()
else:
if (is_page_load_complete()):
break
source = browser.page_source
soup = BeautifulSoup(source, 'html.parser')
items = soup.findAll("div", {"class": "item-good"})
for item in items:
item_title = item.find("div", {"class": "good-title-text"})
item_price = item.find("div", {"class": "price-selling"})
print item_title.text + " " + item_price.text
# If you get encoding errors comment the row above and uncomment the one below
#print item_title.text.encode("utf-8") + " " + item_price.text.encode("utf-8")
print "Total items found: " + str(len(items))
我已经尝试为 https://www.waug.com/area/?idx=15 编写网络抓取工具:
#!/usr/bin/env python3
#_*_coding:utf8_*_
import requests
from bs4 import BeautifulSoup
url = requests.get('https://www.abcd.com/area/?abc=15')
html = url.text
soup = BeautifulSoup(html, 'html.parser')
count = 1
names = soup.select('#good_{} > div > div.class_name > div > div'.format(count))
prices = soup.select('#good_{} > div > div.class_name > div.class_name'.format(count))
for name in names:
while count < 45:
print(name.text)
count = count + 1
for price in prices:
while count < 45:
print(price.text)
count = count + 1
输出只有第一个项目名称的 45 倍,没有价格。我怎样才能得到所有的商品名称和价格?我想在同一行获取项目名称和价格。 (为了以防万一,我更改了 url 和一些 class 名称)
为了确保获得正确标题的正确名称,我会得到整个 "item-good" class.
然后使用 for 循环可以确保我获得的标题与其价格相符。
下面是一个如何使用 BeautifulSoup 解析网站的示例:
#!/usr/bin/env python3
#_*_coding:utf8_*_
import requests
from bs4 import BeautifulSoup
url = requests.get('https://www.waug.com/area/?idx=15')
html = url.text
soup = BeautifulSoup(html, 'html.parser')
count = 1
items = soup.findAll("div", {"class": "item-good"})
for item in items:
item_title = item.find("div", {"class": "good-title-text"})
item_price = item.find("div", {"class": "price-selling"})
print item_title.text + " " + item_price.text
# If you get encoding errors delete the row above and uncomment the one below
#print item_title.text.encode("utf-8") + " " + item_price.text.encode("utf-8")
根据 OP 的要求,这还不够,因为有一个 "more" 按钮可以推入网页以检索所有结果。
这可以使用 Selenium Webdriver 来完成。
=== 重要说明 ===
为了完成这项工作,您还需要在脚本文件夹中复制 "chromedriver" 文件。
您可以从 this Google website 下载它。
这是脚本:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
browser = webdriver.Chrome()
browser.get('https://www.waug.com/area/?idx=15')
for number in range(10):
try:
WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.ID, "more_good")))
more_button = browser.find_element_by_id('more_good')
more_button.click()
time.sleep(10)
except:
print "Scrolling is now complete!"
source = browser.page_source
# This source variable should be used as input for BeautifulSoup
print source
现在需要合并两个已解释的灵魂以获得最终请求的结果。
请记住,这只是一个快速的'n'dirty hack,需要适当的错误处理和完善,但它应该足以让你开始:
#!/usr/bin/env python3
#_*_coding:utf8_*_
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
browser = webdriver.Chrome()
browser.get('https://www.waug.com/area/?idx=15')
def is_page_load_complete():
close_button = browser.find_element_by_id('close_good');
return close_button.is_displayed();
while(True):
WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.ID, "more_good")))
time.sleep(10)
more_button = browser.find_element_by_id('more_good')
if (more_button.is_displayed()):
more_button.click()
else:
if (is_page_load_complete()):
break
source = browser.page_source
soup = BeautifulSoup(source, 'html.parser')
items = soup.findAll("div", {"class": "item-good"})
for item in items:
item_title = item.find("div", {"class": "good-title-text"})
item_price = item.find("div", {"class": "price-selling"})
print item_title.text + " " + item_price.text
# If you get encoding errors comment the row above and uncomment the one below
#print item_title.text.encode("utf-8") + " " + item_price.text.encode("utf-8")
print "Total items found: " + str(len(items))