How can I make the PhantomJS webdriver wait until a specific HTML element is loaded and then return page.source?
I have developed the following code for a web-crawling object.
It takes two dates as inputs, then creates a list of the dates between those two dates and attaches each one to the URL of a web page that contains weather information for a location. It then converts the HTML data tables into a DataFrame and saves the data to storage as a CSV file (the base link is https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/2019-1-3 and, as you can see in this example, the date is 2019-1-3):
from datetime import timedelta, date
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
from furl import furl
import os
import time
class WebCrawler():
    def __init__(self, st_date, end_date):
        if not os.path.exists('Data'):
            os.makedirs('Data')
        self.path = os.path.join(os.getcwd(), 'Data')
        self.driver = webdriver.PhantomJS()
        self.base_url = 'https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/'
        self.st_date = st_date
        self.end_date = end_date

    def date_list(self):
        # Create list of dates between two dates given as inputs.
        dates = []
        total_days = int((self.end_date - self.st_date).days + 1)
        for i in range(total_days):
            date = self.st_date + timedelta(days=i)
            dates.append(date.strftime('%Y-%m-%d'))
        return dates

    def create_link(self, attachment):
        # Attach dates to base link
        f = furl(self.base_url)
        f.path /= attachment
        f.path.normalize()
        return f.url

    def open_link(self, link):
        # Opens link and visits page and returns html source code of page
        self.driver.get(link)
        html = self.driver.page_source
        return html

    def table_to_df(self, html):
        # Finds table of weather data and converts it into pandas dataframe and returns it
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find("table", {"class": "tablesaw-sortable"})
        dfs = pd.read_html(str(table))
        df = dfs[0]
        return df

    def to_csv(self, name, df):
        # Save the dataframe as csv file in the defined path
        filename = name + '.csv'
        df.to_csv(os.path.join(self.path, filename), index=False)
This is how I want to use the WebCrawler object:
date1 = date(2018, 12, 29)
date2 = date(2019, 1, 1)
# Initialize WebCrawler object
crawler = WebCrawler(st_date=date1, end_date=date2)
dates = crawler.date_list()
for day in dates:
    print('**************************')
    print('PROCESSING : ', day)
    link = crawler.create_link(day)
    print('WAITING... ')
    time.sleep(3)
    print('VISIT WEBPAGE ... ')
    html = crawler.open_link(link)
    print('DATA RETRIEVED ... ')
    df = crawler.table_to_df(html)
    print(df.head(3))
    crawler.to_csv(day, df)
    print('DATA SAVED ...')
The problem is that the first iteration of the loop runs perfectly, but the second one stops with the error No tables were found (it happens at the line table = soup.find("table",{"class":"tablesaw-sortable"})). That is because the page source is returned by WebCrawler.open_link before the web page has fully loaded its content, including the table (which holds the weather information). There is also a chance that the website rejects the request because it makes the server too busy.
Can we build a loop that keeps trying to open the link until it finds the table, or at least waits until the table has loaded and then returns it? Something like the sketch below is roughly what I have in mind.
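(This is only a sketch of the idea; the open_link_with_retry helper, max_tries=5 and the 5-second pause are arbitrary placeholders, not tested code.)

def open_link_with_retry(crawler, link, max_tries=5):
    # Keep re-opening the page until the weather table actually shows up in the source.
    for attempt in range(max_tries):
        html = crawler.open_link(link)
        soup = BeautifulSoup(html, 'lxml')
        if soup.find("table", {"class": "tablesaw-sortable"}) is not None:
            return html
        time.sleep(5)  # give the page (and the server) a moment before retrying
    raise RuntimeError('Table not found after {} tries'.format(max_tries))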
You can make selenium wait for a specific element. In your case it would be the table with the class name "tablesaw-sortable". I highly recommend that you use a CSS selector to find this element, since it is faster and less error-prone than grabbing every table element.
Here is the CSS selector, premade for you: table.tablesaw-sortable. Set selenium up to wait for that element to load.
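For example, a minimal sketch of that wait (the 10-second timeout and the example date URL are only illustrative):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.PhantomJS()
driver.get('https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/2019-1-3')

# Wait up to 10 seconds for the weather table to appear in the DOM,
# then read the fully rendered page source.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'table.tablesaw-sortable'))
)
html = driver.page_source

If the element never appears, WebDriverWait raises a TimeoutException, so you can wrap the call in a try/except and retry or skip that date.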
I rewrote the code using the solution suggested by @mildmelon, and I also added some delay between sending each request to the server and asking for the page source:
from datetime import timedelta, date
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import pandas as pd
from furl import furl
import os
import time
class WebCrawler():
    def __init__(self, st_date, end_date):
        if not os.path.exists('Data'):
            os.makedirs('Data')
        self.path = os.path.join(os.getcwd(), 'Data')
        self.driver = webdriver.PhantomJS()
        self.delay_for_page = 7
        self.base_url = 'https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/'
        self.st_date = st_date
        self.end_date = end_date

    def date_list(self):
        # Create list of dates between two dates given as inputs.
        dates = []
        total_days = int((self.end_date - self.st_date).days + 1)
        for i in range(total_days):
            date = self.st_date + timedelta(days=i)
            dates.append(date.strftime('%Y-%m-%d'))
        return dates

    def create_link(self, attachment):
        # Attach dates to base link
        f = furl(self.base_url)
        f.path /= attachment
        f.path.normalize()
        return f.url

    def open_link(self, link):
        # Opens the link and waits until the weather table is present on the page
        self.driver.get(link)
        myElem = WebDriverWait(self.driver, self.delay_for_page)\
            .until(EC.presence_of_element_located((By.CLASS_NAME, 'tablesaw-sortable')))

    def table_to_df(self, html):
        # Finds table of weather data and converts it into pandas dataframe and returns it
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find("table", {"class": "tablesaw-sortable"})
        dfs = pd.read_html(str(table))
        df = dfs[0]
        return df

    def to_csv(self, name, df):
        # Save the dataframe as csv file in the defined path
        filename = name + '.csv'
        df.to_csv(os.path.join(self.path, filename), index=False)
date1 = date(2019, 2, 1)
date2 = date(2019, 3, 5)
# Initialize WebCrawler object
crawler = WebCrawler(st_date=date1, end_date=date2)
dates = crawler.date_list()
for day in dates:
    print('**************************')
    print('DATE : ', day)
    link = crawler.create_link(day)
    print('WAITING ....')
    print('')
    time.sleep(12)
    print('OPENING LINK ... ')

    try:
        crawler.open_link(link)
        html = crawler.driver.page_source
        print("DATA IS FETCHED")
        df = crawler.table_to_df(html)
        print(df.head(3))
        crawler.to_csv(day, df)
        print('DATA SAVED ...')
    except TimeoutException:
        print("NOT FETCHED ...!!!")
Fetching the weather information worked without any problem. I think the delay between requests leads to better performance, and the line WebDriverWait(self.driver, self.delay_for_page).until(EC.presence_of_element_located((By.CLASS_NAME, 'tablesaw-sortable'))) also sped things up.