Python PhantomJS not loading webpage correctly
I'm having trouble extracting from this link:
http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all
It gives me the main page itself rather than the data behind this link.
Any idea why this is happening?
I'm using PhantomJS, Selenium, and Beautiful Soup.
# The standard library modules
import os
import sys
import re
import sqlite3
import locale
# The wget module
import wget
import time
import calendar
from datetime import datetime
# The BeautifulSoup module
from bs4 import BeautifulSoup
# The selenium module
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url)  # load the web page
    src = driver.page_source
    # Parse the page source
    soup = BeautifulSoup(src, 'html5lib')
    print(soup)

link = 'http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=FA&sub_category=FA1&alphabetical=All&company=5250'
getURLS(link)
Solution from Alex Lucaci:
from selenium.webdriver.support.ui import Select  # needed for the dropdowns (see below)

def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url)  # load the web page
    time.sleep(5)  # wait for the page to load before touching the form
    category_select = Select(driver.find_element_by_xpath('//*[@id="bm_announcement_types"]'))
    category_select.select_by_visible_text("Financial Results")
    category_select2 = Select(driver.find_element_by_xpath('//*[@id="bm_sub_announcement_types"]'))
    category_select2.select_by_visible_text("Financial Results")
    category_select3 = Select(driver.find_element_by_xpath('//*[@id="bm_company_list"]'))
    category_select3.select_by_visible_text("7-ELEVEN MALAYSIA HOLDINGS BERHAD (5250)")
    driver.find_element_by_xpath('//*[@id="bm_company_announcements_search_form"]/input[1]').click()
    time.sleep(5)  # wait for the submitted results to load before fetching the source
    src = driver.page_source
    soup = BeautifulSoup(src, 'html5lib')

link = "http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all"
getURLS(link)
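Note that getURLS builds the soup but never extracts anything from it. A minimal, hypothetical extraction helper could look like the following sketch; the bare <a> lookup is my assumption, and the real announcement rows may need a more specific tag or class selector, so inspect the loaded page first:

from bs4 import BeautifulSoup

def extract_links(src):
    # Collect every hyperlink from the fetched page source.
    soup = BeautifulSoup(src, 'html5lib')
    return [a['href'] for a in soup.find_all('a', href=True)]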
When you grab the source code, the page hasn't fully loaded the content from your submitted post yet, so try waiting a couple of seconds before fetching the page source:
def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url)  # load the web page
    time.sleep(5)  # wait for 5 seconds before fetching the source
    src = driver.page_source
    # Parse the page source
    soup = BeautifulSoup(src, 'html5lib')
    print(soup)
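A fixed sleep either wastes time or comes up short on a slow load. Since the question's imports already include WebDriverWait, expected_conditions, and By, an explicit wait is a more robust option. A minimal sketch, assuming the bm_announcement_types element used elsewhere in this answer is a reasonable signal that the page has rendered:

def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url)  # load the web page
    # Wait up to 10 seconds for the element to appear instead of sleeping a
    # fixed amount; raises a TimeoutException if it never shows up.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'bm_announcement_types'))
    )
    src = driver.page_source
    soup = BeautifulSoup(src, 'html5lib')
    print(soup)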
To perform the dropdown select you have to import the Select class as follows:
from selenium.webdriver.support.ui import Select
Then you have to select the dropdown element like this:
category_select = Select(driver.find_element_by_xpath('//*[@id="bm_announcement_types"]'))
category_select.select_by_visible_text('Financial Results')
In my example I've done it for the -Category- dropdown; follow the exact same steps for each category.
Note that selecting the dropdown by XPath is the best approach. You can get it with Google Chrome -> right-click the element -> Inspect -> right-click the <select> in the panel that appears on the right -> Copy -> Copy XPath.
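Incidentally, the copied XPaths here all have the form //*[@id="..."], i.e. plain id lookups, so selecting by id is equivalent and a bit more readable:

# //*[@id="bm_announcement_types"] is just an id lookup, so this is equivalent:
category_select = Select(driver.find_element_by_id('bm_announcement_types'))
category_select.select_by_visible_text('Financial Results')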
Once you have selected all the elements you have to click submit, wait a couple of seconds for the page to load, and then fetch the source code.
Let me know if my answer helped you.