Python Selenium session deleted because of page crash from unknown error: cannot determine loading status from tab crashed
Python Selenium session deleted because of page crash from unknown error: cannot determine loading status from tab crashed
问题可能是内存使用问题。页面开始变得非常慢,有时会出现以下错误消息
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    TimeoutException,
)
from selenium.webdriver import ActionChains

# Set some Selenium options. --no-sandbox / --disable-dev-shm-usage reduce
# Chrome's shared-memory pressure in containers, which is the usual cause of
# "session deleted because of page crash ... tab crashed".
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Webdriver
wd = webdriver.Chrome(executable_path='/usr/bin/chromedriver', options=options)

# URL of the supplier search, pre-filtered for laser cutting
url = 'https://www.techpilot.de/zulieferer-suchen?laserschneiden'

# Load URL
wd.get(url)

# Snapshot of the initial HTML (taken before the dynamic results load)
soup = BeautifulSoup(wd.page_source, 'html.parser')

wd.fullscreen_window()
wait = WebDriverWait(wd, 15)

# Accept the cookie banner on the top-level page ...
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#bodyJSP #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
# ... then switch into the iframe that holds the search results ...
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "#efficientSearchIframe")))
# ... and accept the second cookie banner rendered inside the iframe.
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".hideFunctionalScrollbar #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
#wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".fancyCompLabel")))

# Switch the result list to raster (grid) view
roaster = wd.find_element_by_xpath('//*[@id="resultTypeRaster"]')
ActionChains(wd).click(roaster).perform()

# Use keys to scroll down to where the "load more" button is
html = wd.find_element_by_tag_name('html')
c = 2
for i in range(100):
    html.send_keys(Keys.END)
    time.sleep(1)
    html.send_keys(Keys.END)
    time.sleep(1)
    html.send_keys(Keys.ARROW_UP)
    try:
        wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@id='resultPane']/div[" + str(c) + "]/span")))
        loadButton = wd.find_element_by_xpath("//*[@id='resultPane']/div[" + str(c) + "]/span")
        loadButton.click()
    # BUG FIX: `except A or B:` evaluates `A or B` to just A, so
    # ElementClickInterceptedException was never caught. A tuple catches both.
    except (TimeoutException, ElementClickInterceptedException):
        break
    time.sleep(1)
    c += 1

# BUG FIX: `wd.close` without parentheses is a no-op attribute access, so the
# browser was never shut down (and leaked memory across runs). quit() also
# terminates the chromedriver process, not just the current window.
wd.quit()
这里是一些我浏览过的有类似问题的链接
我尝试添加选项,但它不起作用。其他一些技巧真的让我很困惑,所以我希望有人能在这里帮助我(我对编码还很陌生)
这是我浏览过的链接
selenium.WebDriverException: unknown error: session deleted because of page crash from tab crashed
只是为了阐明:该程序的目标是获取所有供应商简介页面的链接列表并从中抓取内容。这就是为什么程序的这一部分要先加载整个页面来收集所有这些链接(据我所知,由于页面内容是由 JavaScript 动态生成的,我无法只用 BeautifulSoup 直接获取它们),所以我没有别的解决方法。
非常感谢!
就像我在评论中提到的那样。这对初学者来说不是一件容易的事。不过这段代码应该可以让您入门。
这里最大的问题是,结果是通过 iframe 加载的,因此您需要先获取它。
看看这段代码,它会获取简介页面的基本信息,并将其作为 JSON(字典列表)返回。如果您需要对此进行更多解释,请随时在评论中提问。
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
def get_profile_info(profile_url):
    """Open a profile page and return its address details as a dict.

    Adjust the field table below to scrape more info.
    """
    wd.get(profile_url)
    # Wait until the address block of the profile page is present.
    address = WebDriverWait(wd, 5).until(
        EC.presence_of_element_located((By.ID, "labelAddress"))
    )
    # The company name is the <h1> inside the address block.
    info = {"label": address.find_element_by_tag_name("h1").text}
    # The remaining fields are all schema.org itemprop spans; extract them
    # table-driven so adding a field is a one-line change.
    field_table = (
        ("street", "streetAddress"),
        ("postal_code", "postalCode"),
        ("city", "addressLocality"),
        ("address_region", "addressRegion"),
        ("country", "addressCountry"),
    )
    for key, itemprop in field_table:
        info[key] = address.find_element_by_css_selector(
            "span[itemprop='" + itemprop + "']"
        ).text
    return info
def get_profile_url(label_element):
    """Extract the profile URL from a result element's onclick handler."""
    onclick = label_element.get_attribute("onclick")
    # The handler looks like window.open('<url>', ...); the lookarounds pull
    # out the first single-quoted string after "open(".
    match = re.search(r"(?<=open\(\')(.*?)(?=\')", onclick)
    return match.group()
def load_more_results():
    """Click the 'load next' button to fetch more results.

    Use only while the search page (not a profile page) is loaded.
    """
    wd.find_element_by_class_name("loadNextBtn").find_element_by_tag_name("span").click()
#### Script starts here ####

# Set some Selenium options (reduce Chrome's shared-memory usage, which
# otherwise causes "tab crashed" in containers)
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Webdriver
wd = webdriver.Chrome(options=options)

# Load the supplier search page, pre-filtered for laser cutting
wd.get("https://www.techpilot.de/zulieferer-suchen?laserschneiden")

# The results are rendered inside an iframe; wait for it and switch into it.
iframe = WebDriverWait(wd, 5).until(
    EC.frame_to_be_available_and_switch_to_it("efficientSearchIframe")
)

# Wait for the result pane (the parent of all result entries).
result_pane = WebDriverWait(wd, 5).until(
    EC.presence_of_element_located((By.ID, "resultPane"))
)
result_elements = wd.find_elements_by_class_name("fancyCompLabel")

# First collect all the profile links currently visible ...
href_list = [get_profile_url(element) for element in result_elements]

# ... then visit each profile and scrape its data.
result = [get_profile_info(href) for href in href_list]

# BUG FIX: `wd.close` without parentheses is a no-op attribute access, so the
# browser kept running. quit() also shuts down the chromedriver process.
wd.quit()

# Let's see what we've got
print(result)
解决方案是从 dom 树中删除元素
就像 @pcalkins 在上面所说的,否则 DOM 树似乎会“过载”。
问题可能是内存使用问题。页面开始变得非常慢,有时会出现以下错误消息
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    TimeoutException,
)
from selenium.webdriver import ActionChains

# Set some Selenium options. --no-sandbox / --disable-dev-shm-usage reduce
# Chrome's shared-memory pressure in containers, which is the usual cause of
# "session deleted because of page crash ... tab crashed".
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Webdriver
wd = webdriver.Chrome(executable_path='/usr/bin/chromedriver', options=options)

# URL of the supplier search, pre-filtered for laser cutting
url = 'https://www.techpilot.de/zulieferer-suchen?laserschneiden'

# Load URL
wd.get(url)

# Snapshot of the initial HTML (taken before the dynamic results load)
soup = BeautifulSoup(wd.page_source, 'html.parser')

wd.fullscreen_window()
wait = WebDriverWait(wd, 15)

# Accept the cookie banner on the top-level page ...
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#bodyJSP #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
# ... then switch into the iframe that holds the search results ...
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "#efficientSearchIframe")))
# ... and accept the second cookie banner rendered inside the iframe.
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".hideFunctionalScrollbar #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
#wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".fancyCompLabel")))

# Switch the result list to raster (grid) view
roaster = wd.find_element_by_xpath('//*[@id="resultTypeRaster"]')
ActionChains(wd).click(roaster).perform()

# Use keys to scroll down to where the "load more" button is
html = wd.find_element_by_tag_name('html')
c = 2
for i in range(100):
    html.send_keys(Keys.END)
    time.sleep(1)
    html.send_keys(Keys.END)
    time.sleep(1)
    html.send_keys(Keys.ARROW_UP)
    try:
        wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@id='resultPane']/div[" + str(c) + "]/span")))
        loadButton = wd.find_element_by_xpath("//*[@id='resultPane']/div[" + str(c) + "]/span")
        loadButton.click()
    # BUG FIX: `except A or B:` evaluates `A or B` to just A, so
    # ElementClickInterceptedException was never caught. A tuple catches both.
    except (TimeoutException, ElementClickInterceptedException):
        break
    time.sleep(1)
    c += 1

# BUG FIX: `wd.close` without parentheses is a no-op attribute access, so the
# browser was never shut down (and leaked memory across runs). quit() also
# terminates the chromedriver process, not just the current window.
wd.quit()
这里是一些我浏览过的有类似问题的链接 我尝试添加选项,但它不起作用。其他一些技巧真的让我很困惑,所以我希望有人能在这里帮助我(我对编码还很陌生)
这是我浏览过的链接
selenium.WebDriverException: unknown error: session deleted because of page crash from tab crashed
只是为了阐明:该程序的目标是获取所有供应商简介页面的链接列表并从中抓取内容。这就是为什么程序的这一部分要先加载整个页面来收集所有这些链接(据我所知,由于页面内容是由 JavaScript 动态生成的,我无法只用 BeautifulSoup 直接获取它们),所以我没有别的解决方法。非常感谢!
就像我在评论中提到的那样。这对初学者来说不是一件容易的事。不过这段代码应该可以让您入门。
这里最大的问题是,结果是通过 iframe 加载的,因此您需要先获取它。
看看这段代码,它会获取简介页面的基本信息,并将其作为 JSON(字典列表)返回。如果您需要对此进行更多解释,请随时在评论中提问。
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
def get_profile_info(profile_url):
    """Open a profile page and return its address details as a dict.

    Adjust the field table below to scrape more info.
    """
    wd.get(profile_url)
    # Wait until the address block of the profile page is present.
    address = WebDriverWait(wd, 5).until(
        EC.presence_of_element_located((By.ID, "labelAddress"))
    )
    # The company name is the <h1> inside the address block.
    info = {"label": address.find_element_by_tag_name("h1").text}
    # The remaining fields are all schema.org itemprop spans; extract them
    # table-driven so adding a field is a one-line change.
    field_table = (
        ("street", "streetAddress"),
        ("postal_code", "postalCode"),
        ("city", "addressLocality"),
        ("address_region", "addressRegion"),
        ("country", "addressCountry"),
    )
    for key, itemprop in field_table:
        info[key] = address.find_element_by_css_selector(
            "span[itemprop='" + itemprop + "']"
        ).text
    return info
def get_profile_url(label_element):
    """Extract the profile URL from a result element's onclick handler."""
    onclick = label_element.get_attribute("onclick")
    # The handler looks like window.open('<url>', ...); the lookarounds pull
    # out the first single-quoted string after "open(".
    match = re.search(r"(?<=open\(\')(.*?)(?=\')", onclick)
    return match.group()
def load_more_results():
    """Click the 'load next' button to fetch more results.

    Use only while the search page (not a profile page) is loaded.
    """
    wd.find_element_by_class_name("loadNextBtn").find_element_by_tag_name("span").click()
#### Script starts here ####

# Set some Selenium options (reduce Chrome's shared-memory usage, which
# otherwise causes "tab crashed" in containers)
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Webdriver
wd = webdriver.Chrome(options=options)

# Load the supplier search page, pre-filtered for laser cutting
wd.get("https://www.techpilot.de/zulieferer-suchen?laserschneiden")

# The results are rendered inside an iframe; wait for it and switch into it.
iframe = WebDriverWait(wd, 5).until(
    EC.frame_to_be_available_and_switch_to_it("efficientSearchIframe")
)

# Wait for the result pane (the parent of all result entries).
result_pane = WebDriverWait(wd, 5).until(
    EC.presence_of_element_located((By.ID, "resultPane"))
)
result_elements = wd.find_elements_by_class_name("fancyCompLabel")

# First collect all the profile links currently visible ...
href_list = [get_profile_url(element) for element in result_elements]

# ... then visit each profile and scrape its data.
result = [get_profile_info(href) for href in href_list]

# BUG FIX: `wd.close` without parentheses is a no-op attribute access, so the
# browser kept running. quit() also shuts down the chromedriver process.
wd.quit()

# Let's see what we've got
print(result)
解决方案是从 DOM 树中删除不再需要的元素——就像 @pcalkins 在上面所说的,否则 DOM 树似乎会“过载”。