我怎样才能从网站的 Javascript 内容中抓取数据?
How can I, scrape data from a Javascript Content of a website?
实际上,我正在尝试从 Nykaa 网站获取产品描述中的内容。
URL:- https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502
这是 URL,在产品描述部分,点击 'Read More' 按钮,最后有一些文字.
我想提取的文本是:
Explore the entire range of Foundation available on Nykaa. Shop more
Nykaa Cosmetics products here.You can browse through the complete
world of Nykaa Cosmetics Foundation . Alternatively, you can also find
many more products from the Nykaa SkinShield Anti-Pollution Matte
Foundation range.
Expiry Date: 15 February 2024
Country of Origin: India
Name of Mfg / Importer / Brand: FSN E-commerce Ventures Pvt Ltd
Address of Mfg / Importer / Brand: 104 Vasan Udyog Bhavan Sun Mill
Compound Senapati Bapat Marg, Lower Parel, Mumbai City Maharashtra -
400013
检查页面后,当我 'disable the javascript' 来自 'product description' 的所有内容都消失了。这意味着内容正在 Javascript.
的帮助下动态加载
我为此使用了 'selenium'。这就是我尝试过的。
from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
browser = webdriver.Chrome(
r'C:\Users\paart\.wdm\drivers\chromedriver\win32.0.4692.71\chromedriver.exe')
browser.maximize_window() # For maximizing window
browser.implicitly_wait(20) # gives an implicit wait for 20 seconds
browser.get(
"https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")
# Creates "load more" button object.
browser.implicitly_wait(20)
loadMore = browser.find_element_by_xpath(xpath="/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
loadMore.click()
browser.implicitly_wait(20)
desc_data = browser.find_elements_by_class_name('content-details')
for desc in desc_data:
para_details = browser.find_element_by_xpath(
'.//*[@id="content-details"]/p[1]').text
extra_details = browser.find_elements_by_xpath(
'.//*[@id="content-details"]/p[2]', './/*[@id="content-details"]/p[3]', './/*[@id="content-details"]/p[4]', './/*[@id="content-details"]/p[5]').text
print(para_details, extra_details)
这是正在显示的输出。
PS E:\Web Scraping - Nykaa> python -u "e:\Web Scraping - Nykaa\scrape_nykaa_final.py"
e:\Web Scraping - Nykaa\scrape_nykaa_final.py:16: DeprecationWarning: executable_path has been deprecated, please pass in a Service object
browser = webdriver.Chrome(
DevTools listening on ws://127.0.0.1:1033/devtools/browser/097c0e11-6f2c-4742-a2b5-cd05bee72661
e:\Web Scraping - Nykaa\scrape_nykaa_final.py:28: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead
loadMore = browser.find_element_by_xpath(
[9312:4972:0206/110327.883:ERROR:ssl_client_socket_impl.cc(996)] handshake failed; returned -1, SSL error code 1, net_error -101
[9312:4972:0206/110328.019:ERROR:ssl_client_socket_impl.cc(996)] handshake failed; returned -1, SSL error code 1, net_error -101
Traceback (most recent call last):
File "e:\Web Scraping - Nykaa\scrape_nykaa_final.py", line 28, in <module>
loadMore = browser.find_element_by_xpath(
File "C:\Python310\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 520, in find_element_by_xpath
return self.find_element(by=By.XPATH, value=xpath)
File "C:\Python310\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 1244, in find_element
return self.execute(Command.FIND_ELEMENT, {
File "C:\Python310\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 424, in execute
self.error_handler.check_response(response)
File "C:\Python310\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 247, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]"}
(Session info: chrome=97.0.4692.99)
Stacktrace:
Backtrace:
Ordinal0 [0x00FDFDC3+2555331]
Ordinal0 [0x00F777F1+2127857]
Ordinal0 [0x00E72E08+1060360]
Ordinal0 [0x00E9E49E+1238174]
Ordinal0 [0x00E9E69B+1238683]
Ordinal0 [0x00EC9252+1413714]
Ordinal0 [0x00EB7B54+1342292]
Ordinal0 [0x00EC75FA+1406458]
Ordinal0 [0x00EB7976+1341814]
Ordinal0 [0x00E936B6+1193654]
Ordinal0 [0x00E94546+1197382]
GetHandleVerifier [0x01179622+1619522]
GetHandleVerifier [0x0122882C+2336844]
GetHandleVerifier [0x010723E1+541697]
GetHandleVerifier [0x01071443+537699]
Ordinal0 [0x00F7D18E+2150798]
Ordinal0 [0x00F81518+2168088]
Ordinal0 [0x00F81660+2168416]
Ordinal0 [0x00F8B330+2208560]
BaseThreadInitThunk [0x76C9FA29+25]
RtlGetAppContainerNamedObjectPath [0x77337A9E+286]
RtlGetAppContainerNamedObjectPath [0x77337A6E+238]
任何人都可以帮助我解决这个问题,或者任何其他要编写的特定代码,我缺少从产品描述中获取文本内容的代码。这将是一个很大的帮助。
谢谢。
你可以这样做
from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
browser = webdriver.Chrome(
r'C:\Users\paart\.wdm\drivers\chromedriver\win32.0.4692.71\chromedriver.exe')
browser.maximize_window() # For maximizing window
browser.implicitly_wait(20) # gives an implicit wait for 20 seconds
browser.get(
"https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")
# Creates "load more" button object.
browser.implicitly_wait(20)
loadMore = browser.find_element_by_xpath("/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
loadMore.click()
browser.implicitly_wait(20)
desc_data = browser.find_elements_by_id('content-details')
for desc in desc_data:
para_details = browser.find_element_by_xpath('//*[@id="content-details"]/p[1]').text
expiry = browser.find_element_by_xpath('//*[@id="content-details"]/p[2]').text
country = browser.find_element_by_xpath('//*[@id="content-details"]/p[3]').text
importer = browser.find_element_by_xpath('//*[@id="content-details"]/p[4]').text
address = browser.find_element_by_xpath('//*[@id="content-details"]/p[5]').text
print(para_details, country, importer, address)
对于 desc_data,您正在寻找具有该字符串的 class 名称,当页面上没有时,它实际上是具有该字符串的 id 标签。
在 for 循环中,您在 find_elements_by_xpath() 中插入了一堆 xpath,它只需要一个元素的 xpath。
尝试
from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
browser = webdriver.Chrome(
r'C:\Users\paart\.wdm\drivers\chromedriver\win32.0.4692.71\chromedriver.exe')
browser.maximize_window() # For maximizing window
browser.implicitly_wait(20) # gives an implicit wait for 20 seconds
browser.get(
"https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")
browser.execute_script("document.body.style.zoom='50%'")
time.sleep(1)
browser.execute_script("document.body.style.zoom='100%'")
# Creates "load more" button object.
browser.implicitly_wait(20)
loadMore = browser.find_element_by_xpath(xpath='//div [@class="css-mqbsar"]')
loadMore.click()
browser.implicitly_wait(20)
desc_data = browser.find_elements_by_xpath('//div[@id="content-details"]/p')
# desc_data = browser.find_elements_by_class_name('content-details')
# here in your previous code this class('content-details') which is a single element so it is not iterable
# I used xpath to locate every every element <p> under the (id="content-details) attrid=bute
for desc in desc_data:
para_detail = desc.text
print(para_detail)
# if you you want to specify try this
# para_detail = desc_data[0].text
# expiry_ date = desc_data[1].text
并且不要只是从 chrome 开发工具复制 XPath,它对动态内容不可靠。
您收到此错误是因为在您执行点击功能时该元素未正确加载。
我使用这两个函数来定位元素:
def find_until_located(eltype,name):
element = WebDriverWait(driver, 60).until(
EC.presence_of_element_located((eltype, name)))
return element
def find_until_clicklable(eltype,name):
element=WebDriverWait(driver, 60).until(EC.element_to_be_clickable((eltype, name)))
return element
第一个参数将是其中之一:By.ID, By.XPATH, By.LINK_TEXT, By.PARTIAL_LINK_TEXT, By.NAME, By.TAG_NAME, By.CLASS_NAME, By.CSS_SELECTOR
,第二个参数将是您的 class 名称或 xpath 或 id 等。
所以现在,您的代码将是:
from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
browser = webdriver.Chrome(
r'C:\Users\paart\.wdm\drivers\chromedriver\win32.0.4692.71\chromedriver.exe')
def find_until_located(eltype,name):
# eltype will be one of them By.ID, By.XPATH, By.LINK_TEXT, By.PARTIAL_LINK_TEXT, By.NAME, By.TAG_NAME, By.CLASS_NAME, By.CSS_SELECTOR
element = WebDriverWait(browser, 60).until(
EC.presence_of_element_located((eltype, name)))
return element
def find_until_clicklable(eltype,name):
element=WebDriverWait(browser, 60).until(EC.element_to_be_clickable((eltype, name)))
return element
browser.maximize_window() # For maximizing window
browser.implicitly_wait(20) # gives an implicit wait for 20 seconds
browser.get(
"https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")
# Creates "load more" button object.
browser.implicitly_wait(20)
loadMore = find_until_clicklable(By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
loadMore.click()
browser.implicitly_wait(20)
desc_data = browser.find_until_located(By.CLASS_NAME,'content-details')
for desc in desc_data:
para_details = browser.find_until_located(By.XPATH,
'.//*[@id="content-details"]/p[1]').text
extra_details = browser.find_until_located(By.XPATH,
'.//*[@id="content-details"]/p[2]', './/*[@id="content-details"]/p[3]', './/*[@id="content-details"]/p[4]', './/*[@id="content-details"]/p[5]').text
print(para_details, extra_details)
编辑:
我意识到了问题,然后更新了代码
这是最终代码:
from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
browser = webdriver.Chrome(
r'C:\Users\paart\.wdm\drivers\chromedriver\win32.0.4692.71\chromedriver.exe')
def find_until_located(eltype,name):
# eltype will be one of them By.ID, By.XPATH, By.LINK_TEXT, By.PARTIAL_LINK_TEXT, By.NAME, By.TAG_NAME, By.CLASS_NAME, By.CSS_SELECTOR
element = WebDriverWait(browser, 60).until(EC.presence_of_element_located((eltype, name)))
return element
def find_until_clicklable(eltype,name):
element=WebDriverWait(browser, 60).until(EC.element_to_be_clickable((eltype, name)))
return element
def scroll_to_element(element):
browser.execute_script("arguments[0].scrollIntoView();", element)
return
browser.maximize_window() # For maximizing window
browser.implicitly_wait(20) # gives an implicit wait for 20 seconds
browser.get(
"https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")
# Creates "load more" button object.
browser.implicitly_wait(20)
bag_btn=find_until_located(By.CLASS_NAME, 'css-17hv1os')
scroll_to_element(bag_btn)
desc_label=find_until_located(By.CLASS_NAME, 'css-1g43l8l')
scroll_to_element(desc_label)
# Waiting until loads
browser.implicitly_wait(20)
loadMore = find_until_clicklable(By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
loadMore.click()
browser.implicitly_wait(20)
desc_data = find_until_located(By.ID,'content-details')
for desc in desc_data:
para_details = find_until_located(By.XPATH,
'.//*[@id="content-details"]/p[1]').text
extra_details = find_until_located(By.XPATH,
'.//*[@id="content-details"]/p[2]', './/*[@id="content-details"]/p[3]', './/*[@id="content-details"]/p[4]', './/*[@id="content-details"]/p[5]').text
print(para_details, extra_details)
这个问题的最终答案。
from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
browser = webdriver.Chrome(
r'C:\Users\paart\.wdm\drivers\chromedriver\win32.0.4692.71\chromedriver.exe')
browser.maximize_window() # For maximizing window
browser.implicitly_wait(20) # gives an implicit wait for 20 seconds
# browser.get(
# "https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")
browser.get(
"https://www.nykaa.com/kay-beauty-hydrating-foundation/p/1229442?productId=1229442&pps=3&skuId=772975")
browser.execute_script("document.body.style.zoom='50%'")
browser.execute_script("document.body.style.zoom='100%'")
# Creates "load more" button object.
browser.implicitly_wait(20)
loadMore = browser.find_element(By.XPATH,
"/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
loadMore.click()
browser.implicitly_wait(20)
desc_data = browser.find_elements(By.ID, 'content-details')
for desc in desc_data:
para_details = browser.find_element(By.XPATH,
'//*[@id="content-details"]/p[1]').text
expiry = browser.find_element(By.XPATH,
'//*[@id="content-details"]/p[2]').text
country = browser.find_element(By.XPATH,
'//*[@id="content-details"]/p[3]').text
importer = browser.find_element(By.XPATH,
'//*[@id="content-details"]/p[4]').text
address = browser.find_element(By.XPATH,
'//*[@id="content-details"]/p[5]').text
# print(para_details, country, importer, address)
print(f"{para_details} \n")
print(f"{expiry} \n")
print(f"{country} \n")
print(f"{importer} \n")
print(f"{address} \n")
实际上,我正在尝试从 Nykaa 网站获取产品描述中的内容。
URL:- https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502
这是 URL,在产品描述部分,点击 'Read More' 按钮,最后有一些文字.
我想提取的文本是:
Explore the entire range of Foundation available on Nykaa. Shop more Nykaa Cosmetics products here.You can browse through the complete world of Nykaa Cosmetics Foundation . Alternatively, you can also find many more products from the Nykaa SkinShield Anti-Pollution Matte Foundation range.
Expiry Date: 15 February 2024
Country of Origin: India
Name of Mfg / Importer / Brand: FSN E-commerce Ventures Pvt Ltd
Address of Mfg / Importer / Brand: 104 Vasan Udyog Bhavan Sun Mill Compound Senapati Bapat Marg, Lower Parel, Mumbai City Maharashtra - 400013
检查页面后,当我 'disable the javascript' 来自 'product description' 的所有内容都消失了。这意味着内容正在 Javascript.
的帮助下动态加载我为此使用了 'selenium'。这就是我尝试过的。
from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
browser = webdriver.Chrome(
r'C:\Users\paart\.wdm\drivers\chromedriver\win32.0.4692.71\chromedriver.exe')
browser.maximize_window() # For maximizing window
browser.implicitly_wait(20) # gives an implicit wait for 20 seconds
browser.get(
"https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")
# Creates "load more" button object.
browser.implicitly_wait(20)
loadMore = browser.find_element_by_xpath(xpath="/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
loadMore.click()
browser.implicitly_wait(20)
desc_data = browser.find_elements_by_class_name('content-details')
for desc in desc_data:
para_details = browser.find_element_by_xpath(
'.//*[@id="content-details"]/p[1]').text
extra_details = browser.find_elements_by_xpath(
'.//*[@id="content-details"]/p[2]', './/*[@id="content-details"]/p[3]', './/*[@id="content-details"]/p[4]', './/*[@id="content-details"]/p[5]').text
print(para_details, extra_details)
这是正在显示的输出。
PS E:\Web Scraping - Nykaa> python -u "e:\Web Scraping - Nykaa\scrape_nykaa_final.py"
e:\Web Scraping - Nykaa\scrape_nykaa_final.py:16: DeprecationWarning: executable_path has been deprecated, please pass in a Service object
browser = webdriver.Chrome(
DevTools listening on ws://127.0.0.1:1033/devtools/browser/097c0e11-6f2c-4742-a2b5-cd05bee72661
e:\Web Scraping - Nykaa\scrape_nykaa_final.py:28: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead
loadMore = browser.find_element_by_xpath(
[9312:4972:0206/110327.883:ERROR:ssl_client_socket_impl.cc(996)] handshake failed; returned -1, SSL error code 1, net_error -101
[9312:4972:0206/110328.019:ERROR:ssl_client_socket_impl.cc(996)] handshake failed; returned -1, SSL error code 1, net_error -101
Traceback (most recent call last):
File "e:\Web Scraping - Nykaa\scrape_nykaa_final.py", line 28, in <module>
loadMore = browser.find_element_by_xpath(
File "C:\Python310\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 520, in find_element_by_xpath
return self.find_element(by=By.XPATH, value=xpath)
File "C:\Python310\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 1244, in find_element
return self.execute(Command.FIND_ELEMENT, {
File "C:\Python310\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 424, in execute
self.error_handler.check_response(response)
File "C:\Python310\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 247, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]"}
(Session info: chrome=97.0.4692.99)
Stacktrace:
Backtrace:
Ordinal0 [0x00FDFDC3+2555331]
Ordinal0 [0x00F777F1+2127857]
Ordinal0 [0x00E72E08+1060360]
Ordinal0 [0x00E9E49E+1238174]
Ordinal0 [0x00E9E69B+1238683]
Ordinal0 [0x00EC9252+1413714]
Ordinal0 [0x00EB7B54+1342292]
Ordinal0 [0x00EC75FA+1406458]
Ordinal0 [0x00EB7976+1341814]
Ordinal0 [0x00E936B6+1193654]
Ordinal0 [0x00E94546+1197382]
GetHandleVerifier [0x01179622+1619522]
GetHandleVerifier [0x0122882C+2336844]
GetHandleVerifier [0x010723E1+541697]
GetHandleVerifier [0x01071443+537699]
Ordinal0 [0x00F7D18E+2150798]
Ordinal0 [0x00F81518+2168088]
Ordinal0 [0x00F81660+2168416]
Ordinal0 [0x00F8B330+2208560]
BaseThreadInitThunk [0x76C9FA29+25]
RtlGetAppContainerNamedObjectPath [0x77337A9E+286]
RtlGetAppContainerNamedObjectPath [0x77337A6E+238]
任何人都可以帮助我解决这个问题,或者任何其他要编写的特定代码,我缺少从产品描述中获取文本内容的代码。这将是一个很大的帮助。
谢谢。
你可以这样做
from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
browser = webdriver.Chrome(
r'C:\Users\paart\.wdm\drivers\chromedriver\win32.0.4692.71\chromedriver.exe')
browser.maximize_window() # For maximizing window
browser.implicitly_wait(20) # gives an implicit wait for 20 seconds
browser.get(
"https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")
# Creates "load more" button object.
browser.implicitly_wait(20)
loadMore = browser.find_element_by_xpath("/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
loadMore.click()
browser.implicitly_wait(20)
desc_data = browser.find_elements_by_id('content-details')
for desc in desc_data:
para_details = browser.find_element_by_xpath('//*[@id="content-details"]/p[1]').text
expiry = browser.find_element_by_xpath('//*[@id="content-details"]/p[2]').text
country = browser.find_element_by_xpath('//*[@id="content-details"]/p[3]').text
importer = browser.find_element_by_xpath('//*[@id="content-details"]/p[4]').text
address = browser.find_element_by_xpath('//*[@id="content-details"]/p[5]').text
print(para_details, country, importer, address)
对于 desc_data,您正在寻找具有该字符串的 class 名称,当页面上没有时,它实际上是具有该字符串的 id 标签。
在 for 循环中,您在 find_elements_by_xpath() 中插入了一堆 xpath,它只需要一个元素的 xpath。
尝试
from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
browser = webdriver.Chrome(
r'C:\Users\paart\.wdm\drivers\chromedriver\win32.0.4692.71\chromedriver.exe')
browser.maximize_window() # For maximizing window
browser.implicitly_wait(20) # gives an implicit wait for 20 seconds
browser.get(
"https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")
browser.execute_script("document.body.style.zoom='50%'")
time.sleep(1)
browser.execute_script("document.body.style.zoom='100%'")
# Creates "load more" button object.
browser.implicitly_wait(20)
loadMore = browser.find_element_by_xpath(xpath='//div [@class="css-mqbsar"]')
loadMore.click()
browser.implicitly_wait(20)
desc_data = browser.find_elements_by_xpath('//div[@id="content-details"]/p')
# desc_data = browser.find_elements_by_class_name('content-details')
# here in your previous code this class('content-details') which is a single element so it is not iterable
# I used xpath to locate every every element <p> under the (id="content-details) attrid=bute
for desc in desc_data:
para_detail = desc.text
print(para_detail)
# if you you want to specify try this
# para_detail = desc_data[0].text
# expiry_ date = desc_data[1].text
并且不要只是从 chrome 开发工具复制 XPath,它对动态内容不可靠。
您收到此错误是因为在您执行点击功能时该元素未正确加载。 我使用这两个函数来定位元素:
def find_until_located(eltype,name):
element = WebDriverWait(driver, 60).until(
EC.presence_of_element_located((eltype, name)))
return element
def find_until_clicklable(eltype,name):
element=WebDriverWait(driver, 60).until(EC.element_to_be_clickable((eltype, name)))
return element
第一个参数将是其中之一:By.ID, By.XPATH, By.LINK_TEXT, By.PARTIAL_LINK_TEXT, By.NAME, By.TAG_NAME, By.CLASS_NAME, By.CSS_SELECTOR
,第二个参数将是您的 class 名称或 xpath 或 id 等。
所以现在,您的代码将是:
from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
browser = webdriver.Chrome(
r'C:\Users\paart\.wdm\drivers\chromedriver\win32.0.4692.71\chromedriver.exe')
def find_until_located(eltype,name):
# eltype will be one of them By.ID, By.XPATH, By.LINK_TEXT, By.PARTIAL_LINK_TEXT, By.NAME, By.TAG_NAME, By.CLASS_NAME, By.CSS_SELECTOR
element = WebDriverWait(browser, 60).until(
EC.presence_of_element_located((eltype, name)))
return element
def find_until_clicklable(eltype,name):
element=WebDriverWait(browser, 60).until(EC.element_to_be_clickable((eltype, name)))
return element
browser.maximize_window() # For maximizing window
browser.implicitly_wait(20) # gives an implicit wait for 20 seconds
browser.get(
"https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")
# Creates "load more" button object.
browser.implicitly_wait(20)
loadMore = find_until_clicklable(By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
loadMore.click()
browser.implicitly_wait(20)
desc_data = browser.find_until_located(By.CLASS_NAME,'content-details')
for desc in desc_data:
para_details = browser.find_until_located(By.XPATH,
'.//*[@id="content-details"]/p[1]').text
extra_details = browser.find_until_located(By.XPATH,
'.//*[@id="content-details"]/p[2]', './/*[@id="content-details"]/p[3]', './/*[@id="content-details"]/p[4]', './/*[@id="content-details"]/p[5]').text
print(para_details, extra_details)
编辑:
我意识到了问题,然后更新了代码这是最终代码:
from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
browser = webdriver.Chrome(
r'C:\Users\paart\.wdm\drivers\chromedriver\win32.0.4692.71\chromedriver.exe')
def find_until_located(eltype,name):
# eltype will be one of them By.ID, By.XPATH, By.LINK_TEXT, By.PARTIAL_LINK_TEXT, By.NAME, By.TAG_NAME, By.CLASS_NAME, By.CSS_SELECTOR
element = WebDriverWait(browser, 60).until(EC.presence_of_element_located((eltype, name)))
return element
def find_until_clicklable(eltype,name):
element=WebDriverWait(browser, 60).until(EC.element_to_be_clickable((eltype, name)))
return element
def scroll_to_element(element):
browser.execute_script("arguments[0].scrollIntoView();", element)
return
browser.maximize_window() # For maximizing window
browser.implicitly_wait(20) # gives an implicit wait for 20 seconds
browser.get(
"https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")
# Creates "load more" button object.
browser.implicitly_wait(20)
bag_btn=find_until_located(By.CLASS_NAME, 'css-17hv1os')
scroll_to_element(bag_btn)
desc_label=find_until_located(By.CLASS_NAME, 'css-1g43l8l')
scroll_to_element(desc_label)
# Waiting until loads
browser.implicitly_wait(20)
loadMore = find_until_clicklable(By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
loadMore.click()
browser.implicitly_wait(20)
desc_data = find_until_located(By.ID,'content-details')
for desc in desc_data:
para_details = find_until_located(By.XPATH,
'.//*[@id="content-details"]/p[1]').text
extra_details = find_until_located(By.XPATH,
'.//*[@id="content-details"]/p[2]', './/*[@id="content-details"]/p[3]', './/*[@id="content-details"]/p[4]', './/*[@id="content-details"]/p[5]').text
print(para_details, extra_details)
这个问题的最终答案。
from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
browser = webdriver.Chrome(
r'C:\Users\paart\.wdm\drivers\chromedriver\win32.0.4692.71\chromedriver.exe')
browser.maximize_window() # For maximizing window
browser.implicitly_wait(20) # gives an implicit wait for 20 seconds
# browser.get(
# "https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")
browser.get(
"https://www.nykaa.com/kay-beauty-hydrating-foundation/p/1229442?productId=1229442&pps=3&skuId=772975")
browser.execute_script("document.body.style.zoom='50%'")
browser.execute_script("document.body.style.zoom='100%'")
# Creates "load more" button object.
browser.implicitly_wait(20)
loadMore = browser.find_element(By.XPATH,
"/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
loadMore.click()
browser.implicitly_wait(20)
desc_data = browser.find_elements(By.ID, 'content-details')
for desc in desc_data:
para_details = browser.find_element(By.XPATH,
'//*[@id="content-details"]/p[1]').text
expiry = browser.find_element(By.XPATH,
'//*[@id="content-details"]/p[2]').text
country = browser.find_element(By.XPATH,
'//*[@id="content-details"]/p[3]').text
importer = browser.find_element(By.XPATH,
'//*[@id="content-details"]/p[4]').text
address = browser.find_element(By.XPATH,
'//*[@id="content-details"]/p[5]').text
# print(para_details, country, importer, address)
print(f"{para_details} \n")
print(f"{expiry} \n")
print(f"{country} \n")
print(f"{importer} \n")
print(f"{address} \n")