Can anyone suggest how to scrape this https://www3.wipo.int/branddb/en/# website in a fast way?

I want to scrape the https://www3.wipo.int/branddb/en/# website in a faster way. I need to use filters. I wrote a Selenium script that scrapes this website and saves the data to Excel files, but scraping the data takes 55 hours. The code I used is:

import time, xlsxwriter, requests
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import bs4

# launch Chrome (Selenium 3-style driver path argument) and open the database
browser = webdriver.Chrome("chromedriver.exe")
browser.get('https://www3.wipo.int/branddb/en/#')
time.sleep(5)

# select the search filters (controls located with brittle absolute XPaths)
browser.find_element_by_xpath("//*[@id=\"ui-id-10\"]").click()
browser.find_element_by_xpath("/html/body/div[4]/div[2]/form/div[1]/div/div[2]/div/div[1]/div[5]/div[1]/div/div[6]/div/a[1]").click()
browser.find_element_by_xpath("/html/body/div[4]/div[2]/form/div[1]/div/div[2]/div/div[1]/div[5]/div[1]/div/div[6]/div/a[2]").click()

# hover over a dropdown menu and click the first entry in its submenu
element_to_hover_over = browser.find_element_by_xpath("/html/body/div[4]/div[2]/form/div[1]/div/div[2]/div/div[1]/div[1]/div[2]/div[5]/ul/li/a")
hover = ActionChains(browser).move_to_element(element_to_hover_over)
hover.perform()
time.sleep(0.1)
browser.find_element_by_xpath("/html/body/div[4]/div[2]/form/div[1]/div/div[2]/div/div[1]/div[1]/div[2]/div[5]/ul/li/ul/li[1]/a").click()
time.sleep(11)
# click two more controls and wait for the results grid to load
browser.find_element_by_xpath("/html/body/div[4]/div[2]/form/div[1]/div/div[2]/div/div[1]/div[7]/div[1]/div/div[6]/div/a[12]").click()
browser.find_element_by_xpath("/html/body/div[4]/div[2]/form/div[1]/div/div[2]/div/div[1]/div[7]/a[1]").click()
time.sleep(10)

# hover over the rows-per-page selector and pick the fourth option
element_to_hover_over = browser.find_element_by_css_selector("#results > div.results_navigation.top_results_navigation.displayButtons > div.results_pager.ui-widget-content > div.rowCountContainer.lightBackground > span > div.rowCountSelectContainer > ul > li > a")
hover = ActionChains(browser).move_to_element(element_to_hover_over)
hover.perform()
time.sleep(1)
browser.find_element_by_css_selector("#results > div.results_navigation.top_results_navigation.displayButtons > div.results_pager.ui-widget-content > div.rowCountContainer.lightBackground > span > div.rowCountSelectContainer > ul > li > ul > li:nth-child(4) > a").click()

# walk the result pages, saving each page to its own .xlsx workbook
row = 0
for i in range(1,1001):
    workbook = xlsxwriter.Workbook(str(i) + ".xlsx")
    worksheet = workbook.add_worksheet()

    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)


    # parse the rendered page and collect every grid row
    soup = bs4.BeautifulSoup(browser.page_source, 'lxml')
    tr = soup.findAll('tr', {'role': 'row'})
    num = 1
    img_num = 0

    for every in tr:
        try:
            # data rows carry an id attribute; header rows raise KeyError
            # here and are skipped by the outer except below
            every['id']
            worksheet.write(row,0, str(num))
            #brand, source, status, relevance, origin, holder, holder_count, number, date, image_class, nice_ci, img = '', '', '', '', '','','','','','','',''
            for each in every:
                try:
                    # cells marked aria-hidden are skipped; a KeyError means
                    # the cell is visible and is written to the sheet below
                    each['aria-hidden']
                except KeyError:
                    try:
                        if each['aria-describedby'] == "gridForsearch_pane_BRAND":
                            worksheet.write(row,1, each['title'])
                        elif each['aria-describedby'] == "gridForsearch_pane_SOURCE":
                            worksheet.write(row,2, each.getText())
                        elif each['aria-describedby'] == "gridForsearch_pane_STATUS":
                            worksheet.write(row,3, each.getText())
                        elif each['aria-describedby'] == "gridForsearch_pane_score":
                            worksheet.write(row,4, each.getText())
                        elif each['aria-describedby'] == "gridForsearch_pane_OO":
                            worksheet.write(row,5, each.getText())
                        elif each['aria-describedby'] == "gridForsearch_pane_HOL":
                            worksheet.write(row,6, each.getText())
                        elif each['aria-describedby'] == "gridForsearch_pane_HOLC":
                            worksheet.write(row,7, each.getText())
                        elif each['aria-describedby'] == "gridForsearch_pane_ID":
                            worksheet.write(row,8, each.getText())
                        elif each['aria-describedby'] == "gridForsearch_pane_AD":
                            worksheet.write(row,9, each.getText())
                        elif each['aria-describedby'] == "gridForsearch_pane_LOGO":
                            worksheet.write(row,10, each.getText())
                        elif each['aria-describedby'] == "gridForsearch_pane_NC":
                            worksheet.write(row,11, each.getText())
                        elif each['aria-describedby'] == "gridForsearch_pane_IMG":
                            try:
                                # the src is relative; drop the leading '..' and
                                # prepend the site's base URL
                                img = "https://www3.wipo.int/branddb" + each.img['src'][2:]
                                res = requests.get(img)
                                img_file = open(str(img_num)+'.jpg', 'wb')
                                for chunk in res.iter_content(100000):
                                    img_file.write(chunk)
                                img_file.close()
                                worksheet.insert_image(row,12, str(img_num)+'.jpg', {
                                    'x_scale': 1,
                                    'y_scale': 0.5,
                                    'positioning': 1
                                    })
                                img_num += 1
                            except TypeError:
                                img = ''
                                worksheet.write(row,12, '')
                    except KeyError:
                        pass
            num += 1
            row += 1
        except KeyError:
            pass

    workbook.close()
    # click the next-page arrow in the bottom pager
    browser.find_element_by_css_selector("#results > div.results_navigation.bottom_results_navigation.displayButtons > div.results_pager.ui-widget-content > div.arrow_container > a:nth-child(4)").click()
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    print(i)

browser.quit()

I would recommend:

  1. Try to use only Selenium methods to locate elements instead of using beautifulsoup4 (see the first sketch below). Read More Here

  2. Try to avoid time.sleep() and use Selenium explicit or implicit waits instead (see the second sketch below). Read More Here

  3. Run the requests code that downloads the images asynchronously (see the third sketch below).

  4. Get a faster internet connection.
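For point 1, here is a minimal sketch of reading the result grid with Selenium alone, reusing the question's browser object; it assumes the same tr[role="row"] markup and gridForsearch_pane_* column ids that the BeautifulSoup code in the question relies on:

# Sketch only: data rows are assumed to be <tr role="row" id=...> whose cells
# carry the column name in their aria-describedby attribute.
rows = browser.find_elements_by_css_selector('tr[role="row"][id]')
for tr in rows:
    cells = tr.find_elements_by_css_selector('td[aria-describedby]')
    # map each cell to its column id, e.g. 'gridForsearch_pane_BRAND'
    record = {c.get_attribute('aria-describedby'): c.text for c in cells}
    brand = record.get('gridForsearch_pane_BRAND', '')

This removes the page_source round trip and the second HTML parse, at the cost of a few WebDriver calls per row.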
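For point 2, a minimal sketch of an explicit wait, reusing one of the question's own locators; WebDriverWait polls the condition and returns as soon as it holds, instead of always sleeping a fixed interval:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(browser, 15)  # give up after at most 15 seconds
# continues the moment the element becomes clickable, not after a fixed 5 s
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ui-id-10"]'))).click()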
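For point 3, a minimal sketch that downloads the logo images in parallel with a thread pool instead of fetching each one inline. One caveat: worksheet.insert_image() reads the file when it is called, so each page's batch has to finish downloading before the image cells are written:

import concurrent.futures
import requests

def download(url, filename):
    # fetch one image and write it to disk (runs in a worker thread)
    res = requests.get(url, timeout=30)
    with open(filename, 'wb') as f:
        f.write(res.content)

# collect (url, filename) pairs while parsing a page instead of downloading
# inline, then fetch the whole batch in parallel
pending = []  # filled in the parsing loop, e.g. pending.append((img, name))
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool:
    for url, name in pending:
        pool.submit(download, url, name)
# the with-block exits only after every submitted download has finished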

These should reduce the time a little bit.

If you are still not satisfied with the time it takes, I would suggest taking a look at the JavaScript framework Puppeteer, which is praised as being much faster than Selenium.