如何在 homedepot 中抓取不同选项组合对应的价格

How to scrape options combination prices in homedepot

我正在尝试从 homedepot 中抓取产品信息,例如价格、产品详细信息、规格、图像等。我能够抓取所有这些信息,但现在我不知道如果在产品中选择了不同的选项组合如何抓取价格,因为价格会根据这些选项发生变化。有什么方法可以为产品选项中的每个可能组合抓取价格和图像吗?

要更清楚地解释我的问题,请参阅该产品 url

https://custom.homedepot.com/custom-doors/p/Steves-Sons-Regency-Modern-Customizable-Fiberglass-Door/314599913/45272-Pre-Hung/57533-Single-w-Two-Sidelites-Transom/57526-36-x-93/40069-12/57523-64-1-2-x-95-1-4/55572-Autumn-Wheat/45147-Left-Hand-Inswing/55578-Glass-Panels/45143-Black-Bronze/35734-6-9-16

您可以看到图片右侧有一些选项,每个参数都有多个选项,这些选项的每个组合在选中时都会更改图片和价格。如果可能,我该如何抓取这些信息?

注意:我正在使用 Selenium 和 BeautifulSoup

更新:

到目前为止,这是我用于抓取产品页面中的选项部分的代码

def scrape_price(self):
    """Read the product price from ``self.soup`` and store it in ``self.data``.

    Three price layouts are tried in order: the large "price-format" block,
    the "price-detailed" row, and the "pricingReg" block. Only the last
    layout (or the absence of all three) writes ``self.data['Availability']``.
    When the "price-detailed" row exists, its cleaned text is additionally
    stored under "Detailed Price", plus "Case Unit Cover" when that div is
    present.
    """
    def div_with_class(css_class):
        # Every lookup in this function is a <div> matched on its class string.
        return self.soup.find("div", attrs={"class": css_class})

    large_block = div_with_class("price-format__large price-format__main-price")
    detailed_row = div_with_class("price-detailed__left-price-wrapper price-detailed__left-price-row")
    regular_block = div_with_class("pricingReg")

    if large_block:
        # Spans are read positionally: currency, dollars, then optional cents.
        spans = large_block.findAll("span")
        whole_amount = spans[0].text + spans[1].text
        cent_part = spans[2].text if len(spans) > 2 else ""
        if cent_part != "":
            self.data['price'] = whole_amount + "." + cent_part
        else:
            self.data['price'] = whole_amount
    elif detailed_row:
        unit_price = detailed_row.find("span", attrs={"class": "price-detailed__unit-price"})
        self.data['price'] = unit_price.find("span").text
    elif regular_block:
        parts = [
            regular_block.find("span", attrs={"class": css}).text
            for css in ("price__currency", "price__dollars", "price__cents")
        ]
        self.data['price'] = parts[0] + parts[1] + "." + parts[2]
        self.data['Availability'] = "Available"
    else:
        self.data['Availability'] = "Not Available"

    # The detailed row is recorded regardless of which layout supplied the price.
    if detailed_row:
        self.data["Detailed Price"] = cleanhtml(detailed_row.text)
        unit_cover = div_with_class("price-detailed__unit-cover")
        if unit_cover:
            self.data["Case Unit Cover"] = unit_cover.text


def scrape_images(self):
    """Collect product image URLs from ``self.soup`` into ``self.data["images"]``.

    The media-gallery buttons are preferred; when none exist, the thumbnail
    list layout is used instead. Elements that contain no ``<img>`` are
    skipped in both layouts. If neither layout is present, ``self.data`` is
    left untouched.
    """
    gallery_buttons = self.soup.find_all("button", attrs={'class': "mediagallery__imgblock"})
    if gallery_buttons:
        images = []
        for button in gallery_buttons:
            img = button.find("img")
            # A gallery button without an <img> used to raise AttributeError here;
            # skip it, as the thumbnail branch below already does.
            if img is not None:
                images.append(img.get("src"))
        self.data["images"] = images
        return

    thumbnail_list = self.soup.find("div", attrs={"class": "styles__ThumbnailList-sc-10zajq9-5 gyXsdF"})
    if thumbnail_list is None:
        return

    images = []
    for thumb in thumbnail_list.find_all("div", attrs={"class": "styles__ThumbnailInner-sc-10zajq9-1 icLycq"}):
        img = thumb.find("img")
        if img is not None:
            images.append(img.get("src"))
    self.data["images"] = images
def scrape_options(self):
    """Collect the configurable product options from ``self.soup``.

    Two layouts are handled:

    * ``div.super-sku``: writes ``self.data["Parameters"]``, a list of
      ``{"Label", "Value", "Options"}`` dicts. The label text is split at its
      first colon; an option is either ``{"img", "label"}`` (when it contains
      an image) or the option's text.
    * ``div.buybox__super-sku``: writes ``self.data["options"]``, one dict per
      direct child ``<div>``, holding "Label"/"Value" when a header is found
      and "choices" when one of the known choice containers is found.

    If neither container exists, ``self.data`` is left untouched.
    """

    def image_choices(nodes, name_attr):
        # Image-based choices: keep only nodes that actually contain an <img>
        # and read the display name from the given attribute of that image.
        found = []
        for node in nodes:
            img = node.find("img")
            if img is not None:
                found.append({"img": img.get("src"), "value": img.get(name_attr)})
        return found

    super_sku = self.soup.find("div", attrs={"class": "super-sku"})
    if super_sku is not None:
        parameters = []
        for param in super_sku.find_all("div", attrs={"class": "super-sku__inline-attribute"}):
            label_div = param.find("div", attrs={"class": "label"})
            label_text = cleanhtml(label_div.text) if label_div is not None else ""
            # partition() tolerates a label without a colon (value becomes "")
            # and keeps any further colons inside the value instead of dropping them.
            label, _, val = label_text.partition(':')

            tiles = param.find_all("div", attrs={"class": "super-sku__inline-tile--space"})
            if not tiles:
                tiles = param.find_all("button", attrs={"class": "super-sku__inline-swatch"})

            options = []
            for tile in tiles:
                img = tile.find("img")
                if img is not None:
                    options.append({"img": img.get("src"), "label": img.get("title")})
                    continue
                # find() only searches descendants, so when the tile itself is
                # the <button> (swatch fallback) read the tile's own text.
                button = tile.find("button")
                options.append((button if button is not None else tile).text)

            parameters.append({
                "Label": label,
                "Value": val,
                "Options": options
            })
        self.data["Parameters"] = parameters
        return

    buybox = self.soup.find("div", attrs={"class": "buybox__super-sku"})
    if buybox is None:
        return

    options = []
    for option_div in buybox.find_all("div", recursive=False):
        option = {}

        # Header: prefer the "HeaderRow" container, fall back to "Header".
        header_div = option_div.find("div", attrs={"class": "styles__HeaderRow-fb29x6-1"})
        if header_div is None:
            header_div = option_div.find("div", attrs={"class": "styles__Header-sc-1gql1zk-0"})
        if header_div is not None:
            label_span = header_div.find("span", attrs={"class": "styles__Label-sc-1gql1zk-1"})
            if label_span is not None:
                option["Label"] = label_span.text
            value_span = header_div.find("span", attrs={"class": "styles__Value-sc-1gql1zk-2"})
            if value_span is not None:
                option["Value"] = value_span.text

        # Choice containers, checked in the same priority order as before:
        # image boxes, drop-down list, colour swatches, then plain tiles.
        image_box = option_div.find("div", attrs={"class": "DefaultTemplate__FixedSizeChoiceImageWrapper-rpf825-0"})
        tile_box = option_div.find("div", attrs={"class": "styles__TileSelectWrapper-jw86q8-1"})
        list_box = option_div.find("div", attrs={"class": "product_sku_Overlay_ListBoxes"})
        swatch_box = option_div.find("div", attrs={"class": "product_sku_Overlay_ColorSwtHolder"})

        if image_box is not None:
            option["choices"] = image_choices(
                image_box.find_all("div", attrs={"class": "styles__ChoiceImage-kykx13-4"}), "alt")
        elif list_box is not None:
            choices = []
            for entry in list_box.find_all("span", attrs={"class": "drop-down__hover-effect"}):
                link = entry.find("a")
                if link is not None:
                    choices.append(link.text)
            option["choices"] = choices
        elif swatch_box is not None:
            option["choices"] = image_choices(
                swatch_box.find_all("li", attrs={"class": "styles__SwatchRoot-sc-1kr5yl9-1"}), "title")
        elif tile_box is not None:
            option["choices"] = [
                tile.text
                for tile in tile_box.find_all("div", attrs={"class": "styles__TileDiv-jw86q8-0"})
            ]

        options.append(option)
    self.data["options"] = options

现在我想知道如何获取这些选项的每个组合的价格

目标页面的一个重要特点是:只要切换(单击或选择)某个选项,后面就会出现其他影响价格的选项。此解决方案递归遍历各个选项行,依次单击其中的每个选项,并在其余选项出现后继续处理:

from selenium import webdriver
import time, re
# Module-level Chrome driver shared by get_combos below; the argument is a placeholder path to chromedriver.
# NOTE(review): newer Selenium releases expect the driver path via a Service object rather than positionally — confirm against the installed version.
d = webdriver.Chrome('/path/to/chromedriver')
# Load the configurable product page whose option combinations get_combos will click through.
d.get('https://custom.homedepot.com/custom-doors/p/Steves-Sons-Regency-Modern-Customizable-Fiberglass-Door/314599913/45272-Pre-Hung/57533-Single-w-Two-Sidelites-Transom/57526-36-x-93/40069-12/57523-64-1-2-x-95-1-4/55572-Autumn-Wheat/45147-Left-Hand-Inswing/55578-Glass-Panels/45143-Black-Bronze/35733-4-9-16')
def get_combos(_seen):
    """Recursively click through the option rows and yield one record per combination.

    Uses the module-level Selenium driver ``d``.

    ``_seen`` is a list of ``[row_index, [label, value]]`` entries for the rows
    already fixed on the current path. The first child of
    ``.buybox__super-sku`` whose index is non-zero and not yet in ``_seen`` is
    processed: each of its clickable options is clicked in turn and the
    function recurses with that row added. Once no such row is left, a dict
    with the current 'price', 'img' and 'combo' is yielded.

    If the chosen row matches none of the known option selectors, nothing is
    yielded for that path.
    """
    import logging  # local import so the snippet does not depend on other edits

    def read_text(row, selector):
        # querySelector returns null (not undefined) when nothing matches;
        # `== null` covers both, so a missing element yields 'n/a' instead of
        # throwing inside the browser.
        return d.execute_script(f"""return (x => x == null ? 'n/a' : x.textContent)(document.querySelector('.buybox__super-sku').children[{row}].querySelector('{selector}'))""")

    flag = False
    fixed_rows = dict(_seen)
    for i, _ in enumerate(d.execute_script("""return document.querySelector('.buybox__super-sku').children""")):
        if i and i not in fixed_rows:
            flag = True
            for _s in ['.styles__BoxChoice-kykx13-3', '.styles__TileSelectWrapper-jw86q8-1', '.styles__SwatchRoot-sc-1kr5yl9-1', '.drop-down__hover-effect a']:
                p = f'.buybox__super-sku > div:nth-child({i+1}) {_s}'
                if (op1 := d.execute_script(f"""return document.querySelectorAll('{p}')""")):
                    for j, _ in enumerate(op1):
                        try:
                            d.execute_script(f"""document.querySelectorAll('{p}')[{j}].click()""")
                            # Give the page time to render the rows that depend on this choice.
                            time.sleep(1)
                            picked = [read_text(i, '.styles__Label-sc-1gql1zk-1'), read_text(i, '.styles__Value-sc-1gql1zk-2')]
                            yield from get_combos([*_seen, [i, picked]])
                        except Exception:
                            # Best effort: skip this option but leave a trace. Unlike a
                            # bare except, this lets GeneratorExit/KeyboardInterrupt through.
                            logging.getLogger(__name__).warning("skipping option %s of row %s", j, i, exc_info=True)
                    # Only the first selector that matches anything is used for a row.
                    break
            # Only one row is handled per call; the recursion covers the rest.
            break
    if not flag:
        yield {'price': d.execute_script("""return document.querySelector('span:nth-of-type(1).pReg').textContent"""),
               'img': d.execute_script("""return document.querySelector('.styles__ThumbnailInner-sc-10zajq9-1.icLycq img').getAttribute('src')"""),
               'combo': _seen}
           
          
# Walk every option combination, starting with no row fixed.
result = list(get_combos([]))
# Flatten each record: formatted price, image URL, and one key per option label
# (the label's trailing colon and whitespace are stripped).
# NOTE(review): the price is rebuilt from the first six and the last two characters
# of the scraped text — this presumably assumes a fixed-width price; confirm.
final_result = [
    {
        'price': f'{combo["price"][:6]}.{combo["price"][-2:]}',
        'image': combo['img'],
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        **{re.sub(r':\s*$', '', label): value for _, [label, value] in combo['combo']},
    }
    for combo in result
]

输出:

[{'price': ',423.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009218?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Inswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '4 9/16'}, {'price': ',506.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009266?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Inswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '6 9/16'}, {'price': ',264.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009242?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Inswing', 'Panel Type': 'V-Groove Panel', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '4 9/16'}, {'price': ',346.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009290?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Inswing', 'Panel Type': 'V-Groove Panel', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '6 9/16'}, {'price': ',423.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009219?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size 
(WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Right-Hand Inswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '4 9/16'}, {'price': ',506.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009267?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Right-Hand Inswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '6 9/16'}, {'price': ',264.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009243?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Right-Hand Inswing', 'Panel Type': 'V-Groove Panel', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '4 9/16'}, {'price': ',346.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009291?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Right-Hand Inswing', 'Panel Type': 'V-Groove Panel', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '6 9/16'}, {'price': ',423.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009220?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Outswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 
'Black-Bronze', 'Jamb Depth (in.)': '4 9/16'}, {'price': ',506.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009268?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Outswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '6 9/16'}]

关于用于数千个输入链接的上述解决方案的并发版本,有几个直接的细节需要解决:

  1. 首先,托管您的 selenium 实例。您可以在自己的计算机上启动多个 selenium 驱动程序实例,或者使用 Browserless 之类的服务来为您执行此操作。 selenium 非常耗费资源,因此可以自动处理许多单独的 selenium 实例的托管服务可能是您最好的选择。
  2. 其次,与目标页面交互的方法。如果您使用 Browserless 之类的服务,您可以将 Javascript 函数传递给 driver.execute_script 并设置特定的超时时间,这会将页面交互卸载到服务本身。

下面是一个维护 selenium 驱动程序实例池的解决方案,其中包含 get_combos 的 async 版本。这些驱动程序可以指向远程 selenium 实例(如 Browserless)或您自己机器上的本地实例。

首先,是 get_combos 的 async 实现:

import asyncio, functools
from selenium import webdriver
async def get_page_combos(d, link, click_delay=1):
    """Load ``link`` in driver ``d`` and return one record per option combination.

    Args:
        d: driver object providing ``get(url)`` and ``execute_script(js)``.
        link: URL of the product page.
        click_delay: seconds to wait after each click before reading the page.

    Returns:
        A list of dicts with keys 'price', 'img' and 'combo', where 'combo' is
        a list of ``[row_index, [label, value]]`` entries.

    All driver calls run in the loop's default executor so that several pages
    can make progress concurrently.
    """
    import logging  # local import so the snippet does not depend on other edits

    loop = asyncio.get_running_loop()

    async def run_js(script):
        # execute_script blocks, so hand it to the default thread pool.
        return await loop.run_in_executor(None, functools.partial(d.execute_script, script))

    async def read_text(row, selector):
        # querySelector returns null (not undefined) when nothing matches;
        # `== null` covers both, so a missing element yields 'n/a'.
        return await run_js(f"""return (x => x == null ? 'n/a' : x.textContent)(document.querySelector('.buybox__super-sku').children[{row}].querySelector('{selector}'))""")

    async def get_combos(_seen):
        flag = False
        fixed_rows = dict(_seen)
        rows = await run_js("""return document.querySelector('.buybox__super-sku').children""")
        for i, _ in enumerate(rows):
            if i and i not in fixed_rows:
                flag = True
                for _s in ['.styles__BoxChoice-kykx13-3', '.styles__TileSelectWrapper-jw86q8-1', '.styles__SwatchRoot-sc-1kr5yl9-1', '.drop-down__hover-effect a']:
                    p = f'.buybox__super-sku > div:nth-child({i+1}) {_s}'
                    if (op1 := await run_js(f"""return document.querySelectorAll('{p}')""")):
                        for j, _ in enumerate(op1):
                            try:
                                await run_js(f"""document.querySelectorAll('{p}')[{j}].click()""")
                                await asyncio.sleep(click_delay)
                                new_vals = [
                                    await read_text(i, '.styles__Label-sc-1gql1zk-1'),
                                    await read_text(i, '.styles__Value-sc-1gql1zk-2'),
                                ]
                                async for pl in get_combos([*_seen, [i, new_vals]]):
                                    yield pl
                            except Exception:
                                # Best effort: skip this option but leave a trace. Cancellation
                                # and generator shutdown are BaseExceptions and pass through.
                                logging.getLogger(__name__).warning("skipping option %s of row %s", j, i, exc_info=True)
                        # Only the first selector that matches anything is used for a row.
                        break
                # Only one row is handled per call; the recursion covers the rest.
                break
        if not flag:
            yield {'price': await run_js("""return document.querySelector('span:nth-of-type(1).pReg').textContent"""),
                   'img': await run_js("""return document.querySelector('.styles__ThumbnailInner-sc-10zajq9-1.icLycq img').getAttribute('src')"""),
                   'combo': _seen}

    # Page load blocks as well; keep it off the event loop thread.
    await loop.run_in_executor(None, d.get, link)
    return [record async for record in get_combos([])]

其次,把它们放在一起:

async def main(links, instance_num = 10): #you can adjust the number of instances depending on your needs
    """Crawl ``links`` in batches of up to ``instance_num`` pages at a time.

    Args:
        links: iterable of product URLs; falsy entries are skipped. The
            caller's object is not modified.
        instance_num: maximum number of driver instances to run side by side.

    Returns:
        A list with one ``get_page_combos`` result per crawled link, in input order.

    Raises:
        ValueError: if ``instance_num`` is smaller than 1.
    """
    if instance_num < 1:
        # With no drivers no link could ever be consumed, so the loop would never end.
        raise ValueError("instance_num must be at least 1")

    pending = [link for link in links if link]
    drivers = []
    final_results = []
    try:
        # Never start more browsers than there are pages to load.
        for _ in range(min(instance_num, len(pending))):
            drivers.append(webdriver.Chrome('<path>')) #<path> can be substituted for a path to a local chromedriver executable or a url to a remote instance
        for start in range(0, len(pending), instance_num):
            batch = pending[start:start + instance_num]
            vals = await asyncio.gather(*[get_page_combos(drv, link) for drv, link in zip(drivers, batch)])
            final_results.extend(vals)
    finally:
        # Always shut the browsers down, even when a page fails.
        for drv in drivers:
            drv.quit()
    return final_results
  
links = [...] #all your homedepot links to be crawled
# NOTE: `[...]` above is a placeholder (a list holding Ellipsis) and must be replaced with real URLs.
# Run the crawl on a fresh event loop; all_page_vals receives the list returned by main.
all_page_vals = asyncio.run(main(links))