Python

Question

我有一个脚本可以添加（抓取）字典链接，然后分别抓取每个链接：

def get_products():
    links = []
    for x in range(1,3):
        url1 = f"https://www.kaufland.de/category/39251/p{x}/"
        r = s.get(url1)
        items = r.html.find('div.results')
        for item in items:
            rawlinks = str(item.find('a'))
            list1 = re.findall(r"[0-9]{9}", rawlinks)
            result = ['https://www.kaufland.de/product/' + direction for direction in list1]
            links.append(result)             
    for link in links:

        wcapi = API(
            url='https://s-qmzs9fc4fs3c.eu1.wpsandbox.org/',
            consumer_key='ck_365febfbc43beca56bef990b20cb88db6022a5cd',
            consumer_secret='cs_05c93973ffb6bc467483bf0105cd730a288dd405',
            version="wc/v3"
        )
        r = requests.get(link)
        html = r.text
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find('h1').text.strip()
        price = soup.find('div', {'class':'rd-buybox__price'}).text.strip().replace(' €', '').replace(',', '.')
        product_data = soup.find_all('script')[2] # 3rd script tag
        product_data = str(product_data).partition('return {')[-1]
        product_data = '{' + product_data.split('}(')[0] + '}'
        saleprice = price
        try:
            ean = str(re.findall('[0-9]{13}' , product_data)[0])
        except:
            ean = ""
        try:
            brand = str(re.findall('id:[0-9]{8},name:"(.+?)"', product_data))
        except:
            brand = ""
        try:
            description = str(re.findall('descriptionHtml:"(.+?)"', product_data)).replace('\u003E', '').replace('\u003Cbr', '').replace('\u003Cli', '').replace('\u003C', '').replace('\u002Fli', '').replace('\u002F', '').replace('\\u002Fb', '').replace('\b', '').replace('\p', '')
        except:
            description = ""
        try:
            imgs = str(re.findall('fallbackSrc:"(.+?)"' , product_data))
        except:
            imgs = "https://www.anchorpackaging.com/wp-content/uploads/2016/10/SampleKit.jpg"
        id = ''.join([random.choice(string.digits) for n in range(2)])
        sku = ''.join([random.choice(string.digits) for n in range(7)])
        
        scraped = {
                "name": title,
                "type": "simple",
                "regular_price": price,
                "stock_quantity": id,
                "short_description": "Hersteller: " + brand.replace("']", "").replace("['", "") + ", " + "EAN: " + ean,
                "description": description,
                "categories": [
                    {
                        "id": id
                    }
                ],

                "images": [
                    {
                        "src": imgs.replace('\\', '\').replace('\u002F', '/').replace(" '", "").replace("'", "").split(",")[1],
                        "alt": "img"
                    },
                    {
                        "src": imgs.replace('\\', '\').replace('\u002F', '/').replace(" '", "").replace("'", "").split(",")[2],
                        "alt": "img"
                    },
                    {
                        "src": imgs.replace('\\', '\').replace('\u002F', '/').replace(" '", "").replace("'", "").split(",")[3],
                        "alt": "img"
                    }
                ],
            }
        wcapi.post('products', scraped)
        print(scraped)

print(get_products())

它给了我错误：requests.exceptions.InvalidSchema：没有找到“['link1, link2, link3 ']”的连接适配器，因此打印了 url 而不是 link1，我只是为了便于阅读而没有把它们写在这里.为什么我的脚本无法单独使用每个字典项，而是尝试连接到整个字典？

Answer 1

我认为你的问题在行内 links.append(result)

你想要

links.extend(result)

Python - Webscraping：无法从字典中获取单个项目

Python - Webscraping: Fails to get single item from dictionary

for-loop

web-scraping