Transform data into CSV for Shopify store upload

I'm trying to build a CSV file to upload to a Shopify store. According to Shopify, to add multiple images on import you have to do the following:

Insert a new row (one for each image).

Copy and paste the handle.

Copy and paste the image URL.

So the first image goes on the first row and all subsequent images go on the rows below it. An example CSV is here: https://help.shopify.com/csv/product_template.csv
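
The way I read the template, a product with three images should come out with the product fields on the first row and each remaining image on its own row that repeats the handle. A minimal sketch of that layout (made-up handle and URLs, simplified columns):

import csv

product = {
    "Handle": "malabar-sofa",
    "Title": "Malabar Sofa",
    "Images": [
        "https://example.com/1.jpg",
        "https://example.com/2.jpg",
        "https://example.com/3.jpg",
    ],
}

with open("example.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Handle", "Title", "Image Src"])
    first, *rest = product["Images"]
    # First row carries the product fields plus the first image.
    writer.writerow([product["Handle"], product["Title"], first])
    for image in rest:
        # Each remaining image gets its own row: repeat the handle, leave the rest blank.
        writer.writerow([product["Handle"], "", image])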

I want to write something that loops through an array like the one below (only much longer) and converts it to CSV, putting every photo except the first onto a new row. This is the code I tried:

import scrapy
from scrapy.crawler import CrawlerProcess
import csv


class SweetPeaAndWillowSpider(scrapy.Spider):
    name = "sweetpea_and_willow"

    custom_settings = {
        # "FEED_FORMAT": "csv",
        # "FEED_URI": "malabar_furniture.csv",
        "LOG_FILE": "malabar_furniture_shopify.log",
    }

    data = []

    headers = {
        "authority": "www.sweetpeaandwillow.com",
        "cache-control": "max-age=0",
        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Yandex";v="22"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.141 YaBrowser/22.3.3.886 (beta) Yowser/2.5 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "sec-fetch-site": "same-origin",
        "sec-fetch-mode": "navigate",
        "sec-fetch-user": "?1",
        "sec-fetch-dest": "document",
        "accept-language": "en,ru;q=0.9",
    }

    cookies = {
        "amzn-checkout-session": "%7B%7D",
        "_fbp": "fb.1.1652394481944.1343184112",
        "_pin_unauth": "dWlkPU56VmhNak5rTUdVdE1EVmhaQzAwTkdabExXRm1PREF0TnpOak9XRXdOek5rTjJFeg",
        "_ga": "GA1.2.752968178.1652394485",
        "SPSI": "4eea709914a47dc1f5575f79dc373b51",
        "SPSE": "oc1iOVbm463lrWtCnix8S1Zlf9aGvPeKg7TG7d/WQXvAZjkksosjO/BSl80SLUWb/O8aqo3+lQSH9B1gMRWVdQ==",
        "PHPSESSID": "n6mfpugp82troila6hfib78q3k",
        "UTGv2": "h483379466221b95c6e78e9eb01940db0f64",
        "_hjSessionUser_2692700": "eyJpZCI6ImQ0MDU3M2YzLWM0YjItNTJjMS04YzNiLTM4NzcyMWI5MGY0MyIsImNyZWF0ZWQiOjE2NTIzOTQ0ODI4MTAsImV4aXN0aW5nIjp0cnVlfQ==",
        "_hjIncludedInSessionSample": "0",
        "_hjSession_2692700": "eyJpZCI6ImExOWI0YjI5LTcxODYtNGU5Ny05Y2UwLTVjYmFmODQ0MWZjYiIsImNyZWF0ZWQiOjE2NTI1OTk3NDU3MTAsImluU2FtcGxlIjpmYWxzZX0=",
        "_hjAbsoluteSessionInProgress": "0",
        "form_key": "LCm4cy48SHYhBX3C",
        "_gid": "GA1.2.1948251329.1652599747",
        "_gat": "1",
        "mage-cache-storage": "%7B%7D",
        "mage-cache-storage-section-invalidation": "%7B%7D",
        "mage-cache-sessid": "true",
        "recently_viewed_product": "%7B%7D",
        "recently_viewed_product_previous": "%7B%7D",
        "recently_compared_product": "%7B%7D",
        "recently_compared_product_previous": "%7B%7D",
        "product_data_storage": "%7B%7D",
        "section_data_ids": "%7B%22cart%22%3A1652599747%7D",
        "newsletter-popup-form": "declined",
        "spcsrf": "ef84c17476941fe30a45db5a0a4b8686",
        "sp_lit": "JHxME1OUKp+83P5XseqYpg==",
        "PRLST": "AH",
        "adOtr": "7ae049U19a4",
    }

    def start_requests(self):
        yield scrapy.Request(
            "https://www.sweetpeaandwillow.com/brands/emotional-brands/malabar?p=1",
            headers=self.headers,
            cookies=self.cookies,
            callback=self.parse_urls,
        )

    def parse_urls(self, response):
        url_list = response.css("div.item.product-item")

        for link in url_list:
            url = link.css("a::attr(href)").get()
            yield scrapy.Request(
                url=url,
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse_details,
            )

    def parse_details(self, response):
        table = response.css("table.data.table.additional-attributes")
        for tr in table.css("tbody"):
            row = tr.css("tr")
            color = row[0].css("td::text").get()
            dimension = row[1].css("td::text").get()
            material = row[2].css("td::text").get()
        self.data.append(
            {
                "Handle": response.css("h1.page-title ::text").get().lower(),
                "Title": response.css("h1.page-title ::text").get(),
                "Descritpion": response.css(
                    "div#description_product_show > p::text"
                ).get(),
                "Price": response.css("div.original-pricing-wrapper")
                .css("span.price ::text")
                .getall()[28],
                "Delivery": response.css("p.availability-message > span::text").get(),
                "Color": color,
                "Dimensions": dimension,
                "Material": material,
                "Image_Src": response.css("div.MagicSlideshow")
                .css("a img::attr(src)")
                .getall(),
            }
        )

        # print(self.data)

        f = csv.writer(open("malabar_furniture_shopify.csv", "w", newline=""))
        f.writerow(
            [
                "Handle",
                "Title",
                "Descritpion",
                "Price",
                "Delivery",
                "Color",
                "Dimensions",
                "Material",
                "Image_Src",
            ]
        )

        for d in self.data:
            images = d["Image_Src"]
            f.writerow(
                [
                    d["Handle"],
                    d["Title"],
                    d["Descritpion"],
                    d["Price"],
                    d["Delivery"],
                    d["Color"],
                    d["Dimensions"],
                    d["Material"],
                    images.pop(0) if images else None,
                ]
            )

            while images:
                f.writerow(
                    [None, None, None, None, None, None, None, None, images.pop(0)]
                )


if __name__ == "__main__":
    process = CrawlerProcess()

    process.crawl(SweetPeaAndWillowSpider)

    process.start()

Output:

Update: I tried opening the file and defining the header at the start, but it made no difference. I also tried appending to the file with "a", but that writes duplicate headers and creates duplicate entries.

I only get the Image_Src links for one product, the last one. Does anyone know how to fix this? Thanks.

You are creating and writing "malabar_furniture_shopify.csv" for every response. The result is that you only see the last entry, because all of the others are overwritten.

One possible fix is to append your results instead:

with open("malabar_furniture_shopify.csv", "a", newline="") as csvfile:

You would then need a flag to make sure the header is written only for your first entry. newline="" ensures you don't get extra blank lines in the output.
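
A rough sketch of that flag idea (append_rows and header_written are illustrative names, not part of Scrapy):

import csv

HEADER = ["Handle", "Title", "Descritpion", "Price", "Delivery",
          "Color", "Dimensions", "Material", "Image_Src"]

def append_rows(rows, header_written, path="malabar_furniture_shopify.csv"):
    # Append rows to the CSV, writing the header only if it has not been written yet.
    with open(path, "a", newline="") as csvfile:
        writer = csv.writer(csvfile)
        if not header_written:
            writer.writerow(HEADER)
        writer.writerows(rows)
    return True  # the caller stores this so the header is only ever written once

Each callback could then do something like self.header_written = append_rows(rows, getattr(self, "header_written", False)).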

A better approach is to open the file once at the beginning and write the header a single time. Then use the same file handle to write each row, and make sure the file is closed at the end.

Try the following:

import scrapy
from scrapy.crawler import CrawlerProcess
import csv


class SweetPeaAndWillowSpider(scrapy.Spider):
    name = "sweetpea_and_willow"

    custom_settings = {
        # "FEED_FORMAT": "csv",
        # "FEED_URI": "malabar_furniture.csv",
        "LOG_FILE": "malabar_furniture_shopify.log",
    }

    headers = {
        "authority": "www.sweetpeaandwillow.com",
        "cache-control": "max-age=0",
        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Yandex";v="22"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.141 YaBrowser/22.3.3.886 (beta) Yowser/2.5 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "sec-fetch-site": "same-origin",
        "sec-fetch-mode": "navigate",
        "sec-fetch-user": "?1",
        "sec-fetch-dest": "document",
        "accept-language": "en,ru;q=0.9",
    }

    cookies = {
        "amzn-checkout-session": "%7B%7D",
        "_fbp": "fb.1.1652394481944.1343184112",
        "_pin_unauth": "dWlkPU56VmhNak5rTUdVdE1EVmhaQzAwTkdabExXRm1PREF0TnpOak9XRXdOek5rTjJFeg",
        "_ga": "GA1.2.752968178.1652394485",
        "SPSI": "4eea709914a47dc1f5575f79dc373b51",
        "SPSE": "oc1iOVbm463lrWtCnix8S1Zlf9aGvPeKg7TG7d/WQXvAZjkksosjO/BSl80SLUWb/O8aqo3+lQSH9B1gMRWVdQ==",
        "PHPSESSID": "n6mfpugp82troila6hfib78q3k",
        "UTGv2": "h483379466221b95c6e78e9eb01940db0f64",
        "_hjSessionUser_2692700": "eyJpZCI6ImQ0MDU3M2YzLWM0YjItNTJjMS04YzNiLTM4NzcyMWI5MGY0MyIsImNyZWF0ZWQiOjE2NTIzOTQ0ODI4MTAsImV4aXN0aW5nIjp0cnVlfQ==",
        "_hjIncludedInSessionSample": "0",
        "_hjSession_2692700": "eyJpZCI6ImExOWI0YjI5LTcxODYtNGU5Ny05Y2UwLTVjYmFmODQ0MWZjYiIsImNyZWF0ZWQiOjE2NTI1OTk3NDU3MTAsImluU2FtcGxlIjpmYWxzZX0=",
        "_hjAbsoluteSessionInProgress": "0",
        "form_key": "LCm4cy48SHYhBX3C",
        "_gid": "GA1.2.1948251329.1652599747",
        "_gat": "1",
        "mage-cache-storage": "%7B%7D",
        "mage-cache-storage-section-invalidation": "%7B%7D",
        "mage-cache-sessid": "true",
        "recently_viewed_product": "%7B%7D",
        "recently_viewed_product_previous": "%7B%7D",
        "recently_compared_product": "%7B%7D",
        "recently_compared_product_previous": "%7B%7D",
        "product_data_storage": "%7B%7D",
        "section_data_ids": "%7B%22cart%22%3A1652599747%7D",
        "newsletter-popup-form": "declined",
        "spcsrf": "ef84c17476941fe30a45db5a0a4b8686",
        "sp_lit": "JHxME1OUKp+83P5XseqYpg==",
        "PRLST": "AH",
        "adOtr": "7ae049U19a4",
    }

    def start_requests(self):
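        # Open the output file once for the whole crawl and write the header a single time.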
        self.f_output = open("malabar_furniture_shopify.csv", "w", newline="")
        self.csv_output = csv.writer(self.f_output)
        
        self.csv_output.writerow(
            [
                "Handle",
                "Title",
                "Descritpion",
                "Price",
                "Delivery",
                "Color",
                "Dimensions",
                "Material",
                "Image_Src",
            ]
        )
        
        yield scrapy.Request(
            "https://www.sweetpeaandwillow.com/brands/emotional-brands/malabar?p=1",
            headers=self.headers,
            cookies=self.cookies,
            callback=self.parse_urls,
        )

    def parse_urls(self, response):
        url_list = response.css("div.item.product-item")

        for link in url_list:
            url = link.css("a::attr(href)").get()
            yield scrapy.Request(
                url=url,
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse_details,
            )

    def parse_details(self, response):
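        # Collect this product's rows in a local list rather than a shared class attribute.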
        data = []
        table = response.css("table.data.table.additional-attributes")
        for tr in table.css("tbody"):
            row = tr.css("tr")
            color = row[0].css("td::text").get()
            dimension = row[1].css("td::text").get()
            material = row[2].css("td::text").get()
            data.append(
                {
                    "Handle": response.css("h1.page-title ::text").get().lower(),
                    "Title": response.css("h1.page-title ::text").get(),
                    "Descritpion": response.css(
                        "div#description_product_show > p::text"
                    ).get(),
                    "Price": response.css("div.original-pricing-wrapper")
                    .css("span.price ::text")
                    .getall()[28],
                    "Delivery": response.css("p.availability-message > span::text").get(),
                    "Color": color,
                    "Dimensions": dimension,
                    "Material": material,
                    "Image_Src": response.css("div.MagicSlideshow")
                    .css("a img::attr(src)")
                    .getall(),
                }
            )

        for d in data:
            images = d["Image_Src"]
            self.csv_output.writerow(
                [
                    d["Handle"],
                    d["Title"],
                    d["Descritpion"],
                    d["Price"],
                    d["Delivery"],
                    d["Color"],
                    d["Dimensions"],
                    d["Material"],
                    images.pop(0) if images else None,
                ]
            )

            while images:
                self.csv_output.writerow(
                    [None, None, None, None, None, None, None, None, images.pop(0)]
                )

    def closed(self, reason):
        # Scrapy calls this when the spider finishes; close the CSV file handle.
        self.f_output.close()


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(SweetPeaAndWillowSpider)
    process.start()

The duplicates were caused by always appending to the global data list.
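
A toy illustration of the difference (Example, collect_shared and collect_local are made-up names, not your spider): a list defined at class level is shared, so every callback appends to the same object, and writing it out on each response repeats everything collected so far, while a list created inside the callback only contains that response's rows.

class Example:
    shared = []                      # class attribute: one list shared by every callback

    def collect_shared(self, item):
        self.shared.append(item)     # keeps growing across responses; re-writing it repeats old rows

    def collect_local(self, item):
        rows = [item]                # local list: holds only this response's rows
        return rows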