Transform data into CSV for Shopify store upload
I'm trying to create a CSV file to upload to a Shopify store. According to Shopify, to add multiple images on import you have to:
Insert a new row (one for each image).
Copy and paste the handle.
Copy and paste the image URL.
So the first image goes on the product's first row, and all subsequent images go on the rows below it. An example CSV is here: https://help.shopify.com/csv/product_template.csv
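To make that layout concrete, here is a minimal standalone sketch (the handle and image URLs are made up for illustration) of one product with three images written in that shape: the first row carries the product fields plus the first image, and every additional image gets its own row that repeats the handle and leaves the other columns empty.

import csv

# Hypothetical product, just to show the target row layout.
product = {
    "Handle": "malabar-armchair",
    "Title": "Malabar Armchair",
    "Image_Src": [
        "https://example.com/images/armchair-front.jpg",
        "https://example.com/images/armchair-side.jpg",
        "https://example.com/images/armchair-back.jpg",
    ],
}

with open("layout_example.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Handle", "Title", "Image_Src"])
    images = list(product["Image_Src"])
    # First row: all product fields plus the first image.
    writer.writerow([product["Handle"], product["Title"], images.pop(0) if images else None])
    # Every remaining image: its own row, repeating only the handle.
    for image in images:
        writer.writerow([product["Handle"], None, image])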
I want to write something that loops over an array like the one below (except obviously much longer) and converts it to a CSV, putting every photo except the first one on a new row. This is the code I tried:
import scrapy
from scrapy.crawler import CrawlerProcess
import csv


class SweetPeaAndWillowSpider(scrapy.Spider):
    name = "sweetpea_and_willow"
    custom_settings = {
        # "FEED_FORMAT": "csv",
        # "FEED_URI": "malabar_furniture.csv",
        "LOG_FILE": "malabar_furniture_shopify.log",
    }
    data = []
    headers = {
        "authority": "www.sweetpeaandwillow.com",
        "cache-control": "max-age=0",
        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Yandex";v="22"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.141 YaBrowser/22.3.3.886 (beta) Yowser/2.5 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "sec-fetch-site": "same-origin",
        "sec-fetch-mode": "navigate",
        "sec-fetch-user": "?1",
        "sec-fetch-dest": "document",
        "accept-language": "en,ru;q=0.9",
    }
    cookies = {
        "amzn-checkout-session": "%7B%7D",
        "_fbp": "fb.1.1652394481944.1343184112",
        "_pin_unauth": "dWlkPU56VmhNak5rTUdVdE1EVmhaQzAwTkdabExXRm1PREF0TnpOak9XRXdOek5rTjJFeg",
        "_ga": "GA1.2.752968178.1652394485",
        "SPSI": "4eea709914a47dc1f5575f79dc373b51",
        "SPSE": "oc1iOVbm463lrWtCnix8S1Zlf9aGvPeKg7TG7d/WQXvAZjkksosjO/BSl80SLUWb/O8aqo3+lQSH9B1gMRWVdQ==",
        "PHPSESSID": "n6mfpugp82troila6hfib78q3k",
        "UTGv2": "h483379466221b95c6e78e9eb01940db0f64",
        "_hjSessionUser_2692700": "eyJpZCI6ImQ0MDU3M2YzLWM0YjItNTJjMS04YzNiLTM4NzcyMWI5MGY0MyIsImNyZWF0ZWQiOjE2NTIzOTQ0ODI4MTAsImV4aXN0aW5nIjp0cnVlfQ==",
        "_hjIncludedInSessionSample": "0",
        "_hjSession_2692700": "eyJpZCI6ImExOWI0YjI5LTcxODYtNGU5Ny05Y2UwLTVjYmFmODQ0MWZjYiIsImNyZWF0ZWQiOjE2NTI1OTk3NDU3MTAsImluU2FtcGxlIjpmYWxzZX0=",
        "_hjAbsoluteSessionInProgress": "0",
        "form_key": "LCm4cy48SHYhBX3C",
        "_gid": "GA1.2.1948251329.1652599747",
        "_gat": "1",
        "mage-cache-storage": "%7B%7D",
        "mage-cache-storage-section-invalidation": "%7B%7D",
        "mage-cache-sessid": "true",
        "recently_viewed_product": "%7B%7D",
        "recently_viewed_product_previous": "%7B%7D",
        "recently_compared_product": "%7B%7D",
        "recently_compared_product_previous": "%7B%7D",
        "product_data_storage": "%7B%7D",
        "section_data_ids": "%7B%22cart%22%3A1652599747%7D",
        "newsletter-popup-form": "declined",
        "spcsrf": "ef84c17476941fe30a45db5a0a4b8686",
        "sp_lit": "JHxME1OUKp+83P5XseqYpg==",
        "PRLST": "AH",
        "adOtr": "7ae049U19a4",
    }

    def start_requests(self):
        yield scrapy.Request(
            "https://www.sweetpeaandwillow.com/brands/emotional-brands/malabar?p=1",
            headers=self.headers,
            cookies=self.cookies,
            callback=self.parse_urls,
        )

    def parse_urls(self, response):
        url_list = response.css("div.item.product-item")
        for link in url_list:
            url = link.css("a::attr(href)").get()
            yield scrapy.Request(
                url=url,
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse_details,
            )

    def parse_details(self, response):
        table = response.css("table.data.table.additional-attributes")
        for tr in table.css("tbody"):
            row = tr.css("tr")
            color = row[0].css("td::text").get()
            dimension = row[1].css("td::text").get()
            material = row[2].css("td::text").get()
            self.data.append(
                {
                    "Handle": response.css("h1.page-title ::text").get().lower(),
                    "Title": response.css("h1.page-title ::text").get(),
                    "Descritpion": response.css(
                        "div#description_product_show > p::text"
                    ).get(),
                    "Price": response.css("div.original-pricing-wrapper")
                    .css("span.price ::text")
                    .getall()[28],
                    "Delivery": response.css("p.availability-message > span::text").get(),
                    "Color": color,
                    "Dimensions": dimension,
                    "Material": material,
                    "Image_Src": response.css("div.MagicSlideshow")
                    .css("a img::attr(src)")
                    .getall(),
                }
            )
        # print(self.data)
        f = csv.writer(open("malabar_furniture_shopify.csv", "w", newline=""))
        f.writerow(
            [
                "Handle",
                "Title",
                "Descritpion",
                "Price",
                "Delivery",
                "Color",
                "Dimensions",
                "Material",
                "Image_Src",
            ]
        )
        for d in self.data:
            images = d["Image_Src"]
            f.writerow(
                [
                    d["Handle"],
                    d["Title"],
                    d["Descritpion"],
                    d["Price"],
                    d["Delivery"],
                    d["Color"],
                    d["Dimensions"],
                    d["Material"],
                    images.pop(0) if images else None,
                ]
            )
            while images:
                f.writerow(
                    [None, None, None, None, None, None, None, None, images.pop(0)]
                )


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(SweetPeaAndWillowSpider)
    process.start()
Output:
Update: I tried opening the file and defining the headers at the start, but it made no difference. I tried appending to the file with "a", but that creates duplicate entries with repeated headers.
I only get the Image_Src links for one product, the last one. Does anyone know how to fix this? Thanks.
You are creating and writing "malabar_furniture_shopify.csv" for every response. As a result you only see the last entry, because all of the earlier ones get overwritten.
One possible fix is to append your results instead:
with open("malabar_furniture_shopify.csv", "a", newline="") as csvfile:
You would then need a flag to make sure the header is only written for the first entry. newline="" is used to make sure you don't get extra blank lines in the output.
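As a sketch of that flag idea (my illustration, reusing the column names from the spider), each product's rows could be appended like this:

import csv

header_written = False  # flag: write the header only once per run


def append_rows(rows, path="malabar_furniture_shopify.csv"):
    # Note that "a" mode also keeps rows from earlier runs,
    # so delete the old file before re-running the spider.
    global header_written
    with open(path, "a", newline="") as csvfile:
        writer = csv.writer(csvfile)
        if not header_written:
            writer.writerow(
                ["Handle", "Title", "Descritpion", "Price", "Delivery",
                 "Color", "Dimensions", "Material", "Image_Src"]
            )
            header_written = True
        writer.writerows(rows)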
A better approach, though, is to open the file once at the start and write the header a single time. Then write every row through the same file handle, and make sure the file is closed at the end.
Try the following:
import scrapy
from scrapy.crawler import CrawlerProcess
import csv


class SweetPeaAndWillowSpider(scrapy.Spider):
    name = "sweetpea_and_willow"
    custom_settings = {
        # "FEED_FORMAT": "csv",
        # "FEED_URI": "malabar_furniture.csv",
        "LOG_FILE": "malabar_furniture_shopify.log",
    }
    headers = {
        "authority": "www.sweetpeaandwillow.com",
        "cache-control": "max-age=0",
        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Yandex";v="22"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.141 YaBrowser/22.3.3.886 (beta) Yowser/2.5 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "sec-fetch-site": "same-origin",
        "sec-fetch-mode": "navigate",
        "sec-fetch-user": "?1",
        "sec-fetch-dest": "document",
        "accept-language": "en,ru;q=0.9",
    }
    cookies = {
        "amzn-checkout-session": "%7B%7D",
        "_fbp": "fb.1.1652394481944.1343184112",
        "_pin_unauth": "dWlkPU56VmhNak5rTUdVdE1EVmhaQzAwTkdabExXRm1PREF0TnpOak9XRXdOek5rTjJFeg",
        "_ga": "GA1.2.752968178.1652394485",
        "SPSI": "4eea709914a47dc1f5575f79dc373b51",
        "SPSE": "oc1iOVbm463lrWtCnix8S1Zlf9aGvPeKg7TG7d/WQXvAZjkksosjO/BSl80SLUWb/O8aqo3+lQSH9B1gMRWVdQ==",
        "PHPSESSID": "n6mfpugp82troila6hfib78q3k",
        "UTGv2": "h483379466221b95c6e78e9eb01940db0f64",
        "_hjSessionUser_2692700": "eyJpZCI6ImQ0MDU3M2YzLWM0YjItNTJjMS04YzNiLTM4NzcyMWI5MGY0MyIsImNyZWF0ZWQiOjE2NTIzOTQ0ODI4MTAsImV4aXN0aW5nIjp0cnVlfQ==",
        "_hjIncludedInSessionSample": "0",
        "_hjSession_2692700": "eyJpZCI6ImExOWI0YjI5LTcxODYtNGU5Ny05Y2UwLTVjYmFmODQ0MWZjYiIsImNyZWF0ZWQiOjE2NTI1OTk3NDU3MTAsImluU2FtcGxlIjpmYWxzZX0=",
        "_hjAbsoluteSessionInProgress": "0",
        "form_key": "LCm4cy48SHYhBX3C",
        "_gid": "GA1.2.1948251329.1652599747",
        "_gat": "1",
        "mage-cache-storage": "%7B%7D",
        "mage-cache-storage-section-invalidation": "%7B%7D",
        "mage-cache-sessid": "true",
        "recently_viewed_product": "%7B%7D",
        "recently_viewed_product_previous": "%7B%7D",
        "recently_compared_product": "%7B%7D",
        "recently_compared_product_previous": "%7B%7D",
        "product_data_storage": "%7B%7D",
        "section_data_ids": "%7B%22cart%22%3A1652599747%7D",
        "newsletter-popup-form": "declined",
        "spcsrf": "ef84c17476941fe30a45db5a0a4b8686",
        "sp_lit": "JHxME1OUKp+83P5XseqYpg==",
        "PRLST": "AH",
        "adOtr": "7ae049U19a4",
    }

    def start_requests(self):
        self.f_output = open("malabar_furniture_shopify.csv", "w", newline="")
        self.csv_output = csv.writer(self.f_output)
        self.csv_output.writerow(
            [
                "Handle",
                "Title",
                "Descritpion",
                "Price",
                "Delivery",
                "Color",
                "Dimensions",
                "Material",
                "Image_Src",
            ]
        )
        yield scrapy.Request(
            "https://www.sweetpeaandwillow.com/brands/emotional-brands/malabar?p=1",
            headers=self.headers,
            cookies=self.cookies,
            callback=self.parse_urls,
        )

    def parse_urls(self, response):
        url_list = response.css("div.item.product-item")
        for link in url_list:
            url = link.css("a::attr(href)").get()
            yield scrapy.Request(
                url=url,
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse_details,
            )

    def parse_details(self, response):
        data = []
        table = response.css("table.data.table.additional-attributes")
        for tr in table.css("tbody"):
            row = tr.css("tr")
            color = row[0].css("td::text").get()
            dimension = row[1].css("td::text").get()
            material = row[2].css("td::text").get()
            data.append(
                {
                    "Handle": response.css("h1.page-title ::text").get().lower(),
                    "Title": response.css("h1.page-title ::text").get(),
                    "Descritpion": response.css(
                        "div#description_product_show > p::text"
                    ).get(),
                    "Price": response.css("div.original-pricing-wrapper")
                    .css("span.price ::text")
                    .getall()[28],
                    "Delivery": response.css("p.availability-message > span::text").get(),
                    "Color": color,
                    "Dimensions": dimension,
                    "Material": material,
                    "Image_Src": response.css("div.MagicSlideshow")
                    .css("a img::attr(src)")
                    .getall(),
                }
            )
        for d in data:
            images = d["Image_Src"]
            self.csv_output.writerow(
                [
                    d["Handle"],
                    d["Title"],
                    d["Descritpion"],
                    d["Price"],
                    d["Delivery"],
                    d["Color"],
                    d["Dimensions"],
                    d["Material"],
                    images.pop(0) if images else None,
                ]
            )
            while images:
                self.csv_output.writerow(
                    [None, None, None, None, None, None, None, None, images.pop(0)]
                )

    def closed(self, spider):
        self.f_output.close()


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(SweetPeaAndWillowSpider)
    process.start()
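Note that Scrapy calls the spider's closed() method automatically when the crawl finishes, so the file handle opened in start_requests is flushed and closed without needing a with block around it.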
The reason for the duplicates you saw when appending is that you were always appending to the global data list, so every response wrote out the whole (growing) list again; the version above uses a local data list inside parse_details instead.
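A tiny illustration of that gotcha outside Scrapy (a hypothetical class, just to show the effect of a list defined at class level):

class Collector:
    data = []  # defined on the class, so it is shared for the whole run

    def handle(self, item):
        self.data.append(item)
        return list(self.data)  # returns everything seen so far, not just `item`


c = Collector()
print(c.handle("product-1"))  # ['product-1']
print(c.handle("product-2"))  # ['product-1', 'product-2'] -- product-1 appears again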