在字典中写入多个列表 pandas
Write multi list in dict pandas
我在尝试解析一个网站时遇到了这个问题。我已确定每件商品最多有 7 张图片。每张图片的链接都写入一个列表,然后保存到 Excel 中,这样每个链接各占一列,如文件 1.xlsx 所示。但是有些商品只有 3 张或 5 张图片。所以当图片数量少于 7 张时,我想用空字符串填充其余的字段。然而我得到的却是文件 2.xlsx 中的结果。
请帮我解决这个问题。
from datetime import datetime, timedelta
from time import sleep
import time, csv
from csv import reader
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import requests, json
def get_html(url, timeout=30):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as ``bytes`` (``Response.content``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # A browser-like User-Agent is sent with every request.
    # NOTE(review): presumably the site treats the default client differently — confirm.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here on an error status instead of handing an error page to the parser.
    response.raise_for_status()
    return response.content
# Question snippet: collect image URLs from each product page and write them
# to an Excel sheet with one column per image slot (Images1 .. Images7).
# NOTE(review): the indentation of this snippet was lost, so the exact nesting
# of the loops below cannot be read off the text — confirm against the original.
# NOTE(review): `pd` is used at the end, but this snippet's own import lines do
# not include pandas; it needs `import pandas as pd`.
goods_link = ['https://www.johnlewis.com/a-a-k-s-hana-raffia-cross-body-bag-navy-multi/p5559710']
# One accumulator list per output column.
Images1 = []
Images2 = []
Images3 = []
Images4 = []
Images5 = []
Images6 = []
Images7 = []
# Every image URL collected; it is never cleared in the code shown.
Img = []
for i in goods_link:
soup = BeautifulSoup(get_html(i), 'html.parser')
imgContainer = soup.find('div', {'class':'ProductImages_productImagesContainer__1v2kP'})
imgAll = imgContainer.find_all('div', {'class':'ImageMagnifier_zoomable-image-container__db7jH'})
for j in imgAll:
# Keep only the part of the src URL that precedes the '?$rsp' marker.
imgSrc = j.find('img').get('src').split('?$rsp')[0]
Img.append(imgSrc)
# zip() stops at the shorter input: when Img holds fewer than seven URLs the
# remaining ImagesN lists receive nothing. The list comprehension is used only
# for its append side effect.
[x.append(y) for x,y in zip([Images1, Images2, Images3, Images4, Images5, Images6, Images7], Img)]
# Copy each ImagesN list into `info` under the matching key. setdefault() runs
# only when the list has at least one element, so an empty ImagesN list leaves
# no key in `info`.
info = {}
for ii in Images1:
info.setdefault('Images1',[])
info['Images1'].append(ii)
for ii in Images2:
info.setdefault('Images2',[])
info['Images2'].append(ii)
for ii in Images3:
info.setdefault('Images3',[])
info['Images3'].append(ii)
for ii in Images4:
info.setdefault('Images4',[])
info['Images4'].append(ii)
for ii in Images5:
info.setdefault('Images5',[])
info['Images5'].append(ii)
for ii in Images6:
info.setdefault('Images6',[])
info['Images6'].append(ii)
for ii in Images7:
info.setdefault('Images7',[])
info['Images7'].append(ii)
# Build a DataFrame from the dict of lists and write it to ./output.xlsx.
df = pd.DataFrame.from_dict(info)
df.to_excel('./output.xlsx')
print('Finish')
IIUC(如果我理解正确),您希望每一行都填满全部 7 列,即使该行的图片少于 7 张。
创建字典这一步是多余的。您可以把每件商品的所有图片链接放进一个列表,再把该列表追加到一个“列表的列表”中,然后由它创建 DataFrame。
您可以用 columns= 参数指定列标题(headers):
def get_html(url, timeout=30):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as ``bytes`` (``Response.content``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # A browser-like User-Agent is sent with every request.
    # NOTE(review): presumably the site treats the default client differently — confirm.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here on an error status instead of handing an error page to the parser.
    response.raise_for_status()
    return response.content
# Answer snippet: build one fixed-width row of image URLs per product page
# and write the resulting table to ./output.xlsx.
import pandas as pd  # pd.DataFrame is used below; the snippet showed no pandas import

goods_link = ['https://www.johnlewis.com/a-a-k-s-hana-raffia-cross-body-bag-navy-multi/p5559710']
headers = ["Images1", "Images2", "Images3", "Images4", "Images5", "Images6", "Images7"]
# The row width follows the header list so the two cannot drift apart.
max_images = len(headers)

# One row per product page; every row has exactly one slot per header.
img_table = []
for link in goods_link:
    # Start from an all-None row so a product with fewer images still fills
    # every column; pandas writes missing values as empty cells by default.
    img_row = [None] * max_images
    soup = BeautifulSoup(get_html(link), 'html.parser')
    imgContainer = soup.find('div', {'class': 'ProductImages_productImagesContainer__1v2kP'})
    # find() returns None when the container is absent; keep the empty row
    # for that product instead of failing with AttributeError.
    if imgContainer is None:
        imgAll = []
    else:
        imgAll = imgContainer.find_all('div', {'class': 'ImageMagnifier_zoomable-image-container__db7jH'})
    # Slice to the row width so a page with more images than columns cannot
    # raise IndexError.
    for j, div_obj in enumerate(imgAll[:max_images]):
        img_tag = div_obj.find('img')
        src = img_tag.get('src') if img_tag is not None else None
        if src:
            # Keep only the part of the URL that precedes the '?$rsp' marker.
            img_row[j] = src.split('?$rsp')[0]
    img_table.append(img_row)

df = pd.DataFrame(img_table, columns=headers)
df.to_excel('./output.xlsx')
print('Finish')
原代码缺少的步骤是:先创建一个长度为 7、元素全为 None 的列表,然后使用 enumerate 把索引 j 处的元素替换为相应的链接。
另外,请尽量使用含义清晰的变量名,这样下次阅读代码时会更容易理解。
我在尝试解析一个网站时遇到了这个问题。我已确定每件商品最多有 7 张图片。每张图片的链接都写入一个列表,然后保存到 Excel 中,这样每个链接各占一列,如文件 1.xlsx 所示。但是有些商品只有 3 张或 5 张图片。所以当图片数量少于 7 张时,我想用空字符串填充其余的字段。然而我得到的却是文件 2.xlsx 中的结果。
请帮我解决这个问题。
from datetime import datetime, timedelta
from time import sleep
import time, csv
from csv import reader
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import requests, json
def get_html(url, timeout=30):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as ``bytes`` (``Response.content``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # A browser-like User-Agent is sent with every request.
    # NOTE(review): presumably the site treats the default client differently — confirm.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here on an error status instead of handing an error page to the parser.
    response.raise_for_status()
    return response.content
# Question snippet: collect image URLs from each product page and write them
# to an Excel sheet with one column per image slot (Images1 .. Images7).
# NOTE(review): the indentation of this snippet was lost, so the exact nesting
# of the loops below cannot be read off the text — confirm against the original.
# NOTE(review): `pd` is used at the end, but this snippet's own import lines do
# not include pandas; it needs `import pandas as pd`.
goods_link = ['https://www.johnlewis.com/a-a-k-s-hana-raffia-cross-body-bag-navy-multi/p5559710']
# One accumulator list per output column.
Images1 = []
Images2 = []
Images3 = []
Images4 = []
Images5 = []
Images6 = []
Images7 = []
# Every image URL collected; it is never cleared in the code shown.
Img = []
for i in goods_link:
soup = BeautifulSoup(get_html(i), 'html.parser')
imgContainer = soup.find('div', {'class':'ProductImages_productImagesContainer__1v2kP'})
imgAll = imgContainer.find_all('div', {'class':'ImageMagnifier_zoomable-image-container__db7jH'})
for j in imgAll:
# Keep only the part of the src URL that precedes the '?$rsp' marker.
imgSrc = j.find('img').get('src').split('?$rsp')[0]
Img.append(imgSrc)
# zip() stops at the shorter input: when Img holds fewer than seven URLs the
# remaining ImagesN lists receive nothing. The list comprehension is used only
# for its append side effect.
[x.append(y) for x,y in zip([Images1, Images2, Images3, Images4, Images5, Images6, Images7], Img)]
# Copy each ImagesN list into `info` under the matching key. setdefault() runs
# only when the list has at least one element, so an empty ImagesN list leaves
# no key in `info`.
info = {}
for ii in Images1:
info.setdefault('Images1',[])
info['Images1'].append(ii)
for ii in Images2:
info.setdefault('Images2',[])
info['Images2'].append(ii)
for ii in Images3:
info.setdefault('Images3',[])
info['Images3'].append(ii)
for ii in Images4:
info.setdefault('Images4',[])
info['Images4'].append(ii)
for ii in Images5:
info.setdefault('Images5',[])
info['Images5'].append(ii)
for ii in Images6:
info.setdefault('Images6',[])
info['Images6'].append(ii)
for ii in Images7:
info.setdefault('Images7',[])
info['Images7'].append(ii)
# Build a DataFrame from the dict of lists and write it to ./output.xlsx.
df = pd.DataFrame.from_dict(info)
df.to_excel('./output.xlsx')
print('Finish')
IIUC(如果我理解正确),您希望每一行都填满全部 7 列,即使该行的图片少于 7 张。
创建字典这一步是多余的。您可以把每件商品的所有图片链接放进一个列表,再把该列表追加到一个“列表的列表”中,然后由它创建 DataFrame。
您可以用 columns= 参数指定列标题(headers):
def get_html(url, timeout=30):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as ``bytes`` (``Response.content``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # A browser-like User-Agent is sent with every request.
    # NOTE(review): presumably the site treats the default client differently — confirm.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here on an error status instead of handing an error page to the parser.
    response.raise_for_status()
    return response.content
# Answer snippet: build one fixed-width row of image URLs per product page
# and write the resulting table to ./output.xlsx.
import pandas as pd  # pd.DataFrame is used below; the snippet showed no pandas import

goods_link = ['https://www.johnlewis.com/a-a-k-s-hana-raffia-cross-body-bag-navy-multi/p5559710']
headers = ["Images1", "Images2", "Images3", "Images4", "Images5", "Images6", "Images7"]
# The row width follows the header list so the two cannot drift apart.
max_images = len(headers)

# One row per product page; every row has exactly one slot per header.
img_table = []
for link in goods_link:
    # Start from an all-None row so a product with fewer images still fills
    # every column; pandas writes missing values as empty cells by default.
    img_row = [None] * max_images
    soup = BeautifulSoup(get_html(link), 'html.parser')
    imgContainer = soup.find('div', {'class': 'ProductImages_productImagesContainer__1v2kP'})
    # find() returns None when the container is absent; keep the empty row
    # for that product instead of failing with AttributeError.
    if imgContainer is None:
        imgAll = []
    else:
        imgAll = imgContainer.find_all('div', {'class': 'ImageMagnifier_zoomable-image-container__db7jH'})
    # Slice to the row width so a page with more images than columns cannot
    # raise IndexError.
    for j, div_obj in enumerate(imgAll[:max_images]):
        img_tag = div_obj.find('img')
        src = img_tag.get('src') if img_tag is not None else None
        if src:
            # Keep only the part of the URL that precedes the '?$rsp' marker.
            img_row[j] = src.split('?$rsp')[0]
    img_table.append(img_row)

df = pd.DataFrame(img_table, columns=headers)
df.to_excel('./output.xlsx')
print('Finish')
原代码缺少的步骤是:先创建一个长度为 7、元素全为 None 的列表,然后使用 enumerate 把索引 j 处的元素替换为相应的链接。
另外,请尽量使用含义清晰的变量名,这样下次阅读代码时会更容易理解。