Python - Webscraping:无法从字典中获取单个项目
Python - Webscraping: Fails to get single item from dictionary
我有一个脚本可以添加(抓取)字典链接,然后分别抓取每个链接:
def get_products():
links = []
for x in range(1,3):
url1 = f"https://www.kaufland.de/category/39251/p{x}/"
r = s.get(url1)
items = r.html.find('div.results')
for item in items:
rawlinks = str(item.find('a'))
list1 = re.findall(r"[0-9]{9}", rawlinks)
result = ['https://www.kaufland.de/product/' + direction for direction in list1]
links.append(result)
for link in links:
wcapi = API(
url='https://s-qmzs9fc4fs3c.eu1.wpsandbox.org/',
consumer_key='ck_365febfbc43beca56bef990b20cb88db6022a5cd',
consumer_secret='cs_05c93973ffb6bc467483bf0105cd730a288dd405',
version="wc/v3"
)
r = requests.get(link)
html = r.text
soup = BeautifulSoup(html, 'html.parser')
title = soup.find('h1').text.strip()
price = soup.find('div', {'class':'rd-buybox__price'}).text.strip().replace(' €', '').replace(',', '.')
product_data = soup.find_all('script')[2] # 3rd script tag
product_data = str(product_data).partition('return {')[-1]
product_data = '{' + product_data.split('}(')[0] + '}'
saleprice = price
try:
ean = str(re.findall('[0-9]{13}' , product_data)[0])
except:
ean = ""
try:
brand = str(re.findall('id:[0-9]{8},name:"(.+?)"', product_data))
except:
brand = ""
try:
description = str(re.findall('descriptionHtml:"(.+?)"', product_data)).replace('\u003E', '').replace('\u003Cbr', '').replace('\u003Cli', '').replace('\u003C', '').replace('\u002Fli', '').replace('\u002F', '').replace('\\u002Fb', '').replace('\b', '').replace('\p', '')
except:
description = ""
try:
imgs = str(re.findall('fallbackSrc:"(.+?)"' , product_data))
except:
imgs = "https://www.anchorpackaging.com/wp-content/uploads/2016/10/SampleKit.jpg"
id = ''.join([random.choice(string.digits) for n in range(2)])
sku = ''.join([random.choice(string.digits) for n in range(7)])
scraped = {
"name": title,
"type": "simple",
"regular_price": price,
"stock_quantity": id,
"short_description": "Hersteller: " + brand.replace("']", "").replace("['", "") + ", " + "EAN: " + ean,
"description": description,
"categories": [
{
"id": id
}
],
"images": [
{
"src": imgs.replace('\\', '\').replace('\u002F', '/').replace(" '", "").replace("'", "").split(",")[1],
"alt": "img"
},
{
"src": imgs.replace('\\', '\').replace('\u002F', '/').replace(" '", "").replace("'", "").split(",")[2],
"alt": "img"
},
{
"src": imgs.replace('\\', '\').replace('\u002F', '/').replace(" '", "").replace("'", "").split(",")[3],
"alt": "img"
}
],
}
wcapi.post('products', scraped)
print(scraped)
print(get_products())
它给了我错误:requests.exceptions.InvalidSchema:没有找到“['link1, link2, link3 ']”的连接适配器,因此打印了 url 而不是 link1,我只是为了便于阅读而没有把它们写在这里.为什么我的脚本无法单独使用每个字典项,而是尝试连接到整个字典?
我认为你的问题在行内
links.append(result)
你想要
links.extend(result)
我有一个脚本可以添加(抓取)字典链接,然后分别抓取每个链接:
def get_products():
links = []
for x in range(1,3):
url1 = f"https://www.kaufland.de/category/39251/p{x}/"
r = s.get(url1)
items = r.html.find('div.results')
for item in items:
rawlinks = str(item.find('a'))
list1 = re.findall(r"[0-9]{9}", rawlinks)
result = ['https://www.kaufland.de/product/' + direction for direction in list1]
links.append(result)
for link in links:
wcapi = API(
url='https://s-qmzs9fc4fs3c.eu1.wpsandbox.org/',
consumer_key='ck_365febfbc43beca56bef990b20cb88db6022a5cd',
consumer_secret='cs_05c93973ffb6bc467483bf0105cd730a288dd405',
version="wc/v3"
)
r = requests.get(link)
html = r.text
soup = BeautifulSoup(html, 'html.parser')
title = soup.find('h1').text.strip()
price = soup.find('div', {'class':'rd-buybox__price'}).text.strip().replace(' €', '').replace(',', '.')
product_data = soup.find_all('script')[2] # 3rd script tag
product_data = str(product_data).partition('return {')[-1]
product_data = '{' + product_data.split('}(')[0] + '}'
saleprice = price
try:
ean = str(re.findall('[0-9]{13}' , product_data)[0])
except:
ean = ""
try:
brand = str(re.findall('id:[0-9]{8},name:"(.+?)"', product_data))
except:
brand = ""
try:
description = str(re.findall('descriptionHtml:"(.+?)"', product_data)).replace('\u003E', '').replace('\u003Cbr', '').replace('\u003Cli', '').replace('\u003C', '').replace('\u002Fli', '').replace('\u002F', '').replace('\\u002Fb', '').replace('\b', '').replace('\p', '')
except:
description = ""
try:
imgs = str(re.findall('fallbackSrc:"(.+?)"' , product_data))
except:
imgs = "https://www.anchorpackaging.com/wp-content/uploads/2016/10/SampleKit.jpg"
id = ''.join([random.choice(string.digits) for n in range(2)])
sku = ''.join([random.choice(string.digits) for n in range(7)])
scraped = {
"name": title,
"type": "simple",
"regular_price": price,
"stock_quantity": id,
"short_description": "Hersteller: " + brand.replace("']", "").replace("['", "") + ", " + "EAN: " + ean,
"description": description,
"categories": [
{
"id": id
}
],
"images": [
{
"src": imgs.replace('\\', '\').replace('\u002F', '/').replace(" '", "").replace("'", "").split(",")[1],
"alt": "img"
},
{
"src": imgs.replace('\\', '\').replace('\u002F', '/').replace(" '", "").replace("'", "").split(",")[2],
"alt": "img"
},
{
"src": imgs.replace('\\', '\').replace('\u002F', '/').replace(" '", "").replace("'", "").split(",")[3],
"alt": "img"
}
],
}
wcapi.post('products', scraped)
print(scraped)
print(get_products())
它给了我错误:requests.exceptions.InvalidSchema:没有找到“['link1, link2, link3 ']”的连接适配器,因此打印了 url 而不是 link1,我只是为了便于阅读而没有把它们写在这里.为什么我的脚本无法单独使用每个字典项,而是尝试连接到整个字典?
我认为你的问题在行内
links.append(result)
你想要
links.extend(result)