Return None - scraping p by Python
I want to scrape the product names and stock quantities, but it returns None.
data_insight = []
for n in range(pagenum):
    pages_url = f"https://www.insight.com/en_US/search.html?qtype=all&q=tp-link&qsrc=k&pq=%7B%22pageSize%22%3A100%2C%22currentPage%22%3A{n+1}%2C%22shownFlag%22%3Afalse%2C%22priceRangeLower%22%3Anull%2C%22priceRangeUpper%22%3Anull%2C%22cmtStandards%22%3Atrue%2C%22categoryId%22%3Anull%2C%22setType%22%3Anull%2C%22setId%22%3Anull%2C%22shared%22%3Anull%2C%22groupId%22%3Anull%2C%22cmtCustomerNumber%22%3Anull%2C%22groupName%22%3Anull%2C%22fromLicense%22%3Atrue%2C%22licenseContractIds%22%3Anull%2C%22assortmentIds%22%3Anull%2C%22controller%22%3Anull%2C%22fromcs%22%3Afalse%2C%22searchTerms%22%3A%7B%22TP-LINK%2520TECHNOLOGY%22%3A%7B%22field%22%3A%22field%22%2C%22value%22%3A%22A-HYBRIS-ManufacturerId~0007045098%22%7D%7D%2C%22sortBy%22%3A%22BestMatch%22%7D"
    driver.get(pages_url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '[class="prod-section-container"]')))
    html = driver.page_source
    soup = Soup(html)
    soup
    for item in soup.select('[class="prod-section-container"]'):
        data_insight.append({
            'title' : item.find("a", class_="select-prod").text,
            'name' : item.find("p", string="Insight Part"),
            'link' : item.find("a", class_="select-prod")['href'],
            'price' : item.find("span", class_="c-currency__value"),
            'stock' : item.find("p", class_="prod-stock").text
        })

df_insight = pd.DataFrame(data_insight)
df_insight.drop_duplicates()
df_insight
What happens?

With string= you search for a string instead of a tag, but the way you use it here it only matches the element's text exactly. The <p> you are targeting contains more text than just "Insight Part", so find() returns None.

How to fix it?

You could go with string=re.compile(), which matches a substring, but that is only the second-best solution.
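For illustration, here is a minimal sketch on made-up markup (the class name and part number below are assumptions, not copied from the Insight page) showing the exact-match behaviour and the re.compile() workaround:

import re
from bs4 import BeautifulSoup

html = '<p class="prod-sku">Insight Part #: IPN-12345</p>'
soup = BeautifulSoup(html, 'html.parser')

# exact match: the <p> text is not exactly "Insight Part", so this prints None
print(soup.find("p", string="Insight Part"))

# substring match via a regex finds the tag
print(soup.find("p", string=re.compile("Insight Part")).text)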
The better approach is to adjust your css selector and select all elements by their itemprop attribute:

soup.select('#js-search-product-items [itemprop="itemListElement"]')
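As a quick check, the same selector can be tried against a simplified fragment (the markup below is an assumption that only mirrors the attribute names used above, not the live page); selecting by itemprop does not depend on the visible text at all:

from bs4 import BeautifulSoup

html = '''
<div id="js-search-product-items">
  <div itemprop="itemListElement">
    <span itemprop="name">TP-Link Archer AX55</span>
    <span itemprop="sku">ARCHER-AX55</span>
    <a itemprop="url">https://example.com/product</a>
  </div>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')

for item in soup.select('#js-search-product-items [itemprop="itemListElement"]'):
    # each field is identified by its itemprop attribute, not by its text
    print(item.find(attrs={'itemprop': 'name'}).text)
    print(item.find(attrs={'itemprop': 'sku'}).text)
    print(item.find(attrs={'itemprop': 'url'}).text)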
Example
...
data_insight = []
for n in range(1):
    pages_url = f"https://www.insight.com/en_US/search.html?qtype=all&q=tp-link&qsrc=k&pq=%7B%22pageSize%22%3A100%2C%22currentPage%22%3A{n+1}%2C%22shownFlag%22%3Afalse%2C%22priceRangeLower%22%3Anull%2C%22priceRangeUpper%22%3Anull%2C%22cmtStandards%22%3Atrue%2C%22categoryId%22%3Anull%2C%22setType%22%3Anull%2C%22setId%22%3Anull%2C%22shared%22%3Anull%2C%22groupId%22%3Anull%2C%22cmtCustomerNumber%22%3Anull%2C%22groupName%22%3Anull%2C%22fromLicense%22%3Atrue%2C%22licenseContractIds%22%3Anull%2C%22assortmentIds%22%3Anull%2C%22controller%22%3Anull%2C%22fromcs%22%3Afalse%2C%22searchTerms%22%3A%7B%22TP-LINK%2520TECHNOLOGY%22%3A%7B%22field%22%3A%22field%22%2C%22value%22%3A%22A-HYBRIS-ManufacturerId~0007045098%22%7D%7D%2C%22sortBy%22%3A%22BestMatch%22%7D"
    driver.get(pages_url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '[class="prod-section-container"]')))
    html = driver.page_source
    soup = Soup(html)

    # select every product card by its itemprop attribute
    for item in soup.select('#js-search-product-items [itemprop="itemListElement"]'):
        data_insight.append({
            'title' : item.find(attrs={'itemprop':'name'}).text,
            'sku'   : item.find(attrs={'itemprop':'sku'}).text,
            'link'  : item.find(attrs={'itemprop':'url'}).text,
            'price' : item.find("span", class_="c-currency__value").text,
            # keep only the first token of the stock text, i.e. the quantity
            'stock' : item.find("p", class_="prod-stock").get_text(strip=True).split(' ')[0]
        })

df_insight = pd.DataFrame(data_insight)
df_insight = df_insight.drop_duplicates()  # drop_duplicates() returns a new DataFrame, so reassign
df_insight
Output

Running the example yields df_insight, a DataFrame with one row per product and the columns title, sku, link, price and stock.