访问不统一的 <dt></dt> <dd></dd> 标签
Accessing non-uniform <dt></dt> <dd></dd> tags
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
# Scrape the Walmart search results for "marvel" and collect one row per
# product tile into a pandas DataFrame. Every column receives exactly one
# value per tile ("" when the field is missing) so that all columns stay the
# same length and pd.DataFrame(data) can be built.


def _text_or_blank(node):
    """Return the stripped text of a BeautifulSoup node, or "" if it is None."""
    return node.text.strip() if node is not None else ""


# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() prevents parsing an HTTP error page as search results.
r = requests.get(
    "http://www.walmart.com/search/?query=marvel&cat_id=4096_530598",
    timeout=30,
)
r.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed.
soup = BeautifulSoup(r.content, "html.parser")
g_data = soup.find_all("div", {"class": "tile-content"})
data = defaultdict(list)
for tile in g_data:
    # Each "tile" holds the markup for a single product.

    # Product title (kept unstripped, as before).
    title = tile.find("a", "js-product-title")
    data['Product Title'].append(title.text if title is not None else "")

    # Price.
    price = tile.find('span', 'price price-display')
    data['Price'].append(_text_or_blank(price))

    # Star rating: the readable text lives in a visually hidden span.
    stars_div = tile.find("div", {"class": "stars stars-small tile-row"})
    stars_span = None
    if stars_div is not None:
        stars_span = stars_div.find('span', 'visuallyhidden')
    data['Stars'].append(_text_or_blank(stars_span))

    # Starring.
    dd_starring = tile.find(
        'dd',
        {"class": "media-details-multi-line media-details-artist-dd module"},
    )
    data['Starring'].append(_text_or_blank(dd_starring))

    # Running time: the second <dd> of the first details list that has one.
    # Exactly one value is appended per tile; appending inside the loops
    # could add zero or several values and misalign the columns.
    running_time = ""
    for details in tile.find_all(
            'dl', {"class": "media-details dl-horizontal copy-mini"}):
        dd_items = details.find_all('dd')
        if len(dd_items) > 1:
            running_time = dd_items[1].text.strip()
            break
    data['Running Time'].append(running_time)

    # Format: the second multi-line <dd>, when present. The previous code
    # referenced an undefined name here, so the column was never filled.
    dd_format = tile.find_all('dd', {"class": "media-details-multi-line"})
    if len(dd_format) > 1:
        data['Format'].append(dd_format[1].text.strip())
    else:
        data['Format'].append("")

    # Shipping: find/find_all never raise when nothing matches, so test the
    # result instead of relying on an exception.
    # NOTE(review): a tile without a "not eligible" marker is treated as
    # free shipping, following the original try/except intent -- confirm.
    not_eligible = tile.find(
        'div', {"data-offer-shipping-pass-eligible": "false"})
    if not_eligible is not None:
        data['Shipping'].append("")
    else:
        data['Shipping'].append("Free Shipping")

df = pd.DataFrame(data)
# Bare expression: shows the frame when run in a notebook or REPL.
df
我想访问没有 class 名称的 <dd> 标签,该如何访问?
例如第 11 行有 5 个 <dd>,其中包含导演字段,而其他少数行还有发布日期。
目前我用 [1:2] 这样的切片来访问它,但这种方式不灵活,也无法正确填充我的表格。
有没有什么函数可以做到这一点?
将主演(Starring)和片长(Running Time)部分替换为:
# Fragment meant to run inside the "for tile in g_data:" loop; it uses the
# loop variable "tile" and the "data" mapping defined by the main script.
# Explicit None checks replace the bare "except:" so that only a missing
# element yields "", and unrelated errors are no longer hidden.

# Starring: match on the single class that identifies the artist <dd>.
dd_starring = tile.find('dd', {"class": "media-details-artist-dd"})
if dd_starring is not None:
    data['Starring'].append(dd_starring.text.strip())
else:
    data['Starring'].append("")

# Running time: locate the labelled <dt>, then take the next <dd> after it.
running = tile.find('dt', {'class': 'media-details-running-time'})
running_time = running.find_next("dd") if running is not None else None
if running_time is not None:
    # Stripped for consistency with the other text columns.
    data['Running Time'].append(running_time.text.strip())
else:
    data['Running Time'].append("")
现在应该可以运行了。用 BeautifulSoup 同时按多个 class 选择时似乎容易出问题,因此可以只用 CSS class media-details-artist-dd 来获取演员。
至于片长(Running Time),我用了一个简单的技巧 :)
编辑:修改了代码,先查找片长对应的 dt,
然后用 find_next 取其后的下一个 dd。之前的代码有多余的部分,
现在应该可以了。
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
# Scrape the Walmart search results for "marvel" and collect one row per
# product tile into a pandas DataFrame. Every column receives exactly one
# value per tile ("" when the field is missing) so that all columns stay the
# same length and pd.DataFrame(data) can be built.


def _text_or_blank(node):
    """Return the stripped text of a BeautifulSoup node, or "" if it is None."""
    return node.text.strip() if node is not None else ""


# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() prevents parsing an HTTP error page as search results.
r = requests.get(
    "http://www.walmart.com/search/?query=marvel&cat_id=4096_530598",
    timeout=30,
)
r.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed.
soup = BeautifulSoup(r.content, "html.parser")
g_data = soup.find_all("div", {"class": "tile-content"})
data = defaultdict(list)
for tile in g_data:
    # Each "tile" holds the markup for a single product.

    # Product title (kept unstripped, as before).
    title = tile.find("a", "js-product-title")
    data['Product Title'].append(title.text if title is not None else "")

    # Price.
    price = tile.find('span', 'price price-display')
    data['Price'].append(_text_or_blank(price))

    # Star rating: the readable text lives in a visually hidden span.
    stars_div = tile.find("div", {"class": "stars stars-small tile-row"})
    stars_span = None
    if stars_div is not None:
        stars_span = stars_div.find('span', 'visuallyhidden')
    data['Stars'].append(_text_or_blank(stars_span))

    # Starring.
    dd_starring = tile.find(
        'dd',
        {"class": "media-details-multi-line media-details-artist-dd module"},
    )
    data['Starring'].append(_text_or_blank(dd_starring))

    # Running time: the second <dd> of the first details list that has one.
    # Exactly one value is appended per tile; appending inside the loops
    # could add zero or several values and misalign the columns.
    running_time = ""
    for details in tile.find_all(
            'dl', {"class": "media-details dl-horizontal copy-mini"}):
        dd_items = details.find_all('dd')
        if len(dd_items) > 1:
            running_time = dd_items[1].text.strip()
            break
    data['Running Time'].append(running_time)

    # Format: the second multi-line <dd>, when present. The previous code
    # referenced an undefined name here, so the column was never filled.
    dd_format = tile.find_all('dd', {"class": "media-details-multi-line"})
    if len(dd_format) > 1:
        data['Format'].append(dd_format[1].text.strip())
    else:
        data['Format'].append("")

    # Shipping: find/find_all never raise when nothing matches, so test the
    # result instead of relying on an exception.
    # NOTE(review): a tile without a "not eligible" marker is treated as
    # free shipping, following the original try/except intent -- confirm.
    not_eligible = tile.find(
        'div', {"data-offer-shipping-pass-eligible": "false"})
    if not_eligible is not None:
        data['Shipping'].append("")
    else:
        data['Shipping'].append("Free Shipping")

df = pd.DataFrame(data)
# Bare expression: shows the frame when run in a notebook or REPL.
df
我想访问没有 class 名称的 <dd> 标签,该如何访问?
例如第 11 行有 5 个 <dd>,其中包含导演字段,而其他少数行还有发布日期。
目前我用 [1:2] 这样的切片来访问它,但这种方式不灵活,也无法正确填充我的表格。
有没有什么函数可以做到这一点?
将主演(Starring)和片长(Running Time)部分替换为:
# Fragment meant to run inside the "for tile in g_data:" loop; it uses the
# loop variable "tile" and the "data" mapping defined by the main script.
# Explicit None checks replace the bare "except:" so that only a missing
# element yields "", and unrelated errors are no longer hidden.

# Starring: match on the single class that identifies the artist <dd>.
dd_starring = tile.find('dd', {"class": "media-details-artist-dd"})
if dd_starring is not None:
    data['Starring'].append(dd_starring.text.strip())
else:
    data['Starring'].append("")

# Running time: locate the labelled <dt>, then take the next <dd> after it.
running = tile.find('dt', {'class': 'media-details-running-time'})
running_time = running.find_next("dd") if running is not None else None
if running_time is not None:
    # Stripped for consistency with the other text columns.
    data['Running Time'].append(running_time.text.strip())
else:
    data['Running Time'].append("")
现在应该可以运行了。用 BeautifulSoup 同时按多个 class 选择时似乎容易出问题,因此可以只用 CSS class media-details-artist-dd 来获取演员。
至于片长(Running Time),我用了一个简单的技巧 :)
编辑:修改了代码,先查找片长对应的 dt,
然后用 find_next 取其后的下一个 dd。之前的代码有多余的部分,
现在应该可以了。