How to extract all the hrefs and src inside specific divs with beautifulsoup python
I want to extract all the hrefs and srcs inside every div on the page that has class = 'news_item'.
The HTML looks like this:
<div class="col">
  <div class="group">
    <h4>News</h4>
    <div class="news_item">
      <a href="www.link.com">
        <h2 class="link">
          here is a link-heading
        </h2>
        <div class="Img">
          <img border="0" src="/image/link" />
        </div>
        <p></p>
      </a>
    </div>
What I want to extract from this is:
www.link.com, "here is a link-heading", and /image/link.
My code is:
def scrape_a(url):
    news_links = soup.select("div.news_item [href]")
    for links in news_links:
        if news_links:
            return 'http://www.web.com' + news_links['href']

def scrape_headings(url):
    for news_headings in soup.select("h2.link"):
        return str(news_headings.string.strip())

def scrape_images(url):
    images = soup.select("div.Img[src]")
    for image in images:
        if images:
            return 'http://www.web.com' + news_links['src']

def top_stories():
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    link = scrape_a(soup)
    heading = scrape_headings(soup)
    image = scrape_images(soup)
    message = {'heading': heading, 'link': link, 'image': image}
    print message
The problem is that it gives me this error:
**TypeError: 'NoneType' object is not callable**
Here is the traceback:
Traceback (most recent call last):
  File "web_parser.py", line 40, in <module>
    top_stories()
  File "web_parser.py", line 32, in top_stories
    link = scrape_a('www.link.com')
  File "web_parser.py", line 10, in scrape_a
    news_links = soup.select_all("div.news_item [href]")
Most of your errors come down to news_links not being found correctly, so you are not getting the tag you expect.
Change this:
news_links = soup.select("div.news_item [href]")
for links in news_links:
    if news_links:
        return 'http://www.web.com' + news_links['href']
to this and see if it helps (note that class is a reserved word in Python, so BeautifulSoup spells the keyword argument class_):
news_links = soup.find_all("div", class_="news_item")
for links in news_links:
    if links.find("a"):
        return 'http://www.web.com' + links.find("a").get('href')
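For context, here is a minimal sketch (with a throwaway one-line HTML string of my own, not from the question) of why the subscripting in the original scrape_a would fail even when select succeeds: select and find_all return a list of tags, so indexing the whole result with a string key raises a TypeError; you have to index into the list, or use the loop variable, to reach a single tag.

from bs4 import BeautifulSoup

html = '<div class="news_item"><a href="www.link.com">x</a></div>'
soup = BeautifulSoup(html, "html.parser")

results = soup.select("div.news_item [href]")  # a list of matching tags
# results['href']           # TypeError: list indices must be integers, not str
print results[0]['href']    # www.link.com -- index into the list first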
Also note that your return statement will give you something like http://www.web.comwww.link.com, which is probably not what you want.
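To make that concatenation point concrete, a quick sketch using the Python 2 urlparse module, which the answer further down uses for exactly this reason (in Python 3 the same function lives in urllib.parse):

from urlparse import urljoin

base = 'http://www.web.com'
print base + 'www.link.com'          # http://www.web.comwww.link.com -- strings just glued together
print urljoin(base, '/image/link')   # http://www.web.com/image/link -- resolved against the base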
You should grab all the news items in one pass and then iterate over them. That makes it easy to organize the data you get into manageable chunks (dicts, in this case). Try something like this:
url = "http://www.web.com"
r = requests.get(url)
soup = BeautifulSoup(r.text)

messages = []
news_links = soup.select("div.news_item")  # selects all .news_item's
for l in news_links:
    message = {}
    message['heading'] = l.find("h2").text.strip()
    link = l.find("a")
    if not link:
        continue
    message['link'] = link['href']
    image = l.find('img')
    if not image:
        continue
    message['image'] = "http://www.web.com{}".format(image['src'])
    messages.append(message)

print messages
Your idea of splitting the task into separate methods is a good one: easy to read, change, and reuse.
As for the error, it is nearly diagnosed already: the traceback shows select_all, which does not exist in BeautifulSoup and does not appear in the code you posted, among other inconsistencies... Long story short, I would do it like this:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from urlparse import urljoin
import requests

def news_links(url, soup):
    links = []
    for text in soup.select("div.news_item"):
        for x in text.find_all(href=True):
            links.append(urljoin(url, x['href']))
    return links

def news_headings(soup):
    headings = []
    for news_heading in soup.select("h2.link"):
        headings.append(str(news_heading.string.strip()))
    return headings

def news_images(url, soup):
    sources = []
    for image in soup.select("img[src]"):
        sources.append(urljoin(url, image['src']))
    return sources

def top_stories():
    url = 'http://www.web.com/'
    r = requests.get(url)
    content = r.content
    soup = BeautifulSoup(content)
    message = {'heading': news_headings(soup),
               'link': news_links(url, soup),
               'image': news_images(url, soup)}
    return message

print top_stories()
BeautifulSoup is quite robust: trying to find or select something that does not exist returns an empty list rather than blowing up. It looks like you are parsing a list of items, and the code above is well suited to that.
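As a small illustration of that robustness (a throwaway snippet of my own, not from the question):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div class="news_item"></div>', "html.parser")
print soup.select("h2.link")   # [] -- a missing selector gives an empty list, not an error
print soup.find("img")         # None -- find gives None instead of raising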