在 div 中抓取链接
Crawling links in a div
我正在用 Python 和 Beautiful Soup 编写网络爬虫。我想从某个 div 中获取链接,但我现在的代码没有打印任何内容。
import requests
from bs4 import BeautifulSoup
def spider(max_pages):
    """Fetch the thenewboston.com home page and print one value per matching <div>.

    The block structure below is reconstructed; the statements themselves are
    kept exactly as the asker wrote them, because this is the code the
    question is about.

    Args:
        max_pages: Number of passes; the loop runs while ``page <= max_pages``.
            The URL does not depend on ``page``, so every pass requests the
            same page.
    """
    page = 1
    while page <= max_pages:
        url = 'https://thenewboston.com/'
        source = requests.get(url)
        plain_text = source.text
        obj = BeautifulSoup(plain_text, "html5lib")
        # Only <div> elements carrying the class "videos-top-courses" are visited.
        for link in obj.find_all('div', {'class': 'videos-top-courses'}):
            # The comma builds a 2-tuple (base URL, the element's own "href"
            # attribute); it does not concatenate the two values.
            href = 'https://thenewboston.com/', link.get('href')
            print(href)
        page += 1
spider(1)
您可以使用类似下面的代码:
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Download the home page and parse it; the with-block closes the HTTP
# response as soon as BeautifulSoup has read it.
with urlopen("https://thenewboston.com/") as response:
    soup = BeautifulSoup(response, 'html.parser')

# Select the <td class="video-icon-column"> cells and print the href of the
# first link that has one in each cell.
videos = soup.find_all('td', {'class': 'video-icon-column'})
for td in videos:
    # href=True skips anchors without an href; cells with no such anchor are
    # ignored instead of raising on td.a['href'].
    anchor = td.find('a', href=True)
    if anchor is not None:
        print(anchor['href'])
您必须查找 <table> 而不是 <div>,然后再在其中查找 <a> 以获得 href。
import requests
from bs4 import BeautifulSoup
def spider(max_pages):
    """Print the href of every link inside the "videos-top-courses" tables.

    The thenewboston.com home page is downloaded and parsed once per pass.
    The URL does not depend on the pass number, so every pass requests the
    same page.

    Args:
        max_pages: Number of fetch-and-print passes (an integer); values
            below 1 do nothing.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'https://thenewboston.com/'
    for _ in range(max_pages):
        # A timeout keeps an unresponsive server from hanging the crawler.
        response = requests.get(url, timeout=10)
        # Fail loudly on 4xx/5xx instead of parsing an error page and
        # silently printing nothing.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html5lib")
        for table in soup.find_all('table', {'class': 'videos-top-courses'}):
            # href=True skips anchors without an href, which would print None.
            for anchor in table.find_all('a', href=True):
                print(anchor['href'])
spider(1)
我正在用 Python 和 Beautiful Soup 编写网络爬虫。我想从某个 div 中获取链接,但我现在的代码没有打印任何内容。
import requests
from bs4 import BeautifulSoup
def spider(max_pages):
    """Fetch the thenewboston.com home page and print one value per matching <div>.

    The block structure below is reconstructed; the statements themselves are
    kept exactly as the asker wrote them, because this is the code the
    question is about.

    Args:
        max_pages: Number of passes; the loop runs while ``page <= max_pages``.
            The URL does not depend on ``page``, so every pass requests the
            same page.
    """
    page = 1
    while page <= max_pages:
        url = 'https://thenewboston.com/'
        source = requests.get(url)
        plain_text = source.text
        obj = BeautifulSoup(plain_text, "html5lib")
        # Only <div> elements carrying the class "videos-top-courses" are visited.
        for link in obj.find_all('div', {'class': 'videos-top-courses'}):
            # The comma builds a 2-tuple (base URL, the element's own "href"
            # attribute); it does not concatenate the two values.
            href = 'https://thenewboston.com/', link.get('href')
            print(href)
        page += 1
spider(1)
您可以使用类似下面的代码:
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Download the home page and parse it; the with-block closes the HTTP
# response as soon as BeautifulSoup has read it.
with urlopen("https://thenewboston.com/") as response:
    soup = BeautifulSoup(response, 'html.parser')

# Select the <td class="video-icon-column"> cells and print the href of the
# first link that has one in each cell.
videos = soup.find_all('td', {'class': 'video-icon-column'})
for td in videos:
    # href=True skips anchors without an href; cells with no such anchor are
    # ignored instead of raising on td.a['href'].
    anchor = td.find('a', href=True)
    if anchor is not None:
        print(anchor['href'])
您必须查找 <table> 而不是 <div>,然后再在其中查找 <a> 以获得 href。
import requests
from bs4 import BeautifulSoup
def spider(max_pages):
    """Print the href of every link inside the "videos-top-courses" tables.

    The thenewboston.com home page is downloaded and parsed once per pass.
    The URL does not depend on the pass number, so every pass requests the
    same page.

    Args:
        max_pages: Number of fetch-and-print passes (an integer); values
            below 1 do nothing.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'https://thenewboston.com/'
    for _ in range(max_pages):
        # A timeout keeps an unresponsive server from hanging the crawler.
        response = requests.get(url, timeout=10)
        # Fail loudly on 4xx/5xx instead of parsing an error page and
        # silently printing nothing.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html5lib")
        for table in soup.find_all('table', {'class': 'videos-top-courses'}):
            # href=True skips anchors without an href, which would print None.
            for anchor in table.find_all('a', href=True):
                print(anchor['href'])
spider(1)