如何使用 Python 获取 HTML 页面中的标题和 URL
How to get the title and URL from an HTML page with Python
我想定位到页面数据中的 `department` 部分,并且只选取/打印其中的 `name` 和 `url`。我尝试了下面的方法,但不知道如何进入 `department` 并选出这两个特定的字段。如何获取所有链接的名称(name)和 url?
import json
import urllib.request
from bs4 import BeautifulSoup
def getContent():
    """Fetch the target page and print the ``name`` and ``href`` of every link.

    For each ``<a>`` element in the downloaded HTML, two lines are printed:
    ``Title: <name attribute>`` and ``href: <href attribute>``.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # urllib requires an explicit scheme; a bare "www.xyz.com" raises
    # ValueError("unknown url type").
    url = "http://www.xyz.com"
    request = urllib.request.Request(url)
    # Read the whole page inside a context manager so the HTTP response is
    # closed even if reading fails.
    with urllib.request.urlopen(request) as response:
        htmlpage = response.read()
    bsoup = BeautifulSoup(htmlpage, "html.parser")

    # Earlier attempts, kept for reference:
    # print(bsoup.prettify())
    # main_table = bsoup.find("div",attrs)
    # print(main_table)
    # print(bsoup.find_all('name'))
    # nav = bsoup.nav
    # print(bsoup.title.department.url)
    # for url in find_all('a'):
    #     print(url.get('href'))

    for link in bsoup.find_all("a"):
        # NOTE(review): get("name") reads the tag's ``name`` attribute, not
        # the link text, so this presumably prints None for most anchors.
        print("Title: {}".format(link.get("name")))
        print("href: {}".format(link.get("href")))
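该页面把部门列表以 JSON-LD 的形式嵌入在 `<script type="application/ld+json">` 标签中,因此您可以使用 `json` 模块解析这段内容,再从 `department` 中取出每一项的 `name` 和 `url`,如下所示: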
import json
import urllib.request
from bs4 import BeautifulSoup
def get_content(url="http://www.ucdenver.edu/pages/ucdwelcomepage.aspx"):
    """Print the name and URL of every department listed in a page's JSON-LD.

    The page is downloaded, its first ``<script type="application/ld+json">``
    element is parsed as JSON, and each entry of the top-level
    ``"department"`` list is printed as one line: the entry's ``"name"``
    left-aligned in a 60-character column, followed by its ``"url"``.

    Args:
        url: Address of the page to fetch. Defaults to the page used in the
            original example.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        ValueError: If the page has no non-empty JSON-LD script element, or
            its content is not valid JSON.
        KeyError: If the JSON has no ``"department"`` key, or an entry lacks
            ``"name"`` or ``"url"``.
    """
    request = urllib.request.Request(url)
    # Read the body inside a context manager so the HTTP response is closed
    # even if reading fails.
    with urllib.request.urlopen(request) as response:
        html_page = response.read()

    soup = BeautifulSoup(html_page, "html.parser")
    script = soup.find("script", type="application/ld+json")
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on ``None.string``.
    if script is None or not script.string:
        raise ValueError(
            "no application/ld+json script found in {}".format(url)
        )

    json_data = json.loads(script.string)
    for data in json_data["department"]:
        print("{:<60} {}".format(data["name"], data["url"]))


if __name__ == "__main__":
    get_content()
输出:
Center for Undergraduate Exploration and Advising https://www.ucdenver.edu/center-for-undergraduate-exploration-and-advising
Commencement https://www.ucdenver.edu/commencement
Counseling Center https://www.ucdenver.edu/counseling-center
First Year Experiences https://www.ucdenver.edu/first-year-experiences
Health Programs https://www.ucdenver.edu/programs/health-programs
Housing and Dining https://www.ucdenver.edu/housing-and-dining
...
我想定位到页面数据中的 `department` 部分,并且只选取/打印其中的 `name` 和 `url`。我尝试了下面的方法,但不知道如何进入 `department` 并选出这两个特定的字段。如何获取所有链接的名称(name)和 url?
import json
import urllib.request
from bs4 import BeautifulSoup
def getContent():
    """Fetch the target page and print the ``name`` and ``href`` of every link.

    For each ``<a>`` element in the downloaded HTML, two lines are printed:
    ``Title: <name attribute>`` and ``href: <href attribute>``.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # urllib requires an explicit scheme; a bare "www.xyz.com" raises
    # ValueError("unknown url type").
    url = "http://www.xyz.com"
    request = urllib.request.Request(url)
    # Read the whole page inside a context manager so the HTTP response is
    # closed even if reading fails.
    with urllib.request.urlopen(request) as response:
        htmlpage = response.read()
    bsoup = BeautifulSoup(htmlpage, "html.parser")

    # Earlier attempts, kept for reference:
    # print(bsoup.prettify())
    # main_table = bsoup.find("div",attrs)
    # print(main_table)
    # print(bsoup.find_all('name'))
    # nav = bsoup.nav
    # print(bsoup.title.department.url)
    # for url in find_all('a'):
    #     print(url.get('href'))

    for link in bsoup.find_all("a"):
        # NOTE(review): get("name") reads the tag's ``name`` attribute, not
        # the link text, so this presumably prints None for most anchors.
        print("Title: {}".format(link.get("name")))
        print("href: {}".format(link.get("href")))
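该页面把部门列表以 JSON-LD 的形式嵌入在 `<script type="application/ld+json">` 标签中,因此您可以使用 `json` 模块解析这段内容,再从 `department` 中取出每一项的 `name` 和 `url`,如下所示: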
import json
import urllib.request
from bs4 import BeautifulSoup
def get_content(url="http://www.ucdenver.edu/pages/ucdwelcomepage.aspx"):
    """Print the name and URL of every department listed in a page's JSON-LD.

    The page is downloaded, its first ``<script type="application/ld+json">``
    element is parsed as JSON, and each entry of the top-level
    ``"department"`` list is printed as one line: the entry's ``"name"``
    left-aligned in a 60-character column, followed by its ``"url"``.

    Args:
        url: Address of the page to fetch. Defaults to the page used in the
            original example.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        ValueError: If the page has no non-empty JSON-LD script element, or
            its content is not valid JSON.
        KeyError: If the JSON has no ``"department"`` key, or an entry lacks
            ``"name"`` or ``"url"``.
    """
    request = urllib.request.Request(url)
    # Read the body inside a context manager so the HTTP response is closed
    # even if reading fails.
    with urllib.request.urlopen(request) as response:
        html_page = response.read()

    soup = BeautifulSoup(html_page, "html.parser")
    script = soup.find("script", type="application/ld+json")
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on ``None.string``.
    if script is None or not script.string:
        raise ValueError(
            "no application/ld+json script found in {}".format(url)
        )

    json_data = json.loads(script.string)
    for data in json_data["department"]:
        print("{:<60} {}".format(data["name"], data["url"]))


if __name__ == "__main__":
    get_content()
输出:
Center for Undergraduate Exploration and Advising https://www.ucdenver.edu/center-for-undergraduate-exploration-and-advising
Commencement https://www.ucdenver.edu/commencement
Counseling Center https://www.ucdenver.edu/counseling-center
First Year Experiences https://www.ucdenver.edu/first-year-experiences
Health Programs https://www.ucdenver.edu/programs/health-programs
Housing and Dining https://www.ucdenver.edu/housing-and-dining
...