使用维基百科 API 提取表格(table)数据
Extracting table data from wikipedia API
我需要从维基百科中提取任意政客页面右侧的信息框(infobox)表格。
为此,我尝试使用维基百科 API。但是我无法提取 table 数据。
到目前为止我尝试过的代码如下:
import wikipedia
# Use the first search result as the page title to look up.
person = wikipedia.search("Rahul Gandhi")[0]
person # echoes "Rahul Gandhi" when run in an interactive session
wikipedia.summary(person) # the plain-text summary is available
page = wikipedia.page(person)
page.url # echoes the page URL when run in an interactive session
print(page.content) # prints the complete text content, but not the tables
我也尝试过抓取 tables,但是很难以结构化形式获取数据。
import urllib3
import requests
from bs4 import BeautifulSoup
# Fetch the article and print, for every infobox row, the text of its first
# <th>, first <a> and first <td> (whichever of them exist).
url = "https://en.wikipedia.org/wiki/Rahul_Gandhi"
session = requests.Session()
# Wikimedia asks clients to send a descriptive User-Agent; put your own
# project name and contact address here.
session.headers["User-Agent"] = "infobox-scraper-example/0.1 (replace-with-contact@example.org)"
# Certificate verification stays enabled (the default); the timeout keeps the
# script from hanging forever on a stalled connection.
response = session.get(url, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, "lxml")
# `class_` matches any table that has the `infobox` class, regardless of
# which other classes (vcard, biography, ...) accompany it.
table = soup.find("table", class_="infobox")
if table is None:
    raise SystemExit("No infobox table found on " + url)
info = table.find_all('tr')
for row in info:
    content = []
    for tag in ('th', 'a', 'td'):
        cell = row.find(tag)
        if cell is not None:
            content.append(cell.text)
    print(content)
# Output :
['Rahul Gandhi']
['', 'Gandhi in May 2019']
['']
['President of the Indian National Congress', 'President of the Indian National Congress']
['In office16 December 2017\xa0– 10 August 2019']
['Preceded by', 'Sonia Gandhi', 'Sonia Gandhi']
['Succeeded by', 'Sonia Gandhi', 'Sonia Gandhi (Interim)']
['Member of Parliament, Lok Sabha', 'Member of Parliament, Lok Sabha']
['Incumbent', 'Incumbent']
['Assumed office 23 May 2019']
['Preceded by', 'M. I. Shanavas', 'M. I. Shanavas']
['Constituency', 'Wayanad', 'Wayanad, Kerala']
['In office17 May 2004\xa0– 23 May 2019']
['Preceded by', 'Sonia Gandhi', 'Sonia Gandhi']
['Succeeded by', 'Smriti Irani', 'Smriti Irani']
['Constituency', 'Amethi', 'Amethi, Uttar Pradesh']
['Vice-President of the Indian National Congress', 'Indian National Congress']
['In office19 January 2013\xa0– 16 December 2017']
['President', 'Sonia Gandhi', 'Sonia Gandhi']
['Preceded by', 'Position established']
['Succeeded by', 'Position abolished']
['General Secretary of Indian National Congress', 'Indian National Congress']
['In office25 September 2007\xa0– 19 January 2013']
['President', 'Sonia Gandhi', 'Sonia Gandhi']
['Chair of Indian Youth Congress', 'Indian Youth Congress']
['Incumbent', 'Incumbent']
['Assumed office 25 September 2007']
['Preceded by', 'Position established']
['Chair of National Students’ Union of India', 'National Students’ Union of India']
['Incumbent', 'Incumbent']
['Assumed office 25 September 2007']
['Preceded by', 'Position established']
['\n']
['Personal details']
['Born', ' (1970-06-19) 19 June 1970 (age\xa050)New Delhi, India']
['Political party', 'Indian National Congress', 'Indian National Congress']
['Parents', 'Rajiv Gandhi', 'Rajiv GandhiSonia Gandhi']
['Relatives', 'Nehru–Gandhi family', 'Nehru–Gandhi family']
['Education', "St. Stephen's College, Delhi", "St. Stephen's College, DelhiHarvard UniversityRollins College (BA)Trinity College, Cambridge (MPhil)"]
['Signature', '', '']
['Website', 'Official website', 'Official website']
我利用每一行中单元格的标签类型(th/td)和单元格数量,编写了下面的代码:
import urllib3
import requests
from bs4 import BeautifulSoup
import json
def _clean(text):
    """Trim surrounding whitespace and turn non-breaking spaces into plain spaces."""
    # Plain `.strip()` (not `get_text(strip=True)`) keeps the '\n' inserted for <br>.
    return text.strip().replace('\u00a0', ' ')


def _links(cell):
    """Return text/href/title for every <a> found inside `cell`."""
    return [{
        'text': a.text,
        'href': a.get('href', ''),
        'title': a.get('title', ''),
    } for a in cell.find_all('a')]


def _images(cell):
    """Return src/width/height/alt for every <img> found inside `cell`."""
    return [{
        'src': img.get('src'),
        'width': img.get('width', ''),
        'height': img.get('height', ''),
        'alt': img.get('alt'),
    } for img in cell.find_all('img')]


def _entry(kind, key, text, cell):
    """Build one node; every node carries the same fields, even when empty."""
    return {
        'type': kind,
        'key': key,
        'text': text,
        'links': _links(cell),
        'images': _images(cell),
        'items': {},
    }


url = 'https://en.wikipedia.org/wiki/Rahul_Gandhi'
# Other politician pages can be tried the same way,
# e.g. https://en.wikipedia.org/wiki/Sonia_Gandhi

session = requests.Session()
# Wikimedia asks clients to send a descriptive User-Agent; put your own
# project name and contact address here.
session.headers['User-Agent'] = 'infobox-scraper-example/0.1 (replace-with-contact@example.org)'
# Certificate verification stays enabled (the default); the timeout keeps the
# script from hanging forever on a stalled connection.
response = session.get(url, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'lxml')

# `class_` matches any table that has the `infobox` class, regardless of
# which other classes (vcard, biography, ...) accompany it.
table = soup.find('table', class_='infobox')
if table is None:
    raise SystemExit('No infobox table found on ' + url)

# ---

content = {}       # top-level sections keyed by header text
header1 = None     # text of the most recent <th>-only row (section)
header2 = None     # text of the most recent <td>-only row (subsection)
current = content  # dict that receives the next key/value rows

for row in table.find_all('tr'):
    # Only direct <th>/<td> cells count; whitespace text nodes between
    # cells must not change the row's shape.
    cells = row.find_all(['th', 'td'], recursive=False)

    # replace '<br>' with '\n' so multi-line cells keep their line breaks
    for cell in cells:
        for br in cell.find_all('br'):
            br.replace_with('\n' + br.text)

    # headers/subheaders (sections/subsections): rows with a single cell
    if len(cells) == 1:
        cell = cells[0]

        # skip empty rows
        if not cell.decode_contents().strip():
            continue

        text = _clean(cell.get_text())
        section = _entry('header', text, text, cell)

        if cell.name == 'th':
            # a <th>-only row opens a new top-level section
            header1 = text
            content[header1] = section
        else:
            # a <td>-only row opens a subsection of the current section;
            # if no section was opened yet, keep it at the top level
            header2 = text
            parent = content[header1]['items'] if header1 is not None else content
            parent[header2] = section

        current = section['items']  # keep access to add items later

    # items in sections/subsections: rows with a label cell and a value cell
    elif len(cells) == 2:
        key = _clean(cells[0].get_text())
        value = _clean(cells[1].get_text())
        current[key] = _entry('item', key, value, cells[1])

print(json.dumps(content, indent=2))
运行后得到如下输出:
{
"Rahul Gandhi": {
"type": "header",
"key": "Rahul Gandhi",
"text": "Rahul Gandhi",
"links": [],
"images": [],
"items": {
"Gandhi in May 2019": {
"type": "header",
"key": "Gandhi in May 2019",
"text": "Gandhi in May 2019",
"links": [
{
"text": "",
"href": "/wiki/File:Rahul_Gandhi,_Member_of_Parliament,_Wayanad,_Kerala.jpg",
"title": ""
}
],
"images": [
{
"src": "//upload.wikimedia.org/wikipedia/commons/thumb/d/d5/Rahul_Gandhi%2C_Member_of_Parliament%2C_Wayanad%2C_Kerala.jpg/220px-Rahul_Gandhi%2C_Member_of_Parliament%2C_Wayanad%2C_Kerala.jpg",
"width": "220",
"height": "293",
"alt": "Rahul Gandhi, Member of Parliament, Wayanad, Kerala.jpg"
}
],
"items": {}
}
}
},
"President of the Indian National Congress": {
"type": "header",
"key": "President of the Indian National Congress",
"text": "President of the Indian National Congress",
"links": [
{
"text": "President of the Indian National Congress",
"href": "/wiki/List_of_Presidents_of_the_Indian_National_Congress",
"title": "List of Presidents of the Indian National Congress"
}
],
"images": [],
"items": {
"In office\n16 December 2017 \u2013 10 August 2019": {
"type": "header",
"key": "In office\n16 December 2017 \u2013 10 August 2019",
"text": "In office\n16 December 2017 \u2013 10 August 2019",
"links": [],
"images": [],
"items": {
"Preceded by": {
"type": "item",
"key": "Preceded by",
"text": "Sonia Gandhi",
"links": [
{
"text": "Sonia Gandhi",
"href": "/wiki/Sonia_Gandhi",
"title": "Sonia Gandhi"
}
],
"images": [],
"items": {}
},
"Succeeded by": {
"type": "item",
"key": "Succeeded by",
"text": "Sonia Gandhi (Interim)",
"links": [
{
"text": "Sonia Gandhi",
"href": "/wiki/Sonia_Gandhi",
"title": "Sonia Gandhi"
}
],
"images": [],
"items": {}
}
}
}
}
},
"Member of Parliament, Lok Sabha": {
"type": "header",
"key": "Member of Parliament, Lok Sabha",
"text": "Member of Parliament, Lok Sabha",
"links": [
{
"text": "Member of Parliament, Lok Sabha",
"href": "/wiki/Member_of_Parliament,_Lok_Sabha",
"title": "Member of Parliament, Lok Sabha"
}
],
"images": [],
"items": {
"Incumbent": {
"type": "header",
"key": "Incumbent",
"text": "Incumbent",
"links": [
{
"text": "Incumbent",
"href": "/wiki/Incumbent",
"title": "Incumbent"
}
],
"images": [],
"items": {}
},
"Assumed office \n23 May 2019": {
"type": "header",
"key": "Assumed office \n23 May 2019",
"text": "Assumed office \n23 May 2019",
"links": [],
"images": [],
"items": {
"Preceded by": {
"type": "item",
"key": "Preceded by",
"text": "M. I. Shanavas",
"links": [
{
"text": "M. I. Shanavas",
"href": "/wiki/M._I._Shanavas",
"title": "M. I. Shanavas"
}
],
"images": [],
"items": {}
},
"Constituency": {
"type": "item",
"key": "Constituency",
"text": "Wayanad, Kerala",
"links": [
{
"text": "Wayanad",
"href": "/wiki/Wayanad_(Lok_Sabha_constituency)",
"title": "Wayanad (Lok Sabha constituency)"
},
{
"text": "Kerala",
"href": "/wiki/Kerala",
"title": "Kerala"
}
],
"images": [],
"items": {}
}
}
},
"In office\n17 May 2004 \u2013 23 May 2019": {
"type": "header",
"key": "In office\n17 May 2004 \u2013 23 May 2019",
"text": "In office\n17 May 2004 \u2013 23 May 2019",
"links": [],
"images": [],
"items": {
"Preceded by": {
"type": "item",
"key": "Preceded by",
"text": "Sonia Gandhi",
"links": [
{
"text": "Sonia Gandhi",
"href": "/wiki/Sonia_Gandhi",
"title": "Sonia Gandhi"
}
],
"images": [],
"items": {}
},
"Succeeded by": {
"type": "item",
"key": "Succeeded by",
"text": "Smriti Irani",
"links": [
{
"text": "Smriti Irani",
"href": "/wiki/Smriti_Irani",
"title": "Smriti Irani"
}
],
"images": [],
"items": {}
},
"Constituency": {
"type": "item",
"key": "Constituency",
"text": "Amethi, Uttar Pradesh",
"links": [
{
"text": "Amethi",
"href": "/wiki/Amethi_(Lok_Sabha_constituency)",
"title": "Amethi (Lok Sabha constituency)"
},
{
"text": "Uttar Pradesh",
"href": "/wiki/Uttar_Pradesh",
"title": "Uttar Pradesh"
}
],
"images": [],
"items": {}
}
}
}
}
},
"Vice-President of the Indian National Congress": {
"type": "header",
"key": "Vice-President of the Indian National Congress",
"text": "Vice-President of the Indian National Congress",
"links": [
{
"text": "Indian National Congress",
"href": "/wiki/Indian_National_Congress",
"title": "Indian National Congress"
}
],
"images": [],
"items": {
"In office\n19 January 2013 \u2013 16 December 2017": {
"type": "header",
"key": "In office\n19 January 2013 \u2013 16 December 2017",
"text": "In office\n19 January 2013 \u2013 16 December 2017",
"links": [],
"images": [],
"items": {
"President": {
"type": "item",
"key": "President",
"text": "Sonia Gandhi",
"links": [
{
"text": "Sonia Gandhi",
"href": "/wiki/Sonia_Gandhi",
"title": "Sonia Gandhi"
}
],
"images": [],
"items": {}
},
"Preceded by": {
"type": "item",
"key": "Preceded by",
"text": "Position established",
"links": [],
"images": [],
"items": {}
},
"Succeeded by": {
"type": "item",
"key": "Succeeded by",
"text": "Position abolished",
"links": [],
"images": [],
"items": {}
}
}
}
}
},
"General Secretary of Indian National Congress": {
"type": "header",
"key": "General Secretary of Indian National Congress",
"text": "General Secretary of Indian National Congress",
"links": [
{
"text": "Indian National Congress",
"href": "/wiki/Indian_National_Congress",
"title": "Indian National Congress"
}
],
"images": [],
"items": {
"In office\n25 September 2007 \u2013 19 January 2013": {
"type": "header",
"key": "In office\n25 September 2007 \u2013 19 January 2013",
"text": "In office\n25 September 2007 \u2013 19 January 2013",
"links": [],
"images": [],
"items": {
"President": {
"type": "item",
"key": "President",
"text": "Sonia Gandhi",
"links": [
{
"text": "Sonia Gandhi",
"href": "/wiki/Sonia_Gandhi",
"title": "Sonia Gandhi"
}
],
"images": [],
"items": {}
}
}
}
}
},
"Chair of Indian Youth Congress": {
"type": "header",
"key": "Chair of Indian Youth Congress",
"text": "Chair of Indian Youth Congress",
"links": [
{
"text": "Indian Youth Congress",
"href": "/wiki/Indian_Youth_Congress",
"title": "Indian Youth Congress"
}
],
"images": [],
"items": {
"Incumbent": {
"type": "header",
"key": "Incumbent",
"text": "Incumbent",
"links": [
{
"text": "Incumbent",
"href": "/wiki/Incumbent",
"title": "Incumbent"
}
],
"images": [],
"items": {}
},
"Assumed office \n25 September 2007": {
"type": "header",
"key": "Assumed office \n25 September 2007",
"text": "Assumed office \n25 September 2007",
"links": [],
"images": [],
"items": {
"Preceded by": {
"type": "item",
"key": "Preceded by",
"text": "Position established",
"links": [],
"images": [],
"items": {}
}
}
}
}
},
"Chair of National Students\u2019 Union of India": {
"type": "header",
"key": "Chair of National Students\u2019 Union of India",
"text": "Chair of National Students\u2019 Union of India",
"links": [
{
"text": "National Students\u2019 Union of India",
"href": "/wiki/National_Students%E2%80%99_Union_of_India",
"title": "National Students\u2019 Union of India"
}
],
"images": [],
"items": {
"Incumbent": {
"type": "header",
"key": "Incumbent",
"text": "Incumbent",
"links": [
{
"text": "Incumbent",
"href": "/wiki/Incumbent",
"title": "Incumbent"
}
],
"images": [],
"items": {}
},
"Assumed office \n25 September 2007": {
"type": "header",
"key": "Assumed office \n25 September 2007",
"text": "Assumed office \n25 September 2007",
"links": [],
"images": [],
"items": {
"Preceded by": {
"type": "item",
"key": "Preceded by",
"text": "Position established",
"links": [],
"images": [],
"items": {}
}
}
}
}
},
"Personal details": {
"type": "header",
"key": "Personal details",
"text": "Personal details",
"links": [],
"images": [],
"items": {
"Born": {
"type": "item",
"key": "Born",
"text": "(1970-06-19) 19 June 1970 (age 50)\nNew Delhi, India",
"links": [],
"images": [],
"items": {}
},
"Political party": {
"type": "item",
"key": "Political party",
"text": "Indian National Congress",
"links": [
{
"text": "Indian National Congress",
"href": "/wiki/Indian_National_Congress",
"title": "Indian National Congress"
}
],
"images": [],
"items": {}
},
"Parents": {
"type": "item",
"key": "Parents",
"text": "Rajiv Gandhi\nSonia Gandhi",
"links": [
{
"text": "Rajiv Gandhi",
"href": "/wiki/Rajiv_Gandhi",
"title": "Rajiv Gandhi"
},
{
"text": "Sonia Gandhi",
"href": "/wiki/Sonia_Gandhi",
"title": "Sonia Gandhi"
}
],
"images": [],
"items": {}
},
"Relatives": {
"type": "item",
"key": "Relatives",
"text": "Nehru\u2013Gandhi family",
"links": [
{
"text": "Nehru\u2013Gandhi family",
"href": "/wiki/Nehru%E2%80%93Gandhi_family",
"title": "Nehru\u2013Gandhi family"
}
],
"images": [],
"items": {}
},
"Education": {
"type": "item",
"key": "Education",
"text": "St. Stephen's College, Delhi\nHarvard University\nRollins College (BA)\nTrinity College, Cambridge (MPhil)",
"links": [
{
"text": "St. Stephen's College, Delhi",
"href": "/wiki/St._Stephen%27s_College,_Delhi",
"title": "St. Stephen's College, Delhi"
},
{
"text": "Harvard University",
"href": "/wiki/Harvard_University",
"title": "Harvard University"
},
{
"text": "Rollins College",
"href": "/wiki/Rollins_College",
"title": "Rollins College"
},
{
"text": "BA",
"href": "/wiki/Bachelor_of_Arts",
"title": "Bachelor of Arts"
},
{
"text": "Trinity College, Cambridge",
"href": "/wiki/Trinity_College,_Cambridge",
"title": "Trinity College, Cambridge"
},
{
"text": "MPhil",
"href": "/wiki/Master_of_Philosophy",
"title": "Master of Philosophy"
}
],
"images": [],
"items": {}
},
"Signature": {
"type": "item",
"key": "Signature",
"text": "",
"links": [
{
"text": "",
"href": "/wiki/File:Signature_of_Rahul_Gandhi.svg",
"title": "Rahul Gandhi's signature"
}
],
"images": [
{
"src": "//upload.wikimedia.org/wikipedia/commons/thumb/a/a5/Signature_of_Rahul_Gandhi.svg/128px-Signature_of_Rahul_Gandhi.svg.png",
"width": "128",
"height": "44",
"alt": ""
}
],
"items": {}
},
"Website": {
"type": "item",
"key": "Website",
"text": "Official website",
"links": [
{
"text": "Official website",
"href": "http://rahulgandhi.in",
"title": ""
}
],
"images": [],
"items": {}
}
}
}
}
顺便说一句:
我使用 headers 对项目进行分组,因为 Preceded by 等键会重复出现很多次。
我试图获取有关文本、链接、图像的所有信息,并在所有元素中创建相同的字段,即使它们没有某些值。
我只是不确定用 headers 作为键是否合适——改用由各个部分(section)组成的列表,而不是像 "Member of Parliament, Lok Sabha" 这样的键,可能会更容易处理,因为不同人物的标题各不相同。
我需要从维基百科中提取任意政客页面右侧的信息框(infobox)表格。
为此,我尝试使用维基百科 API。但是我无法提取 table 数据。 到目前为止我尝试过的代码如下:
import wikipedia
# Use the first search result as the page title to look up.
person = wikipedia.search("Rahul Gandhi")[0]
person # echoes "Rahul Gandhi" when run in an interactive session
wikipedia.summary(person) # the plain-text summary is available
page = wikipedia.page(person)
page.url # echoes the page URL when run in an interactive session
print(page.content) # prints the complete text content, but not the tables
我也尝试过抓取 tables,但是很难以结构化形式获取数据。
import urllib3
import requests
from bs4 import BeautifulSoup
# Fetch the article and print, for every infobox row, the text of its first
# <th>, first <a> and first <td> (whichever of them exist).
url = "https://en.wikipedia.org/wiki/Rahul_Gandhi"
session = requests.Session()
# Wikimedia asks clients to send a descriptive User-Agent; put your own
# project name and contact address here.
session.headers["User-Agent"] = "infobox-scraper-example/0.1 (replace-with-contact@example.org)"
# Certificate verification stays enabled (the default); the timeout keeps the
# script from hanging forever on a stalled connection.
response = session.get(url, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, "lxml")
# `class_` matches any table that has the `infobox` class, regardless of
# which other classes (vcard, biography, ...) accompany it.
table = soup.find("table", class_="infobox")
if table is None:
    raise SystemExit("No infobox table found on " + url)
info = table.find_all('tr')
for row in info:
    content = []
    for tag in ('th', 'a', 'td'):
        cell = row.find(tag)
        if cell is not None:
            content.append(cell.text)
    print(content)
# Output :
['Rahul Gandhi']
['', 'Gandhi in May 2019']
['']
['President of the Indian National Congress', 'President of the Indian National Congress']
['In office16 December 2017\xa0– 10 August 2019']
['Preceded by', 'Sonia Gandhi', 'Sonia Gandhi']
['Succeeded by', 'Sonia Gandhi', 'Sonia Gandhi (Interim)']
['Member of Parliament, Lok Sabha', 'Member of Parliament, Lok Sabha']
['Incumbent', 'Incumbent']
['Assumed office 23 May 2019']
['Preceded by', 'M. I. Shanavas', 'M. I. Shanavas']
['Constituency', 'Wayanad', 'Wayanad, Kerala']
['In office17 May 2004\xa0– 23 May 2019']
['Preceded by', 'Sonia Gandhi', 'Sonia Gandhi']
['Succeeded by', 'Smriti Irani', 'Smriti Irani']
['Constituency', 'Amethi', 'Amethi, Uttar Pradesh']
['Vice-President of the Indian National Congress', 'Indian National Congress']
['In office19 January 2013\xa0– 16 December 2017']
['President', 'Sonia Gandhi', 'Sonia Gandhi']
['Preceded by', 'Position established']
['Succeeded by', 'Position abolished']
['General Secretary of Indian National Congress', 'Indian National Congress']
['In office25 September 2007\xa0– 19 January 2013']
['President', 'Sonia Gandhi', 'Sonia Gandhi']
['Chair of Indian Youth Congress', 'Indian Youth Congress']
['Incumbent', 'Incumbent']
['Assumed office 25 September 2007']
['Preceded by', 'Position established']
['Chair of National Students’ Union of India', 'National Students’ Union of India']
['Incumbent', 'Incumbent']
['Assumed office 25 September 2007']
['Preceded by', 'Position established']
['\n']
['Personal details']
['Born', ' (1970-06-19) 19 June 1970 (age\xa050)New Delhi, India']
['Political party', 'Indian National Congress', 'Indian National Congress']
['Parents', 'Rajiv Gandhi', 'Rajiv GandhiSonia Gandhi']
['Relatives', 'Nehru–Gandhi family', 'Nehru–Gandhi family']
['Education', "St. Stephen's College, Delhi", "St. Stephen's College, DelhiHarvard UniversityRollins College (BA)Trinity College, Cambridge (MPhil)"]
['Signature', '', '']
['Website', 'Official website', 'Official website']
我利用每一行中单元格的标签类型(th/td)和单元格数量,编写了下面的代码:
import urllib3
import requests
from bs4 import BeautifulSoup
import json
def _clean(text):
    """Trim surrounding whitespace and turn non-breaking spaces into plain spaces."""
    # Plain `.strip()` (not `get_text(strip=True)`) keeps the '\n' inserted for <br>.
    return text.strip().replace('\u00a0', ' ')


def _links(cell):
    """Return text/href/title for every <a> found inside `cell`."""
    return [{
        'text': a.text,
        'href': a.get('href', ''),
        'title': a.get('title', ''),
    } for a in cell.find_all('a')]


def _images(cell):
    """Return src/width/height/alt for every <img> found inside `cell`."""
    return [{
        'src': img.get('src'),
        'width': img.get('width', ''),
        'height': img.get('height', ''),
        'alt': img.get('alt'),
    } for img in cell.find_all('img')]


def _entry(kind, key, text, cell):
    """Build one node; every node carries the same fields, even when empty."""
    return {
        'type': kind,
        'key': key,
        'text': text,
        'links': _links(cell),
        'images': _images(cell),
        'items': {},
    }


url = 'https://en.wikipedia.org/wiki/Rahul_Gandhi'
# Other politician pages can be tried the same way,
# e.g. https://en.wikipedia.org/wiki/Sonia_Gandhi

session = requests.Session()
# Wikimedia asks clients to send a descriptive User-Agent; put your own
# project name and contact address here.
session.headers['User-Agent'] = 'infobox-scraper-example/0.1 (replace-with-contact@example.org)'
# Certificate verification stays enabled (the default); the timeout keeps the
# script from hanging forever on a stalled connection.
response = session.get(url, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'lxml')

# `class_` matches any table that has the `infobox` class, regardless of
# which other classes (vcard, biography, ...) accompany it.
table = soup.find('table', class_='infobox')
if table is None:
    raise SystemExit('No infobox table found on ' + url)

# ---

content = {}       # top-level sections keyed by header text
header1 = None     # text of the most recent <th>-only row (section)
header2 = None     # text of the most recent <td>-only row (subsection)
current = content  # dict that receives the next key/value rows

for row in table.find_all('tr'):
    # Only direct <th>/<td> cells count; whitespace text nodes between
    # cells must not change the row's shape.
    cells = row.find_all(['th', 'td'], recursive=False)

    # replace '<br>' with '\n' so multi-line cells keep their line breaks
    for cell in cells:
        for br in cell.find_all('br'):
            br.replace_with('\n' + br.text)

    # headers/subheaders (sections/subsections): rows with a single cell
    if len(cells) == 1:
        cell = cells[0]

        # skip empty rows
        if not cell.decode_contents().strip():
            continue

        text = _clean(cell.get_text())
        section = _entry('header', text, text, cell)

        if cell.name == 'th':
            # a <th>-only row opens a new top-level section
            header1 = text
            content[header1] = section
        else:
            # a <td>-only row opens a subsection of the current section;
            # if no section was opened yet, keep it at the top level
            header2 = text
            parent = content[header1]['items'] if header1 is not None else content
            parent[header2] = section

        current = section['items']  # keep access to add items later

    # items in sections/subsections: rows with a label cell and a value cell
    elif len(cells) == 2:
        key = _clean(cells[0].get_text())
        value = _clean(cells[1].get_text())
        current[key] = _entry('item', key, value, cells[1])

print(json.dumps(content, indent=2))
运行后得到如下输出:
{
"Rahul Gandhi": {
"type": "header",
"key": "Rahul Gandhi",
"text": "Rahul Gandhi",
"links": [],
"images": [],
"items": {
"Gandhi in May 2019": {
"type": "header",
"key": "Gandhi in May 2019",
"text": "Gandhi in May 2019",
"links": [
{
"text": "",
"href": "/wiki/File:Rahul_Gandhi,_Member_of_Parliament,_Wayanad,_Kerala.jpg",
"title": ""
}
],
"images": [
{
"src": "//upload.wikimedia.org/wikipedia/commons/thumb/d/d5/Rahul_Gandhi%2C_Member_of_Parliament%2C_Wayanad%2C_Kerala.jpg/220px-Rahul_Gandhi%2C_Member_of_Parliament%2C_Wayanad%2C_Kerala.jpg",
"width": "220",
"height": "293",
"alt": "Rahul Gandhi, Member of Parliament, Wayanad, Kerala.jpg"
}
],
"items": {}
}
}
},
"President of the Indian National Congress": {
"type": "header",
"key": "President of the Indian National Congress",
"text": "President of the Indian National Congress",
"links": [
{
"text": "President of the Indian National Congress",
"href": "/wiki/List_of_Presidents_of_the_Indian_National_Congress",
"title": "List of Presidents of the Indian National Congress"
}
],
"images": [],
"items": {
"In office\n16 December 2017 \u2013 10 August 2019": {
"type": "header",
"key": "In office\n16 December 2017 \u2013 10 August 2019",
"text": "In office\n16 December 2017 \u2013 10 August 2019",
"links": [],
"images": [],
"items": {
"Preceded by": {
"type": "item",
"key": "Preceded by",
"text": "Sonia Gandhi",
"links": [
{
"text": "Sonia Gandhi",
"href": "/wiki/Sonia_Gandhi",
"title": "Sonia Gandhi"
}
],
"images": [],
"items": {}
},
"Succeeded by": {
"type": "item",
"key": "Succeeded by",
"text": "Sonia Gandhi (Interim)",
"links": [
{
"text": "Sonia Gandhi",
"href": "/wiki/Sonia_Gandhi",
"title": "Sonia Gandhi"
}
],
"images": [],
"items": {}
}
}
}
}
},
"Member of Parliament, Lok Sabha": {
"type": "header",
"key": "Member of Parliament, Lok Sabha",
"text": "Member of Parliament, Lok Sabha",
"links": [
{
"text": "Member of Parliament, Lok Sabha",
"href": "/wiki/Member_of_Parliament,_Lok_Sabha",
"title": "Member of Parliament, Lok Sabha"
}
],
"images": [],
"items": {
"Incumbent": {
"type": "header",
"key": "Incumbent",
"text": "Incumbent",
"links": [
{
"text": "Incumbent",
"href": "/wiki/Incumbent",
"title": "Incumbent"
}
],
"images": [],
"items": {}
},
"Assumed office \n23 May 2019": {
"type": "header",
"key": "Assumed office \n23 May 2019",
"text": "Assumed office \n23 May 2019",
"links": [],
"images": [],
"items": {
"Preceded by": {
"type": "item",
"key": "Preceded by",
"text": "M. I. Shanavas",
"links": [
{
"text": "M. I. Shanavas",
"href": "/wiki/M._I._Shanavas",
"title": "M. I. Shanavas"
}
],
"images": [],
"items": {}
},
"Constituency": {
"type": "item",
"key": "Constituency",
"text": "Wayanad, Kerala",
"links": [
{
"text": "Wayanad",
"href": "/wiki/Wayanad_(Lok_Sabha_constituency)",
"title": "Wayanad (Lok Sabha constituency)"
},
{
"text": "Kerala",
"href": "/wiki/Kerala",
"title": "Kerala"
}
],
"images": [],
"items": {}
}
}
},
"In office\n17 May 2004 \u2013 23 May 2019": {
"type": "header",
"key": "In office\n17 May 2004 \u2013 23 May 2019",
"text": "In office\n17 May 2004 \u2013 23 May 2019",
"links": [],
"images": [],
"items": {
"Preceded by": {
"type": "item",
"key": "Preceded by",
"text": "Sonia Gandhi",
"links": [
{
"text": "Sonia Gandhi",
"href": "/wiki/Sonia_Gandhi",
"title": "Sonia Gandhi"
}
],
"images": [],
"items": {}
},
"Succeeded by": {
"type": "item",
"key": "Succeeded by",
"text": "Smriti Irani",
"links": [
{
"text": "Smriti Irani",
"href": "/wiki/Smriti_Irani",
"title": "Smriti Irani"
}
],
"images": [],
"items": {}
},
"Constituency": {
"type": "item",
"key": "Constituency",
"text": "Amethi, Uttar Pradesh",
"links": [
{
"text": "Amethi",
"href": "/wiki/Amethi_(Lok_Sabha_constituency)",
"title": "Amethi (Lok Sabha constituency)"
},
{
"text": "Uttar Pradesh",
"href": "/wiki/Uttar_Pradesh",
"title": "Uttar Pradesh"
}
],
"images": [],
"items": {}
}
}
}
}
},
"Vice-President of the Indian National Congress": {
"type": "header",
"key": "Vice-President of the Indian National Congress",
"text": "Vice-President of the Indian National Congress",
"links": [
{
"text": "Indian National Congress",
"href": "/wiki/Indian_National_Congress",
"title": "Indian National Congress"
}
],
"images": [],
"items": {
"In office\n19 January 2013 \u2013 16 December 2017": {
"type": "header",
"key": "In office\n19 January 2013 \u2013 16 December 2017",
"text": "In office\n19 January 2013 \u2013 16 December 2017",
"links": [],
"images": [],
"items": {
"President": {
"type": "item",
"key": "President",
"text": "Sonia Gandhi",
"links": [
{
"text": "Sonia Gandhi",
"href": "/wiki/Sonia_Gandhi",
"title": "Sonia Gandhi"
}
],
"images": [],
"items": {}
},
"Preceded by": {
"type": "item",
"key": "Preceded by",
"text": "Position established",
"links": [],
"images": [],
"items": {}
},
"Succeeded by": {
"type": "item",
"key": "Succeeded by",
"text": "Position abolished",
"links": [],
"images": [],
"items": {}
}
}
}
}
},
"General Secretary of Indian National Congress": {
"type": "header",
"key": "General Secretary of Indian National Congress",
"text": "General Secretary of Indian National Congress",
"links": [
{
"text": "Indian National Congress",
"href": "/wiki/Indian_National_Congress",
"title": "Indian National Congress"
}
],
"images": [],
"items": {
"In office\n25 September 2007 \u2013 19 January 2013": {
"type": "header",
"key": "In office\n25 September 2007 \u2013 19 January 2013",
"text": "In office\n25 September 2007 \u2013 19 January 2013",
"links": [],
"images": [],
"items": {
"President": {
"type": "item",
"key": "President",
"text": "Sonia Gandhi",
"links": [
{
"text": "Sonia Gandhi",
"href": "/wiki/Sonia_Gandhi",
"title": "Sonia Gandhi"
}
],
"images": [],
"items": {}
}
}
}
}
},
"Chair of Indian Youth Congress": {
"type": "header",
"key": "Chair of Indian Youth Congress",
"text": "Chair of Indian Youth Congress",
"links": [
{
"text": "Indian Youth Congress",
"href": "/wiki/Indian_Youth_Congress",
"title": "Indian Youth Congress"
}
],
"images": [],
"items": {
"Incumbent": {
"type": "header",
"key": "Incumbent",
"text": "Incumbent",
"links": [
{
"text": "Incumbent",
"href": "/wiki/Incumbent",
"title": "Incumbent"
}
],
"images": [],
"items": {}
},
"Assumed office \n25 September 2007": {
"type": "header",
"key": "Assumed office \n25 September 2007",
"text": "Assumed office \n25 September 2007",
"links": [],
"images": [],
"items": {
"Preceded by": {
"type": "item",
"key": "Preceded by",
"text": "Position established",
"links": [],
"images": [],
"items": {}
}
}
}
}
},
"Chair of National Students\u2019 Union of India": {
"type": "header",
"key": "Chair of National Students\u2019 Union of India",
"text": "Chair of National Students\u2019 Union of India",
"links": [
{
"text": "National Students\u2019 Union of India",
"href": "/wiki/National_Students%E2%80%99_Union_of_India",
"title": "National Students\u2019 Union of India"
}
],
"images": [],
"items": {
"Incumbent": {
"type": "header",
"key": "Incumbent",
"text": "Incumbent",
"links": [
{
"text": "Incumbent",
"href": "/wiki/Incumbent",
"title": "Incumbent"
}
],
"images": [],
"items": {}
},
"Assumed office \n25 September 2007": {
"type": "header",
"key": "Assumed office \n25 September 2007",
"text": "Assumed office \n25 September 2007",
"links": [],
"images": [],
"items": {
"Preceded by": {
"type": "item",
"key": "Preceded by",
"text": "Position established",
"links": [],
"images": [],
"items": {}
}
}
}
}
},
"Personal details": {
"type": "header",
"key": "Personal details",
"text": "Personal details",
"links": [],
"images": [],
"items": {
"Born": {
"type": "item",
"key": "Born",
"text": "(1970-06-19) 19 June 1970 (age 50)\nNew Delhi, India",
"links": [],
"images": [],
"items": {}
},
"Political party": {
"type": "item",
"key": "Political party",
"text": "Indian National Congress",
"links": [
{
"text": "Indian National Congress",
"href": "/wiki/Indian_National_Congress",
"title": "Indian National Congress"
}
],
"images": [],
"items": {}
},
"Parents": {
"type": "item",
"key": "Parents",
"text": "Rajiv Gandhi\nSonia Gandhi",
"links": [
{
"text": "Rajiv Gandhi",
"href": "/wiki/Rajiv_Gandhi",
"title": "Rajiv Gandhi"
},
{
"text": "Sonia Gandhi",
"href": "/wiki/Sonia_Gandhi",
"title": "Sonia Gandhi"
}
],
"images": [],
"items": {}
},
"Relatives": {
"type": "item",
"key": "Relatives",
"text": "Nehru\u2013Gandhi family",
"links": [
{
"text": "Nehru\u2013Gandhi family",
"href": "/wiki/Nehru%E2%80%93Gandhi_family",
"title": "Nehru\u2013Gandhi family"
}
],
"images": [],
"items": {}
},
"Education": {
"type": "item",
"key": "Education",
"text": "St. Stephen's College, Delhi\nHarvard University\nRollins College (BA)\nTrinity College, Cambridge (MPhil)",
"links": [
{
"text": "St. Stephen's College, Delhi",
"href": "/wiki/St._Stephen%27s_College,_Delhi",
"title": "St. Stephen's College, Delhi"
},
{
"text": "Harvard University",
"href": "/wiki/Harvard_University",
"title": "Harvard University"
},
{
"text": "Rollins College",
"href": "/wiki/Rollins_College",
"title": "Rollins College"
},
{
"text": "BA",
"href": "/wiki/Bachelor_of_Arts",
"title": "Bachelor of Arts"
},
{
"text": "Trinity College, Cambridge",
"href": "/wiki/Trinity_College,_Cambridge",
"title": "Trinity College, Cambridge"
},
{
"text": "MPhil",
"href": "/wiki/Master_of_Philosophy",
"title": "Master of Philosophy"
}
],
"images": [],
"items": {}
},
"Signature": {
"type": "item",
"key": "Signature",
"text": "",
"links": [
{
"text": "",
"href": "/wiki/File:Signature_of_Rahul_Gandhi.svg",
"title": "Rahul Gandhi's signature"
}
],
"images": [
{
"src": "//upload.wikimedia.org/wikipedia/commons/thumb/a/a5/Signature_of_Rahul_Gandhi.svg/128px-Signature_of_Rahul_Gandhi.svg.png",
"width": "128",
"height": "44",
"alt": ""
}
],
"items": {}
},
"Website": {
"type": "item",
"key": "Website",
"text": "Official website",
"links": [
{
"text": "Official website",
"href": "http://rahulgandhi.in",
"title": ""
}
],
"images": [],
"items": {}
}
}
}
}
顺便说一句:
我使用 headers 对项目进行分组,因为 Preceded by 等键会重复出现很多次。
我试图获取有关文本、链接、图像的所有信息,并在所有元素中创建相同的字段,即使它们没有某些值。
我只是不确定用 headers 作为键是否合适——改用由各个部分(section)组成的列表,而不是像 "Member of Parliament, Lok Sabha" 这样的键,可能会更容易处理,因为不同人物的标题各不相同。