使用维基百科 API 提取 table 数据

Extracting table data from wikipedia API

我需要从维基百科中提取任意一位政客页面右侧的 table。

为此,我尝试使用维基百科 API。但是我无法提取 table 数据。 到目前为止我尝试过的代码如下:

import wikipedia
# Use the first search result for the query as the page title.
person = wikipedia.search("Rahul Gandhi")[0]
person # echoes "Rahul Gandhi" when run interactively (REPL/notebook)
wikipedia.summary(person) # the summary can be retrieved this way
page = wikipedia.page(person)
page.url # echoes the page URL when run interactively
print(page.content) # prints the complete content, but not the tables

我也尝试过抓取 tables,但是很难以结构化形式获取数据。

import urllib3
import requests
from bs4 import BeautifulSoup
# NOTE: the request below is made with certificate verification disabled
# (verify=False); the matching urllib3 warning is silenced up front.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
url = "https://en.wikipedia.org/wiki/Rahul_Gandhi"
session = requests.Session()
html = session.get(url, verify=False).content
soup = BeautifulSoup(html, "lxml")

# The infobox is looked up by its exact class attribute value.
table = soup.find("table", {"class": "infobox vcard"})
info = table.find_all('tr')

for row in info:
    # For every row take the first <th>, the first <a> and the first <td>
    # (in that fixed order) and keep the text of whichever ones exist.
    found = [row.find(tag) for tag in ('th', 'a', 'td')]
    content = [cell.text for cell in found if cell]
    print(content)
# Output : 
['Rahul Gandhi']
['', 'Gandhi in May 2019']
['']
['President of the Indian National Congress', 'President of the Indian National Congress']
['In office16 December 2017\xa0– 10 August 2019']
['Preceded by', 'Sonia Gandhi', 'Sonia Gandhi']
['Succeeded by', 'Sonia Gandhi', 'Sonia Gandhi (Interim)']
['Member of Parliament, Lok Sabha', 'Member of Parliament, Lok Sabha']
['Incumbent', 'Incumbent']
['Assumed office 23 May 2019']
['Preceded by', 'M. I. Shanavas', 'M. I. Shanavas']
['Constituency', 'Wayanad', 'Wayanad, Kerala']
['In office17 May 2004\xa0– 23 May 2019']
['Preceded by', 'Sonia Gandhi', 'Sonia Gandhi']
['Succeeded by', 'Smriti Irani', 'Smriti Irani']
['Constituency', 'Amethi', 'Amethi, Uttar Pradesh']
['Vice-President of the Indian National Congress', 'Indian National Congress']
['In office19 January 2013\xa0– 16 December 2017']
['President', 'Sonia Gandhi', 'Sonia Gandhi']
['Preceded by', 'Position established']
['Succeeded by', 'Position abolished']
['General Secretary of Indian National Congress', 'Indian National Congress']
['In office25 September 2007\xa0– 19 January 2013']
['President', 'Sonia Gandhi', 'Sonia Gandhi']
['Chair of Indian Youth Congress', 'Indian Youth Congress']
['Incumbent', 'Incumbent']
['Assumed office 25 September 2007']
['Preceded by', 'Position established']
['Chair of National Students’ Union of India', 'National Students’ Union of India']
['Incumbent', 'Incumbent']
['Assumed office 25 September 2007']
['Preceded by', 'Position established']
['\n']
['Personal details']
['Born', ' (1970-06-19) 19 June 1970 (age\xa050)New Delhi, India']
['Political party', 'Indian National Congress', 'Indian National Congress']
['Parents', 'Rajiv Gandhi', 'Rajiv GandhiSonia Gandhi']
['Relatives', 'Nehru–Gandhi family', 'Nehru–Gandhi family']
['Education', "St. Stephen's College, Delhi", "St. Stephen's College, DelhiHarvard UniversityRollins College (BA)Trinity College, Cambridge (MPhil)"]
['Signature', '', '']
['Website', 'Official website', 'Official website']

我根据每一行中的标签类型和项目数量的信息,编写了下面的代码:

import urllib3
import requests
from bs4 import BeautifulSoup
import json

# NOTE(review): the request below is sent with verify=False, i.e. the server's
# TLS certificate is not checked, and this call silences the warning urllib3
# would otherwise emit for that. Kept as in the original script; confirm it is
# really needed (e.g. an intercepting proxy) before reusing this elsewhere.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

url = 'https://en.wikipedia.org/wiki/Rahul_Gandhi'
# Alternative page to try: https://en.wikipedia.org/wiki/Sonia_Gandhi

# The context manager closes the session once the page has been downloaded.
with requests.Session() as session:
    # Without a timeout the call can block forever on an unresponsive server.
    response = session.get(url, verify=False, timeout=30)
    # Stop on 4xx/5xx responses instead of parsing an error page.
    response.raise_for_status()
    html = response.content

soup = BeautifulSoup(html, 'lxml')

# The infobox is looked up by its exact class attribute value.
table = soup.find('table', {'class': 'infobox vcard'})
if table is None:
    # Fail with a clear message here rather than with an AttributeError on
    # `None` when the rows are read later.
    raise ValueError('no table with class "infobox vcard" found at ' + url)

# --- 

def _clean_text(cell):
    """Return the cell's text, trimmed, with non-breaking spaces made plain.

    `get_text(strip=True)` is deliberately not used: it would also remove the
    '\\n' characters that were put in place of `<br>` tags.
    """
    return cell.get_text().strip().replace('\u00a0', ' ')


def _collect_links(cell):
    """Return one dict (text, href, title) per `<a>` tag inside `cell`."""
    return [
        {
            'text': a.text,
            'href': a.get('href', ''),
            'title': a.get('title', ''),
        }
        for a in cell.find_all('a')
    ]


def _collect_images(cell):
    """Return one dict (src, width, height, alt) per `<img>` tag inside `cell`."""
    return [
        {
            'src': img.get('src'),
            'width': img.get('width', ''),
            'height': img.get('height', ''),
            'alt': img.get('alt'),
        }
        for img in cell.find_all('img')
    ]


def _make_entry(kind, key, text, cell):
    """Build the record stored for a header, subheader or item.

    Every record carries the same fields, even when some are empty; links and
    images are read from `cell`.
    """
    return {
        'type': kind,
        'key': key,
        'text': text,
        'links': _collect_links(cell),
        'images': _collect_images(cell),
        'items': {},
    }


def parse_infobox(table):
    """Turn the rows of an infobox table into a nested dict.

    Rows are classified by their number of direct `<th>`/`<td>` cells:

    * one `<th>` cell  -> a header (section), stored at the top level;
    * one `<td>` cell  -> a subheader, stored in the items of the latest
      header (or at the top level when no header has been seen yet);
    * two cells        -> an item (label cell, value cell), stored in the
      items of the most recent header or subheader.

    Rows with any other number of cells, and single-cell rows whose cell is
    empty, are ignored. Entries are keyed by their text, so a later entry
    with the same text in the same place replaces the earlier one.

    Note: `<br>` tags inside the cells of `table` are replaced in place.

    Args:
        table: the parsed infobox `<table>` tag.

    Returns:
        dict mapping header text to header records (see `_make_entry`).
    """
    content = {}
    header_items = None  # 'items' dict of the most recent <th> header
    current = content    # dict that receives the next two-cell item

    for row in table.find_all('tr'):
        # Only direct <th>/<td> children count as cells; iterating
        # `row.children` would also yield text nodes, which cannot be
        # searched and would distort the cell count.
        cells = row.find_all(['th', 'td'], recursive=False)

        # replace '<br>' with '\n' so multi-line values keep their line breaks
        for cell in cells:
            for br in cell.find_all('br'):
                br.replace_with('\n' + br.text)

        if len(cells) == 1:
            cell = cells[0]

            # skip empty rows
            if not cell.decode_contents().strip():
                continue

            text = _clean_text(cell)
            section = _make_entry('header', text, text, cell)

            if cell.name == 'th':
                content[text] = section
                header_items = section['items']
            else:
                # A subheader seen before any header has no section to live
                # in; keep it at the top level instead of failing.
                parent = content if header_items is None else header_items
                parent[text] = section

            current = section['items']  # following items belong here

        elif len(cells) == 2:
            label, value = cells
            # Keys get the same clean-up as texts.
            key = _clean_text(label)
            current[key] = _make_entry('item', key, _clean_text(value), value)

    return content


content = parse_infobox(table)

print(json.dumps(content, indent=2))

运行后得到如下结果:

{
  "Rahul Gandhi": {
    "type": "header",
    "key": "Rahul Gandhi",
    "text": "Rahul Gandhi",
    "links": [],
    "images": [],
    "items": {
      "Gandhi in May 2019": {
        "type": "header",
        "key": "Gandhi in May 2019",
        "text": "Gandhi in May 2019",
        "links": [
          {
            "text": "",
            "href": "/wiki/File:Rahul_Gandhi,_Member_of_Parliament,_Wayanad,_Kerala.jpg",
            "title": ""
          }
        ],
        "images": [
          {
            "src": "//upload.wikimedia.org/wikipedia/commons/thumb/d/d5/Rahul_Gandhi%2C_Member_of_Parliament%2C_Wayanad%2C_Kerala.jpg/220px-Rahul_Gandhi%2C_Member_of_Parliament%2C_Wayanad%2C_Kerala.jpg",
            "width": "220",
            "height": "293",
            "alt": "Rahul Gandhi, Member of Parliament, Wayanad, Kerala.jpg"
          }
        ],
        "items": {}
      }
    }
  },
  "President of the Indian National Congress": {
    "type": "header",
    "key": "President of the Indian National Congress",
    "text": "President of the Indian National Congress",
    "links": [
      {
        "text": "President of the Indian National Congress",
        "href": "/wiki/List_of_Presidents_of_the_Indian_National_Congress",
        "title": "List of Presidents of the Indian National Congress"
      }
    ],
    "images": [],
    "items": {
      "In office\n16 December 2017 \u2013 10 August 2019": {
        "type": "header",
        "key": "In office\n16 December 2017 \u2013 10 August 2019",
        "text": "In office\n16 December 2017 \u2013 10 August 2019",
        "links": [],
        "images": [],
        "items": {
          "Preceded by": {
            "type": "item",
            "key": "Preceded by",
            "text": "Sonia Gandhi",
            "links": [
              {
                "text": "Sonia Gandhi",
                "href": "/wiki/Sonia_Gandhi",
                "title": "Sonia Gandhi"
              }
            ],
            "images": [],
            "items": {}
          },
          "Succeeded by": {
            "type": "item",
            "key": "Succeeded by",
            "text": "Sonia Gandhi (Interim)",
            "links": [
              {
                "text": "Sonia Gandhi",
                "href": "/wiki/Sonia_Gandhi",
                "title": "Sonia Gandhi"
              }
            ],
            "images": [],
            "items": {}
          }
        }
      }
    }
  },
  "Member of Parliament, Lok Sabha": {
    "type": "header",
    "key": "Member of Parliament, Lok Sabha",
    "text": "Member of Parliament, Lok Sabha",
    "links": [
      {
        "text": "Member of Parliament, Lok Sabha",
        "href": "/wiki/Member_of_Parliament,_Lok_Sabha",
        "title": "Member of Parliament, Lok Sabha"
      }
    ],
    "images": [],
    "items": {
      "Incumbent": {
        "type": "header",
        "key": "Incumbent",
        "text": "Incumbent",
        "links": [
          {
            "text": "Incumbent",
            "href": "/wiki/Incumbent",
            "title": "Incumbent"
          }
        ],
        "images": [],
        "items": {}
      },
      "Assumed office \n23 May 2019": {
        "type": "header",
        "key": "Assumed office \n23 May 2019",
        "text": "Assumed office \n23 May 2019",
        "links": [],
        "images": [],
        "items": {
          "Preceded by": {
            "type": "item",
            "key": "Preceded by",
            "text": "M. I. Shanavas",
            "links": [
              {
                "text": "M. I. Shanavas",
                "href": "/wiki/M._I._Shanavas",
                "title": "M. I. Shanavas"
              }
            ],
            "images": [],
            "items": {}
          },
          "Constituency": {
            "type": "item",
            "key": "Constituency",
            "text": "Wayanad, Kerala",
            "links": [
              {
                "text": "Wayanad",
                "href": "/wiki/Wayanad_(Lok_Sabha_constituency)",
                "title": "Wayanad (Lok Sabha constituency)"
              },
              {
                "text": "Kerala",
                "href": "/wiki/Kerala",
                "title": "Kerala"
              }
            ],
            "images": [],
            "items": {}
          }
        }
      },
      "In office\n17 May 2004 \u2013 23 May 2019": {
        "type": "header",
        "key": "In office\n17 May 2004 \u2013 23 May 2019",
        "text": "In office\n17 May 2004 \u2013 23 May 2019",
        "links": [],
        "images": [],
        "items": {
          "Preceded by": {
            "type": "item",
            "key": "Preceded by",
            "text": "Sonia Gandhi",
            "links": [
              {
                "text": "Sonia Gandhi",
                "href": "/wiki/Sonia_Gandhi",
                "title": "Sonia Gandhi"
              }
            ],
            "images": [],
            "items": {}
          },
          "Succeeded by": {
            "type": "item",
            "key": "Succeeded by",
            "text": "Smriti Irani",
            "links": [
              {
                "text": "Smriti Irani",
                "href": "/wiki/Smriti_Irani",
                "title": "Smriti Irani"
              }
            ],
            "images": [],
            "items": {}
          },
          "Constituency": {
            "type": "item",
            "key": "Constituency",
            "text": "Amethi, Uttar Pradesh",
            "links": [
              {
                "text": "Amethi",
                "href": "/wiki/Amethi_(Lok_Sabha_constituency)",
                "title": "Amethi (Lok Sabha constituency)"
              },
              {
                "text": "Uttar Pradesh",
                "href": "/wiki/Uttar_Pradesh",
                "title": "Uttar Pradesh"
              }
            ],
            "images": [],
            "items": {}
          }
        }
      }
    }
  },
  "Vice-President of the Indian National Congress": {
    "type": "header",
    "key": "Vice-President of the Indian National Congress",
    "text": "Vice-President of the Indian National Congress",
    "links": [
      {
        "text": "Indian National Congress",
        "href": "/wiki/Indian_National_Congress",
        "title": "Indian National Congress"
      }
    ],
    "images": [],
    "items": {
      "In office\n19 January 2013 \u2013 16 December 2017": {
        "type": "header",
        "key": "In office\n19 January 2013 \u2013 16 December 2017",
        "text": "In office\n19 January 2013 \u2013 16 December 2017",
        "links": [],
        "images": [],
        "items": {
          "President": {
            "type": "item",
            "key": "President",
            "text": "Sonia Gandhi",
            "links": [
              {
                "text": "Sonia Gandhi",
                "href": "/wiki/Sonia_Gandhi",
                "title": "Sonia Gandhi"
              }
            ],
            "images": [],
            "items": {}
          },
          "Preceded by": {
            "type": "item",
            "key": "Preceded by",
            "text": "Position established",
            "links": [],
            "images": [],
            "items": {}
          },
          "Succeeded by": {
            "type": "item",
            "key": "Succeeded by",
            "text": "Position abolished",
            "links": [],
            "images": [],
            "items": {}
          }
        }
      }
    }
  },
  "General Secretary of Indian National Congress": {
    "type": "header",
    "key": "General Secretary of Indian National Congress",
    "text": "General Secretary of Indian National Congress",
    "links": [
      {
        "text": "Indian National Congress",
        "href": "/wiki/Indian_National_Congress",
        "title": "Indian National Congress"
      }
    ],
    "images": [],
    "items": {
      "In office\n25 September 2007 \u2013 19 January 2013": {
        "type": "header",
        "key": "In office\n25 September 2007 \u2013 19 January 2013",
        "text": "In office\n25 September 2007 \u2013 19 January 2013",
        "links": [],
        "images": [],
        "items": {
          "President": {
            "type": "item",
            "key": "President",
            "text": "Sonia Gandhi",
            "links": [
              {
                "text": "Sonia Gandhi",
                "href": "/wiki/Sonia_Gandhi",
                "title": "Sonia Gandhi"
              }
            ],
            "images": [],
            "items": {}
          }
        }
      }
    }
  },
  "Chair of Indian Youth Congress": {
    "type": "header",
    "key": "Chair of Indian Youth Congress",
    "text": "Chair of Indian Youth Congress",
    "links": [
      {
        "text": "Indian Youth Congress",
        "href": "/wiki/Indian_Youth_Congress",
        "title": "Indian Youth Congress"
      }
    ],
    "images": [],
    "items": {
      "Incumbent": {
        "type": "header",
        "key": "Incumbent",
        "text": "Incumbent",
        "links": [
          {
            "text": "Incumbent",
            "href": "/wiki/Incumbent",
            "title": "Incumbent"
          }
        ],
        "images": [],
        "items": {}
      },
      "Assumed office \n25 September 2007": {
        "type": "header",
        "key": "Assumed office \n25 September 2007",
        "text": "Assumed office \n25 September 2007",
        "links": [],
        "images": [],
        "items": {
          "Preceded by": {
            "type": "item",
            "key": "Preceded by",
            "text": "Position established",
            "links": [],
            "images": [],
            "items": {}
          }
        }
      }
    }
  },
  "Chair of National Students\u2019 Union of India": {
    "type": "header",
    "key": "Chair of National Students\u2019 Union of India",
    "text": "Chair of National Students\u2019 Union of India",
    "links": [
      {
        "text": "National Students\u2019 Union of India",
        "href": "/wiki/National_Students%E2%80%99_Union_of_India",
        "title": "National Students\u2019 Union of India"
      }
    ],
    "images": [],
    "items": {
      "Incumbent": {
        "type": "header",
        "key": "Incumbent",
        "text": "Incumbent",
        "links": [
          {
            "text": "Incumbent",
            "href": "/wiki/Incumbent",
            "title": "Incumbent"
          }
        ],
        "images": [],
        "items": {}
      },
      "Assumed office \n25 September 2007": {
        "type": "header",
        "key": "Assumed office \n25 September 2007",
        "text": "Assumed office \n25 September 2007",
        "links": [],
        "images": [],
        "items": {
          "Preceded by": {
            "type": "item",
            "key": "Preceded by",
            "text": "Position established",
            "links": [],
            "images": [],
            "items": {}
          }
        }
      }
    }
  },
  "Personal details": {
    "type": "header",
    "key": "Personal details",
    "text": "Personal details",
    "links": [],
    "images": [],
    "items": {
      "Born": {
        "type": "item",
        "key": "Born",
        "text": "(1970-06-19) 19 June 1970 (age 50)\nNew Delhi, India",
        "links": [],
        "images": [],
        "items": {}
      },
      "Political party": {
        "type": "item",
        "key": "Political party",
        "text": "Indian National Congress",
        "links": [
          {
            "text": "Indian National Congress",
            "href": "/wiki/Indian_National_Congress",
            "title": "Indian National Congress"
          }
        ],
        "images": [],
        "items": {}
      },
      "Parents": {
        "type": "item",
        "key": "Parents",
        "text": "Rajiv Gandhi\nSonia Gandhi",
        "links": [
          {
            "text": "Rajiv Gandhi",
            "href": "/wiki/Rajiv_Gandhi",
            "title": "Rajiv Gandhi"
          },
          {
            "text": "Sonia Gandhi",
            "href": "/wiki/Sonia_Gandhi",
            "title": "Sonia Gandhi"
          }
        ],
        "images": [],
        "items": {}
      },
      "Relatives": {
        "type": "item",
        "key": "Relatives",
        "text": "Nehru\u2013Gandhi family",
        "links": [
          {
            "text": "Nehru\u2013Gandhi family",
            "href": "/wiki/Nehru%E2%80%93Gandhi_family",
            "title": "Nehru\u2013Gandhi family"
          }
        ],
        "images": [],
        "items": {}
      },
      "Education": {
        "type": "item",
        "key": "Education",
        "text": "St. Stephen's College, Delhi\nHarvard University\nRollins College (BA)\nTrinity College, Cambridge (MPhil)",
        "links": [
          {
            "text": "St. Stephen's College, Delhi",
            "href": "/wiki/St._Stephen%27s_College,_Delhi",
            "title": "St. Stephen's College, Delhi"
          },
          {
            "text": "Harvard University",
            "href": "/wiki/Harvard_University",
            "title": "Harvard University"
          },
          {
            "text": "Rollins College",
            "href": "/wiki/Rollins_College",
            "title": "Rollins College"
          },
          {
            "text": "BA",
            "href": "/wiki/Bachelor_of_Arts",
            "title": "Bachelor of Arts"
          },
          {
            "text": "Trinity College, Cambridge",
            "href": "/wiki/Trinity_College,_Cambridge",
            "title": "Trinity College, Cambridge"
          },
          {
            "text": "MPhil",
            "href": "/wiki/Master_of_Philosophy",
            "title": "Master of Philosophy"
          }
        ],
        "images": [],
        "items": {}
      },
      "Signature": {
        "type": "item",
        "key": "Signature",
        "text": "",
        "links": [
          {
            "text": "",
            "href": "/wiki/File:Signature_of_Rahul_Gandhi.svg",
            "title": "Rahul Gandhi's signature"
          }
        ],
        "images": [
          {
            "src": "//upload.wikimedia.org/wikipedia/commons/thumb/a/a5/Signature_of_Rahul_Gandhi.svg/128px-Signature_of_Rahul_Gandhi.svg.png",
            "width": "128",
            "height": "44",
            "alt": ""
          }
        ],
        "items": {}
      },
      "Website": {
        "type": "item",
        "key": "Website",
        "text": "Official website",
        "links": [
          {
            "text": "Official website",
            "href": "http://rahulgandhi.in",
            "title": ""
          }
        ],
        "images": [],
        "items": {}
      }
    }
  }
}

顺便说一句:

我使用 headers 对项目进行分组,因为有很多重复出现的项目,例如 Preceded by 等等。

我试图获取有关文本、链接、图像的所有信息,并在所有元素中创建相同的字段,即使它们没有某些值。

我只是不确定用 headers 作为键是否合适 - 使用一个 section 列表,而不是像 "Member of Parliament, Lok Sabha" 这样的键,可能会更容易处理,因为这些键对不同的人可能各不相同。