如何使用 Python 和 re 匹配并删除维基百科的引用标记

Question

from bs4 import BeautifulSoup
import requests
import time
import keyboard
import re

def searchWiki():
    """Ask for a search term and page through that Wikipedia article.

    The term is read from standard input, turned into an English
    Wikipedia URL (spaces become underscores, apostrophes become ``%27``),
    and the page is fetched with a browser-like User-Agent header.  The
    page title and URL are printed first; afterwards each ``<p>`` element
    is printed with surrounding whitespace stripped, and the function
    blocks until Enter is pressed before moving on to the next one.

    Paragraph text is printed as extracted, so bracketed citation markers
    such as ``[1]`` remain in the output.
    """
    query = input("What do you want to search for? ")
    search = query.replace(" ", "_").replace("'", "%27")
    url = f"https://en.wikipedia.org/wiki/{search}"

    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
    response = requests.get(url, headers={'User-Agent': user_agent})

    soup = BeautifulSoup(response.content, "html.parser")
    title = soup.find("title").get_text()
    paragraphs = soup.find_all("p")

    # Header: usage hint, page title, then the URL that was fetched.
    for header_line in ("Press enter to read the next paragraph", title, url):
        print(header_line)

    for paragraph in paragraphs:
        print(paragraph.text.strip())
        # Block here until the user presses Enter.
        keyboard.wait('enter')



# Start the interactive search as soon as the script runs.
searchWiki()

例如，搜索 Tom Holland 时，输出大致是这样的：

Thomas Stanley Holland (born 1 June 1996)[1] is an English actor. A graduate of the BRIT School in London...

我想做的是删除引用编号及其方括号（例如上面的 [1]）。

Answer 1

您可以使用正则表达式来完成。

例如，针对你代码中的变量 p：

import re

# `p` is one paragraph tag from the loop over the parsed page.
line = p.text.strip()
# Drop numeric citation markers such as "[1]" or "[23]".  The pattern is a
# raw string so that "\[" and "\]" reach the regex engine unchanged instead
# of being treated as (invalid) string escape sequences.
new_line = re.sub(r"\[[0-9]+\]", "", line)
print(new_line)

Answer 2

所有的脚注都位于 class 为 `reference` 的标签下，你可以用 `decompose()` 方法把它们去掉：

# Collect every tag carrying the `reference` class, then remove each one
# from the parsed tree so its text no longer appears in the paragraphs.
footnotes = soup.find_all(class_="reference")
for footnote in footnotes:
    footnote.decompose()

import requests
from bs4 import BeautifulSoup


def searchWiki():
    """Ask for a search term and page through the article without footnotes.

    The term is read from standard input and turned into an English
    Wikipedia URL: spaces become underscores and the remaining characters
    are percent-encoded.  After the page is fetched and parsed, every tag
    with the ``reference`` class is removed from the tree, so the printed
    paragraphs no longer contain those footnote markers.

    The page title and URL are printed first; then each ``<p>`` element is
    printed with surrounding whitespace stripped, pausing for Enter after
    each one.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not answer within the timeout.
    """
    # Imported locally so this snippet does not depend on extra
    # module-level imports.
    from urllib.parse import quote

    term = input("What do you want to search for? ")
    # Percent-encode the term so characters such as "'", "#" or "?" stay part
    # of the article name instead of being read as URL syntax.  "/", ":" and
    # parentheses are left as-is to keep the printed URL readable.
    search = quote(term.replace(" ", "_"), safe="/:()")
    url = f"https://en.wikipedia.org/wiki/{search}"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
    # Without a timeout a stalled connection would block the script forever.
    page = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(page.content, "html.parser")
    # Remove all footnotes under the `reference` class
    for tag in soup.find_all(class_="reference"):
        tag.decompose()

    title = soup.find("title").get_text()
    info = soup.select("p")
    print("Press enter to read the next paragraph")
    print(title)
    print(url)
    for p in info:
        print(p.text.strip())
        # Honour the prompt printed above: wait for Enter before showing the
        # next paragraph.  The builtin input() is enough for this.
        input()


# Start the interactive search as soon as the script runs.
searchWiki()

如何使用 Python 和 re 匹配并删除维基百科的引用标记

How to match and remove Wikipedia references with Python and re

python

wikipedia

beautifulsoup

python-requests

python-re