如何使用 Python 和 re 模块匹配并删除维基百科的引用标记
How to match and remove Wikipedia references with Python and re
from bs4 import BeautifulSoup
import requests
import time
import keyboard
import re
def searchWiki():
    """Prompt for a topic and page through its English Wikipedia article.

    Reads the search term from standard input, downloads
    ``https://en.wikipedia.org/wiki/<term>``, prints the page title and the
    URL, then prints the text of every ``<p>`` element, blocking until Enter
    is pressed after each paragraph.

    Paragraph text is printed exactly as extracted, so footnote markers such
    as ``[1]`` are still present in the output.
    """
    # Function-scope import keeps this snippet self-contained.
    from urllib.parse import quote

    search = input("What do you want to search for? ")
    # Spaces become underscores in the article path; quote() then
    # percent-encodes everything else that is unsafe in a URL path
    # ("'" -> "%27", but also "?", "#", non-ASCII, ...) instead of
    # special-casing the apostrophe only.
    slug = quote(search.replace(" ", "_"))
    url = f"https://en.wikipedia.org/wiki/{slug}"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
    # Without a timeout a stalled connection would block forever.
    page = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(page.content, "html.parser")
    title = soup.find("title").get_text()
    info = soup.find_all("p")
    print("Press enter to read the next paragraph")
    print(title)
    print(url)
    for p in info:
        print(p.text.strip())
        # Pause until the user presses Enter before showing the next one.
        keyboard.wait('enter')
searchWiki()
例如,搜索 Tom Holland。它应该是这样的:
Thomas Stanley Holland (born 1 June 1996)[1] is an English actor. A graduate of the BRIT School in London...
我想做的是删除引用编号及其方括号(例如 [1])。
您可以使用正则表达式来完成。
例如,对于你的变量 `p`:
import re

line = p.text.strip()
# Raw string: in a normal string literal "\[" is an invalid escape sequence
# and triggers a warning on current Python versions. The pattern itself is
# unchanged and matches bracketed numeric markers such as "[1]" or "[23]".
new_line = re.sub(r"\[[0-9]+\]", '', line)
print(new_line)
所有的脚注都位于 class 为 `reference` 的元素中,你可以用 `decompose()` 方法把它们去掉:
# Remove every element carrying the "reference" class from the parsed tree.
for footnote in soup.find_all(class_="reference"):
    footnote.decompose()
import requests
from bs4 import BeautifulSoup
def searchWiki():
    """Prompt for a topic and print its English Wikipedia article text.

    Reads the search term from standard input, downloads
    ``https://en.wikipedia.org/wiki/<term>``, removes every element carrying
    the ``reference`` class from the parsed tree, then prints the page title,
    the URL and the stripped text of every ``<p>`` element.
    """
    # Function-scope import keeps this snippet self-contained.
    from urllib.parse import quote

    search = input("What do you want to search for? ")
    # Spaces become underscores in the article path; quote() then
    # percent-encodes everything else that is unsafe in a URL path
    # ("'" -> "%27", but also "?", "#", non-ASCII, ...) instead of
    # special-casing the apostrophe only.
    slug = quote(search.replace(" ", "_"))
    url = f"https://en.wikipedia.org/wiki/{slug}"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
    # Without a timeout a stalled connection would block forever.
    page = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(page.content, "html.parser")

    # Remove all footnotes under the `reference` class
    for tag in soup.find_all(class_="reference"):
        tag.decompose()

    title = soup.find("title").get_text()
    info = soup.select("p")
    # NOTE(review): the hint below is printed, but this version never waits
    # for a key press; the paragraphs are printed back to back.
    print("Press enter to read the next paragraph")
    print(title)
    print(url)
    for p in info:
        print(p.text.strip())
searchWiki()
from bs4 import BeautifulSoup
import requests
import time
import keyboard
import re
def searchWiki():
    """Prompt for a topic and page through its English Wikipedia article.

    Reads the search term from standard input, downloads
    ``https://en.wikipedia.org/wiki/<term>``, prints the page title and the
    URL, then prints the text of every ``<p>`` element, blocking until Enter
    is pressed after each paragraph.

    Paragraph text is printed exactly as extracted, so footnote markers such
    as ``[1]`` are still present in the output.
    """
    # Function-scope import keeps this snippet self-contained.
    from urllib.parse import quote

    search = input("What do you want to search for? ")
    # Spaces become underscores in the article path; quote() then
    # percent-encodes everything else that is unsafe in a URL path
    # ("'" -> "%27", but also "?", "#", non-ASCII, ...) instead of
    # special-casing the apostrophe only.
    slug = quote(search.replace(" ", "_"))
    url = f"https://en.wikipedia.org/wiki/{slug}"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
    # Without a timeout a stalled connection would block forever.
    page = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(page.content, "html.parser")
    title = soup.find("title").get_text()
    info = soup.find_all("p")
    print("Press enter to read the next paragraph")
    print(title)
    print(url)
    for p in info:
        print(p.text.strip())
        # Pause until the user presses Enter before showing the next one.
        keyboard.wait('enter')
searchWiki()
例如,搜索 Tom Holland。它应该是这样的:
Thomas Stanley Holland (born 1 June 1996)[1] is an English actor. A graduate of the BRIT School in London...
我想做的是删除引用编号及其方括号(例如 [1])。
您可以使用正则表达式来完成。
例如,对于你的变量 `p`:
import re

line = p.text.strip()
# Raw string: in a normal string literal "\[" is an invalid escape sequence
# and triggers a warning on current Python versions. The pattern itself is
# unchanged and matches bracketed numeric markers such as "[1]" or "[23]".
new_line = re.sub(r"\[[0-9]+\]", '', line)
print(new_line)
所有的脚注都位于 class 为 `reference` 的元素中,你可以用 `decompose()` 方法把它们去掉:
# Remove every element carrying the "reference" class from the parsed tree.
for footnote in soup.find_all(class_="reference"):
    footnote.decompose()
import requests
from bs4 import BeautifulSoup
def searchWiki():
    """Prompt for a topic and print its English Wikipedia article text.

    Reads the search term from standard input, downloads
    ``https://en.wikipedia.org/wiki/<term>``, removes every element carrying
    the ``reference`` class from the parsed tree, then prints the page title,
    the URL and the stripped text of every ``<p>`` element.
    """
    # Function-scope import keeps this snippet self-contained.
    from urllib.parse import quote

    search = input("What do you want to search for? ")
    # Spaces become underscores in the article path; quote() then
    # percent-encodes everything else that is unsafe in a URL path
    # ("'" -> "%27", but also "?", "#", non-ASCII, ...) instead of
    # special-casing the apostrophe only.
    slug = quote(search.replace(" ", "_"))
    url = f"https://en.wikipedia.org/wiki/{slug}"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
    # Without a timeout a stalled connection would block forever.
    page = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(page.content, "html.parser")

    # Remove all footnotes under the `reference` class
    for tag in soup.find_all(class_="reference"):
        tag.decompose()

    title = soup.find("title").get_text()
    info = soup.select("p")
    # NOTE(review): the hint below is printed, but this version never waits
    # for a key press; the paragraphs are printed back to back.
    print("Press enter to read the next paragraph")
    print(title)
    print(url)
    for p in info:
        print(p.text.strip())
searchWiki()