Python - 从 html 网络 url 写入指定年份的所有文本文件内容
Python - Writing all text file contents for specified years from a html web url
我有一个网页,其中包含几个指向文本文件的链接,这些文本文件被标记为:“2013 年 7 月 6 日,星期六”,其文件的超链接以 *_130706.txt 结尾。有人可以指导我编写一个脚本,将标签文本中包含 2013 的所有文件的内容写入一个文本文件吗?文本文件的链接位于 <a> 标签中。到目前为止,我已经能够以 xml 格式输出 html:
# Fetch the index page and parse it so the per-file <a> links can be located.
from lxml import html
from bs4 import BeautifulSoup
# Bug fix: `import urllib` alone does not make the `urllib.request`
# submodule available in Python 3 -- import it explicitly.
import urllib.request

string = '2013'
my_url = 'url'  # placeholder: put the real page URL here
html = urllib.request.urlopen(my_url).read()
html_page = BeautifulSoup(html, features='lxml')
有什么建议吗?
我的解决方案是
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Download every turnstile data file whose link label ends with "2013"
and print the concatenated file contents."""
import requests
from bs4 import BeautifulSoup

if __name__ == "__main__":
    # Browser-like request headers.  NOTE(review): the Cookie value is a
    # captured session cookie and will expire -- refresh or drop it if the
    # server starts rejecting requests.
    headers = {
        "Host": "web.mta.info",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/87.0.4280.88 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,"
                  "application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cookie": "_ga=GA1.2.941711768.1610503542; _gid=GA1.2.1497562578.1610503542; "
                  "cookie_session=!AnxZS989I2WWEppONIhq3dHOsu/4VyPMB2uUQ7VhQ58NbigvNqrV3gdJ3wNP/qYFl3l5MCbT0vDDeP"
                  "+C3j5QyQ8/JwL1WYLBgiBYr59v "
    }
    # Bug fix: the second positional parameter of requests.get() is
    # `params`, not `headers`.  The original call therefore never sent the
    # headers at all -- pass them by keyword.
    r = requests.get('http://web.mta.info/developers/turnstile.html', headers=headers)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html5lib")
    # Collect the pieces in a list and join once instead of repeated
    # string concatenation (quadratic for many/large files).
    parts = []
    for k in soup.find_all('a'):
        # Link labels look like "Saturday, July 06, 2013"; keep the 2013 ones.
        if str(k.string).endswith("2013"):
            url = k["href"]  # relative link to the .txt data file
            r2 = requests.get("http://web.mta.info/developers/" + url, headers=headers)
            r2.raise_for_status()
            parts.append(r2.text)
    result = "".join(parts)
    print(result)
您知道所有这些文件的内容有点……庞大吗?
这里有一个没有外部库的 python3 解决方案,使用正则表达式查找链接:
"""Stdlib-only scraper: find links whose label mentions one of the target
years and append the linked files' contents to a single output file."""
from urllib import request
import re
import os


def find_year_links(html, strings):
    """Return (href, label) pairs for every <a> whose label contains one of *strings*.

    Results are grouped by the order of *strings* (all matches for the
    first string, then the second, ...), matching the original behavior.
    """
    pairs = re.findall(r'<a href="([^"]+)">([^<]+)</a>', html)
    urls = []
    for string in strings:
        urls += [u for u in pairs if u[1].find(string) >= 0]
    return urls


# Guarded so importing this module does not hit the network.
if __name__ == "__main__":
    strings = ['2013', '2014']
    output = 'huge.txt'
    my_url = 'http://web.mta.info/developers/turnstile.html'
    # os.path.split works here because the URL uses '/' separators.
    path, page = os.path.split(my_url)
    print("path:", path, "page:", page)
    html = request.urlopen(my_url).read().decode("utf8")
    urls = find_year_links(html, strings)
    print("links found:", len(urls))
    # Fix: open the output once with a context manager instead of
    # re-opening (and leaking) a file handle for every single link.
    with open(output, "wt") as out:
        for url in urls:
            link = path + "/" + url[0]
            print (" link:", link, " name:", url[1])
            out.write(request.urlopen(link).read().decode("utf8"))
我有一个网页,其中包含几个指向文本文件的链接,这些文本文件被标记为:“2013 年 7 月 6 日,星期六”,其文件的超链接以 *_130706.txt 结尾。有人可以指导我编写一个脚本,将标签文本中包含 2013 的所有文件的内容写入一个文本文件吗?文本文件的链接位于 <a> 标签中。到目前为止,我已经能够以 xml 格式输出 html:
# Fetch the index page and parse it so the per-file <a> links can be located.
from lxml import html
from bs4 import BeautifulSoup
# Bug fix: `import urllib` alone does not make the `urllib.request`
# submodule available in Python 3 -- import it explicitly.
import urllib.request

string = '2013'
my_url = 'url'  # placeholder: put the real page URL here
html = urllib.request.urlopen(my_url).read()
html_page = BeautifulSoup(html, features='lxml')
有什么建议吗?
我的解决方案是
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Download every turnstile data file whose link label ends with "2013"
and print the concatenated file contents."""
import requests
from bs4 import BeautifulSoup

if __name__ == "__main__":
    # Browser-like request headers.  NOTE(review): the Cookie value is a
    # captured session cookie and will expire -- refresh or drop it if the
    # server starts rejecting requests.
    headers = {
        "Host": "web.mta.info",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/87.0.4280.88 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,"
                  "application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cookie": "_ga=GA1.2.941711768.1610503542; _gid=GA1.2.1497562578.1610503542; "
                  "cookie_session=!AnxZS989I2WWEppONIhq3dHOsu/4VyPMB2uUQ7VhQ58NbigvNqrV3gdJ3wNP/qYFl3l5MCbT0vDDeP"
                  "+C3j5QyQ8/JwL1WYLBgiBYr59v "
    }
    # Bug fix: the second positional parameter of requests.get() is
    # `params`, not `headers`.  The original call therefore never sent the
    # headers at all -- pass them by keyword.
    r = requests.get('http://web.mta.info/developers/turnstile.html', headers=headers)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html5lib")
    # Collect the pieces in a list and join once instead of repeated
    # string concatenation (quadratic for many/large files).
    parts = []
    for k in soup.find_all('a'):
        # Link labels look like "Saturday, July 06, 2013"; keep the 2013 ones.
        if str(k.string).endswith("2013"):
            url = k["href"]  # relative link to the .txt data file
            r2 = requests.get("http://web.mta.info/developers/" + url, headers=headers)
            r2.raise_for_status()
            parts.append(r2.text)
    result = "".join(parts)
    print(result)
您知道所有这些文件的内容有点……庞大吗?
这里有一个没有外部库的 python3 解决方案,使用正则表达式查找链接:
"""Stdlib-only scraper: find links whose label mentions one of the target
years and append the linked files' contents to a single output file."""
from urllib import request
import re
import os


def find_year_links(html, strings):
    """Return (href, label) pairs for every <a> whose label contains one of *strings*.

    Results are grouped by the order of *strings* (all matches for the
    first string, then the second, ...), matching the original behavior.
    """
    pairs = re.findall(r'<a href="([^"]+)">([^<]+)</a>', html)
    urls = []
    for string in strings:
        urls += [u for u in pairs if u[1].find(string) >= 0]
    return urls


# Guarded so importing this module does not hit the network.
if __name__ == "__main__":
    strings = ['2013', '2014']
    output = 'huge.txt'
    my_url = 'http://web.mta.info/developers/turnstile.html'
    # os.path.split works here because the URL uses '/' separators.
    path, page = os.path.split(my_url)
    print("path:", path, "page:", page)
    html = request.urlopen(my_url).read().decode("utf8")
    urls = find_year_links(html, strings)
    print("links found:", len(urls))
    # Fix: open the output once with a context manager instead of
    # re-opening (and leaking) a file handle for every single link.
    with open(output, "wt") as out:
        for url in urls:
            link = path + "/" + url[0]
            print (" link:", link, " name:", url[1])
            out.write(request.urlopen(link).read().decode("utf8"))