BeautifulSoup / 从脚本标签获取内容?
BeautifulSoup / Get content from script tag?
我想从此标签中抓取流派和艺术家信息:
<script type="text/javascript">
window.rtkGPTSlotsTargeting = [
[
["genre", "pop"],
["artist", "a1"]
]
];
</script>
我用这段代码试过了:
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
import json
# Page to scrape.
link = "https://www.azlyrics.com/lyrics/a1/foreverinlove.html"
# Send the request with a User-Agent string supplied by fake_useragent.
ua = UserAgent()
userAgent = ua.random
HEADERS = {"User-Agent": userAgent}
page = requests.get (link, headers=HEADERS)
soup = BeautifulSoup (page.content, "html.parser")
# find() returns only the first <script> element in the document.
tmpScript = soup.find("script")
# This is the call that raises JSONDecodeError in the traceback: the script
# body is a JavaScript statement (`window.rtkGPTSlotsTargeting = [...];`),
# not a bare JSON document, so it cannot be decoded as-is.
data = json.loads(tmpScript.string)
print(data)
但我总是得到这个错误:
$ python collLyrics.py
Traceback (most recent call last):
File "C:\Users\Polzi\Documents\DEV\Fiverr\TRY\kushabateni\collLyrics.py", line 14, in <module>
data = json.loads(tmpScript.string)
File "c:\users\polzi\appdata\local\programs\python\python39\lib\json\__init__.py", line 346, in loads
return _default_decoder.decode(s)
File "c:\users\polzi\appdata\local\programs\python\python39\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "c:\users\polzi\appdata\local\programs\python\python39\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 2 column 1 (char 2)
(NormalScraping)
如何从标签中获取这 2 条信息?
您可以使用正则表达式模式来查找正确的数据:
\[.*\]
它会在每一行中匹配方括号内的文本(`.` 默认不匹配换行符):
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
import re
link = "https://www.azlyrics.com/lyrics/a1/foreverinlove.html"

# Send the request with a User-Agent string supplied by fake_useragent.
ua = UserAgent()
userAgent = ua.random
HEADERS = {"User-Agent": userAgent}
page = requests.get(link, headers=HEADERS)

soup = BeautifulSoup(page.content, "html.parser")
# find() returns only the first <script> element in the document.
tmpScript = soup.find("script")

# Without re.S, "." does not match a newline, so every match is confined to
# a single line of the script text, i.e. one ["key", "value"] pair each.
for tag in re.findall(r"(\[.*\])", tmpScript.string):
    print(tag)
输出:
["genre", "pop"]
["artist", "a1"]
有点“奇怪”的解决方案,但仍能获得所需的输出。
import json
import re
import requests
from bs4 import BeautifulSoup
# Extract the array literal from the JavaScript assignment and decode it as
# JSON.  re.S lets "." span newlines, and the greedy ".*" makes the capture
# run from just after the first "= [" up to the last "]" in the script text.
data = (
    json.loads(
        re.search(
            r"= \[(.*)\]",
            BeautifulSoup(
                requests.get(
                    "https://www.azlyrics.com/lyrics/a1/foreverinlove.html"
                ).content,
                "html.parser"
            )
            .find("script")
            .string,
            re.S
        ).group(1)
    )
)
# Pretty-print the decoded nested list.
print(json.dumps(data, indent=2))
输出:
[
[
"genre",
"pop"
],
[
"artist",
"a1"
]
]
使用正则表达式模式中的命名组作为另一个例子
from fake_useragent import UserAgent
import requests
import re
link = "https://www.azlyrics.com/lyrics/a1/foreverinlove.html"

# Send the request with a User-Agent string supplied by fake_useragent.
ua = UserAgent()
userAgent = ua.random
HEADERS = {"User-Agent": userAgent}
page = requests.get(link, headers=HEADERS)

# Search the raw response text directly (no HTML parsing).  Each named group
# lazily captures the text after a key, skipping quotes/commas/whitespace,
# up to the next double quote; re.S lets ".*?" cross the line break between
# the "genre" and "artist" entries.
match = re.search(
    r'genre["\s,]+(?P<genre>(.*?))".*?artist["\s,]+(?P<artist>(.*?))"',
    page.text,
    flags=re.S,
)
for target in ['genre', 'artist']:
    print(f'{target} = {match.group(target)}')
解释:
我想从此标签中抓取流派和艺术家信息:
<script type="text/javascript">
window.rtkGPTSlotsTargeting = [
[
["genre", "pop"],
["artist", "a1"]
]
];
</script>
我用这段代码试过了:
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
import json
# Page to scrape.
link = "https://www.azlyrics.com/lyrics/a1/foreverinlove.html"
# Send the request with a User-Agent string supplied by fake_useragent.
ua = UserAgent()
userAgent = ua.random
HEADERS = {"User-Agent": userAgent}
page = requests.get (link, headers=HEADERS)
soup = BeautifulSoup (page.content, "html.parser")
# find() returns only the first <script> element in the document.
tmpScript = soup.find("script")
# This is the call that raises JSONDecodeError in the traceback: the script
# body is a JavaScript statement (`window.rtkGPTSlotsTargeting = [...];`),
# not a bare JSON document, so it cannot be decoded as-is.
data = json.loads(tmpScript.string)
print(data)
但我总是得到这个错误:
$ python collLyrics.py
Traceback (most recent call last):
File "C:\Users\Polzi\Documents\DEV\Fiverr\TRY\kushabateni\collLyrics.py", line 14, in <module>
data = json.loads(tmpScript.string)
File "c:\users\polzi\appdata\local\programs\python\python39\lib\json\__init__.py", line 346, in loads
return _default_decoder.decode(s)
File "c:\users\polzi\appdata\local\programs\python\python39\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "c:\users\polzi\appdata\local\programs\python\python39\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 2 column 1 (char 2)
(NormalScraping)
如何从标签中获取这 2 条信息?
您可以使用正则表达式模式来查找正确的数据:
\[.*\]
它会在每一行中匹配方括号内的文本(`.` 默认不匹配换行符):
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
import re
link = "https://www.azlyrics.com/lyrics/a1/foreverinlove.html"

# Send the request with a User-Agent string supplied by fake_useragent.
ua = UserAgent()
userAgent = ua.random
HEADERS = {"User-Agent": userAgent}
page = requests.get(link, headers=HEADERS)

soup = BeautifulSoup(page.content, "html.parser")
# find() returns only the first <script> element in the document.
tmpScript = soup.find("script")

# Without re.S, "." does not match a newline, so every match is confined to
# a single line of the script text, i.e. one ["key", "value"] pair each.
for tag in re.findall(r"(\[.*\])", tmpScript.string):
    print(tag)
输出:
["genre", "pop"]
["artist", "a1"]
有点“奇怪”的解决方案,但仍能获得所需的输出。
import json
import re
import requests
from bs4 import BeautifulSoup
# Extract the array literal from the JavaScript assignment and decode it as
# JSON.  re.S lets "." span newlines, and the greedy ".*" makes the capture
# run from just after the first "= [" up to the last "]" in the script text.
data = (
    json.loads(
        re.search(
            r"= \[(.*)\]",
            BeautifulSoup(
                requests.get(
                    "https://www.azlyrics.com/lyrics/a1/foreverinlove.html"
                ).content,
                "html.parser"
            )
            .find("script")
            .string,
            re.S
        ).group(1)
    )
)
# Pretty-print the decoded nested list.
print(json.dumps(data, indent=2))
输出:
[
[
"genre",
"pop"
],
[
"artist",
"a1"
]
]
使用正则表达式模式中的命名组作为另一个例子
from fake_useragent import UserAgent
import requests
import re
link = "https://www.azlyrics.com/lyrics/a1/foreverinlove.html"

# Send the request with a User-Agent string supplied by fake_useragent.
ua = UserAgent()
userAgent = ua.random
HEADERS = {"User-Agent": userAgent}
page = requests.get(link, headers=HEADERS)

# Search the raw response text directly (no HTML parsing).  Each named group
# lazily captures the text after a key, skipping quotes/commas/whitespace,
# up to the next double quote; re.S lets ".*?" cross the line break between
# the "genre" and "artist" entries.
match = re.search(
    r'genre["\s,]+(?P<genre>(.*?))".*?artist["\s,]+(?P<artist>(.*?))"',
    page.text,
    flags=re.S,
)
for target in ['genre', 'artist']:
    print(f'{target} = {match.group(target)}')
解释: