如何提取 <script> 标签内的链接

How to extract a link inside a <script> tag

我正在尝试从 TikTok 声音页面中获取 .mp3 文件的链接。问题是我无法提取它,因为它位于 <script> 标签内。
我使用的是 pycurl,而不是 requests。

All I need is to extract this from the response and then extract the URL from "UrlList": "playUrl":{"Uri":"musically-maliva-obj/7038595527327419141.mp3","UrlList":["https://sf16-ies-music-va.tiktokcdn.com/obj/musically-maliva-obj/7038595527327419141.mp3"]}

import pycurl
from io import BytesIO
import certifi
from bs4 import BeautifulSoup


# Fetch the TikTok page with pycurl and look for the <script> element that
# is expected to hold the sound's play URL.
url = "https://vm.tiktok.com/ZML1t1vW7/"
buffer = BytesIO()
c = pycurl.Curl()
try:
    # Keep certificate verification switched on so the certifi CA bundle is
    # actually used; disabling it would accept any certificate.
    c.setopt(pycurl.CAINFO, certifi.where())
    c.setopt(pycurl.SSL_VERIFYPEER, 1)
    c.setopt(pycurl.SSL_VERIFYHOST, 2)
    c.setopt(c.URL, url)
    c.setopt(pycurl.HTTPHEADER, ["User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"])
    c.setopt(c.WRITEDATA, buffer)
    # Follow HTTP redirects instead of stopping at the first response.
    c.setopt(c.FOLLOWLOCATION, True)
    c.perform()
finally:
    # Release the curl handle even when the transfer fails.
    c.close()

body = buffer.getvalue()
response = body.decode('utf-8')
# Splitting on single quotes yields a list, but it is unclear how to search
# that list for the URL.
a = response.split("'")
soup = BeautifulSoup(response, 'html.parser')  # response is a str
# Attempt with BeautifulSoup; this lookup did not produce the wanted result.
link = soup.find("script", id="sigi-persisted-data")
print(link)

您可以使用正则表达式模式:

import re
...

# Non-greedy capture: `.*` would run on to the LAST double quote on the same
# line, returning far more than the URL; `.*?` stops at the first closing quote.
match = re.search(r'"playUrl":"(.*?)"', str(soup))
if match:
    print(match.group(1))
else:
    # re.search returns None when the pattern is absent; report that instead
    # of failing with AttributeError on `.group`.
    print("playUrl not found in the page")

您可以尝试提取 JSON 数据,将其解析为字典,然后在字典中逐层索引以获取数据 (json_data["/music/*-:id"]["musicData"]["playUrl"]["UrlList"][0]):

import pycurl
from io import BytesIO
import certifi
from bs4 import BeautifulSoup
import re
import json


# Fetch the TikTok page, locate the <script> that assigns
# window.__INIT_PROPS__, parse its JSON payload and print the first play URL.
url = "https://vm.tiktok.com/ZML1t1vW7/"
buffer = BytesIO()
c = pycurl.Curl()
try:
    # Keep certificate verification switched on so the certifi CA bundle is
    # actually used; disabling it would accept any certificate.
    c.setopt(pycurl.CAINFO, certifi.where())
    c.setopt(pycurl.SSL_VERIFYPEER, 1)
    c.setopt(pycurl.SSL_VERIFYHOST, 2)
    c.setopt(c.URL, url)
    c.setopt(pycurl.HTTPHEADER, ["User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"])
    c.setopt(c.WRITEDATA, buffer)
    # Follow HTTP redirects instead of stopping at the first response.
    c.setopt(c.FOLLOWLOCATION, True)
    c.perform()
finally:
    # Release the curl handle even when the transfer fails.
    c.close()

body = buffer.getvalue()
response = body.decode('utf-8')
soup = BeautifulSoup(response, 'html.parser')

# Compiled once, outside the loop. The dots in the variable name are escaped
# so they match literally, and DOTALL lets the capture span line breaks.
init_props_re = re.compile(
    r'<script>window\.__INIT_PROPS__ = (.*)</script>', re.DOTALL
)

for s in soup.find_all("script"):
    res = init_props_re.search(str(s))
    if not res:
        continue
    json_data = json.loads(res.group(1))
    try:
        print(json_data["/music/*-:id"]["musicData"]["playUrl"]["UrlList"][0])
    except (KeyError, IndexError, TypeError):
        # The payload was found but does not have the expected layout.
        print("playUrl not found in window.__INIT_PROPS__")
    break
else:
    # The loop finished without hitting `break`: no script tag matched.
    print("window.__INIT_PROPS__ script not found")