如何提取这些 'video_url'

How to extract these 'video_url'

这里是我提取的 text/javascript 代码。

此外,我想从这些脚本中提取 'video_id'、'video_url'、'video_alt_url' 的值!

"""{                                                                                                                                   
    video_id: '000101',
    video_categories: 'Categorie01, Categorie02',
    video_tags: 'Categorie01, Categorie02',                                                                                                                                        license_code: '3825119921245',                                                                                                                                       rnd: '1647426812',
    video_url:'https://www.example.com/get_file/5/bb6a5e180f5037a3f348fbdee96a8c6f681c4c0bab/107000/107389/107389.mp4/?br=709',
    postfix: '.mp4',
    video_url_text: '480p',
    video_alt_url:'https://www.example.com/get_file/5/47601c7136bcbe38e6eb0b2cfa04dd9d917aa6263b/107000/107389/107389_720p.mp4/?br=1243',
    video_alt_url_text: '720p',
    video_alt_url_hd: '1',
    preview_url: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview.jpg',
    preview_url1:'https://www.example.com/contents/videos_screenshots/107000/107389/preview.mp4.jpg',
    preview_height1: '480',
    preview_url2:'https://www.example.com/contents/videos_screenshots/107000/107389/preview_720p.mp4.jpg',
    preview_height2: '720',
    skin: 'youtube.css',
    logo_position: '0,0',
    logo_anchor: 'topleft',
    hide_controlbar: '1',
    hide_style: 'fade',
    volume: '1',
    related_src: 'https://www.example.com/related_videos_html/107389/',                                                                                                                                  adv_pre_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}',
    adv_pre_skip_duration: '5',
    adv_pre_skip_text_time: 'Skip ad in %time',
    adv_pre_skip_text: 'Skip ad',
    adv_post_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}',
    adv_post_skip_duration: '5',
    adv_post_skip_text_time: 'Skip ad in %time',
    adv_post_skip_text: 'Skip ad',
    lrcv: '1651572296480833989009946',
    vast_timeout1: '10',
    player_width: '882',
    player_height: '496.9014084507',
    embed: '1'
}"""

试试这个:

import demjson

str_var= """{
                                                                                                                                        video_id: '000101',
                                                                                                                                video_categories: 'Categorie01, Categorie02',
                                                                                                                                        video_tags: 'Categorie01, Categorie02',                                                                                                                                        license_code: '153818112155303616.00Rs',                                                                                                                                       rnd: '1647426812',
                                                                                                                                video_url: 'https://www.example.com/get_file/5/bb6a5e180f5037a3f348fbdee96a8c6f681c4c0bab/107000/107389/107389.mp4/?br=709',
                                                                postfix: '.mp4',
                                                        video_url_text: '480p',
                                                        video_alt_url: 'https://www.example.com/get_file/5/47601c7136bcbe38e6eb0b2cfa04dd9d917aa6263b/107000/107389/107389_720p.mp4/?br=1243',                                                                                                                                       video_alt_url_text: '720p',                                                                                                                                     video_alt_url_hd: '1',                                                                                                                                  preview_url: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview.jpg',
                                        preview_url1: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview.mp4.jpg',
                                                                                                                preview_height1: '480',
                                                                                                                preview_url2: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview_720p.mp4.jpg',
                        preview_height2: '720',
                        skin: 'youtube.css',
                logo_position: '0,0',
        logo_anchor: 'topleft',
        hide_controlbar: '1',                                                                                                                                   hide_style: 'fade',                                                                                                                                     volume: '1',
                                                                                                                                related_src: 'https://www.example.com/related_videos_html/107389/',                                                                                                                                  adv_pre_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}',                                                                                                                                   adv_pre_skip_duration: '5',                                                                                                                                     adv_pre_skip_text_time: 'Skip ad in %time',                                                                                                                                     adv_pre_skip_text: 'Skip ad',                                                                                                                                   adv_post_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}',
                                                                                                                                adv_post_skip_duration: '5',
                                                                                                                                adv_post_skip_text_time: 'Skip ad in %time',                                                                                                                                    adv_post_skip_text: 'Skip ad',
                                                                                                                                        lrcv: '1651572296480833989009946',                                                                                                                                      vast_timeout1: '10',
                                                                                                                                player_width: '882',
                                                                                                                        player_height: '496.9014084507',
                                                                                                                                embed: '1'
                                                                                                }"""

# Isolate the JavaScript object literal before decoding it.
# When the text is a whole <script> body, the literal follows
# "var flashvars = " and ends at "};".  When it is already the bare literal
# (as str_var is above), neither marker is present: indexing the result of
# split() would raise IndexError and find() would return -1, so fall back to
# the text itself and its last closing brace.
_before, _marker, _after = str_var.partition('var flashvars = ')
script_var = _after if _marker else str_var
_end = script_var.find('};')
if _end == -1:
    _end = script_var.rfind('}')
if _end == -1:
    raise ValueError("no '{...}' object literal found in str_var")
script_var = script_var[:_end + 1]

# NOTE(review): relies on demjson's default non-strict decoding to accept
# unquoted keys and single-quoted strings -- confirm against the installed
# demjson version.
data = demjson.decode(script_var)

video_url = data['video_url']
video_alt_url = data['video_alt_url']

print(video_url)
print(video_alt_url)

安装 demjson

pip install demjson

不确定你为什么没有给出页面的 URL,也没有贴出你是如何提取这段脚本的那部分代码(也许有更简单的方法来获取这些数据)。

你可以取出这个 JavaScript 对象字面量,用正则表达式给键名加上引号,使其成为 ast.literal_eval() 能够解析的合法 Python 字面量。这种做法不算健壮,但可以用:

js_obj = '''{
                                                                                                                                        video_id: '000101',
                                                                                                                                video_categories: 'Categorie01, Categorie02',
                                                                                                                                        video_tags: 'Categorie01, Categorie02',                                                                                                                                        license_code: '3825119921245',                                                                                                                                       rnd: '1647426812',
                                                                                                                                video_url: 'https://www.example.com/get_file/5/bb6a5e180f5037a3f348fbdee96a8c6f681c4c0bab/107000/107389/107389.mp4/?br=709',
                                                                postfix: '.mp4',
                                                        video_url_text: '480p',
                                                        video_alt_url: 'https://www.example.com/get_file/5/47601c7136bcbe38e6eb0b2cfa04dd9d917aa6263b/107000/107389/107389_720p.mp4/?br=1243',                                                                                                                                       video_alt_url_text: '720p',                                                                                                                                     video_alt_url_hd: '1',                                                                                                                                  preview_url: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview.jpg',
                                        preview_url1: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview.mp4.jpg',
                                                                                                                preview_height1: '480',
                                                                                                                preview_url2: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview_720p.mp4.jpg',
                        preview_height2: '720',
                        skin: 'youtube.css',
                logo_position: '0,0',
        logo_anchor: 'topleft',
        hide_controlbar: '1',                                                                                                                                   hide_style: 'fade',                                                                                                                                     volume: '1',
                                                                                                                                related_src: 'https://www.example.com/related_videos_html/107389/',                                                                                                                                  adv_pre_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}',                                                                                                                                   adv_pre_skip_duration: '5',                                                                                                                                     adv_pre_skip_text_time: 'Skip ad in %time',                                                                                                                                     adv_pre_skip_text: 'Skip ad',                                                                                                                                   adv_post_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}',
                                                                                                                                adv_post_skip_duration: '5',
                                                                                                                                adv_post_skip_text_time: 'Skip ad in %time',                                                                                                                                    adv_post_skip_text: 'Skip ad',
                                                                                                                                        lrcv: '1651572296480833989009946',                                                                                                                                      vast_timeout1: '10',
                                                                                                                                player_width: '882',
                                                                                                                        player_height: '496.9014084507',
                                                                                                                                embed: '1'
                                                                                                }'''


import ast
import re


# Quote the bare JavaScript keys so the text becomes a valid Python dict
# literal.  A key is a run of word characters followed by ":" and then the
# opening quote of its value; the look-ahead keeps the "https:" inside URL
# values (followed by "/", not a quote) from being mistaken for a key, so no
# temporary rewriting of the URLs is needed.
# The replacement must be a raw string: in a normal string "\1" is the
# control character U+0001, not a back-reference to group 1.
# This is a heuristic: a value that itself contains  word: '  would be
# rewritten too.
js_obj = re.sub(r"(\w+)\s*:(?=\s*')", r"'\1':", js_obj)

# literal_eval only evaluates Python literals, so nothing from the scraped
# text is executed as code.
py_obj = ast.literal_eval(js_obj)

输出:

# Print the three fields the question asked for, one per line, in order.
for field in ('video_id', 'video_url', 'video_alt_url'):
    print(py_obj[field])

000101
https://www.example.com/get_file/5/bb6a5e180f5037a3f348fbdee96a8c6f681c4c0bab/107000/107389/107389.mp4/?br=709
https://www.example.com/get_file/5/47601c7136bcbe38e6eb0b2cfa04dd9d917aa6263b/107000/107389/107389_720p.mp4/?br=1243