如何提取这些 'video_url'
How to extract these 'video_url'
这里是我提取的 text/javascript 代码。
此外,我想从这些脚本中提取 'video_id'、'video_url'、'video_alt_url' 的值!
"""{
video_id: '000101',
video_categories: 'Categorie01, Categorie02',
video_tags: 'Categorie01, Categorie02', license_code: '3825119921245', rnd: '1647426812',
video_url:'https://www.example.com/get_file/5/bb6a5e180f5037a3f348fbdee96a8c6f681c4c0bab/107000/107389/107389.mp4/?br=709',
postfix: '.mp4',
video_url_text: '480p',
video_alt_url:'https://www.example.com/get_file/5/47601c7136bcbe38e6eb0b2cfa04dd9d917aa6263b/107000/107389/107389_720p.mp4/?br=1243',
video_alt_url_text: '720p',
video_alt_url_hd: '1',
preview_url: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview.jpg',
preview_url1:'https://www.example.com/contents/videos_screenshots/107000/107389/preview.mp4.jpg',
preview_height1: '480',
preview_url2:'https://www.example.com/contents/videos_screenshots/107000/107389/preview_720p.mp4.jpg',
preview_height2: '720',
skin: 'youtube.css',
logo_position: '0,0',
logo_anchor: 'topleft',
hide_controlbar: '1',
hide_style: 'fade',
volume: '1',
related_src: 'https://www.example.com/related_videos_html/107389/', adv_pre_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}',
adv_pre_skip_duration: '5',
adv_pre_skip_text_time: 'Skip ad in %time',
adv_pre_skip_text: 'Skip ad',
adv_post_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}',
adv_post_skip_duration: '5',
adv_post_skip_text_time: 'Skip ad in %time',
adv_post_skip_text: 'Skip ad',
lrcv: '1651572296480833989009946',
vast_timeout1: '10',
player_width: '882',
player_height: '496.9014084507',
embed: '1'
}"""
试试这个,
import demjson
str_var= """{
video_id: '000101',
video_categories: 'Categorie01, Categorie02',
video_tags: 'Categorie01, Categorie02', license_code: '153818112155303616.00Rs', rnd: '1647426812',
video_url: 'https://www.example.com/get_file/5/bb6a5e180f5037a3f348fbdee96a8c6f681c4c0bab/107000/107389/107389.mp4/?br=709',
postfix: '.mp4',
video_url_text: '480p',
video_alt_url: 'https://www.example.com/get_file/5/47601c7136bcbe38e6eb0b2cfa04dd9d917aa6263b/107000/107389/107389_720p.mp4/?br=1243', video_alt_url_text: '720p', video_alt_url_hd: '1', preview_url: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview.jpg',
preview_url1: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview.mp4.jpg',
preview_height1: '480',
preview_url2: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview_720p.mp4.jpg',
preview_height2: '720',
skin: 'youtube.css',
logo_position: '0,0',
logo_anchor: 'topleft',
hide_controlbar: '1', hide_style: 'fade', volume: '1',
related_src: 'https://www.example.com/related_videos_html/107389/', adv_pre_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}', adv_pre_skip_duration: '5', adv_pre_skip_text_time: 'Skip ad in %time', adv_pre_skip_text: 'Skip ad', adv_post_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}',
adv_post_skip_duration: '5',
adv_post_skip_text_time: 'Skip ad in %time', adv_post_skip_text: 'Skip ad',
lrcv: '1651572296480833989009946', vast_timeout1: '10',
player_width: '882',
player_height: '496.9014084507',
embed: '1'
}"""
# In the page the player config appears as `var flashvars = {...};`.
# Keep only the object literal. If the marker is missing (for example when
# str_var already holds the bare object) use the text as-is instead of
# failing with IndexError on split(...)[1].
_, marker, tail = str_var.partition('var flashvars = ')
script_var = tail if marker else str_var
# Cut right after the closing brace of the literal. str.find returns -1 when
# '};' is absent, which would otherwise slice the text to an empty string.
end = script_var.find('};')
if end == -1:
    end = script_var.rfind('}')
if end == -1:
    raise ValueError("no object literal found in the script text")
script_var = script_var[:end + 1]
# The literal has unquoted keys and single-quoted strings, which the stdlib
# json module rejects; demjson is used to decode it instead.
data = demjson.decode(script_var)
video_url = data['video_url']
video_alt_url = data['video_alt_url']
print(video_url)
print(video_alt_url)
安装 demjson
pip install demjson
不确定你为什么没有附上 URL,也没有给出你是如何提取这段代码的(也许有更简单的方法可以拿到这些数据)。
你可以先拿到这个 JavaScript 对象字面量,再用正则表达式处理一下,使它成为 ast.literal_eval() 能够解析的合法 Python 字面量。
这种做法不算健壮,但确实有效:
import ast
import re

# JavaScript object literal as it appears in the page script: bare
# (unquoted) keys and single-quoted string values.
js_obj = '''{
video_id: '000101',
video_categories: 'Categorie01, Categorie02',
video_tags: 'Categorie01, Categorie02', license_code: '3825119921245', rnd: '1647426812',
video_url: 'https://www.example.com/get_file/5/bb6a5e180f5037a3f348fbdee96a8c6f681c4c0bab/107000/107389/107389.mp4/?br=709',
postfix: '.mp4',
video_url_text: '480p',
video_alt_url: 'https://www.example.com/get_file/5/47601c7136bcbe38e6eb0b2cfa04dd9d917aa6263b/107000/107389/107389_720p.mp4/?br=1243', video_alt_url_text: '720p', video_alt_url_hd: '1', preview_url: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview.jpg',
preview_url1: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview.mp4.jpg',
preview_height1: '480',
preview_url2: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview_720p.mp4.jpg',
preview_height2: '720',
skin: 'youtube.css',
logo_position: '0,0',
logo_anchor: 'topleft',
hide_controlbar: '1', hide_style: 'fade', volume: '1',
related_src: 'https://www.example.com/related_videos_html/107389/', adv_pre_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}', adv_pre_skip_duration: '5', adv_pre_skip_text_time: 'Skip ad in %time', adv_pre_skip_text: 'Skip ad', adv_post_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}',
adv_post_skip_duration: '5',
adv_post_skip_text_time: 'Skip ad in %time', adv_post_skip_text: 'Skip ad',
lrcv: '1651572296480833989009946', vast_timeout1: '10',
player_width: '882',
player_height: '496.9014084507',
embed: '1'
}'''

# Step 1: temporarily strip the opening quote and colon from every URL so the
# colon after the scheme is not mistaken for a key/value separator below.
js_obj = js_obj.replace("'https:", 'https')
# Step 2: wrap every bare key in quotes. The replacement has to be a raw
# string: in a normal literal "\1" is the control character chr(1), not a
# backreference to group 1, and every key would be overwritten with it.
# \w+ (rather than [\d\w]*) also avoids matching an empty key.
js_obj = re.sub(r'(\w+):', r"'\1':", js_obj)
# Step 3: put the opening quote and scheme colon back on the URLs.
js_obj = js_obj.replace("https", "'https:")
# The text is now a valid Python dict literal; literal_eval parses it without
# executing arbitrary code.
py_obj = ast.literal_eval(js_obj)
输出:
# Show the three requested fields from the parsed dict.
print(py_obj['video_id'])
print(py_obj['video_url'])
print(py_obj['video_alt_url'])
000101
https://www.example.com/get_file/5/bb6a5e180f5037a3f348fbdee96a8c6f681c4c0bab/107000/107389/107389.mp4/?br=709
https://www.example.com/get_file/5/47601c7136bcbe38e6eb0b2cfa04dd9d917aa6263b/107000/107389/107389_720p.mp4/?br=1243
这里是我提取的 text/javascript 代码。
此外,我想从这些脚本中提取 'video_id'、'video_url'、'video_alt_url' 的值!
"""{
video_id: '000101',
video_categories: 'Categorie01, Categorie02',
video_tags: 'Categorie01, Categorie02', license_code: '3825119921245', rnd: '1647426812',
video_url:'https://www.example.com/get_file/5/bb6a5e180f5037a3f348fbdee96a8c6f681c4c0bab/107000/107389/107389.mp4/?br=709',
postfix: '.mp4',
video_url_text: '480p',
video_alt_url:'https://www.example.com/get_file/5/47601c7136bcbe38e6eb0b2cfa04dd9d917aa6263b/107000/107389/107389_720p.mp4/?br=1243',
video_alt_url_text: '720p',
video_alt_url_hd: '1',
preview_url: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview.jpg',
preview_url1:'https://www.example.com/contents/videos_screenshots/107000/107389/preview.mp4.jpg',
preview_height1: '480',
preview_url2:'https://www.example.com/contents/videos_screenshots/107000/107389/preview_720p.mp4.jpg',
preview_height2: '720',
skin: 'youtube.css',
logo_position: '0,0',
logo_anchor: 'topleft',
hide_controlbar: '1',
hide_style: 'fade',
volume: '1',
related_src: 'https://www.example.com/related_videos_html/107389/', adv_pre_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}',
adv_pre_skip_duration: '5',
adv_pre_skip_text_time: 'Skip ad in %time',
adv_pre_skip_text: 'Skip ad',
adv_post_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}',
adv_post_skip_duration: '5',
adv_post_skip_text_time: 'Skip ad in %time',
adv_post_skip_text: 'Skip ad',
lrcv: '1651572296480833989009946',
vast_timeout1: '10',
player_width: '882',
player_height: '496.9014084507',
embed: '1'
}"""
试试这个,
import demjson
str_var= """{
video_id: '000101',
video_categories: 'Categorie01, Categorie02',
video_tags: 'Categorie01, Categorie02', license_code: '153818112155303616.00Rs', rnd: '1647426812',
video_url: 'https://www.example.com/get_file/5/bb6a5e180f5037a3f348fbdee96a8c6f681c4c0bab/107000/107389/107389.mp4/?br=709',
postfix: '.mp4',
video_url_text: '480p',
video_alt_url: 'https://www.example.com/get_file/5/47601c7136bcbe38e6eb0b2cfa04dd9d917aa6263b/107000/107389/107389_720p.mp4/?br=1243', video_alt_url_text: '720p', video_alt_url_hd: '1', preview_url: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview.jpg',
preview_url1: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview.mp4.jpg',
preview_height1: '480',
preview_url2: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview_720p.mp4.jpg',
preview_height2: '720',
skin: 'youtube.css',
logo_position: '0,0',
logo_anchor: 'topleft',
hide_controlbar: '1', hide_style: 'fade', volume: '1',
related_src: 'https://www.example.com/related_videos_html/107389/', adv_pre_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}', adv_pre_skip_duration: '5', adv_pre_skip_text_time: 'Skip ad in %time', adv_pre_skip_text: 'Skip ad', adv_post_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}',
adv_post_skip_duration: '5',
adv_post_skip_text_time: 'Skip ad in %time', adv_post_skip_text: 'Skip ad',
lrcv: '1651572296480833989009946', vast_timeout1: '10',
player_width: '882',
player_height: '496.9014084507',
embed: '1'
}"""
# In the page the player config appears as `var flashvars = {...};`.
# Keep only the object literal. If the marker is missing (for example when
# str_var already holds the bare object) use the text as-is instead of
# failing with IndexError on split(...)[1].
_, marker, tail = str_var.partition('var flashvars = ')
script_var = tail if marker else str_var
# Cut right after the closing brace of the literal. str.find returns -1 when
# '};' is absent, which would otherwise slice the text to an empty string.
end = script_var.find('};')
if end == -1:
    end = script_var.rfind('}')
if end == -1:
    raise ValueError("no object literal found in the script text")
script_var = script_var[:end + 1]
# The literal has unquoted keys and single-quoted strings, which the stdlib
# json module rejects; demjson is used to decode it instead.
data = demjson.decode(script_var)
video_url = data['video_url']
video_alt_url = data['video_alt_url']
print(video_url)
print(video_alt_url)
安装 demjson
pip install demjson
不确定你为什么没有附上 URL,也没有给出你是如何提取这段代码的(也许有更简单的方法可以拿到这些数据)。
你可以先拿到这个 JavaScript 对象字面量,再用正则表达式处理一下,使它成为 ast.literal_eval() 能够解析的合法 Python 字面量。
这种做法不算健壮,但确实有效:
import ast
import re

# JavaScript object literal as it appears in the page script: bare
# (unquoted) keys and single-quoted string values.
js_obj = '''{
video_id: '000101',
video_categories: 'Categorie01, Categorie02',
video_tags: 'Categorie01, Categorie02', license_code: '3825119921245', rnd: '1647426812',
video_url: 'https://www.example.com/get_file/5/bb6a5e180f5037a3f348fbdee96a8c6f681c4c0bab/107000/107389/107389.mp4/?br=709',
postfix: '.mp4',
video_url_text: '480p',
video_alt_url: 'https://www.example.com/get_file/5/47601c7136bcbe38e6eb0b2cfa04dd9d917aa6263b/107000/107389/107389_720p.mp4/?br=1243', video_alt_url_text: '720p', video_alt_url_hd: '1', preview_url: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview.jpg',
preview_url1: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview.mp4.jpg',
preview_height1: '480',
preview_url2: 'https://www.example.com/contents/videos_screenshots/107000/107389/preview_720p.mp4.jpg',
preview_height2: '720',
skin: 'youtube.css',
logo_position: '0,0',
logo_anchor: 'topleft',
hide_controlbar: '1', hide_style: 'fade', volume: '1',
related_src: 'https://www.example.com/related_videos_html/107389/', adv_pre_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}', adv_pre_skip_duration: '5', adv_pre_skip_text_time: 'Skip ad in %time', adv_pre_skip_text: 'Skip ad', adv_post_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}',
adv_post_skip_duration: '5',
adv_post_skip_text_time: 'Skip ad in %time', adv_post_skip_text: 'Skip ad',
lrcv: '1651572296480833989009946', vast_timeout1: '10',
player_width: '882',
player_height: '496.9014084507',
embed: '1'
}'''

# Step 1: temporarily strip the opening quote and colon from every URL so the
# colon after the scheme is not mistaken for a key/value separator below.
js_obj = js_obj.replace("'https:", 'https')
# Step 2: wrap every bare key in quotes. The replacement has to be a raw
# string: in a normal literal "\1" is the control character chr(1), not a
# backreference to group 1, and every key would be overwritten with it.
# \w+ (rather than [\d\w]*) also avoids matching an empty key.
js_obj = re.sub(r'(\w+):', r"'\1':", js_obj)
# Step 3: put the opening quote and scheme colon back on the URLs.
js_obj = js_obj.replace("https", "'https:")
# The text is now a valid Python dict literal; literal_eval parses it without
# executing arbitrary code.
py_obj = ast.literal_eval(js_obj)
输出:
# Show the three requested fields from the parsed dict.
print(py_obj['video_id'])
print(py_obj['video_url'])
print(py_obj['video_alt_url'])
000101
https://www.example.com/get_file/5/bb6a5e180f5037a3f348fbdee96a8c6f681c4c0bab/107000/107389/107389.mp4/?br=709
https://www.example.com/get_file/5/47601c7136bcbe38e6eb0b2cfa04dd9d917aa6263b/107000/107389/107389_720p.mp4/?br=1243