如何抓取 Google 视频页面的缩略图?

How to scrape google videos page thumbnails?

我目前正在做一个抓取项目,需要抓取 Google 视频搜索页面。但当我发现指向 YouTube 的视频的缩略图需要通过 ytimg 来获取时,我遇到了一个问题。

我正在使用 cheerio 进行解析,这是我的代码:

        // For every video result, store the src of its thumbnail <img>
        // (on this page the src is a base64 image, not a regular URL).
        $('.G6SP0b').each((index, element) => {
            const thumbnail = $(element).find('.h1hFNe');
            img[index] = thumbnail.attr('src');
        });

在这段代码中,我从 Google 视频页面提取每个缩略图,它们都是 gif/base64 格式;但我希望对于每个指向 YouTube 的视频,把其缩略图对应的 jpg 格式 URL 存入图像数组。

请更正我的代码,或建议我任何其他转换方式。

所以我就用了这个方法,我抓取了那个 Youtube 视频的 URL 并用这个 URL 来获取缩略图:

http://img.youtube.com/vi/videoID/mqdefault.jpg

这是我的代码:

        // Resolve the page a Google result anchor points to. Result hrefs are
        // redirect links such as "/url?q=<percent-encoded target>&sa=...", so the
        // target is read from the (automatically decoded) `q` parameter; an href
        // without `q` is treated as the target itself. Returns null when the href
        // cannot be parsed as a URL.
        const resolveTarget = (href) => {
            try {
                const redirect = new URL(href, 'https://www.google.com');
                return new URL(redirect.searchParams.get('q') ?? redirect.href);
            } catch {
                return null;
            }
        };

        // Default thumbnail: whatever src the result's <img> carries
        // (a base64 image on this page).
        $('.G6SP0b').each((i, el) => {
            image[i] = $(el).find('.h1hFNe').attr('src');
        });

        // Store the decoded target link of every result and, for YouTube videos,
        // replace the thumbnail with the public mqdefault.jpg URL.
        $('.egMi0').each((i, el) => {
            const href = $(el).find('a').attr('href');
            const target = href ? resolveTarget(href) : null;
            if (target === null) {
                return; // no usable anchor in this result; keep the scraped src
            }
            link[i] = target.href;

            // Take the id from the `v` parameter instead of a fixed offset, so
            // extra parameters (e.g. "&vl=en-US") never leak into the thumbnail URL.
            const videoId = target.searchParams.get('v');
            if (target.hostname === 'www.youtube.com' && videoId) {
                image[i] = `http://img.youtube.com/vi/${videoId}/mqdefault.jpg`;
            }
        });

这就是我得到结果的方式:

"videoResults": [
{
  "thumbnail": "http://img.youtube.com/vi/ET0G1FYxWqc/mqdefault.jpg",
  "link": "https://www.youtube.com/watch?v=ET0G1FYxWqc"
},
{
  "thumbnail": "http://img.youtube.com/vi/-QXrYIHODzE%26vl%3Den-US/mqdefault.jpg",
  "link": "https://www.youtube.com/watch?v=-QXrYIHODzE%26vl%3Den-US"
},
{
  "thumbnail": "data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==",
  "link": "https://www.espn.com/nfl/story/_/id/33822691/nfl-draft-2022-national-title-super-bowl-winning-qb-record-draft-night-part-impressive-year-georgia-bulldogs"
},
]

使用 cheerio 从 Google 搜索中抓取图像时,不能直接从 img 标签中获取,因为默认情况下 src 属性只包含一个 1x1 的占位图,全分辨率图像由 JavaScript 加载。要找到全分辨率图像,需要从 script 标签中提取其 base64 编码的数据。请查看下面的代码以及在线 IDE 中的完整示例(full example in the online IDE)。

const cheerio = require("cheerio");
const axios = require("axios");

const searchQuery = "minecraft"; // what we want to search

const BASE_URL = "https://www.google.com/search";

// Request configuration shared by every call to Google.
const AXIOS_OPTIONS = {
  headers: {
    // adding the User-Agent header as one way to prevent the request from being blocked
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
  },
  params: {
    // The raw search string: axios percent-encodes `params` values itself, so
    // encoding it here first would double-encode queries containing spaces or symbols.
    q: searchQuery,
    tbm: "vid", // parameter defines the type of search you want to do ("vid" means videos)
    hl: "en", // parameter defines the language to use for the Google search
    gl: "us", // parameter defines the country to use for the Google search
  },
};

/**
 * Turns `\xNN` escapes that are still present as literal text (the page's
 * inline scripts write "=" as `\x3d`) back into the characters they stand for.
 *
 * @param {string} text - raw text captured from a script string literal
 * @returns {string} the text with every `\xNN` escape decoded
 */
function decodeHexEscapes(text) {
  return text.replace(/\\x([0-9a-fA-F]{2})/g, (_, hex) => String.fromCharCode(Number.parseInt(hex, 16)));
}

/**
 * Builds a lookup from <img> id to image source out of the raw HTML.
 * Inline base64 images take precedence over `"id":"link"` pairs, and the
 * first occurrence of an id wins.
 *
 * @param {string} html - the full response body
 * @returns {Map<string, string>} image source keyed by <img> id
 */
function indexImagesById(html) {
  const patternForBase64 = /s='(?<img>[^']+)';\w+\s\w+=\['(?<id>\w+_\d+)'];/gm; //https://regex101.com/r/pMd0yx/1
  const patternForLinks = /"(?<id>[^":]+)":"(?<link>[^"]+)"/gm; //https://regex101.com/r/p5nj8R/1

  const sources = new Map();
  for (const { groups } of html.matchAll(patternForBase64)) {
    if (!sources.has(groups.id)) {
      sources.set(groups.id, decodeHexEscapes(groups.img));
    }
  }
  for (const { groups } of html.matchAll(patternForLinks)) {
    if (!sources.has(groups.id)) {
      sources.set(groups.id, groups.link);
    }
  }
  return sources;
}

/**
 * Requests the Google videos result page and pairs every result link with
 * its thumbnail.
 *
 * @returns {Promise<Array<{link: (string|undefined), image: (string|undefined)}>>}
 *   one entry per result; `image` is undefined when no image is registered
 *   for the result's <img> id
 */
async function getVideosInfo() {
  const { data } = await axios(BASE_URL, AXIOS_OPTIONS);
  const $ = cheerio.load(data);
  const imageById = indexImagesById(data);

  return [...$(".dFd2Tb .dXiKIc")].map((el) => {
    const result = $(el);
    return {
      link: result.find("a").attr("href"),
      image: imageById.get(result.find("img").attr("id")),
    };
  });
}

// Print the scraped results; report a failed request or parse instead of
// leaving the rejection unhandled.
getVideosInfo()
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });

输出:

[
   {
      "link":"https://www.youtube.com/watch?v=MmB9b5njVbA",
      "image":"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBwgHBgkIBwgKCgkLDRYPDQwMDRsUFRAWIB0iIiAdHx8kKDQsJCYxJx8fLT0tMTU3Ojo6Iys/RD84QzQ5OjcBCgoKDQwNGg8PGjclHyU3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3N//AABEIAGQAsgMBIgACEQEDEQH/xAAbAAACAwEBAQAAAAAAAAAAAAADBAABAgUGB//EADgQAAIBAwMCBAMFBwQDAAAAAAECEQADIQQSMUFREyJhcQWBkTJCsdHwFCNScqHB8TNikuEVJIL/xAAYAQADAQEAAAAAAAAAAAAAAAAAAQIDBP/EACARAAIDAQEAAgMBAAAAAAAAAAABAhEhEjFBUQMTYXH/2gAMAwEAAhEDEQA/APlF42BfYi1CuSeeKzeQiNpEdCaxcRmYguoFb0v29u8kjviuf+mPxZfmaCW3GqKQ32AZ7imtieKGuOFcYxGfemr2hF+xutZecuTkfKl2kCkl6KWLV11J8JlRRk7YBrNvQXJhmVV6TTw1GoVLVm4jlRhmC4J71ztax8Rgu8rPlmht3SBt3h0LXwUMfNeBEbgVrep0NmyguSbhGNo6+9cpdVftrOVxBE1l9TeytxmBPehqTE+mEv3EutAt7IH2Y5oKbkcGCem3bRUewP3jhmcDkmrOoZirJ9nkyJ/tT0LYBy24i4o3AxkVSvcMqo8vfoKf12ntXLB1Ni4QTHkI+ppHxSRs3Yn+GmnaHdm7Kol5d+0pPmO2ZFH1FxdytbRbacAKYk0zbt3tFZW8ipteIkySaD4d66CbrBTzAUVPSsVoXuO11FC7tq8dYrVq1Ft2t5bviiF7dj7Iz1oP7YzvBAgme1GvwG2/BdwyXIuA7gO1EtEDzEwB6U6uothhkbj6THvQNR+9Xcyho4KyP1/3Tu8Fdg2aGXw2PqIjNWtt2u7JG5vXAq9Pp97DetwKT9rmKZuWfAkp5twySMik2kViAm2NOd164sf7TNW1tLgFwKxA4g0I22JlTM9GqzvBG5HxwBxTS+RpF/st05CnPrV1PG1PRG/5CrqtGMJ8OZwG3qJNZu6B7I3oVYcGDmKdTWR/pmQRkCl28Rl/dmQCcRxWSbM9Xpzb7Z/3CmtDrdSiwkqDzWWtX5/0/nFLOLjXFCk7j0mIq8eFNJo6z3bt9Cty8yqfukc0g1h1Y+b+tEFlyAjuNgz5a2o8KNkMB0YyamNJ6yU0vRJUc3AwB+dMpp7z3QL3lXrMUZ1uOBsbaJ+YoS3mtAowZ5OZNaSSvGUtM3tG24+C0jrOIqWdO43B2g7YEZoQv3vHMM0e9E3XGDm0wVQDINS0/AaYxe04XQBbrg3d+AD0pazYNy9sD8ESa1orySBcC3CRieldC18Pt6a34i72eJYAHpU3yTdYVrQ2nt29OW3bUme+TXOS8xuHfwO54pj4peDvvRjG0VzrEtuE0Rja0IrA1w2bmBcg9yOaG+kKHeDIHNCNp0cFhRhqbm4EEeU4AEVp/hVNGRaec8TkzTunPg25kQJJxP6/6pVbryCCJX0itLduSIbjgDFTJWJqzrafWgJCg3M9sik9ULdxhc3G2w5U1jT2GYFgc9xQ7mmdSSuPfg1EUkxRdBEexcUgMQe8Vg/s7Nslw3WpZthBLxuP8OKYbw2gEqR361TdeD6oF4Fkc3h9B+dStGxaJwzxUpdMfQta1C2gVye9FsvcZ90kA9aCu22soAzRnHFXZu3LhAZfLwIxWr0sdW8zmYxjmssjNbzaXnB4rYtG0od1IkYIp64bZ0n7oiRzjmsZYRJ0jjPc2jwwyhfSlrd1kuQoJo923a3Tux1EU3ordlLbPtLNwDV+ISqjGlDebxJVj6RFZ1VgqBtLlj09aftKbibnBQdB60A3UUJtBlyMziPapTdiTZzgt7
dGZFHRNll+sjmnb3ncbZCngTxQU0sJcZ2ZwSYgyTirvNLTEdOAgB2GR1mvWaZNiNgQzlsetIfD9EVZYRlBE+Y01qriaexdUK8iMz3rGcrwylrOZ8WsK2pQEYifel/F01shRAM5YD7OKY1Sq11QQ5HPPFJXNLbh3yQGjmtYxw1iqVBwq3kGQQwJJAM9vnyKVuLZDm2583DFRxTVraLdtVWA85nPH+aHqLa3nkWLhcLEj72KqhnNJZZAznpV2HO8SSAD0rrWLGnvWyyoZHALZP6zSz6ZGIa2MHgdSP8AFHSJs2moQAIXOeSOntVOp8IFGLEcihJp52kyZHHWmV3La8i+YgYn0/RqGvoh18AbT7hP661CoYTb27uBHX9RVXWBAYKAeoqrMkFuTHPf2oH/AErwrn8aD0mpTHkGALYqUrFYssW3hiD6imVkRtEjtQUsTeAmR0BEGu3pdICoJ605SSNJtREbz6i/pkslQFUkjapJJPet6GwblvZJBrrLp12xEUCzZuW7zbLRCdCWms3+S0Q59KgF34Yqhm3yT6UC/s06KLZ3dI4rsXI25iTgSetcrVkfYdCQZ61MZOzOMhRtWxtOzXBuyBj7M4/Cg6m9b8W0VaAJzHWqv2RIABAP0xWBYUON2V5FdCpGyYXT3S119z+XBWetdDQupKobm0ksTAmM/lFJIiCNw9ZJ46805oLf/sMCPsiDmom8JkzsWXRHBa6ziDyOKFr/AAn090hvMVkStWdiLJ470HUqPBJwRtmKwT0zT05upuA+GQwnrHXGKAdRFx2LYMFT8qjXUVgNrlcbvUZ/OgtdtAhSGK/exzjtXWjpsxc1LFLZLeYMDgUx4zHayXYUCCZj8aSuXLMgBTHWetYdSygoehx1psTOporm24SzQhWZ6j9QatjDlmaEOfNgjp1+tcvSlkucjcDXRJtuBZ3iCApYHP8AiazkqM5YL3zxeW58t3EUFLjkyLhx3NEOlb0YdKyLDLGB701Q1Rm5j7RJzRrTCYJELE446TWmRL0hcE8L8pFLX7LgwJ9Pbmn7gejHiWurmfl+VVSPhP1IqUcL7K5R6HUr4d+HvK5B8xjj0rpaZlKRNL6VLLIHdFYn+ITBptUtjGxR6QK5pOzL8kk6Nhljn+tQ+8GqAQEEASPar3LHIz7VBmCv23dDtuQw4pC/pNUyDzqY7ECumymDBI5oT7g3Ln0iqTod0efui6BtDQBiAIobG6VgsY4iu8yliZtPP8tDbTG8dvhMMcxFaKZfRw3uvIBc+wpz4e9wuSrGcBqfHw7O4KaYtaYoJA+lDkqByTCLbV7Y3kmf9xqXratb2TiI5NFWQIgz+FUWEQZ+tY2ZnD1ejNq3vBnNcq4Jubcj1r17hWXnHea4HxBCtwMR1zW8JmsZHPuaVgBKkTx9Ktbuy1tKZPJzTqr+5ZolIIx1/r7Vi5YjcWjYc/XP5Vr0V19ifiiAdhmIJrVu5MSnzNGOn+8SNpE5PE1u1p5IbuOC3fpSbQNoKhL2wGAiSQSefSquWzEqfYf2pzR2rZknaxOcmYqrulPWNvasr0ixAHaxMh84+cj9Zotoq4B2iZDHdMDof11qmsC1xhY+lDF025xuMQSeucGrso1sIx+zMY6i2TNXSxeSfKv1qUC09FZtWyAcn/6o3h24iD/ypLTtsG4gbjJ5rTX9vQRFYtMhoa8O1ERn3NaCIGlFBI9ZrntqiOBMx0pvSMSoZxk5GOlJqgaDKWZwPD8uZntReREEgCqECJDTzkVBiJByDzUklnPQmOBUzMxOcZqjiCQcg81CYGRyDSEQczH9elVuicCeuaHfbaMCD9etZfTu0bbgHuDIzTodF3n2jBHrBqLZW5bViWWQOD6UF9FccYvADjKmmrSFLSIGnYoWSOap4hvDP7Op+84yDSuo+G2ryyS/cZp+AfvTAzI5FQCcbiTmZ+VLpitnCf4WjeYb56Zpf/xwgEhvea9HsBPMk5n+1ZNlJ6knr2q1+Rl9s88vw4
QMMO2a3a+HoIJRvrXe8K2DwSfwqGygPBJo/Yw7FNLaW2gATy+nWjOVkA8njFG8NF4G4mAM1Vy1bYSZPbMVNk2c69aRjAaO+OKSOjDNMjb3mK7TaSzyd0dRMVa6S0qxDH3bNVGdFRmonmotjBQYqV6E6JTkLbg/zfnUq/2ovuBzCxj3oqnCj2qVKTJYxZxtA4NGDssAHr/epUqGSZa667gpiCfxrLXXVTBjJqVKKAhuuAQDHNUbrqrQY/xUqUADa4wuL/OJ9a6iw2CMSalSlIUiKd8z3H4VlWJJGAOMehNSpUElkwQoAjI/oKtjt8o6QZqqlAEJhTEcE/Q1J80YyYOKlSmBW8iO5IE1HYqJHIkCpUoGy2YgE9uKpWO2SZMTnvUqUMTIrFl8xmZqldmkk9YqVKbBl+KwwIgVKlSkB//Z"
   },
   ...and other results
]

或者,您可以使用 SerpApi 的 Google Video Results API。SerpApi 提供免费套餐,每月可进行 100 次搜索;如果需要更多搜索,可以选择付费计划。

不同之处在于,您只需遍历现成的结构化 JSON,而不必从头编写并维护全部代码、想办法绕过 Google 的拦截、再挑选正确的选择器——这些工作有时相当耗时。可以到 playground 试用(Check out the playground)。

用法:

const SerpApi = require("google-search-results-nodejs");
// SerpApi search client; the API key (from serpapi.com) is read from the API_KEY environment variable.
const search = new SerpApi.GoogleSearch(process.env.API_KEY);

// What we want to search for.
const searchString = "minecraft";

// Search parameters sent with the request.
const params = {
  // search engine
  engine: "google",
  // search query
  q: searchString,
  // google domain of the search
  google_domain: "google.com",
  // country to use for the Google search
  gl: "us",
  // language to use for the Google search
  hl: "en",
  // type of search to run ("vid" means videos)
  tbm: "vid",
};

/**
 * Reduces a search response to the link and thumbnail of each video result.
 *
 * @param {{video_results?: Array<{link: string, thumbnail?: string}>}} response
 *   the parsed JSON response; a response without `video_results` (for example
 *   an error payload) is treated as having no results instead of throwing
 * @returns {Array<{link: string, thumbnail: string}>} one entry per video;
 *   `thumbnail` is "No image" when the result has none
 */
const getVideosData = function ({ video_results = [] } = {}) {
  return video_results.map(({ link, thumbnail = "No image" }) => ({ link, thumbnail }));
};

/**
 * Promise wrapper around the callback-based `search.json`.
 *
 * @param {object} params - search parameters forwarded to `search.json`
 * @returns {Promise<object>} resolves with the value passed to the callback
 */
const getJson = (params) =>
  new Promise((onResult) => {
    search.json(params, onResult);
  });

// Fetch, reduce and print the results; report failures instead of leaving
// the rejection unhandled.
getJson(params)
  .then(getVideosData)
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });

输出:

[
   {
      "link":"https://www.youtube.com/watch?v=MmB9b5njVbA",
      "thumbnail":"https://serpapi.com/searches/6294dec729d17745dfdc3c84/images/23c7d943b97e967d87bd9ee7e4d0109cb6c0b7356adf2a71edf2f29dcbd11563.jpeg"
   },
   {
      "link":"https://www.youtube.com/watch?v=X-fMtNOS_gU",
      "thumbnail":"https://serpapi.com/searches/6294dec729d17745dfdc3c84/images/23c7d943b97e967d4166ae8a934a93a29023f829db54715a9b4305dd4c904e2d.jpeg"
   },
   ...and other results
]

Disclaimer, I work for SerpApi.