如何抓取 Google 视频页面的缩略图?

How to scrape google videos page thumbnails?

我目前正在做一个抓取项目,需要抓取 Google 视频搜索页面。但当我发现指向 YouTube 的视频需要通过 ytimg 获取缩略图时,遇到了一个问题。

我正在使用 cheerio 进行解析,这是我的代码:

        // Walk every result card and record the `src` of its thumbnail
        // (a base64 image on this page) at the card's position in `img`.
        $('.G6SP0b').each((index, card) => {
            const thumbnail = $(card).find('.h1hFNe');
            img[index] = thumbnail.attr('src');
        });

在这段代码中,我从 Google 视频页面提取每个缩略图,它们都是 gif/base64 格式;但对于每个指向 YouTube 的视频,我想把对应的 jpg 格式缩略图 URL 存入图像数组。

请更正我的代码,或建议我任何其他转换方式。

所以我就用了这个方法,我抓取了那个 Youtube 视频的 URL 并用这个 URL 来获取缩略图:

http://img.youtube.com/vi/videoID/mqdefault.jpg

这是我的代码:

        // First pass: store each card's thumbnail `src` in `image`, indexed by
        // the card's position.
        $('.G6SP0b').each((position, card) => {
            const thumbnailSrc = $(card).find('.h1hFNe').attr('src');
            image[position] = thumbnailSrc;
        });
    // Second pass: record each result's target URL in `link` and, for YouTube
    // watch links, replace the thumbnail in `image` with the JPG served by
    // img.youtube.com.
    $('.egMi0').each((i, el) => {
        const href = $(el).find('a').attr('href');
        if (!href) {
            return; // no anchor in this result: leave link[i] / image[i] untouched
        }

        // Redirect-style hrefs look like "/url?q=<percent-encoded target>&sa=...".
        // Anything else is treated as an already usable URL.
        const redirectPrefix = '/url?q=';
        let target = href;
        if (href.startsWith(redirectPrefix)) {
            const end = href.indexOf('&');
            const encoded = href.substring(redirectPrefix.length, end === -1 ? href.length : end);
            try {
                // Decode every escape (%3F, %3D, %26, ...), not only the first of each kind.
                target = decodeURIComponent(encoded);
            } catch {
                target = encoded; // malformed escape sequence: keep the raw text
            }
        }
        link[i] = target;

        if (!target.includes("www.youtube.com")) {
            return;
        }

        // Read the id from the `v` query parameter so that extra parameters
        // (e.g. "&vl=en-US") never leak into the thumbnail URL.
        let videoId = null;
        try {
            videoId = new URL(target).searchParams.get('v');
        } catch {
            videoId = null; // not a parseable absolute URL
        }
        if (videoId) {
            image[i] = `http://img.youtube.com/vi/${videoId}/mqdefault.jpg`;
        }
    });

这是我得到的结果:

"videoResults": [
{
  "thumbnail": "http://img.youtube.com/vi/ET0G1FYxWqc/mqdefault.jpg",
  "link": "https://www.youtube.com/watch?v=ET0G1FYxWqc"
},
{
  "thumbnail": "http://img.youtube.com/vi/-QXrYIHODzE%26vl%3Den-US/mqdefault.jpg",
  "link": "https://www.youtube.com/watch?v=-QXrYIHODzE%26vl%3Den-US"
},
{
  "thumbnail": "",
  "link": "https://www.espn.com/nfl/story/_/id/33822691/nfl-draft-2022-national-title-super-bowl-winning-qb-record-draft-night-part-impressive-year-georgia-bulldogs"
},
]

使用 cheerio 从 Google 搜索结果中抓取图像时,不能直接从 img 标签获取,因为默认情况下 src 属性只包含一个 1x1 的占位图,全分辨率图像由 JavaScript 加载。要得到全分辨率图像,需要从 script 标签里的 base64 编码数据中提取。请参考下面的代码,以及在线 IDE 中的完整示例。

const cheerio = require("cheerio");
const axios = require("axios");

const searchQuery = "minecraft";                             // what we want to search

const BASE_URL = "https://www.google.com/search";

// Request configuration shared by every call to the search page.
const AXIOS_OPTIONS = {
  headers: {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
  },                                              // adding the User-Agent header as one way to prevent the request from being blocked
  params: {
    // Pass the raw query: axios percent-encodes `params` values itself, so
    // encoding here as well would double-encode spaces and non-ASCII text
    // ("%20" would be sent as "%2520").
    q: searchQuery,
    tbm: "vid",                                     // parameter defines the type of search you want to do ("vid" means videos)
    hl: "en",                                       // Parameter defines the language to use for the Google search
    gl: "us",                                       // parameter defines the country to use for the Google search
  },
};

/**
 * Requests the Google video-search page and pairs every result link with its thumbnail.
 *
 * Thumbnails are not read from the <img> `src` attribute. They are recovered from the
 * raw page text, where each one is keyed by the id of its <img> element, either as an
 * inline base64 data URI or as a plain URL.
 *
 * @returns {Promise<Array<{ link: (string|undefined), image: (string|undefined) }>>}
 *   One entry per `.dFd2Tb .dXiKIc` element, in document order. A rejection from the
 *   underlying request propagates to the caller.
 */
function getVideosInfo() {
  return axios(BASE_URL, AXIOS_OPTIONS).then(({ data }) => {
    const $ = cheerio.load(data);

    const patternForBase64 = /s='(?<img>[^']+)';\w+\s\w+=\['(?<id>\w+_\d+)'];/gm;       //https://regex101.com/r/pMd0yx/1
    const patternForLinks = /"(?<id>[^":]+)":"(?<link>[^"]+)"/gm;                       //https://regex101.com/r/p5nj8R/1

    // Index both thumbnail sources by <img> id once, so each result is a single
    // lookup instead of a scan over every match. The first match for an id wins.
    const base64ById = new Map();
    for (const { groups } of data.matchAll(patternForBase64)) {
      if (!base64ById.has(groups.id)) {
        // In the page text "=" is written as the four characters \x3d. The backslash
        // has to be escaped here ("\x3d" in a JS string literal is simply "="), and the
        // sequence is turned back into "=" so the base64 padding stays valid.
        base64ById.set(groups.id, groups.img.replaceAll("\\x3d", "="));
      }
    }

    const linkById = new Map();
    for (const { groups } of data.matchAll(patternForLinks)) {
      if (!linkById.has(groups.id)) {
        linkById.set(groups.id, groups.link);
      }
    }

    return [...$(".dFd2Tb .dXiKIc")].map((el) => {
      const imgId = $(el).find("img").attr("id");
      return {
        link: $(el).find("a").attr("href"),
        // Prefer the inline base64 image; fall back to a plain thumbnail URL.
        image: base64ById.get(imgId) || linkById.get(imgId),
      };
    });
  });
}

// Print the results. A failed request is reported and reflected in the exit code
// instead of being left as an unhandled rejection.
getVideosInfo()
  .then(console.log)
  .catch((error) => {
    console.error("Failed to fetch video results:", error);
    process.exitCode = 1;
  });

输出:

[
   {
      "link":"https://www.youtube.com/watch?v=MmB9b5njVbA",
      "image":""
   },
   ...and other results
]

或者,您可以使用 SerpApi 的 Google Video Results API。SerpApi 的免费方案每月提供 100 次搜索;如果需要更多搜索次数,可以选择付费方案。

不同之处在于,你只需要遍历现成的结构化 JSON,而不必从头编写并维护所有代码、想办法绕过 Google 的拦截、挑选正确的选择器——这些工作有时相当耗时。可以到 playground 中试用。

用法:

const SerpApi = require("google-search-results-nodejs");
// SerpApi client. The key (from serpapi.com) is read from the API_KEY environment variable.
const { API_KEY } = process.env;
const search = new SerpApi.GoogleSearch(API_KEY);

// What we want to search for.
const searchString = "minecraft";

// Query sent to SerpApi for a Google video search.
const params = {
  // search engine
  engine: "google",
  // search query
  q: searchString,
  // google domain of the search
  google_domain: "google.com",
  // country to use for the Google search
  gl: "us",
  // language to use for the Google search
  hl: "en",
  // type of search to run ("vid" means videos)
  tbm: "vid",
};

/**
 * Reduces a SerpApi response to the link and thumbnail of each video result.
 *
 * @param {object} response - Parsed response; only its `video_results` array is read.
 *   A response without that key is treated as having no results instead of throwing.
 * @returns {Array<{ link: *, thumbnail: * }>} One entry per result; `thumbnail` is
 *   "No image" when the result has none.
 */
const getVideosData = function ({ video_results = [] }) {
  return video_results.map(({ link, thumbnail = "No image" }) => ({ link, thumbnail }));
};

/**
 * Promise adapter for the callback-style `search.json`.
 *
 * @param {object} params - Search parameters, passed to `search.json` unchanged.
 * @returns {Promise<*>} Resolves with whatever `search.json` hands to its callback.
 *   No rejection path is wired up.
 */
const getJson = (params) =>
  new Promise((onResult) => {
    search.json(params, onResult);
  });

// Fetch, reduce and print the results. A failure anywhere in the chain is reported
// and reflected in the exit code instead of being left as an unhandled rejection.
getJson(params)
  .then(getVideosData)
  .then(console.log)
  .catch((error) => {
    console.error("Failed to fetch video results:", error);
    process.exitCode = 1;
  });

输出:

[
   {
      "link":"https://www.youtube.com/watch?v=MmB9b5njVbA",
      "thumbnail":"https://serpapi.com/searches/6294dec729d17745dfdc3c84/images/23c7d943b97e967d87bd9ee7e4d0109cb6c0b7356adf2a71edf2f29dcbd11563.jpeg"
   },
   {
      "link":"https://www.youtube.com/watch?v=X-fMtNOS_gU",
      "thumbnail":"https://serpapi.com/searches/6294dec729d17745dfdc3c84/images/23c7d943b97e967d4166ae8a934a93a29023f829db54715a9b4305dd4c904e2d.jpeg"
   },
   ...and other results
]

Disclaimer, I work for SerpApi.