Web crawler stops at first page
I am working on a web crawler that should work like this:
- Go to a website and crawl all links from that site.
- Download all images (starting from the start page).
- If there are no images left on the current page, go to the next link found in step 1 and repeat steps 2 and 3 until there are no links/images left.
It seems like the code below does work in some way; when I try to crawl certain websites, I get some images to download.
(Even though I don't know where the images I get come from, since I can't find them on the website, it seems like the crawler does not start with the start page of the website.)
After a few images (~25-500) the crawler is done and stops without any error; it just stops. I tried this with multiple websites, and after a few images it just stops. I think the crawler somehow ignores step 3.
package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

var (
	currWebsite  string = "https://www.youtube.com"
	imageCount   int    = 0
	crawlWebsite string
)

func processElement(index int, element *goquery.Selection) {
	href, exists := element.Attr("href")
	if exists && strings.HasPrefix(href, "http") {
		crawlWebsite = href
		response, err := http.Get(crawlWebsite)
		if err != nil {
			log.Fatalf("error on current website")
		}
		defer response.Body.Close()
		document, err := goquery.NewDocumentFromReader(response.Body)
		if err != nil {
			log.Fatal("Error loading HTTP response body.", err)
		}
		document.Find("img").Each(func(index int, element *goquery.Selection) {
			imgSrc, exists := element.Attr("src")
			if strings.HasPrefix(imgSrc, "http") && exists {
				fileName := fmt.Sprintf("./images/img" + strconv.Itoa(imageCount) + ".jpg")
				currWebsite := fmt.Sprint(imgSrc)
				fmt.Println("[+]", currWebsite)
				DownloadFile(fileName, currWebsite)
				imageCount++
			}
		})
	}
}

func main() {
	err := os.MkdirAll("./images/", 0777)
	if err != nil {
		log.Fatalln("error on creating directory")
	}
	response, err := http.Get(currWebsite)
	if err != nil {
		log.Fatalln("error on searching website")
	}
	defer response.Body.Close()
	document, err := goquery.NewDocumentFromReader(response.Body)
	if err != nil {
		log.Fatalln("Error loading HTTP response body. ", err)
	}
	document.Find("a").Each(processElement)
}

func DownloadFile(filepath string, url string) {
	response, err := http.Get(url)
	if err != nil {
		log.Fatalln("error getting the website infos")
	}
	defer response.Body.Close()
	if response.StatusCode != 200 {
		log.Fatalln("received non 200 response code")
	}
	file, err := os.Create(filepath)
	if err != nil {
		log.Fatalf("error creating file at %v\n", filepath)
	}
	defer file.Close()
	_, err = io.Copy(file, response.Body)
	if err != nil {
		log.Fatalln("error copy file from src to dst")
	}
}
func main() {
err := os.MkdirAll("./images/", 0777)
if err != nil {
log.Fatalln("error on creating directory")
}
response, err := http.Get(currWebsite)
if err != nil {
log.Fatalln("error on searching website")
}
defer response.Body.Close()
document, err := goquery.NewDocumentFromReader(response.Body)
if err != nil {
log.Fatalln("Error loading HTTP response body. ", err)
}
document.Find("a").Each(processElement)
}
func DownloadFile(filepath string, url string) {
response, err := http.Get(url)
if err != nil {
log.Fatalln("error getting the website infos")
}
defer response.Body.Close()
if response.StatusCode != 200 {
log.Fatalln("received non 200 response code")
}
file, err := os.Create(filepath)
if err != nil {
log.Fatalf("error creating file at %v\n", filepath)
}
defer file.Close()
_, err = io.Copy(file, response.Body)
if err != nil {
log.Fatalln("error copy file from src to dst")
}
}
(Even though I don't know where the images I get come from, since I can't find them on the website, it seems like the crawler does not start with the start page of the website.)
Yes, you are right. Your code does not download images from the start page, because the only thing it fetches from the start page is all of the anchor tag elements, and it then calls processElement() once for every anchor element found on the start page -
response, err := http.Get(currWebsite)
if err != nil {
	log.Fatalln("error on searching website")
}
defer response.Body.Close()
document, err := goquery.NewDocumentFromReader(response.Body)
if err != nil {
	log.Fatalln("Error loading HTTP response body. ", err)
}
document.Find("a").Each(processElement) // Here
To download all of the images from the start page, you should define another function processUrl() that does the work of fetching the img elements and downloading the images, and in processElement() you then only need to get the href link and call processUrl() on it -
func processElement(index int, element *goquery.Selection) {
	href, exists := element.Attr("href")
	if exists && strings.HasPrefix(href, "http") {
		crawlWebsite = href
		processUrl(crawlWebsite)
	}
}

func processUrl(crawlWebsite string) {
	response, err := http.Get(crawlWebsite)
	if err != nil {
		log.Fatalf("error on current website")
	}
	defer response.Body.Close()
	document, err := goquery.NewDocumentFromReader(response.Body)
	if err != nil {
		log.Fatal("Error loading HTTP response body.", err)
	}
	document.Find("img").Each(func(index int, element *goquery.Selection) {
		imgSrc, exists := element.Attr("src")
		if strings.HasPrefix(imgSrc, "http") && exists {
			fileName := fmt.Sprintf("./images/img" + strconv.Itoa(imageCount) + ".jpg")
			currWebsite := fmt.Sprint(imgSrc)
			fmt.Println("[+]", currWebsite)
			DownloadFile(fileName, currWebsite)
			imageCount++
		}
	})
}
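Splitting the fetch-and-download logic out into processUrl() means the exact same code path can be reused both for the start page itself and for every link found on it, which is what the final step below does.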
Now just crawl the images from the start page before processing all the links -
func main() {
	...
	document, err := goquery.NewDocumentFromReader(response.Body)
	if err != nil {
		log.Fatalln("Error loading HTTP response body. ", err)
	}
	// First crawl images from start page url
	processUrl(currWebsite)
	document.Find("a").Each(processElement)
}
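As a side note, and an observation of mine rather than something your output confirms: every error path in this crawler calls log.Fatal or one of its variants, which terminates the whole process on the first failed request, so a single dead link can end the crawl early. A minimal sketch of a more forgiving processUrl() that logs and skips bad pages instead of aborting (same behavior otherwise):

func processUrl(crawlWebsite string) {
	response, err := http.Get(crawlWebsite)
	if err != nil {
		// Skip this page instead of terminating the whole crawl.
		log.Println("skipping unreachable page:", crawlWebsite, err)
		return
	}
	defer response.Body.Close()

	document, err := goquery.NewDocumentFromReader(response.Body)
	if err != nil {
		log.Println("skipping unparsable page:", crawlWebsite, err)
		return
	}

	document.Find("img").Each(func(index int, element *goquery.Selection) {
		imgSrc, exists := element.Attr("src")
		if exists && strings.HasPrefix(imgSrc, "http") {
			fileName := "./images/img" + strconv.Itoa(imageCount) + ".jpg"
			fmt.Println("[+]", imgSrc)
			DownloadFile(fileName, imgSrc)
			imageCount++
		}
	})
}

The same log-and-return pattern would apply to DownloadFile() if you want one failed download not to stop the rest of the crawl.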