取消 HTTP 请求时关闭所有 goroutine
Close all goroutines when HTTP request is cancelled
我正在制作一个网络爬虫。我通过爬虫函数传递 url 并解析它以获取锚标记中的所有链接,然后我为所有这些 url 调用相同的爬虫函数,每个 [= 使用单独的 goroutine 24=]。
但是,如果在我收到响应之前发送请求并取消它,则该特定请求的所有 groutines 仍然是 运行.
现在我想要的是,当我取消请求时,所有因该请求而被调用的 goroutine 都会停止。
请指导。
以下是我的 crawler 函数代码。
func crawler(c echo.Context, urlRec string, feed chan string, urlList *[]string, wg *sync.WaitGroup) {
defer wg.Done()
URL, _ := url.Parse(urlRec)
response, err := http.Get(urlRec)
if err != nil {
log.Print(err)
return
}
body := response.Body
defer body.Close()
tokenizer := html.NewTokenizer(body)
flag := true
for flag {
tokenType := tokenizer.Next()
switch {
case tokenType == html.ErrorToken:
flag = false
break
case tokenType == html.StartTagToken:
token := tokenizer.Token()
// Check if the token is an <a> tag
isAnchor := token.Data == "a"
if !isAnchor {
continue
}
ok, urlHref := getReference(token)
if !ok {
continue
}
// Make sure the url begines in http**
hasProto := strings.Index(urlHref, "http") == 0
if hasProto {
if !urlInURLList(urlHref, urlList) {
if strings.Contains(urlHref, URL.Host) {
*urlList = append(*urlList, urlHref)
// fmt.Println(urlHref)
// c.String(http.StatusOK, urlHref+"\n")Documents
if !checkExt(filepath.Ext(urlHref)) {
wg.Add(1)
go crawler(c, urlHref, feed, urlList, wg)
}
}
}
}
}
}
}
下面是我的 POST 请求处理程序
func scrapePOST(c echo.Context) error {
var urlList []string
urlSession := urlFound{}
var wg sync.WaitGroup
urlParam := c.FormValue("url")
feed := make(chan string, 1000)
wg.Add(1)
go crawler(c, urlParam, feed, &urlList, &wg)
wg.Wait()
var count = 0
for _, url := range urlList {
if filepath.Ext(url) == ".jpg" || filepath.Ext(url) == ".jpeg" || filepath.Ext(url) == ".png" {
urlSession.Images = append(urlSession.Images, url)
} else if filepath.Ext(url) == ".doc" || filepath.Ext(url) == ".docx" || filepath.Ext(url) == ".pdf" || filepath.Ext(url) == ".ppt" {
urlSession.Documents = append(urlSession.Documents, url)
} else {
urlSession.Links = append(urlSession.Links, url)
}
count = count + 1
}
urlSession.Count = count
// jsonResp, _ := json.Marshal(urlSession)
// fmt.Print(urlSession)
return c.JSON(http.StatusOK, urlSession)
}
回显上下文公开了 HTTP 请求,该请求已经有一个与服务器请求相关联的上下文。只需获取该上下文,并检查它是否被取消,and/or 将其传递给采用上下文的方法。
ctx := c.Request().Context()
select {
case <-ctx.Done():
return ctx.Err()
default:
// Continue handling the request
}
// and pass along to the db or whatever else:
rows, err := db.QueryContext(ctx, ...)
如果客户端中止连接,请求范围的上下文将自动取消。
如果您想添加自己的取消条件(超时或其他),您也可以这样做:
req := c.Request()
ctx, cancel := context.WithCancel(req.Context())
req = req.WithContext(ctx)
defer cancel()
// do stuff, which may conditionally call cancel() to cancel the context early
我正在制作一个网络爬虫。我通过爬虫函数传递 url 并解析它以获取锚标记中的所有链接,然后我为所有这些 url 调用相同的爬虫函数,每个 [= 使用单独的 goroutine 24=]。
但是,如果在我收到响应之前发送请求并取消它,则该特定请求的所有 groutines 仍然是 运行.
现在我想要的是,当我取消请求时,所有因该请求而被调用的 goroutine 都会停止。
请指导。
以下是我的 crawler 函数代码。
func crawler(c echo.Context, urlRec string, feed chan string, urlList *[]string, wg *sync.WaitGroup) {
defer wg.Done()
URL, _ := url.Parse(urlRec)
response, err := http.Get(urlRec)
if err != nil {
log.Print(err)
return
}
body := response.Body
defer body.Close()
tokenizer := html.NewTokenizer(body)
flag := true
for flag {
tokenType := tokenizer.Next()
switch {
case tokenType == html.ErrorToken:
flag = false
break
case tokenType == html.StartTagToken:
token := tokenizer.Token()
// Check if the token is an <a> tag
isAnchor := token.Data == "a"
if !isAnchor {
continue
}
ok, urlHref := getReference(token)
if !ok {
continue
}
// Make sure the url begines in http**
hasProto := strings.Index(urlHref, "http") == 0
if hasProto {
if !urlInURLList(urlHref, urlList) {
if strings.Contains(urlHref, URL.Host) {
*urlList = append(*urlList, urlHref)
// fmt.Println(urlHref)
// c.String(http.StatusOK, urlHref+"\n")Documents
if !checkExt(filepath.Ext(urlHref)) {
wg.Add(1)
go crawler(c, urlHref, feed, urlList, wg)
}
}
}
}
}
}
}
下面是我的 POST 请求处理程序
func scrapePOST(c echo.Context) error {
var urlList []string
urlSession := urlFound{}
var wg sync.WaitGroup
urlParam := c.FormValue("url")
feed := make(chan string, 1000)
wg.Add(1)
go crawler(c, urlParam, feed, &urlList, &wg)
wg.Wait()
var count = 0
for _, url := range urlList {
if filepath.Ext(url) == ".jpg" || filepath.Ext(url) == ".jpeg" || filepath.Ext(url) == ".png" {
urlSession.Images = append(urlSession.Images, url)
} else if filepath.Ext(url) == ".doc" || filepath.Ext(url) == ".docx" || filepath.Ext(url) == ".pdf" || filepath.Ext(url) == ".ppt" {
urlSession.Documents = append(urlSession.Documents, url)
} else {
urlSession.Links = append(urlSession.Links, url)
}
count = count + 1
}
urlSession.Count = count
// jsonResp, _ := json.Marshal(urlSession)
// fmt.Print(urlSession)
return c.JSON(http.StatusOK, urlSession)
}
回显上下文公开了 HTTP 请求,该请求已经有一个与服务器请求相关联的上下文。只需获取该上下文,并检查它是否被取消,and/or 将其传递给采用上下文的方法。
ctx := c.Request().Context()
select {
case <-ctx.Done():
return ctx.Err()
default:
// Continue handling the request
}
// and pass along to the db or whatever else:
rows, err := db.QueryContext(ctx, ...)
如果客户端中止连接,请求范围的上下文将自动取消。
如果您想添加自己的取消条件(超时或其他),您也可以这样做:
req := c.Request()
ctx, cancel := context.WithCancel(req.Context())
req = req.WithContext(ctx)
defer cancel()
// do stuff, which may conditionally call cancel() to cancel the context early