Retry request in go-colly

1k Views Asked by At

I have this scraper library, I would like to change my user agent if the first user agent returns error, but this code doesnt work, if first user agent doesnt work, I have send the 2nd attempt but this will never finish since onHTML is not triggered:

package scraper

import (
    "net/http"
    "github.com/davecgh/go-spew/spew"
    "github.com/gocolly/colly"
)

const (
        fbUserAgent = "ua 1"
    userAgent = "ua 2"
)

type ScrapeResult struct {
    Title       string
    Description string
    SiteName    string
    URL         string
    Images      []string
}

func Scrape2(url string) (*ScrapeResult, error) {
    var (
        res *ScrapeResult
        scrapeErr error
        done = make(chan bool, 1)
        c = colly.NewCollector()
    )

    c.OnError(func(r *colly.Response, err error) {
        if ua := r.Request.Headers.Get("User-Agent"); ua == fbUserAgent {
            c.Request(
                "GET",
                url,
                nil,
                nil,
                http.Header{
                    "User-Agent": []string{userAgent},
                    "Accept": []string{"*/*"}, 
                },
            )
        } else {
            scrapeErr = err
            done <- true
        }
    })

    c.OnHTML("html", func(e *colly.HTMLElement) {
        spew.Dump("ON HTML")
        res = &ScrapeResult{URL: url}
        res.Title = FindTitle(e)
        res.Description = FindDescription(e)
        res.SiteName = FindSiteName(e)
        res.Images = FindImages(e)
        done <- true
    })

    c.Request(
        "GET",
        url,
        nil,
        nil,
        http.Header{
            "User-Agent": []string{fbUserAgent},
            "Accept": []string{"*/*"}, // * / *
            "Accept-Language": []string{"en-GB,en-US;q=0.9,en;q=0.8"},
            "Accept-Encoding": []string{"gzip, deflate, br"},
            "Connection": []string{"keep-alive"},
            "sec-ch-ua": []string{` Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90`},
        },
    )

    <- done
    return res, scrapeErr
}

func FindTitle(e *colly.HTMLElement) string {
    if content := e.ChildAttr(`meta[property="og:title"]`, "content"); len(content) > 0 {
        return content
    }
    return ""
}

func FindDescription(e *colly.HTMLElement) string {
    if content := e.ChildAttr(`meta[property="og:description"]`, "content"); len(content) > 0 {
        return content
    }
    return ""
}

func FindSiteName(e *colly.HTMLElement) string {
    if content := e.ChildAttr(`meta[property="og:site_name"]`, "content"); len(content) > 0 {
        return content
    }
    return ""
}

func FindImages(e *colly.HTMLElement) []string {
    images := make([]string, 0)
    if content := e.ChildAttr(`meta[property="og:image"]`, "content"); len(content) > 0 {
        images = append(images, content)
    }
    return images
}

How can I make colly request for the 2nd time and trigger the onHTML? thank you

1

There are 1 best solutions below

0
On

You can set the property collector.CheckHead = true

What this does is ensures that you do a GetHEAD operation first to check connection issues and if it fails - there is a retry.

you will need /v2 of gocolly to have this feature included.

https://github.com/gocolly/colly/blob/master/colly.go#L110