I have this scraper library, I would like to change my user agent if the first user agent returns error, but this code doesnt work, if first user agent doesnt work, I have send the 2nd attempt but this will never finish since onHTML is not triggered:
package scraper
import (
"net/http"
"github.com/davecgh/go-spew/spew"
"github.com/gocolly/colly"
)
const (
fbUserAgent = "ua 1"
userAgent = "ua 2"
)
type ScrapeResult struct {
Title string
Description string
SiteName string
URL string
Images []string
}
func Scrape2(url string) (*ScrapeResult, error) {
var (
res *ScrapeResult
scrapeErr error
done = make(chan bool, 1)
c = colly.NewCollector()
)
c.OnError(func(r *colly.Response, err error) {
if ua := r.Request.Headers.Get("User-Agent"); ua == fbUserAgent {
c.Request(
"GET",
url,
nil,
nil,
http.Header{
"User-Agent": []string{userAgent},
"Accept": []string{"*/*"},
},
)
} else {
scrapeErr = err
done <- true
}
})
c.OnHTML("html", func(e *colly.HTMLElement) {
spew.Dump("ON HTML")
res = &ScrapeResult{URL: url}
res.Title = FindTitle(e)
res.Description = FindDescription(e)
res.SiteName = FindSiteName(e)
res.Images = FindImages(e)
done <- true
})
c.Request(
"GET",
url,
nil,
nil,
http.Header{
"User-Agent": []string{fbUserAgent},
"Accept": []string{"*/*"}, // * / *
"Accept-Language": []string{"en-GB,en-US;q=0.9,en;q=0.8"},
"Accept-Encoding": []string{"gzip, deflate, br"},
"Connection": []string{"keep-alive"},
"sec-ch-ua": []string{` Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90`},
},
)
<- done
return res, scrapeErr
}
func FindTitle(e *colly.HTMLElement) string {
if content := e.ChildAttr(`meta[property="og:title"]`, "content"); len(content) > 0 {
return content
}
return ""
}
func FindDescription(e *colly.HTMLElement) string {
if content := e.ChildAttr(`meta[property="og:description"]`, "content"); len(content) > 0 {
return content
}
return ""
}
func FindSiteName(e *colly.HTMLElement) string {
if content := e.ChildAttr(`meta[property="og:site_name"]`, "content"); len(content) > 0 {
return content
}
return ""
}
func FindImages(e *colly.HTMLElement) []string {
images := make([]string, 0)
if content := e.ChildAttr(`meta[property="og:image"]`, "content"); len(content) > 0 {
images = append(images, content)
}
return images
}
How can I make colly request for the 2nd time and trigger the onHTML? thank you
You can set the property collector.CheckHead = true
What this does is ensures that you do a GetHEAD operation first to check connection issues and if it fails - there is a retry.
you will need /v2 of gocolly to have this feature included.
https://github.com/gocolly/colly/blob/master/colly.go#L110