// Variant 1: the listing pages are fed through a request queue.
package main

import (
	"fmt"
	"strings"
	"sync/atomic"
	"time"

	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/queue"
)

func main() {
	c := colly.NewCollector()
	c.SetRequestTimeout(5 * time.Minute)

	// In-memory queue with 8 consumer threads and room for 1000 pending requests.
	q, _ := queue.New(8, &queue.InMemoryQueueStorage{MaxSize: 1000})

	var (
		visited int64
		cards   []map[string]string
	)

	// Follow links to the detail pages.
	c.OnHTML("a.css-rc5s2u", func(e *colly.HTMLElement) {
		e.Request.Visit(e.Attr("href"))
	})

	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	// Build one card per matching list, splitting each paragraph on ":".
	c.OnHTML("ul.css-sfcl1s", func(e *colly.HTMLElement) {
		atomic.AddInt64(&visited, 1)
		card := make(map[string]string)
		e.ForEach("p.css-b5m1rv", func(_ int, elem *colly.HTMLElement) {
			text := strings.Split(elem.Text, ":")
			if len(text) > 1 {
				card[text[0]] = text[1]
			} else {
				card["type"] = text[0]
			}
		})
		cards = append(cards, card)
	})

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	const (
		baseURL = "some_url"
		maxPage = 5
	)

	// Enqueue the listing pages, then run the queue against the collector.
	for p := 1; p <= maxPage; p++ {
		urlPath := fmt.Sprintf("%s&page=%d", baseURL, p)
		q.AddURL(urlPath)
	}
	q.Run(c)

	fmt.Println(visited)
	fmt.Println(len(cards))
}
// Variant 2: the collector runs in async mode instead of using a queue.
package main

import (
	"fmt"
	"strings"
	"sync/atomic"
	"time"

	"github.com/gocolly/colly/v2"
)

func main() {
	c := colly.NewCollector(
		colly.Async(true),
	)
	c.SetRequestTimeout(5 * time.Minute)

	var (
		visited int64
		cards   []map[string]string
	)

	// Follow links to the detail pages.
	c.OnHTML("a.css-rc5s2u", func(e *colly.HTMLElement) {
		e.Request.Visit(e.Attr("href"))
	})

	// Build one card per matching list, splitting each paragraph on ":".
	c.OnHTML("ul.css-sfcl1s", func(e *colly.HTMLElement) {
		atomic.AddInt64(&visited, 1)
		card := make(map[string]string)
		e.ForEach("li>p.css-b5m1rv", func(_ int, elem *colly.HTMLElement) {
			text := strings.Split(elem.Text, ":")
			if len(text) > 1 {
				card[text[0]] = text[1]
			} else {
				card["type"] = text[0]
			}
		})
		cards = append(cards, card)
	})

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	const (
		baseURL = "some_url"
		maxPage = 5
	)

	// Start the listing pages, then wait for all outstanding requests to finish.
	for p := 1; p <= maxPage; p++ {
		urlPath := fmt.Sprintf("%s&page=%d", baseURL, p)
		c.Visit(urlPath)
	}
	c.Wait()

	fmt.Println(visited)
	fmt.Println(len(cards))
}
I am using gocolly for web scraping, and I don't understand why I get inconsistent results for visited when using async mode or a queue. The value comes out slightly smaller than expected, as if the collector didn't have enough time to process all URLs.
Running all URLs consecutively yields the correct result. I have tried playing around with RequestTimeout, but without success.
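By "consecutively" I mean a plain synchronous collector, with no Async option and no queue, so each Visit returns only after the page and any followed links have been processed. Roughly the following sketch (same selectors, baseURL is a placeholder as above, and cards is left out to keep it short):

package main

import (
	"fmt"
	"time"

	"github.com/gocolly/colly/v2"
)

func main() {
	c := colly.NewCollector()
	c.SetRequestTimeout(5 * time.Minute)

	var visited int64

	c.OnHTML("a.css-rc5s2u", func(e *colly.HTMLElement) {
		e.Request.Visit(e.Attr("href"))
	})
	c.OnHTML("ul.css-sfcl1s", func(e *colly.HTMLElement) {
		visited++ // single-threaded here, so a plain increment is enough
	})

	const (
		baseURL = "some_url"
		maxPage = 5
	)
	for p := 1; p <= maxPage; p++ {
		c.Visit(fmt.Sprintf("%s&page=%d", baseURL, p))
	}

	// This consistently prints the expected count in my runs.
	fmt.Println(visited)
}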
Maybe there is something I am missing in the code that causes the inconsistent behavior. For smaller page counts, the results are correct most of the time.
My guess is that some URLs occasionally return an error, but the library doesn't log anything unusual.
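To check that guess, my plan is to count requests, responses, and errors next to visited and compare them after the run. A sketch of the extra callbacks I would register on c before visiting or enqueuing the pages (it only uses packages already imported in the snippets above):

// Extra counters to compare against visited once the run is finished.
var requests, responses, failures int64

c.OnRequest(func(r *colly.Request) {
	atomic.AddInt64(&requests, 1)
})
c.OnResponse(func(r *colly.Response) {
	atomic.AddInt64(&responses, 1)
})
c.OnError(func(r *colly.Response, err error) {
	atomic.AddInt64(&failures, 1)
	fmt.Println("error on", r.Request.URL, ":", err)
})

// After q.Run(c) or c.Wait() has returned:
fmt.Println("requests:", requests, "responses:", responses, "failures:", failures)

If failures stays at zero and responses equals requests, then failed requests are probably not what is eating the missing counts.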