Why am I only getting the picture URLs from the last URL?

71 Views Asked by At

I wrote a program to extract image links from webcomics. However, when I run it, it only extracts the image links from the last chapter link, not the image links from all chapters. What is the issue with my program? I have tried several approaches, but none of them helped.

from PyQt5 import QtNetwork, QtCore
from requests_html import HTML
from functools import cached_property
from PyQt5.QtCore import QCoreApplication, QUrl

# Start page requested first by Manager.start(); Manager.process() then looks
# for chapter links ("#list-chapter a") in the HTML returned for it.
url1 = "https://saytruyen.net/truyen-su-tro-lai-cua-phap-su-hac-am-sau-66666-nam.html"

class Manager:
    """Fetch the page at ``url1`` and print image URLs from the linked pages.

    The HTML of ``url1`` is searched for ``#list-chapter a`` links; each link
    is requested in turn and every ``#lst_content img`` element of the reply
    has its ``src`` printed with the site prefix prepended.
    """

    def __init__(self):
        # Route every finished request of the index-page manager to
        # handle_response.
        self.manager.finished.connect(self.handle_response)

    @cached_property
    def manager(self):
        """Return the QNetworkAccessManager used for the index page.

        ``cached_property`` means the manager is created on first access and
        the same instance is returned afterwards.
        """
        return QtNetwork.QNetworkAccessManager()

    def start(self):
        """Request the index page (``url1``)."""
        self.start_request(QtCore.QUrl(url1))

    def start_request(self, url):
        """Send a GET request for *url* through ``self.manager``."""
        request = QtNetwork.QNetworkRequest(url)
        self.manager.get(request)

    def handle_response(self, reply):
        """Handle the reply for the index page.

        On success the body is decoded as UTF-8 and passed to ``process``;
        otherwise the error code and error string are printed.
        """
        err = reply.error()
        if err == QtNetwork.QNetworkReply.NoError:
            self.process(str(reply.readAll(), 'utf-8'))
        else:
            print("Error occured: ", err)
            print(reply.errorString())

    def process(self, data):
        """Request every chapter link found in the index page HTML *data*."""
        html = HTML(html=data)
        rs = html.find("#list-chapter a", first=False)
        # The matched links are visited in reverse order of appearance.
        for i in reversed(rs):
            url2 = "https://saytruyen.net/" + i.attrs["href"]
            #print(url2)
            #self.start_request(QtCore.QUrl(url2))
            req = QtNetwork.QNetworkRequest(QUrl(url2))

            # NOTE(review): self.nam is rebound on every iteration, so only
            # the manager created in the last iteration stays referenced by
            # this object. The managers from earlier iterations lose their
            # only Python reference while their requests are presumably still
            # pending -- this matches the reported symptom that only the last
            # chapter is handled. Confirm against Qt object-lifetime rules.
            self.nam = QtNetwork.QNetworkAccessManager()
            self.nam.finished.connect(self.handleResponse)
            self.nam.get(req)

    def handleResponse(self, reply):
        """Print the image URLs of one chapter reply, then quit the app.

        On success every ``#lst_content img`` element's ``src`` is printed
        with the site prefix; on failure the error code and error string are
        printed.
        """

        er = reply.error()

        if er == QtNetwork.QNetworkReply.NoError:

            bytes_string = reply.readAll()
            html2 = HTML(html = str(bytes_string, 'utf-8'))
            rs_c = html2.find("#lst_content img")
            for x in rs_c:
                img ="https://saytruyen.net/" + x.attrs['src']
                print(img)

        else:
            print("Error occured: ", er)
            print(reply.errorString())
        
        # NOTE(review): quit() runs unconditionally after the first reply
        # that reaches this slot, whether or not other chapter requests are
        # still outstanding.
        QCoreApplication.quit()
1

There is 1 best solution below

6
musicamante On

There are two problems:

  1. the QNetworkAccessManager used for the download is recreated on every iteration of the for loop; since a network request is asynchronous, it isn't processed instantly, so when `self.nam` is overwritten in the next iteration the previous manager is destroyed together with its still-pending request; the result is that only the last request will "survive";
  2. the application quits as soon as the first reply is received, preventing all other requests from being processed.

The solution is to create a single manager for the download process in the __init__, and to quit only once the replies to all requests have been received.

class Manager:
    """Download every chapter page and print the image URLs it contains.

    Only the methods that differ from the question's code are shown here; the
    elided ones (``manager``, ``start``, ``start_request`` and
    ``handle_response``) are unchanged.

    A single, long-lived ``QNetworkAccessManager`` (``self.nam``) issues all
    chapter requests, and ``self.urls`` tracks which of them are still
    waiting for a reply so the application quits only after the last one.
    """

    def __init__(self):
        self.manager.finished.connect(self.handle_response)
        # Created once and kept for the lifetime of this object, so pending
        # requests are not discarded together with a replaced manager.
        self.nam = QtNetwork.QNetworkAccessManager()
        self.nam.finished.connect(self.handleResponse)
        # URLs of chapter requests that were sent but not yet answered.
        self.urls = set()

    # ...

    def process(self, data):
        """Queue a download for every chapter link found in *data*.

        Args:
            data: HTML source of the index page.

        Links are visited in reverse order of appearance. Anchors without an
        ``href`` and URLs that are already pending are skipped. If nothing
        ends up pending, the application is quit right away because no reply
        would ever arrive to trigger the quit in ``handleResponse``.
        """
        html = HTML(html=data)
        for link in reversed(html.find("#list-chapter a", first=False)):
            href = link.attrs.get("href")
            if not href:
                # Anchor without a target: nothing to download.
                continue
            url2 = QUrl("https://saytruyen.net/" + href)
            if url2 in self.urls:
                continue
            self.urls.add(url2)
            self.nam.get(QtNetwork.QNetworkRequest(url2))

        if not self.urls:
            QCoreApplication.quit()

    def handleResponse(self, reply):
        """Print the image URLs of one chapter reply; quit after the last one.

        Args:
            reply: the finished ``QNetworkReply`` emitted by ``self.nam``.
        """
        # Use the URL that was originally requested as the bookkeeping key:
        # reply.url() may differ from it (e.g. after a redirect), which would
        # leave the entry in self.urls forever and the app would never quit.
        self.urls.discard(reply.request().url())
        er = reply.error()

        if er == QtNetwork.QNetworkReply.NoError:
            # 'replace' keeps one malformed byte sequence from raising inside
            # the slot and aborting the remaining downloads.
            html2 = HTML(html=str(reply.readAll(), 'utf-8', 'replace'))
            for x in html2.find("#lst_content img"):
                src = x.attrs.get('src')
                if src:
                    print("https://saytruyen.net/" + src)
        else:
            print("Error occured: ", er)
            print(reply.errorString())

        # Qt leaves deletion of finished replies to the caller; schedule it
        # so replies do not accumulate for the lifetime of the manager.
        reply.deleteLater()

        if not self.urls:
            QCoreApplication.quit()

Note that it's usually enough (and better) to have a single network manager and properly handle responses based on queued requests, but for simple situations like this one having two managers doesn't represent a huge problem.