Broken link check while initiating cron-multi

335 Views Asked by At

This code checks a page for broken links by initiating multiple cron jobs. However, it does not identify the broken links correctly: it always reports 0 broken links, even though there are broken links on the page. `$url_list` is an array of the links found on the page.

$mh = curl_multi_init();
    for ($i = 0; $i < $max_connections; $i++)
    {
        $this->add_url_to_multi_handle($mh, $url_list);
    }
    do
    {
        $mrc = curl_multi_exec($mh, $active);
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);

    while ($active && $mrc == CURLM_OK)
    {

        // 5. there is activity  
        if (curl_multi_select($mh) != -1)
        {

            // 6. do work  
            do
            {
                $mrc = curl_multi_exec($mh, $active);
            } while ($mrc == CURLM_CALL_MULTI_PERFORM);

            // 7. is there info?  
            if ($mhinfo = curl_multi_info_read($mh))
            {
                // this means one of the requests were finished  
                // 8. get the info on the curl handle  
                $codes = array('400', '401', '402', '403', '404', '500', '501', '502', '503');
                $chinfo = curl_getinfo($mhinfo['handle']);

                // 9. dead link?  
                if (!$chinfo['http_code'])
                {
                    $dead_urls [] = $chinfo['url'];

                    // 10. 404?  
                }
                else if (in_array($chinfo['http_code'], $codes) || !$chinfo['http_code'])
                {
                    $broken_links++;
                    $data = array(
                        'domain_id' => $domain_id,
                        'pageurl' => $url,
                        'broken_link' => $chinfo['url']
                    );
                    $this->db->insert($table, $data);

                    $not_found_urls [] = $chinfo['url'];

                    // 11. working  
                }
                else
                {
                    $working_urls [] = $chinfo['url'];
                }

                // 12. remove the handle  
                curl_multi_remove_handle($mh, $mhinfo['handle']);
                curl_close($mhinfo['handle']);

                // 13. add a new url and do work  
                if ($this->add_url_to_multi_handle($mh, $url_list))
                {

                    do
                    {
                        $mrc = curl_multi_exec($mh, $active);
                    } while ($mrc == CURLM_CALL_MULTI_PERFORM);
                }
            }
        }
    }
    curl_multi_close($mh);
    echo "<br/>Total Broken: " . $broken_links;
    echo "==<br/>Broken URLs==\n";
    echo "<pre>";
    print_r($not_found_urls);
    echo "</pre>";
    $nooflinks = $total_link;
}

/**
 * Queue the next unchecked URL from $url_list on a cURL multi handle.
 *
 * A separate cursor is remembered for every multi handle. Once the list has
 * been exhausted for a handle the function keeps returning false for it,
 * instead of rewinding and silently re-queuing the list from the start
 * (which made callers that add "one more URL" after each finished transfer
 * check the same links again and again). A fresh multi handle always starts
 * at the first URL.
 *
 * Blank entries ("" or null) are skipped rather than treated as the end of
 * the list, so one empty link no longer hides every link that follows it.
 *
 * @param resource|CurlMultiHandle $mh       Multi handle returned by curl_multi_init().
 * @param array                    $url_list Zero-indexed list of URLs to check.
 *
 * @return bool true if a request was queued, false when no URL is left.
 */
function add_url_to_multi_handle($mh, $url_list)
{
    // Cursors for resource-based handles (PHP < 8), keyed by resource id.
    static $resource_cursors = array();
    // Cursors for object-based handles (PHP >= 8). A WeakMap is used so an
    // entry disappears with its handle and a recycled object id can never
    // inherit a stale cursor.
    static $object_cursors = null;

    $is_object = is_object($mh);
    if ($is_object)
    {
        if ($object_cursors === null)
        {
            $object_cursors = new WeakMap();
        }
        $index = isset($object_cursors[$mh]) ? $object_cursors[$mh] : 0;
    }
    else
    {
        $key = (int) $mh;
        $index = isset($resource_cursors[$key]) ? $resource_cursors[$key] : 0;
    }

    // Step over blank entries; they carry nothing to request.
    while (array_key_exists($index, $url_list) && $url_list[$index] == "")
    {
        $index++;
    }

    $queued = false;
    if (array_key_exists($index, $url_list))
    {
        // new curl handle
        $ch = curl_init();

        curl_setopt($ch, CURLOPT_URL, $url_list[$index]);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
        // Only the status code matters, so do not download the body.
        curl_setopt($ch, CURLOPT_NOBODY, TRUE);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
        curl_multi_add_handle($mh, $ch);

        // advance so the next call picks the following URL
        $index++;
        $queued = true;
    }

    // Persist the cursor for this handle; it is deliberately never rewound.
    if ($is_object)
    {
        $object_cursors[$mh] = $index;
    }
    else
    {
        $resource_cursors[$key] = $index;
    }

    return $queued;
}

Thanks.

0

There are 0 best solutions below