Skip to content
Advertisement

What is correct example to use Multi-cURL?

What is correct example (up-to-date approach) use CURL-MULTI? I use the below code, but many times, it fails to get the content (returns empty result, and neither I have experience how to retrieve the correct repsonse/error):

public function multi_curl($urls)
{          
    $AllResults =[]; 
    $mch = curl_multi_init();
    $handlesArray=[];
    $curl_conn_timeout= 3 *60; //max 3 minutes
    $curl_max_timeout = 30*60; //max 30 minutes

    foreach ($urls as $key=> $url) {
        $ch = curl_init();  
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_HEADER, false);
        // timeouts: https://thisinterestsme.com/php-setting-curl-timeout/   and https://stackoverflow.com/a/15982505/2377343
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $curl_conn_timeout);
        curl_setopt($ch, CURLOPT_TIMEOUT, $curl_max_timeout);
        if (defined('CURLOPT_TCP_FASTOPEN')) curl_setopt($ch, CURLOPT_TCP_FASTOPEN, 1);
        curl_setopt($ch, CURLOPT_ENCODING, ""); // empty to autodetect | gzip,deflate
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
        curl_setopt($ch, CURLOPT_URL, $url);
        $handlesArray[$key] = $ch;
        curl_multi_add_handle($mch, $handlesArray[$key]);
    }
   
    // other approaches are deprecated ! https://stackoverflow.com/questions/58971677/
    do {
        $execReturnValue = curl_multi_exec($mch, $runningHandlesAmount);
        usleep(100); // stop 100 microseconds to avoid infinity speed recursion
    } while ($runningHandlesAmount>0);
   
    //exec now
    foreach($urls as $key => $url)
    {
        $AllResults[$key]['url'] =$url;
        $handle = $handlesArray[$key];
        // Check for errors
        $curlError = curl_error($handle);
        if ($curlError!="")
        {
            $AllResults[$key]['error']    =$curlError;
            $AllResults[$key]['response'] =false;
        }
        else {
            $AllResults[$key]['error']    =false;
            $AllResults[$key]['response'] =curl_multi_getcontent($handle);
        }
        curl_multi_remove_handle($mch, $handle); curl_close($handle);
    }
    curl_multi_close($mch);
    return $AllResults;
}

and executing:

$urls = [ 'https://baconipsum.com/api/?type=meat-and-filler',
          'https://baconipsum.com/api/?type=all-meat&paras=2'];

$results = $helpers->multi_curl($urls);

Is there something, that can be changed, to have better results?


update: I’ve found this repository also mentions the lack of documentation about the best-use-case for multi-curl and provides their approach. However, I ask this on SO to get other competent answers too.

Advertisement

Answer

I use the below code

that code has issues:

  • it has NO connection cap, if you try to open 1 million urls simultaneously, it will try to create 1 million tcp connections at once (many websites will block you as a TCP DDoS around 100!)
  • it doesn’t even verify that it was able to create the curl easy handles (which it definitely won’t be able to do if it has too many urls, see the first issue)
  • it sleeps for 100 microseconds, which may be 100 microseconds longer than required, it’s supposed to use select() to let the OS tell it exactly when the data has arrived/been-sent, not wait 100 us (with curl_multi_select())
  • doesn’t detect transfer errors..
  • (optimization nitpicking) it doesn’t fetch any workers data until every single worker has finished, an optimized implementation would drain completed workers while still-working-workers would be transferring simultaneously..
  • (optimization-nitpicking) it doesn’t re-use handles
  • (optimization nitpicking) it doesn’t remove completed workers from the multi_list until every single worker has finished, which use more cpu in every curl_multi_exec call (because mutli_exec has to iterate even the finished workers that are still in the list)

this implementation should be significantly faster, has a configurable limit on max simultaneous connections, re-use curl handles, removes completed workers asap, detect curl_multi errors, etc

/**
 * fetch all urls in parallel,
 * warning: all urls must be unique..
 *
 * @param array $urls_unique
 *            urls to fetch
 * @param int $max_connections
 *            (optional, default 100) max simultaneous connections
 *            (some websites will auto-ban you for "ddosing" if you send too many requests simultaneously,
 *            and some wifi routers will get unstable on too many connectionis.. )
 * @param array $additional_curlopts
 *            (optional) set additional curl options here, each curl handle will get these options
 * @throws RuntimeException on curl_multi errors
 * @throws RuntimeException on curl_init() / curl_setopt() errors
 * @return array(url=>response,url2=>response2,...)
 */
function curl_fetch_multi_2(array $urls_unique, int $max_connections = 100, array $additional_curlopts = null)
{
    // $urls_unique = array_unique($urls_unique);
    $ret = array();
    $mh = curl_multi_init();
    // $workers format: [(int)$ch]=url
    $workers = array();
    $max_connections = min($max_connections, count($urls_unique));
    $unemployed_workers = array();
    for ($i = 0; $i < $max_connections; ++ $i) {
        $unemployed_worker = curl_init();
        if (! $unemployed_worker) {
            throw new RuntimeException("failed creating unemployed worker #" . $i);
        }
        $unemployed_workers[] = $unemployed_worker;
    }
    unset($i, $unemployed_worker);

    $work = function () use (&$workers, &$unemployed_workers, &$mh, &$ret): void {
        assert(count($workers) > 0, "work() called with 0 workers!!");
        $still_running = null;
        for (;;) {
            do {
                $err = curl_multi_exec($mh, $still_running);
            } while ($err === CURLM_CALL_MULTI_PERFORM);
            if ($err !== CURLM_OK) {
                $errinfo = [
                    "multi_exec_return" => $err,
                    "curl_multi_errno" => curl_multi_errno($mh),
                    "curl_multi_strerror" => curl_multi_strerror($err)
                ];
                $errstr = "curl_multi_exec error: " . str_replace([
                    "r",
                    "n"
                ], "", var_export($errinfo, true));
                throw new RuntimeException($errstr);
            }
            if ($still_running < count($workers)) {
                // some workers has finished downloading, process them
                // echo "processing!";
                break;
            } else {
                // no workers finished yet, sleep-wait for workers to finish downloading.
                // echo "select()ing!";
                curl_multi_select($mh, 1);
                // sleep(1);
            }
        }
        while (false !== ($info = curl_multi_info_read($mh))) {
            if ($info['msg'] !== CURLMSG_DONE) {
                // no idea what this is, it's not the message we're looking for though, ignore it.
                continue;
            }
            if ($info['result'] !== CURLM_OK) {
                $errinfo = [
                    "effective_url" => curl_getinfo($info['handle'], CURLINFO_EFFECTIVE_URL),
                    "curl_errno" => curl_errno($info['handle']),
                    "curl_error" => curl_error($info['handle']),
                    "curl_multi_errno" => curl_multi_errno($mh),
                    "curl_multi_strerror" => curl_multi_strerror(curl_multi_errno($mh))
                ];
                $errstr = "curl_multi worker error: " . str_replace([
                    "r",
                    "n"
                ], "", var_export($errinfo, true));
                throw new RuntimeException($errstr);
            }
            $ch = $info['handle'];
            $ch_index = (int) $ch;
            $url = $workers[$ch_index];
            $ret[$url] = curl_multi_getcontent($ch);
            unset($workers[$ch_index]);
            curl_multi_remove_handle($mh, $ch);
            $unemployed_workers[] = $ch;
        }
    };
    $opts = array(
        CURLOPT_URL => '',
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_ENCODING => ''
    );
    if (! empty($additional_curlopts)) {
        // i would have used array_merge(), but it does scary stuff with integer keys.. foreach() is easier to reason about
        foreach ($additional_curlopts as $key => $val) {
            $opts[$key] = $val;
        }
    }
    foreach ($urls_unique as $url) {
        while (empty($unemployed_workers)) {
            $work();
        }
        $new_worker = array_pop($unemployed_workers);
        $opts[CURLOPT_URL] = $url;
        if (! curl_setopt_array($new_worker, $opts)) {
            $errstr = "curl_setopt_array failed: " . curl_errno($new_worker) . ": " . curl_error($new_worker) . " " . var_export($opts, true);
            throw new RuntimeException($errstr);
        }
        $workers[(int) $new_worker] = $url;
        curl_multi_add_handle($mh, $new_worker);
    }
    while (count($workers) > 0) {
        $work();
    }
    foreach ($unemployed_workers as $unemployed_worker) {
        curl_close($unemployed_worker);
    }
    curl_multi_close($mh);
    return $ret;
}
User contributions licensed under: CC BY-SA
10 People found this is helpful
Advertisement