Skip to content
Advertisement

Create array from DOM node values in PHP

I’m trying to create an array of Amazon product variants using PHP's DOM extension. My desired array should look like this:

["Variant Name":"ASIN number"]

Here is my code:

// Download the product page into $html.
$ch = curl_init();

curl_setopt($ch, CURLOPT_URL, 'https://www.amazon.co.uk/dp/B08LZHMQXS?psc=1');
// Return the body from curl_exec() instead of echoing it.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'GET');
$html = curl_exec($ch);
// Read the error text before the handle is closed.
$curlError = curl_error($ch);
curl_close($ch);

// curl_exec() returns false on failure; stop here rather than handing
// a boolean to the HTML parser further down.
if ($html === false) {
    throw new RuntimeException('Request failed: ' . $curlError);
}


// Parse the downloaded page. Real-world markup rarely satisfies libxml, so
// collect parser warnings internally instead of emitting one per bad tag.
libxml_use_internal_errors(true);
$dom = new DomDocument();
$dom->loadHTML($html);
libxml_clear_errors();
$dom_xpath = new DOMXpath($dom);

// Create the result holders explicitly: assigning a property on an
// undefined variable is an Error on PHP 8.
$json1 = isset($json1) ? $json1 : new stdClass();
$json = isset($json) ? $json : new stdClass();

$variants = $dom_xpath->query('//*[@class="swatchAvailable" or @class="swatchSelect"]');
foreach ($variants as $data) {
    $input = $data->getAttribute("data-defaultasin");
    $inputn = $data->getAttribute("title");

    // The ASIN is taken as the first run of 10 characters; skip swatches
    // that carry no (or a too short) data-defaultasin value.
    if (empty($input) || !preg_match('/.{10}/', $input, $output)) {
        continue;
    }

    // Strip the tooltip prefix to leave the bare variant name. Titles that
    // lack the prefix are used as they are instead of hitting an undefined index.
    $name = trim(str_replace("Click to select ", "", $inputn));
    if ($name === '') {
        continue;
    }

    $json1->SizeVariant3[] = $name;
    $json1->SizeVariant4[] = $output[0];
}

// Build the name => ASIN map once, after the loop, instead of recombining the
// two lists on every iteration.
// Note: a string-keyed PHP array is always rendered by json_encode() as a JSON
// object ({"name":"asin"}); in PHP itself $json->VariantB is a plain array.
if (!empty($json1->SizeVariant3)) {
    $json->VariantB = array_combine($json1->SizeVariant3, $json1->SizeVariant4);
}



My code works, but it contains some mistakes and might not work for all Amazon products, so I would appreciate suggestions and improvements. Also, my output is a JSON object:

{
        "2031 Deep Blue": "B08LZH84TN",
        "2031 Khaki": "B08LZHMQXS",
 }

while I want it to be an array as I mentioned above.

Advertisement

Answer

<?php       
    /**
     * Perform a GET request with cURL and return the body plus transfer details.
     *
     * @param string|null $url     Address to request; surrounding whitespace is trimmed.
     * @param string      $cacert  Path to a CA bundle FILE (download a copy from
     *                             https://curl.haxx.se/docs/caextract.html). It is applied
     *                             only to https URLs and only when the file is readable;
     *                             otherwise cURL's default trust store is used.
     * @return object  Object with three members:
     *                 response - body string, or false when the transfer failed
     *                 info     - curl_getinfo() result cast to an object (e.g. ->http_code)
     *                 errors   - curl_error() text, empty string when there was none
     */
    function curl( $url=NULL, $cacert='c:/wwwroot/cacert.pem' ){
        # normalise once so that a NULL argument is never passed to parse_url()/trim()
        $url=trim( (string)$url );

        $curl=curl_init();
        # URL schemes are case-insensitive, so compare in lower case
        if( strtolower( (string)parse_url( $url,PHP_URL_SCHEME ) )=='https' ){
            curl_setopt( $curl, CURLOPT_SSL_VERIFYPEER, true );
            curl_setopt( $curl, CURLOPT_SSL_VERIFYHOST, 2 );
            # CURLOPT_CAINFO takes a bundle file. CURLOPT_CAPATH expects a directory of
            # hashed certificates, so the bundle file must not be passed to it.
            if( is_readable( $cacert ) ){
                curl_setopt( $curl, CURLOPT_CAINFO, $cacert );
            }
        }
        curl_setopt( $curl, CURLOPT_URL, $url );
        curl_setopt( $curl, CURLOPT_AUTOREFERER, true );
        curl_setopt( $curl, CURLOPT_FOLLOWLOCATION, true );
        # treat HTTP status codes >= 400 as a failed transfer
        curl_setopt( $curl, CURLOPT_FAILONERROR, true );
        curl_setopt( $curl, CURLOPT_HEADER, false );
        curl_setopt( $curl, CURLINFO_HEADER_OUT, false );
        # return the body from curl_exec() rather than printing it
        # ( CURLOPT_BINARYTRANSFER has had no effect since PHP 5.1.3 and is omitted )
        curl_setopt( $curl, CURLOPT_RETURNTRANSFER, true );
        curl_setopt( $curl, CURLOPT_CONNECTTIMEOUT, 20 );
        curl_setopt( $curl, CURLOPT_TIMEOUT, 60 );
        curl_setopt( $curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Curly-Wurly Ding-Dong' );
        curl_setopt( $curl, CURLOPT_MAXREDIRS, 10 );
        # empty string = advertise every encoding this cURL build can decode
        curl_setopt( $curl, CURLOPT_ENCODING, '' );

        # info and errors must be read before the handle is closed
        $res=(object)array(
            'response'  =>  curl_exec( $curl ),
            'info'      =>  (object)curl_getinfo( $curl ),
            'errors'    =>  curl_error( $curl )
        );
        curl_close( $curl );
        return $res;
    }
    
    
    
    
    
    # Fetch a product page and print a "variant name => ASIN" array built from its swatches.
    $url='https://www.amazon.co.uk/dp/B08LZHMQXS?psc=1';

    $res=curl( $url );
    # curl_exec() yields false on failure and loadHTML() rejects an empty string,
    # so insist on a non-empty body as well as a 200 status.
    if( $res->info->http_code==200 && is_string( $res->response ) && $res->response!=='' ){

        # keep libxml's complaints about sloppy markup out of the output,
        # then put the error mode back the way it was found
        $previous=libxml_use_internal_errors( true );
        $dom=new DOMDocument;

        $dom->validateOnParse=false;
        $dom->recover=true;
        $dom->strictErrorChecking=false;
        $dom->loadHTML( $res->response );
        libxml_clear_errors();
        libxml_use_internal_errors( $previous );

        $xp=new DOMXPath( $dom );
        # Match the class as a whitespace-separated token so that elements carrying
        # additional classes ( class="swatchAvailable foo" ) are found too.
        $expr='//*['
            .'contains( concat( " ", normalize-space( @class ), " " ), " swatchAvailable " )'
            .' or '
            .'contains( concat( " ", normalize-space( @class ), " " ), " swatchSelect " )'
            .']';

        $tmp=array();

        $col=$xp->query( $expr );
        if( $col && $col->length > 0 ){
            foreach( $col as $node ){
                $asin=trim( $node->getAttribute('data-defaultasin') );
                $title=trim( str_replace( array( 'Click to select ', '|' ), '', $node->getAttribute('title') ) );

                # an entry is only useful when both halves of the pair are present
                if( $asin==='' || $title==='' ){
                    continue;
                }
                $tmp[$title]=$asin;
            }
        }

        # the text originates from a remote page - escape it before placing it in HTML
        printf('<pre>%s</pre>',htmlspecialchars( print_r($tmp,true), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8' ));
    }else{
        # report the failure instead of silently printing nothing
        error_log( sprintf( 'Unable to fetch %s (HTTP %d): %s', $url, $res->info->http_code, $res->errors ) );
    }
?>

Which outputs:

Array
(
    [2031 Deep Blue] => B08LZH84TN
    [2031 Khaki] => B08LZHMQXS
    [2031 Light Grey] => B08LZFGGRL
    [2031 Navy] => B08LZNGD5H
    [2031 Deep Grey] => B08LZHZXDW
    [2031 Wine Red] => B08LZHHGPD
    [8636 All White] => B07PHQ69B7
    [2031 All Black] => B08LZKXC3G
)

When tested with a new url it yielded:

Array
(
    [Wine] => B07NYYZSWG
    [Gold] => B07H4P7TZP
    [Rose Gold] => B07H4ZMTML
    [Silver Glitz] => B07P27Y9SQ
)
User contributions licensed under: CC BY-SA
5 people found this helpful
Advertisement