Skip to content
Advertisement

Error when trying to get Instagram Embed page HTML code

I’m trying to get the HTML code of Instagram’s embed pages for my API, but it returns a strange error, and I don’t know what to do next because I’m new to PHP. The same code works on other websites.

I tried it already on other websites like apple.com and the strange thing is that when I call this function on the ‘normal’ post page it works, the error only appears when I call it on the ‘/embed’ URL.

This is my PHP Code:

<?php
    // Fetches the document named by the `url` query parameter, parses it as
    // HTML and echoes the serialized <body> element. Does nothing when the
    // parameter is absent.
    if (isset($_GET['url'])) {
        // NOTE(review): this value is request-controlled and is handed to
        // file_get_contents() without any validation (scheme, host, ...).
        $filename = $_GET['url'];
        // NOTE(review): the result is not checked; file_get_contents() is
        // documented to return false on failure — confirm that case is handled.
        $file = file_get_contents($filename);
        $dom = new DOMDocument;
        // Collect libxml parse errors internally while loading the markup,
        // then restore the previous error handling mode.
        libxml_use_internal_errors(true);
        $dom->loadHTML($file);
        libxml_use_internal_errors(false);
        $bodies = $dom->getElementsByTagName('body');
        // NOTE(review): assert() may be disabled by configuration, so this is
        // not a reliable runtime check that exactly one <body> exists.
        assert($bodies->length === 1);
        $body = $bodies->item(0);
        // NOTE(review): presumably meant to walk the body's child nodes.
        // The documented DOMElement property is `childNodes` (not `children`)
        // and the documented removal method is removeChild($node) — verify
        // what this loop actually does on the target PHP version.
        for ($i = 0; $i < $body->children->length; $i++) {
            $body->remove($body->children->item($i));
        }
        // Serialize only the <body> subtree and send it as the response.
        $stringbody = $dom->saveHTML($body);
        echo $stringbody;
    }
?>

I call the API like this:

https://api.com/get-website-body.php?url=http://instagr.am/p/BoLVWplBVFb/embed

My goal is to get the body of the website, like it is when I call this code on the https://apple.com URL for example.

Advertisement

Answer

You can scrape the URL directly with cURL, which is also faster than file_get_contents. Here is cURL code that works for different URLs and extracts only the body of the page.

// Fetches a page with cURL and echoes the HTML of its <body> element.
// Runs only when the request carries a `url` query parameter.
if (isset($_GET['url'])) {
    // The target is set directly here; the `url` parameter only gates execution.
    // Other targets that can be fetched the same way:
    //   https://www.instagram.com/instagram/?__a=1
    //   https://apple.com
    $website_url = 'http://instagr.am/p/BoLVWplBVFb/embed';

    $curl = curl_init();
    curl_setopt_array($curl, [
        CURLOPT_URL            => $website_url,
        CURLOPT_REFERER        => $website_url,
        CURLOPT_HEADER         => false, // keep response headers out of the result
        CURLOPT_FOLLOWLOCATION => true,  // follow HTTP redirects to the final page
        CURLOPT_RETURNTRANSFER => true,  // return the page instead of printing it
        CURLOPT_USERAGENT      => 'Mozilla/5.0(Windows NT 6.1; rv:8.0) Gecko/20100101 Firefox/66.0',
    ]);
    $html = curl_exec($curl);
    // Read the error text before the handle is closed.
    $error = curl_error($curl);
    curl_close($curl);

    if ($html === false || $html === '') {
        // curl_exec() returns false on failure, and DOMDocument::loadHTML()
        // cannot parse an empty string, so stop here with a readable message.
        http_response_code(502);
        echo 'Could not fetch ' . $website_url . ($error !== '' ? ': ' . $error : '');
    } else {
        // $html holds the complete page; parse it to take the body part alone.
        $dom = new DOMDocument;
        // Real-world markup is rarely valid; keep libxml warnings out of the output.
        libxml_use_internal_errors(true);
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors(false);

        $body = $dom->getElementsByTagName('body')->item(0);
        if ($body !== null) {
            // saveHTML($node) serializes the element as markup; print_r() on a
            // DOMElement would dump the object instead of the page content.
            echo $dom->saveHTML($body);
        }
    }
}

NOTE: With this approach you don’t need to call https://api.com/get-website-body.php?url=.... with the target address; the URL to scrape is set directly in the code.

User contributions licensed under: CC BY-SA
3 people found this helpful
Advertisement