I am writing a program to scrape the following website: https://filmstoon.in/
From it, I want to find several movies (Batman Begins, Iron Man, Expendables 3) and a TV series (Game of Thrones), and to scrape the title, the host URL and the meta URL of each. I managed to do this; however, the script is hard-coded for these specific titles. The code:
include("simple_html_dom.php");

// Lift the execution time limit: the script fetches one page per episode.
ini_set('max_execution_time', 0);
date_default_timezone_set('Europe/Vilnius');

// Pages to scrape: one TV-series overview page and three movie pages.
$link = "https://filmstoon.in/series/game-of-thrones/";
$link1 = "https://filmstoon.in/batman-begins/";
$link2 = "https://filmstoon.in/iron-man/";
$link3 = "https://filmstoon.in/expendables-3/";

//TV Series

/**
 * Plain data holder for one scraped episode: its title, the URL of the
 * embedded player (host URL) and the URL of the episode page itself
 * (linking URL).
 */
class episode
{
    private $title;
    private $host_url;
    private $linking_url;

    public function setTitle($title)
    {
        $this->title = $title;
    }

    public function getTitle()
    {
        return $this->title;
    }

    public function setHost_url($host_url)
    {
        $this->host_url = $host_url;
    }

    public function getHost_url()
    {
        return $this->host_url;
    }

    public function setLinking_url($linking_url)
    {
        $this->linking_url = $linking_url;
    }

    public function getLinking_url()
    {
        return $this->linking_url;
    }
}

/**
 * Scrapes every episode of a series, printing each record and appending it
 * to scrape.txt.
 *
 * @param string|null $seriesUrl Series overview page; when omitted the global
 *                               $link is used, as before.
 */
function main($seriesUrl = null)
{
    foreach (getting_url($seriesUrl) as $single_link) {
        $episodeObject = info_from_linking($single_link);
        if ($episodeObject === null) {
            // One broken episode page must not abort the whole run.
            error_log("Skipping {$single_link}: page could not be fetched or parsed");
            continue;
        }

        echo_to_server(
            $episodeObject->getLinking_url(),
            $episodeObject->getHost_url(),
            $episodeObject->getTitle()
        );
        writeToFile(
            $episodeObject->getLinking_url(),
            $episodeObject->getHost_url(),
            $episodeObject->getTitle()
        );
    }
}

/**
 * Collects the href of every anchor inside elements with class "les-content"
 * on the series overview page.
 *
 * @param string|null $seriesUrl Series overview page; falls back to the global
 *                               $link when omitted.
 *
 * @return array List of episode page URLs (empty when the page cannot be loaded).
 */
function getting_url($seriesUrl = null)
{
    global $link;

    if ($seriesUrl === null) {
        $seriesUrl = $link;
    }

    $html = file_get_html($seriesUrl);
    if (!$html) {
        // file_get_html() yields false on failure; calling find() on it would be fatal.
        error_log("Could not load series page: {$seriesUrl}");
        return array();
    }

    $array_url = array();
    foreach ($html->find('.les-content a') as $anchor) {
        $array_url[] = $anchor->href;
    }

    return $array_url;
}

/**
 * Loads one episode page and extracts its title and player URL.
 *
 * @param string $episode_link URL of the episode page.
 *
 * @return episode|null The populated episode, or null when the page cannot be
 *                      fetched or lacks the expected elements.
 */
function info_from_linking($episode_link)
{
    $inside_linking = file_get_html($episode_link);
    if (!$inside_linking) {
        return null;
    }

    $mainDiv = $inside_linking->find('div[class="main-content main-detail"]', 0);
    if (!$mainDiv) {
        return null;
    }

    $titleNode = $mainDiv->find('h3[itemprop="name"]', 0);
    // The player URL is read from the second iframe (index 1) in the container.
    // NOTE(review): presumably the first iframe is not the player - confirm.
    $playerFrame = $mainDiv->find('iframe', 1);
    if (!$titleNode || !$playerFrame) {
        return null;
    }

    $class = new episode;
    $class->setTitle($titleNode->plaintext);
    $class->setHost_url($playerFrame->src);
    $class->setLinking_url($episode_link);

    return $class;
}

/**
 * Prints one tab-separated record, prefixed with the current date and time.
 */
function echo_to_server($linking_url, $host_url, $title)
{
    $date = date('m/d/Y H:i');
    // The separators are real tab/newline escapes, not the letters "t" and "n".
    echo "{$date} \t {$linking_url} \t {$host_url} \t {$title} \n";
}

/**
 * Appends one record (print_r dump of date, linking URL, host URL, title)
 * to scrape.txt.
 */
function writeToFile($linking_url, $host_url, $title)
{
    $date = date('m/d/Y H:i');
    $result = array($date, $linking_url, $host_url, $title);
    $output = 'scrape.txt';

    if (file_put_contents($output, print_r($result, true), FILE_APPEND) === false) {
        error_log("Could not write to {$output}");
    }
}

main();

//Movies

/**
 * Scrapes a single movie page: prints its host URL, og:url meta link and
 * title, and appends the same record to scrape.txt. Pages without the
 * "main-content main-detail" container are silently ignored.
 *
 * @param string $url URL of the movie page.
 */
function get_content_movies($url)
{
    $htmlContent = file_get_contents($url);
    if ($htmlContent === false) {
        error_log("Could not load movie page: {$url}");
        return;
    }

    $dom = new simple_html_dom();
    $dom->load($htmlContent);

    $file = $dom->find('div[class="main-content main-detail"]', 0);
    if (!$file) {
        return;
    }

    $titleNode = $file->find('h3[itemprop="name"]', 0);
    $playerFrame = $file->find('iframe', 1);
    $metaNode = $dom->find('meta[property="og:url"]', 0);
    if (!$titleNode || !$playerFrame || !$metaNode) {
        error_log("Movie page is missing the title, player iframe or og:url meta tag: {$url}");
        return;
    }

    $title = $titleNode->plaintext;
    $host_url = $playerFrame->src;
    $meta_link = $metaNode->content;
    $date = date('m/d/Y H:i');

    echo "{$date} \t {$host_url} \t {$meta_link} \t {$title} \n";

    $result = array($title, $host_url, $meta_link, $date);
    $output = 'scrape.txt';
    if (file_put_contents($output, print_r($result, true), FILE_APPEND) === false) {
        error_log("Could not write to {$output}");
    }
}

get_content_movies($link1);
get_content_movies($link2);
get_content_movies($link3);
Everything works fine; however, I would like to make it so that if I run:
php crawler.php batman begins
or any other title while executing the code in cmd, it would specifically find that movie/tv series and execute the script that I wrote.
So far, the only ideas I've had for implementing this are either to scrape the entire site, store the result in a simple data store (a .txt file, for example) and then look up the requested title in it using $argc and $argv, or to go to the main page and use its search function: the argument I pass on the command line would be submitted to the search form, and the script would then run on the result.
However, I cannot wrap my head around how to implement either of these ideas, since I am quite new to this.
Advertisement
Answer
/**
 * Joins the command-line arguments (everything after the script name) into
 * a single string.
 *
 * @param array  $argv      Argument vector; index 0 (the script name) is skipped.
 * @param string $seperator Glue placed between consecutive arguments.
 *
 * @return string The joined arguments, or '' when none were supplied.
 */
function getArgumentValues($argv, $seperator)
{
    // array_slice() drops the script name; implode() copes with an empty list,
    // so a missing $argv[1] no longer raises an "undefined offset" warning.
    return implode($seperator, array_slice((array) $argv, 1));
}

/**
 * Looks up the title given on the command line in the listing page at
 * $linkMovies and scrapes the first matching entry.
 *
 * A match is an anchor whose oldtitle attribute starts with the requested
 * title (case-insensitive). If the matched URL contains a "/series/" path
 * segment the episode scraper main() is run for it; otherwise the movie page
 * is fetched, its player URL, og:url and title are printed and the record is
 * appended to scrape.txt.
 *
 * @param string $linkMovies URL of the page that lists the available titles.
 * @param array  $argv       Command-line argument vector.
 *
 * @return null Always null; results are echoed and written to scrape.txt.
 */
function get_content_movies($linkMovies, $argv)
{
    if (!is_string($linkMovies) || $linkMovies === '') {
        error_log('get_content_movies: no listing URL given');
        return null;
    }

    $argvValue = getArgumentValues($argv, " ");
    if ($argvValue === '') {
        // An empty title would match the very first anchor on the page.
        error_log('Usage: php crawler.php <title>');
        return null;
    }

    $htmlContent = file_get_contents($linkMovies);
    if ($htmlContent === false) {
        error_log("Could not load listing page: {$linkMovies}");
        return null;
    }

    // [^"]* and [^>]* keep href and oldtitle inside the same tag, so the href
    // of an earlier anchor cannot be paired with a later anchor's title.
    // preg_quote() stops characters such as "(" or "?" in the title from being
    // interpreted as regex syntax.
    $pattern = '~href="([^"]*)"[^>]*\boldtitle="' . preg_quote($argvValue, '~') . '~i';
    if (!preg_match($pattern, $htmlContent, $search)) {
        return null;
    }

    $key = $search[1];

    // Match the path segment, not the bare word, so a movie whose slug merely
    // contains "series" is not mistaken for a TV series.
    if (strpos($key, '/series/') !== false) {
        // main() (defined with the series scraper in this file) reads the
        // series URL from the global $link, so point it at the series found.
        $GLOBALS['link'] = $key;
        main();
        return null;
    }

    $htmlContent = file_get_contents($key);
    if ($htmlContent === false) {
        error_log("Could not load movie page: {$key}");
        return null;
    }

    // "~" delimiters avoid having to escape the "/" in closing tags.
    $found = preg_match('~<h3 itemprop="name">(.*)</h3>~iSU', $htmlContent, $title)
        && preg_match('~<iframe.*data-lazy-src="(.*)".*></iframe>~iSU', $htmlContent, $embed_url)
        && preg_match('~<meta.*property="og:url".*content="(.*)".*/>~iSU', $htmlContent, $meta_url);
    if (!$found) {
        error_log("Movie page is missing the title, player iframe or og:url meta tag: {$key}");
        return null;
    }

    $date = date('m/d/Y H:i');
    echo "{$date} \t {$embed_url[1]} \t {$meta_url[1]} \t {$title[1]} \n";

    $result = array($date, $embed_url[1], $meta_url[1], $title[1]);
    $output = 'scrape.txt';
    if (file_put_contents($output, print_r($result, true), FILE_APPEND) === false) {
        error_log("Could not write to {$output}");
    }

    return null;
}

// NOTE(review): $linkMovies is not defined in this snippet; presumably it is
// set elsewhere to the URL of the site's listing page - confirm.
get_content_movies($linkMovies, $argv);