I am trying to scrape a webpage that has an AJAX paging html table. I can get the first page of the table fine but I want to be able to get every single page in the table. Here is an example of an ajax paging table.
This how I am currently scraping the first page:
<?php include_once('simple_html_dom.php'); class JobPosting { var $Id; var $RequistionNumber; var $Title; var $JobCategory; var $EmployeeClassification; var $PartTime; var $City; var $Location; } function GetJobPostings($filter) { $html = file_get_html('https://rn21.ultipro.com/SQU1001/jobboard/listjobs.aspx'); $JobPostings = array(); $headerFlag = true; foreach($html->find('table.GridTable > tbody > tr') as $job) { if($headerFlag == true) { $headerFlag = false; } else { $count = 0; $jobposting = new JobPosting; foreach($job->find('td') as $property) { switch ($count) { case 0: $jobposting->Id = $property->find('a')[0]->href; $jobposting->RequistionNumber = $property->plaintext; break; case 1: $jobposting->Title = $property->plaintext; break; case 2: $jobposting->JobCategory = $property->plaintext; break; case 3: $jobposting->EmployeeClassification = $property->plaintext; break; case 4: $jobposting->PartTime = $property->plaintext; break; case 5: $jobposting->City = $property->plaintext; break; case 6: $jobposting->Location = $property->plaintext; break; } $count++; } if($jobposting->Company == $filter ) { array_push($JobPostings, $jobposting); } } } return $JobPostings; } ?>
How can I get all the job postings from this website?
Here is the request from fiddler for the example:
Advertisement
Answer
At the end of the POST
content, search for __PreparedPXDO
and change the value of __PageNumber
:
__VIEWSTATE=%2FwEPDwUENTM4MWRk8%2BV34Sl40tIYo214geiPudK8q8E%3D&__VIEWSTATEGENERATOR=0906FCBB&__EVENTVALIDATION=%2FwEWDQLRjdiVBQLZmLPIBwLRmtKmBwLJvYapAwKduOGWBgKC9qGSBQL99cOUCAK2xqmfDQKj96rgDQLLvLGTDwLArLTaDQKD%2BaqJAQLUpr%2FLD8lqfch4SOH0EsNlMYgxMFw%2FPlV6&__PXPOSTBACK=1&__PXLBN=&__PXLBV=&__PXDO=&__Keywords=&__RecordsPerPage=&__PreparedPXDO=*501A15E038216814AFF20DE0ED6A92EF0CE34B88DABA9CE31C59DE380A87C4EC0C9572AA25DFFDFB8E15C55C55D711411B386A423D44CE26F051F013F6826174BD97FCDFC3954E186D1F0F623E8C53B4250AEB08B15EF180055517D55245D702DC340152E0ECE6CE6AE715B58F3EA59ADA88C90800A1ED5126FAD95F78078DB20185DBE2A3E3CD0BB550D5CC57A39F33002085E0BC4DAF71FAAE85E859DD39D50C2EDD607B38DD8C9572965A96E105B4896AA698A0663ACE8B3D6AA25271EB9E6225FB183415C27BD742A576A1F32AF4E51012C95495030DDF98AE9007E645CA9D54BA17839DF70BAFDB732014857D3BCFEDAD3F1C8E1BA42A22836C70FC68757FDC843E65FEFAF8D6779A62AF4C791F34DF0EC4794614C6985C18D64694DAD5C5250E7961A0A2B482D32100F42BEB6BA6F35FBB72429D02&__PageNumber= ->HERE<- &__Next=+%3E+
You can easily create a loop to scrap all available pages, i.e.:
<?php $num = 1; while($num < 5){ $postFields = "__VIEWSTATE=%2FwEPDwUENTM4MWRk8%2BV34Sl40tIYo214geiPudK8q8E%3D&__VIEWSTATEGENERATOR=0906FCBB&__EVENTVALIDATION=%2FwEWDQLRjdiVBQLZmLPIBwLRmtKmBwLJvYapAwKduOGWBgKC9qGSBQL99cOUCAK2xqmfDQKj96rgDQLLvLGTDwLArLTaDQKD%2BaqJAQLUpr%2FLD8lqfch4SOH0EsNlMYgxMFw%2FPlV6&__PXPOSTBACK=1&__PXLBN=&__PXLBV=&__PXDO=&__Keywords=&__RecordsPerPage=&__PreparedPXDO=*501A15E038216814AFF20DE0ED6A92EF0CE34B88DABA9CE31C59DE380A87C4EC0C9572AA25DFFDFB8E15C55C55D711411B386A423D44CE26F051F013F6826174BD97FCDFC3954E186D1F0F623E8C53B4250AEB08B15EF180055517D55245D702DC340152E0ECE6CE6AE715B58F3EA59ADA88C90800A1ED5126FAD95F78078DB20185DBE2A3E3CD0BB550D5CC57A39F33002085E0BC4DAF71FAAE85E859DD39D50C2EDD607B38DD8C9572965A96E105B4896AA698A0663ACE8B3D6AA25271EB9E6225FB183415C27BD742A576A1F32AF4E51012C95495030DDF98AE9007E645CA9D54BA17839DF70BAFDB732014857D3BCFEDAD3F1C8E1BA42A22836C70FC68757FDC843E65FEFAF8D6779A62AF4C791F34DF0EC4794614C6985C18D64694DAD5C5250E7961A0A2B482D32100F42BEB6BA6F35FBB72429D02&__PageNumber=$num&__Next=+%3E+"; $curlHeaders = array( "Host: rn21.ultipro.com", "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:46.0) Gecko/20100101 Firefox/46.0", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip, deflate, br", "Referer: https://rn21.ultipro.com/SQU1001/jobboard/listjobs.aspx?Page=Browse", "Cookie: __jbsrcid=*86EE06D72F666815; ASP.NET_SessionId=3vvj3v55gadspcyklbfg2gip; BIGipServerrn21.ultipro.com_http_pool=366258698.20480.0000", "Connection: keep-alive", "Content-Type: application/x-www-form-urlencoded", "Content-Length: " . strlen($postFields) ); $url = "https://rn21.ultipro.com/SQU1001/jobboard/listjobs.aspx?Page=Browse"; $ch = curl_init(); curl_setopt($ch,CURLOPT_URL, $url); curl_setopt($ch,CURLOPT_FOLLOWLOCATION, 1); curl_setopt($ch,CURLOPT_RETURNTRANSFER, 1); curl_setopt($ch,CURLOPT_HTTPHEADER, $curlHeaders); curl_setopt($ch,CURLOPT_POSTFIELDS, $postFields); $result = curl_exec($ch); curl_close($ch); echo $result; $num++; } ?>
The trick is to make a first post (next page button) using a normal browser and sniff the post content using live http headers for firefox, or similar, and emulate it after using curl.
I’ve tested the code above and I was able to retrieve all job posts on the website.