I am trying to extract all the links from a set list of or urls in a text file and save the extracted links in another text file. I am trying to use the script below which was originally meant to extract Emails:
I changed the the email extract part
// preg_match_all('/([w+.]*w+@[w+.]*w+[w+-w+]*.w+)/is', $sPageContent, $aResults);
to extract links like this:
preg_match_all("/a[s]+[^>]*?href[s]?=[s"']+(.*?)["']+.*?>([^<]+|.*?)?</a>/is", $sPageContent, $aResults);
Here is the full code:
class getEmails { const EMAIL_STORAGE_FILE = 'links.txt'; public function __construct($sFilePath) { $aUrls = $this->getUrls($sFilePath); foreach($aUrls as $sUrl) { $rPage = $this->getContents($sUrl); $this->getAndSaveEmails($rPage); } $this->removeDuplicate(); } protected function getAndSaveEmails($sPageContent) { // preg_match_all('/([w+.]*w+@[w+.]*w+[w+-w+]*.w+)/is', $sPageContent, $aResults); preg_match_all("/a[s]+[^>]*?href[s]?=[s"']+(.*?)["']+.*?>([^<]+|.*?)?</a>/is", $sPageContent, $aResults); foreach($aResults[1] as $sCurrentEmail) { file_put_contents(self::EMAIL_STORAGE_FILE, $sCurrentEmail . "rn", FILE_APPEND); } } protected function getContents($sUrl) { if (function_exists('curl_init')) { $rCh = curl_init(); curl_setopt($rCh, CURLOPT_URL, $sUrl); curl_setopt($rCh, CURLOPT_HEADER, 0); curl_setopt($rCh, CURLOPT_RETURNTRANSFER, 1); curl_setopt($rCh, CURLOPT_FOLLOWLOCATION, 1); $mResult = curl_exec($rCh); curl_close($rCh); unset($rCh); return $mResult; } else { return file_get_contents($sUrl); } } protected function getUrls($sFilePath) { return file($sFilePath); } protected function removeDuplicate() { $aEmails = file(self::EMAIL_STORAGE_FILE); $aEmails = array_unique($aEmails); file_put_contents(self::EMAIL_STORAGE_FILE, implode('', $aEmails)); } } new getEmails('sitemap_index.txt');
The problem i have with this is that it is supposed to get all links from a list of urls but it only scanned the first link and ignored the rest. I have 30 links that i want to extract from, how can i make the above code work?
Advertisement
Answer
you must using trim() at the url..
try add trim on your code
foreach($aUrls as $sUrl) { $sUrl=trim($sUrl); //this $rPage = $this->getContents($sUrl); $this->getAndSaveEmails($rPage); }