I am trying to extract all the links from a set list of or urls in a text file and save the extracted links in another text file. I am trying to use the script below which was originally meant to extract Emails:
I changed the the email extract part
JavaScript
x
// preg_match_all('/([w+.]*w+@[w+.]*w+[w+-w+]*.w+)/is', $sPageContent, $aResults);
to extract links like this:
JavaScript
preg_match_all("/a[s]+[^>]*?href[s]?=[s"']+(.*?)["']+.*?>([^<]+|.*?)?</a>/is", $sPageContent, $aResults);
Here is the full code:
JavaScript
class getEmails
{
const EMAIL_STORAGE_FILE = 'links.txt';
public function __construct($sFilePath)
{
$aUrls = $this->getUrls($sFilePath);
foreach($aUrls as $sUrl) {
$rPage = $this->getContents($sUrl);
$this->getAndSaveEmails($rPage);
}
$this->removeDuplicate();
}
protected function getAndSaveEmails($sPageContent)
{
// preg_match_all('/([w+.]*w+@[w+.]*w+[w+-w+]*.w+)/is', $sPageContent, $aResults);
preg_match_all("/a[s]+[^>]*?href[s]?=[s"']+(.*?)["']+.*?>([^<]+|.*?)?</a>/is", $sPageContent, $aResults);
foreach($aResults[1] as $sCurrentEmail) {
file_put_contents(self::EMAIL_STORAGE_FILE, $sCurrentEmail . "rn", FILE_APPEND);
}
}
protected function getContents($sUrl)
{
if (function_exists('curl_init')) {
$rCh = curl_init();
curl_setopt($rCh, CURLOPT_URL, $sUrl);
curl_setopt($rCh, CURLOPT_HEADER, 0);
curl_setopt($rCh, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($rCh, CURLOPT_FOLLOWLOCATION, 1);
$mResult = curl_exec($rCh);
curl_close($rCh);
unset($rCh);
return $mResult;
} else {
return file_get_contents($sUrl);
}
}
protected function getUrls($sFilePath)
{
return file($sFilePath);
}
protected function removeDuplicate()
{
$aEmails = file(self::EMAIL_STORAGE_FILE);
$aEmails = array_unique($aEmails);
file_put_contents(self::EMAIL_STORAGE_FILE, implode('', $aEmails));
}
}
new getEmails('sitemap_index.txt');
The problem i have with this is that it is supposed to get all links from a list of urls but it only scanned the first link and ignored the rest. I have 30 links that i want to extract from, how can i make the above code work?
Advertisement
Answer
you must using trim() at the url..
try add trim on your code
JavaScript
foreach($aUrls as $sUrl) {
$sUrl=trim($sUrl); //this
$rPage = $this->getContents($sUrl);
$this->getAndSaveEmails($rPage);
}