Skip to content
Advertisement

Extract links from a list of urls

I am trying to extract all the links from a set list of or urls in a text file and save the extracted links in another text file. I am trying to use the script below which was originally meant to extract Emails:

I changed the the email extract part

          // preg_match_all('/([w+.]*w+@[w+.]*w+[w+-w+]*.w+)/is', $sPageContent, $aResults);
      

to extract links like this:

          preg_match_all("/a[s]+[^>]*?href[s]?=[s"']+(.*?)["']+.*?>([^<]+|.*?)?</a>/is", $sPageContent, $aResults);

Here is the full code:

class getEmails 

{
    const EMAIL_STORAGE_FILE = 'links.txt';

     public function __construct($sFilePath)
     {
         $aUrls = $this->getUrls($sFilePath);

         foreach($aUrls as $sUrl) {
             $rPage = $this->getContents($sUrl);
             $this->getAndSaveEmails($rPage);
         }
         $this->removeDuplicate();
     }

     protected function getAndSaveEmails($sPageContent)
     {
          // preg_match_all('/([w+.]*w+@[w+.]*w+[w+-w+]*.w+)/is', $sPageContent, $aResults);
          
          preg_match_all("/a[s]+[^>]*?href[s]?=[s"']+(.*?)["']+.*?>([^<]+|.*?)?</a>/is", $sPageContent, $aResults);

         foreach($aResults[1] as $sCurrentEmail) {
             file_put_contents(self::EMAIL_STORAGE_FILE, $sCurrentEmail . "rn", FILE_APPEND);
         }
     }

     protected function getContents($sUrl)
     {
         if (function_exists('curl_init')) {
            $rCh = curl_init();
            curl_setopt($rCh, CURLOPT_URL, $sUrl);
            curl_setopt($rCh, CURLOPT_HEADER, 0);
            curl_setopt($rCh, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($rCh, CURLOPT_FOLLOWLOCATION, 1);
            $mResult = curl_exec($rCh);
            curl_close($rCh);
            unset($rCh);
            return $mResult;
        } else {
            return file_get_contents($sUrl);
        }
     }

     protected function getUrls($sFilePath)
     {
         return file($sFilePath);
     }

     protected function removeDuplicate()
     {
         $aEmails = file(self::EMAIL_STORAGE_FILE);
         $aEmails = array_unique($aEmails);
         file_put_contents(self::EMAIL_STORAGE_FILE, implode('', $aEmails));
     }
}

new getEmails('sitemap_index.txt');

The problem i have with this is that it is supposed to get all links from a list of urls but it only scanned the first link and ignored the rest. I have 30 links that i want to extract from, how can i make the above code work?

Advertisement

Answer

you must using trim() at the url..
try add trim on your code

     foreach($aUrls as $sUrl) {

         $sUrl=trim($sUrl); //this

         $rPage = $this->getContents($sUrl);
         $this->getAndSaveEmails($rPage);
     }
User contributions licensed under: CC BY-SA
4 People found this is helpful
Advertisement