如何从页面获取所有网址(php)

问题描述:

我有一个网页,上面逐行列出了许多带描述的网址(类似书签/网站列表)。如何使用 PHP 获取该页面上的所有 URL,并将它们写入 txt 文件(每行一个,只要 URL,不要描述)?

页面看起来是这样的:

Some description

Other description

Another one

我希望脚本输出的 txt 文件内容如下:

http://link.com

http://link2.com

http://link3.com

一种方法

// Download the page and drop every tag except <a>, so anchors are the only markup left.
$url = "http://wwww.somewhere.com";
$anchorsOnly = strip_tags(file_get_contents($url), "<a>");

// Walk the text piece by piece, using each closing </a> as the separator.
foreach (preg_split("/<\/a>/", $anchorsOnly) as $chunk) {
    // Skip pieces that do not contain the literal text `<a href=`.
    if (strpos($chunk, "<a href=") === FALSE) {
        continue;
    }

    // Remove everything up to and including the opening double quote of the href value...
    $link = preg_replace("/.*<a\s+href=\"/sm", "", $chunk);
    // ...then remove each remaining double quote together with the rest of its line.
    $link = preg_replace("/\".*/", "", $link);

    echo $link . "\n";
}
+0

另一种方式

// Dump the href attribute of every <a> element on a page, parsed with DOMDocument.
$url = "http://wwww.somewhere.com";

$html = file_get_contents($url);

// file_get_contents() returns false on failure (it has already raised a warning),
// and an empty document has no links, so only parse when there is content.
if ($html !== false && $html !== '') {
    // Real-world pages are rarely valid HTML. Collect libxml's parse complaints
    // internally instead of letting each one surface as a PHP warning.
    $previousSetting = libxml_use_internal_errors(true);

    $doc = new DOMDocument();
    $doc->loadHTML($html);

    // Discard the collected parse errors and restore the previous error mode.
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $xpath = new DOMXpath($doc);

    // Select every anchor element in the document.
    $nodes = $xpath->query('//a');

    foreach ($nodes as $node) {
        // getAttribute() yields an empty string for anchors without an href.
        var_dump($node->getAttribute('href'));
    }
}

你可以用下面的代码获取给定网页中的所有链接。

<?php 

    // Print every link target found on the page at $url, each followed by <br>.
    // $url is not assigned here; it must already be set when this code runs.
    $page = fread_url($url);

    // fread_url() may not return a string (e.g. false when the download fails);
    // treat that as an empty page so the regex always receives a string.
    if (!is_string($page)) {
        $page = '';
    }

    // Capture group 1 is the href value, group 2 the anchor's inner text.
    // $matches is passed plainly: preg_match_all() already takes it by reference,
    // and call-time "&$matches" is a fatal error since PHP 5.4.
    preg_match_all("/a[\s]+[^>]*?href[\s]?=[\s\"\']+".
        "(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/",
        $page, $matches);

    foreach ($matches[1] as $link)
    {
        // The links come from a remote page, so escape them before writing them
        // into HTML output. double_encode is off so entities already present in
        // the scraped source (e.g. &amp;) are not encoded a second time.
        print(htmlspecialchars($link, ENT_QUOTES, 'UTF-8', false)."<br>");
    }

    /**
     * Download the contents of a URL as a string.
     *
     * Uses cURL when the extension is loaded; otherwise reads the URL through fopen().
     *
     * @param string $url URL to fetch.
     * @param string $ref Value sent as the HTTP Referer header (cURL path only).
     * @return string|false The fetched body, or false when the cURL transfer
     *                      or the fopen() call fails.
     */
    function fread_url($url,$ref="")
    {
        if(function_exists("curl_init")){
            $user_agent = "Mozilla/4.0 (compatible; MSIE 5.01; ".
                "Windows NT 5.0)";

            // A single handle is created and closed; no second curl_init() call.
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
            curl_setopt($ch, CURLOPT_HTTPGET, 1);
            // Make curl_exec() return the body instead of echoing it.
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_REFERER, $ref);
            // Received cookies are saved to cookie.txt (relative path).
            curl_setopt($ch, CURLOPT_COOKIEJAR, 'cookie.txt');
            $html = curl_exec($ch);
            curl_close($ch);

            return $html;
        }

        // Fallback when cURL is unavailable: read through the stream wrappers.
        $hfile = fopen($url,"r");
        if(!$hfile){
            return false;
        }

        // Start from an empty string so the appends below never touch an
        // undefined variable.
        $html = "";
        while(!feof($hfile)){
            $line = fgets($hfile,1024);
            // fgets() returns false on a read error; stop rather than spin forever.
            if($line === false){
                break;
            }
            $html .= $line;
        }
        fclose($hfile);

        return $html;
    }

    ?>