<?php
/**
 * Minimal same-site link crawler.
 *
 * Fetches ROOT, extracts the anchors that point at DOMAIN (or one of its
 * sub-domains), then follows those links depth-first, printing every
 * distinct URL it discovers exactly once.
 */

define('PRE_DOMAIN', 'www');
define('DOMAIN', 'sina.com.cn');
define('PROTOCOL', 'https');
define('ROOT', PROTOCOL . '://' . PRE_DOMAIN . '.' . DOMAIN . '/');

// Stream each discovered URL to stdout, one per CRLF-terminated line.
foreach (spider() as $key => $value) {
    echo $value . "\r\n";
}

/**
 * Crawl the site starting at ROOT and lazily yield every distinct URL found.
 *
 * Links found on the root page act as seeds. All seeds are registered as
 * seen (and yielded) before any crawling starts, so a seed that is linked
 * from another page is neither fetched twice nor reported inconsistently.
 * Each seed is then explored depth-first using an explicit stack.
 *
 * @return Generator yields absolute URL strings, each at most once
 */
function spider()
{
    $headers = array(
        'user-agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    );

    // array_unique keeps the first occurrence, so document order is preserved.
    $seeds = array_unique(parseURL(get(ROOT, $headers)));

    $seen = array();
    foreach ($seeds as $seed) {
        $seen[$seed] = true;
        yield $seed;
    }

    foreach ($seeds as $seed) {
        $stack = array($seed);
        while (!empty($stack)) {
            $page = array_pop($stack);
            foreach (parseURL(get($page, $headers)) as $link) {
                if (isset($seen[$link])) {
                    continue;
                }
                $seen[$link] = true;
                yield $link;
                $stack[] = $link;
            }
        }
    }
}

/**
 * Fetch a URL with cURL and return its body.
 *
 * @param string     $url    URL to request
 * @param array|null $header optional list of raw HTTP header lines
 *
 * @return string the response body; an empty string when the transfer
 *                failed or when a 200 response is not served as text/html
 */
function get($url, $header = null)
{
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    // NOTE(review): TLS peer/host verification is switched off here, which
    // makes the crawler accept any certificate. Kept as in the original so
    // hosts without a CA bundle keep working; enable it if that is not needed.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
    if (!empty($header)) {
        curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
    }
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 3);
    curl_setopt($curl, CURLOPT_TIMEOUT, 10);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

    $output = curl_exec($curl);
    $info = curl_getinfo($curl);
    // Release the handle before any return path so it is never leaked.
    curl_close($curl);

    if ($output === false) {
        return '';
    }

    $status = isset($info['http_code']) ? (int) $info['http_code'] : 0;
    // content_type is null when the server sent no Content-Type header.
    $contentType = isset($info['content_type']) ? (string) $info['content_type'] : '';
    if ($status === 200 && stripos($contentType, 'text/html') === false) {
        return '';
    }

    return $output;
}

/**
 * Extract the crawlable links from an HTML document.
 *
 * Every <a ... href="..."> is inspected. Fragment-only links, non-HTTP
 * schemes (javascript:, mailto:, tel:, ...) and links whose host is outside
 * DOMAIN are dropped. Protocol-relative links get PROTOCOL prepended, and
 * scheme-less paths are attached to ROOT (the page's own URL is not known
 * here, so relative paths are always resolved against the site root).
 *
 * @param string|false $content HTML source; false/empty yields no links
 *
 * @return array list of absolute URLs in document order (may contain repeats)
 */
function parseURL($content)
{
    $found = preg_match_all(
        // [^>] keeps the match inside a single tag, so several anchors on one
        // line (or a tag spread over several lines) are all captured.
        '/<a\s+(?:[^>]*?\s)?href\s*=\s*["\']([^"\'>]*)["\']/i',
        (string) $content,
        $matches
    );
    if (!$found) {
        return array();
    }

    $links = array();
    foreach ($matches[1] as $href) {
        $href = trim($href);
        if ($href === '' || $href[0] === '#') {
            continue;
        }

        // Protocol-relative reference: reuse the crawler's own scheme.
        if (strpos($href, '//') === 0) {
            $href = PROTOCOL . ':' . $href;
        }

        if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $href, $scheme)) {
            $name = strtolower($scheme[1]);
            if ($name !== 'http' && $name !== 'https') {
                continue;
            }
            $host = parse_url($href, PHP_URL_HOST);
            if (is_string($host) && isCrawlableHost($host)) {
                $links[] = $href;
            }
            continue;
        }

        // Scheme-less path: anchor it at the site root. Only the leading
        // slash is removed so that a trailing slash in the path survives.
        $links[] = ROOT . ltrim($href, '/');
    }

    return $links;
}

/**
 * Tell whether a host name is DOMAIN itself or one of its sub-domains.
 *
 * @param string $host host name taken from a URL
 *
 * @return bool
 */
function isCrawlableHost($host)
{
    $host = strtolower($host);
    $domain = strtolower(DOMAIN);
    $suffix = '.' . $domain;

    return $host === $domain || substr($host, -strlen($suffix)) === $suffix;
}