新聞中心
這裡有您想知道的互聯網營銷解決方案
PHP多線程爬蟲類
- 代碼:
/**
 * Concurrent crawler built on PHP's curl_multi API.
 *
 * One curl handle is created per URL passed to the constructor; request
 * options are accumulated through the fluent methods below and applied to the
 * handles when run() is called.
 *
 * Public properties:
 *  1. calltrigger  callback invoked with (urls, depth) to crawl discovered links
 *  2. calltodo     callback invoked with each fetched body (e.g. to store it in a database)
 *  3. timeout      connect timeout in seconds, default 5
 *  4. depth        maximum number of HTTP redirects to follow, default 3
 *  5. name         form field name used by upload(), default "file"
 *  6. cookie       local cookie file template, default "cookie.txt" (stored as cookie_<n>.txt)
 *
 * Public methods:
 *  1.  ssl       force https (true: yes, false: no)
 *  2.  auth      HTTP authentication (user, pass)
 *  3.  login     simulate a login and save the received cookies
 *  4.  cookie    send previously saved cookies
 *  5.  header    add request headers (data: array of header lines)
 *  6.  proxy     use a proxy server (url, port)
 *  7.  agent     set the User-Agent string
 *  8.  get       prepare GET requests (data: query parameters)
 *  9.  post      prepare POST requests (data: post fields)
 *  10. json      prepare JSON POST requests (data: value to encode)
 *  11. upload    prepare multipart file uploads (files: array|string of paths)
 *  12. download  write response bodies to files (dir: target directory, e.g. a/b)
 *  13. run       execute all requests (depth: remaining crawl depth)
 */
class crawl
{
    public $calltrigger = 'trigger';   // callback that re-triggers the crawler for discovered links
    public $calltodo = 'todo';         // callback that handles each fetched body
    public $timeout = 5;               // connect timeout in seconds
    public $depth = 3;                 // maximum number of redirects to follow
    public $name = 'file';             // form field name used for uploads
    public $cookie = 'cookie.txt';     // cookie file template; "_<n>" is inserted before the extension

    private $schemes = array();
    private $hosts = array();
    private $paths = array();
    private $querys = array();
    private $options = array();
    private $chs = array();
    private $fps = array();
    private $handle;
    private $urls = array();

    /**
     * Safely read one component of a parse_url() result.
     *
     * @param array|false $info    result of parse_url()
     * @param string      $key     component name
     * @param mixed       $default value used when the component is missing or empty
     * @return mixed
     */
    private static function part($info, $key, $default = '')
    {
        if (is_array($info) && isset($info[$key]) && $info[$key] !== '') {
            return $info[$key];
        }
        return $default;
    }

    /**
     * Extract the distinct link targets of <a href=...> tags.
     *
     * Empty targets, fragment-only targets and javascript:/mailto:/tel: links
     * are dropped because they cannot be fetched.
     *
     * @param string $content page body
     * @return array list of href values in document order
     */
    private function geturl($content)
    {
        // Require whitespace after "<a" (so <article>, <area> ... do not match)
        // and never scan past the end of the tag.
        $preg = '/<a\s+(?:[^>]*?\s)?href\s*=\s*[\'"]?([^>\'"\s]*)[^>]*>/i';
        $urls = array();
        if (preg_match_all($preg, (string) $content, $res)) {
            foreach (array_unique($res[1]) as $href) {
                if ($href === '' || $href[0] === '#') {
                    continue;
                }
                if (preg_match('/^(?:javascript|mailto|tel):/i', $href)) {
                    continue;
                }
                $urls[] = $href;
            }
        }
        return $urls;
    }

    /**
     * Rebuild a URL from its scheme, credentials, host, port and path,
     * defaulting the scheme to http. Query string and fragment are dropped.
     *
     * @param string $url original url
     * @return string normalised url
     */
    private function reviseurl($url)
    {
        $info = parse_url($url);
        $user = self::part($info, 'user');
        $pass = self::part($info, 'pass');
        $port = self::part($info, 'port');

        $fixed = self::part($info, 'scheme', 'http') . '://';
        if ($user && $pass) {
            $fixed .= $user . ":" . $pass . "@";
        }
        $fixed .= self::part($info, 'host');
        if ($port) {
            $fixed .= ":" . $port;
        }
        return $fixed . self::part($info, 'path');
    }

    /**
     * Turn a link found on a page into a fetchable URL.
     *
     * NOTE: this is a simple join, not full RFC 3986 reference resolution.
     *
     * @param string $base normalised URL of the page the link was found on
     * @param string $link href value
     * @return string
     */
    private function resolve($base, $link)
    {
        if (preg_match('#^https?://#i', $link)) {
            return $link;
        }
        if (strpos($link, '//') === 0) {
            // Scheme-relative link: reuse the scheme of the page.
            return substr($base, 0, (int) strpos($base, ':')) . ':' . $link;
        }
        return rtrim($base, '/') . '/' . ltrim($link, '/');
    }

    /**
     * Hand a fetched body to the business callback ($calltodo).
     *
     * @param string|null $content fetched body
     */
    private function todo($content)
    {
        call_user_func($this->calltodo, $content);
    }

    /**
     * Hand discovered URLs to the crawl callback ($calltrigger).
     *
     * @param array $urls  urls to crawl next
     * @param int   $depth remaining depth
     */
    private function trigger($urls, $depth)
    {
        call_user_func($this->calltrigger, $urls, $depth);
    }

    /**
     * Compose the request URL for handle $k.
     *
     * @param mixed  $k     handle key
     * @param string $extra additional, already encoded query string
     * @return string
     */
    private function buildurl($k, $extra = '')
    {
        $pieces = array();
        foreach (array($this->querys[$k], $extra) as $piece) {
            if ((string) $piece !== '') {
                $pieces[] = $piece;
            }
        }
        $url = $this->schemes[$k] . '://' . $this->hosts[$k] . $this->paths[$k];
        if ($pieces) {
            // Join the URL's own query and the extra data with "&".
            $url .= '?' . implode('&', $pieces);
        }
        return $url;
    }

    /**
     * Set the GET url of every handle.
     *
     * @param string $data encoded query string to append
     * @return $this
     */
    private function setget($data)
    {
        foreach ($this->chs as $k => $v) {
            $this->options[$k][CURLOPT_URL] = $this->buildurl($k, $data);
        }
        return $this;
    }

    /**
     * Set the POST url and body of every handle.
     *
     * @param array|string $data post fields
     * @return $this
     */
    private function setpost($data)
    {
        foreach ($this->chs as $k => $v) {
            $this->options[$k][CURLOPT_URL] = $this->buildurl($k);
            $this->options[$k][CURLOPT_POST] = 1;
            $this->options[$k][CURLOPT_POSTFIELDS] = $data;
        }
        return $this;
    }

    /**
     * Apply the accumulated options to the curl handles.
     *
     * @return $this
     */
    private function setopt()
    {
        foreach ($this->options as $k => $opts) {
            // Re-read the public settings so values assigned after
            // construction take effect.
            $opts[CURLOPT_CONNECTTIMEOUT] = $this->timeout;
            $opts[CURLOPT_MAXREDIRS] = $this->depth;
            curl_setopt_array($this->chs[$k], $opts);
        }
        return $this;
    }

    /**
     * Name of the cookie file used for handle $k: "_<k>" is inserted before
     * the extension of $this->cookie and any directory part is kept.
     *
     * @param mixed $k handle key
     * @return string
     */
    private function cookiefile($k)
    {
        $info = pathinfo($this->cookie);
        $dir = '';
        if (isset($info['dirname']) && $info['dirname'] !== '.') {
            $dir = rtrim($info['dirname'], '/') . '/';
        }
        $ext = isset($info['extension']) ? '.' . $info['extension'] : '';
        return $dir . $info['filename'] . '_' . $k . $ext;
    }

    /**
     * Create one curl handle per URL and register it with the multi handle.
     *
     * @param array $urls request urls
     */
    public function __construct($urls)
    {
        $this->urls = $urls;
        $this->handle = curl_multi_init();
        foreach ($urls as $k => $v) {
            $info = parse_url($v);
            $this->schemes[$k] = self::part($info, 'scheme', 'http');
            $this->hosts[$k] = self::part($info, 'host');
            $this->paths[$k] = self::part($info, 'path');
            $this->querys[$k] = self::part($info, 'query');
            $this->chs[$k] = curl_init();
            $this->options[$k][CURLOPT_CONNECTTIMEOUT] = $this->timeout;
            $this->options[$k][CURLOPT_RETURNTRANSFER] = 1;
            $this->options[$k][CURLOPT_FOLLOWLOCATION] = 1;
            $this->options[$k][CURLINFO_HEADER_OUT] = true;
            $this->options[$k][CURLOPT_ENCODING] = 'gzip';
            $this->options[$k][CURLOPT_MAXREDIRS] = $this->depth;
            curl_multi_add_handle($this->handle, $this->chs[$k]);
        }
    }

    /**
     * Switch every request to https. Call before get()/post()/json()/upload()/
     * download(), because those methods build the final URL.
     *
     * @param bool $bool true: https, false: leave unchanged
     * @return $this
     */
    public function ssl($bool = false)
    {
        if ($bool) {
            foreach ($this->chs as $k => $v) {
                $this->schemes[$k] = 'https';
                // libcurl no longer accepts 1 here; PHP maps it to 2 with a notice.
                $this->options[$k][CURLOPT_SSL_VERIFYHOST] = 2;
                // NOTE(review): peer verification is disabled, as in the original
                // code; this accepts untrusted certificates.
                $this->options[$k][CURLOPT_SSL_VERIFYPEER] = false;
            }
        }
        return $this;
    }

    /**
     * Send a user name and password with every request.
     *
     * @param string $user user name
     * @param string $pass password
     * @return $this
     */
    public function auth($user, $pass)
    {
        foreach ($this->chs as $k => $v) {
            $this->options[$k][CURLOPT_USERPWD] = $user . ':' . $pass;
        }
        return $this;
    }

    /**
     * Simulate a login: received cookies are saved to the per-handle cookie
     * file and the response body is no longer returned to the caller.
     *
     * @return $this
     */
    public function login()
    {
        foreach ($this->chs as $k => $v) {
            $this->options[$k][CURLOPT_COOKIEJAR] = $this->cookiefile($k);
            $this->options[$k][CURLOPT_RETURNTRANSFER] = 0;
        }
        return $this;
    }

    /**
     * Send the cookies stored in the per-handle cookie file.
     *
     * @return $this
     */
    public function cookie()
    {
        foreach ($this->chs as $k => $v) {
            $this->options[$k][CURLOPT_COOKIEFILE] = $this->cookiefile($k);
        }
        return $this;
    }

    /**
     * Append request headers.
     *
     * @param array $data header lines such as "Accept: text/html"
     * @return $this
     */
    public function header($data)
    {
        foreach ($this->chs as $k => $v) {
            $current = isset($this->options[$k][CURLOPT_HTTPHEADER])
                ? $this->options[$k][CURLOPT_HTTPHEADER]
                : array();
            $this->options[$k][CURLOPT_HTTPHEADER] = array_merge($current, $data);
        }
        return $this;
    }

    /**
     * Route every request through a proxy server.
     *
     * @param string $url  proxy address, with or without scheme
     * @param int    $port proxy port
     * @return $this
     */
    public function proxy($url, $port)
    {
        $info = parse_url($url);
        $host = self::part($info, 'host');
        if ($host === '') {
            // Without a scheme parse_url() reports the host name as the path.
            $host = trim(self::part($info, 'path'), '/');
        }
        $purl = self::part($info, 'scheme', 'http') . '://' . $host . ':' . $port;
        foreach ($this->chs as $k => $v) {
            $this->options[$k][CURLOPT_PROXY] = $purl;
        }
        return $this;
    }

    /**
     * Set the User-Agent string.
     *
     * @param string $browse user agent
     * @return $this
     */
    public function agent($browse = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)')
    {
        foreach ($this->chs as $k => $v) {
            $this->options[$k][CURLOPT_USERAGENT] = $browse;
        }
        return $this;
    }

    /**
     * Prepare GET requests.
     *
     * @param array $data query parameters
     * @return $this
     */
    public function get($data = array())
    {
        return $this->setget(http_build_query($data));
    }

    /**
     * Prepare POST requests.
     *
     * @param array|string $data post fields
     * @return $this
     */
    public function post($data = array())
    {
        return $this->setpost($data);
    }

    /**
     * Prepare POST requests with a JSON body.
     *
     * @param mixed $data value to encode as JSON
     * @return $this
     */
    public function json($data = array())
    {
        $data = json_encode($data);
        $this->header(array(
            'Content-Type: application/json',
            'Content-Length:' . strlen($data)
        ));
        return $this->setpost($data);
    }

    /**
     * Prepare multipart file uploads using $this->name as the field name.
     *
     * @param array|string $files path or list of paths
     * @return $this
     */
    public function upload($files)
    {
        $data = array();
        $name = $this->name;
        if (is_array($files)) {
            foreach ($files as $k => $v) {
                $data["{$name}[{$k}]"] = new \CURLFile($v);
            }
        } else {
            $data["{$name}"] = new \CURLFile($files);
        }
        return $this->setpost($data);
    }

    /**
     * Write each response body to a file named after the last segment of the
     * URL path.
     *
     * @param string $dir target directory (created when missing); empty for the current directory
     * @return $this
     * @throws \RuntimeException when a target file cannot be opened
     */
    public function download($dir = '')
    {
        if ($dir && !is_dir($dir)) {
            mkdir($dir, 0755, true);
        }
        $base = $dir ? rtrim($dir, '/') : '.';
        foreach ($this->paths as $k => $v) {
            $file = basename((string) $v);
            if ($file === '') {
                // URLs such as "http://host/" have no file name of their own.
                $file = 'download_' . $k;
            }
            $target = $base . '/' . $file;
            $fp = fopen($target, 'w');
            if ($fp === false) {
                throw new \RuntimeException('Unable to open ' . $target . ' for writing');
            }
            $this->fps[$k] = $fp;
            $this->options[$k][CURLOPT_FILE] = $fp;
        }
        return $this->setget('');
    }

    /**
     * Execute all requests, pass every body to the $calltodo callback and the
     * links found in it to the $calltrigger callback. Nothing is fetched when
     * $depth is not positive.
     *
     * @param int $depth remaining crawl depth, default 2
     */
    public function run($depth = 2)
    {
        $this->setopt();
        $handle = $this->handle;
        $urls = $this->urls;
        if ($depth > 0) {
            $depth--;
            $active = null;
            do {
                $mrc = curl_multi_exec($handle, $active);
            } while ($mrc == CURLM_CALL_MULTI_PERFORM);

            while ($active && $mrc == CURLM_OK) {
                // select() returns -1 when it cannot wait; back off briefly
                // instead of spinning on the CPU.
                if (curl_multi_select($handle) == -1) {
                    usleep(100);
                }
                do {
                    $mrc = curl_multi_exec($handle, $active);
                } while ($mrc == CURLM_CALL_MULTI_PERFORM);
            }

            foreach ($this->chs as $k => $ch) {
                if (curl_error($ch) == "") {
                    $content = curl_multi_getcontent($ch);
                    $this->todo($content);
                    $found = $this->geturl($content);
                    if (!empty($found)) {
                        $base = $this->reviseurl($urls[$k]);
                        // Start a fresh list per page so links of one page are
                        // not re-submitted together with those of another.
                        $returl = array();
                        foreach ($found as $link) {
                            $returl[] = $this->resolve($base, $link);
                        }
                        $this->trigger($returl, $depth);
                    }
                }
                curl_multi_remove_handle($handle, $ch);
                curl_close($ch);
                if (isset($this->fps[$k]) && is_resource($this->fps[$k])) {
                    // Flush and release the download target.
                    fclose($this->fps[$k]);
                }
            }
            curl_multi_close($handle);
        }
    }
}
- 測試:
/**
 * Business callback: receives the body of every fetched page.
 * This demo only prints a marker line per page.
 *
 * @param string $content fetched page body (unused here)
 */
function todo($content)
{
    print 'ok' . PHP_EOL;
}

/**
 * Crawl callback: fetches the given urls and lets the crawler call back
 * into this function for the links it discovers.
 *
 * @param array $urls  urls to fetch
 * @param int   $depth remaining crawl depth
 */
function trigger($urls = array(), $depth = 2)
{
    $crawler = new crawl($urls);
    $crawler->get();
    $crawler->run($depth);
}

// Seed urls for the demo run.
$urls = array(
    'www.baidu.com',
    'www.taobao.com',
);

trigger($urls);
- 輸出:
ok ok ok ok ok ok ok ok ok ok ok ok ok ok
網頁題目:php多線程爬蟲類
當(dāng)前路徑:http://ef60e0e.cn/article/gojios.html