- 論壇徽章:
- 1
|
php实现的web采集神器,只需要通过简单配置,就可以采集任意没有严格校验的站点
可以扩展IP代理功能以及伪原创功能
[PHP]代码
<?php
/**
 * 可以灵活配置使用的采集器
 * 作者:Rain
 * 创建时间:2015-02-03 15:17:30
 * 版本信息:V1.0
 */

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Database settings -- adjust these to match your own database.
define('DB_HOST', 'localhost');
define('DB_USER', 'root');
define('DB_PWD', 'test123456');
define('DB_NAME', 'test_dbname');
define('DB_CHARSET', 'utf8');
define('TABLE_NAME', 'tb_book');
// end

// Target-site settings -- adjust these to the site whose content is collected.
define('WEB_CHARSET', 'gbk');
// List-page URL. The varying part is written as %d; only numeric variation is supported.
define('WEB_LIST_URL', 'http://www.pcbookcn.com/book/1_%d.htm');
// Number of list pages. Gather::run() uses it as the last page number to fetch.
define('PAGE_COUNT', 14);
// List page number the crawl starts from.
define('PAGE_START', 1);
// Regular expression matching content-page URLs. It must include the / delimiters,
// e.g. /\/xuefu2008\/article\/details\/(\d)+/i
define('WEB_CONTENT_URL_REG', '/\/book\/(\d)+\.htm/i');
// Scheme and host of the site, without a trailing slash, e.g. http://blog.csdn.net
define('WEB_HOST', 'http://www.pcbookcn.com');
// Regular expression that narrows a list page down to the module containing the
// content links; capture group 1 (or the whole match) is what gets searched.
define('WEB_LIST_POSTION', '/book_name\.gif(.*?)<td\swidth="15\%"\snowrap>/i');
// end

// Fine-tuning parameters; the defaults normally do not need to be changed.
define('SLEEP_TIME', 1);
define('IS_DEBUG', false);
define('INSERT_DB', true);
// Pause after every message that is printed, in seconds.
define('OUTPUT_SPEED', 1);
// end

// Text to strip from (or replace in) every collected value: search => replacement.
// Matching is case-insensitive. The keys must be written exactly as the text appears
// on the target site (simplified Chinese here).
$text_filter = array(
    '- 中华电脑书库' => '',
    '_电脑电子书' => '',
    '_电脑书籍' => '',
    '下载' => '',
);

// Table mapping: column name => how to obtain its value.
// A value starting with "/" is a regular expression whose capture group 1 is the
// column value; anything else is stored as a constant. Every NOT NULL column of the
// table must be listed here.
$table_mapping = array(
    'size' => '/软件大小.*?000000>(.*?)<\/font>/i',
    'logo' => 'http://www.94cto.com/index/uploads/images/20150105/0b8461910de101cc51a07684cdab797e.jpg',
    'field1' => '/<title>(.*?)<\/title>/i',
    'field2' => '/软件简介.*?000000>(.*?)<\/font>/i',
    'field3' => '1',
    'field4' => '1',
    'field5' => '1',
    'field6' => '电子书,计算机,图像,图形',
    'platform' => 'window/Linux',
    'ishot' => '1',
    'agreement' => '免费',
    'downurl' => '/(\/down\.asp\?id=.*?)"/i',
    'istop' => '1',
);
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// Run the crawl. Gather's constructor performs the environment checks first.
$ga = new Gather();
$ga->run();

/**
 * Configurable web scraper.
 *
 * Walks the list pages WEB_LIST_URL for page numbers PAGE_START..PAGE_COUNT,
 * follows every link matching WEB_CONTENT_URL_REG, extracts the columns described
 * by the global $table_mapping and inserts them into TABLE_NAME.
 *
 * Depends on the configuration constants (DB_*, WEB_*, PAGE_*, TABLE_NAME,
 * SLEEP_TIME, IS_DEBUG, INSERT_DB, OUTPUT_SPEED) and on the globals
 * $table_mapping and $text_filter being defined before run() is called.
 */
class Gather
{
    /** @var PDO|null Connection opened on first use and shared by all inserts. */
    private $dbh = null;

    /** @var PDOStatement|null INSERT statement, prepared once and reused for every record. */
    private $insert_stmt = null;

    /**
     * Runs the environment checks; the script terminates if one of them fails.
     */
    public function __construct()
    {
        $this->init_check();
    }

    /**
     * Crawls every configured list page, then terminates the script.
     *
     * @return void Never returns normally: the final write() call ends the script.
     */
    public function run()
    {
        for ($page = PAGE_START; $page <= PAGE_COUNT; $page++) {
            $this->gather_list_page($page);
        }

        // write() with $end = true prints "program exit" and stops the script.
        $this->write('', true);
    }

    /**
     * Fetches one list page and processes every content link found on it.
     *
     * @param int $page Page number substituted into WEB_LIST_URL.
     * @return void
     */
    private function gather_list_page($page)
    {
        $this->write('开始采集列表第'.$page.'页的内容...');
        $list_content = $this->get(sprintf(WEB_LIST_URL, $page));
        if (empty($list_content)) {
            $this->write('抓取的列表页的内容为空,所以过滤掉');
            return;
        }

        // Collapse the page onto one line so the positioning regex can span it.
        $list_content = str_replace(array("\r", "\n"), '', $list_content);

        // Narrow the page down to the module that holds the content links.
        if (!preg_match(WEB_LIST_POSTION, $list_content, $list_search)) {
            $this->write('精准匹配列表页的内容失败,所以过滤掉');
            return;
        }
        $list_content = isset($list_search[1]) ? $list_search[1] : $list_search[0];

        preg_match_all(WEB_CONTENT_URL_REG, $list_content, $match);
        if (empty($match[0])) {
            $this->write('列表页面没有抓取到内容,所以过滤掉');
            return;
        }

        $this->write('当前的列表页面,总共匹配到:'.count($match[0]).'个内容页');
        foreach ($match[0] as $link) {
            $this->gather_content_page($this->absolute_url($link));
        }
    }

    /**
     * Fetches one content page, extracts all mapped columns and stores the record.
     * The record is skipped entirely when the page is empty or any column fails to match.
     *
     * @param string $url Absolute URL of the content page.
     * @return void
     */
    private function gather_content_page($url)
    {
        global $table_mapping;

        $web_content = $this->get($url);
        if (empty($web_content)) {
            $this->write('抓取的内容页为空,所以过滤掉');
            return;
        }

        // Line breaks become the 【】 placeholder so the single-line regexes of
        // $table_mapping can match across them; extract_value() restores them.
        $web_content = str_replace("\r", '', $web_content);
        $web_content = str_replace("\n", '【】', $web_content);

        if (IS_DEBUG) {
            $this->write('执行SQL '.$this->build_insert_sql());
        }

        // Keyed by placeholder name. The column name must stay intact here: the
        // value is kept in its own variable and never overwrites the name.
        $values = array();
        foreach ($table_mapping as $field => $rule) {
            $value = $this->extract_value($field, $rule, $web_content);
            if ($value === null) {
                return;
            }
            $values[':'.$field] = $value;
        }

        if (INSERT_DB) {
            $this->insert_record($values);
        }

        $this->write('休息,暂停'.SLEEP_TIME.'秒后继续抓取...');
        sleep(SLEEP_TIME);
    }

    /**
     * Builds the INSERT statement with one named placeholder per mapped column.
     *
     * @return string SQL text, e.g. "INSERT INTO t(a, b)VALUES(:a,:b)".
     */
    private function build_insert_sql()
    {
        global $table_mapping;

        $fields = array_keys($table_mapping);

        return 'INSERT INTO '.TABLE_NAME.'('.implode(', ', $fields).')VALUES(:'.implode(',:', $fields).')';
    }

    /**
     * Inserts one record. A failing insert is reported and the crawl continues.
     *
     * @param array $values Map of ":column" placeholder => value.
     * @return void
     */
    private function insert_record(array $values)
    {
        try {
            if ($this->insert_stmt === null) {
                $this->insert_stmt = $this->db()->prepare($this->build_insert_sql());
            }
            $this->insert_stmt->execute($values);
            $this->insert_stmt->closeCursor();
        } catch (PDOException $e) {
            $this->write('Insert failed: '.$e->getMessage());
        }
    }

    /**
     * Produces the value of one column for the current content page.
     *
     * @param string $field       Column name (key of $table_mapping).
     * @param string $rule        A regex starting with "/" whose group 1 is the value,
     *                            or a constant stored as-is.
     * @param string $web_content Page HTML with line breaks replaced by 【】.
     * @return string|null The trimmed value, or null when the regex did not match.
     */
    private function extract_value($field, $rule, $web_content)
    {
        global $text_filter;

        $rule = (string) $rule;
        if (substr($rule, 0, 1) != '/') {
            $value = $rule;
        } else {
            if (!preg_match($rule, $web_content, $found)) {
                $this->write('对不起,匹配字段:'.$field.'失败,过滤此记录');
                return null;
            }
            // Fall back to the whole match when the pattern has no capture group.
            $value = $this->clean_html(isset($found[1]) ? $found[1] : $found[0]);
        }

        // Restore the real line breaks before the value is stored.
        $value = str_replace('【】', "\r\n", $value);

        // Configured text removal / replacement (case-insensitive).
        if (is_array($text_filter)) {
            foreach ($text_filter as $search => $replacement) {
                $value = str_ireplace($search, $replacement, $value);
            }
        }

        if (IS_DEBUG) {
            $this->write('*'."\t".'字段:'.$field.' 值:'."\n****************************************************\n".$value."\n****************************************************");
        }

        // Download links are usually site-relative; store them as absolute URLs.
        if ($field == 'downurl') {
            $value = $this->absolute_url($value);
        }

        return trim($value);
    }

    /**
     * Normalises an extracted HTML fragment: closes dangling tags, drops scripts,
     * unwraps links, makes image sources absolute and tags <pre> blocks.
     *
     * @param string $html Raw fragment captured from the page.
     * @return string Cleaned fragment.
     */
    private function clean_html($html)
    {
        $html = $this->closetags($html);

        // Remove javascript.
        $html = preg_replace('/<script(.*?)>(.*?)<\/script>/i', '', $html);

        // Unwrap links, keeping only their text. \b keeps <abbr>, <area>... untouched.
        $html = preg_replace('/<a\b(.*?)>(.*?)<\/a>/i', '${2}', $html);

        // Rewrite each src attribute in place. A global str_replace of the src text
        // would prefix the host twice when the same image appears more than once.
        $html = preg_replace_callback(
            '/(<img\b[^>]*?\bsrc=)(["\'])(.*?)\2/i',
            array($this, 'absolutize_img_src'),
            $html
        );

        // Mark code blocks for the pretty-printer. Their line breaks come back with
        // every other 【】 placeholder in extract_value().
        $html = preg_replace('/<pre.*?>(.*?)<\/pre>/i', '<pre class="prettyprint">${1}</pre>', $html);

        return $html;
    }

    /**
     * preg_replace_callback handler for clean_html().
     *
     * @param array $m Match: [1] text up to "src=", [2] quote character, [3] URL.
     * @return string The same attribute with an absolute URL.
     */
    private function absolutize_img_src($m)
    {
        return $m[1].$m[2].$this->absolute_url($m[3]).$m[2];
    }

    /**
     * Turns a site-relative URL into an absolute one based on WEB_HOST.
     * URLs that already start with http:// or https:// are returned unchanged.
     *
     * @param string $url URL as found in the page.
     * @return string Absolute URL.
     */
    private function absolute_url($url)
    {
        $url = trim($url);
        if (preg_match('#^https?://#i', $url)) {
            return $url;
        }
        if (substr($url, 0, 1) != '/') {
            $url = '/'.$url;
        }

        return WEB_HOST.$url;
    }

    /**
     * Appends closing tags for elements that were opened but never closed.
     *
     * @param string $html HTML fragment.
     * @return string Fragment with the missing closing tags appended at the end.
     */
    protected function closetags($html)
    {
        // Void elements that never get a closing tag.
        $arr_single_tags = array('meta', 'img', 'br', 'link', 'area');

        // Opening tags (self-closed ones are excluded by the look-behind).
        preg_match_all('#<([a-z]+)(?: .*)?(?<![/|/ ])>#iU', $html, $result);
        $openedtags = $result[1];

        // Closing tags.
        preg_match_all('#</([a-z]+)>#iU', $html, $result);
        $closedtags = $result[1];

        // Same number of opening and closing tags: treat the fragment as balanced.
        $len_opened = count($openedtags);
        if (count($closedtags) == $len_opened) {
            return $html;
        }

        // Walk from the innermost (last opened) tag outwards.
        $openedtags = array_reverse($openedtags);
        for ($i = 0; $i < $len_opened; $i++) {
            if (in_array($openedtags[$i], $arr_single_tags, true)) {
                continue;
            }
            if (!in_array($openedtags[$i], $closedtags, true)) {
                // No closing tag left for this element: append one.
                $html .= '</'.$openedtags[$i].'>';
            } else {
                // Consume one matching closing tag so it is not counted twice.
                unset($closedtags[array_search($openedtags[$i], $closedtags, true)]);
            }
        }

        return $html;
    }

    /**
     * Verifies that cURL is available and the database is reachable.
     * The script terminates with a message when a check fails.
     *
     * @return void
     */
    protected function init_check()
    {
        if (!$this->check_curl_support()) {
            $this->write('对不起,请先开启CURL的类库的支持,否则无法执行', true);
        }
        $this->check_mysql_connect();
        $this->write('程序初始化检查通过,执行后续的流程...');
    }

    /**
     * Downloads a URL and returns its body converted to UTF-8.
     *
     * @param string $url  URL to fetch.
     * @param array  $data Extra HTTP request headers ("Name: value" strings).
     * @return string|false Page body in UTF-8, or false on a transfer or conversion error.
     */
    private function get($url, $data = array())
    {
        $this->write('开始执行抓取: '.$url);
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $data);
        // Without timeouts a single unresponsive server would stall the whole crawl.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        $ret = curl_exec($ch);
        $error = curl_error($ch);
        curl_close($ch);
        unset($ch);

        if ($ret === false || !empty($error)) {
            $this->write('程序抓取URL: '.$url.'发生错误,错误信息: '.$error);
            return false;
        }

        // //IGNORE drops byte sequences that are invalid in the source charset
        // instead of failing the conversion of the whole page.
        if (strcasecmp(WEB_CHARSET, 'utf-8') != 0) {
            $ret = iconv(WEB_CHARSET, 'utf-8//IGNORE', $ret);
        }

        return $ret;
    }

    /**
     * Returns the shared PDO connection, opening it on first use.
     * The script terminates with a message when the connection cannot be opened.
     *
     * @return PDO
     */
    private function db()
    {
        if ($this->dbh === null) {
            $dsn = 'mysql:dbname='.DB_NAME.';host='.DB_HOST.';charset='.DB_CHARSET;
            try {
                $this->dbh = new PDO($dsn, DB_USER, DB_PWD, array(
                    PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
                ));
            } catch (PDOException $e) {
                $this->write('程序无法成功链接到数据库,具体的错误信息:'.$e->getMessage(), true);
            }
        }

        return $this->dbh;
    }

    /**
     * Checks that the database can be reached with the configured credentials.
     * Uses PDO (the mysql_* functions were removed in PHP 7); the connection that
     * is opened here is kept and reused for the inserts.
     *
     * @return void
     */
    private function check_mysql_connect()
    {
        if (!extension_loaded('pdo_mysql')) {
            $this->write('PDO MySQL driver (pdo_mysql) is not loaded', true);
        }
        $this->db();
    }

    /**
     * @return bool True when the cURL extension is loaded and usable.
     */
    private function check_curl_support()
    {
        return extension_loaded('curl') && function_exists('curl_init');
    }

    /**
     * Prints a message followed by a blank line, then pauses OUTPUT_SPEED seconds.
     *
     * @param string $str Message (UTF-8).
     * @param bool   $end When true the script terminates right after printing.
     * @return void
     */
    private function write($str, $end = false)
    {
        if (PATH_SEPARATOR == ':') {
            echo $str, PHP_EOL, PHP_EOL;
        } else {
            // PATH_SEPARATOR is ";" on Windows, where the console is assumed to use GBK.
            echo iconv('UTF-8', 'GBK//IGNORE', $str), PHP_EOL, PHP_EOL;
        }

        if ($end) {
            die("program exit");
        }

        sleep(OUTPUT_SPEED);
    }
}
复制代码 |
|