亚洲av成人无遮挡网站在线观看,少妇性bbb搡bbb爽爽爽,亚洲av日韩精品久久久久久,兔费看少妇性l交大片免费,无码少妇一区二区三区
Chinaunix
標題:
發布一個HTML正文提取程序PHP版 HTMLExtractor
[打印本頁]
作者:
xjtdy888
時間:
2010-12-19 23:18
標題:
發布一個HTML正文提取程序PHP版 HTMLExtractor
本帖最后由 xjtdy888 于 2010-12-19 23:25 編輯
發布一個HTML正文提取程序HTMLExtractor,在線例子
http://dev.psm01.cn/c/html-extractor.php
程序主要是基于內容統計的方法,暫不包含自學習能力,僅是
一個分析程序而已,網上也有別人實現了的正文提取程序,不過
大部分人都當寶,都不愿意公開完整代碼,有些大人實現了一些簡
單的,不過分析能力和識別能力都不太理想。所以自己做了一個
簡單的,本來想用PHP DOM分析器,不過大部份網頁都不規范,
缺個標簽啥的都很正常,所以自己又造了個簡單的輪子分析HTML標
簽,功能比較簡單,每個元素都生成一個對象,內存方面占用比較
高,不過在這里我只是為了實現,并沒去做優化。因為我并不是在
做應用,所以希望不要讓我改成什么樣去適用你們的業務(以前經常
有QQ加上讓我把我的例子怎么改,很無語),
如果你們喜歡,可以和我一起開發完善他。
補充一下,因為寫得著急,現在幾個類的耦合性還比較大,下來再完善吧。
項目代碼
http://code.google.com/p/html-extractor/
QQ 339534039
郵箱 xjtdy888[at]163.com
BLOG
http://hi.baidu.com/phps
<?php
/**
 * Demo page for the HTML main-content extractor.
 *
 * Renders a small form asking for a URL, fetches that page through
 * HTMLExtractor and prints the extracted main text together with the
 * elapsed time and memory usage.
 *
 * Author:  言兌
 * Email:   xjtdy888[at]163.com
 * QQ:      339534039
 * Project: http://code.google.com/p/html-extractor/
 */
error_reporting(E_ALL & ~E_NOTICE & ~E_DEPRECATED);
header("Content-type:text/html; charset=utf-8");

// The URL is untrusted request data: accept it only when it is a plain string
// (a request such as ?url[]=x would otherwise hand us an array).
$url = (isset($_REQUEST['url']) && is_string($_REQUEST['url'])) ? $_REQUEST['url'] : '';
$v = $url ? $url : 'http://news.sina.com.cn/w/2010-11-03/063821404648.shtml';

// Escaped copies for HTML attribute context; the raw value must never be
// echoed into the page, otherwise a crafted URL can inject markup/script.
$safeUrl = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
$safeValue = htmlspecialchars($v, ENT_QUOTES, 'UTF-8');

echo '<title>正文提取</title>';
echo '<h3>PHP 網(wǎng)頁正文提取程序</h3>';
echo '<h3>作者: 言兌</h3>';
echo '<h3>QQ: 339534039</h3>';
echo '<h3> <a href="http://hi.baidu.com/phps" target="_blank">查看博客</a> <a href="http://code.google.com/p/html-extractor/" target="_blank">查看項(xiàng)目代碼</a></h3>';
echo '<form>';
echo '請(qǐng)輸入要提取的URL:<input type="text" name="url" size="50" value="'.$safeValue.'" /><input type="submit" value="分析" />';
echo '</form>';

// Nothing submitted yet: only the form is shown.
if (!$url) {
    exit;
}

// Only http(s) pages can be analysed. Rejecting everything else up front also
// keeps schemes such as "javascript:" out of the "view original" link below.
if (!preg_match('#^https?://#i', $url)) {
    echo "抓取失敗...可能目標(biāo)頁不規(guī)范或者正文太短";
    exit;
}

echo '<b>分析結(jié)果:</b> <a href="'.$safeUrl.'" target="_blank">查看原文</a>:<br /><br />';

// 200 = minimum text length of a candidate node, 1 = how many levels below a
// candidate node are still included in its text.
$text = HTMLExtractor::getUrlMainContent($url, 200, 1);
$text = HTMLExtractor::convertToUTF8($text);
if (!$text) {
    $text = "抓取失敗...可能目標(biāo)頁不規(guī)范或者正文太短";
}
echo ($text);

echo "<br /><br />耗時(shí):" . HTMLExtractor::$usageTime;
if (function_exists('memory_get_usage')) {
    echo "內(nèi)存占用:" . (memory_get_usage(true) / 1024) . 'KB';
}
/**
 * Statistics-based main-content extractor.
 *
 * The input HTML is pre-cleaned with regular expressions, parsed into a tree
 * of htmlTag nodes by htmlParse + htmlExtractorHandler, and every node whose
 * own text is long enough relative to its number of child tags contributes
 * its text to the result.
 */
class HTMLExtractor
{
    // Remove the element together with everything it contains.
    const PC_TAG_DELETE = 1;
    // Remove only the tag markup and keep the inner content.
    const PC_TAG_STRIP = 2;

    /**
     * Tags handled by preClean(), each entry being array(tag name, PC_TAG_* mode).
     */
    static $cleanTags = array(
        array("script", self::PC_TAG_DELETE),
        array("style", self::PC_TAG_DELETE),
        array("link", self::PC_TAG_DELETE),
        array("object", self::PC_TAG_DELETE),
        array("embed", self::PC_TAG_DELETE),
        array("p", self::PC_TAG_STRIP),
        array("b", self::PC_TAG_STRIP),
        array("i", self::PC_TAG_STRIP),
        array("u", self::PC_TAG_STRIP),
        array("font", self::PC_TAG_STRIP),
        array("strong", self::PC_TAG_STRIP),
    );

    /**
     * Seconds spent in the last getDataMainContent() call.
     */
    static $usageTime = 0;

    /**
     * Applies the $cleanTags rules to raw HTML.
     *
     * @param string $html raw HTML
     * @return string HTML with the configured elements deleted or unwrapped
     */
    static function preClean($html)
    {
        foreach (self::$cleanTags as $t) {
            if (!$t) {
                continue;
            }
            $name = $t[0];
            $pc = $t[1];
            $cleaned = preg_replace(
                "#<({$name})(>|\s[^>]*?>)(.*?)</\\1>#is",
                $pc == self::PC_TAG_DELETE ? "" : "\\3",
                $html
            );
            // preg_replace() returns null on a PCRE error (e.g. backtrack limit
            // on a very large page); keep the previous text instead of losing
            // the whole document.
            if ($cleaned !== null) {
                $html = $cleaned;
            }
        }
        return $html;
    }

    /**
     * Extracts the main content from an HTML string and records the elapsed
     * time in self::$usageTime.
     *
     * @param string $data      HTML source
     * @param int    $minlength minimum text length of a candidate node
     * @param int    $maxdepth  levels below a candidate node to include
     * @return string extracted text
     */
    static function getDataMainContent($data, $minlength, $maxdepth)
    {
        $s = microtime(true);
        $data = self::preClean($data);
        $root = new htmlTag("document", htmlTag::DOM_TAG);
        $hand = new htmlExtractorHandler($root);
        $p = new htmlParse($data, $hand);
        $p->parse();
        $text = self::getMainContent($root, $minlength, $maxdepth);
        $e = microtime(true);
        self::$usageTime = $e - $s;
        return $text;
    }

    /**
     * Downloads a page and extracts its main content.
     *
     * @param string $url       page URL
     * @param int    $minlength minimum text length of a candidate node
     * @param int    $maxdepth  levels below a candidate node to include
     * @return string|false extracted text, or false when the page could not be fetched
     */
    static function getUrlMainContent($url, $minlength, $maxdepth)
    {
        $data = self::getUrlHtml($url);
        if (!$data) {
            return false;
        }
        return self::getDataMainContent($data, $minlength, $maxdepth);
    }

    /**
     * Collects the text of a node and of its descendants down to $depth.
     * Each node is emitted at most once; it is flagged through its "echoset"
     * property the first time it is visited.
     *
     * @param object $dom   node exposing depth, echoset and getChildren()
     * @param int    $depth deepest node depth that is still descended into
     * @return string collected text ('' when the node was already emitted)
     */
    static function getDomText($dom, $depth)
    {
        $result = '';
        // empty() tolerates nodes on which the flag has never been set.
        if (!empty($dom->echoset)) {
            return $result;
        }
        $dom->echoset = true;
        if ($dom->depth <= $depth) {
            foreach ($dom->getChildren() as $child) {
                if (is_object($child)) {
                    $result .= self::getDomText($child, $depth);
                } elseif (is_string($child)) {
                    $result .= $child;
                }
            }
        }
        return $result;
    }

    /**
     * Walks the tree and concatenates the text of every node that looks like
     * main content: its own text length reaches $textLength and the ratio
     * (textLength / $textLength) / tagNum exceeds 0.5.
     *
     * @param object $root       node to start from
     * @param int    $textLength minimum own-text length of a candidate node
     * @param int    $maxdepth   levels below a candidate node to include
     * @return string extracted text
     */
    static function getMainContent($root, $textLength = 100, $maxdepth = 1)
    {
        // A non-positive threshold would divide by zero below.
        if ($textLength <= 0) {
            $textLength = 1;
        }
        $result = '';
        $per = $root->tagNum ? $root->textLength / $textLength / $root->tagNum : 1;
        if ($root->textLength >= $textLength && $per > 0.5) {
            $result .= self::getDomText($root, $root->depth + $maxdepth);
        }
        foreach ($root->getChildren() as $dom) {
            if (is_object($dom)) {
                $result .= self::getMainContent($dom, $textLength, $maxdepth);
            }
        }
        return $result;
    }

    /**
     * Issues a plain HTTP GET and reports whether the response headers
     * declare a text/* content type. Only the headers are read.
     *
     * @param string $url page URL
     * @return bool true when a "Content-Type: text/..." header was seen
     */
    static function checkTextType($url)
    {
        $parts = parse_url($url);
        if (!is_array($parts) || empty($parts['host'])) {
            return false;
        }
        $port = empty($parts['port']) ? 80 : $parts['port'];
        // Warnings are suppressed on purpose: an unreachable host is reported
        // through the return value, not through output on the page.
        $fp = @fsockopen($parts['host'], $port, $errno, $errstr, 10);
        if (!$fp) {
            return false;
        }
        stream_set_timeout($fp, 10);

        // The query string is part of the request target; without it dynamic
        // pages are probed at the wrong address.
        $target = empty($parts['path']) ? '/' : $parts['path'];
        if (isset($parts['query']) && $parts['query'] !== '') {
            $target .= '?' . $parts['query'];
        }
        fputs($fp, "GET " . $target . " HTTP/1.1\r\n");
        fputs($fp, "Host: " . $parts['host'] . "\r\n");
        // Ask the server not to keep the connection open; only headers are read.
        fputs($fp, "Connection: close\r\n\r\n");

        $isText = false;
        while (!feof($fp)) {
            $line = fgets($fp);
            // An empty line ends the header section; false means read error/timeout.
            if ($line === false || trim($line) == '') {
                break;
            }
            if (preg_match('#^Content-Type:\s*text/#i', $line)) {
                $isText = true;
                break;
            }
        }
        fclose($fp);
        return $isText;
    }

    /**
     * Converts a string to UTF-8 after guessing its encoding.
     *
     * @param mixed $str text in one of ASCII, UTF-8, GB2312, GBK, BIG5, ISO-8859-1
     * @return mixed the UTF-8 string; non-string or empty input, and input whose
     *               encoding cannot be detected, is returned unchanged
     */
    static function convertToUTF8($str)
    {
        if (!is_string($str) || $str === '') {
            return $str;
        }
        $charset = mb_detect_encoding($str, array('ASCII', 'UTF-8', 'GB2312', 'GBK', 'BIG5', 'ISO-8859-1'));
        // Detection can fail (false); converting from an unknown encoding
        // would raise an error, so leave the text as it is.
        if ($charset === false) {
            return $str;
        }
        if (strcasecmp($charset, 'UTF-8') != 0) {
            $str = mb_convert_encoding($str, 'UTF-8', $charset);
        }
        return $str;
    }

    /**
     * Downloads a page if it is an http(s) URL serving text content.
     *
     * NOTE(review): the URL is caller-supplied; requests to internal hosts
     * are not filtered here.
     *
     * @param string $url page URL
     * @return string|false page source, or false when the URL is rejected or
     *                      the download fails
     */
    static function getUrlHtml($url)
    {
        // Restrict to http(s) so other stream wrappers (file://, php://, ...)
        // can never reach file_get_contents().
        if (!is_string($url) || !preg_match('#^https?://#i', $url)) {
            return false;
        }
        // Report failure to the caller instead of terminating the script.
        if (!self::checkTextType($url)) {
            return false;
        }
        return file_get_contents($url);
    }
}
/**
 * Minimal HTML tag tokenizer.
 *
 * The parser works on "<...>" units: <div id="cc"> is one unit, so
 * <div></div> is two units (<div> and </div>). A callback is fired for every
 * unit and the handler decides what to do with it; the parser itself does not
 * check whether tags are balanced, so "</span></span>" simply produces two
 * endElement events. PHP's DOM classes were not used because most real-world
 * pages are malformed and fail to load there.
 *
 * Callbacks:
 *   startElement($parser, $tagName)  an opening tag was found
 *   endElement($parser, $tagName)    a closing (or self-closing) tag was found
 *   characterData($parser, $char)    one character of text outside any tag
 *   endParse($parser, ...)           the input is exhausted
 *
 * Nothing is optimised: characterData fires once per character.
 */
class htmlParse
{
    /** HTML being parsed. */
    protected $_html = '';
    /** Length of $_html in bytes. */
    protected $_htmlLength = 0;
    /** Current read position inside $_html. */
    protected $_pt = 0;
    /** Stack of parser states (TAG_START / TAGNAME_START / TAGNAME_END). */
    protected $_tagStatus = array();
    /** Names of the opening tags seen so far (only ever appended to). */
    protected $_tagStack = array();
    /** Name of the tag currently being read. */
    protected $_tagName = '';

    /** Inside a tag. */
    const TAG_START = 10;
    /** Tag finished (never pushed by parse(); kept for the state checks). */
    const TAG_END = 20;
    /** Reading the tag name. */
    const TAGNAME_START = 30;
    /** Tag name complete, reading the rest of the tag. */
    const TAGNAME_END = 40;
    /** Comment start (reserved). */
    const COMMENT_START = 50;
    /** Comment end (reserved). */
    const COMMENT_END = 60;

    /** Callback target: an object, an array of callables keyed by event name, or null. */
    public $_elementHandler = null;

    /**
     * @param string            $html          text to parse
     * @param object|array|null $elementHander event handler; an object only has
     *                                         to implement the callback method
     *                                         names (no interface is used), an
     *                                         array maps callback name => callable
     */
    public function __construct($html, $elementHander = null)
    {
        // Declared as __construct: a method named after the class is no longer
        // treated as the constructor by current PHP versions.
        $this->setHtml($html);
        $this->setElementHandler($elementHander);
    }

    /**
     * Replaces the text to parse and rewinds the parser.
     *
     * @param string $html
     */
    public function setHtml($html)
    {
        $this->_html = $html;
        $this->_reset();
    }

    /**
     * Rewinds the read position and refreshes the cached input length.
     */
    public function _reset()
    {
        $this->_pt = 0;
        $this->_htmlLength = strlen($this->_html);
    }

    /**
     * Replaces the event handler.
     *
     * @param object|array|null $elementHander
     */
    public function setElementHandler($elementHander)
    {
        $this->_elementHandler = $elementHander;
    }

    /**
     * Returns the next character and advances the read position.
     *
     * @return string|false the character, or false at the end of the input
     */
    public function nextChar()
    {
        if ($this->_pt < $this->_htmlLength) {
            return $this->_html[$this->_pt++];
        }
        return false;
    }

    /**
     * Moves the read position back by one and returns the character there.
     *
     * @return string|false the character, or false at the start of the input
     */
    public function preChar()
    {
        if ($this->_pt > 0) {
            return $this->_html[--$this->_pt];
        }
        return false;
    }

    /**
     * @return int current read position
     */
    public function getPt()
    {
        return $this->_pt;
    }

    /**
     * Moves the read position.
     *
     * The end-of-input position (== length) is valid: it is exactly where the
     * pointer sits after the last character has been read. Rejecting it made
     * parse() fail to restore the pointer after look-behind at the end of the
     * document and re-read the tail of the final tag as text.
     *
     * @param int $v new position, 0..length
     * @return bool true on success, false when $v is out of range
     */
    public function setPt($v)
    {
        if ($v > -1 && $v <= $this->_htmlLength) {
            $this->_pt = $v;
            return true;
        }
        return false;
    }

    /**
     * Records the current tag name on the tag stack.
     *
     * @return int new size of the tag stack
     */
    public function addTagStack()
    {
        return array_push($this->_tagStack, $this->_tagName);
    }

    /** Default no-op callback, used when no handler provides startElement. */
    public function startElement($parse, $tagName)
    {
    }

    /** Default no-op callback, used when no handler provides endElement. */
    public function endElement($parse, $tagName)
    {
    }

    /** Default no-op callback, used when no handler provides characterData. */
    public function characterData($parse, $char)
    {
    }

    /** Default no-op callback, used when no handler provides endParse. */
    public function endParse($parse)
    {
    }

    /**
     * Dispatches an event. The parser itself is passed as first argument,
     * followed by any extra arguments given after $callback.
     *
     * An array handler is used when it has a non-empty entry for the event;
     * otherwise the method of that name is called on the handler object, or on
     * the parser itself when the handler is not an object.
     *
     * @param string $callback event name
     * @return mixed whatever the callback returns, null when nothing was called
     */
    public function callHandler($callback)
    {
        $argv = func_get_args();
        $argv[0] = $this; // the event name is replaced by the parser instance

        if (is_array($this->_elementHandler) && !empty($this->_elementHandler[$callback])) {
            return call_user_func_array($this->_elementHandler[$callback], $argv);
        }

        $handler = is_object($this->_elementHandler) ? $this->_elementHandler : $this;
        if (method_exists($handler, $callback)) {
            // call_user_method_array() no longer exists in current PHP.
            return call_user_func_array(array($handler, $callback), $argv);
        }
        return null;
    }

    /**
     * Tokenizes the input and fires the callbacks, ending with endParse.
     */
    public function parse()
    {
        while (($char = $this->nextChar()) !== false) {
            switch ($char) {
                case '<':
                    // A '<' inside a tag is ignored.
                    if (!$this->_tagStatus || end($this->_tagStatus) == self::TAG_END) {
                        $pt = $this->getPt();
                        $char1 = $this->nextChar();
                        $char2 = $this->nextChar();
                        $char3 = $this->nextChar();
                        $refor = false;
                        if ($char1 === '!' && $char2 === '-' && $char3 === '-') {
                            // Comment: skip ahead to the first '>' preceded by "--".
                            while (($char1 = $this->nextChar()) !== false) {
                                if ($char1 != '>') {
                                    continue;
                                }
                                $pt2 = $this->getPt();
                                $this->preChar(); // back onto the '>'
                                $char2 = $this->preChar();
                                $char3 = $this->preChar();
                                $this->setPt($pt2);
                                if ($char2 == '-' && $char3 == '-') {
                                    $refor = true;
                                    break;
                                }
                            }
                        }
                        if ($refor) {
                            // Comment consumed; go on with the next character.
                            continue 2;
                        }
                        // Not a (terminated) comment: rewind the look-ahead and
                        // start reading a tag name.
                        $this->setPt($pt);
                        array_push($this->_tagStatus, self::TAG_START);
                        array_push($this->_tagStatus, self::TAGNAME_START);
                        $this->_tagName = '';
                    }
                    break;

                // Any whitespace, not only a space, ends a tag name
                // (attributes are frequently put on their own line).
                case ' ':
                case "\t":
                case "\n":
                case "\r":
                case '>':
                    if (!$this->_tagStatus || end($this->_tagStatus) == self::TAG_END) {
                        // Outside a tag these are ordinary text characters.
                        $this->callHandler('characterData', $char);
                        break;
                    }
                    $callback = '';
                    if (end($this->_tagStatus) == self::TAGNAME_START) {
                        array_pop($this->_tagStatus);
                        array_push($this->_tagStatus, self::TAGNAME_END);
                        // The name may be empty ("< " or "<>"), so check before indexing.
                        if ($this->_tagName !== '' && $this->_tagName[0] == '/') {
                            $this->_tagName = substr($this->_tagName, 1);
                            $callback = 'endElement';
                        } else {
                            $callback = 'startElement';
                        }
                    }
                    // Self-closing forms such as <p/> or <link ... />: a '/'
                    // directly before the '>' turns the unit into an end event.
                    if (in_array(end($this->_tagStatus), array(self::TAGNAME_START, self::TAGNAME_END)) && $char == '>') {
                        $pt = $this->getPt();
                        // $pt - 1 is the '>' just read, $pt - 2 the character before it.
                        $before = $pt >= 2 ? $this->_html[$pt - 2] : false;
                        if ($before === '/') {
                            $callback = 'endElement';
                            array_pop($this->_tagStatus);
                            array_push($this->_tagStatus, self::TAGNAME_END);
                        }
                    }
                    if ($callback == 'startElement') {
                        $this->addTagStack();
                        $this->callHandler($callback, $this->_tagName);
                    } elseif ($callback == 'endElement') {
                        array_pop($this->_tagStatus); // tag name state
                        array_pop($this->_tagStatus); // tag state
                        $this->callHandler($callback, $this->_tagName);
                    }
                    if (end($this->_tagStatus) == self::TAGNAME_END && $char == '>') {
                        // End of an opening tag: leave tag mode.
                        array_pop($this->_tagStatus); // tag name state
                        array_pop($this->_tagStatus); // tag state
                    }
                    break;

                default:
                    if (end($this->_tagStatus) == self::TAGNAME_START) {
                        $this->_tagName .= $char;
                    }
                    if (!$this->_tagStatus || end($this->_tagStatus) == self::TAG_END) {
                        $this->callHandler('characterData', $char);
                    }
                    break;
            }
        }
        // $char is false here; it is forwarded as in the original call.
        $this->callHandler('endParse', $char);
    }
}
/**
 * One element of the parsed document tree.
 *
 * Children are kept in document order and are either htmlTag objects or
 * plain strings (text runs). Counters are updated as children are added.
 */
class htmlTag
{
    /** Tag name as given to the constructor. */
    public $tagName = '';
    /** Node type (DOM_TAG). */
    public $type = '';
    /** Nesting depth, assigned by the code that builds the tree. */
    public $depth = 0;
    /** Parent node, assigned by the code that builds the tree. */
    public $parent = null;
    /** Child nodes and text runs in document order. */
    public $childs = array();
    /** Byte length of the direct text children, whitespace excluded. */
    public $textLength = 0;
    /** Number of direct element children. */
    public $tagNum = 0;
    /** Number of direct text children. */
    public $textNum = 0;
    /**
     * Set by HTMLExtractor::getDomText() once the node's text has been
     * emitted. Declared here so reading it on a fresh node is not an
     * undefined-property access.
     */
    public $echoset = false;

    const DOM_TAG = 1;

    /**
     * @param string $tagName tag name
     * @param int    $type    node type, e.g. self::DOM_TAG
     */
    public function __construct($tagName, $type)
    {
        $this->type = $type;
        $this->tagName = $tagName;
    }

    /**
     * Appends a child and updates the counters.
     *
     * @param object|string $child element node or text run
     */
    public function addChild($child)
    {
        array_push($this->childs, $child);
        if (!is_object($child)) {
            $this->textLength += $this->_strlen($child, true);
            $this->textNum++;
        } else {
            $this->tagNum++;
        }
    }

    /**
     * Byte length of a string, optionally ignoring all whitespace.
     *
     * @param string $text
     * @param bool   $ignoreSpace strip whitespace before measuring
     * @return int length in bytes (multi-byte characters count per byte)
     */
    public function _strlen($text, $ignoreSpace = false)
    {
        if ($ignoreSpace) {
            $text = preg_replace("#\s+#s", "", $text);
        }
        return strlen($text);
    }

    /**
     * @return array children as a freshly indexed list
     */
    public function getChildren()
    {
        return array_values($this->childs);
    }

    /**
     * @return string concatenation of the direct text children only
     */
    public function getText()
    {
        $text = '';
        foreach ($this->childs as $dom) {
            if (is_string($dom)) {
                $text .= $dom;
            }
        }
        return $text;
    }
}
/**
 * htmlParse event handler that builds the htmlTag tree used by the extractor.
 *
 * Characters are buffered and attached to the innermost open element as one
 * text child whenever a tag event (or the end of input) arrives. Tags listed
 * in $ignoreTags create no node and close nothing.
 */
class htmlExtractorHandler
{
    /** Lower-case tag names that are skipped entirely. */
    public $ignoreTags = array(
        "!doctype", "meta", "link", "hr", "!--", "base", "basefont", "br",
        "frame", "frameset", "noframes", "iframe",
        "input", "button", "select", "optgroup", "option",
        "label", "fieldset", "legend", "isindex",
        "img", "map", "area", "style",
        "script", "noscript", "applet", "object", "param", "marquee", "embed");

    /** Stack of open elements; index 0 is the root and is never removed. */
    protected $_dom = array();
    /** Text collected since the last tag event. */
    private $_charBuffer = '';
    /** Depth of the innermost open element. */
    private $_domDepth = 0;

    /**
     * @param object $root root node that receives the top-level children
     */
    public function __construct($root)
    {
        array_push($this->_dom, $root);
    }

    /**
     * @param string $tag tag name in any letter case
     * @return bool true when the tag is in $ignoreTags
     */
    public function isIgnore($tag)
    {
        $tag = strtolower($tag);
        return in_array($tag, $this->ignoreTags, true);
    }

    /**
     * End of input: attach any text still buffered.
     */
    public function endParse()
    {
        $this->updateChacter();
    }

    /**
     * Attaches the buffered text to the innermost open element and empties
     * the buffer.
     */
    public function updateChacter()
    {
        if ($this->_charBuffer != '') {
            end($this->_dom)->addChild($this->_charBuffer);
            $this->_charBuffer = '';
        }
    }

    /**
     * Opening tag: create a node below the innermost open element and make it
     * the new innermost element.
     *
     * @param object $parse   the parser
     * @param string $tagName tag name
     * @return false|null false when the tag is ignored
     */
    public function startElement($parse, $tagName)
    {
        $this->updateChacter();
        $tagName = strtolower($tagName);
        if ($this->isIgnore($tagName) === true) {
            return false;
        }
        $dom = new htmlTag($tagName, htmlTag::DOM_TAG);
        $parent = end($this->_dom);
        $dom->parent = $parent;
        $dom->depth = ++$this->_domDepth;
        $parent->addChild($dom);
        array_push($this->_dom, $dom);
    }

    /**
     * Closing tag: close the nearest open element of that name together with
     * every element opened inside it. A closing tag that matches no open
     * element is ignored.
     *
     * Closing intermediate elements keeps one missing end tag from nesting
     * the remainder of the page below the unclosed element. The root
     * (index 0) is never closed, so a stray closing tag carrying the root's
     * name cannot empty the stack.
     *
     * @param object $parse   the parser
     * @param string $tagName tag name
     * @return false|null false when the tag is ignored
     */
    public function endElement($parse, $tagName)
    {
        $this->updateChacter();
        $tagName = strtolower($tagName);
        if ($this->isIgnore($tagName) === true) {
            return false;
        }
        for ($i = count($this->_dom) - 1; $i > 0; $i--) {
            if ($this->_dom[$i]->tagName == $tagName) {
                $closed = count($this->_dom) - $i;
                array_splice($this->_dom, $i);
                $this->_domDepth -= $closed;
                break;
            }
        }
    }

    /**
     * Text character: buffer it until the next tag event.
     *
     * @param object $parse the parser
     * @param string $char  one character
     */
    public function characterData($parse, $char)
    {
        $this->_charBuffer .= $char;
    }
}
復制代碼
把代碼當附件也放一份上來吧
html-extractor.php.zip
(5.51 KB, 下載次數: 137)
2010-12-19 23:21 上傳
點擊文件名下載附件
歡迎光臨 Chinaunix (http://72891.cn/)
Powered by Discuz! X3.2