- 論壇徽章:
- 0
|
本帖最后由 sx98083714 于 2010-11-19 13:21 編輯
#!/usr/bin/perl
use strict;
use warnings;
use URI;
use Web::Scraper;
use utf8;
use LWP;
use Encode;
use HTML::TokeParser;
use HTTP::Cookies;

# Encode everything printed to STDOUT as UTF-8.
binmode(STDOUT,":encoding(utf8)");

# Shop listing page whose product links are crawled.
my $url = "http://meilibody.taobao.com/?search=y";

# Scraper for the product list: every element matching ".permalink"
# contributes one { url, txt } record (its href attribute and its text).
my $scraper = scraper {
    process ".permalink", "links[]" => { "url" => '@href', "txt" => 'TEXT' };
};
my $result = $scraper->scrape( URI->new($url) );

# Visit each product link and print the description markup found on its page.
for my $row ( @{ $result->{links} || [] } ) {

    my $purl = $row->{"url"};

    # A matched element without an href yields no URL; nothing to fetch.
    next unless defined $purl;

    my $pcontent = getcontent($purl);

    print "content:", $pcontent, "\n";

    # Pause between product requests.
    sleep 10;
}

# Fetch one product page and return the HTML of its description block.
#
# Parameters:
#   $url - address of the product page (plain string or URI object).
#
# Returns:
#   The markup starting at the first <div class="content"> start tag and
#   ending just before the next <script> start tag (or at end of document).
#   Returns an empty string when the request fails or no such <div> exists.
#
# NOTE: only the HTML delivered by the server is parsed; no JavaScript is
# executed.  If the site fills the description in from a script, only the
# placeholder markup present in the static page is returned.
sub getcontent {
    my ($url) = @_;

    my $browser = LWP::UserAgent->new;
    $browser->cookie_jar( {} );    # in-memory cookie jar for this request
    $browser->timeout(500);

    # Send browser-like headers along with the request.
    my $response = $browser->get(
        $url,
        'User-Agent'      => 'Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.0.12) Gecko/2009072711 CentOS/3.0.12-1.el5.centos Firefox/3.0.12',
        'Accept'          => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language' => 'zh-cn,zh;q=0.5',
        'Accept-Charset'  => 'gb2312,utf-8;q=0.7,*;q=0.7',
        'Referer'         => 'http://meilibody.taobao.com/?search=y',
    );

    # Do not parse an error page as if it were the product page.
    unless ( $response->is_success ) {
        warn "getcontent: GET $url failed: ", $response->status_line, "\n";
        return "";
    }

    # The page bytes are decoded as GBK into Perl characters.
    my $content = decode( 'gbk', $response->content );

    return _extract_description($content);
}

# Return the markup between the first <div class="content"> start tag and the
# following <script> start tag of an already-decoded HTML document.
sub _extract_description {
    my ($html) = @_;

    my $stream = HTML::TokeParser->new( \$html );

    # Index of the original source text inside each HTML::TokeParser token,
    # keyed by token type: start tag, end tag, text, comment, declaration.
    # Other token types (processing instructions) are not copied.
    my %source_index = ( S => 4, E => 2, T => 1, C => 1, D => 1 );

    my $inside      = 0;     # true once the description <div> has been seen
    my $description = "";

    while ( my $token = $stream->get_token ) {
        my $type = $token->[0];

        if ( !$inside ) {

            # Still searching for the opening <div class="content">.
            next unless $type eq 'S' and $token->[1] eq 'div';
            my $class = $token->[2]{class};
            next unless defined $class and $class eq "content";
            $inside = 1;
        }
        elsif ( $type eq 'S' and $token->[1] eq 'script' ) {

            # The first <script> after the description marks its end.
            last;
        }

        my $index = $source_index{$type};
        $description .= $token->[$index] if defined $index;
    }

    return $description;
}
復制代碼 匹配應該是沒有問題,但是抓取到的產品簡介是“描述加載中....”,看來產品簡介似乎是通過js來得到的,過往神仙如何解決這一問題?謝謝! |
|