Chinaunix首页 | 论坛 | 博客
  • 博客访问: 72952
  • 博文数量: 13
  • 博客积分: 1435
  • 博客等级: 上尉
  • 技术积分: 220
  • 用 户 组: 普通用户
  • 注册时间: 2007-10-22 10:41
文章分类
文章存档

2010年(2)

2009年(11)

我的朋友

分类:

2009-08-04 07:14:40

[学习perl 练手用,请勿用于商业用途,尊重别人的劳动成果,访问一下提供资料的网站,最好看一下它的广告]
查询专利数据,为避免服务器过载,代码中只查询了一天的两条记录,需要更多记录的请自行修改参数,够用即可,不要竭泽而渔,把服务器弄趴下
还可以做更多的整理,例如把数据导入到数据库中,通过OCR 识别转换tif文件为文本,使用全文搜索技术...

#!/usr/bin/perl

use strict;
use warnings;
use LWP;
use LWP::UserAgent ;
use LWP::Simple;
use HTML::Tree;
use URI::Escape;
use URI::QueryParam;
use URI;


my $searchDate = '20080101';
my $root = 'sipo';
my $remoteserv ="";
mkdir ($root);
&search_by_date($searchDate);

sub search_by_date
{
    my $file = shift ;
    my $searchword = "\%C9\%EA\%C7\%EB\%C8\%D5\%3D\%28$file%29";
    my $out_file = "$root\/$file\/search_result.html"; # where to save it

    my $locdir = "$root\/$file\/";
    my $browser = LWP::UserAgent->new;
    my $response = $browser->post(
     'hyjs-jieguo.jsp',
     # That's the URL that the real form submits to.

     [
     "recshu" => "2",
     "searchword" => uri_unescape($searchword),
     "flag3" => "1",
     "pg" => "1",
     "sign" => "0",
     "textfield3" => "",
     "textfield9" => "",
     "textfield10" => "",
     "textfield12" => $file,
     "textfield4" => "",
     "textfield5" => "",
     "textfield2" => "",
     "textfield11" => "",
     "textfield6" => "",
     "textfield7" => "",
     "textfield8" => "",
     "gjgb" => "",
     "textfield13" => "",
     "textfield141" => "",
     "textfield142" => "",
     "textfield143" => "",
     ]
    );
    
    die "Error: ", $response->status_line, "\n"
     unless $response->is_success;

    mkdir($locdir);    
    open(OUT, ">$out_file") || die "Can't write-open $out_file: $!";
    binmode(OUT);
    print OUT $response->content;
    close(OUT);

    my $linkurl;
    my $localfile;
    my $uri ;
    my $prevurl="" ;
    my $tree = HTML::TreeBuilder->new;
    $tree->parse_file($out_file); # !

    foreach my $link ( $tree->look_down(_tag=>'a',class=>'a01',sub{$_[0]->attr('href') =~ /recid/}))
    {
        if($link)
        {
        $linkurl = $link->attr('href'); # !

        $linkurl = $remoteserv.$linkurl ;
        $uri = URI->new($linkurl);
        $localfile= "$locdir/".$uri->query_param('recid').".html";
        if($linkurl ne $prevurl)
        {
        print $linkurl ,"\n";
        print $localfile ,"\n";
        getstore($linkurl, $localfile);
        &getif($localfile,$locdir);
        }
        $prevurl = $linkurl ;
        }
    }
    $tree->delete; # clear memory!


}

sub getif
{
    my ($file,$locdir)=@_;
    my $uri ;
    my $name;
    my $value ;
    my $recid;
    my $tifpath;
    my $totalpage;
    my $tifdir='';
    my $tiffile;
    my $savefile;
    my $localfile ;
    my $tree = HTML::TreeBuilder->new;
    $tree->parse_file($file); # !

    foreach my $input ( $tree->look_down(_tag=>'input',type=>'hidden'))
    {
        if($input)
        {
        $name = $input->attr('name'); # !

        $value = $input->attr('value'); # !

        if($name eq 'recid') {$recid = $value ;}
        if($name eq 'tifpath') {$tifpath = $value ;}
        if($name eq 'totalpage') {$totalpage = 0+$value ;}
        }
    }

    $tree->delete; # clear memory!

    $uri = URI->new($tifpath);
    my @path=$uri->path_segments;
    # There will always be an empty first component.

    shift(@path);

    pop(@path);
    foreach my $dir (@path)
    {
     $tifdir .= $dir.'/' ;
    }
    mkdir("$locdir\/$recid");

    for (my $i=1;$i<=$totalpage;$i++)
    {
        $savefile = sprintf("%06d.tif",$i);
        $tiffile =$tifdir.$savefile;
        $localfile ="$locdir$recid\/$savefile";
        $uri->path($tiffile);

        getstore($uri->as_string, $localfile);
        
    }

}

阅读(650) | 评论(0) | 转发(0) |
0

上一篇:perl学习:EPG 之CCTV

下一篇:perl学习: EPG之BTV

给主人留下些什么吧!~~