Chinaunix首页 | 论坛 | 博客
  • 博客访问: 72954
  • 博文数量: 13
  • 博客积分: 1435
  • 博客等级: 上尉
  • 技术积分: 220
  • 用 户 组: 普通用户
  • 注册时间: 2007-10-22 10:41
文章分类
文章存档

2010年(2)

2009年(11)

我的朋友

分类:

2009-08-04 07:18:04

[学习 Perl 练手用,请勿用于商业用途。请尊重别人的劳动成果,访问一下提供资料的网站,最好看一下它的广告。]
取得电子节目单

#!/usr/bin/perl

use strict;
use warnings;
use LWP::Simple;
use HTML::Tree;
use URI;

# Stage 1: download every page linked from the catalogue page.
#
# Each <a target="nr"> link found on the catalogue page is fetched and saved
# as "$locdir/<n>.html", where <n> is the 1-based position of the link.

# Configuration -- fill these in before running.
my $caturl    = '';       # URL of the catalogue (index) page
my $remotedir = '';       # not used by the code in this file
my $root      = '';       # local directory that receives the downloaded pages
my $locdir    = $root;    # directory read back by the merge stage below

# LWP::Simple::get returns undef on any failure; without the catalogue page
# there is nothing to download, so stop with a clear message.
my $html = get($caturl);
defined $html
    or die "Cannot fetch catalogue page '$caturl'\n";

my $tree = HTML::TreeBuilder->new;
$tree->parse_content($html);

# mkdir fails when the directory already exists, so only create it when it
# is missing; this keeps re-runs working.
-d $locdir
    or mkdir $locdir
    or die "Cannot create directory '$locdir': $!\n";

my $i = 1;
foreach my $link ($tree->look_down(_tag => 'a', 'target' => 'nr'))
{
    my $href = $link->attr('href');
    if (defined $href && length $href)
    {
        # Links may be relative; resolve them against the catalogue URL so
        # that getstore always receives an absolute URL.
        my $url       = URI->new_abs($href, $caturl)->as_string;
        my $localfile = $locdir . '/' . $i . '.html';

        my $status = getstore($url, $localfile);
        warn "Download of '$url' failed with status $status\n"
            unless is_success($status);

        # The counter advances even after a failed download so that file
        # numbers keep matching link positions.
        $i++;
    }
}
$tree->delete;    # HTML::Element trees must be deleted explicitly to free memory


# Stage 2: pass every downloaded page to process_file, in page-number order.
#
# The directory is read with opendir/readdir instead of an "ls | grep | sort"
# shell pipeline, so $locdir is never interpreted by a shell and a failure to
# read the directory is reported instead of being silently ignored.

opendir my $dir_handle, $locdir
    or die "Cannot read directory '$locdir': $!\n";

# Keep names ending in "html"; skip dot-files, which "ls" would not list.
my @html_files = grep { !/\A\./ && /html\z/ } readdir $dir_handle;
closedir $dir_handle;

# Order by the leading page number (names without one count as 0), falling
# back to a string comparison so the order is deterministic.
my @ordered_files =
    map  { $_->[1] }
    sort { $a->[0] <=> $b->[0] or $a->[1] cmp $b->[1] }
    map  { [ (/\A(\d+)/ ? $1 : 0), $_ ] }
    @html_files;

foreach my $filename (@ordered_files)
{
    print $filename, "...\r\n";
    process_file($filename);
}

# Print the text of every <p> element found inside the <tr> rows whose id
# attribute starts with "imgdiv" in one downloaded page.
#
# Parameters:
#   $infile - name of the HTML file, relative to the file-level $locdir.
#
# Output: one line per paragraph on STDOUT, terminated with "\r\n".
# If the file cannot be opened, a warning is emitted and nothing is printed.
sub process_file
{
    my ($infile) = @_;
    my $path = "$locdir/$infile";

    my $tree = HTML::TreeBuilder->new;

    # parse_file returns undef (and sets $!) when the file cannot be opened.
    if (!defined $tree->parse_file($path))
    {
        warn "Cannot parse '$path': $!\n";
        $tree->delete;
        return;
    }

    # A qr// criterion only matches elements whose attribute is defined, so
    # rows without an id no longer trigger "uninitialized value" warnings.
    foreach my $row ($tree->look_down(_tag => 'tr', id => qr/^imgdiv/))
    {
        foreach my $para ($row->look_down(_tag => 'p'))
        {
            my $text = $para->as_text;
            chomp $text;
            print $text, "\r\n";
        }
    }

    $tree->delete;    # HTML::Element trees must be deleted explicitly to free memory
    return;
}


阅读(542) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~