分类:
2007-02-21 15:11:55
#! /usr/bin/perl -w # Example perl file - extract H1,H2 or H3 headers from HTML files # Run via: # perl this-perl-script.pl [-o outputfile] input-file(s) # E.g. # perl proto-getH1.pl -o headers *.html # perl proto-getH1.pl -o output.txt homepage.htm # # Russell Quong 2/19/98 require 5.003; # need this version of Perl or newer use English; # use English names, not cryptic ones use FileHandle; # use FileHandles instead of open(),close() use Carp; # get standard error / warning messages use strict; # force disciplined use of variables ## define some variables. my($author) = "Russell W. Quong"; my($version) = "Version 1.0"; my($reldate) = "Jan 1998"; my($lineno) = 0; # variable, current line number my($OUT) = \*STDOUT; # default output file stream, stdout my(@headerArr) = (); # array of HTML headers # print out a non-crucial for-your-information messages. # By making fyi() a function, we enable/disable debugging messages easily. sub fyi ($) { my($str) = @_; print "$str\n"; } sub main () { fyi("perl script = $PROGRAM_NAME, $version, $author, $reldate."); handle_flags(); # handle remaining command line args, namely the input files if (@ARGV == 0) { # @ARGV used in scalar context = number of handle_file('-'); } else { my($i); foreach $i (@ARGV) { handle_file($i); } } postProcess(); # additional processing after reading input } # handle all the arguments, in the @ARGV array. # we assume flags begin with a '-' (dash or minus sign). # sub handle_flags () { my($a, $oname) = (undef, undef); foreach $a (@ARGV) { if ($a =~ /^-o/) { shift @ARGV; # discard ARGV[0] = the -o flag $oname = $ARGV[0]; # get arg after -o shift @ARGV; # discard ARGV[0] = output file name $OUT = new FileHandle "> $oname"; if (! defined($OUT) ) { croak "Unable to open output file: $oname. Bye-bye."; exit(1); } } else { last; # break out of this loop } } } # handle_file (FILENAME); # open a file handle or input stream for the file named FILENAME. # if FILENAME == '-' use stdin instead. sub handle_file ($) { my($infile) = @_; fyi(" handle_file($infile)"); if ($infile eq "-") { read_file(\*STDIN, "[stdin]"); # \*STDIN=input stream for STDIN. } else { my($IN) = new FileHandle "$infile"; if (! defined($IN)) { fyi("Can't open spec file $infile: $!\n"); return; } read_file($IN, "$infile"); # $IN = file handle for $infile $IN->close(); # done, close the file. } } # read_file (INPUT_STREAM, filename); # sub read_file ($$) { my($IN, $filename) = @_; my($line, $from) = ("", ""); $lineno = 0; # reset line number for this file while ( defined($line = <$IN>) ) { $lineno++; chomp($line); # strip off trailing '\n' (newline) do_line($line, $lineno, $filename); } } # do_line(line of text data, line number, filename); # process a line of text. sub do_line ($$$) { my($line, $lineno, $filename) = @_; my($heading, $htype) = undef; # search for a # where Hx = H1, H2 or H3. if ( $line =~ m:( $htype = $1; # either H1, H2, or H3 $heading = $2; # text matched in the parethesis in the regex fyi("FYI: $filename, $lineno: Found ($heading)"); print $OUT "$filename, $lineno: $heading\n"; # we'll also save the all the headers in an array, headerArr push(@headerArr, "$heading ($filename, $lineno)"); } } # print out headers sorted alphabetically # sub postProcess() { my(@sorted) = sort { $a cmp $b } @headerArr; # example using sort print $OUT "\n--- SORTED HEADERS ---\n"; my($h); foreach $h (@sorted) { print $OUT "$h\n"; } my $now = localtime(); print $OUT "\nGenerated $now.\n" } # start executing at main() # main(); 0; # return 0 (no error from this script) |