striphtml03.pl to HTML.

index -|- end

Generated: Tue Feb 2 17:54:57 2010 from striphtml03.pl 2005/11/30 3.4 KB.

#!/Perl
# AIM: To strip HTML from a file using HTML::Strip
#
# Usage: striphtml03.pl [html_file]
#   html_file  - optional input file; defaults to $definp below.
# Output: the stripped text plus line/word statistics are written to
#   'tempstr.txt' in the current directory; timing and summary messages are
#   also echoed to STDOUT.
#
# NOTE(review): strict is still disabled file-wide. The statements below are
# written to be strict-clean (every variable is declared with 'my') so that it
# can be switched on once the rest of the file has been checked.
###use strict;
use Carp;
use HTML::Strip;
use Time::HiRes qw( usleep ualarm gettimeofday tv_interval nanosleep );

my $t0 = [gettimeofday];

##my $definp = 'C:/HOMEPAGE/P26/browser1.htm';
my $definp    = 'C:/HOMEPAGE/P26/favorites.htm';
my $html_file = shift || $definp;
my $out_file  = 'tempstr.txt';
my $msg       = '';

die "ERROR: Can not locate input file $html_file - $! - " if !-f $html_file;

# Three-argument open with low-precedence 'or': with '||' the die was bound
# to the file name string and could never fire when the open failed.
open my $OF, '>', $out_file
    or die "ERROR: Could not create OUT file! - $! -\n";

$msg = "Processing file $html_file ...\n";
print $OF $msg;

# get the file data
my $raw_html = read_file_f($html_file);

# create 'stripper' ...
my $hs = HTML::Strip->new();

# parse it ...
my $clean_text = $hs->parse($raw_html);
$hs->eof;

# Each clean-up below is a single substitution that reaches the same result
# as repeatedly applying the one-step replacement until nothing matches.

# remove any MULTIPLE space
$clean_text =~ s/ {2,}/ /g;

# remove any LEADING line spaces
$clean_text =~ s/^ +//gm;

# remove any STARTING CR,LF pairs
$clean_text =~ s/\A(?:\r\n)+//;

# remove any DOUBLE CR/LF pairs
# NOTE(review): only CR+LF sequences are collapsed; text that uses bare LF
# line ends keeps its blank lines - confirm that is intended.
$clean_text =~ s/(?:\r\n){2,}/\r\n/g;

##print "$clean_text\n";
print $OF $clean_text;

my $t1      = [gettimeofday];
my $elapsed = tv_interval( $t0, $t1 );
$msg = "$0 processing took $elapsed seconds ...\n";
print $msg;
print $OF $msg;

# Break the text into lines, then into whitespace-separated words.
my @lines = split( "\n", $clean_text );
$msg = "Now split into " . scalar @lines . " lines ...\n";
print $msg;
print $OF $msg;

# split ' ' skips leading whitespace and splits on any whitespace run,
# so no per-line trimming is needed first.
my @words = map { split( ' ', $_ ) } @lines;
$msg = "Now split into " . scalar @words . " words ...\n";
print $msg;
print $OF $msg;

my $t2 = [gettimeofday];
$elapsed = tv_interval( $t1, $t2 );
$msg     = "$0 processing took $elapsed seconds ...\n";
print $msg;
print $OF $msg;

# Count occurrences of each distinct word.
my %HWords = ();
my $newwds = 0;    # number of distinct words seen
my $oldwds = 0;    # number of repeat occurrences
foreach my $word (@words) {
    # Post-increment yields the previous count: false on first sight.
    if ( $HWords{$word}++ ) {
        $oldwds++;
    }
    else {
        ###print $OF "$word\n";
        $newwds++;
    }
}
$msg = "Got $newwds new words, and $oldwds repeats...\n";
print $msg;
print $OF $msg;

# Sorted so the report is reproducible from run to run.
foreach my $word ( sort keys %HWords ) {
    $msg = $word . ' count = ' . $HWords{$word} . "\n";
    print $OF $msg;
}

my $t3 = [gettimeofday];
$elapsed = tv_interval( $t2, $t3 );
$msg     = "$0 output took $elapsed seconds ...\n";
print $msg;
print $OF $msg;
$msg = "See $out_file for the results ...\n";
print $msg;
print $OF $msg;

# Buffered write errors only surface when the handle is closed.
close $OF or warn "WARNING: Could not close $out_file cleanly - $! -\n";

# Runs the output file name as a command.
# NOTE(review): presumably relies on the platform's file association to open
# the result in a viewer - confirm; the return status is not checked.
system $out_file;
# sysopen() mode constants for read_file_f. Without this import the names
# O_RDONLY / O_BINARY are only bareword strings, not numeric open flags.
use Fcntl qw(:DEFAULT);

# read_file_f( $file_name, %args )
#
# Read a whole file with sysopen/sysread.
#
# Arguments:
#   $file_name       - path of the file to read.
#   $args{buf_ref}   - optional reference to a scalar; data is appended to it
#                      instead of to an internal buffer.
#   $args{binmode}   - if true, O_BINARY is added to the open mode on
#                      platforms that define it.
#
# Returns:
#   void context   - nothing (data is only available through buf_ref).
#   list context   - the data split after each input record separator ($/),
#                    separators kept; one element holding everything when $/
#                    is undefined or empty.
#   scalar context - the data as a single string.
#   If the file cannot be opened a warning is carped and nothing (undef /
#   empty list) is returned. A read error is carped and whatever was read so
#   far is returned.
sub read_file_f {
    my ( $file_name, %args ) = @_;

    my $buf;
    my $buf_ref = $args{'buf_ref'} || \$buf;

    # Start from a defined buffer so an empty file yields '' and the
    # sysread offset below is always a number.
    ${$buf_ref} = '' unless defined ${$buf_ref};

    my $mode = O_RDONLY;
    if ( $args{'binmode'} ) {
        # Fcntl croaks for O_BINARY where the platform does not define it,
        # so probe inside eval and fall back to "no extra flag".
        local $@;
        my $o_binary = eval { O_BINARY() };
        $mode |= $o_binary if defined $o_binary;
    }

    my $fh;
    unless ( sysopen( $fh, $file_name, $mode ) ) {
        carp "Can't open $file_name: $!";
        return;
    }

    my $size_left = ( -s $fh ) || 0;
    while ( $size_left > 0 ) {
        my $read_cnt = sysread( $fh, ${$buf_ref},
            $size_left, length ${$buf_ref} );

        # undef means a real error; 0 is a clean end of file.
        unless ( defined $read_cnt ) {
            carp "read error in file $file_name: $!";
            last;
        }

        # Fewer bytes were available than -s reported: stop quietly.
        last if $read_cnt == 0;

        $size_left -= $read_cnt;
    }
    close $fh;

    # handle void context (return scalar by buffer reference)
    return unless defined wantarray;

    # handle list context
    if (wantarray) {
        return ( ${$buf_ref} ) unless defined $/ && length $/;

        # Split after every separator; quotemeta keeps regex
        # metacharacters in $/ literal.
        my $sep = quotemeta $/;
        return split /(?<=$sep)/, ${$buf_ref};
    }

    # handle scalar context
    return ${$buf_ref};
}
# eof - striphtml03.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional