striphtml03.pl to HTML.

Generated: Mon Aug 29 19:35:02 2016 from striphtml03.pl 2014/08/08 5.1 KB. text copy
#!/user/bin/perl
# AIM: To strip HTML from a file using HTML::Strip
# 08/08/2014 - Small update - Due to CSPAN failed to compile due to missing strcasecmp in Windows
# had to download source - HTML-Strip-1.06.tar.gz - unpacked and did 'fix' and zipped to
# HTML-Strip-1.06.zip
# SRC: F:\Projects\HTML-Strip-1.06 - upshtml-strip.bat - hhtml-strip.bat
# See 'setup.txt' for the compile, test and install of this module, then use HTML::Strip; compile
# Also had to reduce HiRes to just those used
# and it ran...
###use strict;
use Carp;
use HTML::Strip;
use Time::HiRes qw( gettimeofday tv_interval );
### use Time::HiRes qw( usleep ualarm gettimeofday tv_interval nanosleep );

my ($t0, $t1, $elapsed);
$t0 = [gettimeofday];

##my $definp = 'C:/HOMEPAGE/P26/browser1.htm';
my $definp = 'C:/HOMEPAGE/P26/favorites.htm';
my $html_file = shift || $definp;
my $out_file = 'tempstr.txt';
my ($OF);
my $msg = '';

sub trim_leading($) {
    my ($ln) = shift;
   $ln = substr($ln,1) while ($ln =~ /^\s/); # remove all LEADING space
    return $ln;
}

sub trim_tailing($) {
    my ($ln) = shift;
   $ln = substr($ln,0, length($ln) - 1) while ($ln =~ /\s$/g); # remove all TRAILING space
    return $ln;
}

sub trim_ends($) {
    my ($ln) = shift;
    $ln = trim_tailing($ln); # remove all TRAINING space
   $ln = trim_leading($ln); # remove all LEADING space
    return $ln;
}

sub trim_all {
   my ($ln) = shift;
   $ln =~ s/\n/ /gm;   # replace CR (\n)
   $ln =~ s/\r/ /gm;   # replace LF (\r)
   $ln =~ s/\t/ /g;   # TAB(s) to a SPACE
    $ln = trim_ends($ln);
   $ln =~ s/\s{2}/ /g while ($ln =~ /\s{2}/);   # all double space to SINGLE
   return $ln;
}

die "ERROR: Can not locate input file $html_file - $! - " if ! -f $html_file;

open $OF, ">$out_file" || die "ERROR: Could not create OUT file! - $! -\n";

$msg = "Processing file $html_file ...\n";
print $OF $msg;

# get the file data
my $raw_html = read_file_f( $html_file );
# create 'stripper' ...
my $hs = HTML::Strip->new();
# parse it ...
my $clean_text = $hs->parse( $raw_html );
$hs->eof;
# remove any MULTIPLE space
while( $clean_text =~ /  /gm ) {
   $clean_text =~ s/  / /gm;
}
# remove any LEADING line spaces
while( $clean_text =~ /^ /gm ) {
   $clean_text =~ s/^ //gm;
}
# remove any STARTING CR,LF pairs
while( $clean_text =~ /^\r\n/ ) {
   $clean_text =~ s/^\r\n//;
}
# remove any DOUBLE CR/LF pairs
# this does NOT seem to do the job????
while( $clean_text =~ /\r\n\r\n/gm ) {
   $clean_text =~ s/(\r\n\r\n)/\r\n/gm;
}
my @lines = split("\n", $clean_text);
$msg = "Now split into " . scalar @lines . " lines ...\n";
print $msg;
my ($line,$len);
my @nlines = ();
foreach $line (@lines) {
    chomp $line;
    $line = trim_all($line);
    $len = length($line);
    next if ($len == 0);
    push(@nlines,$line);
}
### $clean_text = join("\n",@nlines);
##print "$clean_text\n";
print $OF $clean_text;
$t1 = [gettimeofday];
$elapsed = tv_interval ( $t0, $t1 );
$msg = "$0 processing took $elapsed seconds ...\n";
print $msg;
print $OF $msg;
# @lines = @nlines;    # split("\n", $clean_text);
@lines = split("\n", $clean_text);
$msg = "Now split into " . scalar @lines . " lines ...\n";
print $msg;
print $OF $msg;
my @words = ();
my @la;
foreach $line (@lines) {
   chomp $line;
   @la = split(' ', $line);
   push(@words, @la);
}
$msg = "Now split into " . scalar @words . " words ...\n";
print $msg;
print $OF $msg;
$t2 = [gettimeofday];
$elapsed = tv_interval ( $t1, $t2 );
$msg = "$0 processing took $elapsed seconds ...\n";
print $msg;
print $OF $msg;
my $word;
my %HWords = ();
my $newwds = 0;
my $oldwds = 0;
foreach $word (@words) {
   if( exists $HWords{$word} ) {
      $HWords{$word}++;
      $oldwds++;
   } else {
        $HWords{$word} = 1; # start count
      ###print $OF "$word\n";
      $newwds++;
    }
}
$msg = "Got $newwds new words, and $oldwds repeats...\n";
print $msg;
print $OF $msg;

foreach $word (keys %HWords) {
   $msg = $word . ' count = ' . $HWords{$word} . "\n";
   print $OF $msg;
}

$t3 = [gettimeofday];
$elapsed = tv_interval ( $t2, $t3 );
$msg = "$0 output took $elapsed seconds ...\n";
print $msg;
print $OF $msg;
$msg = "See $out_file for the results ...\n";
print $msg;
print $OF $msg;
close $OF;
system $out_file;


sub read_file_f {
    my( $file_name, %args ) = @_ ;
    my $buf ;
    my $buf_ref = $args{'buf_ref'} || \$buf ;
    my $mode = O_RDONLY ;
    $mode |= O_BINARY if $args{'binmode'} ;
    local( *FH ) ;
    sysopen( FH, $file_name, $mode ) or
        carp "Can't open $file_name: $!" ;

    my $size_left = -s FH ;
    while( $size_left > 0 ) {
        my $read_cnt = sysread( FH, ${$buf_ref},
            $size_left, length ${$buf_ref} ) ;

        unless( $read_cnt ) {
            carp "read error in file $file_name: $!" ;
            last ;
        }

      $size_left -= $read_cnt ;
    }
# handle void context (return scalar by buffer reference)
    return unless defined wantarray ;
# handle list context
    return split m|?<$/|g, ${$buf_ref} if wantarray ;
# handle scalar context
    return ${$buf_ref} ;
}

# eof - striphtml03.pl
index -|- top