# Generated: Tue Feb 2 17:54:57 2010 from striphtml03.pl 2005/11/30 3.4 KB.
#!/Perl
# AIM: To strip HTML from a file using HTML::Strip
#
# Usage: perl striphtml03.pl [input.htm]
#
# Reads the input HTML file (or the built-in default), strips the markup,
# tidies up spaces and CR/LF pairs, then writes the cleaned text plus line,
# word and per-word frequency statistics to 'tempstr.txt'. Timing and
# summary messages are also echoed to STDOUT.
use strict;
use warnings;
use Carp;
use Fcntl;    # O_RDONLY, and O_BINARY on platforms that define it
use HTML::Strip;
use Time::HiRes qw( gettimeofday tv_interval );

my $t0 = [gettimeofday];

##my $definp = 'C:/HOMEPAGE/P26/browser1.htm';
my $definp    = 'C:/HOMEPAGE/P26/favorites.htm';
my $html_file = shift || $definp;
my $out_file  = 'tempstr.txt';

die "ERROR: Can not locate input file $html_file - $! - " if !-f $html_file;

# Three-argument open with low-precedence 'or': the previous
# `open $OF, ">$out_file" || die ...` bound '||' to the string, so a failed
# open was never reported.
open my $OF, '>', $out_file
    or die "ERROR: Could not create OUT file! - $! -\n";

print {$OF} "Processing file $html_file ...\n";

# get the file data
my $raw_html = read_file_f($html_file);

# create 'stripper' ...
my $hs = HTML::Strip->new();

# parse it ...
my $clean_text = $hs->parse($raw_html);
$hs->eof;

# Each substitution below handles a whole run in a single pass, so no
# re-scan loop is needed.
# remove any MULTIPLE space
$clean_text =~ s/ {2,}/ /g;

# remove any LEADING line spaces
$clean_text =~ s/^ +//gm;

# remove any STARTING CR,LF pairs
$clean_text =~ s/\A(?:\r\n)+//;

# remove any DOUBLE CR/LF pairs
$clean_text =~ s/(?:\r\n){2,}/\r\n/g;

##print "$clean_text\n";
print {$OF} $clean_text;

my $t1 = [gettimeofday];
report( $OF, "$0 processing took " . tv_interval( $t0, $t1 ) . " seconds ...\n" );

my @lines = split /\n/, $clean_text;
report( $OF, "Now split into " . scalar(@lines) . " lines ...\n" );

# split ' ' discards leading/trailing whitespace, including any stray CR.
my @words = map { split ' ', $_ } @lines;
report( $OF, "Now split into " . scalar(@words) . " words ...\n" );

my $t2 = [gettimeofday];
report( $OF, "$0 processing took " . tv_interval( $t1, $t2 ) . " seconds ...\n" );

# Count occurrences: the first sighting of a word is "new", later ones are
# "repeats".
my %count_of;
my $newwds = 0;
my $oldwds = 0;
for my $word (@words) {
    if ( $count_of{$word}++ ) {
        $oldwds++;
    }
    else {
        $newwds++;
    }
}
report( $OF, "Got $newwds new words, and $oldwds repeats...\n" );

# Sorted so the report is stable from run to run.
for my $word ( sort keys %count_of ) {
    print {$OF} $word . ' count = ' . $count_of{$word} . "\n";
}

my $t3 = [gettimeofday];
report( $OF, "$0 output took " . tv_interval( $t2, $t3 ) . " seconds ...\n" );
report( $OF, "See $out_file for the results ...\n" );

# Buffered write errors only surface at close, so check it.
close $OF or die "ERROR: Could not close OUT file $out_file - $! -\n";

# On Windows, "running" the text file opens it in the associated viewer;
# elsewhere this would just be a failed attempt to execute a command.
system $out_file if $^O eq 'MSWin32';

# report( $fh, $msg )
# Print $msg both to STDOUT and to the already-open output handle $fh.
sub report {
    my ( $fh, $msg ) = @_;
    print $msg;
    print {$fh} $msg;
    return;
}

# read_file_f( $file_name, %args )
#
# Slurp a whole file using sysopen/sysread.
#
# Arguments:
#   $file_name      - path of the file to read.
#   $args{buf_ref}  - optional reference to a scalar; file data is appended
#                     to it instead of to an internal buffer.
#   $args{binmode}  - if true, O_BINARY is added to the open mode on
#                     platforms that define it.
#
# Returns:
#   void context   - nothing (the data is only available via buf_ref).
#   list context   - the data split into records, each keeping its trailing
#                    input record separator ($/); the whole buffer as one
#                    element when $/ is undef.
#   scalar context - the whole buffer as one string ('' for an empty file).
#
# Croaks if the file cannot be opened; carps and returns what was read so
# far if a read fails part-way.
sub read_file_f {
    my ( $file_name, %args ) = @_;

    my $buf     = '';
    my $buf_ref = $args{'buf_ref'} || \$buf;
    ${$buf_ref} = '' unless defined ${$buf_ref};

    my $mode = O_RDONLY;
    if ( $args{'binmode'} ) {

        # Fcntl croaks when O_BINARY is called on a platform that does not
        # define it, so fall back to 0 there.
        local $@;
        my $o_binary = eval { O_BINARY() } || 0;
        $mode |= $o_binary;
    }

    sysopen( my $fh, $file_name, $mode )
        or croak "Can't open $file_name: $!";

    my $size_left = -s $fh;
    $size_left = 0 unless defined $size_left;

    while ( $size_left > 0 ) {
        my $read_cnt =
            sysread( $fh, ${$buf_ref}, $size_left, length ${$buf_ref} );

        # undef means a real error; 0 simply means EOF came early.
        if ( !defined $read_cnt ) {
            carp "read error in file $file_name: $!";
            last;
        }
        last if $read_cnt == 0;

        $size_left -= $read_cnt;
    }
    close $fh;

    # handle void context (return scalar by buffer reference)
    return unless defined wantarray;

    # handle list context
    if (wantarray) {
        my $sep = $/;
        return ${$buf_ref} unless defined $sep;

        # '' is paragraph mode: records end at a run of blank lines.
        my $sep_re = $sep eq '' ? qr/\n{2,}/ : qr/\Q$sep\E/;
        return ${$buf_ref} =~ /(.*?$sep_re|.+)/sg;
    }

    # handle scalar context
    return ${$buf_ref};
}

# eof - striphtml03.pl