#!/usr/local/bin/perl # File: mangler.cgi # NOTE: To run this script you'll need the LWP and CGI.pm Perl modules, # as well as perl 5.003 or higher. You can get them at: # http://www.perl.com/CPAN/ use LWP::UserAgent; use HTML::Parse; use HTTP::Status; use CGI qw(:standard :html3); $ICON = "pow.gif"; srand(); # Uncomment to remove irritating warning messages from LWP # open (STDERR,">/dev/null"); $url_to_mangle = param('mangle') if request_method() eq 'POST'; print header(); if ($url_to_mangle && mangle($url_to_mangle)) { ; # nothing to do } else { prompt_for_url(); } # --------------------------------------------------- # THIS SECTION IS WHERE URLs ARE FETCHED AND MANGLED # --------------------------------------------------- sub mangle { my $url = shift; my $agent = new LWP::UserAgent; my $request = new HTTP::Request('GET',$url); my $response = $agent->request($request); unless ($response->is_success) { print h1('Error Fetching URL'), "An error occurred while fetching the document located at ", a({href=>$url},"$url."), p(), "The error was ",strong($r->status_line),".", hr(); return undef; } # make sure that it's an HTML document! my $type = $response->header('Content-type'); unless ($type eq 'text/html') { print h1("Document isn't an HTML File!"), "The URL ",a({href=>$url},"$url"), " is a document of type ",em($type),". ", "Please choose an HTML file to mangle.", hr(); return undef; } my $parse_tree = parse_html($response->content); local($body_tag); $parse_tree->traverse( sub { my($node,$start,$depth) = @_; return 0 if ref($body_tag); return 1 unless $start; if ($node->tag() eq 'body') { $body_tag = $node; return 0; } else { return 1; } },1 ); my %attrs; grep(/^_/ || ($attrs{$_}=$body_tag->attr($_)),keys %{$body_tag}) if ref($body_tag); print start_html(-title=>'Mangled Document', -xbase=>$url, %attrs), div({-align=>CENTER}, h1("The Mangler"), strong(a({-href=>$url},$url)) ), p(), a({-href=>self_url()},"Mangle another page"),hr(); $parse_tree->traverse(\&swallow); $parse_tree->traverse(\®urgitate); $parse_tree->delete(); 1; } sub swallow { my ($node,$start,$depth) = @_; return 1 if ref($node); return &Travesty::swallow($node); } sub regurgitate { my ($node,$start,$depth) = @_; if (ref($node)) { return 1 if $node->tag =~ /^(html|head|body)/i; return 0 if $node->is_inside('head'); &Travesty::reset() if $node->tag =~ /^(h\d+|p|blockquote|ol|ul|dl|li|dt|dd|table|pre)$/i; if ($start) { print $node->starttag; } else { print $node->endtag; } } else { my @words = split(/\s+/,$node); print &Travesty::regurgitate(scalar(@words)); } 1; } # --------------------------------------------------- # THIS SECTION IS WHERE THE PROMPT IS CREATED # --------------------------------------------------- sub prompt_for_url { print start_html(-title=>'The Mangler', -bgcolor=>'#FFFFFF', ), -e $ICON ? img({-src=>$ICON,-align=>LEFT}): '', h1('The Mangler'), "Enter the URL of an HTML page and press ",em("Mangle. "), "For best results, choose a document that contains several pages of text. ", "Very large documents may take a long time to process, so have patience.", start_form(), textfield(-name=>'mangle',-size=>60), submit(-value=>'Mangle'), end_form(), hr(), a({-href=>'mangler.txt'},'View the source code for this script.'), p(), address( "Author: ", a({-href=>'http://formaggio.cshl.org/~lstein/',-target=>"_top"},'Lincoln D. Stein'), ), end_html(); } # --------------- modifications of the travesty code from Perl's eg/ directory ------ package Travesty; sub swallow { my $string = shift; $string =~ tr/\n/ /s; push(@ary,split(/\s+/,$string)); while ($#ary > 1) { $a = $p; $p = $n; $w = shift(@ary); $n = $num{$w}; if ($n eq '') { push(@word,$w); $n = pack('S',$#word); $num{$w} = $n; } $lookup{$a . $p} .= $n; } 1; } sub reset { my ($key,$counter); $counter = 400; while (--$counter > 0) { ($key) = each(%lookup); next if $key eq ''; ($a,$p) = (substr($key,0,2),substr($key,2,2)); $n = $lookup{$a . $p}; $n = substr($n,int(rand(length($n))) & 0177776,2); ($w) = unpack('S',$n); last if $word[$w]=~/^[A-Z]/; } } sub regurgitate { my $words = shift; my $result = ''; my $i; my $end_on_sentence = 0; my $limit = $words; if ($words >= 10) { $end_on_sentence++; $limit *= 3; } for ($i=0;$i<$limit;$i++) { $n = $lookup{$a . $p}; ($foo,$n) = each(%lookup) if $n eq ''; $n = substr($n,int(rand(length($n))) & 0177776,2); $a = $p; $p = $n; ($w) = unpack('S',$n); $w = $word[$w]; # most of this formatting stuff is only relevant for
 text,
	# but we leave it in for that purpose
	$col += length($w) + 1;
	if ($col >= 65) {
	    $col = 0;
	    $result .= "\n";
	} else {
	    $result .= ' ';
	}
	$result .= $w;
	if ($w =~ /\.$/) {
	    if (rand() < .1) {
		$result .= "\n";
		$col = 80;
	    }
	}
	last if $w =~/[.?!]$/ && $end_on_sentence && $i >= $words;
    }
    return $result;
}