|
|
||
追記
Benchmark: timing 100 iterations of libxml, parser, regexp, treebuilder...
regexp: 1 wallclock secs ( 0.97 usr + 0.01 sys = 0.98 CPU) @ 102.04/s (n=100)
parser: 1 wallclock secs ( 1.66 usr + 0.00 sys = 1.66 CPU) @ 60.24/s (n=100)
libxml: 3 wallclock secs ( 2.56 usr + 0.02 sys = 2.58 CPU) @ 38.76/s (n=100)
treebuilder: 75 wallclock secs (74.19 usr + 0.08 sys = 74.27 CPU) @ 1.35/s (n=100)
use strict; use File::Slurp qw(slurp); use HTML::TreeBuilder::XPath; use HTML::Selector::XPath 'selector_to_xpath'; use HTML::Parser; use HTML::TreeBuilder::LibXML; use Benchmark qw(:all); my $xpath = selector_to_xpath('.hatena-asin-detail-info'); my $hatena1 = slurp("motemen.txt"); # motemen/20080926/1222434809 warn parser($hatena1); warn treebuilder($hatena1); warn libxml($hatena1); timethese 100, { regexp => sub { html2text($hatena1) }, treebuilder => sub { treebuilder($hatena1) }, libxml => sub { libxml($hatena1) }, parser => sub { parser($hatena1) }, }; sub html2text { my $str = shift; $str =~ s/<.*?>//g; $str =~ s/\s//g; $str; } sub parser { my $html = shift; my $out; my $ignore; my $tag_opened = 0; HTML::Parser->new( start_h => [ sub { my ( $tag, $attr, $text ) = @_; if ( $attr->{class} =~ /hatena-asin-detail-info/ ) { $ignore = 1; } $tag_opened++ if $ignore; }, 'tag,attr,text' ], end_h => [ sub { $tag_opened-- if $ignore; if ($tag_opened == 0 && $ignore) { $ignore = 0; } } ], default_h => [ sub { $out .= shift unless $ignore }, "text" ], comment_h => [""], )->parse($html); $out =~ s{\s}{}g; return $out; } sub treebuilder { my $html = shift; my $p = HTML::TreeBuilder::XPath->new; $p->parse($html); my $nodes = $p->findnodes($xpath); for my $n (@{$nodes}) { $n->delete; } my $t = $p->as_text(""); $t =~ s{\s}{}g; return $t; } sub libxml { my $html = shift; utf8::decode($html) unless utf8::is_utf8 $html; my $p = HTML::TreeBuilder::LibXML->new; $p->parse($html); my $nodes = $p->findnodes($xpath); for my $n (@{$nodes}) { $n->delete; } my $t = $p->as_text(""); $t =~ s{\s}{}g; return $t; }