2008-06-18
Filter::AutoTaggingをみてdiff書いた
plagger |
![]()
- dbのデフォルトファイル名をid_safeから生成するように
- Encode::encode, Encode::decodeでいちいちcharsetを検索するのはおもいのでfind_encodingして使い回すように
- perltidy -pbpかけた
あとこの手のプラグインは t/plugins/Filter-AutoTagging/*.t が無いとdebug大変そうだからテスト書くと楽に作れると思った。
--- AutoTagging.pm 2008-06-18 14:35:04.000000000 +0900 +++ AutoTagging_new.pm 2008-06-18 15:33:05.000000000 +0900 @@ -7,22 +7,26 @@ use Algorithm::NaiveBayes; use Algorithm::NaiveBayes::Model::Frequency; use Storable; -#use Data::Dumper; - use Encode; sub init { my $self = shift; $self->SUPER::init(@_); - $self->conf->{db} ||= '/tmp/plagger_plugin_filter_autotagging.db'; - $self->conf->{mecab_charset} ||= 'utf-8'; - $self->conf->{generate_tags} ||= 0; - $self->conf->{guess_tags} ||= 3; - $self->conf->{threshold} ||= 0.9; - - eval {$self->{bayes} = Storable::retrieve($self->conf->{db})} if -e $self->conf->{db}; - $self->{bayes} ||= Algorithm::NaiveBayes->new(purge => 0); + if (!exists $self->conf->{db}) { + my $file = + Plagger::Util::filename_for($feed, $self->conf->{filename} || '%i.db'); + my $path = File::Spec->catfile($self->conf->{dir} || '/tmp', $file); + $self->conf->{db} = $path; + }; #'/tmp/%%id_safe%%.db'; + $self->conf->{mecab_charset} ||= 'utf-8'; + $self->conf->{generate_tags} ||= 0; + $self->conf->{guess_tags} ||= 3; + $self->conf->{threshold} ||= 0.9; + + eval { $self->{bayes} = Storable::retrieve( $self->conf->{db} ) } + if -e $self->conf->{db}; + $self->{bayes} ||= Algorithm::NaiveBayes->new( purge => 0 ); } sub register { @@ -34,70 +38,78 @@ my ( $self, $context, $args ) = @_; foreach my $entry ( $args->{feed}->entries ) { - $self->update($context, $entry); + $self->update( $context, $entry ); } } sub update { - my($self, $context, $entry) = @_; + my ( $self, $context, $entry ) = @_; - my ( $words, $gen_tags ) = $self->parse_terms($context, $entry->{body}); + my ( $words, $gen_tags ) = $self->parse_terms( $context, $entry->{body} ); - if (@{ $entry->{tags} }) { - $context->log( info => "entry tags : " . join(",", @{ $entry->{tags} } ) ) + if ( @{ $entry->{tags} } ) { + $context->log( + info => 'entry tags : ' . join( ',', @{ $entry->{tags} } ) ); } - if ($self->conf->{generate_tags}) { - $self->add_tags($context, $entry, $gen_tags); - $context->log( info => "generated tags : " . join(",", @$gen_tags) ) if (@$gen_tags); + if ( $self->conf->{generate_tags} ) { + $self->add_tags( $context, $entry, $gen_tags ); + $context->log( info => 'generated tags : ' . join( ',', @$gen_tags ) ) + if (@$gen_tags); } - - if ($self->conf->{training}) { - if (@{ $entry->{tags} }) { - $self->train_bayes($context, $entry, $words); - Storable::store($self->{bayes} => $self->conf->{db}); + + if ( $self->conf->{training} ) { + if ( @{ $entry->{tags} } ) { + $self->train_bayes( $context, $entry, $words ); + Storable::store( $self->{bayes} => $self->conf->{db} ); } } - if ($self->conf->{guess_tags}) { - my $guess_tags = $self->guess_tags($context, $words); - $self->add_tags($context, $entry, $guess_tags); - $context->log( info => "guessed tags : " . join(",", @$guess_tags) ) if (@$guess_tags); + if ( $self->conf->{guess_tags} ) { + my $guess_tags = $self->guess_tags( $context, $words ); + $self->add_tags( $context, $entry, $guess_tags ); + $context->log( info => 'guessed tags : ' . join( ',', @$guess_tags ) ) + if (@$guess_tags); } } sub train_bayes { - my($self, $context, $entry, $words) = @_; + my ( $self, $context, $entry, $words ) = @_; $self->{bayes}->add_instance( attributes => $words, - label => $entry->{tags}, + label => $entry->{tags}, ); $self->{bayes}->train; - $context->log( info => "train : " . join ',' , @{ $entry->{tags} } ); + $context->log( info => 'train : ' . join ',', @{ $entry->{tags} } ); } sub parse_terms { - my($self, $context, $body) = @_; + my ( $self, $context, $body ) = @_; + my $enc = Encode::find_encoding( $self->conf->{mecab_charset} ); my $m = Text::MeCab->new; my %words = (); - my %tags = (); + my %tags = (); - my $plain_text = Encode::encode($self->conf->{mecab_charset}, Plagger::Util::strip_html($body)); + my $plain_text = $enc->encode( Plagger::Util::strip_html($body) ); for ( my $node = $m->parse($plain_text); $node; $node = $node->next ) { - my $surface = Encode::decode($self->conf->{mecab_charset}, $node->surface); - my $feature = Encode::decode($self->conf->{mecab_charset}, $node->feature); - if (length($surface)) { + my $surface = $enc->decode( $node->surface ); + my $feature = $enc->decode( $node->feature ); + if ( length($surface) ) { $words{$surface}++; - my ($t1, $t2) = split /,/, $feature; + my ( $t1, $t2 ) = split /,/, $feature; { - use utf8; - if ($t1 eq '名詞' && $t2 ne '数' && $t2 ne '代名詞' && $t2 ne 'サ変接続') { - $tags{$surface}++; - } + use utf8; + if ( $t1 eq '名詞' + && $t2 ne '数' + && $t2 ne '代名詞' + && $t2 ne 'サ変接続' ) + { + $tags{$surface}++; + } } } } @@ -105,30 +117,30 @@ # sort my @gen_tags; my $cnt = $self->conf->{generate_tags}; - foreach my $key ( sort {$tags{$b} <=> $tags{$a}} keys %tags ) { + foreach my $key ( sort { $tags{$b} <=> $tags{$a} } keys %tags ) { last unless $cnt--; push @gen_tags, $key; } - return (\%words, \@gen_tags); -} + return ( \%words, \@gen_tags ); +} sub guess_tags { my ( $self, $context, $words ) = @_; my @tags; - + return \@tags unless $self->{bayes}->labels; - my $result = $self->{bayes}->predict( - attributes => $words - ); - + my $result = $self->{bayes}->predict( attributes => $words ); + my %guess_tags = %$result; my $cnt = $self->conf->{guess_tags}; - foreach my $key ( sort {$guess_tags{$b} <=> $guess_tags{$a}} keys %guess_tags ) { - if ($guess_tags{$key} >= $self->conf->{threshold}) { + foreach my $key ( sort { $guess_tags{$b} <=> $guess_tags{$a} } + keys %guess_tags ) + { + if ( $guess_tags{$key} >= $self->conf->{threshold} ) { last unless $cnt--; push @tags, $key; } @@ -156,7 +168,8 @@ - module: Filter::AutoTagging config: mecab_charset: utf-8 - db: /path/to/db_file.db + db: /path/to + filename: db_file.db threshold: 0.5 generate_tags: 3 guess_tags: 5
トラックバック - http://subtech.g.hatena.ne.jp/otsune/20080618

いただいたアドバイスを取り込ませていただきましたので取り急ぎご連絡まで。