otsune's SnakeOil RSSフィード

python -c "print''.join([chr(x) for x in 111&1101,110+~-~11,10^11*11,110+~-~11,-~101,-~11^11*11,~-110,111&11*11,11+11+10,11|~-0110,1-~11^11*11,10^11*11,-~11^11*11,110,101])"

2008-06-18

Filter::AutoTaggingをみてdiff書いた

| Filter::AutoTaggingをみてdiff書いた - otsune's SnakeOil  を含むブックマーク はてなブックマーク - Filter::AutoTaggingをみてdiff書いた - otsune's SnakeOil  Filter::AutoTaggingをみてdiff書いた - otsune's SnakeOil  のブックマークコメント

久しぶりにPlaggerでなんかやってみた | 人生迷い箸

  • dbのデフォルトファイル名をid_safeから生成するように
  • Encode::encode, Encode::decodeでいちいちcharsetを検索するのはおもいのでfind_encodingして使い回すように
  • perltidy -pbpかけた

あとこの手のプラグインは t/plugins/Filter-AutoTagging/*.t が無いとdebug大変そうだからテスト書くと楽に作れると思った。

--- AutoTagging.pm  2008-06-18 14:35:04.000000000 +0900
+++ AutoTagging_new.pm  2008-06-18 15:33:05.000000000 +0900
@@ -7,22 +7,26 @@
 use Algorithm::NaiveBayes;
 use Algorithm::NaiveBayes::Model::Frequency;
 use Storable;
-#use Data::Dumper;
-
 use Encode;
 
 sub init {
     my $self = shift;
 
     $self->SUPER::init(@_);
-    $self->conf->{db}             ||= '/tmp/plagger_plugin_filter_autotagging.db';
-    $self->conf->{mecab_charset}  ||= 'utf-8';
-    $self->conf->{generate_tags}  ||= 0;
-    $self->conf->{guess_tags}     ||= 3;
-    $self->conf->{threshold}      ||= 0.9;
-
-    eval {$self->{bayes} = Storable::retrieve($self->conf->{db})} if -e $self->conf->{db};
-    $self->{bayes} ||= Algorithm::NaiveBayes->new(purge => 0);
+    if (!exists $self->conf->{db}) {
+        my $file =
+        Plagger::Util::filename_for($feed, $self->conf->{filename} || '%i.db');
+        my $path = File::Spec->catfile($self->conf->{dir} || '/tmp', $file);
+        $self->conf->{db} = $path;
+    }; #'/tmp/%%id_safe%%.db';
+    $self->conf->{mecab_charset} ||= 'utf-8';
+    $self->conf->{generate_tags} ||= 0;
+    $self->conf->{guess_tags}    ||= 3;
+    $self->conf->{threshold}     ||= 0.9;
+
+    eval { $self->{bayes} = Storable::retrieve( $self->conf->{db} ) }
+        if -e $self->conf->{db};
+    $self->{bayes} ||= Algorithm::NaiveBayes->new( purge => 0 );
 }
 
 sub register {
@@ -34,70 +38,78 @@
     my ( $self, $context, $args ) = @_;
 
     foreach my $entry ( $args->{feed}->entries ) {
-        $self->update($context, $entry); 
+        $self->update( $context, $entry );
     }
 }
 
 sub update {
-    my($self, $context, $entry) = @_;
+    my ( $self, $context, $entry ) = @_;
 
-    my ( $words, $gen_tags ) = $self->parse_terms($context, $entry->{body});
+    my ( $words, $gen_tags ) = $self->parse_terms( $context, $entry->{body} );
 
-    if (@{ $entry->{tags} }) {
-        $context->log( info => "entry tags : " . join(",", @{ $entry->{tags} } ) ) 
+    if ( @{ $entry->{tags} } ) {
+        $context->log(
+            info => 'entry tags : ' . join( ',', @{ $entry->{tags} } ) );
     }
 
-    if ($self->conf->{generate_tags}) {
-        $self->add_tags($context, $entry, $gen_tags);
-        $context->log( info => "generated tags : " . join(",", @$gen_tags) ) if (@$gen_tags);
+    if ( $self->conf->{generate_tags} ) {
+        $self->add_tags( $context, $entry, $gen_tags );
+        $context->log( info => 'generated tags : ' . join( ',', @$gen_tags ) )
+            if (@$gen_tags);
     }
-    
-    if ($self->conf->{training}) {
-        if (@{ $entry->{tags} }) {
-            $self->train_bayes($context, $entry, $words);
-            Storable::store($self->{bayes} => $self->conf->{db});
+
+    if ( $self->conf->{training} ) {
+        if ( @{ $entry->{tags} } ) {
+            $self->train_bayes( $context, $entry, $words );
+            Storable::store( $self->{bayes} => $self->conf->{db} );
         }
     }
 
-    if ($self->conf->{guess_tags}) {
-        my $guess_tags = $self->guess_tags($context, $words);
-        $self->add_tags($context, $entry, $guess_tags);
-        $context->log( info => "guessed tags : " . join(",", @$guess_tags) ) if (@$guess_tags);
+    if ( $self->conf->{guess_tags} ) {
+        my $guess_tags = $self->guess_tags( $context, $words );
+        $self->add_tags( $context, $entry, $guess_tags );
+        $context->log( info => 'guessed tags : ' . join( ',', @$guess_tags ) )
+            if (@$guess_tags);
     }
 }
 
 sub train_bayes {
-    my($self, $context, $entry, $words) = @_;
+    my ( $self, $context, $entry, $words ) = @_;
 
     $self->{bayes}->add_instance(
         attributes => $words,
-        label => $entry->{tags},
+        label      => $entry->{tags},
     );
     $self->{bayes}->train;
-    $context->log( info => "train : " . join ',' , @{ $entry->{tags} } );
+    $context->log( info => 'train : ' . join ',', @{ $entry->{tags} } );
 }
 
 sub parse_terms {
-    my($self, $context, $body) = @_;
+    my ( $self, $context, $body ) = @_;
 
+    my $enc = Encode::find_encoding( $self->conf->{mecab_charset} );
     my $m = Text::MeCab->new;
 
     my %words = ();
-    my %tags = ();
+    my %tags  = ();
 
-    my $plain_text = Encode::encode($self->conf->{mecab_charset}, Plagger::Util::strip_html($body));
+    my $plain_text = $enc->encode( Plagger::Util::strip_html($body) );
 
     for ( my $node = $m->parse($plain_text); $node; $node = $node->next ) {
-        my $surface = Encode::decode($self->conf->{mecab_charset}, $node->surface); 
-        my $feature = Encode::decode($self->conf->{mecab_charset}, $node->feature);
-        if (length($surface)) {
+        my $surface = $enc->decode( $node->surface );
+        my $feature = $enc->decode( $node->feature );
+        if ( length($surface) ) {
             $words{$surface}++;
-            my ($t1, $t2) = split /,/, $feature;
+            my ( $t1, $t2 ) = split /,/, $feature;
             {
-              use utf8;
-              if ($t1 eq '名詞' && $t2 ne '数' && $t2 ne '代名詞' && $t2 ne 'サ変接続') {
-                  $tags{$surface}++;
-              }
+                use utf8;
+                if (   $t1 eq '名詞'
+                    && $t2 ne '数'
+                    && $t2 ne '代名詞'
+                    && $t2 ne 'サ変接続' )
+                {
+                    $tags{$surface}++;
+                }
             }
         }
     }
@@ -105,30 +117,30 @@
     # sort
     my @gen_tags;
     my $cnt = $self->conf->{generate_tags};
-    foreach my $key ( sort {$tags{$b} <=> $tags{$a}} keys %tags ) {
+    foreach my $key ( sort { $tags{$b} <=> $tags{$a} } keys %tags ) {
         last unless $cnt--;
         push @gen_tags, $key;
     }
 
-    return (\%words, \@gen_tags);
-} 
+    return ( \%words, \@gen_tags );
+}
 
 sub guess_tags {
     my ( $self, $context, $words ) = @_;
 
     my @tags;
-    
+
     return \@tags unless $self->{bayes}->labels;
 
-    my $result = $self->{bayes}->predict(
-        attributes => $words
-    );
- 
+    my $result = $self->{bayes}->predict( attributes => $words );
+
     my %guess_tags = %$result;
 
     my $cnt = $self->conf->{guess_tags};
-    foreach my $key ( sort {$guess_tags{$b} <=> $guess_tags{$a}} keys %guess_tags ) {
-        if ($guess_tags{$key} >= $self->conf->{threshold}) {
+    foreach my $key ( sort { $guess_tags{$b} <=> $guess_tags{$a} }
+        keys %guess_tags )
+    {
+        if ( $guess_tags{$key} >= $self->conf->{threshold} ) {
             last unless $cnt--;
             push @tags, $key;
         }
@@ -156,7 +168,8 @@
   - module: Filter::AutoTagging
     config:
       mecab_charset: utf-8
-      db: /path/to/db_file.db
+      db: /path/to
+      filename: db_file.db
       threshold: 0.5
       generate_tags: 3
       guess_tags: 5

valiumvalium2008/06/18 18:09ostuneさん、素早い反応とアドバイスどうもありがとうございます。今日はちょっと時間がとれないので、明日適用して試してみようと思います。m(__)m

valiumvalium2008/06/19 16:01↑のコメント、お名前打ち間違えました。お恥ずかしい。

いただいたアドバイスを取り込ませていただきましたので取り急ぎご連絡まで。

トラックバック - http://subtech.g.hatena.ne.jp/otsune/20080618