Plagger::Plugin::Filter::Summarize::Japanese

とりあえずさらしてみる。

package Plagger::Plugin::Filter::Summarize::Japanese;
use strict;
use base qw( Plagger::Plugin );

our $VERSION = '0.01';

use Encode;
use Lingua::JA::Summarize;

# Subscribes this plugin's filter() callback to the
# 'update.entry.fixup' hook on the given context.
#
#   $self    - the plugin instance being registered
#   $context - object providing register_hook()
sub register {
    my($self, $context) = @_;

    # Hook name => callback pairs, handed to register_hook as a flat list.
    my @hooks = ('update.entry.fixup' => \&filter);

    $context->register_hook($self, @hooks);
}

# Hook callback (registered for 'update.entry.fixup'): runs the entry's
# body text through Lingua::JA::Summarize, then attaches every extracted
# keyword to the entry as a tag and, when the add_keywords_to_title
# option is true, appends it to the entry's title as " [keyword]".
#
#   $self    - this plugin; its conf supplies encoding, maxwords,
#              summarize_conf and add_keywords_to_title
#   $context - object used for debug logging
#   $args    - hashref; the entry to process is taken from $args->{entry}
sub filter {
    my($self, $context, $args) = @_;

    # Fall back to the defaults when the options are unset (or false).
    my $dict_encoding = $self->conf->{encoding} || 'euc-jp';
    my $keyword_limit = $self->conf->{maxwords} || 5;

    # Encode the body text into the configured encoding before analysis.
    my $entry  = $args->{entry};
    my $octets = encode($dict_encoding, $entry->body_text);

    my $summarizer = Lingua::JA::Summarize->new(
        $self->conf->{summarize_conf}
    );

    # analyze() sometimes croaks for minor reasons; trap the failure,
    # log it at debug level, and carry on regardless.
    eval { $summarizer->analyze($octets) };
    if ($@) {
        $context->log(debug => "summarize error: $@");
    }

    # As of Lingua::JA::Summarize 0.02 the module has hardcoded euc-jp
    # strings, and the summarized data/keywords always come back as
    # euc-jp whatever the encoding of the mecab dictionary or of the
    # incoming text is. Hence the fixed 'euc-jp' here rather than the
    # configured encoding.
    my @keywords;
    for my $raw ($summarizer->keywords({ maxwords => $keyword_limit })) {
        push @keywords, decode('euc-jp', $raw);
    }

    foreach my $word (@keywords) {
        $entry->add_tag($word);

        # Title decoration is opt-in.
        next unless $self->conf->{add_keywords_to_title};
        $entry->title($entry->title . " [$word]");
    }
}

1;

__END__

=head1 NAME

Plagger::Plugin::Filter::Summarize::Japanese - extract important keywords

=head1 SYNOPSIS

  - module: Filter::Summarize::Japanese
    config:
      summarize_conf:
        mecab: /usr/local/bin/mecab
        default_code: 800
      encoding: euc-jp
      maxwords: 5
      add_keywords_to_title: 1

=head1 CONFIG

=over 4

=item summarize_conf

Passed as-is to Lingua::JA::Summarize->new(). See that module's
documentation for details.

=item encoding

Your mecab dictionary's encoding. Defaults to 'euc-jp'.
Set to 'shiftjis' if you use pre-compiled mecab for Windows.

=item maxwords

How many keywords you'd like to extract. Defaults to 5.

=item add_keywords_to_title

If set to true, keywords (enclosed in brackets) will be added to
each entry's title.

=back

=head1 AUTHOR

Kenichi Ishigaki

=head1 SEE ALSO

L<Plagger>, L<Lingua::JA::Summarize>

=cut