Plagger::Plugin::Filter::Summarize::Japanese
とりあえずさらしてみる。
package Plagger::Plugin::Filter::Summarize::Japanese;
use strict;
use warnings;
use base qw( Plagger::Plugin );

our $VERSION = '0.01';

use Encode;
use Lingua::JA::Summarize;

# Attach filter() to the 'update.entry.fixup' hook of the given context.
sub register {
    my($self, $context) = @_;
    $context->register_hook(
        $self,
        'update.entry.fixup' => \&filter,
    );
}

# Extract keywords from an entry's body text and attach them to the entry.
#
# Arguments:
#   $self    - this plugin; configuration is read from $self->conf
#   $context - object used for logging
#   $args    - hash reference; the entry is taken from $args->{entry}
#
# Every extracted keyword is added to the entry as a tag.  When the
# 'add_keywords_to_title' option is true, each keyword is also appended
# to the entry's title as " [keyword]".
sub filter {
    my($self, $context, $args) = @_;

    my $encoding = $self->conf->{encoding} || 'euc-jp';
    my $maxwords = $self->conf->{maxwords} || 5;

    my $entry = $args->{entry};

    # Nothing to summarize: avoid encoding undef and running the analyzer
    # on an empty string.
    my $body = $entry->body_text;
    return unless defined $body && length $body;

    # The analyzer is fed bytes in the configured (mecab dictionary) encoding.
    my $text = encode($encoding, $body);

    my $summary = Lingua::JA::Summarize->new( $self->conf->{summarize_conf} );

    # sometimes this croaks for minor reasons.
    # A failure is only logged; keyword extraction below is still attempted
    # (best effort).  The trailing "1" makes success detectable even when
    # $@ has been clobbered by the time it is inspected.
    my $analyzed = eval { $summary->analyze($text); 1 };
    unless ($analyzed) {
        my $error = $@ || 'unknown error';
        $context->log(debug => "summarize error: $error");
    }

    # As of 0.02, Lingua::JA::Summarize has hardcoded euc-jp strings
    # and the summarized data/keywords are always euc-jp
    # whatever the encodings of the mecab dictionary or
    # incoming texts are.
    my @keywords = map { decode('euc-jp', $_) }
                   $summary->keywords({ maxwords => $maxwords });

    my $add_to_title = $self->conf->{add_keywords_to_title};
    foreach my $keyword (@keywords) {
        $entry->add_tag($keyword);
        $entry->title($entry->title . " [$keyword]") if $add_to_title;
    }
}

1;

__END__

=head1 NAME

Plagger::Plugin::Filter::Summarize::Japanese - extract important keywords

=head1 SYNOPSIS

  - module: Filter::Summarize::Japanese
    config:
      summarize_conf:
        mecab: /usr/local/bin/mecab
        default_code: 800
      encoding: euc-jp
      maxwords: 5
      add_keywords_to_title: 1

=head1 DESCRIPTION

This plugin analyzes the body text of each entry with
L<Lingua::JA::Summarize>, adds the extracted keywords to the entry as
tags and, optionally, appends them to the entry's title.

=head1 CONFIG

=over 4

=item summarize_conf

Will be passed to Lingua::JA::Summarize->new();
See the module's documentation for details.

=item encoding

Your mecab dictionary's encoding. Defaults to 'euc-jp'.
Set to 'shiftjis' if you use pre-compiled mecab for Windows.

=item maxwords

How many keywords you'd like to extract. Defaults to 5.

=item add_keywords_to_title

If set to true, keywords (enclosed in brackets) will be added
to each entry's title.

=back

=head1 AUTHOR

Kenichi Ishigaki

=head1 SEE ALSO

L<Plagger>, L<Lingua::JA::Summarize>

=cut