root/trunk/plagger/lib/Plagger/Plugin/Filter/GuessLanguage.pm

Revision 1249 (checked in by charsbar, 14 years ago)

added Filter::GuessLanguage?

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
Line 
1 package Plagger::Plugin::Filter::GuessLanguage;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use Text::Language::Guess;
6 use Locale::Language;
7 use Lingua::ZH::HanDetect;
8
9 sub register {
10     my($self, $context) = @_;
11     $context->register_hook(
12         $self,
13         'plugin.init'        => \&init_guesser,
14         'update.entry.fixup' => \&guess,
15     );
16 }
17
18 sub rule_hook { 'update.entry.fixup' }
19
20 my $re_western_lang_code = qr/^(?:en|fr|es|pt|it|de|nl|sv|no|da)$/;
21
22 sub init_guesser {
23     my ($self, $context, $args) = @_;
24
25     my @western_languages; # ie. Text::Language::Guess-able languages
26     my %accepts;
27
28     foreach my $lang (@{ $self->conf->{languages} || [] }) {
29
30         # see if $lang is human friendly lang name
31         if (my $code = language2code($lang)) {
32             push @western_languages, $code if $code =~ $re_western_lang_code;
33             $accepts{$code} = 1;
34         }
35
36         # see if $lang is existing lang code
37         elsif (my $name = code2language($lang)) {
38             push @western_languages, $lang if $lang =~ $re_western_lang_code;
39             $accepts{$lang} = 1;
40         }
41
42         # $lang is something wrong or unsupported
43         else {
44             $context->log(warn => "no such language: $lang");
45         }
46     }
47
48     $self->{guess_language}->{accepts} = \%accepts;
49     $self->{guess_language}->{western} = Text::Language::Guess->new(
50         @western_languages
51             ? ( languages => \@western_languages )
52             : ()
53     );
54 }
55
56 sub guess {
57     my ($self, $context, $args) = @_;
58
59     my $target = $self->conf->{target} || 'feed';
60
61     my $guessed;
62     if (!$guessed && $target =~ /both|entry/) {
63         $guessed = $self->guess_entry($context, $args);
64     }
65     if (!$guessed && $target =~ /both|feed/) {
66         $guessed = $self->guess_feed($context, $args);
67     }
68 }
69
70 sub guess_language {
71     my ($self, $text) = @_;
72
73     my $code;
74
75     # xxx: just a quick hack. there may be a better way.
76
77     my %accepts = %{ $self->{guess_language}->{accepts} };
78
79     if (!%accepts || $accepts{ja}) {
80         return 'ja' if $text =~ /\p{Hiragana}|\p{Katakana}/s;
81     }
82     if (!%accepts || $accepts{ko}) {
83         return 'ko' if $text =~ /\p{Hangul}/s;
84     }
85     if (!%accepts || $accepts{zh}) {
86         my ($encoding, $variant) = Lingua::ZH::HanDetect::han_detect($text);
87         return 'zh' if $encoding && $variant; # maybe chinese (but maybe j/k)
88     }
89
90     $code = $self->{guess_language}->{western}->language_guess_string($text);
91
92     # skip if no western lang is allowed
93     return $code if !%accepts || $accepts{$code};
94
95     return# doomed!
96 }
97
98 sub guess_feed {
99     my ($self, $context, $args) = @_;
100
101     return $args->{feed}->language if $args->{feed}->language;
102
103     $context->log(debug => "start guessing language");
104
105     my $body = join "\n", map $_->body_text, $args->{feed}->entries;
106
107     my $code = $self->guess_language($body);
108
109     if ($code) {
110         $context->log(debug => "guessed: $code");
111         $args->{feed}->language($code);
112         return $code;
113     }
114     else {
115         $context->log(debug => "can't identify the feed's language");
116         return;
117     }
118 }
119
120 sub guess_entry {
121     my ($self, $context, $args) = @_;
122
123     return $args->{entry}->{language} if $args->{entry}->{language};
124
125     $context->log(debug => "start guessing entry's language");
126
127     my $code = $self->guess_language($args->{entry}->body_text);
128
129     if ($code) {
130         $context->log(debug => "guessed: $code");
131         $args->{entry}->{language} = $code;
132         return $code;
133     }
134     else {
135         $context->log(debug => "can't identify the entry's language");
136         return;
137     }
138 }
139
140 1;
141
142 __END__
143
144 =head1 NAME
145
146 Plagger::Plugin::Filter::GuessLanguage - guess language of feeds/entries
147
148 =head1 SYNOPSIS
149
150   - module: Filter::GuessLanguage
151     config:
152       languages:
153         - en
154         - de
155         - Japanese
156       target: both
157
158 =head1 DESCRIPTION
159
160 =head1 CONFIG
161
162 =over 4
163
164 =item languages (optional)
165
166 Which languages you think the feeds/entries are written in.
167 English language names and ISO two letter codes are acceptable.
168 Unless you DO want to limit, specify nothing.
169
170 =item target
171
172 'entry' or 'feed' (default) or 'both'.
173
174 =back
175
176 =head1 AUTHOR
177
178 Kenichi Ishigaki
179
180 =head1 SEE ALSO
181
182 L<Plagger>, L<Text::Language::Guess>
183
184 =cut
Note: See TracBrowser for help on using the browser.