root/trunk/plagger/lib/Plagger/Plugin/Filter/EntryFullText.pm

Revision 1299 (checked in by miyagawa, 14 years ago)

EntryFullText?: fixed UTF-8 warnings in LWP

Line 
1 package Plagger::Plugin::Filter::EntryFullText;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use DirHandle;
6 use Encode;
7 use File::Spec;
8 use List::Util qw(first);
9 use HTML::ResolveLink;
10 use Plagger::Date; # for metadata in plugins
11 use Plagger::Util qw( decode_content );
12 use Plagger::Plugin::CustomFeed::Simple;
13 use Plagger::UserAgent;
14
# Returns the hook name 'update.entry.fixup'.
sub rule_hook {
    return 'update.entry.fixup';
}
16
# Register this plugin's callbacks with the given context:
#   customfeed.handle  -> handle()
#   update.entry.fixup -> filter()
sub register {
    my ($self, $context) = @_;

    # Kept as an ordered list (not a hash) so the hooks are passed to
    # register_hook in the same order as they are written here.
    my @hooks = (
        'customfeed.handle'  => \&handle,
        'update.entry.fixup' => \&filter,
    );

    $context->register_hook($self, @hooks);
}
25
# Initialise the plugin: run the parent class init with the same arguments,
# load every site handler from the assets directory, then create the
# Plagger::UserAgent used by filter() with parse_head switched off.
sub init {
    my ($self, @args) = @_;

    $self->SUPER::init(@args);
    $self->load_plugins();

    my $ua = $self->{ua} = Plagger::UserAgent->new;
    $ua->parse_head(0);
}
34
# Scan the plugin's assets directory and load every regular file whose name
# ends in .pl or .yaml, in sorted order of the directory entries.
# Reports "$dir: $!" through the context's error() when the directory
# cannot be opened.
sub load_plugins {
    my $self = shift;
    my $context = Plagger->context;

    my $dir = $self->assets_dir;
    my $dh  = DirHandle->new($dir);
    $context->error("$dir: $!") unless $dh;

    # First collect [ full path, bare name ] pairs for the handler files ...
    my @handler_files;
    for my $name (sort $dh->read) {
        my $path = File::Spec->catfile($dir, $name);
        next unless -f $path;
        next unless $path =~ /\.(?:pl|yaml)$/;
        push @handler_files, [ $path, $name ];
    }

    # ... then load them one by one.
    for my $pair (@handler_files) {
        my ($path, $name) = @$pair;
        $self->load_plugin($path, $name);
    }
}
46
# Load one handler file and append the resulting handler object(s) to
# $self->{plugins}.  Files ending in .pl go through load_plugin_perl,
# everything else through load_plugin_yaml.
#   $file - path of the handler file
#   $base - bare file name, passed through to the loader
sub load_plugin {
    my ($self, $file, $base) = @_;

    Plagger->context->log(debug => "loading $file");

    my $load_method;
    if ($file =~ /\.pl$/) {
        $load_method = 'load_plugin_perl';
    }
    else {
        $load_method = 'load_plugin_yaml';
    }

    # A loader may return several handlers, so collect in list context.
    my @loaded = $self->$load_method($file, $base);
    push @{ $self->{plugins} }, @loaded;
}
55
# Compile a Perl site handler and return an instance of it.
#
#   $file - path of the .pl file
#   $base - bare file name; with the ".pl" suffix removed it becomes the last
#           component of the handler's package name and its site_name
#
# If the file does not start with its own "package" statement, the code is
# wrapped in a package that inherits from
# Plagger::Plugin::Filter::EntryFullText::Site.  When the target class
# already has a new() method the file is not compiled again.
# Open and compile failures are reported through the context's error().
sub load_plugin_perl {
    my($self, $file, $base) = @_;

    # Three-argument open: the file name is taken literally, so leading or
    # trailing whitespace and mode characters ('<', '>', '|') in the name
    # are never interpreted by open().
    open my $fh, '<', $file or Plagger->context->error("$file: $!");
    (my $pkg = $base) =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::Filter::EntryFullText::Site::$pkg";

    if ($plugin_class->can('new')) {
        close $fh;
        Plagger->context->log(warn => "$plugin_class is already defined. skip compiling code");
        return $plugin_class->new;
    }

    my $code = join '', <$fh>;
    close $fh;

    unless ($code =~ /^\s*package/s) {
        $code = join "\n",
            ( "package $plugin_class;",
              "use strict;",
              "use base qw( Plagger::Plugin::Filter::EntryFullText::Site );",
              "sub site_name { '$pkg' }",
              $code,
              "1;" );
    }

    # NOTE(review): handler files are executed as code; only trusted files
    # should be placed in the assets directory.
    eval $code;
    Plagger->context->error($@) if $@;

    return $plugin_class->new;
}
84
# Load a YAML handler file and return one
# Plagger::Plugin::Filter::EntryFullText::YAML object per YAML document
# found in it.
#
#   $file - path of the .yaml file
#   $base - bare file name, stored in each handler as its site name
sub load_plugin_yaml {
    my($self, $file, $base) = @_;

    # This file never loads YAML itself; require it here so the call below
    # does not depend on some other module having loaded it first.
    require YAML;

    my @data = YAML::LoadFile($file);

    return map { Plagger::Plugin::Filter::EntryFullText::YAML->new($_, $base) }
        @data;
}
92
# customfeed.handle callback.  Finds the first loaded handler whose
# custom_feed_handle() accepts $args; if there is one, stores its
# custom_feed_follow_link value in $args->{match} and delegates to
# Plagger::Plugin::CustomFeed::Simple::aggregate.  Returns a false value
# when no handler accepts the feed.
sub handle {
    my ($self, $context, $args) = @_;

    my $handler = first { $_->custom_feed_handle($args) } @{ $self->{plugins} };

    # Nothing claimed this feed: hand back the (false) search result.
    return $handler unless $handler;

    $args->{match} = $handler->custom_feed_follow_link;
    return $self->Plagger::Plugin::CustomFeed::Simple::aggregate($context, $args);
}
102
# update.entry.fixup callback: fetch the entry's permalink and let the site
# handlers replace the entry body (and optionally title, icon and date)
# with what they extract from the page.
#
# Returns 1 when the entry body was replaced, either by a handler or by the
# raw HTML when the store_html_on_failure config is on.
sub filter {
    my ($self, $context, $args) = @_;
    my $entry = $args->{entry};

    # A handler whose handle_force() matches is used even if the entry
    # already carries an HTML body.
    my $handler = first { $_->handle_force($args) } @{ $self->{plugins} };

    my $has_html_body = !$handler && $entry->body && $entry->body =~ /<\w+>/;
    if ($has_html_body && !$self->conf->{force_upgrade}) {
        $self->log(debug => $entry->link . " already contains body. Skipped");
        return;
    }

    unless ($entry->permalink) {
        $self->log(debug => "Entry " . $entry->title . " doesn't have permalink. Skipped");
        return;
    }

    # NoNetwork: don't connect for 3 hours
    my %fetch_opts = ( NoNetwork => 60 * 60 * 3 );
    my $res = $self->{ua}->fetch($entry->permalink, $self, \%fetch_opts);
    if (!$res->status && $res->is_error) {
        return;
    }

    $args->{content} = decode_content($res);

    # if the request was redirected, set it as permalink
    if (my $http_response = $res->http_response) {
        my $base = $http_response->request->uri;
        if ($base ne $entry->permalink) {
            $context->log(info => "rewrite permalink to $base");
            $entry->permalink($base);
        }
    }

    # use Last-Modified to populate entry date, even if handler doesn't find one
    if ($res->last_modified && !$entry->date) {
        $entry->date( Plagger::Date->from_epoch($res->last_modified) );
    }

    # A forced handler is the only candidate; otherwise try every handler.
    my @candidates = $handler ? ($handler) : @{ $self->{plugins} };

    for my $plugin (@candidates) {
        next unless $handler || $plugin->handle($args);

        $context->log(debug => $entry->permalink . " handled by " . $plugin->site_name);

        # extract() may return a plain string (the body) or a hash of fields.
        my $data = $plugin->extract($args);
        if ($data && !ref $data) {
            $data = { body => $data };
        }
        next unless $data;

        $context->log(info => "Extract content succeeded on " . $entry->permalink);

        # Resolve links in the extracted body against the permalink.
        my $resolver = HTML::ResolveLink->new( base => $entry->permalink );
        my $body = $resolver->resolve( $data->{body} );
        $data->{body} = $body;

        $entry->body($body);
        $entry->title($data->{title}) if $data->{title};
        $entry->icon({ url => $data->{icon} }) if $data->{icon};

        # extract date using found one
        $entry->date($data->{date}) if $data->{date};

        return 1;
    }

    # failed to extract: store whole HTML if the config is on
    if ($self->conf->{store_html_on_failure}) {
        $entry->body($args->{content});
        return 1;
    }

    $context->log(warn => "Extract content failed on " . $entry->permalink);
}
170
171
package Plagger::Plugin::Filter::EntryFullText::Site;

# Base class for site handlers.  Every predicate defaults to "does not
# apply"; handlers override the ones they need.

# Construct an empty handler object.
sub new {
    my $class = shift;
    return bless {}, $class;
}

# False by default: the handler does not claim any custom feed.
sub custom_feed_handle {
    return 0;
}

# No follow-link value by default.
sub custom_feed_follow_link {
    return;
}

# False by default: the handler never forces itself onto an entry.
sub handle_force {
    return 0;
}

# False by default: the handler does not handle any entry.
sub handle {
    return 0;
}
178
179 package Plagger::Plugin::Filter::EntryFullText::YAML;
180 use Encode;
181 use List::Util qw(first);
182
# Build a handler object from one YAML document.
#
#   $data - hash reference with the handler settings; it is copied, the
#           caller's hash is not modified
#   $base - name stored under the "base" key (returned by site_name)
#
# The custom_feed_handle / handle / handle_force patterns get a leading "^"
# when they start with http:// or https://.  The extract and
# extract_date_format values (scalar or array of scalars) are decoded from
# UTF-8 bytes into character strings.
sub new {
    my($class, $data, $base) = @_;

    # Work on a shallow copy so the caller's hash is left untouched.
    my %self = %$data;

    # add ^ if handle method starts with http://
    for my $key ( qw(custom_feed_handle handle handle_force) ) {
        next unless defined $self{$key};
        $self{$key} = "^$self{$key}" if $self{$key} =~ m!^https?://!;
    }

    # Decode UTF-8 bytes, but leave strings that already carry the UTF-8
    # flag alone: decode() croaks on strings with wide characters.
    my $to_chars = sub {
        my $value = shift;
        return $value if !defined $value || utf8::is_utf8($value);
        return decode("UTF-8", $value);
    };

    # decode as UTF-8
    for my $key ( qw(extract extract_date_format) ) {
        next unless defined $self{$key};
        if (ref $self{$key} eq 'ARRAY') {
            $self{$key} = [ map { $to_chars->($_) } @{ $self{$key} } ];
        } else {
            $self{$key} = $to_chars->($self{$key});
        }
    }

    $self{base} = $base;
    bless \%self, $class;
}
204
# Name of this handler: the "base" value given to the constructor.
sub site_name {
    my ($self) = @_;
    return $self->{base};
}
209
# Does this handler claim the feed in $args->{feed}?  Matches the feed URL
# against the configured custom_feed_handle pattern; returns 0 when no
# pattern is configured.
sub custom_feed_handle {
    my ($self, $args) = @_;

    my $pattern = $self->{custom_feed_handle};
    return 0 unless $pattern;

    return $args->{feed}->url =~ /$pattern/;
}
215
# Configured custom_feed_follow_link value (undef when not configured).
sub custom_feed_follow_link {
    my ($self) = @_;
    return $self->{custom_feed_follow_link};
}
219
# Should this handler be used for the entry regardless of its current
# body?  Matches the entry permalink against the configured handle_force
# pattern; returns 0 when no pattern is configured.
sub handle_force {
    my ($self, $args) = @_;

    my $pattern = $self->{handle_force};
    return 0 unless $pattern;

    return $args->{entry}->permalink =~ /$pattern/;
}
225
# Does this handler apply to the entry?  Matches the entry permalink
# against the configured handle pattern; returns 0 when no pattern is
# configured.
sub handle {
    my ($self, $args) = @_;

    my $pattern = $self->{handle};
    return 0 unless $pattern;

    return $args->{entry}->permalink =~ /$pattern/;
}
231
# Extract fields from the fetched page in $args->{content}.
#
# Two optional sources are combined:
#   extract / extract_capture - a regexp applied to the whole page; its
#       capture groups are stored under the whitespace-separated names in
#       extract_capture
#   extract_xpath - hash of field name => XPath expression; the first
#       matching node is stored as HTML under that name
#
# When something was extracted, the optional extract_after_hook code is
# evaluated (it can see $self, $args and $data) and a "date" field is
# turned into a Plagger::Date using extract_date_format (a format or a
# list of formats) and extract_date_timezone, or parse_dwim when no format
# is configured.
#
# Returns the hash reference of extracted fields, or nothing when neither
# source produced a field.
sub extract {
    my($self, $args) = @_;
    my $data;

    # Only run the regexp when a pattern is configured.  An empty pattern
    # would match anyway and produce an empty result hash, which the caller
    # treats as a successful extraction.
    if (defined $self->{extract} && length $self->{extract}) {
        if (my @match = $args->{content} =~ /$self->{extract}/s) {
            my @capture = defined $self->{extract_capture}
                ? split(' ', $self->{extract_capture})
                : ();
            @{$data}{@capture} = @match;
        }
    }

    if ($self->{extract_xpath}) {
        eval { require HTML::TreeBuilder::XPath };
        if ($@) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($args->{content});
        $tree->eof;

        for my $capture (keys %{$self->{extract_xpath}}) {
            my $xpath    = $self->{extract_xpath}->{$capture};
            my @children = $tree->findnodes($xpath);
            if (@children) {
                $data->{$capture} = $children[0]->as_HTML;
            } else {
                # No node matched: report it instead of calling a method
                # on an undefined value.
                Plagger->context->log(error => "Can't find node matching $xpath");
            }
        }

        # The extracted values are plain strings, so the parse tree can be
        # released explicitly now.
        $tree->delete;
    }

    if ($data) {
        if ($self->{extract_after_hook}) {
            # NOTE(review): the hook is code taken from the handler
            # configuration and is executed as-is; only trusted handler
            # files should be installed.
            eval $self->{extract_after_hook};
            Plagger->context->error($@) if $@;
        }

        if ($data->{date}) {
            if (my $format = $self->{extract_date_format}) {
                $format = [ $format ] unless ref $format;
                $data->{date} = (map { Plagger::Date->strptime($_, $data->{date}) } @$format)[0];
                if ($data->{date} && $self->{extract_date_timezone}) {
                    $data->{date}->set_time_zone($self->{extract_date_timezone});
                }
            } else {
                $data->{date} = Plagger::Date->parse_dwim($data->{date});
            }
        }

        return $data;
    }

    return;
}
279
280 1;
281
282 __END__
283
284 =head1 NAME
285
286 Plagger::Plugin::Filter::EntryFullText - Upgrade your feeds to fulltext class
287
288 =head1 SYNOPSIS
289
290   - module: Filter::EntryFullText
291
292 =head1 DESCRIPTION
293
294 This plugin fetches the full text of an entry by issuing an HTTP GET
295 for its permalink and applying a regexp to the returned HTML. It's just
296 like upgrading your flight ticket from economy class to business class!
297
298 You can write a custom fulltext handler by putting C<.pl> or C<.yaml>
299 files under the plugin's assets directory.
300
301 =head1 CONFIG
302
303 =over 4
304
305 =item store_html_on_failure
306
307 Even if fulltext handlers fail to extract the content body from HTML,
308 this option stores the whole HTML document as the entry body. This is
309 useful in combination with search-oriented plugins such as Gmail and
310 the Search:: plugins. Defaults to 0.
311
312 =item force_upgrade
313
314 Even if entry body already contains HTML, this config forces the
315 plugin to upgrade the body. Defaults to 0.
316
317 =back
318
319 =head1 WRITING CUSTOM FULLTEXT HANDLER
320
321 (To be documented)
322
323 =head1 AUTHOR
324
325 Tatsuhiko Miyagawa
326
327 =head1 SEE ALSO
328
329 L<Plagger>
Note: See TracBrowser for help on using the browser.