root/trunk/plagger/lib/Plagger/Plugin/Filter/EntryFullText.pm

Revision 1805 (checked in by miyagawa, 14 years ago)

EntryFullText: add a log message if there's neither extract nor extract_xpath, to catch typos

Line 
1 package Plagger::Plugin::Filter::EntryFullText;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use DirHandle;
6 use Encode;
7 use File::Spec;
8 use List::Util qw(first);
9 use HTML::ResolveLink;
10 use Plagger::Date; # for metadata in plugins
11 use Plagger::Util qw( decode_content );
12 use Plagger::Plugin::CustomFeed::Simple;
13 use Plagger::UserAgent;
14
# Returns the hook name 'update.entry.fixup', the same hook that
# register() binds to filter().
sub rule_hook {
    return 'update.entry.fixup';
}
16
# Registers this plugin's callbacks with the given context:
#   customfeed.handle  -> handle()
#   update.entry.fixup -> filter()
sub register {
    my ($self, $context) = @_;

    # Kept as an ordered list so the pairs reach register_hook in this order.
    my @hooks = (
        'customfeed.handle'  => \&handle,
        'update.entry.fixup' => \&filter,
    );

    $context->register_hook($self, @hooks);
}
25
# Initializes the plugin instance: runs the parent class's init with the
# same arguments, loads the site handlers via load_plugins(), and stores a
# fresh Plagger::UserAgent in $self->{ua}.
sub init {
    my $self = shift;

    # Base-class setup comes first; remaining arguments are forwarded as-is.
    $self->SUPER::init(@_);

    $self->load_plugins;
    $self->{ua} = Plagger::UserAgent->new;
}
33
# Loads every site handler asset for this plugin.
#
# YAML assets (*.yaml) are handed to load_plugin_yaml() and Perl assets
# (*.pl) to load_plugin_perl(); load_assets() invokes the callback with
# whatever arguments it chooses, which are forwarded unchanged.
sub load_plugins {
    my $self = shift;

    $self->load_assets('*.yaml', sub { $self->load_plugin_yaml(@_) });
    $self->load_assets('*.pl',   sub { $self->load_plugin_perl(@_) });
}
41
# Loads a Perl-based site handler from an asset file and registers an
# instance of it in $self->{plugins}.
#
#   $file - path of the .pl asset to load
#   $base - file name of the asset; with the ".pl" suffix removed it becomes
#           the last component of the handler's class name
#
# If the asset does not begin with its own "package" statement, its code is
# wrapped in a generated package that inherits from
# Plagger::Plugin::Filter::EntryFullText::Site and defines site_name().
#
# Returns the handler instance when the class was already defined, otherwise
# the result of the final push.
sub load_plugin_perl {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load plugin $file");

    (my $pkg = $base) =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::Filter::EntryFullText::Site::$pkg";

    if ($plugin_class->can('new')) {
        Plagger->context->log(warn => "$plugin_class is already defined. skip compiling code");

        # Only compilation is skipped: the handler still has to be
        # registered, otherwise this plugin instance silently loses it.
        my $plugin = $plugin_class->new;
        push @{ $self->{plugins} }, $plugin;
        return $plugin;
    }

    # Open the asset only once we know it has to be compiled.
    open my $fh, '<', $file or Plagger->context->error("$file: $!");
    my $code = join '', <$fh>;
    close $fh;

    unless ($code =~ /^\s*package/s) {
        # Bare handler code: supply the package boilerplate around it.
        $code = join "\n",
            ( "package $plugin_class;",
              "use strict;",
              "use base qw( Plagger::Plugin::Filter::EntryFullText::Site );",
              "sub site_name { '$pkg' }",
              $code,
              "1;" );
    }

    # String eval is intentional here: the asset is handler source code.
    eval $code;
    Plagger->context->error($@) if $@;

    push @{ $self->{plugins} }, $plugin_class->new;
}
72
# Loads site handler definitions from a YAML asset and registers one
# Plagger::Plugin::Filter::EntryFullText::YAML handler per item returned by
# YAML::LoadFile.
#
#   $file - path of the .yaml asset to load
#   $base - file name of the asset; handed to each handler, which reports it
#           as its site_name
sub load_plugin_yaml {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load YAML $file");

    # This file never loads YAML itself; do not rely on another module
    # having pulled it in first.
    require YAML;
    my @data = YAML::LoadFile($file);

    push @{ $self->{plugins} },
        map { Plagger::Plugin::Filter::EntryFullText::YAML->new($_, $base) } @data;
}
82
# Hook callback for 'customfeed.handle'.
#
# Picks the first loaded site handler whose custom_feed_handle() accepts
# $args, copies its follow-link and follow-xpath settings into
# $args->{match} and $args->{xpath}, and delegates to
# Plagger::Plugin::CustomFeed::Simple::aggregate. When no handler accepts
# the feed, nothing is done and a false value is returned.
sub handle {
    my ($self, $context, $args) = @_;

    if (my $handler = first { $_->custom_feed_handle($args) } @{ $self->{plugins} }) {
        $args->{match} = $handler->custom_feed_follow_link;
        $args->{xpath} = $handler->custom_feed_follow_xpath;

        # Borrow CustomFeed::Simple's aggregate as if it were our own method.
        return $self->Plagger::Plugin::CustomFeed::Simple::aggregate($context, $args);
    }
}
93
# Hook callback for 'update.entry.fixup': fetches the entry's permalink and
# upgrades the entry with whatever a site handler extracts from the page.
#
#   $context - used here for logging
#   $args    - hash ref; $args->{entry} is the entry being upgraded. The
#              decoded response is stored in $args->{content} so handlers
#              can read it.
#
# Entries that already have an HTML body are skipped unless a handler's
# handle_force() claims them or the 'force_upgrade' config is set. Entries
# without a permalink, or whose fetch failed, are skipped as well.
#
# Returns 1 when a handler extracted data, or when the raw HTML was stored
# because 'store_html_on_failure' is set. The skip paths use a bare return.
sub filter {
    my($self, $context, $args) = @_;

    my $handler = first { $_->handle_force($args) } @{ $self->{plugins} };
    if ( !$handler && $args->{entry}->body && $args->{entry}->body->is_html && !$self->conf->{force_upgrade} ) {
        $self->log(debug => $args->{entry}->link . " already contains body. Skipped");
        return;
    }

    if (! $args->{entry}->permalink) {
        $self->log(debug => "Entry " . $args->{entry}->title . " doesn't have permalink. Skipped");
        return;
    }

    # NoNetwork: don't connect for 3 hours
    my $res = $self->{ua}->fetch( $args->{entry}->permalink, $self, { NoNetwork => 60 * 60 * 3 } );
    if (!$res->status && $res->is_error) {
        $self->log(debug => "Fetch " . $args->{entry}->permalink . " failed");
        return;
    }

    $args->{content} = decode_content($res);

    # if the request was redirected, set it as permalink
    if ($res->http_response) {
        my $base = $res->http_response->request->uri;
        if ( $base ne $args->{entry}->permalink ) {
            $context->log(info => "rewrite permalink to $base");
            $args->{entry}->permalink($base);
        }
    }

    # use Last-Modified to populate entry date, even if handler doesn't find one
    if ($res->last_modified && !$args->{entry}->date) {
        $args->{entry}->date( Plagger::Date->from_epoch($res->last_modified) );
    }

    # A forcing handler is used exclusively; otherwise every handler is tried.
    my @plugins = $handler ? ($handler) : @{ $self->{plugins} };

    for my $plugin (@plugins) {
        if ( $handler || $plugin->handle($args) ) {
            $context->log(debug => $args->{entry}->permalink . " handled by " . $plugin->site_name);
            my $data = $plugin->extract($args);
               $data = { body => $data } if $data && !ref $data;   # plain string means "body"
            if ($data) {
                $context->log(info => "Extract content succeeded on " . $args->{entry}->permalink);

                # Replace the body only when one was actually extracted. A
                # handler may return just a title or date; in that case the
                # existing body must not be demoted to summary and wiped.
                if (defined $data->{body}) {
                    my $resolver = HTML::ResolveLink->new( base => $args->{entry}->permalink );

                    # if body was already there, set that to summary
                    if ($args->{entry}->body) {
                        $args->{entry}->summary($args->{entry}->body);
                    }

                    $data->{body} = $resolver->resolve( $data->{body} );
                    $args->{entry}->body($data->{body});
                }

                $args->{entry}->title($data->{title}) if $data->{title};
                $args->{entry}->author($data->{author}) if $data->{author};
                $args->{entry}->icon({ url => $data->{icon} }) if $data->{icon};
                $args->{entry}->summary($data->{summary}) if $data->{summary};

                # extract date using found one
                if ($data->{date}) {
                    $args->{entry}->date($data->{date});
                }

                return 1;
            }
        }
    }

    # failed to extract: store whole HTML if the config is on
    if ($self->conf->{store_html_on_failure}) {
        $args->{entry}->body($args->{content});
        return 1;
    }

    $context->log(warn => "Extract content failed on " . $args->{entry}->permalink);
}
172
173
# Base class for site handlers. Every predicate declines (returns 0) and the
# custom-feed settings are empty, so a subclass only overrides what it needs.
package Plagger::Plugin::Filter::EntryFullText::Site;

# Constructs an empty handler object.
sub new {
    my $class = shift;
    return bless {}, $class;
}

# Predicates: never match by default.
sub custom_feed_handle { return 0 }
sub handle_force       { return 0 }
sub handle             { return 0 }

# Custom-feed settings: nothing by default.
sub custom_feed_follow_link  { return }
sub custom_feed_follow_xpath { return }
181
182 package Plagger::Plugin::Filter::EntryFullText::YAML;
183 use Encode;
184 use List::Util qw(first);
185
# Builds a YAML-backed site handler.
#
#   $data - hash ref with the handler definition
#   $base - stored as 'base'; site_name() reports it
#
# The definition is normalised on a copy, so the caller's hash is left
# untouched:
#   * custom_feed_handle / handle / handle_force patterns that start with
#     http:// or https:// are anchored with a leading '^';
#   * extract and extract_date_format (a string or an array ref of strings)
#     are decoded as UTF-8.
sub new {
    my($class, $data, $base) = @_;

    # Shallow copy: values are only ever replaced below, never modified in place.
    my %conf = %$data;

    # add ^ if handle method starts with http://
    for my $key ( qw(custom_feed_handle handle handle_force) ) {
        next unless defined $conf{$key};
        $conf{$key} = "^$conf{$key}" if $conf{$key} =~ m!^https?://!;
    }

    # decode as UTF-8
    for my $key ( qw(extract extract_date_format) ) {
        next unless defined $conf{$key};
        if (ref($conf{$key}) eq 'ARRAY') {
            $conf{$key} = [ map { decode("UTF-8", $_) } @{ $conf{$key} } ];
        } else {
            $conf{$key} = decode("UTF-8", $conf{$key});
        }
    }

    return bless { %conf, base => $base }, $class;
}
207
# Returns the 'base' value the handler was constructed with.
sub site_name {
    my ($self) = @_;
    return $self->{base};
}
212
# Tests the feed URL ($args->{feed}->url) against the handler's
# 'custom_feed_handle' pattern. Returns 0 when no pattern is configured,
# otherwise the result of the regex match.
sub custom_feed_handle {
    my ($self, $args) = @_;

    return 0 unless $self->{custom_feed_handle};
    return $args->{feed}->url =~ /$self->{custom_feed_handle}/;
}
218
# Accessor for the handler's 'custom_feed_follow_link' setting.
sub custom_feed_follow_link {
    my ($self) = @_;
    return $self->{custom_feed_follow_link};
}
222
# Accessor for the handler's 'custom_feed_follow_xpath' setting.
sub custom_feed_follow_xpath {
    my ($self) = @_;
    return $self->{custom_feed_follow_xpath};
}
226
# Tests the entry permalink against the handler's 'handle_force' pattern.
# Returns 0 when no pattern is configured, otherwise the result of the
# regex match.
sub handle_force {
    my ($self, $args) = @_;

    return 0 unless $self->{handle_force};
    return $args->{entry}->permalink =~ /$self->{handle_force}/;
}
232
# Tests the entry permalink against the handler's 'handle' pattern.
# Returns 0 when no pattern is configured, otherwise the result of the
# regex match.
sub handle {
    my ($self, $args) = @_;

    return 0 unless $self->{handle};
    return $args->{entry}->permalink =~ /$self->{handle}/;
}
238
# Escapes every argument in place with Plagger::Util::encode_xml. The
# elements of @_ alias the caller's variables, so the caller sees the
# escaped values. extract() temporarily installs this sub as
# HTML::Element::_xml_escape.
sub xml_escape {
    $_ = Plagger::Util::encode_xml($_) for @_;
}
244
# Extracts entry data from the fetched page in $args->{content}.
#
# Sources, applied in this order (later ones may overwrite keys):
#   extract       - regex matched with /s; its captures are assigned to the
#                   whitespace-separated names in 'extract_capture'
#                   (default: a single 'body')
#   extract_xpath - hash of name => XPath; the first matching node is stored
#                   serialised as XML
#
# When something was found, the optional 'extract_after_hook' code runs and
# a 'date' value is parsed, using 'extract_date_format' (a format or a list
# of formats) and 'extract_date_timezone' if given.
#
# Returns a hash ref of the extracted fields, or nothing when the handler
# has no extraction rule, HTML::TreeBuilder::XPath is missing, or nothing
# matched.
sub extract {
    my($self, $args) = @_;
    my $data;

    unless ($self->{extract} || $self->{extract_xpath}) {
        Plagger->context->log(error => "YAML doesn't have either 'extract' nor 'extract_xpath'");
        return;
    }

    if ($self->{extract}) {
        if (my @match = $args->{content} =~ /$self->{extract}/s) {
            # split ' ' skips leading whitespace, so a padded capture list
            # cannot yield an empty field name.
            my @capture = defined $self->{extract_capture}
                ? split(' ', $self->{extract_capture})
                : ();
            @capture = ('body') unless @capture;
            @{$data}{@capture} = @match;
        }
    }

    if ($self->{extract_xpath}) {
        eval { require HTML::TreeBuilder::XPath };
        if ($@) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($args->{content});
        $tree->eof;

        for my $capture (keys %{$self->{extract_xpath}}) {
            my @children = $tree->findnodes($self->{extract_xpath}->{$capture});
            if (@children) {
                # Serialise using our own escaping for the duration of as_XML.
                no warnings 'redefine';
                local *HTML::Element::_xml_escape = \&xml_escape;
                $data->{$capture} = $children[0]->as_XML;
            } else {
                Plagger->context->log(error => "Can't find node matching $self->{extract_xpath}->{$capture}");
            }
        }

        # Everything needed has been copied out as strings; free the tree
        # explicitly, since element trees may hold circular references.
        $tree->delete;
    }

    if ($data) {
        if ($self->{extract_after_hook}) {
            # The hook is string-evaluated in this scope and can therefore
            # see $self, $args and $data.
            # NOTE(review): this executes code taken from the handler
            # definition - only load definitions from trusted sources.
            eval $self->{extract_after_hook};
            Plagger->context->error($@) if $@;
        }

        if ($data->{date}) {
            if (my $format = $self->{extract_date_format}) {
                $format = [ $format ] unless ref $format;

                # Use the first format that parses. Checking each result
                # works whether a failed parse yields undef or nothing.
                my $parsed;
                for my $pattern (@$format) {
                    $parsed = Plagger::Date->strptime($pattern, $data->{date});
                    last if $parsed;
                }
                $data->{date} = $parsed;

                if ($data->{date} && $self->{extract_date_timezone}) {
                    $data->{date}->set_time_zone($self->{extract_date_timezone});
                }
            } else {
                $data->{date} = Plagger::Date->parse_dwim($data->{date});
            }
        }

        return $data;
    }

    return;
}
306
307 1;
308
309 __END__
310
311 =head1 NAME
312
313 Plagger::Plugin::Filter::EntryFullText - Upgrade your feeds to fulltext class
314
315 =head1 SYNOPSIS
316
317   - module: Filter::EntryFullText
318
319 =head1 DESCRIPTION
320
321 This plugin allows you to fetch entry full text by doing HTTP GET and
322 apply regexp to HTML. It's just like upgrading your flight ticket from
323 economy class to business class!
324
325 You can write custom fulltext handler by putting C<.pl> or C<.yaml>
326 files under assets plugin directory.
327
328 =head1 CONFIG
329
330 =over 4
331
332 =item store_html_on_failure
333
334 Even if fulltext handlers fail to extract the content body from HTML,
335 this option stores the whole HTML document as the entry body. This is
336 useful in combination with search engines like Gmail and the Search::
337 plugins. Defaults to 0.
338
339 =item force_upgrade
340
341 Even if entry body already contains HTML, this config forces the
342 plugin to upgrade the body. Defaults to 0.
343
344 =back
345
346 =head1 WRITING CUSTOM FULLTEXT HANDLER
347
348 (To be documented)
349
350 =head1 AUTHOR
351
352 Tatsuhiko Miyagawa
353
354 =head1 SEE ALSO
355
356 L<Plagger>
Note: See TracBrowser for help on using the browser.