root/trunk/plagger/lib/Plagger/Plugin/Filter/EntryFullText.pm

Revision 1342 (checked in by youpy, 14 years ago)
  • added test for EFT with XPath
  • Filter::EntryFullText?: fixed bug when YAML has no extract regexp
Line 
1 package Plagger::Plugin::Filter::EntryFullText;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use DirHandle;
6 use Encode;
7 use File::Spec;
8 use List::Util qw(first);
9 use HTML::ResolveLink;
10 use Plagger::Date; # for metadata in plugins
11 use Plagger::Util qw( decode_content );
12 use Plagger::Plugin::CustomFeed::Simple;
13 use Plagger::UserAgent;
14
# Returns the hook name 'update.entry.fixup' -- the same hook that register()
# attaches filter() to.
sub rule_hook {
    return 'update.entry.fixup';
}
16
# Registers this plugin's callbacks on the given context:
#   'customfeed.handle'  -> handle()
#   'update.entry.fixup' -> filter()
sub register {
    my ($self, $context) = @_;

    # Kept as an ordered list so the hooks are passed in a fixed order.
    my @hooks = (
        'customfeed.handle'  => \&handle,
        'update.entry.fixup' => \&filter,
    );

    return $context->register_hook($self, @hooks);
}
25
# Initialises the plugin: runs the parent class init with the same arguments,
# loads the site handlers through load_plugins(), then creates the
# Plagger::UserAgent stored in $self->{ua} and calls parse_head(0) on it.
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);
    $self->load_plugins;

    my $ua = $self->{ua} = Plagger::UserAgent->new;
    $ua->parse_head(0);
}
34
# Walks the plugin's assets directory in sorted name order and hands every
# regular file whose path ends in .pl or .yaml to load_plugin(), passing the
# full path and the bare file name. If the directory cannot be opened the
# failure is reported through the context's error().
sub load_plugins {
    my $self    = shift;
    my $context = Plagger->context;

    my $dir = $self->assets_dir;
    my $dh  = DirHandle->new($dir) or $context->error("$dir: $!");

    for my $name (sort $dh->read) {
        my $path = File::Spec->catfile($dir, $name);

        next unless -f $path;
        next unless $path =~ /\.(?:pl|yaml)$/;

        $self->load_plugin($path, $name);
    }
}
46
# Loads one handler file and appends whatever the loader returns to
# $self->{plugins}. Files ending in .pl go through load_plugin_perl();
# everything else goes through load_plugin_yaml().
#
#   $file - full path of the handler file
#   $base - bare file name of the handler file
sub load_plugin {
    my ($self, $file, $base) = @_;

    Plagger->context->log(debug => "loading $file");

    my $loader = 'load_plugin_yaml';
    $loader = 'load_plugin_perl' if $file =~ /\.pl$/;

    # A loader may return several handlers (list context).
    my @loaded = $self->$loader($file, $base);
    push @{ $self->{plugins} }, @loaded;
}
55
# Compiles a Perl site handler and returns a new instance of it.
#
#   $file - full path of the .pl file
#   $base - bare file name; with ".pl" stripped it becomes the last part of
#           the handler class name
#           (Plagger::Plugin::Filter::EntryFullText::Site::<name>)
#
# If the handler class already has a new() method the file is not read or
# compiled again; a warning is logged and a fresh instance is returned.
#
# Code that does not start with a "package" statement is wrapped so that it
# lives in the handler class, inherits from ...::Site and gets a site_name()
# returning <name>. Open failures and compile errors are reported through the
# context's error().
sub load_plugin_perl {
    my ($self, $file, $base) = @_;

    (my $pkg = $base) =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::Filter::EntryFullText::Site::$pkg";

    # Check before opening the file so no handle is left open on this path.
    if ($plugin_class->can('new')) {
        Plagger->context->log(warn => "$plugin_class is already defined. skip compiling code");
        return $plugin_class->new;
    }

    # Three-argument open: the file name can never be read as a mode or a pipe.
    open my $fh, '<', $file or Plagger->context->error("$file: $!");
    my $code = join '', <$fh>;
    close $fh;

    unless ($code =~ /^\s*package/s) {
        $code = join "\n",
            ( "package $plugin_class;",
              "use strict;",
              "use base qw( Plagger::Plugin::Filter::EntryFullText::Site );",
              "sub site_name { '$pkg' }",
              $code,
              "1;" );
    }

    # NOTE(review): string eval of the asset file is the plugin mechanism
    # itself; the assets directory must contain trusted code only.
    eval $code;
    Plagger->context->error($@) if $@;

    return $plugin_class->new;
}
84
# Loads a YAML handler file and returns one
# Plagger::Plugin::Filter::EntryFullText::YAML object per YAML document
# found in it.
#
#   $file - full path of the .yaml file
#   $base - bare file name, passed to each handler's constructor
sub load_plugin_yaml {
    my ($self, $file, $base) = @_;

    # Load YAML here rather than relying on some other module having done so.
    require YAML;
    my @documents = YAML::LoadFile($file);

    return map { Plagger::Plugin::Filter::EntryFullText::YAML->new($_, $base) }
        @documents;
}
92
# Hook callback registered for 'customfeed.handle'. Finds the first loaded
# handler whose custom_feed_handle() accepts $args, stores that handler's
# custom_feed_follow_link value in $args->{match}, and delegates to
# Plagger::Plugin::CustomFeed::Simple's aggregate(). Returns nothing when no
# handler accepts the feed.
sub handle {
    my ($self, $context, $args) = @_;

    for my $candidate (@{ $self->{plugins} }) {
        next unless $candidate->custom_feed_handle($args);

        $args->{match} = $candidate->custom_feed_follow_link;
        return $self->Plagger::Plugin::CustomFeed::Simple::aggregate($context, $args);
    }

    return;
}
102
# Hook callback registered for 'update.entry.fixup'. Fetches the entry's
# permalink and lets a site handler extract the full text from the page.
#
#   $args->{entry}   - the entry to upgrade
#   $args->{content} - set here to the decoded page so handlers can read it
#
# Flow:
#   1. Skip entries whose body already contains an HTML tag, unless a handler's
#      handle_force() matches or the force_upgrade config is set.
#   2. Skip entries without a permalink.
#   3. Fetch the permalink; on a redirect the final URL becomes the permalink,
#      and Last-Modified fills the entry date if it has none.
#   4. Ask the forced handler, or every loaded handler in turn, to extract
#      data; the first non-empty result updates body/title/icon/date.
#   5. If nothing was extracted, store the raw HTML when
#      store_html_on_failure is set, otherwise log a warning.
#
# Returns 1 when the entry was updated.
sub filter {
    my ($self, $context, $args) = @_;
    my $entry = $args->{entry};

    my $handler = first { $_->handle_force($args) } @{ $self->{plugins} };
    if ( !$handler && $entry->body && $entry->body =~ /<\w+>/ && !$self->conf->{force_upgrade} ) {
        $self->log(debug => $entry->link . " already contains body. Skipped");
        return;
    }

    if (! $entry->permalink) {
        $self->log(debug => "Entry " . $entry->title . " doesn't have permalink. Skipped");
        return;
    }

    # NoNetwork: don't connect for 3 hours
    my $res = $self->{ua}->fetch( $entry->permalink, $self, { NoNetwork => 60 * 60 * 3 } );
    if (!$res->status && $res->is_error) {
        # Leave a trace instead of giving up silently.
        $self->log(debug => "Fetch " . $entry->permalink . " failed. Skipped");
        return;
    }

    $args->{content} = decode_content($res);

    # if the request was redirected, set it as permalink
    if ($res->http_response) {
        my $base = $res->http_response->request->uri;
        if ( $base ne $entry->permalink ) {
            $context->log(info => "rewrite permalink to $base");
            $entry->permalink($base);
        }
    }

    # use Last-Modified to populate entry date, even if handler doesn't find one
    if ($res->last_modified && !$entry->date) {
        $entry->date( Plagger::Date->from_epoch($res->last_modified) );
    }

    # A forced handler is the only candidate and is used without asking its
    # handle(); otherwise every handler whose handle() matches gets a try.
    my @plugins = $handler ? ($handler) : @{ $self->{plugins} };

    for my $plugin (@plugins) {
        next unless $handler || $plugin->handle($args);

        $context->log(debug => $entry->permalink . " handled by " . $plugin->site_name);

        # extract() may return a plain string (the body) or a hash ref.
        my $data = $plugin->extract($args);
        $data = { body => $data } if $data && !ref $data;
        next unless $data;

        $context->log(info => "Extract content succeeded on " . $entry->permalink);

        # Only touch the body when the handler actually produced one, so a
        # handler that extracts just a title or date cannot wipe the body.
        if (defined $data->{body}) {
            my $resolver = HTML::ResolveLink->new( base => $entry->permalink );
            $data->{body} = $resolver->resolve( $data->{body} );
            $entry->body($data->{body});
        }

        $entry->title($data->{title}) if $data->{title};
        $entry->icon({ url => $data->{icon} }) if $data->{icon};

        # extract date using found one
        $entry->date($data->{date}) if $data->{date};

        return 1;
    }

    # failed to extract: store whole HTML if the config is on
    if ($self->conf->{store_html_on_failure}) {
        $entry->body($args->{content});
        return 1;
    }

    $context->log(warn => "Extract content failed on " . $entry->permalink);
}
170
171
package Plagger::Plugin::Filter::EntryFullText::Site;

# Base class for site handlers. Every method the main plugin calls on a
# handler has a "does nothing" default here, so a handler only overrides
# what it needs.

# Constructor: an empty blessed hash.
sub new { return bless {}, shift }

# Default: never claims a feed for custom feed handling.
sub custom_feed_handle { 0 }

# Default: no follow-link rule.
sub custom_feed_follow_link { return }

# Default: never forces handling of an entry.
sub handle_force { 0 }

# Default: never handles an entry.
sub handle { 0 }

# Default name used in log messages: the handler's class name.
sub site_name { return ref($_[0]) || $_[0] }

# Default: extracts nothing, which the caller treats as "no data".
sub extract { return }
178
179 package Plagger::Plugin::Filter::EntryFullText::YAML;
180 use Encode;
181 use List::Util qw(first);
182
# Builds a YAML-backed handler from one loaded YAML document.
#
#   $data - hash ref of handler settings (modified in place)
#   $base - file name the document came from; stored under the "base" key
#
# Match rules that start with http:// or https:// are anchored with a leading
# "^". The "extract" and "extract_date_format" values (a string or an array of
# strings) are decoded from UTF-8.
sub new {
    my ($class, $data, $base) = @_;

    # add ^ if handle method starts with http://
    for my $rule (grep { defined $data->{$_} } qw(custom_feed_handle handle handle_force)) {
        next unless $data->{$rule} =~ m!^https?://!;
        $data->{$rule} = '^' . $data->{$rule};
    }

    # decode as UTF-8
    for my $field (grep { defined $data->{$_} } qw(extract extract_date_format)) {
        my $value = $data->{$field};
        $data->{$field} = ref($value) eq 'ARRAY'
            ? [ map { decode('UTF-8', $_) } @$value ]
            : decode('UTF-8', $value);
    }

    return bless { %$data, base => $base }, $class;
}
204
# Name of this handler: the file name it was loaded from (the "base" value
# given to new()).
sub site_name {
    my ($self) = @_;
    return $self->{base};
}
209
# True when the handler has a custom_feed_handle pattern and the feed URL
# ($args->{feed}->url) matches it; 0 when no pattern is configured.
sub custom_feed_handle {
    my ($self, $args) = @_;

    my $pattern = $self->{custom_feed_handle};
    return 0 unless $pattern;

    return $args->{feed}->url =~ /$pattern/;
}
215
# Returns the handler's configured custom_feed_follow_link value as is.
sub custom_feed_follow_link {
    my ($self) = @_;
    return $self->{custom_feed_follow_link};
}
219
# True when the handler has a handle_force pattern and the entry permalink
# matches it; 0 when no pattern is configured.
sub handle_force {
    my ($self, $args) = @_;

    my $pattern = $self->{handle_force};
    return 0 unless $pattern;

    return $args->{entry}->permalink =~ /$pattern/;
}
225
# True when the handler has a handle pattern and the entry permalink matches
# it; 0 when no pattern is configured.
sub handle {
    my ($self, $args) = @_;

    my $pattern = $self->{handle};
    return 0 unless $pattern;

    return $args->{entry}->permalink =~ /$pattern/;
}
231
# Extracts data from the fetched page in $args->{content}.
#
# Two mechanisms, both optional and combinable:
#   extract + extract_capture - "extract" is a regexp matched with /s; its
#       capture groups are stored under the whitespace-separated names listed
#       in "extract_capture".
#   extract_xpath - hash of name => XPath; the HTML of the first matching node
#       is stored under each name. Needs HTML::TreeBuilder::XPath.
#
# When something was extracted, the optional "extract_after_hook" code is run
# and a found "date" is turned into a date object, either with
# "extract_date_format" (one format or a list, first one that parses wins;
# "extract_date_timezone" is then applied) or with Plagger::Date->parse_dwim.
#
# Returns the hash ref of extracted values, or nothing when nothing was found.
#
# NOTE: the lexicals $self, $args and $data are visible to the
# extract_after_hook code, so they must keep these names.
sub extract {
    my($self, $args) = @_;
    my $data;

    if ($self->{extract}) {
        if (my @match = $args->{content} =~ /$self->{extract}/s) {
            # split ' ' ignores leading whitespace, so a padded or missing
            # extract_capture never produces an empty key, and no empty hash
            # is created when there are no names to fill.
            my @capture = split ' ',
                defined $self->{extract_capture} ? $self->{extract_capture} : '';
            @{$data}{@capture} = @match if @capture;
        }
    }

    if ($self->{extract_xpath}) {
        unless (eval { require HTML::TreeBuilder::XPath; 1 }) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($args->{content});
        $tree->eof;

        for my $capture (keys %{$self->{extract_xpath}}) {
            my ($node) = $tree->findnodes($self->{extract_xpath}->{$capture});

            # An expression that matches nothing just leaves its key unset.
            next unless defined $node;
            $data->{$capture} = $node->as_HTML;
        }

        # Release the parsed tree explicitly; the strings extracted above
        # do not depend on it.
        $tree->delete;
    }

    return unless $data;

    if ($self->{extract_after_hook}) {
        # NOTE(review): string eval of code taken from the handler's YAML
        # file; the assets directory must contain trusted files only.
        eval $self->{extract_after_hook};
        Plagger->context->error($@) if $@;
    }

    if ($data->{date}) {
        if (my $format = $self->{extract_date_format}) {
            $format = [ $format ] unless ref $format;

            # Try each format in order and stop at the first that parses.
            my $parsed;
            for my $pattern (@$format) {
                $parsed = Plagger::Date->strptime($pattern, $data->{date});
                last if $parsed;
            }
            $data->{date} = $parsed;

            if ($data->{date} && $self->{extract_date_timezone}) {
                $data->{date}->set_time_zone($self->{extract_date_timezone});
            }
        } else {
            $data->{date} = Plagger::Date->parse_dwim($data->{date});
        }
    }

    return $data;
}
281
282 1;
283
284 __END__
285
286 =head1 NAME
287
288 Plagger::Plugin::Filter::EntryFullText - Upgrade your feeds to fulltext class
289
290 =head1 SYNOPSIS
291
292   - module: Filter::EntryFullText
293
294 =head1 DESCRIPTION
295
296 This plugin fetches the full text of each entry with an HTTP GET and
297 extracts it from the HTML with a regexp or an XPath expression. It's just
298 like upgrading your flight ticket from economy class to business class!
299
300 You can write a custom fulltext handler by putting C<.pl> or C<.yaml>
301 files under the plugin's assets directory.
302
303 =head1 CONFIG
304
305 =over 4
306
307 =item store_html_on_failure
308
309 Even if the fulltext handlers fail to extract the content body from the
310 HTML, this option stores the whole HTML document as the entry body. This
311 is useful in combination with search engines such as Gmail and the
312 Search:: plugins. Defaults to 0.
313
314 =item force_upgrade
315
316 Even if entry body already contains HTML, this config forces the
317 plugin to upgrade the body. Defaults to 0.
318
319 =back
320
321 =head1 WRITING CUSTOM FULLTEXT HANDLER
322
323 (To be documented)
324
325 =head1 AUTHOR
326
327 Tatsuhiko Miyagawa
328
329 =head1 SEE ALSO
330
331 L<Plagger>
Note: See TracBrowser for help on using the browser.