root/trunk/plagger/lib/Plagger/Plugin/Filter/EntryFullText.pm

Revision 1348 (checked in by miyagawa, 14 years ago)

skip pitchforkmedia test for now. it causes test failures without any rational reason

Line 
1 package Plagger::Plugin::Filter::EntryFullText;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use DirHandle;
6 use Encode;
7 use File::Spec;
8 use List::Util qw(first);
9 use HTML::ResolveLink;
10 use Plagger::Date; # for metadata in plugins
11 use Plagger::Util qw( decode_content );
12 use Plagger::Plugin::CustomFeed::Simple;
13 use Plagger::UserAgent;
14
# Returns the name of the hook this plugin reports as its rule hook:
# 'update.entry.fixup', the same hook filter() is registered on.
sub rule_hook {
    return 'update.entry.fixup';
}
16
# Registers this plugin's callbacks with the given context:
#   customfeed.handle   -> handle()
#   update.entry.fixup  -> filter()
sub register {
    my($self, $context) = @_;

    # Kept as an ordered list (not a hash) so the hooks are always
    # handed to register_hook in the same order.
    my @hooks = (
        'customfeed.handle'  => \&handle,
        'update.entry.fixup' => \&filter,
    );

    $context->register_hook($self, @hooks);
}
25
# Plugin initialisation: runs the parent class's init with the same
# arguments, loads every site handler via load_plugins(), and creates the
# user agent that filter() later uses to fetch permalinks.
sub init {
    my($self, @args) = @_;

    $self->SUPER::init(@args);
    $self->load_plugins();

    my $ua = Plagger::UserAgent->new;
    $self->{ua} = $ua;
    $ua->parse_head(0);
}
34
# Scans the plugin's assets directory and loads every regular file whose
# name ends in .pl or .yaml, in sorted file name order, by calling
# load_plugin($path, $file_name) for each of them.
# A directory that cannot be opened is reported through the context's
# error() method.
sub load_plugins {
    my $self = shift;
    my $context = Plagger->context;

    my $dir = $self->assets_dir;
    my $dh = DirHandle->new($dir) or $context->error("$dir: $!");

    # Collect all candidates first, then load them, so that the directory
    # is fully inspected before any handler code runs.
    my @assets;
    for my $name (sort $dh->read) {
        my $path = File::Spec->catfile($dir, $name);
        next unless -f $path && $path =~ /\.(?:pl|yaml)$/;
        push @assets, [ $path, $name ];
    }

    for my $asset (@assets) {
        $self->load_plugin(@$asset);
    }
}
46
# Loads one site handler file and appends the resulting handler object(s)
# to $self->{plugins}.
#   $file - full path of the handler file
#   $base - file name without directory
# Files ending in .pl go through load_plugin_perl, everything else through
# load_plugin_yaml.
sub load_plugin {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "loading $file");

    my @loaded = $file =~ /\.pl$/
        ? $self->load_plugin_perl($file, $base)
        : $self->load_plugin_yaml($file, $base);

    push @{ $self->{plugins} }, @loaded;
}
55
# Loads a site handler written in Perl.
#   $file - full path of the .pl file
#   $base - file name without directory; with the .pl suffix removed it
#           becomes the last component of the handler's package name
# If the file does not start with its own "package" statement, the code is
# wrapped in a package under Plagger::Plugin::Filter::EntryFullText::Site::
# that inherits from the ::Site base class and defines site_name().
# Returns a new instance of the handler class. Open and compile failures
# are reported through the context's error() method.
sub load_plugin_perl {
    my($self, $file, $base) = @_;

    # Three-argument open: the file name is taken literally, so names with
    # leading/trailing whitespace or mode characters ('<', '>', '|') can
    # neither be misread nor turn the open into a write or a pipe.
    open my $fh, '<', $file or Plagger->context->error("$file: $!");
    (my $pkg = $base) =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::Filter::EntryFullText::Site::$pkg";

    if ($plugin_class->can('new')) {
        Plagger->context->log(warn => "$plugin_class is already defined. skip compiling code");
        return $plugin_class->new;
    }

    my $code = join '', <$fh>;
    close $fh;

    unless ($code =~ /^\s*package/s) {
        $code = join "\n",
            ( "package $plugin_class;",
              "use strict;",
              "use base qw( Plagger::Plugin::Filter::EntryFullText::Site );",
              "sub site_name { '$pkg' }",
              $code,
              "1;" );
    }

    # The handler file is executed as Perl code; assets must be trusted.
    eval $code;
    Plagger->context->error($@) if $@;

    return $plugin_class->new;
}
84
# Loads a site handler definition written in YAML.
#   $file - full path of the .yaml file
#   $base - file name without directory, used as the handler's site name
# Returns one Plagger::Plugin::Filter::EntryFullText::YAML object per YAML
# document found in the file.
sub load_plugin_yaml {
    my($self, $file, $base) = @_;

    # YAML::LoadFile is called by its fully qualified name, so load the
    # module here instead of relying on some other module having done so.
    require YAML;

    my @data = YAML::LoadFile($file);

    return map { Plagger::Plugin::Filter::EntryFullText::YAML->new($_, $base) }
        @data;
}
92
# Hook callback for 'customfeed.handle'. Looks for the first loaded site
# handler whose custom_feed_handle() accepts $args; when one is found, its
# follow-link setting is stored in $args->{match} and the work is delegated
# to Plagger::Plugin::CustomFeed::Simple's aggregate(), whose result is
# returned. Without a matching handler nothing is done and a false value
# is returned.
sub handle {
    my($self, $context, $args) = @_;

    my $site = first { $_->custom_feed_handle($args) } @{ $self->{plugins} };
    if ($site) {
        my $follow_link = $site->custom_feed_follow_link;
        $args->{match} = $follow_link;
        return $self->Plagger::Plugin::CustomFeed::Simple::aggregate($context, $args);
    }
}
102
# Hook callback for 'update.entry.fixup'. Fetches the page behind the
# entry's permalink and lets the loaded site handlers extract the full
# text (and optionally title, icon and date) from it.
#
# Flow:
#   1. Unless a handler's handle_force() matches, or the force_upgrade
#      config is set, entries whose body already contains a tag are skipped.
#   2. Entries without a permalink are skipped.
#   3. The permalink is fetched; the decoded page goes to $args->{content}.
#   4. A redirected request updates the entry's permalink, and a
#      Last-Modified value fills in a missing entry date.
#   5. The forced handler, or otherwise every handler whose handle()
#      matches, is asked to extract(); the first successful result is
#      applied to the entry.
#   6. If nothing could be extracted and store_html_on_failure is set, the
#      whole page becomes the entry body.
#
# Returns 1 when the entry body was replaced, and an empty return when the
# entry was skipped or the fetch failed.
sub filter {
    my($self, $context, $args) = @_;

    my $forced = first { $_->handle_force($args) } @{ $self->{plugins} };

    unless ($forced) {
        my $has_markup = $args->{entry}->body && $args->{entry}->body =~ /<\w+>/;
        if ( $has_markup && !$self->conf->{force_upgrade} ) {
            $self->log(debug => $args->{entry}->link . " already contains body. Skipped");
            return;
        }
    }

    unless ($args->{entry}->permalink) {
        $self->log(debug => "Entry " . $args->{entry}->title . " doesn't have permalink. Skipped");
        return;
    }

    # NoNetwork: don't connect for 3 hours
    my %fetch_opt = ( NoNetwork => 60 * 60 * 3 );
    my $res = $self->{ua}->fetch( $args->{entry}->permalink, $self, \%fetch_opt );
    if (!$res->status && $res->is_error) {
        $self->log(debug => "Fetch " . $args->{entry}->permalink . " failed");
        return;
    }

    $args->{content} = decode_content($res);

    # if the request was redirected, set it as permalink
    if ($res->http_response) {
        my $final_uri = $res->http_response->request->uri;
        if ( $final_uri ne $args->{entry}->permalink ) {
            $context->log(info => "rewrite permalink to $final_uri");
            $args->{entry}->permalink($final_uri);
        }
    }

    # use Last-Modified to populate entry date, even if handler doesn't find one
    if ($res->last_modified && !$args->{entry}->date) {
        $args->{entry}->date( Plagger::Date->from_epoch($res->last_modified) );
    }

    # A forced handler is the only candidate and is not asked to handle()
    # again; otherwise every loaded handler gets a chance.
    my @candidates = $forced ? ($forced) : @{ $self->{plugins} };

    for my $site (@candidates) {
        next unless $forced || $site->handle($args);

        $context->log(debug => $args->{entry}->permalink . " handled by " . $site->site_name);

        # A handler may return a plain string (taken as the body) or a hash
        # reference with body / title / icon / date keys.
        my $data = $site->extract($args);
           $data = { body => $data } if $data && !ref $data;
        next unless $data;

        $context->log(info => "Extract content succeeded on " . $args->{entry}->permalink);

        # Resolve links in the extracted body against the permalink.
        my $resolver = HTML::ResolveLink->new( base => $args->{entry}->permalink );
        $data->{body} = $resolver->resolve( $data->{body} );

        $args->{entry}->body($data->{body});
        $args->{entry}->title($data->{title}) if $data->{title};
        $args->{entry}->icon({ url => $data->{icon} }) if $data->{icon};

        # extract date using found one
        $args->{entry}->date($data->{date}) if $data->{date};

        return 1;
    }

    # failed to extract: store whole HTML if the config is on
    if ($self->conf->{store_html_on_failure}) {
        $args->{entry}->body($args->{content});
        return 1;
    }

    $context->log(warn => "Extract content failed on " . $args->{entry}->permalink);
}
173
174
package Plagger::Plugin::Filter::EntryFullText::Site;

# Base class for site handlers. Every predicate defaults to "does not
# apply"; concrete handlers override the ones they need.

# Constructor: returns an empty object blessed into the invoking class.
sub new {
    my $class = shift;
    return bless {}, $class;
}

# Default: this handler does not take over any custom feed.
sub custom_feed_handle { return 0 }

# Default: no follow-link setting (empty return).
sub custom_feed_follow_link { return }

# Default: never forces handling of an entry.
sub handle_force { return 0 }

# Default: does not handle any entry.
sub handle { return 0 }
181
182 package Plagger::Plugin::Filter::EntryFullText::YAML;
183 use Encode;
184 use List::Util qw(first);
185
# Constructor for a YAML-defined site handler.
#   $data - hash reference holding one YAML document of the definition
#   $base - file name of the definition, stored under the 'base' key
# The matching patterns are anchored when they start with a URL scheme,
# and the extraction settings are decoded from UTF-8. Both changes are
# made to $data itself before it is copied into the new object.
sub new {
    my($class, $data, $base) = @_;

    # add ^ if handle method starts with http://
    my @url_patterns = grep { defined $data->{$_} }
        qw(custom_feed_handle handle handle_force);
    for my $key (@url_patterns) {
        if ($data->{$key} =~ m!^https?://!) {
            $data->{$key} = "^$data->{$key}";
        }
    }

    # decode as UTF-8
    my @text_keys = grep { defined $data->{$_} }
        qw(extract extract_date_format);
    for my $key (@text_keys) {
        my $value = $data->{$key};
        if (ref $value && ref $value eq 'ARRAY') {
            $data->{$key} = [ map decode("UTF-8", $_), @$value ];
        }
        else {
            $data->{$key} = decode("UTF-8", $value);
        }
    }

    my %object = ( %$data, base => $base );
    return bless \%object, $class;
}
207
# Name of this handler: the definition's file name given to new().
sub site_name {
    return $_[0]{base};
}
212
# True when the feed URL in $args->{feed} matches this definition's
# custom_feed_handle pattern; 0 when no such pattern is configured.
sub custom_feed_handle {
    my($self, $args) = @_;

    my $pattern = $self->{custom_feed_handle} or return 0;
    return $args->{feed}->url =~ /$pattern/;
}
218
# Returns the definition's custom_feed_follow_link setting as is.
sub custom_feed_follow_link {
    my $self = shift;
    return $self->{custom_feed_follow_link};
}
222
# True when the entry's permalink matches this definition's handle_force
# pattern; 0 when no such pattern is configured.
sub handle_force {
    my($self, $args) = @_;

    my $pattern = $self->{handle_force} or return 0;
    return $args->{entry}->permalink =~ /$pattern/;
}
228
# True when the entry's permalink matches this definition's handle
# pattern; 0 when no such pattern is configured.
sub handle {
    my($self, $args) = @_;

    my $pattern = $self->{handle} or return 0;
    return $args->{entry}->permalink =~ /$pattern/;
}
234
# Extracts data from the fetched page in $args->{content} according to this
# site definition.
#
#   extract / extract_capture - a regexp applied to the whole page (with /s)
#       and a whitespace separated list of key names; the regexp's captures
#       are stored under those keys, in order.
#   extract_xpath - a hash of key => XPath expression; the first matching
#       node of each expression is stored under its key as HTML. Keys whose
#       expression matches nothing are left unset.
#   extract_after_hook - Perl code run after a successful extraction.
#   extract_date_format / extract_date_timezone - how to turn an extracted
#       'date' value into a Plagger::Date; without a format the value goes
#       through Plagger::Date->parse_dwim.
#
# Returns the hash reference of extracted values, or nothing when no value
# could be extracted or HTML::TreeBuilder::XPath is needed but unavailable.
sub extract {
    my($self, $args) = @_;
    my $data;

    if ($self->{extract}) {
        if (my @match = $args->{content} =~ /$self->{extract}/s) {
            my @capture = split /\s+/, $self->{extract_capture};
            @{$data}{@capture} = @match;
        }
    }

    if ($self->{extract_xpath}) {
        eval { require HTML::TreeBuilder::XPath };
        if ($@) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($args->{content});
        $tree->eof;

        for my $capture (keys %{$self->{extract_xpath}}) {
            my @children = $tree->findnodes($self->{extract_xpath}->{$capture});

            # An expression that matches nothing must not abort the whole
            # run with a method call on an undefined value.
            next unless @children;

            $data->{$capture} = $children[0]->as_HTML;
        }

        # Only strings were copied out of the tree, so it can be released
        # explicitly instead of lingering until the process exits.
        $tree->delete;
    }

    if ($data) {
        if ($self->{extract_after_hook}) {
            # NOTE: the hook is evaluated as Perl source (string eval) with
            # $self, $args and $data in scope. It must only ever come from
            # trusted site definition files.
            eval $self->{extract_after_hook};
            Plagger->context->error($@) if $@;
        }

        if ($data->{date}) {
            if (my $format = $self->{extract_date_format}) {
                $format = [ $format ] unless ref $format;
                $data->{date} = (map { Plagger::Date->strptime($_, $data->{date}) } @$format)[0];
                if ($data->{date} && $self->{extract_date_timezone}) {
                    $data->{date}->set_time_zone($self->{extract_date_timezone});
                }
            } else {
                $data->{date} = Plagger::Date->parse_dwim($data->{date});
            }
        }

        return $data;
    }
}
284
285 1;
286
287 __END__
288
289 =head1 NAME
290
291 Plagger::Plugin::Filter::EntryFullText - Upgrade your feeds to fulltext class
292
293 =head1 SYNOPSIS
294
295   - module: Filter::EntryFullText
296
297 =head1 DESCRIPTION
298
299 This plugin fetches the full text of an entry by doing an HTTP GET on its
300 permalink and applying a regexp or XPath to the HTML. It's just like upgrading
301 your flight ticket from economy class to business class!
302
303 You can write custom fulltext handler by putting C<.pl> or C<.yaml>
304 files under assets plugin directory.
305
306 =head1 CONFIG
307
308 =over 4
309
310 =item store_html_on_failure
311
312 When the fulltext handlers fail to extract the content body from the
313 HTML, this option stores the whole HTML document as the entry body. This
314 is useful together with search engines such as Gmail and the Search::
315 plugins. Defaults to 0.
316
317 =item force_upgrade
318
319 Even if entry body already contains HTML, this config forces the
320 plugin to upgrade the body. Defaults to 0.
321
322 =back
323
324 =head1 WRITING CUSTOM FULLTEXT HANDLER
325
326 (To be documented)
327
328 =head1 AUTHOR
329
330 Tatsuhiko Miyagawa
331
332 =head1 SEE ALSO
333
334 L<Plagger>
Note: See TracBrowser for help on using the browser.