root/trunk/plagger/lib/Plagger/Plugin/Filter/EntryFullText.pm

Revision 1741 (checked in by miyagawa, 14 years ago)

merge from hackathon-summary

Line 
1 package Plagger::Plugin::Filter::EntryFullText;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use DirHandle;
6 use Encode;
7 use File::Spec;
8 use List::Util qw(first);
9 use HTML::ResolveLink;
10 use Plagger::Date; # for metadata in plugins
11 use Plagger::Util qw( decode_content );
12 use Plagger::Plugin::CustomFeed::Simple;
13 use Plagger::UserAgent;
14
# Returns the hook name 'update.entry.fixup'.
# NOTE(review): presumably the hook on which this plugin's rules are
# evaluated -- confirm against Plagger::Plugin.
sub rule_hook {
    return 'update.entry.fixup';
}
16
# Registers this plugin's callbacks with the context:
#   customfeed.handle  -> handle()
#   update.entry.fixup -> filter()
sub register {
    my($self, $context) = @_;

    # Kept as an ordered list so the hooks are passed in the same order.
    my @hooks = (
        'customfeed.handle'  => \&handle,
        'update.entry.fixup' => \&filter,
    );

    $context->register_hook($self, @hooks);
}
25
# Initializes the plugin: runs the parent class init, loads the site
# handlers from the assets directory, and creates the user agent that
# filter() uses to fetch pages.
sub init {
    my($self, @args) = @_;

    $self->SUPER::init(@args);
    $self->load_plugins();

    $self->{ua} = Plagger::UserAgent->new;
}
33
# Loads every site handler found in the plugin's assets: '*.yaml' files
# go through load_plugin_yaml() and '*.pl' files through
# load_plugin_perl(). YAML handlers are loaded first, so they come first
# in $self->{plugins}.
sub load_plugins {
    my $self = shift;

    $self->load_assets('*.yaml', sub { $self->load_plugin_yaml(@_) });
    $self->load_assets('*.pl',   sub { $self->load_plugin_perl(@_) });
}
41
# Loads one site handler written in Perl and registers an instance of it
# in $self->{plugins}.
#
#   $file - path of the .pl asset to load
#   $base - file name of the asset; with the ".pl" suffix removed it
#           becomes the last component of the handler's package name
#
# If the file does not start with a "package" statement, its code is
# wrapped in a generated package that inherits from
# Plagger::Plugin::Filter::EntryFullText::Site and defines site_name().
#
# If the handler class is already compiled (for example when a second
# instance of this plugin is initialized), the code is not compiled again
# but the handler is still registered on this instance, once.
sub load_plugin_perl {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load plugin $file");

    (my $pkg = $base) =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::Filter::EntryFullText::Site::$pkg";

    if ($plugin_class->can('new')) {
        Plagger->context->log(warn => "$plugin_class is already defined. skip compiling code");

        # Reuse the handler if this instance already holds one of this
        # class; otherwise register a fresh one so it is not lost.
        my($plugin) = grep { ref $_ eq $plugin_class } @{ $self->{plugins} || [] };
        unless ($plugin) {
            $plugin = $plugin_class->new;
            push @{ $self->{plugins} }, $plugin;
        }
        return $plugin;
    }

    # Only read the file when it actually has to be compiled.
    open my $fh, '<', $file or Plagger->context->error("$file: $!");
    my $code = do { local $/; <$fh> };
    close $fh;

    unless ($code =~ /^\s*package/s) {
        $code = join "\n",
            ( "package $plugin_class;",
              "use strict;",
              "use base qw( Plagger::Plugin::Filter::EntryFullText::Site );",
              "sub site_name { '$pkg' }",
              $code,
              "1;" );
    }

    # NOTE(review): string eval of the asset file -- the assets directory
    # must be trusted. The evaluated code can see this sub's lexicals.
    eval $code;
    Plagger->context->error($@) if $@;

    push @{ $self->{plugins} }, $plugin_class->new;
}
72
# Loads one YAML asset and registers a YAML-driven handler in
# $self->{plugins} for every document the file contains.
#
#   $file - path of the .yaml asset
#   $base - file name of the asset, passed to each handler (used as its
#           site name)
sub load_plugin_yaml {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load YAML $file");

    my @documents = YAML::LoadFile($file);
    my @handlers  = map {
        Plagger::Plugin::Filter::EntryFullText::YAML->new($_, $base)
    } @documents;

    push @{ $self->{plugins} }, @handlers;
}
82
# Callback for the 'customfeed.handle' hook.
#
# Looks for the first loaded handler whose custom_feed_handle() accepts
# $args. When one is found, its follow-link pattern and follow XPath are
# stored in $args->{match} and $args->{xpath}, and the aggregation is
# delegated to Plagger::Plugin::CustomFeed::Simple::aggregate. Without a
# matching handler nothing is done and a false value is returned.
sub handle {
    my($self, $context, $args) = @_;

    my $handler;
    for my $plugin (@{ $self->{plugins} }) {
        if ($plugin->custom_feed_handle($args)) {
            $handler = $plugin;
            last;
        }
    }

    if ($handler) {
        $args->{match} = $handler->custom_feed_follow_link;
        $args->{xpath} = $handler->custom_feed_follow_xpath;
        return $self->Plagger::Plugin::CustomFeed::Simple::aggregate($context, $args);
    }
}
93
# Callback for the 'update.entry.fixup' hook: replaces the body of
# $args->{entry} with content extracted from the page at its permalink.
#
# Steps, in order:
#   1. Skip entries that already have an HTML body, unless a handler's
#      handle_force() matches or the force_upgrade config is set.
#   2. Skip entries without a permalink.
#   3. Fetch the permalink and store the decoded page in $args->{content}.
#   4. If the request ended on a different URI, rewrite the permalink.
#   5. Use Last-Modified as the entry date when the entry has none.
#   6. Let the forced handler, or each handler whose handle() matches,
#      try to extract data; the first success updates the entry.
#   7. On failure, store the whole page as body if store_html_on_failure
#      is set, otherwise log a warning.
#
# Returns 1 when the entry body was set; returns nothing when the entry
# was skipped or the fetch failed.
sub filter {
    my($self, $context, $args) = @_;

    my $forced = first { $_->handle_force($args) } @{ $self->{plugins} };

    my $has_html_body = !$forced
        && $args->{entry}->body
        && $args->{entry}->body->is_html;
    if ($has_html_body && !$self->conf->{force_upgrade}) {
        $self->log(debug => $args->{entry}->link . " already contains body. Skipped");
        return;
    }

    unless ($args->{entry}->permalink) {
        $self->log(debug => "Entry " . $args->{entry}->title . " doesn't have permalink. Skipped");
        return;
    }

    # NoNetwork: don't connect for 3 hours
    my %fetch_opt = ( NoNetwork => 60 * 60 * 3 );
    my $res = $self->{ua}->fetch( $args->{entry}->permalink, $self, \%fetch_opt );
    if (!$res->status && $res->is_error) {
        $self->log(debug => "Fetch " . $args->{entry}->permalink . " failed");
        return;
    }

    $args->{content} = decode_content($res);

    # if the request was redirected, set the final URI as permalink
    if ($res->http_response) {
        my $base = $res->http_response->request->uri;
        unless ( $base eq $args->{entry}->permalink ) {
            $context->log(info => "rewrite permalink to $base");
            $args->{entry}->permalink($base);
        }
    }

    # use Last-Modified to populate entry date, even if no handler finds one
    if ($res->last_modified && !$args->{entry}->date) {
        my $modified = Plagger::Date->from_epoch($res->last_modified);
        $args->{entry}->date($modified);
    }

    # A forced handler is the only candidate; otherwise try them all.
    my @candidates = $forced ? ($forced) : @{ $self->{plugins} };

    for my $plugin (@candidates) {
        next unless $forced || $plugin->handle($args);

        $context->log(debug => $args->{entry}->permalink . " handled by " . $plugin->site_name);

        # A plain string result is taken as the body.
        my $data = $plugin->extract($args);
        $data = { body => $data } if $data && !ref $data;
        next unless $data;

        $context->log(info => "Extract content succeeded on " . $args->{entry}->permalink);
        my $resolver = HTML::ResolveLink->new( base => $args->{entry}->permalink );

        # if body was already there, keep it as the summary
        if ($args->{entry}->body) {
            $args->{entry}->summary($args->{entry}->body);
        }

        $data->{body} = $resolver->resolve( $data->{body} );
        $args->{entry}->body($data->{body});

        if ($data->{title}) {
            $args->{entry}->title($data->{title});
        }
        if ($data->{author}) {
            $args->{entry}->author($data->{author});
        }
        if ($data->{icon}) {
            $args->{entry}->icon({ url => $data->{icon} });
        }
        if ($data->{summary}) {
            $args->{entry}->summary($data->{summary});
        }

        # an extracted date overrides whatever date was set before
        if ($data->{date}) {
            $args->{entry}->date($data->{date});
        }

        return 1;
    }

    # failed to extract: store whole HTML if the config is on
    if ($self->conf->{store_html_on_failure}) {
        $args->{entry}->body($args->{content});
        return 1;
    }

    $context->log(warn => "Extract content failed on " . $args->{entry}->permalink);
}
172
173
package Plagger::Plugin::Filter::EntryFullText::Site;

# Base class for site handlers written in Perl. Every predicate declines
# by default, so a subclass only overrides what it needs.

# Creates a handler with no state.
sub new {
    my $class = shift;
    return bless {}, $class;
}

# Custom feed support: never handles a feed, and has no follow-link
# pattern or follow XPath.
sub custom_feed_handle       { return 0 }
sub custom_feed_follow_link  { return }
sub custom_feed_follow_xpath { return }

# Entry support: never forces handling and never matches an entry.
sub handle_force { return 0 }
sub handle       { return 0 }
181
182 package Plagger::Plugin::Filter::EntryFullText::YAML;
183 use Encode;
184 use List::Util qw(first);
185
# Builds a handler from one YAML document.
#
#   $data - hash reference with the handler definition; it is modified
#           in place (patterns anchored, strings decoded)
#   $base - file name of the YAML asset, stored under the 'base' key
#
# Returns a new object holding a copy of $data plus 'base'.
sub new {
    my($class, $data, $base) = @_;

    # A matcher that starts with http:// or https:// is anchored with ^.
    for my $key ( qw(custom_feed_handle handle handle_force) ) {
        my $pattern = $data->{$key};
        next unless defined $pattern;
        next unless $pattern =~ m!^https?://!;
        $data->{$key} = "^$pattern";
    }

    # Decode as UTF-8; a value may be one string or an array of strings.
    for my $key ( qw(extract extract_date_format) ) {
        my $value = $data->{$key};
        next unless defined $value;
        $data->{$key} = ref $value eq 'ARRAY'
            ? [ map { decode("UTF-8", $_) } @$value ]
            : decode("UTF-8", $value);
    }

    return bless { %$data, base => $base }, $class;
}
207
# Returns the handler's site name: the 'base' value given to new().
sub site_name {
    my($self) = @_;
    return $self->{base};
}
212
# Tells whether this handler handles the feed in $args->{feed}: the feed
# URL is matched against the 'custom_feed_handle' pattern. Returns 0
# when no pattern is configured.
sub custom_feed_handle {
    my($self, $args) = @_;

    return 0 unless $self->{custom_feed_handle};
    return $args->{feed}->url =~ /$self->{custom_feed_handle}/;
}
218
# Returns the configured 'custom_feed_follow_link' value (undef if unset).
sub custom_feed_follow_link {
    my($self) = @_;
    return $self->{custom_feed_follow_link};
}
222
# Returns the configured 'custom_feed_follow_xpath' value (undef if unset).
sub custom_feed_follow_xpath {
    my($self) = @_;
    return $self->{custom_feed_follow_xpath};
}
226
# Tells whether this handler must process the entry even when it already
# has a body: the entry permalink is matched against the 'handle_force'
# pattern. Returns 0 when no pattern is configured.
sub handle_force {
    my($self, $args) = @_;

    return 0 unless $self->{handle_force};
    return $args->{entry}->permalink =~ /$self->{handle_force}/;
}
232
# Tells whether this handler applies to the entry: the entry permalink
# is matched against the 'handle' pattern. Returns 0 when no pattern is
# configured.
sub handle {
    my($self, $args) = @_;

    return 0 unless $self->{handle};
    return $args->{entry}->permalink =~ /$self->{handle}/;
}
238
# Replaces each argument, in place, with the result of
# Plagger::Util::encode_xml. extract() installs this temporarily as
# HTML::Element::_xml_escape while serializing nodes.
sub xml_escape {
    # Assigning to $_[$i] writes through to the caller's variable.
    for my $i (0 .. $#_) {
        $_[$i] = Plagger::Util::encode_xml($_[$i]);
    }
}
244
# Extracts entry data from the page in $args->{content}.
#
# Sources, applied in this order:
#   extract       - regexp matched against the content with /s; its
#                   captures are stored under the names listed in
#                   extract_capture (whitespace separated), or under
#                   'body' when no name is given
#   extract_xpath - hash of name => XPath expression; the first matching
#                   node is serialized as XML and stored under the name
#
# When something was extracted:
#   extract_after_hook - Perl code evaluated after extraction
#   date               - parsed with the first extract_date_format that
#                        succeeds (then moved to extract_date_timezone if
#                        set), or with Plagger::Date->parse_dwim when no
#                        format is configured
#
# Returns the hash reference of extracted data, or a false value when
# nothing matched. Returns nothing if extract_xpath is configured but
# HTML::TreeBuilder::XPath cannot be loaded.
sub extract {
    my($self, $args) = @_;
    my $data;

    if ($self->{extract}) {
        if (my @match = $args->{content} =~ /$self->{extract}/s) {
            my @capture = split /\s+/, $self->{extract_capture};
            @capture = ('body') unless @capture;
            @{$data}{@capture} = @match;
        }
    }

    if ($self->{extract_xpath}) {
        eval { require HTML::TreeBuilder::XPath };
        if ($@) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($args->{content});
        $tree->eof;

        for my $capture (keys %{$self->{extract_xpath}}) {
            my @children = $tree->findnodes($self->{extract_xpath}->{$capture});
            if (@children) {
                # Serialize with our own escaping for the duration of as_XML.
                no warnings 'redefine';
                local *HTML::Element::_xml_escape = \&xml_escape;
                $data->{$capture} = $children[0]->as_XML;
            } else {
                Plagger->context->log(error => "Can't find node matching $self->{extract_xpath}->{$capture}");
            }
        }

        # Release the tree explicitly: HTML::Element trees may hold
        # circular references that are not freed when $tree goes out of
        # scope. Everything needed is already serialized into $data.
        $tree->delete;
    }

    if ($data) {
        if ($self->{extract_after_hook}) {
            # NOTE(review): string eval of handler configuration -- the
            # YAML assets must be trusted. The hook can see the lexicals
            # $self, $args and $data, so do not rename them.
            eval $self->{extract_after_hook};
            Plagger->context->error($@) if $@;
        }

        if ($data->{date}) {
            if (my $format = $self->{extract_date_format}) {
                $format = [ $format ] unless ref $format;

                # Try the formats in order and stop at the first one that
                # parses, so a failing format never hides a later match.
                my $parsed;
                for my $pattern (@$format) {
                    $parsed = Plagger::Date->strptime($pattern, $data->{date});
                    last if defined $parsed;
                }
                $data->{date} = $parsed;

                if ($data->{date} && $self->{extract_date_timezone}) {
                    $data->{date}->set_time_zone($self->{extract_date_timezone});
                }
            } else {
                $data->{date} = Plagger::Date->parse_dwim($data->{date});
            }
        }

        return $data;
    }
}
301
302 1;
303
304 __END__
305
306 =head1 NAME
307
308 Plagger::Plugin::Filter::EntryFullText - Upgrade your feeds to fulltext class
309
310 =head1 SYNOPSIS
311
312   - module: Filter::EntryFullText
313
314 =head1 DESCRIPTION
315
316 This plugin allows you to fetch the full text of an entry by doing an
317 HTTP GET and applying a regexp to the HTML. It's just like upgrading your
318 flight ticket from economy class to business class!
319
320 You can write a custom fulltext handler by putting C<.pl> or C<.yaml>
321 files under the plugin's assets directory.
322
323 =head1 CONFIG
324
325 =over 4
326
327 =item store_html_on_failure
328
329 Even if the fulltext handlers fail to extract the content body from the
330 HTML, this option stores the whole HTML document as the entry body. It is
331 useful together with search engines like Gmail and the Search:: plugins.
332 Defaults to 0.
333
334 =item force_upgrade
335
336 Even if entry body already contains HTML, this config forces the
337 plugin to upgrade the body. Defaults to 0.
338
339 =back
340
341 =head1 WRITING CUSTOM FULLTEXT HANDLER
342
343 (To be documented)
344
345 =head1 AUTHOR
346
347 Tatsuhiko Miyagawa
348
349 =head1 SEE ALSO
350
351 L<Plagger>
Note: See TracBrowser for help on using the browser.