root/trunk/plagger/lib/Plagger/Plugin/Filter/EntryFullText.pm

Revision 1598 (checked in by miyagawa, 14 years ago)

Whoops, fixed a terrible bug in EntryFullText where *.pl files were not loaded correctly

Line 
1 package Plagger::Plugin::Filter::EntryFullText;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use DirHandle;
6 use Encode;
7 use File::Spec;
8 use List::Util qw(first);
9 use HTML::ResolveLink;
10 use Plagger::Date; # for metadata in plugins
11 use Plagger::Util qw( decode_content );
12 use Plagger::Plugin::CustomFeed::Simple;
13 use Plagger::UserAgent;
14
# Returns the hook name 'update.entry.fixup', the same hook that filter()
# is registered on in register().
# NOTE(review): presumably consulted by the Plagger core when applying
# per-plugin rules -- confirm against Plagger::Plugin.
sub rule_hook {
    return 'update.entry.fixup';
}
16
# Hook registration entry point.
#
#   $plugin - this plugin instance
#   $ctx    - object providing register_hook()
#
# Binds handle() to 'customfeed.handle' and filter() to 'update.entry.fixup'.
# The value of register_hook() is passed through as the return value.
sub register {
    my ($plugin, $ctx) = @_;

    # A list (not a hash) so the hook => callback pairs keep this order.
    my @bindings = (
        'customfeed.handle'  => \&handle,
        'update.entry.fixup' => \&filter,
    );

    $ctx->register_hook($plugin, @bindings);
}
25
# Initialise the plugin instance.
#
# Forwards all arguments to the parent class init(), loads the site handlers
# via load_plugins(), then creates the Plagger::UserAgent stored in
# $self->{ua} (used by filter() to fetch entry pages).
sub init {
    my ($self, @args) = @_;

    $self->SUPER::init(@args);
    $self->load_plugins();

    my $agent = Plagger::UserAgent->new;
    $self->{ua} = $agent;

    # NOTE(review): presumably turns off the agent's HTML <head> parsing --
    # confirm against Plagger::UserAgent / LWP::UserAgent.
    $agent->parse_head(0);
}
34
# Load every site handler definition available to this plugin.
#
# YAML definitions ('*.yaml') are loaded first, then Perl handlers ('*.pl'),
# so YAML handlers come first in $self->{plugins}.  load_assets() is not
# defined in this file; it is expected to invoke the callback with the
# arguments that load_plugin_yaml()/load_plugin_perl() take after $self
# (the file path and its base name).
#
# The previously fetched-but-unused Plagger context local has been dropped.
sub load_plugins {
    my $self = shift;

    $self->load_assets('*.yaml', sub { $self->load_plugin_yaml(@_) });
    $self->load_assets('*.pl',   sub { $self->load_plugin_perl(@_) });
}
42
# Compile one Perl site handler and register an instance of it.
#
#   $file - path of the .pl asset to read
#   $base - file name of the asset; with a trailing ".pl" removed it becomes
#           the last component of the handler's package name
#
# Source that does not begin with a "package" statement is wrapped in a
# generated package that inherits from ...::EntryFullText::Site and defines
# site_name().  Compilation errors are reported through the context's
# error() method.
#
# The lexicals $self, $file, $base, $fh, $pkg, $plugin_class and $code are in
# scope of the string eval below, so they deliberately keep these names.
#
# NOTE(review): when the handler class already exists, a new instance is
# returned to the caller but NOT pushed onto $self->{plugins} -- confirm this
# is intended (it may be what prevents duplicate registration).
sub load_plugin_perl {
    my ($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load plugin $file");

    open(my $fh, '<', $file)
        or Plagger->context->error("$file: $!");

    my $pkg = $base;
    $pkg =~ s/\.pl$//;
    my $plugin_class = 'Plagger::Plugin::Filter::EntryFullText::Site::' . $pkg;

    if ($plugin_class->can('new')) {
        Plagger->context->log(warn => "$plugin_class is already defined. skip compiling code");
        return $plugin_class->new;
    }

    # Read the whole asset into one string.
    my $code = join '', readline($fh);

    if ($code !~ /^\s*package/s) {
        # Bare handler body: supply the package boilerplate around it.
        $code = "package $plugin_class;\n"
              . "use strict;\n"
              . "use base qw( Plagger::Plugin::Filter::EntryFullText::Site );\n"
              . "sub site_name { '$pkg' }\n"
              . $code . "\n"
              . "1;";
    }

    eval $code;
    if ($@) {
        Plagger->context->error($@);
    }

    push @{ $self->{plugins} }, $plugin_class->new;
}
73
# Load one YAML asset and register a handler object for each document in it.
#
#   $file - path passed to YAML::LoadFile
#   $base - passed to the handler constructor, which stores it as the
#           handler's site name
#
# Every value returned by YAML::LoadFile becomes one
# Plagger::Plugin::Filter::EntryFullText::YAML object appended to
# $self->{plugins}.
sub load_plugin_yaml {
    my ($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load YAML $file");

    my @handlers;
    for my $document (YAML::LoadFile($file)) {
        push @handlers,
            Plagger::Plugin::Filter::EntryFullText::YAML->new($document, $base);
    }

    push @{ $self->{plugins} }, @handlers;
}
83
# 'customfeed.handle' hook callback.
#
# Finds the first loaded site handler whose custom_feed_handle($args) is
# true.  When one exists, its custom_feed_follow_link value is stored in
# $args->{match} and the work is delegated to
# Plagger::Plugin::CustomFeed::Simple::aggregate, whose result is returned.
# When no handler matches, nothing is done and a false value results.
sub handle {
    my ($self, $context, $args) = @_;

    my $handler;
    for my $candidate (@{ $self->{plugins} }) {
        if ($candidate->custom_feed_handle($args)) {
            $handler = $candidate;
            last;
        }
    }

    if ($handler) {
        $args->{match} = $handler->custom_feed_follow_link;
        return $self->Plagger::Plugin::CustomFeed::Simple::aggregate($context, $args);
    }
}
93
# 'update.entry.fixup' hook callback: replace an entry's body with the full
# text extracted from the page behind its permalink.
#
#   $context - used for info/debug/warn logging
#   $args    - hook arguments; $args->{entry} is the entry being upgraded and
#              $args->{content} is set here to the decoded page content
#
# Returns 1 when the entry body was replaced (by an extraction or, with the
# store_html_on_failure option, by the whole page).  Returns nothing when
# the entry is skipped or the fetch fails.
sub filter {
    my ($self, $context, $args) = @_;

    # A handler claiming the entry via handle_force overrides the
    # "already has a body" shortcut and is the only handler tried later.
    my $handler = first { $_->handle_force($args) } @{ $self->{plugins} };
    my $entry   = $args->{entry};

    unless ($handler) {
        my $has_markup = $entry->body && $entry->body =~ /<\w+>/;
        if ($has_markup && !$self->conf->{force_upgrade}) {
            $self->log(debug => $entry->link . " already contains body. Skipped");
            return;
        }
    }

    unless ($entry->permalink) {
        $self->log(debug => "Entry " . $entry->title . " doesn't have permalink. Skipped");
        return;
    }

    # NoNetwork: don't connect for 3 hours
    my $no_network = 60 * 60 * 3;
    my $res = $self->{ua}->fetch( $entry->permalink, $self, { NoNetwork => $no_network } );
    if (!$res->status && $res->is_error) {
        $self->log(debug => "Fetch " . $entry->permalink . " failed");
        return;
    }

    $args->{content} = decode_content($res);

    # if the request was redirected, set it as permalink
    if (my $http = $res->http_response) {
        my $base = $http->request->uri;
        if ( $base ne $entry->permalink ) {
            $context->log(info => "rewrite permalink to $base");
            $entry->permalink($base);
        }
    }

    # use Last-Modified to populate entry date, even if handler doesn't find one
    if ($res->last_modified && !$entry->date) {
        $entry->date( Plagger::Date->from_epoch($res->last_modified) );
    }

    my @plugins = $handler ? ($handler) : @{ $self->{plugins} };

    PLUGIN:
    for my $plugin (@plugins) {
        # A forced handler is used unconditionally; others must accept the entry.
        next PLUGIN unless $handler || $plugin->handle($args);

        $context->log(debug => $entry->permalink . " handled by " . $plugin->site_name);

        # A plain (non-reference) true result is treated as the body itself.
        my $data = $plugin->extract($args);
        $data = { body => $data } if $data && !ref $data;
        next PLUGIN unless $data;

        $context->log(info => "Extract content succeeded on " . $entry->permalink);

        # The extracted body is passed through HTML::ResolveLink with the
        # (possibly rewritten) permalink as base before being stored.
        my $resolver = HTML::ResolveLink->new( base => $entry->permalink );
        $data->{body} = $resolver->resolve( $data->{body} );
        $entry->body($data->{body});
        $entry->title($data->{title}) if $data->{title};
        $entry->icon({ url => $data->{icon} }) if $data->{icon};

        # extract date using found one
        $entry->date($data->{date}) if $data->{date};

        return 1;
    }

    # failed to extract: store whole HTML if the config is on
    if ($self->conf->{store_html_on_failure}) {
        $entry->body($args->{content});
        return 1;
    }

    $context->log(warn => "Extract content failed on " . $entry->permalink);
}
164
165
# Base class for Perl site handlers: load_plugin_perl() makes generated
# handler packages inherit from it.  Every predicate defaults to "does not
# apply", so a handler only overrides what it needs.
package Plagger::Plugin::Filter::EntryFullText::Site;

# Construct a handler object (an empty blessed hash).
sub new {
    my $class = shift;
    return bless {}, $class;
}

# Default: this handler does not take over any custom feed.
sub custom_feed_handle { return 0 }

# Default: no follow-link value (empty list / undef).
sub custom_feed_follow_link { return }

# Default: never force handling of an entry.
sub handle_force { return 0 }

# Default: never handle an entry.
sub handle { return 0 }
172
173 package Plagger::Plugin::Filter::EntryFullText::YAML;
174 use Encode;
175 use List::Util qw(first);
176
# Build a handler object from one YAML document.
#
#   $class - class to bless into
#   $data  - hash ref of handler settings; it is normalised IN PLACE before
#            being copied into the new object
#   $base  - stored under the 'base' key (returned by site_name)
#
# Normalisation:
#   * custom_feed_handle / handle / handle_force values starting with
#     http:// or https:// get a leading "^" so they match from the start
#     of the URL;
#   * extract / extract_date_format are decoded from UTF-8 (each element,
#     when the value is an array ref).
sub new {
    my ($class, $data, $base) = @_;

    # add ^ if handle method starts with http://
    foreach my $field (qw(custom_feed_handle handle handle_force)) {
        my $pattern = $data->{$field};
        next if !defined $pattern;
        next if $pattern !~ m!^https?://!;
        $data->{$field} = '^' . $pattern;
    }

    # decode as UTF-8
    foreach my $field (qw(extract extract_date_format)) {
        my $value = $data->{$field};
        next if !defined $value;
        $data->{$field} = ref($value) eq 'ARRAY'
            ? [ map { decode("UTF-8", $_) } @{$value} ]
            : decode("UTF-8", $value);
    }

    my %object = (%$data, base => $base);
    return bless \%object, $class;
}
198
# Name of this handler: the 'base' value given to the constructor.
sub site_name {
    my ($self) = @_;
    return $self->{base};
}
203
# True when this handler's custom_feed_handle pattern matches the URL of
# $args->{feed}.  Returns 0 when no pattern is configured; in that case the
# feed is not consulted at all.
sub custom_feed_handle {
    my ($self, $args) = @_;

    my $pattern = $self->{custom_feed_handle};
    return 0 unless $pattern;

    return $args->{feed}->url =~ /$pattern/;
}
209
# The configured custom_feed_follow_link value, returned unchanged
# (handle() in the main plugin stores it in $args->{match}).
sub custom_feed_follow_link {
    my ($self) = @_;
    return $self->{custom_feed_follow_link};
}
213
# True when this handler's handle_force pattern matches the permalink of
# $args->{entry}.  Returns 0 when no pattern is configured; in that case
# the entry is not consulted at all.
sub handle_force {
    my ($self, $args) = @_;

    my $pattern = $self->{handle_force};
    return 0 unless $pattern;

    return $args->{entry}->permalink =~ /$pattern/;
}
219
# True when this handler's handle pattern matches the permalink of
# $args->{entry}.  Returns 0 when no pattern is configured; in that case
# the entry is not consulted at all.
sub handle {
    my ($self, $args) = @_;

    my $pattern = $self->{handle};
    return 0 unless $pattern;

    return $args->{entry}->permalink =~ /$pattern/;
}
225
# Replace each argument, in place, with Plagger::Util::encode_xml() of its
# value.  extract() installs this as a temporary stand-in for
# HTML::Element::_xml_escape while serialising nodes.
sub xml_escape {
    # @_ aliases the caller's variables, so assigning to its elements
    # rewrites the caller's values.
    for my $i (0 .. $#_) {
        $_[$i] = Plagger::Util::encode_xml($_[$i]);
    }
}
231
# Pull entry fields out of the fetched page held in $args->{content}.
#
# Two optional strategies, both filling the same result hash:
#   extract       - regex applied with /s; its captures are stored under the
#                   whitespace-separated names in extract_capture
#                   (default: a single 'body')
#   extract_xpath - hash of field name => XPath expression; the first
#                   matching node is stored serialised with as_XML
#
# When anything was extracted:
#   * extract_after_hook, if set, is eval'ed as Perl source in this scope.
#     It can see (and change) $self, $args and $data, so those lexical names
#     must not be renamed.  NOTE(review): this executes code taken from the
#     handler definition; only load definitions you trust.
#   * a 'date' field is converted to a Plagger::Date, using
#     extract_date_format (one format or an array ref of formats) and
#     extract_date_timezone when configured, else Plagger::Date->parse_dwim.
#
# Returns the hash ref of extracted fields, or a false value when nothing
# was extracted or HTML::TreeBuilder::XPath could not be loaded.
sub extract {
    my($self, $args) = @_;
    my $data;

    if ($self->{extract}) {
        if (my @match = $args->{content} =~ /$self->{extract}/s) {
            my @capture = split /\s+/, $self->{extract_capture};
            @capture = ('body') unless @capture;
            @{$data}{@capture} = @match;
        }
    }

    if ($self->{extract_xpath}) {
        # Loaded lazily so the module is only needed by XPath-based handlers.
        eval { require HTML::TreeBuilder::XPath };
        if ($@) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($args->{content});
        $tree->eof;

        for my $capture (keys %{$self->{extract_xpath}}) {
            my @children = $tree->findnodes($self->{extract_xpath}->{$capture});
            if (@children) {
                # Swap in our escaper only while this node is serialised.
                no warnings 'redefine';
                local *HTML::Element::_xml_escape = \&xml_escape;
                $data->{$capture} = $children[0]->as_XML;
            } else {
                Plagger->context->log(error => "Can't find node matching $self->{extract_xpath}->{$capture}");
            }
        }

        # HTML::Element trees contain parent <-> child reference cycles and
        # are not reclaimed when $tree goes out of scope; release the tree
        # explicitly.  Only serialised strings were kept, so this is safe.
        $tree->delete;
    }

    if ($data) {
        if ($self->{extract_after_hook}) {
            eval $self->{extract_after_hook};
            Plagger->context->error($@) if $@;
        }

        if ($data->{date}) {
            if (my $format = $self->{extract_date_format}) {
                $format = [ $format ] unless ref $format;
                # Takes the first element of the list produced by trying the
                # formats in order.  NOTE(review): whether a non-matching
                # format is skipped depends on what Plagger::Date->strptime
                # returns in list context on failure -- confirm.
                $data->{date} = (map { Plagger::Date->strptime($_, $data->{date}) } @$format)[0];
                if ($data->{date} && $self->{extract_date_timezone}) {
                    $data->{date}->set_time_zone($self->{extract_date_timezone});
                }
            } else {
                $data->{date} = Plagger::Date->parse_dwim($data->{date});
            }
        }

        return $data;
    }
}
288
289 1;
290
291 __END__
292
293 =head1 NAME
294
295 Plagger::Plugin::Filter::EntryFullText - Upgrade your feeds to fulltext class
296
297 =head1 SYNOPSIS
298
299   - module: Filter::EntryFullText
300
301 =head1 DESCRIPTION
302
303 This plugin allows you to fetch the full text of an entry by doing an
304 HTTP GET and applying a regexp to the HTML. It's just like upgrading
305 your flight ticket from economy class to business class!
306
307 You can write a custom fulltext handler by putting C<.pl> or C<.yaml>
308 files under the plugin's assets directory.
309
310 =head1 CONFIG
311
312 =over 4
313
314 =item store_html_on_failure
315
316 Even if fulltext handlers fail to extract the content body from HTML,
317 this option stores the whole HTML document as the entry body. This is
318 useful together with search engines like Gmail and Search:: plugins.
319 Defaults to 0.
320
321 =item force_upgrade
322
323 Even if entry body already contains HTML, this config forces the
324 plugin to upgrade the body. Defaults to 0.
325
326 =back
327
328 =head1 WRITING CUSTOM FULLTEXT HANDLER
329
330 (To be documented)
331
332 =head1 AUTHOR
333
334 Tatsuhiko Miyagawa
335
336 =head1 SEE ALSO
337
338 L<Plagger>
Note: See TracBrowser for help on using the browser.