root/trunk/plagger/lib/Plagger/Plugin/Filter/EntryFullText.pm

Revision 1859 (checked in by miyagawa, 14 years ago)

From nikc --

Using extract_xpath I've discovered that it doesn't handle (at least)
text() nodes and attribute values.

For example, given this:

<h1>Some text with <em>extra markup</em></h1>

and just wanting the text, the XPath

extract_xpath:

title: //h1/descendant::text()

fails. Or this:

<meta name="DC.date.modified" content="2006-05-16 15:34:30">

and trying to extract the date:

extract_xpath:

date: //meta[@name="DC.date.modified"]/@content

This is because it always calls the as_XML() method on the extracted
data, and neither of those node types supports that method.

Line 
1 package Plagger::Plugin::Filter::EntryFullText;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use DirHandle;
6 use Encode;
7 use File::Spec;
8 use List::Util qw(first);
9 use HTML::ResolveLink;
10 use Plagger::Date; # for metadata in plugins
11 use Plagger::Util qw( decode_content );
12 use Plagger::Plugin::CustomFeed::Simple;
13 use Plagger::UserAgent;
14
# Returns the hook name 'update.entry.fixup', the same hook that filter()
# is registered for in register().
sub rule_hook {
    return 'update.entry.fixup';
}
16
# Registers this plugin's callbacks with the given context:
#   customfeed.handle  -> handle()
#   update.entry.fixup -> filter()
# Returns whatever $context->register_hook returns.
sub register {
    my ($self, $context) = @_;

    # Kept as an ordered list so the hooks are passed in a fixed order.
    my @hooks = (
        'customfeed.handle'  => \&handle,
        'update.entry.fixup' => \&filter,
    );

    $context->register_hook($self, @hooks);
}
25
# Initialises the plugin: runs the parent class's init with the same
# arguments, loads the site handlers via load_plugins(), then creates the
# Plagger::UserAgent stored in $self->{ua} (used by filter() for fetching).
sub init {
    my $self = shift;

    # Parent initialisation comes first; remaining arguments pass through.
    $self->SUPER::init(@_);
    $self->load_plugins;

    $self->{ua} = Plagger::UserAgent->new();
}
33
# Loads every site handler asset for this plugin.
#
# YAML assets ('*.yaml') are handed to load_plugin_yaml() and Perl assets
# ('*.pl') to load_plugin_perl(); the arguments load_assets() passes to the
# callback are forwarded unchanged.  YAML assets are loaded first, so they
# come first in $self->{plugins}.
#
# Returns the result of the second load_assets() call.
sub load_plugins {
    my $self = shift;

    # The context object previously fetched here was never used.
    $self->load_assets('*.yaml', sub { $self->load_plugin_yaml(@_) });
    $self->load_assets('*.pl',   sub { $self->load_plugin_perl(@_) });
}
41
# Compiles one Perl site-handler asset and adds an instance of it to
# $self->{plugins}.
#
#   $file - path of the .pl asset to read
#   $base - asset file name; with the trailing ".pl" removed it becomes the
#           last component of the handler's package name
#
# If the handler class already responds to new(), nothing is compiled and a
# fresh instance is returned without being added to $self->{plugins}.
# Source that does not begin with a package statement is wrapped in a
# generated package deriving from ...::EntryFullText::Site.
sub load_plugin_perl {
    my ($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load plugin $file");

    open my $fh, '<', $file or Plagger->context->error("$file: $!");

    # NOTE: the string eval below can see these lexicals, so their names
    # are kept stable.
    my $pkg = $base;
    $pkg =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::Filter::EntryFullText::Site::$pkg";

    if ($plugin_class->can('new')) {
        Plagger->context->log(warn => "$plugin_class is already defined. skip compiling code");
        return $plugin_class->new;
    }

    # Slurp the whole asset in one read.
    my $code = do { local $/; <$fh> };

    if ($code !~ /^\s*package/s) {
        # Bare handler body: supply the package boilerplate around it.
        my @preamble = (
            "package $plugin_class;",
            "use strict;",
            "use base qw( Plagger::Plugin::Filter::EntryFullText::Site );",
            "sub site_name { '$pkg' }",
        );
        $code = join "\n", @preamble, $code, "1;";
    }

    eval $code;
    Plagger->context->error($@) if $@;

    push @{ $self->{plugins} }, $plugin_class->new;
}
72
# Loads one YAML site-handler asset and appends a
# Plagger::Plugin::Filter::EntryFullText::YAML handler to $self->{plugins}
# for every document YAML::LoadFile returns from the file.
#
#   $file - path of the YAML asset
#   $base - asset file name, passed to each handler's constructor
#
# Returns the value of the push (the new size of the plugin list).
sub load_plugin_yaml {
    my ($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load YAML $file");

    my @handlers =
        map { Plagger::Plugin::Filter::EntryFullText::YAML->new($_, $base) }
        YAML::LoadFile($file);

    push @{ $self->{plugins} }, @handlers;
}
82
# Hook callback for 'customfeed.handle'.
#
# Picks the first loaded site handler whose custom_feed_handle($args) is
# true, copies its follow-link pattern and follow-xpath into
# $args->{match} / $args->{xpath}, and delegates to
# Plagger::Plugin::CustomFeed::Simple::aggregate with this plugin as the
# invocant.  When no handler matches, nothing is done and a false value
# results.
sub handle {
    my ($self, $context, $args) = @_;

    my $handler = first { $_->custom_feed_handle($args) } @{ $self->{plugins} };

    if ($handler) {
        # Scalar assignments on purpose: a handler may return an empty list.
        for my $pair ( [ match => 'custom_feed_follow_link' ],
                       [ xpath => 'custom_feed_follow_xpath' ] ) {
            my ($key, $method) = @$pair;
            $args->{$key} = $handler->$method;
        }
        return $self->Plagger::Plugin::CustomFeed::Simple::aggregate($context, $args);
    }
}
93
# Hook callback for 'update.entry.fixup': fetches the entry's permalink and
# replaces the entry body with the text a site handler extracts from it.
#
# Arguments: ($self, $context, $args).  $args->{entry} is the entry being
# upgraded; $args->{content} is set to the decoded response content before
# the handlers run.
#
# Returns 1 when the body was replaced, either by an extraction or (with
# the store_html_on_failure config) by the whole document.  Returns early
# with nothing when the entry is skipped or the fetch fails.
sub filter {
    my ($self, $context, $args) = @_;
    my $entry = $args->{entry};

    # A handler whose handle_force() matches is used exclusively below and
    # also bypasses the "already has an HTML body" shortcut.
    my $handler = first { $_->handle_force($args) } @{ $self->{plugins} };

    unless ($handler) {
        if ( $entry->body && $entry->body->is_html && !$self->conf->{force_upgrade} ) {
            $self->log(debug => $entry->link . " already contains body. Skipped");
            return;
        }
    }

    unless ( $entry->permalink ) {
        $self->log(debug => "Entry " . $entry->title . " doesn't have permalink. Skipped");
        return;
    }

    # NoNetwork: don't connect for 3 hours
    my $res = $self->{ua}->fetch( $entry->permalink, $self, { NoNetwork => 3 * 60 * 60 } );
    if ( !$res->status && $res->is_error ) {
        $self->log(debug => "Fetch " . $entry->permalink . " failed");
        return;
    }

    $args->{content} = decode_content($res);

    # if the request was redirected, set it as permalink
    if ( my $http_response = $res->http_response ) {
        my $base = $http_response->request->uri;
        if ( $base ne $entry->permalink ) {
            $context->log(info => "rewrite permalink to $base");
            $entry->permalink($base);
        }
    }

    # use Last-Modified to populate entry date, even if handler doesn't find one
    if ( $res->last_modified && !$entry->date ) {
        $entry->date( Plagger::Date->from_epoch($res->last_modified) );
    }

    # A forced handler is the only candidate; otherwise try every handler.
    my @plugins = $handler ? ($handler) : @{ $self->{plugins} };

    PLUGIN:
    for my $plugin (@plugins) {
        next PLUGIN unless $handler || $plugin->handle($args);

        $context->log(debug => $entry->permalink . " handled by " . $plugin->site_name);

        # A handler may return a plain string (the body) or a hash of fields.
        my $data = $plugin->extract($args);
        $data = { body => $data } if $data && !ref $data;
        next PLUGIN unless $data;

        $context->log(info => "Extract content succeeded on " . $entry->permalink);
        my $resolver = HTML::ResolveLink->new( base => $entry->permalink );

        # if body was already there, set that to summary
        if ( $entry->body ) {
            $entry->summary( $entry->body );
        }

        $data->{body} = $resolver->resolve( $data->{body} );
        $entry->body( $data->{body} );

        for my $field (qw( title author )) {
            $entry->$field( $data->{$field} ) if $data->{$field};
        }
        $entry->icon({ url => $data->{icon} }) if $data->{icon};
        $entry->summary( $data->{summary} )    if $data->{summary};

        # extract date using found one
        $entry->date( $data->{date} ) if $data->{date};

        return 1;
    }

    # failed to extract: store whole HTML if the config is on
    if ( $self->conf->{store_html_on_failure} ) {
        $entry->body( $args->{content} );
        return 1;
    }

    $context->log(warn => "Extract content failed on " . $entry->permalink);
}
172
173
# Base class for site handlers written in Perl (the packages generated by
# load_plugin_perl derive from it).  Every predicate defaults to "does not
# handle", and the follow-link / follow-xpath accessors default to nothing,
# so a handler only overrides what it needs.
package Plagger::Plugin::Filter::EntryFullText::Site;

# Constructor: an empty blessed hash.
sub new {
    my $class = shift;
    return bless {}, $class;
}

# Never claims a custom feed by default.
sub custom_feed_handle { return 0 }

# No follow-link pattern / XPath by default (empty list, undef in scalar context).
sub custom_feed_follow_link  { return }
sub custom_feed_follow_xpath { return }

# Never forces or accepts an entry by default.
sub handle_force { return 0 }
sub handle       { return 0 }
181
182 package Plagger::Plugin::Filter::EntryFullText::YAML;
183 use Encode;
184 use List::Util qw(first);
185
# Builds a handler object from one YAML document.
#
#   $data - hash reference loaded from the YAML asset; normalised in place
#   $base - asset file name, stored under the 'base' key
#
# The object is a shallow copy of the normalised $data plus 'base'.
sub new {
    my ($class, $data, $base) = @_;

    # add ^ if handle method starts with http://
    foreach my $key (qw( custom_feed_handle handle handle_force )) {
        my $pattern = $data->{$key};
        next if !defined $pattern;
        next if $pattern !~ m!^https?://!;
        $data->{$key} = '^' . $pattern;
    }

    # decode as UTF-8 (a single value or every element of a list)
    foreach my $key (qw( extract extract_date_format )) {
        my $value = $data->{$key};
        next if !defined $value;
        $data->{$key} = ref($value) eq 'ARRAY'
            ? [ map { decode("UTF-8", $_) } @$value ]
            : decode("UTF-8", $value);
    }

    my %self = ( %$data, base => $base );
    return bless \%self, $class;
}
207
# Name used for this handler in log output: the 'base' value given to new().
sub site_name {
    my ($self) = @_;
    return $self->{base};
}
212
# True when the handler has a 'custom_feed_handle' pattern and the feed URL
# ($args->{feed}->url) matches it as a regular expression; 0 when no
# pattern is configured.
sub custom_feed_handle {
    my ($self, $args) = @_;

    my $pattern = $self->{custom_feed_handle};
    return 0 unless $pattern;

    return $args->{feed}->url =~ /$pattern/;
}
218
# Accessor for the 'custom_feed_follow_link' value from the YAML document.
sub custom_feed_follow_link {
    my ($self) = @_;
    return $self->{custom_feed_follow_link};
}
222
# Accessor for the 'custom_feed_follow_xpath' value from the YAML document.
sub custom_feed_follow_xpath {
    my ($self) = @_;
    return $self->{custom_feed_follow_xpath};
}
226
# True when the handler has a 'handle_force' pattern and the entry's
# permalink matches it as a regular expression; 0 when no pattern is
# configured.
sub handle_force {
    my ($self, $args) = @_;

    if ( my $regex = $self->{handle_force} ) {
        return $args->{entry}->permalink =~ /$regex/;
    }

    return 0;
}
232
# True when the handler has a 'handle' pattern and the entry's permalink
# matches it as a regular expression; 0 when no pattern is configured.
sub handle {
    my ($self, $args) = @_;

    my $url_pattern = $self->{handle} or return 0;

    return $args->{entry}->permalink =~ /$url_pattern/;
}
238
# Replaces each argument, in place, with Plagger::Util::encode_xml of its
# current value.  extract() installs this temporarily as
# HTML::Element::_xml_escape while serialising matched nodes.
sub xml_escape {
    # @_ aliases the caller's variables, so assigning to $_[$i] updates them.
    for my $i (0 .. $#_) {
        $_[$i] = Plagger::Util::encode_xml($_[$i]);
    }
}
244
# Extracts fields from the fetched document in $args->{content} according
# to this handler's YAML rules.
#
# Rules read from $self:
#   extract               - regex applied to the content (with /s); its
#                           captures are stored under the names listed in
#                           extract_capture (whitespace separated), or under
#                           'body' when no names are given
#   extract_xpath         - hash of field name => XPath; the first matching
#                           node is stored as XML (element nodes) or as its
#                           value (other nodes, e.g. text or attributes)
#   extract_after_hook    - Perl code run after extraction; it can see and
#                           modify $data
#   extract_date_format   - strptime pattern, or list of patterns, used to
#                           parse $data->{date}; without it the date goes
#                           through Plagger::Date->parse_dwim
#   extract_date_timezone - time zone applied to a date parsed with
#                           extract_date_format
#
# Returns the hash reference of extracted fields, or a false value when
# nothing was extracted, no rule is configured, or HTML::TreeBuilder::XPath
# cannot be loaded.
sub extract {
    # NOTE: $self, $args and $data keep these names because the
    # extract_after_hook code is eval'ed in this scope and refers to them.
    my($self, $args) = @_;
    my $data;

    unless ($self->{extract} || $self->{extract_xpath}) {
        Plagger->context->log(error => "YAML doesn't have either 'extract' nor 'extract_xpath'");
        return;
    }

    if ($self->{extract}) {
        if (my @match = $args->{content} =~ /$self->{extract}/s) {
            my @capture = split /\s+/, $self->{extract_capture};
            @capture = ('body') unless @capture;
            @{$data}{@capture} = @match;
        }
    }

    if ($self->{extract_xpath}) {
        # Loaded lazily so the module is only required by XPath-based rules.
        eval { require HTML::TreeBuilder::XPath };
        if ($@) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($args->{content});
        $tree->eof;

        for my $capture (keys %{$self->{extract_xpath}}) {
            my @children = $tree->findnodes($self->{extract_xpath}->{$capture});
            if (@children) {
                # Use this package's xml_escape while serialising the node.
                no warnings 'redefine';
                local *HTML::Element::_xml_escape = \&xml_escape;
                # Only element nodes can be serialised with as_XML; text and
                # attribute nodes are read with getValue instead.
                $data->{$capture} = $children[0]->isElementNode
                    ? $children[0]->as_XML
                    : $children[0]->getValue;
            } else {
                Plagger->context->log(error => "Can't find node matching $self->{extract_xpath}->{$capture}");
            }
        }

        # Everything kept in $data is a plain string by now, so release the
        # parse tree explicitly; HTML::Element trees may hold circular
        # references that are not freed when $tree goes out of scope.
        $tree->delete;
    }

    if ($data) {
        if ($self->{extract_after_hook}) {
            # SECURITY: this runs arbitrary Perl taken from the YAML asset;
            # only load asset files from trusted sources.
            eval $self->{extract_after_hook};
            Plagger->context->error($@) if $@;
        }

        if ($data->{date}) {
            if (my $format = $self->{extract_date_format}) {
                $format = [ $format ] unless ref $format;
                # Keep the first value produced across the listed formats.
                $data->{date} = (map { Plagger::Date->strptime($_, $data->{date}) } @$format)[0];
                if ($data->{date} && $self->{extract_date_timezone}) {
                    $data->{date}->set_time_zone($self->{extract_date_timezone});
                }
            } else {
                $data->{date} = Plagger::Date->parse_dwim($data->{date});
            }
        }

        return $data;
    }
}
308
309 1;
310
311 __END__
312
313 =head1 NAME
314
315 Plagger::Plugin::Filter::EntryFullText - Upgrade your feeds to fulltext class
316
317 =head1 SYNOPSIS
318
319   - module: Filter::EntryFullText
320
321 =head1 DESCRIPTION
322
323 This plugin allows you to fetch entry full text by doing HTTP GET and
324 apply regexp to HTML. It's just like upgrading your flight ticket from
325 economy class to business class!
326
327 You can write custom fulltext handler by putting C<.pl> or C<.yaml>
328 files under assets plugin directory.
329
330 =head1 CONFIG
331
332 =over 4
333
334 =item store_html_on_failure
335
336 Even if fulltext handlers fail to extract content body from HTML, this
337 option enables to store the whole document HTML as entry body. It will
338 be useful to use with search engines like Gmail and Search:: plugins.
339 Defaults to 0.
340
341 =item force_upgrade
342
343 Even if entry body already contains HTML, this config forces the
344 plugin to upgrade the body. Defaults to 0.
345
346 =back
347
348 =head1 WRITING CUSTOM FULLTEXT HANDLER
349
350 (To be documented)
351
352 =head1 AUTHOR
353
354 Tatsuhiko Miyagawa
355
356 =head1 SEE ALSO
357
358 L<Plagger>
Note: See TracBrowser for help on using the browser.