root/trunk/plagger/lib/Plagger/Plugin/Filter/EntryFullText.pm

Revision 481 (checked in by miyagawa, 14 years ago)
  • Added CustomFeed::Simple to extract links that match a regexp. Fixes #32
  • Added Plagger::Date->strptime($format, $date)
  • Added decode_content and extract_title to Util
  • Support metadata in Config, for now
Line 
1 package Plagger::Plugin::Filter::EntryFullText;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use DirHandle;
6 use Encode;
7 use File::Spec;
8 use List::Util qw(first);
9 use Plagger::Date; # for metadata in plugins
10 use Plagger::Util qw( decode_content );
11
12 use Plagger::UserAgent;
13
# Plugin entry point: subscribe filter() to the 'update.entry.fixup' hook.
#
#   $self    - this plugin instance
#   $context - object providing register_hook()
sub register {
    my ($self, $context) = @_;

    my %hooks = ( 'update.entry.fixup' => \&filter );
    $context->register_hook($self, %hooks);
}
21
# Initialise the plugin: run the parent class initialiser with the same
# arguments, load the site handlers, then create the user agent that
# filter() uses to fetch pages (stored in $self->{ua}).
sub init {
    my ($self, @args) = @_;

    $self->SUPER::init(@args);
    $self->load_plugins;

    $self->{ua} = Plagger::UserAgent->new;
}
29
# Load every site handler script found in the plugin's assets directory.
#
# Reads the directory returned by $self->assets_dir and passes each regular
# file whose name ends in ".pl" to load_plugin() as (full path, file name).
# Reports an error through the Plagger context when the directory cannot be
# opened.
sub load_plugins {
    my $self = shift;
    my $context = Plagger->context;

    my $dir = $self->assets_dir;
    my $dh = DirHandle->new($dir) or $context->error("$dir: $!");

    my @names = grep { /\.pl$/ } $dh->read;
    $dh->close;

    # Directory read order depends on the filesystem. filter() tries the
    # handlers in load order, so sort to make handler priority reproducible.
    for my $name (sort @names) {
        my $path = File::Spec->catfile($dir, $name);
        next unless -f $path;
        $self->load_plugin($path, $name);
    }
}
41
# Compile one site handler script and register an instance of it.
#
#   $file - path of the script to read
#   $base - file name of the script; with its ".pl" suffix removed it becomes
#           the last component of the handler's package name
#
# A script that does not begin with a "package" statement is wrapped in a
# generated package deriving from Plagger::Plugin::Filter::EntryFullText::Site
# and given a site_name() method returning $pkg. The resulting class is
# instantiated and appended to $self->{plugins}. Open and compile failures are
# reported through the Plagger context.
sub load_plugin {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "loading $file");

    # Three-argument open: the path is only ever treated as a file to read,
    # never as a mode prefix or a piped command.
    open my $fh, '<', $file or Plagger->context->error("$file: $!");
    my $code = join '', <$fh>;
    close $fh;

    (my $pkg = $base) =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::Filter::EntryFullText::Site::$pkg";

    unless ($code =~ /^\s*package/s) {
        $code = join "\n",
            ( "package $plugin_class;",
              "use strict;",
              "use base qw( Plagger::Plugin::Filter::EntryFullText::Site );",
              "sub site_name { '$pkg' }",
              $code,
              "1;" );
    }

    # String eval is intentional here: handler scripts are code shipped in
    # the plugin's assets directory, not external input.
    eval $code;
    Plagger->context->error($@) if $@;

    my $plugin = $plugin_class->new;
    push @{ $self->{plugins} }, $plugin;
}
68
# Hook callback: replace an entry's body with full text fetched from its
# permalink.
#
#   $self    - this plugin (uses $self->{plugins}, $self->{ua}, $self->conf)
#   $context - object providing log()
#   $args    - hash ref; $args->{entry} is the entry being processed
#
# Steps:
#   1. Unless some handler's handle_force() is true, skip entries whose body
#      already contains an HTML tag.
#   2. Fetch the permalink and store the decoded content in $args->{content}.
#   3. Ask the forcing handler, or else each handler whose handle() is true,
#      to extract(). A plain scalar result is treated as the body; a hash ref
#      may carry body, title and date.
#   4. If nothing was extracted and the store_html_on_failure config is on,
#      store the whole fetched content as the body.
#
# Returns 1 when the entry body was replaced.
sub filter {
    my($self, $context, $args) = @_;

    my $entry   = $args->{entry};
    my $plugins = $self->{plugins} || [];    # no handlers loaded: behave as an empty list

    my $handler = first { $_->handle_force($args) } @$plugins;
    if ( !$handler && $entry->body && $entry->body =~ /<\w+>/ ) {
        $self->log(debug => $entry->link . " already contains body. Skipped");
        return;
    }

    my $res = $self->{ua}->fetch( $entry->permalink, $self );
    # Report the failure instead of returning silently. The !$res guard is
    # defensive: it assumes fetch() may return a false value on failure.
    if ( !$res || $res->http_response->is_error ) {
        $context->log(warn => "Fetch failed on " . $entry->permalink);
        return;
    }

    $args->{content} = decode_content($res);

    my @candidates = $handler ? ($handler) : @$plugins;

    for my $plugin (@candidates) {
        if ( $handler || $plugin->handle($args) ) {
            $context->log(debug => $entry->permalink . " handled by " . $plugin->site_name);
            my $data = $plugin->extract($args);
               $data = { body => $data } if $data && !ref $data;
            if ($data) {
                $context->log(info => "Extract content succeeded on " . $entry->permalink);
                $entry->body($data->{body});
                $entry->title($data->{title}) if $data->{title};
                $entry->date($data->{date})   if $data->{date};
                return 1;
            }
        }
    }

    # failed to extract: store whole HTML if the config is on
    if ($self->conf->{store_html_on_failure}) {
        $entry->body($args->{content});
        return 1;
    }

    $context->log(warn => "Extract content failed on " . $entry->permalink);
}
108
109
# Base class for site handlers. The filter calls handle_force(), handle(),
# site_name() and extract() on every handler, so each has a default here that
# subclasses override as needed.
package Plagger::Plugin::Filter::EntryFullText::Site;

# Construct an empty handler object.
sub new { bless {}, shift }

# Return true to make this handler process the entry even when the entry
# already has an HTML body. Default: never.
sub handle_force { 0 }

# Return true when this handler knows how to process the entry in
# $args->{entry}. Default: never.
sub handle { 0 }

# Name used in log messages. Default: last component of the class name.
sub site_name {
    my $self = shift;
    my $class = ref $self || $self;
    return ( split /::/, $class )[-1];
}

# Extract the full text from $args->{content}. May return a body string or a
# hash ref with body/title/date keys. By default this delegates to an
# extract_body($content) method when the handler defines one, and otherwise
# returns nothing so the filter can move on instead of dying on a missing
# method.
sub extract {
    my($self, $args) = @_;
    return $self->extract_body($args->{content}) if $self->can('extract_body');
    return;
}
114
115 1;
116
117 __END__
118
119 =head1 NAME
120
121 Plagger::Plugin::Filter::EntryFullText - Framework to fetch entry full text
122
123 =head1 SYNOPSIS
124
125   - module: Filter::EntryFullText
126
127   # assets/plugins/filter-entryfulltext/asahi_com.pl
128   sub handle {
129       my($self, $args) = @_;
130       $args->{entry}->link =~ qr!^http://www\.asahi\.com/!;
131   }
132
133   sub extract {
134       my($self, $args) = @_;
135       ( $args->{content} =~ /<!-- Start of Kiji -->(.*)<!-- End of Kiji -->/s )[0];
136   }
137
138 =head1 DESCRIPTION
139
140 This plugin fetches the full text of an entry by doing an HTTP GET and
141 applying a regexp to the HTML. You can write a custom fulltext handler by
142 putting C<.pl> files under the plugin's assets directory.
143
144 =head1 CONFIG
145
146 =over 4
147
148 =item store_html_on_failure
149
150 When the fulltext handlers fail to extract the content body from the HTML,
151 this option stores the whole HTML document as the entry body instead. This
152 is useful in combination with search engines like Gmail and the Search::
153 plugins. Defaults to 0.
154
155 =back
156
157 =head1 WRITING CUSTOM FULLTEXT HANDLER
158
159 (To be documented)
160
161 =head1 AUTHOR
162
163 Tatsuhiko Miyagawa
164
165 =head1 SEE ALSO
166
167 L<Plagger>
Note: See TracBrowser for help on using the browser.