root/trunk/plagger/lib/Plagger/Plugin/Filter/EntryFullText.pm

Revision 478 (checked in by miyagawa, 14 years ago)

only load .pl

Line 
1 package Plagger::Plugin::Filter::EntryFullText;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use DirHandle;
6 use Encode;
7 use File::Spec;
8 use List::Util qw(first);
9
10 use Plagger::UserAgent;
11
12 sub register {
13     my($self, $context) = @_;
14     $context->register_hook(
15         $self,
16         'update.entry.fixup' => \&filter,
17     );
18 }
19
20 sub init {
21     my $self = shift;
22     $self->SUPER::init(@_);
23     $self->load_plugins();
24
25     $self->{ua} = Plagger::UserAgent->new;
26 }
27
28 sub load_plugins {
29     my $self = shift;
30     my $context = Plagger->context;
31
32     my $dir = $self->assets_dir;
33     my $dh = DirHandle->new($dir) or $context->error("$dir: $!");
34     for my $file (grep -f $_->[0] && $_->[0] =~ /\.pl$/,
35                   map [ File::Spec->catfile($dir, $_), $_ ], $dh->read) {
36         $self->load_plugin(@$file);
37     }
38 }
39
40 sub load_plugin {
41     my($self, $file, $base) = @_;
42
43     Plagger->context->log(debug => "loading $file");
44
45     open my $fh, $file or Plagger->context->error("$file: $!");
46     (my $pkg = $base) =~ s/\.pl$//;
47     my $plugin_class = "Plagger::Plugin::Filter::EntryFullText::Site::$pkg";
48
49     my $code = join '', <$fh>;
50     unless ($code =~ /^\s*package/s) {
51         $code = join "\n",
52             ( "package $plugin_class;",
53               "use strict;",
54               "use base qw( Plagger::Plugin::Filter::EntryFullText::Site );",
55               "sub site_name { '$pkg' }",
56               $code,
57               "1;" );
58     }
59
60     eval $code;
61     Plagger->context->error($@) if $@;
62
63     my $plugin = $plugin_class->new;
64     push @{ $self->{plugins} }, $plugin;
65 }
66
67 sub filter {
68     my($self, $context, $args) = @_;
69
70     my $handler = first { $_->handle_force($args) } @{ $self->{plugins} };
71     if ( !$handler && $args->{entry}->body && $args->{entry}->body =~ /<\w+>/ ) {
72         $self->log(debug => $args->{entry}->link . " already contains body. Skipped");
73         return;
74     }
75
76     my $res = $self->{ua}->fetch( $args->{entry}->permalink, $self );
77     return if $res->http_response->is_error;
78
79     $args->{content} = $self->decode_content($res);
80
81     my @plugins = $handler ? ($handler) : @{ $self->{plugins} };
82
83     for my $plugin (@plugins) {
84         if ( $handler || $plugin->handle($args) ) {
85             $context->log(debug => $args->{entry}->permalink . " handled by " . $plugin->site_name);
86             my $body = $plugin->extract_body($args);
87             if ($body) {
88                 $context->log(info => "Extract content succeeded on " . $args->{entry}->permalink);
89                 $args->{entry}->body($body);
90                 return 1;
91             }
92         }
93     }
94
95     # failed to extract: store whole HTML if the config is on
96     if ($self->conf->{store_html_on_failure}) {
97         $args->{entry}->body($args->{content});
98         return 1;
99     }
100
101     $context->log(warn => "Extract content failed on " . $args->{entry}->permalink);
102 }
103
104 # xxx make it Plagger::Entry's method so that other plugins can use
105 sub decode_content {
106     my($self, $res) = @_;
107     my $content = $res->content;
108
109     my $charset = ($res->http_response->content_type =~ /charset=([\w\-]+)/)[0];
110     unless ($charset) {
111         $charset = ( $content =~ m!<meta http-equiv="Content-Type" content=".*charset=([\w\-]+)"! )[0] || "utf-8";
112     }
113
114     return decode($charset, $content);
115 }
116
117 package Plagger::Plugin::Filter::EntryFullText::Site;
118 sub new { bless {}, shift }
119 sub handle_force { 0 }
120 sub handle { 0 }
121
122 1;
123
124 __END__
125
126 =head1 NAME
127
128 Plagger::Plugin::Filter::EntryFullText - Framework to fetch entry full text
129
130 =head1 SYNOPSIS
131
132   - module: Filter::EntryFullText
133
134   # assets/plugins/filter-entryfulltext/asahi_com.pl
135   sub handle {
136       my($self, $args) = @_;
137       $args->{entry}->link =~ qr!^http://www\.asahi\.com/!;
138   }
139
140   sub extract_body {
141       my($self, $content) = @_;
142       ( $content =~ /<!-- Start of Kiji -->(.*)<!-- End of Kiji -->/s )[0];
143   }
144
145 =head1 DESCRIPTION
146
147 This plugin allows you to fetch entry full text by doing HTTP GET and
148 apply regexp to HTML. You can write custom fulltext handler by putting
149 C<.pl> files under assets plugin directory.
150
151 =head1 CONFIG
152
153 =over 4
154
155 =item store_html_on_failure
156
157 Even if fulltext handlers fail to extract content body from HTML, this
158 option enables to store the whole document HTML as entry body. It will
159 be useful to use with search engines like Gmail and Search:: plugins.
160 Defaults to 0.
161
162 =back
163
164 =head1 WRITING CUSTOM FULLTEXT HANDLER
165
166 (To be documented)
167
168 =head1 AUTHOR
169
170 Tatsuhiko Miyagawa
171
172 =head1 SEE ALSO
173
174 L<Plagger>
Note: See TracBrowser for help on using the browser.