root/trunk/plagger/lib/Plagger/Plugin/Filter/TruePermalink.pm

Revision 1583 (checked in by miyagawa, 14 years ago)

Filter::TruePermalink?: remove redirectors.yaml. Instead introduced 'follow_redirect' config which defaults to 1, to check redirection by issuing GET

Line 
1 package Plagger::Plugin::Filter::TruePermalink;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use DirHandle;
6 use YAML;
7 use Plagger::UserAgent;
8 use URI;
9 use URI::QueryParam;
10
# Plugin initialisation: run the parent class init, default the
# 'follow_redirect' option to 1 when the user did not set it at all,
# then load the YAML pattern files.
sub init {
    my $self = shift;
    $self->SUPER::init(@_);

    # Only fill in the default when the key is absent, so an explicit
    # false value in the config is respected.
    unless (exists $self->conf->{follow_redirect}) {
        $self->conf->{follow_redirect} = 1;
    }

    $self->load_plugins;
}
17
# Scan this plugin's assets directory and hand every regular file whose
# name ends in ".yaml" to load_plugin(), in sorted filename order.
#
# load_plugin() is called with the full path and the bare file name.
sub load_plugins {
    my $self = shift;

    # File::Spec is used below but is not loaded anywhere in this file;
    # load it here so we do not depend on some other module having
    # pulled it in first.
    require File::Spec;

    my $dir = $self->assets_dir;
    my $dh  = DirHandle->new($dir) or Plagger->context->error("$dir: $!");

    for my $name (sort $dh->read) {
        next unless $name =~ /\.yaml$/;

        my $path = File::Spec->catfile($dir, $name);
        next unless -f $path;    # skip directories etc. named *.yaml

        $self->load_plugin($path, $name);
    }
}
28
# Load one YAML pattern file and register it as a rewrite rule.
#
#   $file - full path of the YAML file to load
#   $base - bare file name (accepted for interface compatibility; unused)
#
# Array documents are ignored (legacy redirectors.yaml format).
# Documents that are not references at all (an empty file, a bare
# scalar) are skipped with a warning, because rewrite_link() later
# dereferences every registered rule as a hash.
sub load_plugin {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "loading $file");
    my $data = YAML::LoadFile($file);

    if (!ref $data) {
        Plagger->context->log(warn => "$file: not a valid pattern file, ignored");
        return;
    }

    if (ref($data) eq 'ARRAY') {
        # redirectors.yaml ... make it backward compatible to ignore
        return;
    }

    push @{$self->{plugins}}, $data;
}
40
# Hook this plugin into the given context: update() is registered for
# the 'update.entry.fixup' hook.
sub register {
    my($self, $context) = @_;

    my @hooks = ('update.entry.fixup' => \&update);

    $context->register_hook($self, @hooks);
}
48
# Hook callback: normalize the entry's permalink, then the URL of each
# of its enclosures.  Each target is handed to rewrite() as a
# getter/setter pair of closures.
sub update {
    my($self, $context, $args) = @_;

    my $read_permalink  = sub { $args->{entry}->permalink };
    my $write_permalink = sub { $args->{entry}->permalink(@_) };
    $self->rewrite($read_permalink, $write_permalink, $args);

    foreach my $enclosure ($args->{entry}->enclosures) {
        my $read_url  = sub { $enclosure->url };
        # Enclosure URLs are stored as URI objects.
        my $write_url = sub { $enclosure->url( URI->new(@_) ) };
        $self->rewrite($read_url, $write_url, $args);
    }
}
57
# Repeatedly apply rewrite_link() until it reports that nothing changed,
# so chained rewrites/redirects are followed to the end.
#
#   $getter   - code ref returning the current link
#   $callback - code ref called with the rewritten link
#   $args     - hook arguments, passed through to rewrite_link()
#
# After more than 100 successful rewrites in a row the link is assumed
# to be looping and an error is raised through the context.
sub rewrite {
    my($self, $getter, $callback, $args) = @_;

    my $iterations = 0;
    while ($self->rewrite_link($getter, $callback, $args)) {
        if (++$iterations > 100) {
            # Report through the context object, like every other error
            # in this file.
            Plagger->context->error("Possible infinite loop on " . $getter->());

            # error() is presumably fatal; bail out explicitly anyway so
            # the loop is guaranteed to end even if it ever returns.
            last;
        }
    }
}
68
# Apply a single rewrite step to the link returned by $getter.
#
# The loaded pattern rules are tried in order; the first rule whose
# 'match' regexp matches and which actually rewrites the link wins.
# If no rule rewrote the link and the 'follow_redirect' option is on,
# the link is resolved via follow_redirect() instead.
#
#   $getter   - code ref returning the current link
#   $callback - code ref called with the new link when it was rewritten
#   $args     - hook arguments (not used here)
#
# Returns a true count when the link was rewritten, 0 otherwise.
sub rewrite_link {
    my($self, $getter, $callback, $args) = @_;

    my $context = Plagger->context;

    my $link = $getter->();

    # Nothing to normalize: without this guard an undefined/empty link
    # falls through to follow_redirect() and triggers a pointless GET.
    return 0 unless defined $link && length $link;

    my $count = 0;
    my $rewritten;

    for my $plugin (@{ $self->{plugins}}) {
        my $match = $plugin->{match} || '.'; # anything
        next unless $link =~ m/$match/i;

        if ($plugin->{rewrite}) {
            # The rule is a Perl snippet operating on $_ (typically an
            # s/// expression) whose value says whether it changed anything.
            # NOTE(review): string eval of pattern-file content -- pattern
            # files must only come from trusted sources.
            local $_ = $link;
            my $done = eval $plugin->{rewrite};
            if ($@) {
                $context->error("$@ in $plugin->{rewrite}");
            } elsif ($done) {
                $count += $done;
                $rewritten = $_;
                last;
            }
        } elsif ($plugin->{query_param}) {
            # The real URL is carried in a query parameter of the link.
            my $param = URI->new($link)->query_param($plugin->{query_param})
                or $context->error("No query param $plugin->{query_param} in " . $link);
            $count++;
            $rewritten = $param;
            last;
        }
    }

    # No match to known sites. Try redirect by issuing GET
    if (!$count && $self->conf->{follow_redirect}) {
        my $url = $self->follow_redirect($link);
        if ($url && $url ne $link) {
            $count++;
            $rewritten = $url;
        }
    }

    if ($count) {
        $callback->($rewritten);
        $context->log(info => "Link $link rewritten to $rewritten");
    }

    return $count;
}
118
# Issue a single GET (without following redirects) to $link and return
# the absolute URL it redirects to, or nothing when the response is not
# a redirect.  The lookup goes through the plugin cache under the key
# "redirector:$link" with a '1 day' expiry argument.
sub follow_redirect {
    my($self, $link) = @_;

    my $url = $self->cache->get_callback(
        "redirector:$link",
        sub {
            Plagger->context->log(debug => "Issuing GET to $link to follow redirects");
            my $ua  = Plagger::UserAgent->new;
            my $res = $ua->simple_request( HTTP::Request->new(GET => $link) );
            return unless $res->is_redirect;

            my $location = $res->header('Location');
            return unless defined $location && length $location;

            # Servers may send a relative Location header; resolve it
            # against the requested URL so callers always get an
            # absolute permalink.
            return URI->new_abs($location, $link)->as_string;
        },
        '1 day',
    );

    Plagger->context->log(debug => "Resolved redirection of $link => $url") if $url;

    return $url;
}
140
141 1;
142
143 __END__
144
145 =head1 NAME
146
147 Plagger::Plugin::Filter::TruePermalink - Normalize permalink using its own plugin files
148
149 =head1 SYNOPSIS
150
151   - module: Filter::TruePermalink
152
153 =head1 DESCRIPTION
154
155 This plugin normalizes permalink using YAML based URL pattern
156 files. Various permalink fix filters in the past (YahooBlogSearch,
157 Namaan, 2chRSSPermalink) can now be written as a pattern file for
158 this plugin.
159
160 This plugin rewrites I<permalink> attribute of C<$entry>, while
161 keeping I<link> as is. If C<$entry> has enclosures, this plugin also
162 tries to rewrite url of them.
163
164 =head1 CONFIG
165
166 =over 4
167
168 =item follow_redirect
169
170 If set to 1, this plugin issues GET request to entry permalinks to see
171 if the server returns 301 or 302 redirect to other URL. Defaults to 1.
172
173 =back
174
175 =head1 PATTERN FILES
176
177 You can write your own pattern file using YAML data format. Usable keys are:
178
179 =over 4
180
181 =item author
182
183 Your name. (Optional)
184
185 =item match
186
187 Regular expression rule to match with entry's link. Rewrites only
188 happen when the URL form matches. You can omit this configuration to
189 apply the rewrite rule to any URLs.
190
191 =item rewrite
192
193 Replacement regexp to filter permalink. Permalink is stored in C<$_> variable so that you can write:
194
195   rewrite: s/;jsession_id=\w+//
196
197 =item query_param
198
199 URL query parameter to extract normalized permalink.
200
201   query_param: destination
202
203 =back
204
205 See C<assets/plugins/Filter-TruePermalink> for more examples.
206
207 =head1 AUTHOR
208
209 youpy
210
211 Tatsuhiko Miyagawa
212
213 =head1 SEE ALSO
214
215 L<Plagger>
216
217 =cut
Note: See TracBrowser for help on using the browser.