root/trunk/plagger/lib/Plagger/Plugin/Filter/TruePermalink.pm

Revision 1689 (checked in by miyagawa, 14 years ago)

Filter::TruePermalink?: don't die if it detects infinite loop on HTTP redirects.

Line 
1 package Plagger::Plugin::Filter::TruePermalink;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use DirHandle;
6 use YAML;
7 use Plagger::UserAgent;
8 use URI;
9 use URI::QueryParam;
10
11 sub init {
12     my $self = shift;
13     $self->SUPER::init(@_);
14     $self->conf->{follow_redirect} = 1 unless exists $self->conf->{follow_redirect};
15     $self->load_plugins;
16 }
17
18 sub load_plugins {
19     my $self = shift;
20
21     $self->load_assets(
22         '*.yaml',
23         sub {
24             my $file = shift;
25             Plagger->context->log(debug => "loading $file");
26             my $data = YAML::LoadFile($file);
27             if (ref($data) eq 'ARRAY') {
28                 # redirectors.yaml ... make it backward compatible to ignore
29             } else {
30                 push @{$self->{plugins}}, $data;
31             }
32         },
33     );
34 }
35
36 sub register {
37     my($self, $context) = @_;
38     $context->register_hook(
39         $self,
40         'update.entry.fixup' => \&update,
41     );
42 }
43
44 sub update {
45     my($self, $context, $args) = @_;
46
47     $self->rewrite(sub { $args->{entry}->permalink }, sub { $args->{entry}->permalink(@_) }, $args);
48     for my $enclosure ($args->{entry}->enclosures) {
49         $self->rewrite(sub { $enclosure->url }, sub { $enclosure->url( URI->new(@_) ) }, $args);
50     }
51 }
52
53 sub rewrite {
54     my($self, $getter, $callback, $args) = @_;
55
56     my $loop;
57     while ($self->rewrite_link($getter, $callback, $args)) {
58         if ($loop++ >= 10) {
59             $self->log(error => "Possible infinite loop on " . $getter->());
60             return;
61         }
62     }
63 }
64
65 sub rewrite_link {
66     my($self, $getter, $callback, $args) = @_;
67
68     my $context = Plagger->context;
69
70     my $link = $getter->();
71     my $orig = $link; # copy
72     my $count = 0;
73     my $rewritten;
74
75     for my $plugin (@{ $self->{plugins}}) {
76         my $match = $plugin->{match} || '.'; # anything
77         next unless $link =~ m/$match/i;
78
79         if ($plugin->{rewrite}) {
80             local $_ = $link;
81             my $done = eval $plugin->{rewrite};
82             if ($@) {
83                 $context->error("$@ in $plugin->{rewrite}");
84             } elsif ($done) {
85                 $count += $done;
86                 $rewritten = $_;
87                 last;
88             }
89         } elsif ($plugin->{query_param}) {
90             my $param = URI->new($link)->query_param($plugin->{query_param})
91                 or $context->error("No query param $plugin->{query_param} in " . $link);
92             $count++;
93             $rewritten = $param;
94             last;
95         }
96     }
97
98     # No match to known sites. Try redirect by issuing GET
99     if (!$count && $self->conf->{follow_redirect}) {
100         my $url = $self->follow_redirect($link);
101         if ($url && $url ne $link) {
102             $count++;
103             $rewritten = $url;
104         }
105     }
106
107     if ($count) {
108         $callback->($rewritten);
109         $context->log(info => "Link $orig rewritten to $rewritten");
110     }
111
112     return $count;
113 }
114
115 sub follow_redirect {
116     my($self, $link) = @_;
117
118     my $url = $self->cache->get_callback(
119         "redirector:$link",
120         sub {
121             Plagger->context->log(debug => "Issuing GET to $link to follow redirects");
122             my $ua  = Plagger::UserAgent->new;
123
124             # don't care about content body ... immediately die
125             my $res = $ua->simple_request( HTTP::Request->new(GET => $link), sub { die } );
126             if ($res->is_redirect) {
127                 return $res->header('Location');
128             }
129             return '';
130         },
131         '1 day',
132     );
133
134     Plagger->context->log(debug => "Resolved redirection of $link => $url") if $url;
135
136     return $url;
137 }
138
139 1;
140
141 __END__
142
143 =head1 NAME
144
145 Plagger::Plugin::Filter::TruePermalink - Normalize permalink using its own plugin files
146
147 =head1 SYNOPSIS
148
149   - module: Filter::TruePermalink
150
151 =head1 DESCRIPTION
152
153 This plugin normalizes permalink using YAML based URL pattern
154 files. Various permalink fix filters in the past (YahooBlogSearch,
155 Namaan, 2chRSSPermalink) can now be writting as a pattern file for
156 this plugin.
157
158 This plugin rewrites I<permalink> attribute of C<$entry>, while
159 keeping I<link> as is. If C<$entry> has enclosures, this plugin also
160 tries to rewrite url of them.
161
162 =head1 CONFIG
163
164 =over 4
165
166 =item follow_redirect
167
168 If set to 1, this plugin issues GET request to entry permalinks to see
169 if the server returns 301 or 302 redirect to other URL. Defaults to 1.
170
171 =back
172
173 =head1 PATTERN FILES
174
175 You can write your own pattern file using YAML data format. Usable keys are:
176
177 =over 4
178
179 =item author
180
181 Your name. (Optional)
182
183 =item match
184
185 Regular expression rule to match with entry's link. Rewrites only
186 happen when the URL form matches. You can omit this configuration to
187 apply the rewrite rule to any URLs.
188
189 =item rewrite
190
191 Replacement regexp to filter permalink. Permalink is stored in C<$_> variable so that you can write:
192
193   rewrite: s/;jsession_id=\w+//
194
195 =item query_param
196
197 URL query parameter to extract normalized permalink.
198
199   query_param: destination
200
201 =back
202
203 See C<assets/plugins/Filter-TruePermalink> for more examples.
204
205 =head1 AUTHOR
206
207 youpy
208
209 Tatsuhiko Miyagawa
210
211 =head1 SEE ALSO
212
213 L<Plagger>
214
215 =cut
Note: See TracBrowser for help on using the browser.