root/trunk/plagger/lib/Plagger/Plugin/Filter/TruePermalink.pm

Revision 1595 (checked in by miyagawa, 14 years ago)

Filter::TruePermalink?: fixed bug when it tries to resolve redirection of huge file (like MP3) it gets stuck. Call die() immediately to ignore content body

Line 
1 package Plagger::Plugin::Filter::TruePermalink;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use DirHandle;
6 use YAML;
7 use Plagger::UserAgent;
8 use URI;
9 use URI::QueryParam;
10
11 sub init {
12     my $self = shift;
13     $self->SUPER::init(@_);
14     $self->conf->{follow_redirect} = 1 unless exists $self->conf->{follow_redirect};
15     $self->load_plugins;
16 }
17
18 sub load_plugins {
19     my $self = shift;
20
21     $self->load_assets(
22         '*.yaml',
23         sub {
24             my $file = shift;
25             Plagger->context->log(debug => "loading $file");
26             my $data = YAML::LoadFile($file);
27             if (ref($data) eq 'ARRAY') {
28                 # redirectors.yaml ... make it backward compatible to ignore
29             } else {
30                 push @{$self->{plugins}}, $data;
31             }
32         },
33     );
34 }
35
36 sub register {
37     my($self, $context) = @_;
38     $context->register_hook(
39         $self,
40         'update.entry.fixup' => \&update,
41     );
42 }
43
44 sub update {
45     my($self, $context, $args) = @_;
46
47     $self->rewrite(sub { $args->{entry}->permalink }, sub { $args->{entry}->permalink(@_) }, $args);
48     for my $enclosure ($args->{entry}->enclosures) {
49         $self->rewrite(sub { $enclosure->url }, sub { $enclosure->url( URI->new(@_) ) }, $args);
50     }
51 }
52
53 sub rewrite {
54     my($self, $getter, $callback, $args) = @_;
55
56     my $loop;
57     while ($self->rewrite_link($getter, $callback, $args)) {
58         if ($loop++ >= 100) {
59             Plagger->error("Possible infinite loop on " . $getter->());
60         }
61     }
62 }
63
64 sub rewrite_link {
65     my($self, $getter, $callback, $args) = @_;
66
67     my $context = Plagger->context;
68
69     my $link = $getter->();
70     my $orig = $link; # copy
71     my $count = 0;
72     my $rewritten;
73
74     for my $plugin (@{ $self->{plugins}}) {
75         my $match = $plugin->{match} || '.'; # anything
76         next unless $link =~ m/$match/i;
77
78         if ($plugin->{rewrite}) {
79             local $_ = $link;
80             my $done = eval $plugin->{rewrite};
81             if ($@) {
82                 $context->error("$@ in $plugin->{rewrite}");
83             } elsif ($done) {
84                 $count += $done;
85                 $rewritten = $_;
86                 last;
87             }
88         } elsif ($plugin->{query_param}) {
89             my $param = URI->new($link)->query_param($plugin->{query_param})
90                 or $context->error("No query param $plugin->{query_param} in " . $link);
91             $count++;
92             $rewritten = $param;
93             last;
94         }
95     }
96
97     # No match to known sites. Try redirect by issuing GET
98     if (!$count && $self->conf->{follow_redirect}) {
99         my $url = $self->follow_redirect($link);
100         if ($url && $url ne $link) {
101             $count++;
102             $rewritten = $url;
103         }
104     }
105
106     if ($count) {
107         $callback->($rewritten);
108         $context->log(info => "Link $orig rewritten to $rewritten");
109     }
110
111     return $count;
112 }
113
114 sub follow_redirect {
115     my($self, $link) = @_;
116
117     my $url = $self->cache->get_callback(
118         "redirector:$link",
119         sub {
120             Plagger->context->log(debug => "Issuing GET to $link to follow redirects");
121             my $ua  = Plagger::UserAgent->new;
122
123             # don't care about content body ... immediately die
124             my $res = $ua->simple_request( HTTP::Request->new(GET => $link), sub { die } );
125             if ($res->is_redirect) {
126                 return $res->header('Location');
127             }
128             return '';
129         },
130         '1 day',
131     );
132
133     Plagger->context->log(debug => "Resolved redirection of $link => $url") if $url;
134
135     return $url;
136 }
137
138 1;
139
140 __END__
141
142 =head1 NAME
143
144 Plagger::Plugin::Filter::TruePermalink - Normalize permalink using its own plugin files
145
146 =head1 SYNOPSIS
147
148   - module: Filter::TruePermalink
149
150 =head1 DESCRIPTION
151
152 This plugin normalizes permalink using YAML based URL pattern
153 files. Various permalink fix filters in the past (YahooBlogSearch,
154 Namaan, 2chRSSPermalink) can now be writting as a pattern file for
155 this plugin.
156
157 This plugin rewrites I<permalink> attribute of C<$entry>, while
158 keeping I<link> as is. If C<$entry> has enclosures, this plugin also
159 tries to rewrite url of them.
160
161 =head1 CONFIG
162
163 =over 4
164
165 =item follow_redirect
166
167 If set to 1, this plugin issues GET request to entry permalinks to see
168 if the server returns 301 or 302 redirect to other URL. Defaults to 1.
169
170 =back
171
172 =head1 PATTERN FILES
173
174 You can write your own pattern file using YAML data format. Usable keys are:
175
176 =over 4
177
178 =item author
179
180 Your name. (Optional)
181
182 =item match
183
184 Regular expression rule to match with entry's link. Rewrites only
185 happen when the URL form matches. You can omit this configuration to
186 apply the rewrite rule to any URLs.
187
188 =item rewrite
189
190 Replacement regexp to filter permalink. Permalink is stored in C<$_> variable so that you can write:
191
192   rewrite: s/;jsession_id=\w+//
193
194 =item query_param
195
196 URL query parameter to extract normalized permalink.
197
198   query_param: destination
199
200 =back
201
202 See C<assets/plugins/Filter-TruePermalink> for more examples.
203
204 =head1 AUTHOR
205
206 youpy
207
208 Tatsuhiko Miyagawa
209
210 =head1 SEE ALSO
211
212 L<Plagger>
213
214 =cut
Note: See TracBrowser for help on using the browser.