root/trunk/plagger/lib/Plagger/Plugin/Filter/TruePermalink.pm

Revision 1734 (checked in by miyagawa, 14 years ago)

added Test::Spelling and fixed typoes

Line 
1 package Plagger::Plugin::Filter::TruePermalink;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use DirHandle;
6 use YAML;
7 use Plagger::UserAgent;
8 use URI;
9 use URI::QueryParam;
10
11 sub init {
12     my $self = shift;
13     $self->SUPER::init(@_);
14     $self->conf->{follow_redirect} = 1 unless exists $self->conf->{follow_redirect};
15     $self->load_plugins;
16 }
17
18 sub load_plugins {
19     my $self = shift;
20
21     $self->load_assets(
22         '*.yaml',
23         sub {
24             my $file = shift;
25             Plagger->context->log(debug => "loading $file");
26             my $data = YAML::LoadFile($file);
27             if (ref($data) eq 'ARRAY') {
28                 # redirectors.yaml ... make it backward compatible to ignore
29             } else {
30                 push @{$self->{plugins}}, $data;
31             }
32         },
33     );
34 }
35
36 sub register {
37     my($self, $context) = @_;
38     $context->register_hook(
39         $self,
40         'update.entry.fixup' => \&update,
41     );
42 }
43
44 sub update {
45     my($self, $context, $args) = @_;
46
47     $self->rewrite(sub { $args->{entry}->permalink }, sub { $args->{entry}->permalink(@_) }, $args);
48     for my $enclosure ($args->{entry}->enclosures) {
49         $self->rewrite(sub { $enclosure->url }, sub { $enclosure->url( URI->new(@_) ) }, $args);
50     }
51 }
52
53 sub rewrite {
54     my($self, $getter, $callback, $args) = @_;
55
56     my $loop;
57     while ($self->rewrite_link($getter, $callback, $args)) {
58         if ($loop++ >= 10) {
59             $self->log(error => "Possible infinite loop on " . $getter->());
60             return;
61         }
62     }
63 }
64
65 sub rewrite_link {
66     my($self, $getter, $callback, $args) = @_;
67
68     my $context = Plagger->context;
69
70     my $link = $getter->();
71     my $orig = $link; # copy
72     my $count = 0;
73     my $rewritten;
74
75     for my $plugin (@{ $self->{plugins}}) {
76         my $match = $plugin->{match} || '.'; # anything
77         next unless $link =~ m/$match/i;
78
79         if ($plugin->{rewrite}) {
80             local $_ = $link;
81             my $done = eval $plugin->{rewrite};
82             if ($@) {
83                 $context->error("$@ in $plugin->{rewrite}");
84             } elsif ($done) {
85                 $count += $done;
86                 $rewritten = $_;
87                 last;
88             }
89         } elsif ($plugin->{query_param}) {
90             my $param = URI->new($link)->query_param($plugin->{query_param})
91                 or $context->error("No query param $plugin->{query_param} in " . $link);
92             $count++;
93             $rewritten = $param;
94             last;
95         }
96     }
97
98     # No match to known sites. Try redirect by issuing GET
99     if (!$count && $self->conf->{follow_redirect}) {
100         my $url = $self->follow_redirect($link);
101         if ($url && $url ne $link) {
102             $count++;
103             $rewritten = $url;
104         }
105     }
106
107     if ($count) {
108         $callback->($rewritten);
109         $context->log(info => "Link $orig rewritten to $rewritten");
110     }
111
112     return $count;
113 }
114
115 sub follow_redirect {
116     my($self, $link) = @_;
117
118     my $url = $self->cache->get_callback(
119         "redirector:$link",
120         sub {
121             Plagger->context->log(debug => "Issuing GET to $link to follow redirects");
122             my $ua  = Plagger::UserAgent->new;
123
124             # don't care about content body ... immediately die
125             my $res = $ua->simple_request( HTTP::Request->new(GET => $link), sub { die } );
126             if ($res->is_redirect) {
127                 return $res->header('Location');
128             }
129             return '';
130         },
131         '1 day',
132     );
133
134     Plagger->context->log(debug => "Resolved redirection of $link => $url") if $url;
135
136     return $url;
137 }
138
139 1;
140
141 __END__
142
143 =head1 NAME
144
145 Plagger::Plugin::Filter::TruePermalink - Normalize permalink using its own plugin files
146
147 =head1 SYNOPSIS
148
149   - module: Filter::TruePermalink
150
151 =head1 DESCRIPTION
152
153 This plugin normalizes permalink using YAML based URL pattern
154 files.
155
156 This plugin rewrites I<permalink> attribute of C<$entry>, while
157 keeping I<link> as is. If C<$entry> has enclosures, this plugin also
158 tries to rewrite url of them.
159
160 =head1 CONFIG
161
162 =over 4
163
164 =item follow_redirect
165
166 If set to 1, this plugin issues GET request to entry permalinks to see
167 if the server returns 301 or 302 redirect to other URL. Defaults to 1.
168
169 =back
170
171 =head1 PATTERN FILES
172
173 You can write your own pattern file using YAML data format. Usable keys are:
174
175 =over 4
176
177 =item author
178
179 Your name. (Optional)
180
181 =item match
182
183 Regular expression rule to match with entry's link. Rewrites only
184 happen when the URL form matches. You can omit this configuration to
185 apply the rewrite rule to any URLs.
186
187 =item rewrite
188
189 Replacement regexp to filter permalink. Permalink is stored in C<$_> variable so that you can write:
190
191   rewrite: s/;jsession_id=\w+//
192
193 =item query_param
194
195 URL query parameter to extract normalized permalink.
196
197   query_param: destination
198
199 =back
200
201 See C<assets/plugins/Filter-TruePermalink> for more examples.
202
203 =head1 AUTHOR
204
205 youpy
206
207 Tatsuhiko Miyagawa
208
209 =head1 SEE ALSO
210
211 L<Plagger>
212
213 =cut
Note: See TracBrowser for help on using the browser.