root/trunk/plagger/lib/Plagger/Plugin/Filter/FindEnclosures.pm

Revision 1606 (checked in by miyagawa, 14 years ago)

moved mime_is_enclosure to Plagger::Util. Added hack to allow .ogg as enclosure. Added an unit test

Line 
1 package Plagger::Plugin::Filter::FindEnclosures;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use HTML::TokeParser;
6 use Plagger::Util qw( decode_content );
7 use List::Util qw(first);
8 use URI;
9 use DirHandle;
10 use Plagger::Enclosure;
11 use Plagger::UserAgent;
12
# Plugin registration entry point.
#
# Asks the context to autoload Filter::ResolveRelativeLink, then subscribes
# this plugin's filter() to the 'update.entry.fixup' hook.
#
#   $self    - the plugin instance
#   $context - the object providing autoload_plugin() and register_hook()
#
# Returns whatever register_hook() returns.
sub register {
    my ($self, $context) = @_;

    $context->autoload_plugin('Filter::ResolveRelativeLink');

    my %hooks = ( 'update.entry.fixup' => \&filter );
    $context->register_hook($self, %hooks);
}
22
# Instance initialisation.
#
# Delegates to the parent class' init() with the remaining arguments, loads
# the site-specific plugins via load_plugins(), and stores a fresh
# Plagger::UserAgent in $self->{ua}.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);
    $self->load_plugins;

    my $agent = Plagger::UserAgent->new;
    $self->{ua} = $agent;
}
30
# Load every site plugin found in this plugin's assets directory.
#
# Scans $self->assets_dir, and for each regular file whose name ends in
# ".pl" or ".yaml" (processed in sorted name order) calls
# $self->load_plugin($full_path, $file_name).
#
# A directory that cannot be opened is reported through
# Plagger->context->error.
sub load_plugins {
    my $self = shift;
    my $context = Plagger->context;

    # File::Spec is used below but is not loaded at the top of this file;
    # require it here instead of relying on another module having done so.
    require File::Spec;

    my $dir = $self->assets_dir;
    my $dh = DirHandle->new($dir) or $context->error("$dir: $!");

    # Read all names up front so the handle can be released before any
    # plugin code runs.
    my @names = sort $dh->read;
    $dh->close;

    for my $name (@names) {
        my $path = File::Spec->catfile($dir, $name);
        next unless -f $path && $path =~ /\.(?:pl|yaml)$/;
        $self->load_plugin($path, $name);
    }
}
42
# Load a single site plugin file and remember the result.
#
#   $file - path of the plugin file
#   $base - the file's base name (passed through to the loader)
#
# Files ending in ".pl" are handled by load_plugin_perl(); everything else is
# handed to load_plugin_yaml(). Whatever the loader returns is appended to
# $self->{plugins}.
sub load_plugin {
    my ($self, $file, $base) = @_;

    Plagger->context->log(debug => "loading $file");

    my $load_method = 'load_plugin_yaml';
    $load_method = 'load_plugin_perl' if $file =~ /\.pl$/;

    my @loaded = $self->$load_method($file, $base);
    push @{ $self->{plugins} }, @loaded;
}
51
# Load a site plugin written in Perl.
#
#   $file - path of the ".pl" file to read
#   $base - base name of the file; with ".pl" stripped it becomes the last
#           component of the plugin class name
#
# If the file's source does not start with a "package" statement, it is
# wrapped in a generated package that inherits from
# Plagger::Plugin::Filter::FindEnclosures::Site and defines site_name().
# The source is then compiled with string eval; compilation errors are
# reported through Plagger->context->error.
#
# Returns a new instance of the plugin class.
#
# NOTE(review): the file content is executed via string eval, so the assets
# directory must only ever contain trusted code. The lexical names below are
# visible to the evaluated code and are therefore left untouched.
sub load_plugin_perl {
    my ($self, $file, $base) = @_;

    open my $fh, '<', $file or Plagger->context->error("$file: $!");

    my $pkg = $base;
    $pkg =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::Filter::FindEnclosures::Site::$pkg";

    my $code = join '', <$fh>;
    if ($code !~ /^\s*package/s) {
        # Bare plugin body: supply the package boilerplate around it.
        my @wrapped = (
            "package $plugin_class;",
            "use strict;",
            "use base qw( Plagger::Plugin::Filter::FindEnclosures::Site );",
            "sub site_name { '$pkg' }",
            $code,
            "1;",
        );
        $code = join "\n", @wrapped;
    }

    eval $code;
    if ($@) {
        Plagger->context->error($@);
    }

    return $plugin_class->new;
}
75
# Placeholder loader for YAML site plugins: always reports
# "NOT IMPLEMENTED YET" through Plagger->context->error.
sub load_plugin_yaml {
    my $context = Plagger->context;
    return $context->error("NOT IMPLEMENTED YET");
}
77
# Hook handler for 'update.entry.fixup'.
#
# First offers the entry's permalink to add_enclosure() as if it were an
# <a href> link, then walks the entry body and inspects every <a>, <embed>,
# <img> and <object> tag:
#
#   <a>      - the href attribute is checked
#   <embed>  - the src attribute is checked, with the tag's type attribute
#              passed along as a MIME type hint
#   <img>    - the src attribute is checked and flagged as inline
#   <object> - delegated to add_enclosure_from_object(), which consumes the
#              parser up to the closing </object>
sub filter {
    my ($self, $context, $args) = @_;
    my $entry = $args->{entry};

    # check $entry->link first, if it links directly to media files
    $self->add_enclosure($entry, [ 'a', { href => $entry->permalink } ], 'href');

    my $parser = HTML::TokeParser->new(\$entry->body);
    while (my $tag = $parser->get_tag('a', 'embed', 'img', 'object')) {
        my $name = $tag->[0];

        if ($name eq 'object') {
            $self->add_enclosure_from_object($entry, $parser);
        }
        elsif ($name eq 'img') {
            $self->add_enclosure($entry, $tag, 'src', { inline => 1 });
        }
        elsif ($name eq 'embed') {
            $self->add_enclosure($entry, $tag, 'src', { type => $tag->[1]->{type} });
        }
        elsif ($name eq 'a') {
            $self->add_enclosure($entry, $tag, 'href');
        }
    }
}
97
# Extract an enclosure from the <param> children of an <object> element.
#
#   $entry  - the entry receiving the enclosure (via add_enclosure)
#   $parser - the token parser, positioned just after the opening <object>
#             tag; it is advanced up to and including the matching </object>
#
# The URL is looked up in this order:
#   1. a value inside the "flashvars" parameter that is an http(s) URL
#      containing ".flv"
#   2. the first value inside "flashvars" that is any http(s) URL
#   3. the value of the "movie" parameter
#
# Nothing is added when no URL can be found.
sub add_enclosure_from_object {
    my($self, $entry, $parser) = @_;

    # get param tags and find appropriate FLV movies
    my @params;
    while (my $tag = $parser->get_tag('param', '/object')) {
        last if $tag->[0] eq '/object';
        push @params, $tag;
    }

    # Case-insensitive lookup of a <param> by name; params without a name
    # attribute are skipped instead of triggering "uninitialized" warnings.
    my $param_named = sub {
        my $wanted = shift;
        return first {
            my $name = $_->[1]{name};
            defined $name && lc($name) eq $wanted;
        } @params;
    };

    # find URL inside flashvars parameter
    my $url;
    if (my $flashvars = $param_named->('flashvars')) {
        my $raw = $flashvars->[1]{value};
        $raw = '' unless defined $raw;

        # Split into key=value pairs on "&" and then on the FIRST "=" only,
        # so that values which themselves contain "=" (URLs with a query
        # string) stay intact. Values are kept in document order so that the
        # fallback below picks deterministically.
        my @values;
        for my $pair (split /&/, $raw) {
            my (undef, $value) = split /=/, $pair, 2;
            push @values, $value if defined $value;
        }

        # ".flv" must be matched with an escaped dot; "\f" is a form feed.
        $url   = first { m!^https?://.*\.flv! } @values;
        $url ||= first { m!^https?://.*! } @values;
    }

    # if URL isn't found in flash vars, then fallback to <param name="movie" />
    if (!$url) {
        my $movie = $param_named->('movie');
        $url = $movie->[1]{value} if $movie;
    }

    if ($url) {
        Plagger->context->log(info => "Found enclosure $url");
        my $enclosure = Plagger::Enclosure->new;
        $enclosure->url( URI->new($url) );
        $enclosure->auto_set_type;
        $entry->add_enclosure($enclosure); # XXX inline?
    }
}
130
# Inspect one tag and, if it points at an enclosure, attach it to the entry.
#
#   $entry - the entry receiving the enclosure (via add_enclosure)
#   $tag   - a tag token: [ $tag_name, \%attributes, ... ]
#   $attr  - name of the attribute holding the URL ('href' or 'src')
#   $opt   - optional hashref:
#              type   => MIME type hint passed to the enclosure detection
#              inline => true to flag the enclosure as inline
#
# A tag that does not carry the requested attribute (or carries an empty
# one) is ignored; there is no URL to examine, log or fetch.
#
# When is_enclosure() accepts the tag, an enclosure is created directly from
# the URL. Otherwise each loaded site plugin whose handle() accepts the URL
# is given the fetched page content, and the first enclosure returned by a
# plugin's find() is added. The page is fetched at most once; if it cannot
# be fetched, the search is abandoned.
sub add_enclosure {
    my($self, $entry, $tag, $attr, $opt) = @_;
    $opt ||= {};

    my $url = $tag->[1]{$attr};
    return unless defined $url && length $url;

    if ($self->is_enclosure($tag, $attr, $opt->{type})) {
        Plagger->context->log(info => "Found enclosure $url");
        my $enclosure = Plagger::Enclosure->new;
        $enclosure->url($url);
        $enclosure->auto_set_type($opt->{type});
        $enclosure->is_inline(1) if $opt->{inline};
        $entry->add_enclosure($enclosure);
        return;
    }

    my $content;
    for my $plugin (@{$self->{plugins}}) {
        next unless $plugin->handle($url);

        Plagger->context->log(debug => "Try $url with " . $plugin->site_name);

        # Fetch lazily, and only once, no matter how many plugins match.
        $content ||= $self->fetch_content($url);
        return unless $content;

        if (my $enclosure = $plugin->find({ content => $content, url => $url })) {
            Plagger->context->log(info => "Found enclosure " . $enclosure->url ." with " . $plugin->site_name);
            $entry->add_enclosure($enclosure);
            return;
        }
    }

    return;
}
160
# Fetch $url with a fresh Plagger::UserAgent and return its decoded content.
#
# The request is made with $self as the second argument to fetch() and a
# NoNetwork option of 3 * 60 * 60 (presumably seconds, i.e. three hours --
# confirm against the user agent's documentation).
#
# Returns nothing when the response has a false status() and is_error() is
# true; otherwise returns decode_content() applied to the response.
sub fetch_content {
    my ($self, $url) = @_;

    my %fetch_opt = ( NoNetwork => 3 * 60 * 60 );
    my $ua  = Plagger::UserAgent->new;
    my $res = $ua->fetch($url, $self, \%fetch_opt);

    if (!$res->status && $res->is_error) {
        return;
    }

    return decode_content($res);
}
170
# Decide whether a tag refers to an enclosure.
#
#   $tag  - a tag token: [ $tag_name, \%attributes, ... ]
#   $attr - name of the attribute holding the URL
#   $type - optional MIME type hint
#
# Returns 1 when the tag has rel="enclosure", or when
# has_enclosure_mime_type() accepts the attribute's URL / the type hint.
# Returns nothing otherwise.
sub is_enclosure {
    my ($self, $tag, $attr, $type) = @_;

    my $rel = $tag->[1]{rel};
    if ($rel && $rel eq 'enclosure') {
        return 1;
    }

    if ($self->has_enclosure_mime_type($tag->[1]{$attr}, $type)) {
        return 1;
    }

    return;
}
179
# Check whether a URL (or an explicit MIME type) denotes enclosure content.
#
#   $url  - the URL whose MIME type is looked up when no $type is given
#   $type - optional MIME type string; when true it takes precedence over
#           the URL
#
# Returns the result of Plagger::Util::mime_is_enclosure() for the resolved
# MIME type object.
sub has_enclosure_mime_type {
    my ($self, $url, $type) = @_;

    my $mime;
    if ($type) {
        $mime = MIME::Type->new(type => $type);
    }
    else {
        $mime = Plagger::Util::mime_type_of( URI->new($url) );
    }

    return Plagger::Util::mime_is_enclosure($mime);
}
186
# Base class for site-specific enclosure finders. Subclasses override
# handle() and find(); the defaults below never match anything.
package Plagger::Plugin::Filter::FindEnclosures::Site;

# Construct an empty plugin object.
sub new {
    my $class = shift;
    return bless {}, $class;
}

# Default: this plugin does not handle any URL.
sub handle { return 0 }

# Default: no enclosure is found.
sub find { return }
191
192 1;
193
194 __END__
195
196 =head1 NAME
197
198 Plagger::Plugin::Filter::FindEnclosures - Auto-find enclosures from entry content using B<< <a> >> / B<< <embed> >> tags
199
200 =head1 SYNOPSIS
201
202   - module: Filter::FindEnclosures
203
204 =head1 DESCRIPTION
205
206 This plugin finds enclosures from C<< $entry->body >> by finding 1)
207 B<< <a> >> links with I<rel="enclosure"> attribute, 2) B<< <a> >>
208 links to any URL which filename extensions match with known
209 audio/video formats and 3) I<src> attributes in B<< <img> >> and B<< <embed> >> tags.
210
211 For example:
212
213   Listen to the <a href="http://example.com/foobar.mp3">Podcast</a> now, or <a rel="enclosure"
214   href="http://example.com/foobar.m4a">download AAC version</a>. <img src="/img/logo.gif" />
215
216 Those 3 links (I<foobar.mp3>, I<foobar.m4a> and I<logo.gif>) are
217 extracted as enclosures, while I<logo.gif> is marked as "inline", so
218 that it won't appear as an enclosure in Publish::Feed.
219
220 You might want to also use Filter::HEADEnclosureMetadata plugin to
221 know the actual length (bytes-length) of enclosures by sending HEAD
222 requests.
223
224 =head1 AUTHOR
225
226 Tatsuhiko Miyagawa
227
228 Masahiro Nagano
229
230 =head1 SEE ALSO
231
232 L<Plagger>, L<Plagger::Plugin::Filter::HEADEnclosureMetadata>, L<http://www.msgilligan.com/rss-enclosure-bp.html>, L<http://forums.feedburner.com/viewtopic.php?t=20>
233
234 =cut
235
Note: See TracBrowser for help on using the browser.