root/trunk/plagger/lib/Plagger/Plugin/Filter/FindEnclosures.pm

Revision 1741 (checked in by miyagawa, 14 years ago)

merge from hackathon-summary

Line 
1 package Plagger::Plugin::Filter::FindEnclosures;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use HTML::TokeParser;
6 use Plagger::Util qw( decode_content );
7 use List::Util qw(first);
8 use URI;
9 use DirHandle;
10 use Plagger::Enclosure;
11 use Plagger::UserAgent;
12
# Plugin registration entry point.
#
# Asks the context to autoload Filter::ResolveRelativeLink and then
# attaches filter() to the 'update.entry.fixup' hook.
sub register {
    my $self    = shift;
    my $context = shift;

    my %dependency = ( module => 'Filter::ResolveRelativeLink' );
    $context->autoload_plugin(\%dependency);

    $context->register_hook( $self, 'update.entry.fixup' => \&filter );
}
22
# Initialise the plugin: run the parent class initialiser with the same
# arguments, load the site-specific plugins from the assets directory,
# and keep a Plagger::UserAgent instance in $self->{ua}.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);
    $self->load_plugins;

    $self->{ua} = Plagger::UserAgent->new();
}
30
# Load every site plugin found in this plugin's assets directory.
#
# Regular files whose name ends in ".pl" or ".yaml" are handed to
# load_plugin() in sorted (by file name) order, as
# load_plugin($full_path, $file_name).  A directory that cannot be
# opened is reported through the context's error() method.
sub load_plugins {
    my $self = shift;
    my $context = Plagger->context;

    # File::Spec is used below but is not imported at the top of this
    # file; load it explicitly instead of relying on some other module
    # having pulled it in already.
    require File::Spec;

    my $dir = $self->assets_dir;
    my $dh  = DirHandle->new($dir);
    unless ($dh) {
        $context->error("$dir: $!");
        return;    # only reached when error() does not die
    }

    for my $name (sort $dh->read) {
        next unless $name =~ /\.(?:pl|yaml)$/;

        my $path = File::Spec->catfile($dir, $name);
        next unless -f $path;    # skips sub-directories such as "foo.pl/"

        $self->load_plugin($path, $name);
    }

    return;
}
42
# Load one site plugin file and remember the resulting plugin object.
#
#   $file - path of the plugin file
#   $base - file name of the plugin, passed through to the loader
#
# Files ending in ".pl" go to load_plugin_perl(); everything else goes
# to load_plugin_yaml().  Whatever the loader returns is appended to
# $self->{plugins}.
sub load_plugin {
    my($self, $file, $base) = @_;

    my $context = Plagger->context;
    $context->log(debug => "loading $file");

    my $load_method;
    if ($file =~ /\.pl$/) {
        $load_method = 'load_plugin_perl';
    }
    else {
        $load_method = 'load_plugin_yaml';
    }

    push @{ $self->{plugins} }, $self->$load_method($file, $base);
}
51
# Compile a Perl site plugin and return an instance of it.
#
#   $file - path of the .pl file to read
#   $base - its file name; with the ".pl" suffix removed this becomes
#           the last component of the plugin's package name
#
# When the file does not begin with its own "package" statement the
# code is wrapped in a generated package that inherits from
# Plagger::Plugin::Filter::FindEnclosures::Site and defines site_name().
# Read and compile errors are reported through the context's error().
#
# NOTE(review): the file content is compiled with string eval, so asset
# files must be trusted.
sub load_plugin_perl {
    my($self, $file, $base) = @_;

    open my $fh, '<', $file or Plagger->context->error("$file: $!");

    (my $pkg = $base) =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::Filter::FindEnclosures::Site::$pkg";

    my @lines = <$fh>;
    my $code  = join '', @lines;

    if ($code !~ /^\s*package/s) {
        # No package declaration up front: supply the boilerplate.
        my $prologue = "package $plugin_class;\n"
                     . "use strict;\n"
                     . "use base qw( Plagger::Plugin::Filter::FindEnclosures::Site );\n"
                     . "sub site_name { '$pkg' }\n";
        $code = $prologue . $code . "\n1;";
    }

    eval $code;
    if ($@) {
        Plagger->context->error($@);
    }

    return $plugin_class->new;
}
75
# Placeholder for YAML site plugins: always reports "NOT IMPLEMENTED YET"
# through the context's error() method.
sub load_plugin_yaml {
    my $context = Plagger->context;
    return $context->error("NOT IMPLEMENTED YET");
}
77
# Hook callback for 'update.entry.fixup'.
#
# First offers the entry's permalink to add_enclosure() as if it were an
# <a href> link, in case the entry links directly to a media file.  Then,
# if the entry has a body, walks its <a>, <embed>, <img> and <object>
# tags and hands each one to the matching enclosure finder.
sub filter {
    my($self, $context, $args) = @_;
    my $entry = $args->{entry};

    $self->add_enclosure($entry, [ 'a', { href => $entry->permalink } ], 'href' );

    return unless $entry->body;

    my $parser = HTML::TokeParser->new(\$entry->body->data);

    # One handler per tag name; each receives the token from get_tag().
    my %handler_for = (
        a      => sub { $self->add_enclosure($entry, $_[0], 'href') },
        embed  => sub { $self->add_enclosure($entry, $_[0], 'src', { type => $_[0]->[1]->{type} }) },
        img    => sub { $self->add_enclosure($entry, $_[0], 'src', { inline => 1 }) },
        object => sub { $self->add_enclosure_from_object($entry, $parser) },
    );

    while (my $tag = $parser->get_tag('a', 'embed', 'img', 'object')) {
        my $handler = $handler_for{ $tag->[0] } or next;
        $handler->($tag);
    }
}
99
# Find a media URL inside an <object> element and add it to the entry
# as an enclosure.
#
#   $entry  - entry object that receives the enclosure
#   $parser - token parser positioned just after the <object> start tag;
#             it is advanced up to and including the matching </object>
#
# The URL is looked up in this order:
#   1. a value in <param name="flashvars"> that is an http(s) URL
#      containing ".flv";
#   2. the first value in flashvars that is any http(s) URL;
#   3. the value of <param name="movie">.
# Nothing is added when none of these yields a URL.
sub add_enclosure_from_object {
    my($self, $entry, $parser) = @_;

    # Collect the <param> tags that belong to this <object>.
    my @params;
    while (my $tag = $parser->get_tag('param', '/object')) {
        last if $tag->[0] eq '/object';
        push @params, $tag;
    }

    # Look for a URL inside the flashvars parameter.  <param> tags without
    # a name attribute are tolerated (treated as an empty name).
    my $url;
    if (my $flashvars = first { lc($_->[1]->{name} || '') eq 'flashvars' } @params) {
        # Split into key=value pairs first and only then on the first '=',
        # so bare flags and values that contain '=' cannot shift keys and
        # values against each other.  Values keep their document order,
        # which makes the pick deterministic.
        my @values;
        for my $pair (split /&/, $flashvars->[1]->{value} || '') {
            my(undef, $value) = split /=/, $pair, 2;
            push @values, $value if defined $value;
        }

        # "\.flv", not "\flv": "\f" is the form feed escape and never
        # matched a real FLV URL.
        $url   = first { m!^https?://.*\.flv! } @values;
        $url ||= first { m!^https?://! } @values;
    }

    # If no URL was found in flashvars, fall back to <param name="movie" />.
    if (!$url) {
        my $movie = first { lc($_->[1]->{name} || '') eq 'movie' } @params;
        $url = $movie->[1]->{value} if $movie;
    }

    if ($url) {
        Plagger->context->log(info => "Found enclosure $url");
        my $enclosure = Plagger::Enclosure->new;
        $enclosure->url( URI->new($url) );
        $enclosure->auto_set_type;
        $entry->add_enclosure($enclosure); # XXX inline?
    }

    return;
}
132
# Examine one tag and, if it points at media, add an enclosure to the
# entry.
#
#   $entry - entry object that receives the enclosure
#   $tag   - token as returned by the tag parser: [ $name, \%attributes ]
#   $attr  - name of the attribute holding the URL ('href' or 'src')
#   $opt   - optional hash ref; recognised keys:
#              type   - MIME type hint for the enclosure
#              inline - true to mark the enclosure as inline
#
# A tag that is_enclosure() accepts is added directly.  Otherwise every
# loaded site plugin whose handle() accepts the URL gets a chance to find
# an enclosure in the fetched page content; the first hit wins.  The page
# is fetched at most once, and the search stops if it cannot be fetched.
sub add_enclosure {
    my($self, $entry, $tag, $attr, $opt) = @_;
    $opt ||= {};

    # Tags without the URL attribute (e.g. <a name="top">) or an entry
    # without a permalink have nothing to offer; skip them instead of
    # passing undef on to the MIME lookup, the log and the site plugins.
    my $url = $tag->[1]{$attr};
    return unless defined $url && length $url;

    if ($self->is_enclosure($tag, $attr, $opt->{type})) {
        Plagger->context->log(info => "Found enclosure $url");
        my $enclosure = Plagger::Enclosure->new;
        $enclosure->url($url);
        $enclosure->auto_set_type($opt->{type});
        $enclosure->is_inline(1) if $opt->{inline};
        $entry->add_enclosure($enclosure);
        return;
    }

    my $content;
    for my $plugin (@{ $self->{plugins} || [] }) {
        next unless $plugin->handle($url);

        Plagger->context->log(debug => "Try $url with " . $plugin->site_name);

        # Fetch lazily, and only once for all interested plugins.
        $content ||= $self->fetch_content($url) or return;

        if (my $enclosure = $plugin->find({ content => $content, url => $url })) {
            Plagger->context->log(info => "Found enclosure " . $enclosure->url ." with " . $plugin->site_name);
            $entry->add_enclosure($enclosure);
            return;
        }
    }

    return;
}
162
# Fetch $url and return its content as processed by decode_content().
#
# Returns nothing when the response carries no status and reports an
# error.
sub fetch_content {
    my($self, $url) = @_;

    # Reuse the user agent created in init() rather than building a new
    # one for every request; create it on demand if init() has not run.
    my $ua = $self->{ua} ||= Plagger::UserAgent->new;

    # NoNetwork is given as 3 hours in seconds.
    # NOTE(review): presumably the window in which a cached copy is used
    # without contacting the network -- confirm against the fetch API.
    my $res = $ua->fetch($url, $self, { NoNetwork => 3 * 60 * 60 });
    return if !$res->status && $res->is_error;

    return decode_content($res);
}
172
# Decide whether a tag refers to an enclosure.
#
#   $tag  - token as returned by the tag parser: [ $name, \%attributes ]
#   $attr - name of the attribute holding the URL
#   $type - optional MIME type hint
#
# Returns 1 when the tag has rel="enclosure" or when
# has_enclosure_mime_type() accepts the URL/type; otherwise returns
# nothing.
sub is_enclosure {
    my($self, $tag, $attr, $type) = @_;
    my $attrs = $tag->[1];

    my $rel = $attrs->{rel};
    if ($rel && $rel eq 'enclosure') {
        return 1;
    }

    if ($self->has_enclosure_mime_type($attrs->{$attr}, $type)) {
        return 1;
    }

    return;
}
181
# Tell whether a URL (or an explicit MIME type) denotes enclosure media.
#
#   $url  - URL whose MIME type is looked up when no type is given
#   $type - optional MIME type string; when true it takes precedence
#           over the URL
#
# Returns the result of Plagger::Util::mime_is_enclosure() for the
# resulting MIME type object.
sub has_enclosure_mime_type {
    my($self, $url, $type) = @_;

    my $mime;
    if ($type) {
        $mime = MIME::Type->new(type => $type);
    }
    else {
        $mime = Plagger::Util::mime_type_of( URI->new($url) );
    }

    return Plagger::Util::mime_is_enclosure($mime);
}
188
# Base class for site plugins loaded from the assets directory.
# Subclasses override handle() and find().
package Plagger::Plugin::Filter::FindEnclosures::Site;

# Construct an empty plugin object of the invoking class.
sub new {
    my $class = shift;
    return bless {}, $class;
}

# Default: this plugin handles no URL.
sub handle { return 0 }

# Default: no enclosure found.
sub find { return }
193
194 1;
195
196 __END__
197
198 =head1 NAME
199
200 Plagger::Plugin::Filter::FindEnclosures - Auto-find enclosures from entry content using B<< <a> >> / B<< <embed> >> tags
201
202 =head1 SYNOPSIS
203
204   - module: Filter::FindEnclosures
205
206 =head1 DESCRIPTION
207
208 This plugin finds enclosures from C<< $entry->body >> by finding 1)
209 B<< <a> >> links with a I<rel="enclosure"> attribute, 2) B<< <a> >>
210 links to any URL whose filename extension matches a known
211 audio/video format and 3) I<src> attributes in B<< <img> >> and B<< <embed> >> tags.
212
213 For example:
214
215   Listen to the <a href="http://example.com/foobar.mp3">Podcast</a> now, or <a rel="enclosure"
216   href="http://example.com/foobar.m4a">download AAC version</a>. <img src="/img/logo.gif" />
217
218 Those 3 links (I<foobar.mp3>, I<foobar.m4a> and I<logo.gif>) are
219 extracted as enclosures, while I<logo.gif> is marked as "inline", so
220 that it won't appear as an enclosure in Publish::Feed.
221
222 You might want to also use Filter::HEADEnclosureMetadata plugin to
223 know the actual length (bytes-length) of enclosures by sending HEAD
224 requests.
225
226 =head1 AUTHOR
227
228 Tatsuhiko Miyagawa
229
230 Masahiro Nagano
231
232 =head1 SEE ALSO
233
234 L<Plagger>, L<Plagger::Plugin::Filter::HEADEnclosureMetadata>, L<http://www.msgilligan.com/rss-enclosure-bp.html>, L<http://forums.feedburner.com/viewtopic.php?t=20>
235
236 =cut
237
Note: See TracBrowser for help on using the browser.