root/trunk/plagger/lib/Plagger/Plugin/Aggregator/Simple.pm

Revision 866 (checked in by miyagawa, 14 years ago)

more Photos! support Apple's evil photocast namespace.

  • Property svn:keywords set to Id Revision
Line 
1 package Plagger::Plugin::Aggregator::Simple;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use Feed::Find;
6 use Plagger::Enclosure;
7 use Plagger::UserAgent;
8 use List::Util qw(first);
9 use UNIVERSAL::require;
10 use URI;
11 use XML::Feed;
12 use XML::Feed::RSS;
13
# Choose the RSS parser backend for XML::Feed: candidates are tried in order
# of preference and the first class that can be loaded wins. The variable is
# left undefined when none of them loads.
$XML::Feed::RSS::PREFERRED_PARSER = do {
    my $chosen;
    for my $parser_class (qw( XML::RSS::Liberal XML::RSS::LibXML XML::RSS )) {
        next unless $parser_class->require;
        $chosen = $parser_class;
        last;
    }
    $chosen;
};
15
16 #eval { require XML::Liberal };
17 #if (!$@ && XML::Liberal->can('globally_override')) {
18 #    XML::Liberal->globally_override('LibXML');
19 #}
20
# Plugin entry point: subscribes this plugin to the 'customfeed.handle' hook
# so that the context dispatches those events to aggregate().
sub register {
    my $self    = shift;
    my $context = shift;

    my %hooks = ( 'customfeed.handle' => \&aggregate );
    $context->register_hook($self, %hooks);
}
28
# Hook handler for 'customfeed.handle': fetch the subscribed URL and
# aggregate whatever feed it leads to.
#
#   $context - the Plagger context passed by the hook dispatcher (unused here)
#   $args    - hook arguments; $args->{feed} is the subscription feed object
#              whose url() is fetched
#
# When the response is a feed (judged by its media type, or by sniffing the
# body) it is parsed directly. Otherwise the body is treated as HTML, feed
# auto-discovery is run on it, and the first discovered feed is fetched and
# parsed instead.
#
# Returns 1 once a document has been handed to handle_feed(), and nothing
# (false) when a fetch failed or no feed could be discovered. The return
# value does not reflect whether handle_feed() managed to parse the document.
sub aggregate {
    my($self, $context, $args) = @_;

    my $url = $args->{feed}->url;
    my $res = $self->fetch_content($url) or return;

    # Prefer the response's own content type; fall back to the underlying
    # HTTP response, and finally assume XML.
    my $content_type = eval { $res->content_type } ||
                       $res->http_response->content_type ||
                       "text/xml";

    # Strip parameters such as "; charset=utf-8": the %Feed::Find::IsFeed
    # lookup below is by bare media type, so a parameterised value never hits.
    $content_type =~ s/\s*;.*\z//s;

    # An empty body is treated as an empty string so the sniffing regex and
    # the HTML auto-discovery never operate on undef.
    my $content = $res->content;
    $content = '' unless defined $content;

    if ( $Feed::Find::IsFeed{$content_type} || $self->looks_like_feed(\$content) ) {
        $self->handle_feed($url, \$content, $args->{feed});
    } else {
        my @feeds = Feed::Find->find_in_html(\$content, $url);
        return unless @feeds;

        # Only the first auto-discovered feed is followed.
        $url = $feeds[0];
        $res = $self->fetch_content($url) or return;
        $self->handle_feed($url, \$res->content, $args->{feed});
    }

    return 1;
}
55
# Sniff a document body for the root element of an RSS 0.9x/2.0, RSS 1.0
# (RDF) or Atom feed.
#
#   $content_ref - reference to the document text
#
# Returns true when a feed-like root element is found. The marker attribute
# (version= for <rss>, the default xmlns= for <rdf:RDF> and <feed>) may appear
# anywhere inside the start tag and may use either quote style, so roots that
# declare other namespaces first are still recognised.
sub looks_like_feed {
    my($self, $content_ref) = @_;

    return $$content_ref =~ m{
          <rss     \s [^>]*? \b version= ["']
        | <rdf:RDF \s [^>]*? \b xmlns=   ["'] http://purl\.org/rss
        | <feed    \s [^>]*? \b xmlns=   ["']
    }xs;
}
60
# Fetch $url with a fresh Plagger::UserAgent, logging the attempt and its
# outcome through the Plagger context.
#
# Returns the response object on success. When the response reports an
# error, the failure is logged at error level and nothing is returned.
sub fetch_content {
    my $self = shift;
    my $url  = shift;

    my $ctx = Plagger->context;
    $ctx->log(info => "Fetch $url");

    my $res = Plagger::UserAgent->new->fetch($url, $self);

    unless ($res->is_error) {
        # TODO: handle 301 Moved Permanently and 410 Gone
        $ctx->log(debug => $res->status . ": $url");
        return $res;
    }

    my $status  = $res->http_status;
    my $message = $res->http_response->message;
    $ctx->log(error => "GET $url failed: " . $status . " " . $message);
    return;
}
82
# Parse a fetched feed document and populate a Plagger feed with its entries.
#
#   $url     - URL the document was fetched from; stored as the feed URL
#   $xml_ref - reference to the raw feed document
#   $feed    - optional feed object to populate; a new Plagger::Feed is
#              created when it is omitted
#
# The document is run through the 'aggregator.filter.feed' hook and then
# parsed with XML::Feed. Feed-level metadata is copied over, and every remote
# entry is converted to a Plagger::Entry, including enclosures and icons from
# RSS enclosures, Atom enclosure links, Media RSS, the Hatena image extension
# and Apple photocasts. Each entry is passed to the 'aggregator.entry.fixup'
# hook before being added to the feed.
#
# On a parse failure the error is logged and nothing is returned. On success
# the feed is added to the context's update object.
sub handle_feed {
    my($self, $url, $xml_ref, $feed) = @_;

    my $context = Plagger->context;

    # Filter plugins may rewrite $args->{content} before it is parsed.
    my $args = { content => $$xml_ref };
    $context->run_hook('aggregator.filter.feed', $args);

    my $remote = eval { XML::Feed->parse(\$args->{content}) };

    unless ($remote) {
        $context->log(error => "Parsing $url failed. " . ($@ || XML::Feed->errstr));
        return;
    }

    $feed ||= Plagger::Feed->new;
    $feed->title(_u($remote->title));
    $feed->url($url);
    $feed->link($remote->link);
    $feed->description(_u($remote->tagline)); # xxx should support Atom 1.0
    $feed->language($remote->language);
    $feed->author(_u($remote->author));
    $feed->updated($remote->modified);
    $feed->source_xml($$xml_ref); # the document as fetched, not the filtered copy

    my $format = $remote->format;

    if ($format eq 'Atom') {
        $feed->id( $remote->{atom}->id );
    }

    if ($format =~ /^RSS/) {
        $feed->image( $remote->{rss}->image )
            if $remote->{rss}->image;
    } elsif ($format eq 'Atom') {
        $feed->image({ url => $remote->{atom}->logo })
            if $remote->{atom}->logo;
    }

    for my $e ($remote->entries) {
        my $entry = Plagger::Entry->new;
        $entry->title(_u($e->title));
        $entry->author(_u($e->author));

        # A single category comes back as a plain string; normalize to a list.
        my $category = $e->category;
           $category = [ $category ] if $category && !ref($category);
        $entry->tags([ map _u($_), @$category ]) if $category;

        # issued() is called once, inside eval, so a failing accessor just
        # leaves the entry without a date.
        my $issued = eval { $e->issued };
        $entry->date( Plagger::Date->rebless($issued) ) if $issued;

        # xxx nasty hack. We should remove this once XML::Atom or XML::Feed is fixed
        if (!$entry->date && $format eq 'Atom' && $e->{entry}->version eq '1.0') {
            if ( $e->{entry}->published ) {
                my $dt = XML::Atom::Util::iso2dt( $e->{entry}->published );
                $entry->date( Plagger::Date->rebless($dt) ) if $dt;
            }
        }

        $entry->link($e->link);
        $entry->feed_link($feed->link);
        $entry->id($e->id);
        $entry->body(_u($e->content->body || $e->summary->body));

        # enclosure support, to be added to XML::Feed
        if ($format =~ /^RSS / && $e->{entry}->{enclosure}) {
            my $rss_enclosure = $e->{entry}->{enclosure};
            my $enclosure = Plagger::Enclosure->new;
            $enclosure->url( URI->new($rss_enclosure->{url}) );
            $enclosure->length($rss_enclosure->{length});
            $enclosure->auto_set_type($rss_enclosure->{type});
            $entry->add_enclosure($enclosure);
        } elsif ($format eq 'Atom') {
            # rel may be absent on a link; treat that as "not an enclosure".
            for my $link ( grep { ($_->rel || '') eq 'enclosure' } $e->{entry}->link ) {
                my $enclosure = Plagger::Enclosure->new;
                $enclosure->url( URI->new($link->href) );
                $enclosure->length($link->length);
                $enclosure->auto_set_type($link->type);
                $entry->add_enclosure($enclosure);
            }
        }

        # TODO: move MediaRSS, Hatena, iTunes and those specific parser to be subclassed

        # Media RSS: elements live either inside <media:group> or directly on
        # the entry. Looked up step by step so that nothing is autovivified
        # on the parsed entry.
        my $media_ns   = "http://search.yahoo.com/mrss";
        my $media_ext  = $e->{entry}->{$media_ns};
        my $media      = ($media_ext && $media_ext->{group}) || $e->{entry};
        my $media_data = $media->{$media_ns} || {};

        # A lone <media:content> may arrive as a single value or a single
        # non-array reference rather than a list; only a real array is used
        # as-is, anything else is wrapped.
        my $content = $media_data->{content} || [];
           $content = [ $content ] unless ref($content) eq 'ARRAY';

        for my $media_content (@{$content}) {
            my $enclosure = Plagger::Enclosure->new;
            $enclosure->url( URI->new($media_content->{url}) );
            $enclosure->auto_set_type($media_content->{type});
            $entry->add_enclosure($enclosure);
        }

        # With several thumbnails, the first one becomes the icon.
        my $thumbnail = $media_data->{thumbnail};
           $thumbnail = $thumbnail->[0] if ref($thumbnail) eq 'ARRAY';
        if ($thumbnail) {
            $entry->icon({
                url    => $thumbnail->{url},
                width  => $thumbnail->{width},
                height => $thumbnail->{height},
            });
        }

        # Hatena Image extensions
        my $hatena = $e->{entry}->{"http://www.hatena.ne.jp/info/xmlns#"} || {};
        if ($hatena->{imageurl}) {
            my $enclosure = Plagger::Enclosure->new;
            # Wrapped in URI like every other enclosure URL in this method.
            $enclosure->url( URI->new($hatena->{imageurl}) );
            $enclosure->auto_set_type;
            $entry->add_enclosure($enclosure);
        }

        if ($hatena->{imageurlsmall}) {
            $entry->icon({ url => $hatena->{imageurlsmall} });
        }

        # Apple photocast feed
        my $apple = $e->{entry}->{"http://www.apple.com/ilife/wallpapers"} || {};
        if ($apple->{image}) {
            my $enclosure = Plagger::Enclosure->new;
            $enclosure->url( URI->new($apple->{image}) );
            $enclosure->auto_set_type;
            $entry->add_enclosure($enclosure);
        }
        if ($apple->{thumbnail}) {
            $entry->icon({ url => $apple->{thumbnail} });
        }

        # Let fixup plugins adjust the entry, with access to the parsed
        # originals, before it joins the feed.
        my $fixup_args = {
            entry      => $entry,
            feed       => $feed,
            orig_entry => $e,
            orig_feed  => $remote,
        };
        $context->run_hook('aggregator.entry.fixup', $fixup_args);

        $feed->add_entry($entry);
    }

    $context->log(info => "Aggregate $url success: " . $feed->count . " entries.");
    $context->update->add($feed);
}
224
# Return a copy of $str with Perl's internal UTF-8 flag switched on, so that
# the bytes handed back by the feed parser are treated as characters.
# The caller's value is not modified. The bytes are not validated here --
# NOTE(review): this presumes the parser always hands back UTF-8; confirm.
sub _u {
    my $str = shift;

    # This file never loads Encode itself; require it here rather than
    # relying on another module having loaded it first.
    require Encode;
    Encode::_utf8_on($str);

    $str;
}
230
231 1;
232
233 __END__
234
235 =head1 NAME
236
237 Plagger::Plugin::Aggregator::Simple - Dumb simple aggregator
238
239 =head1 SYNOPSIS
240
241   - module: Aggregator::Simple
242
243 =head1 DESCRIPTION
244
245 This plugin implements a dumb, simple aggregator for Plagger. It crawls
246 subscriptions sequentially and parses XML feeds using the L<XML::Feed>
247 module.
248
249 It can also be used as a base class for custom aggregators. See
250 L<Plagger::Plugin::Aggregator::Xango> for an example.
251
252 =head1 AUTHOR
253
254 Tatsuhiko Miyagawa
255
256 =head1 SEE ALSO
257
258 L<Plagger>, L<XML::Feed>, L<XML::RSS::LibXML>
259
260 =cut
Note: See TracBrowser for help on using the browser.