root/trunk/plagger/lib/Plagger/Plugin/Aggregator/Simple.pm

Revision 1577 (checked in by miyagawa, 14 years ago)

support entry level image in RSS 0.91/2.0 I wonder why I haven't added support for this long time

  • Property svn:keywords set to Id Revision
Line 
1 package Plagger::Plugin::Aggregator::Simple;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use Feed::Find;
6 use Plagger::Enclosure;
7 use Plagger::FeedParser;
8 use Plagger::UserAgent;
9 use List::Util qw(first);
10 use UNIVERSAL::require;
11 use URI;
12
# Plugin entry point: subscribe this plugin to the 'customfeed.handle' hook so
# that aggregate() is invoked for feeds handed to that hook.
sub register {
    my $self    = shift;
    my $context = shift;

    my %hooks = ( 'customfeed.handle' => \&aggregate );

    $context->register_hook($self, %hooks);
}
20
# Hook handler for 'customfeed.handle'.
#
# Fetches the URL of $args->{feed}, asks Plagger::FeedParser->discover() which
# URL actually carries the feed, and hands the feed content to handle_feed().
# When the discovered URL differs from the subscribed one, the discovered URL
# is fetched first.
#
# Returns 1 once handle_feed() has been called; returns nothing when a fetch
# fails or no feed URL could be discovered.
sub aggregate {
    my($self, $context, $args) = @_;

    my $url = $args->{feed}->url;
    my $res = $self->fetch_content($url) or return;

    # Bail out before comparing: discover() yields nothing when the response
    # neither is a feed nor points to one.
    my $feed_url = Plagger::FeedParser->discover($res);
    return unless $feed_url;

    if ($url eq $feed_url) {
        # The subscribed URL is the feed itself.
        $self->handle_feed($url, \$res->content, $args->{feed});
    } else {
        # The subscribed URL only points at the feed; fetch the real one.
        $res = $self->fetch_content($feed_url) or return;
        $self->handle_feed($feed_url, \$res->content, $args->{feed});
    }

    return 1;
}
45
# Fetch $url with a fresh Plagger::UserAgent (HTML <head> parsing disabled) and
# log the outcome through the current Plagger context.
#
# Returns the response object, or nothing when the response reports an error.
sub fetch_content {
    my($self, $url) = @_;

    my $context = Plagger->context;
    $context->log(info => "Fetch $url");

    my $agent = Plagger::UserAgent->new;
    $agent->parse_head(0);

    my $response = $agent->fetch($url, $self);

    unless ($response->is_error) {
        # TODO: handle 301 Moved Permanently and 410 Gone
        $context->log(debug => $response->status . ": $url");
        return $response;
    }

    my $status  = $response->http_status;
    my $message = $response->http_response->message;
    $context->log(error => "GET $url failed: " . $status . " " . $message);

    return;
}
68
# Parse the XML in $$xml_ref as the feed found at $url, copy its metadata and
# entries into $feed, and register the result with the context's update list.
#
#   $url     - URL the content was fetched from; stored as the feed URL
#   $xml_ref - reference to the raw feed content
#   $feed    - feed object to fill; a new Plagger::Feed is created when omitted
#
# Hooks run:
#   aggregator.filter.feed - before parsing, with { content => ... }; the
#                            (possibly rewritten) content is what gets parsed
#   aggregator.entry.fixup - once per entry, before it is added to the feed
#
# Returns nothing when the feed cannot be parsed (an error is logged);
# otherwise returns whatever $context->update->add($feed) returns.
sub handle_feed {
    my($self, $url, $xml_ref, $feed) = @_;

    my $context = Plagger->context;

    my $args = { content => $$xml_ref };
    $context->run_hook('aggregator.filter.feed', $args);

    # Treat "died" and "returned nothing" alike, so a parser that signals
    # failure without dying cannot crash the method calls below.
    my $remote = eval { Plagger::FeedParser->parse(\$args->{content}) };
    if ($@ || !$remote) {
        my $reason = $@ || 'parser returned no feed';
        $context->log(error => "Parser $url failed: $reason");
        return;
    }

    $feed ||= Plagger::Feed->new;
    $feed->title(_u($remote->title)) unless defined $feed->title;
    $feed->url($url);
    $feed->link($remote->link);
    $feed->description(_u($remote->tagline)); # xxx should support Atom 1.0
    $feed->language($remote->language);
    $feed->author(_u($remote->author));
    $feed->updated($remote->modified);

    # Note: this flags the caller's buffer in place, and stores the original
    # (unfiltered) content as the feed's source XML.
    Encode::_utf8_on($$xml_ref);
    $feed->source_xml($$xml_ref);

    if ($remote->format eq 'Atom') {
        $feed->id( $remote->{atom}->id );
    }

    # Feed-level image: RSS <image> or Atom <logo>.
    if ($remote->format =~ /^RSS/) {
        $feed->image( \%{$remote->{rss}->image} )
            if $remote->{rss}->image;
    } elsif ($remote->format eq 'Atom') {
        $feed->image({ url => $remote->{atom}->logo })
            if $remote->{atom}->logo;
    }

    for my $e ($remote->entries) {
        my $entry = Plagger::Entry->new;
        $entry->title(_u($e->title));
        $entry->author(_u($e->author));

        # Normalise a single category into a one-element list.
        my $category = $e->category;
           $category = [ $category ] if $category && ref($category) ne 'ARRAY';
        $entry->tags([ map _u($_), @$category ]) if $category;

        # XXX XML::Feed doesn't support extracting atom:category yet
        if ($remote->format eq 'Atom' && $e->{entry}->can('categories')) {
            my @categories = $e->{entry}->categories;
            for my $cat (@categories) {
                $entry->add_tag( _u($cat->label || $cat->term) );
            }
        }

        my $date = eval { $e->issued } || eval { $e->modified };
        $entry->date( Plagger::Date->rebless($date) ) if $date;

        # xxx nasty hack. We should remove this once XML::Atom or XML::Feed is fixed
        if (!$entry->date && $remote->format eq 'Atom' && $e->{entry}->version eq '1.0') {
            if ( $e->{entry}->published ) {
                my $dt = XML::Atom::Util::iso2dt( $e->{entry}->published );
                $entry->date( Plagger::Date->rebless($dt) ) if $dt;
            }
        }

        $entry->link($e->link);
        $entry->feed_link($feed->link);
        $entry->id($e->id);
        $entry->body(_u($e->content->body || $e->summary->body));

        # enclosure support, to be added to XML::Feed
        if ($remote->format =~ /^RSS / and my $encls = $e->{entry}->{enclosure}) {
            # some RSS feeds contain multiple enclosures, and we support them
            $encls = [ $encls ] unless ref($encls) eq 'ARRAY';

            for my $encl (@$encls) {
                my $enclosure = Plagger::Enclosure->new;
                $enclosure->url( URI->new($encl->{url}) );
                $enclosure->length($encl->{length});
                $enclosure->auto_set_type($encl->{type});
                $entry->add_enclosure($enclosure);
            }
        } elsif ($remote->format eq 'Atom') {
            for my $link ( grep { defined $_->rel && $_->rel eq 'enclosure' } $e->{entry}->link ) {
                my $enclosure = Plagger::Enclosure->new;
                $enclosure->url( URI->new($link->href) );
                $enclosure->length($link->length);
                $enclosure->auto_set_type($link->type);
                $entry->add_enclosure($enclosure);
            }
        }

        # entry image support
        if ($remote->format =~ /^RSS / and my $img = $e->{entry}->{image}) {
            $entry->icon(\%$img);
        }

        # TODO: move MediaRSS, Hatena, iTunes and those specific parser to be subclassed

        # Media RSS: elements live either inside <media:group> or directly on
        # the entry.
        my $media_ns = "http://search.yahoo.com/mrss";
        my $media = $e->{entry}->{$media_ns}->{group} || $e->{entry};
        my $content = $media->{$media_ns}->{content} || [];
           $content = [ $content ] unless ref($content) eq 'ARRAY';

        for my $media_content (@{$content}) {
            # A text-only element has no attributes to read; dereferencing
            # it as a hash would die under strict refs.
            next unless ref $media_content;

            my $enclosure = Plagger::Enclosure->new;
            $enclosure->url( URI->new($media_content->{url}) );
            $enclosure->auto_set_type($media_content->{type});
            $entry->add_enclosure($enclosure);
        }

        # Several <media:thumbnail> elements may arrive as an array ref;
        # use the first one instead of dying on a hash dereference.
        my $thumbnail = $media->{$media_ns}->{thumbnail};
           $thumbnail = $thumbnail->[0] if ref($thumbnail) eq 'ARRAY';

        if (ref $thumbnail) {
            $entry->icon({
                url    => $thumbnail->{url},
                width  => $thumbnail->{width},
                height => $thumbnail->{height},
            });
        }

        # Hatena Image extensions
        my $hatena = $e->{entry}->{"http://www.hatena.ne.jp/info/xmlns#"} || {};
        if ($hatena->{imageurl}) {
            my $enclosure = Plagger::Enclosure->new;
            # Wrapped in URI->new like every other enclosure URL in this method.
            $enclosure->url( URI->new($hatena->{imageurl}) );
            $enclosure->auto_set_type;
            $entry->add_enclosure($enclosure);
        }

        if ($hatena->{imageurlsmall}) {
            $entry->icon({ url => $hatena->{imageurlsmall} });
        }

        # Apple photocast feed
        my $apple = $e->{entry}->{"http://www.apple.com/ilife/wallpapers"} || {};
        if ($apple->{image}) {
            my $enclosure = Plagger::Enclosure->new;
            $enclosure->url( URI->new($apple->{image}) );
            $enclosure->auto_set_type;
            $entry->add_enclosure($enclosure);
        }
        if ($apple->{thumbnail}) {
            $entry->icon({ url => $apple->{thumbnail} });
        }

        # Let other plugins adjust the entry with access to the parsed originals.
        my $fixup_args = {
            entry      => $entry,
            feed       => $feed,
            orig_entry => $e,
            orig_feed  => $remote,
        };
        $context->run_hook('aggregator.entry.fixup', $fixup_args);

        $feed->add_entry($entry);
    }

    $context->log(info => "Aggregate $url success: " . $feed->count . " entries.");
    $context->update->add($feed);
}
229
# Return a copy of the argument with Perl's internal UTF-8 flag switched on.
# The bytes are only flagged, not decoded, and the caller's own variable is
# left as it was.
sub _u {
    my($text) = @_;

    Encode::_utf8_on($text);

    return $text;
}
235
236 1;
237
238 __END__
239
240 =head1 NAME
241
242 Plagger::Plugin::Aggregator::Simple - Dumb simple aggregator
243
244 =head1 SYNOPSIS
245
246   - module: Aggregator::Simple
247
248 =head1 DESCRIPTION
249
250 This plugin implements a dumb Plagger aggregator. It crawls
251 subscriptions sequentially and parses XML feeds using the L<XML::Feed>
252 module.
253
254 It can also be used as a base class for custom aggregators. See
255 L<Plagger::Plugin::Aggregator::Xango> for an example.
256
257 =head1 AUTHOR
258
259 Tatsuhiko Miyagawa
260
261 =head1 SEE ALSO
262
263 L<Plagger>, L<XML::Feed>, L<XML::RSS::LibXML>
264
265 =cut
Note: See TracBrowser for help on using the browser.