root/trunk/plagger/lib/Plagger/Plugin/Aggregator/Simple.pm

Revision 1389 (checked in by miyagawa, 14 years ago)

Added Publish::Serializer via http://naoya.g.hatena.ne.jp/naoya/20060315/1142433649

  • Property svn:keywords set to Id Revision
Line 
1 package Plagger::Plugin::Aggregator::Simple;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use Feed::Find;
6 use Plagger::Enclosure;
7 use Plagger::UserAgent;
8 use List::Util qw(first);
9 use UNIVERSAL::require;
10 use URI;
11 use XML::Feed;
12 use XML::Feed::RSS;
13
# Tell XML::Feed::RSS which RSS parser to use: the first candidate in this
# list (most preferred first) whose ->require call succeeds.
$XML::Feed::RSS::PREFERRED_PARSER =
    first { $_->require } qw( XML::RSS::Liberal XML::RSS::LibXML XML::RSS );
15
# Hook this plugin into the given context: aggregate() is installed as the
# handler for the 'customfeed.handle' hook.
sub register {
    my($self, $context) = @_;

    my %hooks = ( 'customfeed.handle' => \&aggregate );
    $context->register_hook($self, %hooks);
}
23
# Handler for the 'customfeed.handle' hook.
#
# Fetches the URL of $args->{feed}. When the response is a feed (judged by
# its content type or by looks_like_feed), it is handed to handle_feed.
# Otherwise the body is treated as HTML, scanned for feed links, and the
# first link found is fetched and handed to handle_feed instead.
#
# Returns 1 once handle_feed has been called; returns nothing when a fetch
# fails or when no feed link is found in the HTML.
sub aggregate {
    my($self, $context, $args) = @_;

    my $feed     = $args->{feed};
    my $feed_url = $feed->url;
    my $response = $self->fetch_content($feed_url) or return;

    my $type = eval { $response->content_type }
            || $response->http_response->content_type
            || "text/xml";
    $type =~ s/;.*$//; # drop parameters such as charset=

    my $body = $response->content;
    if ( $Feed::Find::IsFeed{$type} || $self->looks_like_feed(\$body) ) {
        $self->handle_feed($feed_url, \$body, $feed);
        return 1;
    }

    # Not a feed: look for feed links inside the (decoded) HTML page.
    $body = Plagger::Util::decode_content($response);
    my @found = Feed::Find->find_in_html(\$body, $feed_url);
    return unless @found;

    my $found_url      = $found[0];
    my $found_response = $self->fetch_content($found_url) or return;
    $self->handle_feed($found_url, \$found_response->content, $feed);

    return 1;
}
53
# Sniff a document for feed markup.
#
#   $content_ref - reference to the document text
#
# Returns the result of matching the text against three markers: an
# "<rss " tag, an <rdf:RDF> tag declaring a purl.org/rss default namespace,
# or a <feed> tag whose first attribute is xmlns.
sub looks_like_feed {
    my($self, $content_ref) = @_;
    return ${$content_ref} =~ m!<rss |<rdf:RDF\s+.*?xmlns="http://purl\.org/rss|<feed\s+xmlns="!s;
}
58
# Fetch $url through a Plagger::UserAgent (with parse_head turned off).
#
# Returns the response object on success. When the response reports an
# error, the HTTP status and message are logged and nothing is returned.
sub fetch_content {
    my($self, $url) = @_;

    my $context = Plagger->context;
    $context->log(info => "Fetch $url");

    my $agent = Plagger::UserAgent->new;
    $agent->parse_head(0);
    my $response = $agent->fetch($url, $self);

    unless ($response->is_error) {
        # TODO: handle 301 Moved Permanently and 410 Gone
        $context->log(debug => $response->status . ": $url");
        return $response;
    }

    my $status  = $response->http_status;
    my $message = $response->http_response->message;
    $context->log(error => "GET $url failed: $status $message");
    return;
}
81
# Parse a feed document and add the resulting feed to the current update.
#
#   $url     - URL the document came from; stored as the feed's URL
#   $xml_ref - reference to the raw feed document (its UTF-8 flag is turned
#              on in place before it is stored as the feed's source_xml)
#   $feed    - feed object to fill in; a new Plagger::Feed is created when
#              this is false
#
# Runs the 'aggregator.filter.feed' hook on the document before parsing and
# the 'aggregator.entry.fixup' hook on every entry before it is added.
#
# Returns nothing, after logging an error, when XML::Feed cannot parse the
# document. Otherwise returns the result of $context->update->add($feed).
sub handle_feed {
    my($self, $url, $xml_ref, $feed) = @_;

    my $context = Plagger->context;

    # Hooks may rewrite $args->{content}; that copy is what gets parsed.
    my $args = { content => $$xml_ref };
    $context->run_hook('aggregator.filter.feed', $args);

    # override XML::LibXML with Liberal
    # NOTE(review): $sweeper is never read; presumably it only keeps the
    # override alive for the duration of this call -- confirm against
    # XML::Liberal.
    my $sweeper; # XML::Liberal >= 0.13

    eval { require XML::Liberal };
    if (!$@ && $XML::Liberal::VERSION >= 0.10) {
        $sweeper = XML::Liberal->globally_override('LibXML');
    }

    local $XML::Atom::ForceUnicode = 1;
    my $remote = eval { XML::Feed->parse(\$args->{content}) };

    unless ($remote) {
        $context->log(error => "Parsing $url failed. " . ($@ || XML::Feed->errstr));
        return;
    }

    $feed ||= Plagger::Feed->new;
    # A title that is already set on the feed wins over the remote one.
    $feed->title(_u($remote->title)) unless defined $feed->title;
    $feed->url($url);
    $feed->link($remote->link);
    $feed->description(_u($remote->tagline)); # xxx should support Atom 1.0
    $feed->language($remote->language);
    $feed->author(_u($remote->author));
    $feed->updated($remote->modified);

    # source_xml keeps the document as passed in, not the filtered copy.
    Encode::_utf8_on($$xml_ref);
    $feed->source_xml($$xml_ref);

    if ($remote->format eq 'Atom') {
        $feed->id( $remote->{atom}->id );
    }

    if ($remote->format =~ /^RSS/) {
        $feed->image( $remote->{rss}->image )
            if $remote->{rss}->image;
    } elsif ($remote->format eq 'Atom') {
        $feed->image({ url => $remote->{atom}->logo })
            if $remote->{atom}->logo;
    }

    for my $e ($remote->entries) {
        my $entry = Plagger::Entry->new;
        $entry->title(_u($e->title));
        $entry->author(_u($e->author));

        # category may be a single value or an array reference
        my $category = $e->category;
           $category = [ $category ] if $category && (!ref($category) || ref($category) ne 'ARRAY');
        $entry->tags([ map _u($_), @$category ]) if $category;

        # XXX XML::Feed doesn't support extracting atom:category yet
        if ($remote->format eq 'Atom' && $e->{entry}->can('categories')) {
            my @categories = $e->{entry}->categories;
            for my $cat (@categories) {
                $entry->add_tag( _u($cat->label || $cat->term) );
            }
        }

        # Prefer the issued date, fall back to the modified date; either
        # accessor is allowed to die.
        my $date = eval { $e->issued } || eval { $e->modified };
        $entry->date( Plagger::Date->rebless($date) ) if $date;

        # xxx nasty hack. We should remove this once XML::Atom or XML::Feed is fixed
        if (!$entry->date && $remote->format eq 'Atom' && $e->{entry}->version eq '1.0') {
            if ( $e->{entry}->published ) {
                my $dt = XML::Atom::Util::iso2dt( $e->{entry}->published );
                $entry->date( Plagger::Date->rebless($dt) ) if $dt;
            }
        }

        $entry->link($e->link);
        $entry->feed_link($feed->link);
        $entry->id($e->id);
        $entry->body(_u($e->content->body || $e->summary->body));

        # enclosure support, to be added to XML::Feed
        if ($remote->format =~ /^RSS / and my $encls = $e->{entry}->{enclosure}) {
            # some RSS feeds contain multiple enclosures, and we support them
            $encls = [ $encls ] unless ref $encls eq 'ARRAY';

            for my $encl (@$encls) {
                my $enclosure = Plagger::Enclosure->new;
                $enclosure->url( URI->new($encl->{url}) );
                $enclosure->length($encl->{length});
                $enclosure->auto_set_type($encl->{type});
                $entry->add_enclosure($enclosure);
            }
        } elsif ($remote->format eq 'Atom') {
            for my $link ( grep { defined $_->rel && $_->rel eq 'enclosure' } $e->{entry}->link ) {
                my $enclosure = Plagger::Enclosure->new;
                $enclosure->url( URI->new($link->href) );
                $enclosure->length($link->length);
                $enclosure->auto_set_type($link->type);
                $entry->add_enclosure($enclosure);
            }
        }

        # TODO: move MediaRSS, Hatena, iTunes and those specific parser to be subclassed

        # Media RSS: elements may sit inside a media:group or directly on
        # the entry.
        my $media_ns = "http://search.yahoo.com/mrss";
        my $media = $e->{entry}->{$media_ns}->{group} || $e->{entry};
        my $content = $media->{$media_ns}->{content} || [];
           $content = [ $content ] unless ref $content && ref $content eq 'ARRAY';

        for my $media_content (@{$content}) {
            my $enclosure = Plagger::Enclosure->new;
            $enclosure->url( URI->new($media_content->{url}) );
            $enclosure->auto_set_type($media_content->{type});
            $entry->add_enclosure($enclosure);
        }

        # Like media:content, media:thumbnail can be repeated, in which case
        # it may arrive as an array reference; use the first one rather
        # than dereferencing the array as a hash.
        my $thumbnail = $media->{$media_ns}->{thumbnail};
           $thumbnail = $thumbnail->[0] if ref($thumbnail) eq 'ARRAY';
        if ($thumbnail) {
            $entry->icon({
                url   => $thumbnail->{url},
                width => $thumbnail->{width},
                height => $thumbnail->{height},
            });
        }

        # Hatena Image extensions
        my $hatena = $e->{entry}->{"http://www.hatena.ne.jp/info/xmlns#"} || {};
        if ($hatena->{imageurl}) {
            my $enclosure = Plagger::Enclosure->new;
            # Wrapped in URI->new like every other enclosure URL here.
            $enclosure->url( URI->new($hatena->{imageurl}) );
            $enclosure->auto_set_type;
            $entry->add_enclosure($enclosure);
        }

        if ($hatena->{imageurlsmall}) {
            $entry->icon({ url   => $hatena->{imageurlsmall} });
        }

        # Apple photocast feed
        my $apple = $e->{entry}->{"http://www.apple.com/ilife/wallpapers"} || {};
        if ($apple->{image}) {
            my $enclosure = Plagger::Enclosure->new;
            $enclosure->url( URI->new($apple->{image}) );
            $enclosure->auto_set_type;
            $entry->add_enclosure($enclosure);
        }
        if ($apple->{thumbnail}) {
            $entry->icon({ url => $apple->{thumbnail} });
        }

        # Let plugins adjust the entry, with access to the parsed originals.
        my $args = {
            entry      => $entry,
            feed       => $feed,
            orig_entry => $e,
            orig_feed  => $remote,
        };
        $context->run_hook('aggregator.entry.fixup', $args);

        $feed->add_entry($entry);
    }

    $context->log(info => "Aggregate $url success: " . $feed->count . " entries.");
    $context->update->add($feed);
}
247
# Return a copy of the argument with Perl's internal UTF-8 flag switched on.
# The caller's variable is left untouched; no validation or transcoding of
# the bytes is performed.
sub _u {
    my($text) = @_;
    Encode::_utf8_on($text);
    return $text;
}
253
254 1;
255
256 __END__
257
258 =head1 NAME
259
260 Plagger::Plugin::Aggregator::Simple - Dumb simple aggregator
261
262 =head1 SYNOPSIS
263
264   - module: Aggregator::Simple
265
266 =head1 DESCRIPTION
267
268 This plugin implements a dumb Plagger aggregator. It crawls
269 subscriptions sequentially and parses XML feeds using the L<XML::Feed>
270 module.
271
272 It can also be used as a base class for custom aggregators. See
273 L<Plagger::Plugin::Aggregator::Xango> for example.
274
275 =head1 AUTHOR
276
277 Tatsuhiko Miyagawa
278
279 =head1 SEE ALSO
280
281 L<Plagger>, L<XML::Feed>, L<XML::RSS::LibXML>
282
283 =cut
Note: See TracBrowser for help on using the browser.