root/trunk/plagger/lib/Plagger/Plugin/Aggregator/Simple.pm

Revision 2060 (checked in by miyagawa, 11 years ago)

apply RT 42542

  • Property svn:keywords set to Id Revision
Line 
1 package Plagger::Plugin::Aggregator::Simple;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use Feed::Find;
6 use Plagger::Enclosure;
7 use Plagger::FeedParser;
8 use Plagger::UserAgent;
9 use Plagger::Text;
10 use List::Util qw(first);
11 use UNIVERSAL::require;
12 use URI;
13
# Plugin registration entry point: subscribes aggregate() as this plugin's
# handler for the 'customfeed.handle' hook on the given context.
sub register {
    my $self    = shift;
    my $context = shift;

    my %handler_for = ('customfeed.handle' => \&aggregate);

    $context->register_hook($self, %handler_for);
}
21
# Handler for the 'customfeed.handle' hook.
#
#   $self    - the plugin instance
#   $context - the Plagger context (not used here)
#   $args    - hook arguments; $args->{feed} is the feed being aggregated
#
# Fetches the feed's URL, asks Plagger::FeedParser->discover which feed URL the
# response points at, and passes the feed content to handle_feed(). When the
# discovered URL differs from the one already fetched, the discovered URL is
# fetched first.
#
# Returns 1 once the content has been handed to handle_feed(); returns nothing
# when a fetch fails or no feed URL is discovered.
sub aggregate {
    my($self, $context, $args) = @_;

    my $url = $args->{feed}->url;
    my $res = $self->fetch_content($url) or return;

    # Bail out before comparing URLs: with nothing discovered there is nothing
    # to aggregate, and the comparison below would otherwise run against undef.
    my $feed_url = Plagger::FeedParser->discover($res);
    return unless $feed_url;

    if ($url eq $feed_url) {
        # The response we already hold is the feed itself.
        $self->handle_feed($url, \$res->content, $args->{feed});
    } else {
        $res = $self->fetch_content($feed_url) or return;
        $self->handle_feed($feed_url, \$res->content, $args->{feed});
    }

    return 1;
}
46
# Fetch $url with a fresh Plagger::UserAgent, passing the plugin along to the
# agent's fetch().
#
# Logs the attempt at info level, a failure at error level and a success at
# debug level. Returns the response object, or nothing when the response
# reports an error.
sub fetch_content {
    my $self = shift;
    my $url  = shift;

    my $context = Plagger->context;
    $context->log(info => "Fetch $url");

    my $response = Plagger::UserAgent->new->fetch($url, $self);

    unless ($response->is_error) {
        # TODO: handle 301 Moved Permanently and 410 Gone
        $context->log(debug => $response->status . ": $url");
        return $response;
    }

    my $reason = $response->http_status . " " . $response->http_response->message;
    $context->log(error => "GET $url failed: " . $reason);
    return;
}
68
# Parse one fetched feed document and publish it to the Plagger context.
#
#   $url     - URL the content was fetched from; stored as the feed's url
#   $xml_ref - reference to a scalar holding the raw feed document
#   $feed    - feed object to fill in; a new Plagger::Feed is created when false
#
# Runs the 'aggregator.filter.feed' hook on a copy of the content, parses the
# result with Plagger::FeedParser, copies feed-level metadata and every entry
# onto $feed (running 'aggregator.entry.fixup' for each entry) and finally adds
# $feed to the context's update.
#
# Returns nothing when parsing fails; the failure is logged at error level.
sub handle_feed {
    my($self, $url, $xml_ref, $feed) = @_;

    my $context = Plagger->context;

    # Filter plugins get a copy of the raw document and may rewrite it before parsing.
    my $filter_args = { content => $$xml_ref };
    $context->run_hook('aggregator.filter.feed', $filter_args);

    # Treat both an exception and an undefined result as a parse failure, so a
    # parser that reports an error without dying cannot lead to a method call
    # on undef below.
    my $remote = eval { Plagger::FeedParser->parse(\$filter_args->{content}) };
    if ($@ || !defined $remote) {
        my $reason = $@ || 'parser returned no feed';
        $context->log(error => "Parser $url failed: $reason");
        return;
    }

    # Feed-level metadata. A title already set on $feed wins over the parsed one.
    $feed ||= Plagger::Feed->new;
    $feed->title(_u($remote->title)) unless defined $feed->title;
    $feed->url($url);
    $feed->link($remote->link);
    $feed->description(_u($remote->tagline)); # xxx should support Atom 1.0
    $feed->language($remote->language);
    $feed->author(_u($remote->author));
    $feed->updated($remote->modified) if defined $remote->modified;

    # Note: this flags the caller's scalar in place, and the stored source is
    # the original document, not the filtered copy.
    Encode::_utf8_on($$xml_ref);
    $feed->source_xml($$xml_ref);

    if ($remote->format eq 'Atom') {
        $feed->id( $remote->{atom}->id );
    }

    if ($remote->format =~ /^RSS/) {
        $feed->image( \%{$remote->{rss}->image} )
            if $remote->{rss}->image;
    } elsif ($remote->format eq 'Atom') {
        $feed->image({ url => $remote->{atom}->logo })
            if $remote->{atom}->logo;
    }

    for my $e ($remote->entries) {
        my $entry = Plagger::Entry->new;
        $entry->title(_u($e->title));
        $entry->author(_u($e->author));

        my @category = $e->category;
        $entry->tags([ map _u($_), @category ]) if @category;

        # XXX XML::Feed doesn't support extracting atom:category yet
        if ($remote->format eq 'Atom' && $e->{entry}->can('categories')) {
            my @categories = $e->{entry}->categories;
            for my $cat (@categories) {
                $entry->add_tag( _u($cat->label || $cat->term) );
            }
        }

        # Prefer the issued date, fall back to the modified date; either
        # accessor may die, hence the separate evals.
        my $date = eval { $e->issued } || eval { $e->modified };
        $entry->date( Plagger::Date->rebless($date) ) if $date;

        # xxx nasty hack. We should remove this once XML::Atom or XML::Feed is fixed
        if (!$entry->date && $remote->format eq 'Atom' && $e->{entry}->version eq '1.0') {
            if ( $e->{entry}->published ) {
                my $dt = XML::Atom::Util::iso2dt( $e->{entry}->published );
                $entry->date( Plagger::Date->rebless($dt) ) if $dt;
            }
        }

        $entry->link($e->link);
        $entry->feed_link($feed->link);
        $entry->id($e->id);

        # The body is the full content when there is one, otherwise the summary.
        my $content = feed_to_text($e, $e->content);
        my $summary = feed_to_text($e, $e->summary);
        $entry->body($content || $summary);
        $entry->summary($summary) if $summary;

        # per-entry level language support in Atom
        if ($remote->format eq 'Atom' && $e->{entry}->content && $e->{entry}->content->lang) {
            $entry->language($e->{entry}->content->lang);
        }

        # enclosure support, to be added to XML::Feed
        if ($remote->format =~ /^RSS / and my $encls = $e->{entry}->{enclosure}) {
            # some RSS feeds contain multiple enclosures, and we support them
            $encls = [ $encls ] unless ref $encls eq 'ARRAY';

            for my $encl (@$encls) {
                my $enclosure = Plagger::Enclosure->new;
                $enclosure->url( URI->new($encl->{url}) );
                $enclosure->length($encl->{length});
                $enclosure->auto_set_type($encl->{type});
                $entry->add_enclosure($enclosure);
            }
        } elsif ($remote->format eq 'Atom') {
            for my $link ( grep { defined $_->rel && $_->rel eq 'enclosure' } $e->{entry}->link ) {
                my $enclosure = Plagger::Enclosure->new;
                $enclosure->url( URI->new($link->href) );
                $enclosure->length($link->length);
                $enclosure->auto_set_type($link->type);
                $entry->add_enclosure($enclosure);
            }
        }

        # entry image support
        if ($remote->format =~ /^RSS / and my $img = $e->{entry}->{image}) {
            $entry->icon(\%$img);
        }

        # Give fixup plugins access to both the Plagger objects and the parsed originals.
        my $fixup_args = {
            entry      => $entry,
            feed       => $feed,
            orig_entry => $e,
            orig_feed  => $remote,
        };
        $context->run_hook('aggregator.entry.fixup', $fixup_args);

        $feed->add_entry($entry);
    }

    $context->log(info => "Aggregate $url success: " . $feed->count . " entries.");
    $context->update->add($feed);
}
189
# Wrap the body of a feed content object in a Plagger::Text.
#
#   $e       - the parsed entry; only its class name is inspected
#   $content - content object providing body() and type()
#
# Returns nothing when the body is false. Dies when the entry's class name
# mentions neither "Atom" nor "RSS".
sub feed_to_text {
    my($e, $content) = @_;
    return unless $content->body;

    my $entry_class = ref $e;

    if ($entry_class !~ /Atom/) {
        die "Something is wrong: $e" unless $entry_class =~ /RSS/;

        # in RSS there's no explicit way to declare the type. Just guess it
        return Plagger::Text->new_from_text($content->body);
    }

    # in Atom, be a little strict with TextConstruct
    # TODO: this actually doesn't work since XML::Feed and XML::Atom does the right
    # thing with Atom 1.0 TextConstruct
    my $text_type = ($content->type eq 'text/plain' || $content->type eq 'text')
                  ? 'text'
                  : 'html';

    return Plagger::Text->new(type => $text_type, data => $content->body);
}
210
# Return a copy of the given string with Perl's internal UTF-8 flag switched
# on via Encode::_utf8_on. The caller's value is left untouched.
sub _u {
    my ($text) = @_;

    Encode::_utf8_on($text);

    return $text;
}
216
217 1;
218
219 __END__
220
221 =head1 NAME
222
223 Plagger::Plugin::Aggregator::Simple - Dumb simple aggregator
224
225 =head1 SYNOPSIS
226
227   - module: Aggregator::Simple
228
229 =head1 DESCRIPTION
230
231 This plugin implements a dumb Plagger aggregator. It crawls
232 subscriptions sequentially and parses XML feeds using the L<XML::Feed>
233 module.
234
235 It can also be used as a base class for custom aggregators. See
236 L<Plagger::Plugin::Aggregator::Xango> for an example.
237
238 =head1 AUTHOR
239
240 Tatsuhiko Miyagawa
241
242 =head1 SEE ALSO
243
244 L<Plagger>, L<XML::Feed>, L<XML::RSS::LibXML>
245
246 =cut
Note: See TracBrowser for help on using the browser.