root/trunk/plagger/lib/Plagger/Plugin/Aggregator/Simple.pm

Revision 582 (checked in by miyagawa, 15 years ago)

Add 2ch RSS filter and upgrader by youpy.
http://subtech.g.hatena.ne.jp/youpy/20060413/p1

  • Property svn:keywords set to Id Revision
Line 
1 package Plagger::Plugin::Aggregator::Simple;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use Feed::Find;
6 use Plagger::UserAgent;
7 use List::Util qw(first);
8 use UNIVERSAL::require;
9 use URI;
10 use XML::Feed;
11 use XML::Feed::RSS;
12
# Pin the RSS parser used by XML::Feed::RSS to XML::RSS::LibXML.
# The disabled alternative below would instead pick the first module that
# loads out of XML::RSS::Liberal, XML::RSS::LibXML and XML::RSS:
#$XML::Feed::RSS::PREFERRED_PARSER = first { $_->require } qw( XML::RSS::Liberal XML::RSS::LibXML XML::RSS );
$XML::Feed::RSS::PREFERRED_PARSER = 'XML::RSS::LibXML';
15
# Plugin registration entry point.
#
# Subscribes this plugin's aggregate() sub to the 'customfeed.handle' hook
# on the given context object.
#
#   $self    - the plugin instance being registered
#   $context - object providing register_hook()
sub register {
    my $self    = shift;
    my $context = shift;

    my %hooks = ( 'customfeed.handle' => \&aggregate );
    $context->register_hook($self, %hooks);
}
23
# Hook handler for 'customfeed.handle'.
#
# Fetches the URL of $args->{feed}. If the response's content type is listed
# in %Feed::Find::IsFeed, the body is passed straight to handle_feed().
# Otherwise the body is scanned with Feed::Find->find_in_html and the first
# feed URL found is fetched and passed to handle_feed() instead.
#
#   $self    - the plugin instance
#   $context - hook context (not used here)
#   $args    - hash ref whose 'feed' entry provides a url() method
#
# Returns 1 once a document has been handed to handle_feed(); returns nothing
# when a fetch fails or no feed link is discovered.
sub aggregate {
    my($self, $context, $args) = @_;

    my $url = $args->{feed}->url;
    my $res = $self->fetch_content($url);
    return unless $res;

    # The response may not implement content_type itself (hence the eval);
    # fall back to the wrapped HTTP response, then to a generic XML type.
    my $content_type = eval { $res->content_type };
    $content_type ||= $res->http_response->content_type;
    $content_type ||= "text/xml";

    unless ( $Feed::Find::IsFeed{$content_type} ) {
        # Not a feed: look for feed links inside the fetched page.
        my @feeds = Feed::Find->find_in_html(\$res->content, $url);
        return unless @feeds;

        $url = $feeds[0];
        $res = $self->fetch_content($url);
        return unless $res;
    }

    $self->handle_feed($url, \$res->content);

    return 1;
}
49
# Fetch $url through a fresh Plagger::UserAgent and log the outcome on the
# current Plagger context.
#
#   $self - the plugin instance; passed as the second argument to fetch()
#   $url  - the URL to retrieve
#
# Returns the response object on success. When the response reports an error,
# logs the HTTP status and message at 'error' level and returns nothing.
sub fetch_content {
    my($self, $url) = @_;

    my $context = Plagger->context;
    $context->log(info => "Fetch $url");

    my $response = Plagger::UserAgent->new->fetch($url, $self);

    unless ($response->is_error) {
        # TODO: handle 301 Moved Permanently and 410 Gone
        $context->log(debug => $response->status . ": $url");
        return $response;
    }

    my $status  = $response->http_status;
    my $message = $response->http_response->message;
    $context->log(error => "GET $url failed: $status $message");
    return;
}
71
# Parse a fetched feed document, convert it into a Plagger::Feed with its
# Plagger::Entry objects, and add the feed to the context's update.
#
#   $self    - the plugin instance
#   $url     - URL the document came from; stored as the feed URL and used
#              in log messages
#   $xml_ref - reference to a scalar holding the raw feed document
#
# The document is first run through the 'aggregator.filter.feed' hook as
# $args->{content}; whatever that key holds afterwards is what gets parsed.
# Each converted entry is run through the 'aggregator.entry.fixup' hook
# before being added to the feed.
#
# Returns nothing (after logging at 'error' level) when XML::Feed cannot
# parse the document; otherwise returns whatever $context->update->add gives.
sub handle_feed {
    my($self, $url, $xml_ref) = @_;

    my $context = Plagger->context;

    my $args = { content => $$xml_ref };
    $context->run_hook('aggregator.filter.feed', $args);

    my $remote = eval { XML::Feed->parse(\$args->{content}) };

    unless ($remote) {
        $context->log(error => "Parsing $url failed. " . ($@ || XML::Feed->errstr));
        # This sub is not a loop body: "next" here would either die with
        # "Can't \"next\" outside a loop block" or silently skip the rest of
        # an iteration of whatever loop the caller is running. Just bail out.
        return;
    }

    my $format = $remote->format;

    my $feed = Plagger::Feed->new;
    $feed->title($remote->title);
    $feed->url($url);
    $feed->link($remote->link);
    $feed->description($remote->tagline); # xxx should support Atom 1.0
    $feed->language($remote->language);
    $feed->author($remote->author);
    $feed->updated($remote->modified);
    # Note: this keeps the document as originally passed in, not the
    # (possibly hook-modified) $args->{content} that was parsed.
    $feed->source_xml($$xml_ref);

    # Format-specific metadata, read from the underlying parser objects.
    if ($format eq 'Atom') {
        $feed->id( $remote->{atom}->id );
        $feed->image({ url => $remote->{atom}->logo })
            if $remote->{atom}->logo;
    } elsif ($format =~ /^RSS/) {
        $feed->image( $remote->{rss}->image )
            if $remote->{rss}->image;
    }

    for my $e ($remote->entries) {
        my $entry = Plagger::Entry->new;
        $entry->title($e->title);
        $entry->author($e->author);

        # A plain (non-reference) category is wrapped so tags() always
        # receives an array ref.
        my $category = $e->category;
           $category = [ $category ] if $category && !ref($category);
        $entry->tags($category) if $category;

        # issued() is wrapped in eval because it may throw; a failure is
        # treated the same as "no date".
        $entry->date( Plagger::Date->rebless($e->issued) )
            if eval { $e->issued };

        # xxx nasty hack. We should remove this once XML::Atom or XML::Feed is fixed
        if (!$entry->date && $format eq 'Atom' && $e->{entry}->version eq '1.0') {
            if ( $e->{entry}->published ) {
                my $dt = XML::Atom::Util::iso2dt( $e->{entry}->published );
                $entry->date( Plagger::Date->rebless($dt) ) if $dt;
            }
        }

        $entry->link($e->link);
        $entry->feed_link($feed->link);
        $entry->id($e->id);
        $entry->body($e->content->body || $e->summary->body);

        # Let other plugins adjust the converted entry; they get both the
        # Plagger objects and the original XML::Feed objects.
        my $fixup_args = {
            entry      => $entry,
            feed       => $feed,
            orig_entry => $e,
            orig_feed  => $remote,
        };
        $context->run_hook('aggregator.entry.fixup', $fixup_args);

        $feed->add_entry($entry);
    }

    $context->log(info => "Aggregate $url success: " . $feed->count . " entries.");
    $context->update->add($feed);
}
148
149 1;
150
151 __END__
152
153 =head1 NAME
154
155 Plagger::Plugin::Aggregator::Simple - Dumb simple aggregator
156
157 =head1 SYNOPSIS
158
159   - module: Aggregator::Simple
160
161 =head1 DESCRIPTION
162
163 This plugin implements a dumb aggregator for Plagger. It crawls
164 subscriptions sequentially and parses XML feeds using the L<XML::Feed>
165 module.
166
167 It can also be used as a base class for custom aggregators. See
168 L<Plagger::Plugin::Aggregator::Xango> for an example.
169
170 =head1 AUTHOR
171
172 Tatsuhiko Miyagawa
173
174 =head1 SEE ALSO
175
176 L<Plagger>, L<XML::Feed>, L<XML::RSS::LibXML>
177
178 =cut
Note: See TracBrowser for help on using the browser.