root/trunk/plagger/lib/Plagger/Plugin/CustomFeed/Simple.pm

Revision 1882 (checked in by miyagawa, 14 years ago)

CustomFeed::Simple: absolutify URL in XPath extraction

Line 
1 package Plagger::Plugin::CustomFeed::Simple;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use Encode;
6 use HTML::TokeParser;
7 use HTML::ResolveLink;
8 use HTML::TreeBuilder::XPath;
9 use Plagger::UserAgent;
10 use Plagger::Util qw( decode_content extract_title );
11
# Plugin entry point: subscribes this plugin's handle() to the
# 'customfeed.handle' hook on the given context object.
sub register {
    my $self    = shift;
    my $context = shift;

    my %hooks = ( 'customfeed.handle' => \&handle );
    $context->register_hook($self, %hooks);
}
19
# Hook callback for 'customfeed.handle'.
#
# Copies the feed's follow_link / follow_xpath meta values into
# $args->{match} / $args->{xpath}. If either is true, the feed is handed to
# aggregate() and its result is returned; otherwise returns nothing so the
# feed is left unhandled by this plugin.
sub handle {
    my $self = shift;
    my($context, $args) = @_;

    my $feed = $args->{feed};
    $args->{match} = $feed->meta->{follow_link};
    $args->{xpath} = $feed->meta->{follow_xpath};

    # Neither option configured: this feed is not ours.
    return unless $args->{match} || $args->{xpath};

    return $self->aggregate($context, $args);
}
31
# Fetches the page at the feed's URL and builds a title-and-link-only feed
# from the anchors found in it.
#
# Arguments:
#   $context - object providing log() and update->add()
#   $args    - hash ref with:
#                feed  - object whose url() is the page to fetch
#                match - regex; anchors whose href matches become entries
#                xpath - XPath expression selecting the nodes to turn into
#                        entries (only consulted when match is false)
#
# Returns 1 after adding the feed to the context's update, or nothing if
# the HTTP response is an error (the failure is logged).
sub aggregate {
    my($self, $context, $args) = @_;

    my $url = $args->{feed}->url;
    $context->log(info => "GET $url");

    my $agent = Plagger::UserAgent->new;
    my $res = $agent->fetch($url, $self);

    if ($res->http_response->is_error) {
        $context->log(error => "GET $url failed: " . $res->status);
        return;
    }

    my $content = decode_content($res);
    my $title   = extract_title($content);

    my $feed = Plagger::Feed->new;
    $feed->title($title);
    $feed->link($url);

    if( my $re = $args->{match} ) {
        # Resolve links against the page URL before matching, so the
        # pattern is tested against the rewritten href values.
        my $resolver = HTML::ResolveLink->new(base => $url);
        $content = $resolver->resolve($content);

        my %seen;   # item URLs already added, to skip duplicate links
        my $parser = HTML::TokeParser->new(\$content);
        while (my $token = $parser->get_tag('a')) {
            next unless ($token->[1]->{href} || '') =~ /$re/;

            # Skip anchors with no usable text, including the '[IMG]'
            # placeholder text of image-only links.
            my $text = $parser->get_trimmed_text('/a');
            next if !$text || $text eq '[IMG]';

            my $item_url = URI->new_abs($token->[1]->{href}, $url);
            next if $seen{$item_url->as_string}++;

            my $entry = Plagger::Entry->new;
            $entry->title($text);
            $entry->link($item_url);
            $feed->add_entry($entry);

            $context->log(debug => "Add $token->[1]->{href} ($text)");
        }
    } elsif (my $xpath = $args->{xpath}) {
        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($content);
        $tree->eof;

        # $xpath is guaranteed true by the elsif condition, so no default
        # expression is needed here.
        for my $child ( $tree->findnodes($xpath) ) {
            my $href  = $child->attr('href') or next;
            my $title = $child->attr('title') || $child->as_text;

            my $entry = Plagger::Entry->new;
            $entry->title($title);
            $entry->link(URI->new_abs($href, $url));
            $feed->add_entry($entry);

            $context->log(debug => "Add $href ($title)");
        }

        # Release the parse tree explicitly: HTML::Element trees hold
        # parent<->child references and, unless weak references are in
        # effect, are not reclaimed until delete() is called.
        $tree->delete;
    }

    $context->update->add($feed);

    return 1;
}
97
98 1;
99
100 __END__
101
102 =head1 NAME
103
104 Plagger::Plugin::CustomFeed::Simple - Simple way to create title and link only custom feeds
105
106 =head1 SYNOPSIS
107
108   - module: Subscription::Config
109     config:
110       feed:
111         - url: http://sportsnavi.yahoo.co.jp/index.html
112           meta:
113             follow_link: /headlines/
114         - url: http://d.hatena.ne.jp/antipop/20050628/1119966355
115           meta:
116             follow_xpath: //ul[@class="xoxo" or @class="subscriptionlist"]//a
117
118   - module: CustomFeed::Simple
119
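=head1 DESCRIPTION

This plugin fetches the page at the feed URL and builds a feed whose entries
carry only a title and a link. It handles feeds whose C<meta> sets
C<follow_link> or C<follow_xpath>; if both are set, C<follow_link> is used.

With C<follow_link>, every anchor whose C<href> matches the given regular
expression becomes an entry titled with the anchor text. With
C<follow_xpath>, every node selected by the XPath expression that has an
C<href> attribute becomes an entry titled with its C<title> attribute or,
failing that, its text. Relative links are resolved against the feed URL.
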
123 =head1 AUTHOR
124
125 Tatsuhiko Miyagawa
126
127 =head1 SEE ALSO
128
129 L<Plagger>
130
131 =cut
132
133
134
135 1;
Note: See TracBrowser for help on using the browser.