root/trunk/plagger/lib/Plagger/Plugin/CustomFeed/Simple.pm

Revision 1683 (checked in by mizzy, 14 years ago)

CustomFeed::Simple: support follow_xpath option to extract links by XPath.

Line 
1 package Plagger::Plugin::CustomFeed::Simple;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use Encode;
6 use HTML::TokeParser;
7 use HTML::ResolveLink;
8 use HTML::TreeBuilder::XPath;
9 use Plagger::UserAgent;
10 use Plagger::Util qw( decode_content extract_title );
11
# Hook this plugin into the context: the 'customfeed.handle' hook is
# served by handle().
sub register {
    my ($self, $context) = @_;

    my @hooks = (
        'customfeed.handle' => \&handle,
    );

    return $context->register_hook($self, @hooks);
}
19
# Hook callback for 'customfeed.handle'.
#
# When the feed's meta carries follow_link or follow_xpath, the value is
# stored in $args->{match} and the work is delegated to aggregate(), whose
# result is returned. Otherwise nothing is returned.
sub handle {
    my ($self, $context, $args) = @_;

    my $feed  = $args->{feed};
    my $match = $feed->meta->{follow_link} || $feed->meta->{follow_xpath};

    # Neither option configured: not a feed for this plugin.
    return unless $match;

    $args->{match} = $match;
    return $self->aggregate($context, $args);
}
30
# Fetch the subscribed page and turn its anchors into a title/link-only feed.
#
# Arguments:
#   $self    - the plugin instance (also passed to the user agent's fetch)
#   $context - context object; used for logging and to publish the feed
#   $args    - hashref with {feed} (the subscribed feed) and {match}
#              (the follow_link regex or follow_xpath expression stored
#              by handle())
#
# Returns 1 once the feed has been added to the context's update, or
# nothing when the page could not be fetched.
sub aggregate {
    my ($self, $context, $args) = @_;

    # URI->new_abs is used in both branches below; load it explicitly
    # instead of relying on another module having pulled it in.
    require URI;

    my $url = $args->{feed}->url;
    $context->log(info => "GET $url");

    my $agent = Plagger::UserAgent->new;
    my $res   = $agent->fetch($url, $self);

    # Defensive guard: without it a missing response object would die on
    # the method call below.
    # NOTE(review): whether fetch can return nothing is not visible here.
    unless ($res) {
        $context->log(error => "GET $url failed: no response");
        return;
    }

    if ($res->http_response->is_error) {
        $context->log(error => "GET $url failed: " . $res->status);
        return;
    }

    my $content = decode_content($res);
    my $title   = extract_title($content);

    my $feed = Plagger::Feed->new;
    $feed->title($title);
    $feed->link($url);

    my $match = $args->{match};

    if ( $args->{feed}->meta->{follow_link} ) {
        # follow_link: $match is a regular expression tested against each
        # anchor's href.
        my $resolver = HTML::ResolveLink->new(base => $url);
        $content = $resolver->resolve($content);

        my %seen;
        my $parser = HTML::TokeParser->new(\$content);
        while (my $token = $parser->get_tag('a')) {
            my $href = $token->[1]->{href};
            next unless ($href || '') =~ /$match/;

            # Skip anchors without usable text (e.g. image-only links).
            my $text = $parser->get_trimmed_text('/a');
            next if !$text || $text eq '[IMG]';

            # Distinct name: the original shadowed $url here.
            my $entry_url = URI->new_abs($href, $url);
            next if $seen{ $entry_url->as_string }++;

            my $entry = Plagger::Entry->new;
            $entry->title($text);
            $entry->link($entry_url);
            $feed->add_entry($entry);

            $context->log(debug => "Add $href ($text)");
        }
    }
    else {
        # follow_xpath: $match is an XPath expression selecting the nodes
        # to use; fall back to every anchor when it is empty.
        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($content);
        $tree->eof;

        for my $node ( $tree->findnodes($match || '//a') ) {
            my $href       = $node->attr('href') or next;
            my $entry_text = $node->attr('title') || $node->as_text;

            # Resolve relative hrefs against the page URL, as the
            # follow_link branch does; otherwise entries would carry
            # links that are unusable outside the source page.
            my $entry_url = URI->new_abs($href, $url);

            my $entry = Plagger::Entry->new;
            $entry->title($entry_text);
            $entry->link($entry_url);
            $feed->add_entry($entry);

            $context->log(debug => "Add $href ($entry_text)");
        }

        # Release the parse tree explicitly: on HTML::Tree versions without
        # weak references the parent/child cycles keep it alive.
        $tree->delete;
    }

    $context->update->add($feed);

    return 1;
}
99
100 1;
101
102 __END__
103
104 =head1 NAME
105
106 Plagger::Plugin::CustomFeed::Simple - Simple way to create title and link only custom feeds
107
108 =head1 SYNOPSIS
109
110   - module: Subscription::Config
111     config:
112       feed:
113         - url: http://sportsnavi.yahoo.co.jp/index.html
114           meta:
115             follow_link: /headlines/
116         - url: http://d.hatena.ne.jp/antipop/20050628/1119966355
117           meta:
118             follow_xpath: //ul[@class="xoxo" or @class="subscriptionlist"]//a
119
120   - module: CustomFeed::Simple
121
122 =head1 DESCRIPTION
123
124
125 =head1 AUTHOR
126
127 Tatsuhiko Miyagawa
128
129 =head1 SEE ALSO
130
131 L<Plagger>
132
133 =cut
134
135
136
137 1;
Note: See TracBrowser for help on using the browser.