Changeset 1683

Show
Ignore:
Timestamp:
09/15/06 00:19:30
Author:
mizzy
Message:

CustomFeed::Simple: support follow_xpath option to extract links by XPath.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/plagger/lib/Plagger/Plugin/CustomFeed/Simple.pm

    r1673 r1683  
    66use HTML::TokeParser; 
    77use HTML::ResolveLink; 
     8use HTML::TreeBuilder::XPath; 
    89use Plagger::UserAgent; 
    910use Plagger::Util qw( decode_content extract_title ); 
     
    2021    my($self, $context, $args) = @_; 
    2122 
    22     if (my $match = $args->{feed}->meta->{follow_link}) { 
     23    if ( my $match = $args->{feed}->meta->{follow_link} || $args->{feed}->meta->{follow_xpath} ) { 
    2324        $args->{match} = $match; 
    2425        return $self->aggregate($context, $args); 
     
    4546    my $title   = extract_title($content); 
    4647 
    47     my $resolver = HTML::ResolveLink->new(base => $url); 
    48     $content = $resolver->resolve($content); 
    49  
    5048    my $feed = Plagger::Feed->new; 
    5149    $feed->title($title); 
     
    5452    my $re = $args->{match}; 
    5553 
    56     my %seen; 
    57     my $parser = HTML::TokeParser->new(\$content); 
    58     while (my $token = $parser->get_tag('a')) { 
    59         next unless ($token->[1]->{href} || '') =~ /$re/; 
     54    if( $args->{feed}->meta->{follow_link} ) { 
     55        my $resolver = HTML::ResolveLink->new(base => $url); 
     56        $content = $resolver->resolve($content); 
    6057 
    61         my $text = $parser->get_trimmed_text('/a'); 
    62         next if !$text || $text eq '[IMG]'; 
     58        my %seen; 
     59        my $parser = HTML::TokeParser->new(\$content); 
     60        while (my $token = $parser->get_tag('a')) { 
     61            next unless ($token->[1]->{href} || '') =~ /$re/; 
    6362 
    64         my $url = URI->new_abs($token->[1]->{href}, $url); 
    65         next if $seen{$url->as_string}++; 
     63            my $text = $parser->get_trimmed_text('/a'); 
     64            next if !$text || $text eq '[IMG]'; 
    6665 
    67         my $entry = Plagger::Entry->new; 
    68         $entry->title($text); 
    69         $entry->link($url); 
    70         $feed->add_entry($entry); 
     66            my $url = URI->new_abs($token->[1]->{href}, $url); 
     67            next if $seen{$url->as_string}++; 
    7168 
    72         $context->log(debug => "Add $token->[1]->{href} ($text)"); 
     69            my $entry = Plagger::Entry->new; 
     70            $entry->title($text); 
     71            $entry->link($url); 
     72            $feed->add_entry($entry); 
     73 
     74            $context->log(debug => "Add $token->[1]->{href} ($text)"); 
     75        } 
     76    } 
     77    else { 
     78        my $tree = HTML::TreeBuilder::XPath->new; 
     79        $tree->parse($content); 
     80        $tree->eof; 
     81 
     82        for my $child ( $tree->findnodes($re || '//a') ) { 
     83            my $href  = $child->attr('href') or next; 
     84            my $title = $child->attr('title') || $child->as_text; 
     85 
     86            my $entry = Plagger::Entry->new; 
     87            $entry->title($title); 
     88            $entry->link($href); 
     89            $feed->add_entry($entry); 
     90 
     91            $context->log(debug => "Add $href ($title)"); 
     92        } 
    7393    } 
    7494 
     
    94114          meta: 
    95115            follow_link: /headlines/ 
     116        - url: http://d.hatena.ne.jp/antipop/20050628/1119966355 
     117          meta: 
     118            follow_xpath: //ul[@class="xoxo" or @class="subscriptionlist"]//a 
    96119 
    97120  - module: CustomFeed::Simple