Changeset 683

Show
Ignore:
Timestamp:
05/05/06 16:51:03
Author:
miyagawa
Message:
  • Plagger::Util::decode_content now takes $content or $res
  • Try xml encoding header first, before HTML meta tag to guess charsets
  • Subscription::XOXO now handles multibyte titles correctly
  • Unhandled feed is removed from Subscription
  • use Plagger::UserAgent in Util.pm
  • Planet: Stop using Scrubber for now
  • Aggregator::Xango to handle auto-discovered feed mapping as well
Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/plagger/lib/Plagger.pm

    r671 r683  
    277277            if (!$ok) { 
    278278                Plagger->context->log(error => $feed->url . " is not aggregated by any aggregator"); 
     279                Plagger->context->subscription->delete_feed($feed); 
    279280            } 
    280281        } 
  • trunk/plagger/lib/Plagger/Plugin/Aggregator/Xango.pm

    r680 r683  
    125125            my @feeds = Feed::Find->find_in_html($r->content_ref, $url); 
    126126            if (@feeds) { 
    127                 $url = $feeds[0]; 
    128                 return unless $url =~ m!^https?://!i; 
    129                 $_[KERNEL]->post($_[HEAP]->{BROKER_ALIAS}, 'enqueue_job', Xango::Job->new(uri => URI->new($url), redirect => $redirect)); 
     127                my $feed_url = $feeds[0]; 
     128                return unless $feed_url =~ m!^https?://!i; 
     129 
     130                # OMG we should alias Feed so it can be looked up with $feed_url, too 
     131                $plugin->{_url2feed}->{$feed_url} = $plugin->{_url2feed}->{$url}; 
     132 
     133                $_[KERNEL]->post($_[HEAP]->{BROKER_ALIAS}, 'enqueue_job', Xango::Job->new(uri => URI->new($feed_url), redirect => $redirect)); 
    130134            } 
    131135            return; 
  • trunk/plagger/lib/Plagger/Plugin/Publish/Planet.pm

    r680 r683  
    3030#        HTML::Tidy->new, 
    3131        undef, 
    32         HTML::Scrubber->new( 
    33             rules => [ 
    34                 style => 0, 
    35                 script => 0, 
    36             ], 
    37             default => [ 1, { '*' => 1, style => 0 } ], 
    38         ), 
     32#        HTML::Scrubber->new( 
     33#            rules => [ 
     34#                style => 0, 
     35#                script => 0, 
     36#            ], 
     37#            default => [ 1, { '*' => 1, style => 0 } ], 
     38#        ), 
     39        undef, 
    3940    ); 
    4041 
     
    7172    foreach my $entry ($feed->entries) { 
    7273#        $entry->{body} = $tidy->clean($entry->{body}); 
    73         $entry->{body} = $scrubber->scrub($entry->{body})
     74        $entry->{body} = $scrubber->scrub($entry->{body}) if $scrubber
    7475    } 
    7576} 
  • trunk/plagger/lib/Plagger/Plugin/Subscription/XOXO.pm

    r681 r683  
    2828 
    2929    my $xhtml = Plagger::Util::load_uri($uri, $self); 
    30  
    3130    my $tree = HTML::TreeBuilder->new; 
    3231    $tree->parse($xhtml); 
  • trunk/plagger/lib/Plagger/Subscription.pm

    r20 r683  
    1717} 
    1818 
     19sub delete_feed { 
     20    my($self, $feed) = @_; 
     21    my @feeds = grep { $_ ne $feed } $self->feeds; 
     22    $self->{feeds} = \@feeds; 
     23} 
     24 
    1925sub types { 
    2026    my $self = shift; 
  • trunk/plagger/lib/Plagger/Util.pm

    r681 r683  
    4747 
    4848sub decode_content { 
    49     my $res = shift; 
    50     my $content = $res->content; 
     49    my $stuff = shift; 
    5150 
    52     # 1) get charset from HTTP Content-Type header 
    53     my $charset = ($res->http_response->content_type =~ /charset=([\w\-]+)/)[0]; 
     51    my $content; 
     52    my $res; 
     53    if (ref($stuff) && ref($stuff) eq 'URI::Fetch::Response') { 
     54        $res     = $stuff; 
     55        $content = $res->content; 
     56    } elsif (ref($stuff)) { 
     57        Plagger->context->error("Don't know how to decode " . ref($stuff)); 
     58    } else { 
     59        $content = $stuff; 
     60    } 
    5461 
    55     # 2) if there's not, try META tag 
     62    my $charset; 
     63 
     64    # 1) if it is HTTP response, get charset from HTTP Content-Type header 
     65    if ($res) { 
     66        $charset = ($res->http_response->content_type =~ /charset=([\w\-]+)/)[0]; 
     67    } 
     68 
     69    # 2) if there's not, try XML encoding 
     70    $charset ||= ( $content =~ /<\?xml version="1.0" encoding="([\w\-]+)"\?>/ )[0]; 
     71 
     72    # 3) if there's not, try META tag 
    5673    $charset ||= ( $content =~ m!<meta http-equiv="Content-Type" content=".*charset=([\w\-]+)"!i )[0]; 
    5774 
    58     # 3) if there's not still, try Detector/Guess 
     75    # 4) if there's not still, try Detector/Guess 
    5976    $charset ||= $Detector->($content); 
    6077 
    61     # 4) falls back to UTF-8 
     78    # 5) falls back to UTF-8 
    6279    $charset ||= 'utf-8'; 
    6380 
     
    8299    my($uri, $plugin) = @_; 
    83100 
     101    require Plagger::UserAgent; 
     102 
    84103    my $data; 
    85104    if (ref($uri) eq 'SCALAR') { 
     
    95114                                  $response->http_response->message); 
    96115        } 
    97         $data = $response->content
     116        $data = decode_content($response)
    98117    } 
    99118    elsif ($uri->scheme eq 'file') { 
     
    101120        open my $fh, '<', $uri->path 
    102121            or Plagger->context->error( $uri->path . ": $!" ); 
    103         $data = join '', <$fh>
     122        $data = decode_content(join '', <$fh>)
    104123    } 
    105124    else {