Changeset 1210

Show
Ignore:
Timestamp:
08/07/06 03:23:44
Author:
miyagawa
Message:

Big change to dedupe_entries, so it respects the source feed domain, date order and content size. Search-based Planet will be happy. Fixes #333

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/plagger/lib/Plagger/Feed.pm

    r1173 r1210  
    77 
    88use Digest::MD5 qw(md5_hex); 
     9use URI; 
    910use Plagger::Util; 
    1011 
     
    8788sub dedupe_entries { 
    8889    my $self = shift; 
    89     my %seen; 
    90     my @entries; 
     90 
     91    # this logic breaks ordering of entries, to be sorted using sort_entries 
     92 
     93    my(%seen, @entries); 
    9194    for my $entry ($self->entries) { 
    92         push @entries, $entry if !$seen{$entry->permalink}++
     95        push @{ $seen{$entry->permalink} }, $entry
    9396    } 
     97 
     98    for my $permalink (keys %seen) { 
     99        my @sorted = _sort_prioritize($permalink, @{ $seen{$permalink} }); 
     100        push @entries, $sorted[0]; 
     101    } 
     102 
    94103    $self->{entries} = \@entries; 
     104} 
     105 
     106sub _sort_prioritize { 
     107    my($permalink, @entries) = @_; 
     108 
     109    # use domain match, date and full-content-ness to prioritize source entry 
     110    # TODO: Date vs Full-content check should be user configurable 
     111 
     112    my $now = time; 
     113    return 
     114        map { $_->[0] } 
     115        sort { $b->[1] <=> $a->[1] || $b->[2] <=> $a->[2] || $b->[3] <=> $a->[3] || $b->[4] <=> $a->[4] } 
     116        map { [ 
     117            $_,                                              # Plagger::Entry for Schwartzian 
     118            _is_same_domain($permalink, $_->source->url),    # permalink and $feed->url is the same domain 
     119            _is_same_domain($permalink, $_->source->link),   # permalink and $feed->link is the same domain 
     120            ($_->date ? ($now - $_->date->epoch) : 0),        # Older entry date is prioritized 
     121            length($_->body || ''),                          # Prioritize full content feed 
     122        ] } @entries; 
     123} 
     124 
     125sub _is_same_domain { 
     126    my $u1 = URI->new($_[0]); 
     127    my $u2 = URI->new($_[1]); 
     128 
     129    return 0 unless $u1->can('host') && $u2->can('host'); 
     130    return lc($u1->host) eq lc($u2->host); 
    95131} 
    96132