root/trunk/plagger/lib/Plagger/FeedParser.pm

Revision 1492 (checked in by miyagawa, 14 years ago)

Refactored XML::Feed and Aggregator::Simple to split the parser and feed-discovery functionality out into a separate module, Plagger::FeedParser, and added new methods to UserAgent: $ua->find_parse($url) and $ua->fetch_parse($url). Fixes #381

Line 
1 package Plagger::FeedParser;
2 use strict;
3
4 use Feed::Find;
5 use XML::Feed;
6 use XML::Feed::RSS;
7 $XML::Feed::RSS::PREFERRED_PARSER = "XML::RSS::LibXML";
8
9 use Plagger::Util;
10
# Parse feed content with XML::Feed.
#
# Arguments:
#   $class       - invocant (this is called as a class method); not used
#                  beyond being unpacked.
#   $content_ref - handed unchanged to XML::Feed->parse.
#
# Returns whatever XML::Feed->parse returned.
# Croaks with "Parsing content failed: ..." when the parse dies or returns
# a false value; the message carries $@ or, if that is empty,
# XML::Feed->errstr.
sub parse {
    my($class, $content_ref) = @_;

    # Carp::croak is used below, but nothing in this file loads Carp.
    # Load it here instead of relying on another module having done so.
    require Carp;

    # Override XML::LibXML with XML::Liberal when it is installed.
    # The return value is kept in a lexical for the duration of this call.
    # NOTE(review): per the original comment a real "sweeper" object is only
    # returned by XML::Liberal >= 0.13, while the override itself is applied
    # from 0.10 on -- confirm against the XML::Liberal documentation.
    my $sweeper; # XML::Liberal >= 0.13

    # "eval { ...; 1 }" reports success directly, so there is no separate
    # look at $@ that something in between could have clobbered.
    my $have_liberal = eval { require XML::Liberal; 1 };
    if ($have_liberal && $XML::Liberal::VERSION >= 0.10) {
        $sweeper = XML::Liberal->globally_override('LibXML');
    }

    local $XML::Atom::ForceUnicode = 1;
    my $remote = eval { XML::Feed->parse($content_ref) }
        or Carp::croak("Parsing content failed: " . ($@ || XML::Feed->errstr));

    return $remote;
}
28
# Work out the feed URI for a fetched response.
#
# Arguments:
#   $self - invocant; must provide looks_like_feed().
#   $res  - response object.  The code calls content, uri, and either
#           content_type or http_response->content_type on it.
#
# Returns:
#   $res->uri when the response itself is a feed (its content type is
#   listed in %Feed::Find::IsFeed, or the body looks like a feed);
#   otherwise the first feed URI that Feed::Find->find_in_html finds in the
#   decoded body; otherwise nothing (bare return).
sub discover {
    my($self, $res) = @_;

    # Prefer the response's own content_type accessor (wrapped in eval in
    # case $res does not have one), then the wrapped HTTP response, and
    # finally assume XML.
    my $content_type = eval { $res->content_type } ||
                       $res->http_response->content_type ||
                       "text/xml";

    $content_type =~ s/;.*$//; # strip charset= cruft

    # Media types are case-insensitive and may carry stray whitespace
    # (e.g. "Application/RSS+XML ; charset=UTF-8").  Normalise before the
    # lookup below so such values are still recognised.
    $content_type =~ s/^\s+//;
    $content_type =~ s/\s+$//;
    $content_type = lc $content_type;

    my $content = $res->content;
    if ( $Feed::Find::IsFeed{$content_type} || $self->looks_like_feed(\$content) ) {
        return $res->uri;
    }

    # Not a feed itself: look for feed links inside the decoded body.
    $content  = Plagger::Util::decode_content($res);
    my @feeds = Feed::Find->find_in_html(\$content, $res->uri);

    return unless @feeds;
    return $feeds[0];
}
51
# Heuristically decide whether a document body is an RSS/RDF/Atom feed.
#
# Arguments:
#   $self        - invocant; not used.
#   $content_ref - reference to the scalar holding the document text.
#
# Returns the result of the pattern match (true when one of the feed root
# elements below is found, false otherwise).
#
# Recognised markers:
#   * an <rss element followed by any whitespace (space, tab or newline);
#   * an <rdf:RDF element that later declares a default namespace starting
#     with http://purl.org/rss;
#   * a <feed element that declares a default namespace, whether or not
#     xmlns is its first attribute.
# Attribute values may be quoted with either " or '.
sub looks_like_feed {
    my($self, $content_ref) = @_;

    return $$content_ref =~ m{
          <rss\s
        | <rdf:RDF\s+ .*? xmlns=["']http://purl\.org/rss
        | <feed\s+ (?:[^>]*?\s)? xmlns=["']
    }xs;
}
56
57 1;
Note: See TracBrowser for help on using the browser.