root/trunk/plagger/lib/Plagger/Util.pm

Revision 683 (checked in by miyagawa, 14 years ago)
  • Plagger::Util::decode_content now takes $content or $res
  • Try xml encoding header first, before HTML meta tag to guess charsets
  • Subscription::XOXO to handle multibyte title okay
  • Unhandled feed is removed from Subscription
  • use Plagger::UserAgent in Util.pm
  • Planet: Unuse Scrubber for now
  • Aggregator::Xango to handle auto-discovered feed mapping as well
Line 
package Plagger::Util;
use strict;

# Naming a class in @ISA does not load it. Load Exporter here instead of
# relying on some other module having pulled it in first; without it
# "use Plagger::Util qw(...)" would silently import nothing.
use Exporter ();
our @ISA = qw(Exporter);

# Nothing is exported by default; callers ask for the helpers by name.
our @EXPORT_OK = qw( strip_html dumbnail decode_content extract_title load_uri );

use Encode ();
use List::Util qw(min);
use HTML::Entities;
9
# Code ref that guesses the character encoding of a byte string.
# It is called with the raw bytes as its only argument. The fallback
# implementation yields a false value when no guess could be made.
our $Detector;

BEGIN {
    # Use Encode::Detect when it is installed, otherwise fall back to
    # Encode::Guess with a fixed list of candidate encodings.
    my $detect_available = eval { require Encode::Detect::Detector; 1 };

    unless ($detect_available) {
        require Encode::Guess;
    }

    $Detector = $detect_available
        ? sub { return Encode::Detect::Detector::detect($_[0]) }
        : sub {
              my @suspects = qw(utf-8 euc-jp shift_jis); # xxx japanese only?

              # Calling ->name on a failed guess dies; the eval turns
              # that into a false return value.
              return eval { Encode::Guess::guess_encoding($_[0], @suspects)->name };
          };
}
23
# Remove everything that looks like an HTML tag (any "<...>" run) from the
# given string, then decode the HTML entities in what is left.
sub strip_html {
    my ($text) = @_;

    $text =~ s/<[^>]*>//g;

    return HTML::Entities::decode($text);
}
29
# Build the width/height attribute string for an image so that it fits
# inside a bounding box while keeping its aspect ratio.
#
#   $img - hash ref with the image's own {width} and {height}
#   $p   - hash ref with the maximum allowed {width} and {height}
#
# Returns 'width="W" height="H"'. The image's own size is used unchanged
# when it already fits; otherwise both sides are scaled by the same factor.
# Returns '' when either image dimension is missing or zero.
sub dumbnail {
    my($img, $p) = @_;

    # Both sides are required: a missing or zero side cannot be scaled and
    # would be used as a divisor below.
    return '' unless $img->{width} && $img->{height};

    if ($img->{width} <= $p->{width} && $img->{height} <= $p->{height}) {
        return qq(width="$img->{width}" height="$img->{height}");
    }

    # The smaller of the two ratios is the one that makes both sides fit.
    my $ratio = min(
        $p->{width}  / $img->{width},
        $p->{height} / $img->{height},
    );

    # %d truncates the scaled sizes to whole pixels.
    return sprintf qq(width="%d" height="%d"),
        ($img->{width} * $ratio), ($img->{height} * $ratio);
}
47
# Turn raw bytes into a decoded Perl string.
#
# Accepts either a plain scalar holding the bytes or a URI::Fetch::Response
# object (its ->content is decoded). Any other reference is reported through
# Plagger->context->error.
#
# The charset is taken from the first source that yields one:
#   1) the charset parameter of the HTTP Content-Type header (responses only)
#   2) the encoding given in the XML declaration
#   3) an HTML <meta> charset declaration
#   4) the guess made by $Detector
#   5) utf-8
#
# Returns the decoded string.
sub decode_content {
    my $stuff = shift;

    require Scalar::Util;

    my $content;
    my $res;
    if (Scalar::Util::blessed($stuff) && $stuff->isa('URI::Fetch::Response')) {
        # isa() rather than an exact class-name match, so subclasses work too.
        $res     = $stuff;
        $content = $res->content;
    } elsif (ref($stuff)) {
        Plagger->context->error("Don't know how to decode " . ref($stuff));
        # NOTE(review): error() presumably throws; bail out in case it does not.
        return;
    } else {
        $content = $stuff;
    }

    my $charset;

    # 1) if it is HTTP response, get charset from HTTP Content-Type header.
    #    content_type() drops the parameters when called in scalar context,
    #    so the raw header value is inspected instead.
    if ($res && (my $http = $res->http_response)) {
        my $header = $http->header('Content-Type');
        if (defined $header) {
            $charset = ($header =~ /charset\s*=\s*["']?([\w\-]+)/i)[0];
        }
    }

    # 2) if there's not, try XML encoding. Either quote style and any
    #    version number are accepted, and further attributes (standalone)
    #    may follow the encoding.
    $charset ||= ( $content =~ /<\?xml\s+version\s*=\s*["'][\d.]+["']\s+encoding\s*=\s*["']([\w\-.]+)["']/ )[0];

    # 3) if there's not, try META tag: first the http-equiv form (the
    #    wildcard is confined to the content attribute value), then the
    #    short <meta charset="..."> form.
    $charset ||= ( $content =~ m!<meta\s+http-equiv\s*=\s*["']?Content-Type["']?\s+content\s*=\s*["'][^"'>]*?charset\s*=\s*([\w\-]+)!i )[0];
    $charset ||= ( $content =~ m!<meta\s+charset\s*=\s*["']?([\w\-]+)!i )[0];

    # 4) if there's not still, try Detector/Guess
    $charset ||= $Detector->($content);

    # 5) falls back to UTF-8
    $charset ||= 'utf-8';

    my $decoded = eval { Encode::decode($charset, $content) };

    # A declared charset can be a name Encode does not know. Log it and
    # retry with a detected charset instead of the declared one.
    if ($@ && $@ =~ /Unknown encoding/) {
        Plagger->context->log(warn => $@);
        $charset = $Detector->($content) || 'utf-8';
        $decoded = Encode::decode($charset, $content);
    }

    return $decoded;
}
91
# Pull the text of the first <title> element out of an HTML document.
#
# Returns the title with surrounding whitespace trimmed and HTML entities
# decoded, or nothing (undef / empty list) when no non-empty title is found.
sub extract_title {
    my $content = shift;

    # Tag names are case-insensitive in HTML and the start tag may carry
    # attributes, so <TITLE> and <title lang="en"> are accepted as well.
    my($title) = $content =~ m!<title\b[^>]*>\s*(.*?)\s*</title\s*>!is
        or return;

    # Test the length rather than the truth so that a title of "0" is kept.
    return unless length $title;

    return HTML::Entities::decode($title);
}
97
# Load the content behind $uri and return it.
#
#   $uri    - a URI object with an http, https or file scheme, or a
#             reference to a scalar that already holds the data
#   $plugin - passed through to Plagger::UserAgent->fetch for remote URIs
#
# Data given by scalar reference is returned as is. Fetched and local file
# data is passed through decode_content. Returns nothing when a remote fetch
# fails (the failure is logged). Unreadable files and unsupported schemes
# are reported through Plagger->context->error.
sub load_uri {
    my($uri, $plugin) = @_;

    # In-memory data: nothing to fetch or decode.
    return $$uri if ref($uri) eq 'SCALAR';

    my $scheme = $uri->scheme;
    $scheme = '' unless defined $scheme;    # relative URI: no scheme at all

    if ($scheme =~ /^https?$/) {
        # Loaded on demand so that scalar and file input do not need the
        # HTTP client.
        require Plagger::UserAgent;

        Plagger->context->log(debug => "Fetch remote file from $uri");

        my $response = Plagger::UserAgent->new->fetch($uri, $plugin);
        if ($response->is_error) {
            Plagger->context->log(error => "GET $uri failed: " .
                                  $response->http_status . " " .
                                  $response->http_response->message);
            # Do not hand the body of an error response to the caller as if
            # it were the requested data.
            return;
        }
        return decode_content($response);
    }

    if ($scheme eq 'file') {
        # ->path is still URI-escaped ("%20" etc.); ->file maps the URI to
        # a local file name. Fall back to ->path if no mapping exists.
        my $path = $uri->file;
        $path = $uri->path unless defined $path;

        Plagger->context->log(debug => "Open local file " . $path);
        open my $fh, '<', $path
            or Plagger->context->error( $path . ": $!" );
        my $raw = do { local $/; <$fh> };
        close $fh;

        return decode_content($raw);
    }

    Plagger->context->error("Unsupported URI scheme: " . $scheme);
    return;
}
130
131 1;
Note: See TracBrowser for help on using the browser.