root/trunk/plagger/lib/Plagger/Util.pm

Revision 537 (checked in by miyagawa, 14 years ago)

use different var name

Line 
package Plagger::Util;
use strict;

# Load Exporter explicitly. @ISA below names it as the base class that
# provides import(); without loading it here, importing from this module
# only works when some other module has already pulled Exporter in.
use Exporter ();
our @ISA = qw(Exporter);

# Functions callers may request by name; nothing is exported by default.
our @EXPORT_OK = qw( strip_html dumbnail decode_content extract_title );
5
6 use Encode ();
7 use List::Util qw(min);
8 use HTML::Entities;
9
# Code ref used to guess the character encoding of raw content.
# It is called with the content as its only argument and is expected to
# yield an encoding name (callers treat a false result as "unknown").
our $Detector;

BEGIN {
    # Prefer Encode::Detect::Detector when it can be loaded; otherwise fall
    # back to Encode::Guess with a fixed list of candidate encodings.
    my $have_detect = eval { require Encode::Detect::Detector; 1 };

    unless ($have_detect) {
        require Encode::Guess;
    }

    $Detector = $have_detect
        ? sub { Encode::Detect::Detector::detect($_[0]) }
        : sub {
            my @suspects = qw(utf-8 euc-jp shift_jis); # xxx japanese only?
            # guess_encoding() does not return an object when it cannot
            # decide, so ->name dies; the eval turns that into undef.
            eval { Encode::Guess::guess_encoding($_[0], @suspects)->name };
        };
}
23
# strip_html($html)
#
# Removes everything that looks like a tag ("<" up to the next ">") from
# $html and returns the remaining text passed through
# HTML::Entities::decode.
sub strip_html {
    my ($markup) = @_;

    (my $text = $markup) =~ s/<[^>]*>//g;

    return HTML::Entities::decode($text);
}
29
# dumbnail($img, $p)
#
# Builds the width/height attribute string for showing an image inside a
# bounding box without changing its aspect ratio.
#
#   $img - hash ref with the image's own {width} and {height}
#   $p   - hash ref with the maximum {width} and {height} allowed
#
# Returns:
#   - '' when either image dimension is missing or zero,
#   - the image's own dimensions when it already fits inside the box,
#   - otherwise both dimensions scaled by the smaller of the two ratios
#     (truncated to integers by %d).
sub dumbnail {
    my($img, $p) = @_;

    # Both dimensions are required: with only one of them the code below
    # would emit an empty attribute or divide by zero.
    if (!$img->{width} || !$img->{height}) {
        return '';
    }

    if ($img->{width} <= $p->{width} && $img->{height} <= $p->{height}) {
        return qq(width="$img->{width}" height="$img->{height}");
    }

    my $ratio_w = $p->{width}  / $img->{width};
    my $ratio_h = $p->{height} / $img->{height};

    # The smaller ratio is the one that makes both sides fit.
    my $ratio   = min($ratio_w, $ratio_h);

    return sprintf qq(width="%d" height="%d"), ($img->{width} * $ratio), ($img->{height} * $ratio);
}
47
# decode_content($res)
#
# Decodes the body of a fetch response into a Perl character string.
#
#   $res - response object providing ->content and ->http_response
#          (NOTE(review): presumably a URI::Fetch::Response wrapping an
#          HTTP::Response -- confirm against callers)
#
# The charset is chosen from, in order: the HTTP Content-Type header, a
# <meta http-equiv="Content-Type"> tag in the content, $Detector, and
# finally 'utf-8'.
#
# Returns the decoded string. If decoding fails for a reason other than an
# unknown encoding name, the result is undef.
sub decode_content {
    my $res = shift;
    my $content = $res->content;

    # 1) get charset from HTTP Content-Type header
    # content_type is called in list context on purpose: HTTP::Headers
    # returns only the bare media type in scalar context, which would hide
    # the charset parameter. Joining the pieces works for either shape.
    my $content_type = join ';', grep { defined } $res->http_response->content_type;
    my $charset = ($content_type =~ /charset="?([\w\-]+)/i)[0];

    # 2) if there's not, try META tag
    # [^">]* keeps the match inside the content="..." attribute value.
    $charset ||= ( $content =~ m!<meta http-equiv="Content-Type" content="[^">]*charset=([\w\-]+)"!i )[0];

    # 3) if there's not still, try Detector/Guess
    $charset ||= $Detector->($content);

    # 4) falls back to UTF-8
    $charset ||= 'utf-8';

    my $decoded = eval { Encode::decode($charset, $content) };

    # The declared charset may be a name Encode does not know; log it and
    # retry with whatever the detector suggests (or UTF-8).
    if ($@ && $@ =~ /Unknown encoding/) {
        Plagger->context->log(warn => $@);
        $charset = $Detector->($content) || 'utf-8';
        $decoded = Encode::decode($charset, $content);
    }

    return $decoded;
}
74
# extract_title($content)
#
# Returns the text of the first <title> element in $content, with
# surrounding whitespace trimmed and passed through
# HTML::Entities::decode.
#
# The tag is matched case-insensitively and may carry attributes.
# Returns nothing (undef in scalar context) when there is no <title>
# element or its text is empty.
sub extract_title {
    my $content = shift;

    my ($title) = $content =~ m{<title(?:\s[^>]*)?>\s*(.*?)\s*</title\s*>}is;

    # Test definedness and length rather than truth so that a title of "0"
    # is still returned.
    return unless defined $title && length $title;

    return HTML::Entities::decode($title);
}
80
81 1;
Note: See TracBrowser for help on using the browser.