root/trunk/plagger/lib/Plagger/Util.pm

Revision 1668 (checked in by miyagawa, 14 years ago)
  • merged [1621] from hackathon-summary branch
  • autoload Filter::FloatingDateTime? from Publish::Feed to address invalid RFC date format. Fixes #400
Line 
1 package Plagger::Util;
2 use strict;
3 our @ISA = qw(Exporter);
4 our @EXPORT_OK = qw( strip_html dumbnail decode_content extract_title load_uri mime_type_of filename_for mime_is_enclosure );
5
6 use Encode ();
7 use List::Util qw(min);
8 use HTML::Entities;
9 use MIME::Types;
10 use MIME::Type;
11
12 our $Detector;
13
14 BEGIN {
15     if ( eval { require Encode::Detect::Detector; 1 } ) {
16         $Detector = sub { Encode::Detect::Detector::detect($_[0]) };
17     } else {
18         require Encode::Guess;
19         $Detector = sub {
20             my @guess = qw(utf-8 euc-jp shift_jis); # xxx japanese only?
21             eval { Encode::Guess::guess_encoding($_[0], @guess)->name };
22         };
23     }
24 }
25
26 sub strip_html {
27     my $html = shift;
28     $html =~ s/<[^>]*>//g;
29     HTML::Entities::decode($html);
30 }
31
32 sub dumbnail {
33     my($img, $p) = @_;
34
35     if (!$img->{width} && !$img->{height}) {
36         return '';
37     }
38
39     if ($img->{width} <= $p->{width} && $img->{height} <= $p->{height}) {
40         return qq(width="$img->{width}" height="$img->{height}");
41     }
42
43     my $ratio_w = $p->{width}  / $img->{width};
44     my $ratio_h = $p->{height} / $img->{height};
45     my $ratio   = min($ratio_w, $ratio_h);
46
47     sprintf qq(width="%d" height="%d"), ($img->{width} * $ratio), ($img->{height} * $ratio);
48 }
49
50 sub decode_content {
51     my $stuff = shift;
52
53     my $content;
54     my $res;
55     if (ref($stuff) && ref($stuff) eq 'URI::Fetch::Response') {
56         $res     = $stuff;
57         $content = $res->content;
58     } elsif (ref($stuff)) {
59         Plagger->context->error("Don't know how to decode " . ref($stuff));
60     } else {
61         $content = $stuff;
62     }
63
64     my $charset;
65
66     # 1) if it is HTTP response, get charset from HTTP Content-Type header
67     if ($res) {
68         $charset = ($res->content_type =~ /charset=([\w\-]+)/)[0];
69     }
70
71     # 2) if there's not, try XML encoding
72     $charset ||= ( $content =~ /<\?xml version="1.0" encoding="([\w\-]+)"\?>/ )[0];
73
74     # 3) if there's not, try META tag
75     $charset ||= ( $content =~ m!<meta http-equiv="Content-Type" content=".*charset=([\w\-]+)"!i )[0];
76
77     # 4) if there's not still, try Detector/Guess
78     $charset ||= $Detector->($content);
79
80     # 5) falls back to UTF-8
81     $charset ||= 'utf-8';
82
83     my $decoded = eval { Encode::decode($charset, $content) };
84
85     if ($@ && $@ =~ /Unknown encoding/) {
86         Plagger->context->log(warn => $@);
87         $charset = $Detector->($content) || 'utf-8';
88         $decoded = Encode::decode($charset, $content);
89     }
90
91     $decoded;
92 }
93
94 sub extract_title {
95     my $content = shift;
96     my $title = ($content =~ m!<title>\s*(.*?)\s*</title>!is)[0] or return;
97     HTML::Entities::decode($1);
98 }
99
100 sub load_uri {
101     my($uri, $plugin) = @_;
102
103     require Plagger::UserAgent;
104
105     my $data;
106     if (ref($uri) eq 'SCALAR') {
107         $data = $$uri;
108     }
109     elsif ($uri->scheme =~ /^https?$/) {
110         Plagger->context->log(debug => "Fetch remote file from $uri");
111
112         my $response = Plagger::UserAgent->new->fetch($uri, $plugin);
113         if ($response->is_error) {
114             Plagger->context->log(error => "GET $uri failed: " .
115                                   $response->http_status . " " .
116                                   $response->http_response->message);
117         }
118         $data = decode_content($response);
119     }
120     elsif ($uri->scheme eq 'file') {
121         Plagger->context->log(debug => "Open local file " . $uri->file);
122         open my $fh, '<', $uri->file
123             or Plagger->context->error( $uri->file . ": $!" );
124         $data = decode_content(join '', <$fh>);
125     }
126     else {
127         Plagger->context->error("Unsupported URI scheme: " . $uri->scheme);
128     }
129
130     return $data;
131 }
132
133 our $mimetypes = MIME::Types->new;
134 $mimetypes->addType( MIME::Type->new(type => 'video/x-flv', extensions => [ 'flv' ]) );
135 $mimetypes->addType( MIME::Type->new(type => 'audio/aac', extensions => [ 'm4a', 'aac' ]) );
136 $mimetypes->addType( MIME::Type->new(type => 'video/mp4', extensions => [ 'mp4', 'm4v' ]) );
137
138 sub mime_type_of {
139     my $ext = shift;
140
141     if (UNIVERSAL::isa($ext, 'URI')) {
142         $ext = ( $ext->path =~ /\.(\w+)$/ )[0];
143     }
144
145     return unless $ext;
146     return $mimetypes->mimeTypeOf($ext);
147 }
148
149 sub mime_is_enclosure {
150     my $mime = shift;
151     return unless $mime;
152     $mime->mediaType =~ m!^(?:audio|video|image)$! || $mime->type eq 'application/ogg';
153 }
154
155 my %entities = (
156     '&' => '&amp;',
157     '<' => '&lt;',
158     '>' => '&gt;',
159     "'" => '&quot;',
160 );
161
162 my $entities_re = join '|', keys %entities;
163
164 sub encode_xml {
165     my $stuff = shift;
166     $stuff =~ s/($entities_re)/$entities{$1}/g;
167     $stuff;
168 }
169
170 my %formats = (
171     'u' => sub { my $s = $_[0]->url;  $s =~ s!^https?://!!; $s },
172     'l' => sub { my $s = $_[0]->link; $s =~ s!^https?://!!; $s },
173     't' => sub { $_[0]->title },
174     'i' => sub { $_[0]->id_safe },
175 );
176
177 my $format_re = qr/%(u|l|t|i)/;
178
179 sub filename_for {
180     my($feed, $file) = @_;
181     $file =~ s{$format_re}{
182         safe_filename($formats{$1}->($feed))
183     }egx;
184     $file;
185 }
186
187 sub safe_filename {
188     my($path) = @_;
189     $path =~ s![^\w\s]+!_!g;
190     $path =~ s!\s+!_!g;
191     $path;
192 }
193
194 sub safe_id {
195     my $id = shift;
196     $id =~ s/^urn:guid://;
197     $id =~ /^([\w\-]+)$/ ? $1 : Digest::MD5::md5_hex($id);
198 }
199
200 1;
Note: See TracBrowser for help on using the browser.