root/trunk/plagger/lib/Plagger/Util.pm

Revision 1884 (checked in by miyagawa, 14 years ago)

use Digest::MD5 which is left unloaded somehow

Line 
1 package Plagger::Util;
2 use strict;
3 our @ISA = qw(Exporter);
4 our @EXPORT_OK = qw( strip_html dumbnail decode_content extract_title load_uri mime_type_of filename_for mime_is_enclosure );
5
6 use Digest::MD5;
7 use Encode ();
8 use List::Util qw(min);
9 use HTML::Entities;
10 use HTML::Tagset;
11 use MIME::Types;
12 use MIME::Type;
13 use Plagger::Text;
14
15 our $Detector;
16
17 BEGIN {
18     if ( eval { require Encode::Detect::Detector; 1 } ) {
19         $Detector = sub { Encode::Detect::Detector::detect($_[0]) };
20     } else {
21         require Encode::Guess;
22         $Detector = sub {
23             my @guess = qw(utf-8 euc-jp shift_jis); # xxx japanese only?
24             eval { Encode::Guess::guess_encoding($_[0], @guess)->name };
25         };
26     }
27 }
28
29 sub strip_html {
30     my $html = shift;
31
32     eval {
33         require HTML::FormatText;
34         require HTML::TreeBuilder;
35     };
36
37     if ($@) {
38         # dump stripper
39         $html =~ s/<[^>]*>//g;
40         return HTML::Entities::decode($html);
41     }
42
43     my $tree = HTML::TreeBuilder->new;
44     $tree->parse($html);
45     $tree->eof;
46
47     my $formatter = HTML::FormatText->new(leftmargin => 0);
48     my $text = $formatter->format($tree);
49 #    utf8::decode($text);
50     $text =~ s/\s*$//s;
51     $text;
52 }
53
54 sub dumbnail {
55     my($img, $p) = @_;
56
57     if (!$img->{width} && !$img->{height}) {
58         return '';
59     }
60
61     if ($img->{width} <= $p->{width} && $img->{height} <= $p->{height}) {
62         return qq(width="$img->{width}" height="$img->{height}");
63     }
64
65     my $ratio_w = $p->{width}  / $img->{width};
66     my $ratio_h = $p->{height} / $img->{height};
67     my $ratio   = min($ratio_w, $ratio_h);
68
69     sprintf qq(width="%d" height="%d"), ($img->{width} * $ratio), ($img->{height} * $ratio);
70 }
71
72 sub decode_content {
73     my $stuff = shift;
74
75     my $content;
76     my $res;
77     if (ref($stuff) && ref($stuff) eq 'URI::Fetch::Response') {
78         $res     = $stuff;
79         $content = $res->content;
80     } elsif (ref($stuff)) {
81         Plagger->context->error("Don't know how to decode " . ref($stuff));
82     } else {
83         $content = $stuff;
84     }
85
86     my $charset;
87
88     # 1) if it is HTTP response, get charset from HTTP Content-Type header
89     if ($res) {
90         $charset = ($res->content_type =~ /charset=([\w\-]+)/)[0];
91     }
92
93     # 2) if there's not, try XML encoding
94     $charset ||= ( $content =~ /<\?xml version="1.0" encoding="([\w\-]+)"\?>/ )[0];
95
96     # 3) if there's not, try META tag
97     $charset ||= ( $content =~ m!<meta http-equiv="Content-Type" content=".*charset=([\w\-]+)"!i )[0];
98
99     # 4) if there's not still, try Detector/Guess
100     $charset ||= $Detector->($content);
101
102     # 5) falls back to UTF-8
103     $charset ||= 'utf-8';
104
105     my $decoded = eval { Encode::decode($charset, $content) };
106
107     if ($@ && $@ =~ /Unknown encoding/) {
108         Plagger->context->log(warn => $@);
109         $charset = $Detector->($content) || 'utf-8';
110         $decoded = Encode::decode($charset, $content);
111     }
112
113     $decoded;
114 }
115
116 sub extract_title {
117     my $content = shift;
118     my $title = ($content =~ m!<title>\s*(.*?)\s*</title>!is)[0] or return;
119     HTML::Entities::decode($1);
120 }
121
122 sub load_uri {
123     my($uri, $plugin) = @_;
124
125     require Plagger::UserAgent;
126
127     my $data;
128     if (ref($uri) eq 'SCALAR') {
129         $data = $$uri;
130     }
131     elsif ($uri->scheme =~ /^https?$/) {
132         Plagger->context->log(debug => "Fetch remote file from $uri");
133
134         my $response = Plagger::UserAgent->new->fetch($uri, $plugin);
135         if ($response->is_error) {
136             Plagger->context->log(error => "GET $uri failed: " .
137                                   $response->http_status . " " .
138                                   $response->http_response->message);
139         }
140         $data = decode_content($response);
141     }
142     elsif ($uri->scheme eq 'file') {
143         Plagger->context->log(debug => "Open local file " . $uri->file);
144         open my $fh, '<', $uri->file
145             or Plagger->context->error( $uri->file . ": $!" );
146         $data = decode_content(join '', <$fh>);
147     }
148     else {
149         Plagger->context->error("Unsupported URI scheme: " . $uri->scheme);
150     }
151
152     return $data;
153 }
154
155 our $mimetypes = MIME::Types->new;
156 $mimetypes->addType( MIME::Type->new(type => 'video/x-flv', extensions => [ 'flv' ]) );
157 $mimetypes->addType( MIME::Type->new(type => 'audio/aac', extensions => [ 'm4a', 'aac' ]) );
158 $mimetypes->addType( MIME::Type->new(type => 'video/mp4', extensions => [ 'mp4', 'm4v' ]) );
159
160 sub mime_type_of {
161     my $ext = shift;
162
163     if (UNIVERSAL::isa($ext, 'URI')) {
164         $ext = ( $ext->path =~ /\.(\w+)$/ )[0];
165     }
166
167     return unless $ext;
168     return $mimetypes->mimeTypeOf($ext);
169 }
170
171 sub mime_is_enclosure {
172     my $mime = shift;
173     return unless $mime;
174     $mime->mediaType =~ m!^(?:audio|video|image)$! || $mime->type eq 'application/ogg';
175 }
176
177 my %entities = (
178     '&' => '&amp;',
179     '<' => '&lt;',
180     '>' => '&gt;',
181     '"' => '&quot;',
182     "'" => '&apos;',
183 );
184
185 my $entities_re = join '|', keys %entities;
186
187 sub encode_xml {
188     my $stuff = shift;
189     $stuff =~ s/($entities_re)/$entities{$1}/g;
190     $stuff;
191 }
192
193 my %formats = (
194     'u' => sub { my $s = $_[0]->url;  $s =~ s!^https?://!!; $s },
195     'l' => sub { my $s = $_[0]->link; $s =~ s!^https?://!!; $s },
196     't' => sub { $_[0]->title },
197     'i' => sub { $_[0]->id_safe },
198 );
199
200 my $format_re = qr/%(u|l|t|i)/;
201
202 sub filename_for {
203     my($feed, $file) = @_;
204     $file =~ s{$format_re}{
205         safe_filename($formats{$1}->($feed))
206     }egx;
207     $file;
208 }
209
210 sub safe_filename {
211     my($path) = @_;
212     $path =~ s![^\w\s]+!_!g;
213     $path =~ s!\s+!_!g;
214     $path;
215 }
216
217 sub safe_id {
218     my $id = shift;
219     $id =~ s/^urn:guid://;
220     $id =~ /^([\w\-]+)$/ ? $1 : Digest::MD5::md5_hex($id);
221 }
222
223 1;
Note: See TracBrowser for help on using the browser.