root/trunk/plagger/lib/Plagger/Util.pm

Revision 1741 (checked in by miyagawa, 14 years ago)

merge from hackathon-summary

Line 
1 package Plagger::Util;
2 use strict;
3 our @ISA = qw(Exporter);
4 our @EXPORT_OK = qw( strip_html dumbnail decode_content extract_title load_uri mime_type_of filename_for mime_is_enclosure );
5
6 use Encode ();
7 use List::Util qw(min);
8 use HTML::Entities;
9 use HTML::Tagset;
10 use MIME::Types;
11 use MIME::Type;
12 use Plagger::Text;
13
14 our $Detector;
15
16 BEGIN {
17     if ( eval { require Encode::Detect::Detector; 1 } ) {
18         $Detector = sub { Encode::Detect::Detector::detect($_[0]) };
19     } else {
20         require Encode::Guess;
21         $Detector = sub {
22             my @guess = qw(utf-8 euc-jp shift_jis); # xxx japanese only?
23             eval { Encode::Guess::guess_encoding($_[0], @guess)->name };
24         };
25     }
26 }
27
28 sub strip_html {
29     my $html = shift;
30
31     eval {
32         require HTML::FormatText;
33         require HTML::TreeBuilder;
34     };
35
36     if ($@) {
37         # dump stripper
38         $html =~ s/<[^>]*>//g;
39         return HTML::Entities::decode($html);
40     }
41
42     my $tree = HTML::TreeBuilder->new;
43     $tree->parse($html);
44     $tree->eof;
45
46     my $formatter = HTML::FormatText->new(leftmargin => 0);
47     my $text = $formatter->format($tree);
48 #    utf8::decode($text);
49     $text =~ s/\s*$//s;
50     $text;
51 }
52
53 sub dumbnail {
54     my($img, $p) = @_;
55
56     if (!$img->{width} && !$img->{height}) {
57         return '';
58     }
59
60     if ($img->{width} <= $p->{width} && $img->{height} <= $p->{height}) {
61         return qq(width="$img->{width}" height="$img->{height}");
62     }
63
64     my $ratio_w = $p->{width}  / $img->{width};
65     my $ratio_h = $p->{height} / $img->{height};
66     my $ratio   = min($ratio_w, $ratio_h);
67
68     sprintf qq(width="%d" height="%d"), ($img->{width} * $ratio), ($img->{height} * $ratio);
69 }
70
71 sub decode_content {
72     my $stuff = shift;
73
74     my $content;
75     my $res;
76     if (ref($stuff) && ref($stuff) eq 'URI::Fetch::Response') {
77         $res     = $stuff;
78         $content = $res->content;
79     } elsif (ref($stuff)) {
80         Plagger->context->error("Don't know how to decode " . ref($stuff));
81     } else {
82         $content = $stuff;
83     }
84
85     my $charset;
86
87     # 1) if it is HTTP response, get charset from HTTP Content-Type header
88     if ($res) {
89         $charset = ($res->content_type =~ /charset=([\w\-]+)/)[0];
90     }
91
92     # 2) if there's not, try XML encoding
93     $charset ||= ( $content =~ /<\?xml version="1.0" encoding="([\w\-]+)"\?>/ )[0];
94
95     # 3) if there's not, try META tag
96     $charset ||= ( $content =~ m!<meta http-equiv="Content-Type" content=".*charset=([\w\-]+)"!i )[0];
97
98     # 4) if there's not still, try Detector/Guess
99     $charset ||= $Detector->($content);
100
101     # 5) falls back to UTF-8
102     $charset ||= 'utf-8';
103
104     my $decoded = eval { Encode::decode($charset, $content) };
105
106     if ($@ && $@ =~ /Unknown encoding/) {
107         Plagger->context->log(warn => $@);
108         $charset = $Detector->($content) || 'utf-8';
109         $decoded = Encode::decode($charset, $content);
110     }
111
112     $decoded;
113 }
114
115 sub extract_title {
116     my $content = shift;
117     my $title = ($content =~ m!<title>\s*(.*?)\s*</title>!is)[0] or return;
118     HTML::Entities::decode($1);
119 }
120
121 sub load_uri {
122     my($uri, $plugin) = @_;
123
124     require Plagger::UserAgent;
125
126     my $data;
127     if (ref($uri) eq 'SCALAR') {
128         $data = $$uri;
129     }
130     elsif ($uri->scheme =~ /^https?$/) {
131         Plagger->context->log(debug => "Fetch remote file from $uri");
132
133         my $response = Plagger::UserAgent->new->fetch($uri, $plugin);
134         if ($response->is_error) {
135             Plagger->context->log(error => "GET $uri failed: " .
136                                   $response->http_status . " " .
137                                   $response->http_response->message);
138         }
139         $data = decode_content($response);
140     }
141     elsif ($uri->scheme eq 'file') {
142         Plagger->context->log(debug => "Open local file " . $uri->file);
143         open my $fh, '<', $uri->file
144             or Plagger->context->error( $uri->file . ": $!" );
145         $data = decode_content(join '', <$fh>);
146     }
147     else {
148         Plagger->context->error("Unsupported URI scheme: " . $uri->scheme);
149     }
150
151     return $data;
152 }
153
154 our $mimetypes = MIME::Types->new;
155 $mimetypes->addType( MIME::Type->new(type => 'video/x-flv', extensions => [ 'flv' ]) );
156 $mimetypes->addType( MIME::Type->new(type => 'audio/aac', extensions => [ 'm4a', 'aac' ]) );
157 $mimetypes->addType( MIME::Type->new(type => 'video/mp4', extensions => [ 'mp4', 'm4v' ]) );
158
159 sub mime_type_of {
160     my $ext = shift;
161
162     if (UNIVERSAL::isa($ext, 'URI')) {
163         $ext = ( $ext->path =~ /\.(\w+)$/ )[0];
164     }
165
166     return unless $ext;
167     return $mimetypes->mimeTypeOf($ext);
168 }
169
170 sub mime_is_enclosure {
171     my $mime = shift;
172     return unless $mime;
173     $mime->mediaType =~ m!^(?:audio|video|image)$! || $mime->type eq 'application/ogg';
174 }
175
176 my %entities = (
177     '&' => '&amp;',
178     '<' => '&lt;',
179     '>' => '&gt;',
180     '"' => '&quot;',
181     "'" => '&apos;',
182 );
183
184 my $entities_re = join '|', keys %entities;
185
186 sub encode_xml {
187     my $stuff = shift;
188     $stuff =~ s/($entities_re)/$entities{$1}/g;
189     $stuff;
190 }
191
192 my %formats = (
193     'u' => sub { my $s = $_[0]->url;  $s =~ s!^https?://!!; $s },
194     'l' => sub { my $s = $_[0]->link; $s =~ s!^https?://!!; $s },
195     't' => sub { $_[0]->title },
196     'i' => sub { $_[0]->id_safe },
197 );
198
199 my $format_re = qr/%(u|l|t|i)/;
200
201 sub filename_for {
202     my($feed, $file) = @_;
203     $file =~ s{$format_re}{
204         safe_filename($formats{$1}->($feed))
205     }egx;
206     $file;
207 }
208
209 sub safe_filename {
210     my($path) = @_;
211     $path =~ s![^\w\s]+!_!g;
212     $path =~ s!\s+!_!g;
213     $path;
214 }
215
216 sub safe_id {
217     my $id = shift;
218     $id =~ s/^urn:guid://;
219     $id =~ /^([\w\-]+)$/ ? $1 : Digest::MD5::md5_hex($id);
220 }
221
222 1;
Note: See TracBrowser for help on using the browser.