root/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLTidy.pm

Revision 1875 (checked in by miyagawa, 14 years ago)

Filter::HTMLTidy: tweak default config a bit to generate saner XHTML. Refs #334

Line 
1 package Plagger::Plugin::Filter::HTMLTidy;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use HTML::Tidy;
6
# Plugin entry point: subscribes this plugin's filter() callback to the
# 'update.entry.fixup' hook on the given context.
#
#   $self    - the plugin instance
#   $context - object providing register_hook()
sub register {
    my $self    = shift;
    my $context = shift;

    $context->register_hook($self, 'update.entry.fixup' => \&filter);
}
14
# Baseline HTML Tidy options. filter() applies each of these only when the
# plugin's own config does not already set the same key.
our %defaults = (
    break_before_br => 0,
    doctype         => 'omit',
    input_encoding  => 'utf8',
    output_encoding => 'utf8',
    output_xhtml    => 1,
    tidy_mark       => 0,
    wrap            => 0,
);
24
# Hook callback for 'update.entry.fixup': runs the entry's HTML body through
# HTML::Tidy and stores the cleaned markup back on the entry.
#
#   $self    - the plugin instance (its conf() supplies HTML Tidy options)
#   $context - hook context (unused here)
#   $args    - hash ref; $args->{entry} is the entry whose body is filtered
#
# Entries with no body, or whose body is not HTML, are left untouched.
sub filter {
    my ($self, $context, $args) = @_;

    my $body = $args->{entry}->body;
    return unless $body && $body->is_html;

    # Build the effective option set on a private copy so the plugin's stored
    # config is not modified. User option names are normalised from the
    # hyphenated form shown in the SYNOPSIS ("output-xhtml") to the underscore
    # form used by %defaults ("output_xhtml"); otherwise a user setting and
    # the default for the same option would both be handed to HTML::Tidy
    # under different spellings.
    # NOTE(review): relies on HTML::Tidy accepting underscore option names,
    # which %defaults already depends on.
    my $plugin_conf = $self->conf || {};
    my %user_conf;
    for my $key (keys %$plugin_conf) {
        (my $name = $key) =~ tr/-/_/;
        $user_conf{$name} = $plugin_conf->{$key};
    }
    my %tidy_conf = (%defaults, %user_conf);

    # Pass the merged options (not the raw plugin config), so the defaults
    # still apply when the plugin has no config at all.
    my $tidy = HTML::Tidy->new(\%tidy_conf);
    $tidy->ignore( type => TIDY_WARNING );
    my $new_body = $tidy->clean($body->data); # pass in Unicode string, not UTF-8

    # If tidy produced nothing, keep the original body rather than
    # overwriting it with undef.
    return unless defined $new_body;

    # HACK to extract <body /> only
    $new_body =~ s!^.*<body>\s*(.*?)\s*</body>\s*</html>\s*$!$1!s;

    $args->{entry}->body($new_body);
}
45
46 1;
47 __END__
48
49 =head1 NAME
50
51 Plagger::Plugin::Filter::HTMLTidy - Filters body HTML using HTML::Tidy
52
53 =head1 SYNOPSIS
54
55   - module: Filter::HTMLTidy
56     config:
57       output-xhtml: yes
58       char-encoding: utf8
59
60 =head1 DESCRIPTION
61
62 This plugin glues HTML::Tidy as an entry filter, so it scrubs HTML to
63 make it tidy. Best used with Publish plugins like Planet.
64
65 =head1 CONFIG
66
67 This plugin accepts any config option that can be used in an HTML Tidy
68 config file.  See L<http://tidy.sourceforge.net/docs/quickref.html> for details.
69
70 =head1 AUTHOR
71
72 Tatsuhiko Miyagawa
73
74 =head1 SEE ALSO
75
76 L<Plagger>, L<HTML::Tidy>, L<http://tidy.sourceforge.net/docs/quickref.html>
77
78 =cut
Note: See TracBrowser for help on using the browser.