root/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLScrubber.pm

Revision 1883 (checked in by nik, 14 years ago)

Include a space in the output so that the URL doesn't run in to the text.

Line 
1 package Plagger::Plugin::Filter::HTMLScrubber;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use HTML::Scrubber;
6
# Built-in per-tag rules, returned as a flat key/value list.
#
# - <img>: "src" is kept only when it starts with "http://", "alt" is always
#   kept, and every other attribute is dropped.
# - <style> and <script> are denied outright.
sub rules {
    my $img_rule = {
        '*' => 0,               # anything not listed below is denied
        alt => 1,               # alt text is always allowed
        src => qr{^http://},    # only plain http:// image URLs
    };

    return ( img => $img_rule, style => 0, script => 0 );
}
18
# Built-in default attribute rules, returned as a flat key/value list.
#
# Everything is allowed unless listed: href/src/cite are given a pattern that
# rejects values starting with "script" or "javascript" (case-insensitive),
# and inline event handlers, "language", "type" and "style" are denied.
#
# NOTE: "src" appears twice in the list.  When the list is assigned to a hash
# the later entry wins, so "src" effectively ends up as 0 (denied) by default.
# Both entries are kept so the returned list is unchanged.
sub default {
    my $not_script = qr{^(?!(?:java)?script)}i;

    my @event_handlers = qw(
        onblur      onchange    onclick     ondblclick  onerror
        onfocus     onkeydown   onkeypress  onkeyup     onload
        onmousedown onmousemove onmouseout  onmouseover onmouseup
        onreset     onselect    onsubmit    onunload
    );

    return (
        '*'      => 1,              # default rule, allow all attributes
        href     => $not_script,
        src      => $not_script,
        # Plain string, not a qr// object; how HTML::Scrubber interprets a
        # string rule is not visible from this module.
        cite     => '(?i-xsm:^(?!(?:java)?script))',
        language => 0,
        name     => 1,              # could be sneaky, but hey ;)
        ( map { ( $_ => 0 ) } @event_handlers ),
        src      => 0,              # overrides the pattern above
        type     => 0,
        style    => 0,
    );
}
51
# Registers this plugin's callbacks with the given context:
# "plugin.init" builds the scrubber, "update.entry.fixup" scrubs an entry.
sub register {
    my ( $self, $context ) = @_;

    my @hooks = (
        'update.entry.fixup' => \&update,
        'plugin.init'        => \&initialize,
    );

    $context->register_hook( $self, @hooks );
}
61
# Hook callback for "plugin.init": builds the HTML::Scrubber instance and
# stores it in $self->{scrubber}.
#
# Config handling:
#   no_default_configs - when true, the built-in rules()/default() are skipped
#   rules              - hash merged over the built-in tag rules
#   default            - hash merged over the built-in attribute defaults
#   anything else      - treated as "method: value" and invoked on the
#                        scrubber; array and hash values are flattened into
#                        the argument list
#
# Any pass-through method that dies is reported via $context->error.
# Returns the scrubber object.
sub initialize {
    my ( $self, $context ) = @_;

    # Work on a shallow copy: the special keys are removed below, and doing
    # that on the plugin's own config hash would destroy it for any later
    # reader (or a second initialisation).
    my %config = %{ $self->conf || {} };

    my $skip_defaults = delete $config{no_default_configs};
    my $extra_rules   = delete $config{rules}   || {};
    my $extra_default = delete $config{default} || {};

    my ( %rules, %default );
    unless ($skip_defaults) {
        %rules   = $self->rules;
        %default = $self->default;
    }

    my $scrubber = HTML::Scrubber->new;

    # User-supplied entries come last so they override the built-ins.
    $scrubber->rules( %rules, %$extra_rules );
    $scrubber->default( 1, { %default, %$extra_default } );

    # Remaining keys are scrubber method names.  Sorted so the call order is
    # deterministic rather than dependent on hash ordering.
    for my $method ( sort keys %config ) {
        my $arg = $config{$method};
        my @call_args
            = ref $arg eq 'ARRAY' ? @$arg
            : ref $arg eq 'HASH'  ? %$arg
            :                       $arg;

        my $ok = eval { $scrubber->$method(@call_args); 1 };
        $context->error(qq/Invalid method call "$method": $@/) unless $ok;
    }

    return $self->{scrubber} = $scrubber;
}
90
# Hook callback for "update.entry.fixup": if the entry has an HTML body, run
# it through the scrubber built in initialize() and store the result back on
# the entry.  Entries with no body, or a non-HTML body, are left untouched.
sub update {
    my ( $self, $context, $args ) = @_;

    my $entry = $args->{entry};
    my $body  = $entry->body;

    if ( defined $body && $body->is_html ) {
        # Parentheses are required: "." binds tighter than "||", so without
        # them the '(no-link)' fallback could never be selected.
        $context->log(
            debug => "Scrubbing body for " . ( $entry->permalink || '(no-link)' ) );

        $entry->body( $self->{scrubber}->scrub($body) );
    }
}
100
101 1;
102
103 __END__
104
105 =head1 NAME
106
107 Plagger::Plugin::Filter::HTMLScrubber - Scrub feed content
108
109 =head1 SYNOPSIS
110
111   - module: Filter::HTMLScrubber
112     config:
113       rules:
114         style: 0
115         script: 0
116
117 =head1 DESCRIPTION
118
119 This plugin scrubs feed content using L<HTML::Scrubber>.
120
121 All config parameters (except C<no_default_configs>) are passed through as
122 HTML::Scrubber method calls of the form C<method: value>.  For example, if you write:
123
124     method: value
125
126 in the config: section, this plugin will automatically turn the config
127 into the method call:
128
129     $scrubber->method('value');
130
131 See L<HTML::Scrubber> document for details.
132
133 =head1 CONFIG
134
135 =over 4
136
137 =item no_default_configs
138
139 Some rules and default config parameters are set by default. See I<rules>
140 and I<default> methods defined in this module code for details.
141
142 If you don't need these settings, use C<no_default_configs>
143
144    no_default_configs: 1
145
146 Defaults to 0, which means it uses the default (somewhat secure) config.
147
148 =back
149
150 =head1 AUTHOR
151
152 Daisuke Murase <typester@cpan.org>
153
154 Tatsuhiko Miyagawa
155
156 =head1 SEE ALSO
157
158 L<Plagger>, L<HTML::Scrubber>
159
160 =cut
Note: See TracBrowser for help on using the browser.