root/trunk/plagger/lib/Plagger/Plugin/Summary/Simple.pm

Revision 1779 (checked in by miyagawa, 14 years ago)

Summary::Simple: tighten up the regexp

Line 
1 package Plagger::Plugin::Summary::Simple;
2 use strict;
3 use base qw( Plagger::Plugin );
4
# Plugin entry point: subscribes this plugin to the 'summarizer.summarize'
# hook so that summarize() is invoked when that hook runs.
#
#   $plugin - the plugin instance being registered
#   $ctx    - the context object that exposes register_hook()
#
# Returns whatever register_hook() returns.
sub register {
    my ($plugin, $ctx) = @_;

    my %hook_for = ('summarizer.summarize' => \&summarize);
    return $ctx->register_hook($plugin, %hook_for);
}
12
# Hook callback for 'summarizer.summarize': derives a short summary from
# $args->{text}.
#
# Arguments:
#   $self    - the plugin instance (not used in the body)
#   $context - the context object (not used in the body)
#   $args    - hashref; $args->{text} is either a plain string, which is
#              wrapped via Plagger::Text->new_from_text, or an object that
#              responds to is_html() and data()
#
# Returns:
#   HTML input:
#     * the first leading element whose lowercased tag name is flagged in
#       %HTML::Tagset::isBodyElement, rebuilt as "<tag>content</tag>"
#       (attributes of the original tag are not carried over); leading
#       elements that are not flagged (and <div>, see below) are unwrapped
#       and the search continues with their content;
#     * otherwise everything before the first <br> / <br />;
#     * otherwise the whole HTML string.
#   Plain-text input:
#     * the first sentence: 20-254 characters followed by U+3002
#       (ideographic full stop) or by a period and whitespace, with
#       trailing whitespace trimmed;
#     * otherwise the first 255 characters plus "..." when the text is
#       longer than 255 characters;
#     * otherwise the text itself.
sub summarize {
    my($self, $context, $args) = @_;

    my $text = $args->{text};
    $text = Plagger::Text->new_from_text($text) unless ref $text;

    if ($text->is_html) {
        # HTML: grab first block paragraph, or until first <br />
        # Temporarily mark <div> as "not a body element" so that it is
        # treated as a transparent wrapper and the loop descends into it.
        local $HTML::Tagset::isBodyElement{div} = 0;
        my $html = $text->data;

        # Peel one leading element per iteration. The lookahead after the
        # tag name requires the name to end right there (whitespace, "/" or
        # ">"). Without it the name capture could backtrack to a prefix,
        # e.g. "<img ...>" or "<br>" being read as <i>/<b> and paired with
        # an unrelated </i> or </b> further down the document.
        while ($html =~ s{^\s*<(\w+)(?=[\s/>])[^>]*>(.*?)</\1>}{$2}s) {
            # Copy the captures before anything else can clobber them.
            my($tag, $inner) = ($1, $2);
            if ($HTML::Tagset::isBodyElement{lc $tag}) {
                return "<$tag>$inner</$tag>";
            }
        }

        # No usable leading element: fall back to the text before the
        # first line break, or to the complete HTML.
        if ($text->data =~ m!^(.*?)<br\s*/?>!s) {
            return $1;
        } else {
            return $text->data;
        }
    } else {
        # text: strip until the ending dots
        # NOTE(review): $text is always a reference at this point, so the
        # regex match, length() and substr() below rely on the object
        # stringifying to its content -- presumably via overloading in
        # Plagger::Text; confirm.
        # TODO: make this 255 configurable?
        if ($text =~ /^(.{20,254}?(\x{3002}|\.\s))/) {
            (my $summary = $1) =~ s/\s*$//;
            return $summary;
        }

        if (length($text) > 255) {
            return substr($text, 0, 255) . "...";
        } else {
            return $text;
        }
    }
}
49
50 1;
51 __END__
52
53 =head1 NAME
54
55 Plagger::Plugin::Summary::Simple - Default summary generator
56
57 =head1 SYNOPSIS
58
59   # this is not actually needed
60   - module: Summary::Simple
61
62 =head1 DESCRIPTION
63
64 Summary::Simple is a core plugin that does simple generation of summary
65 using an HTML snippet extraction algorithm. This plugin is autoloaded
66 from Plagger core and if you don't load any Summary plugins, or all of
67 your plugins decline to handle summary generation, Plagger falls back
68 to this plugin.
69
70 =head1 AUTHOR
71
72 Tatsuhiko Miyagawa
73
74 =head1 SEE ALSO
75
76 L<Plagger>
77
78 =cut
Note: See TracBrowser for help on using the browser.