root/trunk/plagger/lib/Plagger/Plugin/Search/KinoSearch.pm

Revision 1391 (checked in by miyagawa, 14 years ago)

merge from hackathon-mt

Line 
1 package Plagger::Plugin::Search::KinoSearch;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use Encode;
6 use KinoSearch::Index::Term;
7 use KinoSearch::InvIndexer;
8 use KinoSearch::Searcher;
9 use KinoSearch::Analysis::PolyAnalyzer;
10
# Subscribe this plugin's handlers to the hooks it takes part in.
#
#   $plugin - the plugin instance (invocant)
#   $ctx    - the object whose register_hook method receives the handlers
sub register {
    my ($plugin, $ctx) = @_;

    # Hook name => handler sub, all defined later in this package.
    my @handlers = (
        'publish.entry'    => \&entry,
        'plugin.init'      => \&initialize,
        'plugin.finalize'  => \&finalize,
        'searcher.search'  => \&search,
    );

    $ctx->register_hook($plugin, @handlers);
}
21
# plugin.init handler: build the analyzer and the index writer, then declare
# the fields every indexed entry may carry.
#
#   $plugin - the plugin instance; gains {analyzer} and {indexer} slots
#   $ctx    - hook context (unused here)
#   $args   - hook arguments (unused here)
sub initialize {
    my ($plugin, $ctx, $args) = @_;

    # Fall back to a path inside the plugin cache when no invindex is configured.
    my $index_path = ( $plugin->conf->{invindex} ||= $plugin->cache->path_to('invindex') );

    # TODO: CJKAnalyzer
    my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new(
        analyzers => [
            KinoSearch::Analysis::LCNormalizer->new,
            KinoSearch::Analysis::Tokenizer->new,
        ],
    );
    $plugin->{analyzer} = $analyzer;

    # Only ask for a fresh index when nothing exists at the target path yet.
    my $needs_create = !-e $index_path;

    my $indexer = KinoSearch::InvIndexer->new(
        invindex => $index_path,
        create   => $needs_create,
        analyzer => $analyzer,
    );
    $plugin->{indexer} = $indexer;

    # Field declarations; the title is given a boost of 3.
    $indexer->spec_field( name => 'link' );
    $indexer->spec_field( name => 'title', boost => 3 );
    $indexer->spec_field( name => 'body' );
    $indexer->spec_field( name => 'date' );
    $indexer->spec_field( name => 'author' );
}
47
# publish.entry handler: (re)index a single entry.
#
#   $self    - the plugin instance; uses the indexer stored in $self->{indexer}
#   $context - hook context, used for logging
#   $args    - hook arguments; $args->{entry} is the entry to index
#
# Entries without a permalink are skipped. Any document previously indexed
# under the same permalink is marked for deletion before the new one is added.
sub entry {
    my($self, $context, $args) = @_;

    my $entry     = $args->{entry};
    my $permalink = $entry->permalink;
    return unless $permalink;
    $context->log(info => "Going to index entry " . $permalink );

    # The permalink is stored in the 'link' field (see the set_value call
    # below); the delete term must name that same field. The previous code
    # used 'url', a field that is never written, so nothing was ever removed.
    # NOTE(review): an exact-match term presumably only hits if 'link' is
    # indexed without analysis -- confirm against the field spec.
    my $term = KinoSearch::Index::Term->new( link => $permalink );
    $self->{indexer}->delete_docs_by_term($term);

    my $doc = $self->{indexer}->new_doc;
    $doc->set_value( link   => $permalink );
    $doc->set_value( title  => $entry->title );
    $doc->set_value( body   => $entry->body_text );

    # Date and author are optional; only store them when present.
    $doc->set_value( date   => $entry->date->format('W3CDTF') ) if $entry->date;
    $doc->set_value( author => $entry->author ) if $entry->author;

    $self->{indexer}->add_doc($doc);
}
66
# plugin.finalize handler: finish the indexing session.
#
#   $self    - the plugin instance; uses the indexer stored in $self->{indexer}
#   $context - hook context (unused here)
#   $args    - hook arguments (unused here)
#
# A leftover debugging call that ran a hard-coded search ("murakami") after
# every run and discarded its result has been removed.
sub finalize {
    my($self, $context, $args) = @_;
    $self->{indexer}->finish;
}
73
# searcher.search handler: run a query against the index and return the hits
# wrapped in a feed.
#
#   $plugin - the plugin instance; uses conf->{invindex} and {analyzer}
#   $ctx    - hook context (unused here)
#   $args   - hook arguments; $args->{query} is the query string
#
# Returns a Plagger::Feed of type 'search:KinoSearch' holding one
# Plagger::Entry per hit.
sub search {
    my ($plugin, $ctx, $args) = @_;

    my $query = $args->{query};

    my $searcher = KinoSearch::Searcher->new(
        invindex => $plugin->conf->{invindex},
        analyzer => $plugin->{analyzer},
    );

    my $result_feed = Plagger::Feed->new;
    $result_feed->type('search:KinoSearch');
    $result_feed->title("Search: $query");

    # Stored fields copied from each hit onto the entry accessor of the same name.
    # NOTE(review): 'date' is restored as the stored string, not a date object --
    # confirm the entry's date accessor accepts that.
    my @fields = qw( link title body date author );

    my $hits = $searcher->search( query => $query );
    while ( my $hit = $hits->fetch_hit_hashref ) {
        my $found = Plagger::Entry->new;

        foreach my $field (@fields) {
            my $value = $hit->{$field};
            next unless defined $value;
            $found->$field($value);
        }

        $result_feed->add_entry($found);
    }

    return $result_feed;
}
98
99 1;
100
101 __END__
102
103 =head1 NAME
104
105 Plagger::Plugin::Search::KinoSearch - Index entries using KinoSearch
106
107 =head1 SYNOPSIS
108
109   - module: Search::KinoSearch
110     config:
111       invindex: /path/to/invindex
112
113 =head1 DESCRIPTION
114
115 This plugin indexes feed entries into a KinoSearch inverted index.
116 KinoSearch is a loose port of Lucene to Perl/C.
117
118 =head1 AUTHOR
119
120 Tatsuhiko Miyagawa
121
122 =head1 SEE ALSO
123
124 L<Plagger>, L<KinoSearch>
125
126 =cut
Note: See TracBrowser for help on using the browser.