root/trunk/plagger/lib/Plagger/Plugin/CustomFeed/2chSearch.pm

Revision 962 (checked in by miyagawa, 14 years ago)

2chSearh: Fix error handling

Line 
1 package Plagger::Plugin::CustomFeed::2chSearch;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use Encode;
6 use HTML::Entities;
7 use Plagger::UserAgent;
8 use Plagger::Util qw( decode_content );
9
# Plugin entry point: subscribes this plugin to the 'customfeed.handle'
# hook so that handle() is invoked for candidate custom feeds.
#
#   $self    - the plugin instance
#   $context - the object whose register_hook() method receives the hook table
sub register {
    my ($self, $context) = @_;

    my %hooks = ('customfeed.handle' => \&handle);

    $context->register_hook($self, %hooks);
}
17
# Hook callback for 'customfeed.handle'.
#
# Claims the feed only when its URL is a find.2ch.net full-text (TYPE=BODY)
# search; in that case the feed is aggregated and 1 is returned. For every
# other URL nothing is done and an empty return is given.
#
#   $self    - the plugin instance
#   $context - passed through unchanged to aggregate()
#   $args    - hook arguments; $args->{feed} must respond to ->url
sub handle {
    my ($self, $context, $args) = @_;

    my $feed_url = $args->{feed}->url;
    return unless $feed_url =~ m!^http://find\.2ch\.net/index\.php\?.*TYPE=BODY!;

    $self->aggregate($context, $args);
    return 1;
}
28
# Fetch the 2ch search result page behind the feed URL, scrape each hit out
# of the HTML and add it to the feed as a Plagger::Entry.
#
#   $self    - the plugin instance
#   $context - used for logging and to publish the finished feed
#   $args    - hook arguments; $args->{feed} is the feed being filled in
#
# Returns nothing meaningful. When the fetch fails an error is logged and
# the feed is not added to the update list.
sub aggregate {
    my($self, $context, $args) = @_;

    my $url = $args->{feed}->url;
    $context->log(info => "GET $url");

    my $agent = Plagger::UserAgent->new;
    # NoNetwork is given in seconds (one hour); it is what keeps the plugin
    # from repeating the (paid) search too often.
    my $res = $agent->fetch($url, $self, { NoNetwork => 60 * 60 });

    if (!$res->status && $res->is_error) {
        # status is false on this path, so logging it would say nothing;
        # report the HTTP status code of the failed request instead.
        $context->log(error => "GET $url failed: " . $res->http_status);
        return;
    }

    my $content = decode_content($res);

    # The search string travels in the STR parameter and is decoded as EUC-JP.
    my %query = URI->new($url)->query_form;
    my $query = decode("euc-jp", $query{STR});

    my $feed = $args->{feed};
    $feed->title( decode("utf-8", "2ch 検索: ") . $query );
    $feed->link($url);

    # One search hit. Captures: link, title, count, body, date.
    # The here-doc's trailing newline is part of the pattern.
    my $re = decode('utf-8', <<'RE');
<dt><a href="(.*?)"><b>(.*?)</b></a> \((\d+)\) - <font size=-1>.*?</font> - <font size=-1><a href=.*?</a>@.*?</font></dt><dd>(.*?)<br><font color=\#228822>.*?鯖 / 最新:(\d{4}/\d\d/\d\d \d\d:\d\d)</font> - .*?</dd>
RE

    # Compile once instead of handing the raw string to every loop iteration.
    my $hit_re = qr/$re/s;

    # Normalise line endings so the newline inside the pattern can match.
    $content =~ s/\r\n/\n/g;

    my @keys = qw( link title count body date );
    my $date_format = "%Y/%m/%d %H:%M";

    while ($content =~ /$hit_re/g) {
        my $data;
        @{$data}{@keys} = ($1, $2, $3, $4, $5);

        $data->{date} = Plagger::Date->strptime($date_format, $data->{date});
        $data->{date}->set_time_zone('Asia/Tokyo'); # set floating datetime
        $data->{date}->set_time_zone(Plagger->context->conf->{timezone} || 'local');

        # May overwrite $data->{link} and $data->{date} with those of the
        # individual post that actually contains the query.
        $self->find_entry($data, $agent, $query);

        my $entry = Plagger::Entry->new;
        $entry->title($data->{title});
        $entry->link( URI->new_abs($data->{link}, $url) );
        $entry->date($data->{date});
        $entry->body( munge_body($data->{body}) );

        $feed->add_entry($entry);
    }

    $context->update->add($feed);
}
83
# mess with 2ch dat to find the actual entry, Ugggh
#
# The search result links to a range of posts (e.g. .../1-100). This fetches
# the thread's raw dat file and looks, newest first, for the post within that
# range whose body contains the query. When one is found, $data->{link} is
# pointed at that post and $data->{date} is replaced by the post's own
# timestamp (if it can be parsed).
#
#   $self  - the plugin instance (handed to the fetcher)
#   $data  - hash ref for one search hit; modified in place
#   $agent - user agent object providing ->fetch
#   $query - the decoded search string, matched literally, case-insensitively
#
# Always returns nothing; $data is left untouched when no post is found.
sub find_entry {
    my($self, $data, $agent, $query) = @_;

    # An empty pattern makes Perl reuse the last successful regex, so there
    # is nothing sensible to search for without a query.
    return unless defined $query && length $query;

    # http://pc7.2ch.net/test/read.cgi/mac/1149563958/1-100
    # => http://pc7.2ch.net/mac/dat/1149563958.dat
    # Bail out when the link is not of that form rather than fetching a
    # bogus URL built from undefined pieces.
    my($server, $board, $thread, $from, $to) =
        $data->{link} =~ m!^http://(\w+)\.2ch\.net/test/read\.cgi/([^/]+)/(\d+)/(\d+)-(\d+)!
            or return;
    my $dat = "http://$server.2ch.net/$board/dat/$thread.dat";

    Plagger->context->log(debug => "GET $dat to find true entry link");
    my $res = $agent->fetch($dat, $self);

    if (!$res->status && $res->is_error) {
        Plagger->context->log(error => "GET $dat failed: " . $res->http_status);
        return;
    }

    my $content = decode('shift_jis', $res->content);
    my @lines = split /\r?\n/, $content;

    # if it links to 101-200, search from 200 to 101 to find the newest one
    for my $id ( reverse ($from .. $to) ) {
        next if $id < 1;    # index -1 would wrap around to the last line
        my $line = $lines[$id-1] or next;

        # Fields are separated by "<>"; index 2 is read as the date and
        # index 3 as the post body.
        my @data = split /<>/, $line;

        # \Q..\E: the query is user text, not a regex ("C++" must not blow up).
        next unless defined $data[3] && $data[3] =~ /\Q$query\E/i;

        Plagger->context->log(info => "found entry on $id");
        # xxx I could update other metadata, but leave it for EntryFullText ...
        $data->{link} = "http://$server.2ch.net/test/read.cgi/$board/$thread/$id";

        if (defined $data[2] && $data[2] =~ m!^(\d{4}/\d\d/\d\d)\(.*?\) (\d\d:\d\d:\d\d)!) {
            my $date = Plagger::Date->strptime("%Y/%m/%d %H:%M:%S", "$1 $2");

            # Keep the date from the search page if this one cannot be parsed.
            if (defined $date) {
                $date->set_time_zone('Asia/Tokyo'); # set floating datetime
                $date->set_time_zone(Plagger->context->conf->{timezone} || 'local');
                $data->{date} = $date;
            }
        }
        return;
    }

    return;
}
123
# Clean up the body snippet of a search hit: unwrap the <b id=eN>...</b>
# tags (keeping their content), then hand the text to decode_entities and
# return its result.
sub munge_body {
    my ($html) = @_;

    $html =~ s{<b id=e\d+>(.*?)</b>}{$1}g;

    return decode_entities($html);
}
129
130 1;
131
132 __END__
133
134 =head1 NAME
135
136 Plagger::Plugin::CustomFeed::2chSearch - Custom feed for 2ch Search with Moritapo
137
138 =head1 SYNOPSIS
139
140   global:
141     user_agent:
142       cookies: /path/to/cookies.txt
143
144   plugins:
145     - module: Subscription::Config
146       config:
147         feed:
148           - http://find.2ch.net/index.php?BBS=2ch&TYPE=BODY&STR=Plagger&COUNT=10
149     - module: CustomFeed::2chSearch
150
151 =head1 DESCRIPTION
152
153 This plugin creates a custom feed off of 2ch search
154 L<http://find.2ch.net/>. Since 2ch search requires Moritapo to search
155 by fulltext, this plugin also requires a valid login cookie set to
156 global I<user_agent> config.
157
158 =head1 FREQUENCY FOR SEARCHES
159
160 By default, this plugin doesn't search more than once an hour, to
161 save your money (Moritapo). If you want to reduce the search frequency
162 further (like once a day), consider using
163 L<Plagger::Rule::DateTimeCron> to trigger Subscription::Config for it.
164
165 =head1 AUTHOR
166
167 Tatsuhiko Miyagawa
168
169 =head1 SEE ALSO
170
171 L<Plagger>, L<http://find.2ch.net/>
172
173 =cut
Note: See TracBrowser for help on using the browser.