1
|
#!/usr/bin/perl -T
|
2
|
|
3
|
use v5.10;
|
4
|
use strict;
|
5
|
use warnings;
|
6
|
|
7
|
use threads;
|
8
|
use threads::shared;
|
9
|
use Thread::Queue ();
|
10
|
|
11
|
BEGIN {
|
12
|
# JSON::XS is not thread-safe...
|
13
|
$ENV{PERL_JSON_BACKEND} = "Cpanel::JSON::XS";
|
14
|
}
|
15
|
|
16
|
use Config::Tiny ();
|
17
|
use Crypt::URandom qw/urandom/;
|
18
|
use DBI qw/:sql_types/;
|
19
|
use Encode ();
|
20
|
use File::Basename qw/basename dirname/;
|
21
|
use JSON ();
|
22
|
use LWP::MediaTypes qw/guess_media_type/;
|
23
|
use LWP::UserAgent ();
|
24
|
use POSIX ();
|
25
|
use URI::Escape qw/uri_escape_utf8/;
|
26
|
use Time::HiRes qw/usleep/;
|
27
|
|
28
|
my $SCRIPT_FILENAME = $0 =~ /\A(\p{Graph}+)\z/ ? $1 : die;
|
29
|
my $CONF = Config::Tiny::->read("$SCRIPT_FILENAME.conf") or die Config::Tiny::->errstr()."\n";
|
30
|
my ($DBH, $ASKBOT, $DISCOURSE, $WORKER, $INDENT);
|
31
|
my ($DISCOURSE_TIME, $DISCOURSE_COUNT, $ASKBOT_TIME, $ASKBOT_COUNT);
|
32
|
my $NUM_WORKERS = 8;
|
33
|
|
34
|
#######################################################################
|
35
|
|
36
|
# Logging
|
37
|
open my $LOG, ">", "$SCRIPT_FILENAME.log" or die "open($SCRIPT_FILENAME.log): $!";
|
38
|
sub logmsg($@) {
|
39
|
my $level = shift;
|
40
|
my @msg = @_;
|
41
|
$_ = Encode::encode("UTF-8", $_) foreach @msg;
|
42
|
my $level2 = $level eq "SOFTFAIL" ? "ERROR" : $level;
|
43
|
unshift @msg, $level2, " ";
|
44
|
unshift @msg, sprintf("[%0*d] ", length($NUM_WORKERS), $WORKER) if defined $WORKER;
|
45
|
push @msg, "\n";
|
46
|
$LOG->printflush( POSIX::strftime("%b %d %H:%M:%S ", localtime), @msg );
|
47
|
if ($level eq "WARN" or $level eq "NOTICE" or $level eq "SOFTFAIL") {
|
48
|
print STDERR @msg;
|
49
|
} elsif ($level eq "PANIC" or $level eq "ERROR") {
|
50
|
die @msg;
|
51
|
}
|
52
|
}
|
53
|
sub fail(@) {
|
54
|
return logmsg(ERROR => @_);
|
55
|
}
|
56
|
sub panic(@) {
|
57
|
my @msg = @_;
|
58
|
my @loc = caller;
|
59
|
push @msg, " " if @msg;
|
60
|
push @msg, "at line $loc[2] in $loc[1]";
|
61
|
return logmsg(PANIC => @msg);
|
62
|
}
|
63
|
|
64
|
# useragent_init()
|
65
|
# (Re-)init user agent, incl. connection cache.
|
66
|
sub useragent_init() {
|
67
|
$ASKBOT = LWP::UserAgent::->new(
|
68
|
keep_alive => 1,
|
69
|
ssl_opts => { verify_hostname => 1, SSL_hostname => $CONF->{askbot}->{host} },
|
70
|
default_headers => HTTP::Headers::->new()
|
71
|
);
|
72
|
$DISCOURSE = LWP::UserAgent::->new(
|
73
|
keep_alive => 1,
|
74
|
ssl_opts => { verify_hostname => 1 },
|
75
|
default_headers => HTTP::Headers::->new(
|
76
|
"Accept" => "application/json",
|
77
|
"Api-Key" => $CONF->{discourse}->{apikey}
|
78
|
)
|
79
|
);
|
80
|
($DISCOURSE_TIME, $DISCOURSE_COUNT, $ASKBOT_TIME, $ASKBOT_COUNT) = (0,0,0,0);
|
81
|
}
|
82
|
|
83
|
# decode_json_response($AGENT => $RESPONSE)
|
84
|
# Decode an application/json HTTP response. If a 5XY HTTP response
|
85
|
# was received, try again to send the request with a 1min timeout.
|
86
|
sub decode_json_response($$;$);
|
87
|
sub decode_json_response($$;$) {
|
88
|
my ($ua, $resp, $timeout) = @_;
|
89
|
|
90
|
if ($resp->is_success) {
|
91
|
my $data = $resp->decoded_content();
|
92
|
return JSON::->new->utf8->decode($data);
|
93
|
}
|
94
|
|
95
|
my $code = $resp->code();
|
96
|
my $req = $resp->request();
|
97
|
|
98
|
# random delay between 0.75 and 1.25 seconds to help solving race conditions
|
99
|
# between threads
|
100
|
$timeout = defined $timeout ? ($timeout * 2) : (int(rand(500_000)) + 750_000);
|
101
|
if ($code >= 500 and $code <= 599 and $timeout <= 320_000_000) {
|
102
|
# upon 5XY response codes, retry up to 5 minutes
|
103
|
logmsg(INFO => "Received $code from <", $req->uri(), ">; ",
|
104
|
"will retry in ", int($timeout/1000), "ms");
|
105
|
usleep($timeout);
|
106
|
return decode_json_response($ua => $ua->request($req), $timeout);
|
107
|
}
|
108
|
|
109
|
my @error = ($resp->status_line(), " ", $req->method(), " <", $req->uri(), ">");
|
110
|
my $content_type = $resp->content_type();
|
111
|
if ($content_type eq "application/json") {
|
112
|
my $content = $resp->decoded_content();
|
113
|
chomp $content;
|
114
|
push @error, " ", $content;
|
115
|
}
|
116
|
fail(@error);
|
117
|
}
|
118
|
|
119
|
# askbot_get($URL)
|
120
|
# Send a request to the AskBot API and decode the JSON response.
|
121
|
sub askbot_get($) {
|
122
|
my $url = $CONF->{askbot}->{apiurl} . shift;
|
123
|
my $t0 = Time::HiRes::gettimeofday();
|
124
|
my $resp = $ASKBOT->get($url, Host => $CONF->{askbot}->{host}, Accept => "application/json");
|
125
|
$ASKBOT_TIME += Time::HiRes::gettimeofday() - $t0;
|
126
|
$ASKBOT_COUNT++;
|
127
|
return decode_json_response($ASKBOT => $resp);
|
128
|
}
|
129
|
|
130
|
# discourse_req($USERNAME, $METHOD, $PATH, [$BODY])
|
131
|
# Send a request to the Discourse API and decode the JSON response.
|
132
|
sub discourse_req($$$;$) {
|
133
|
my ($username, $method, $path, $body) = @_;
|
134
|
my $req = HTTP::Request::->new( $method => $CONF->{discourse}->{apiurl} . $path );
|
135
|
$req->header( Host => $CONF->{discourse}->{host} );
|
136
|
# the API doesn't mention case for usernames, but seems we need it
|
137
|
# for some Unicode usernames...
|
138
|
$req->header( "Api-Username" => Encode::encode("UTF-8", lc($username)) );
|
139
|
if (defined $body) {
|
140
|
$req->content_type("application/json");
|
141
|
$req->content(JSON::->new->utf8->encode($body));
|
142
|
}
|
143
|
my $t0 = Time::HiRes::gettimeofday();
|
144
|
my $resp = $DISCOURSE->request($req);
|
145
|
$DISCOURSE_TIME += Time::HiRes::gettimeofday() - $t0;
|
146
|
$DISCOURSE_COUNT++;
|
147
|
return decode_json_response($DISCOURSE => $resp);
|
148
|
}
|
149
|
|
150
|
# upload($USERNAME, $PATH, type => $TYPE, ...)
|
151
|
# Download $PATH from AskBot and re-upload to Discourse; returns the
|
152
|
# upload response from the Discourse API, cf.
|
153
|
# https://docs.discourse.org/#tag/Uploads/paths/~1uploads.json/post
|
154
|
#
|
155
|
# If the download yields 404 then returns undef instead.
|
156
|
sub upload($$%) {
|
157
|
my $username = shift;
|
158
|
my $path = shift;
|
159
|
my %attrs = @_;
|
160
|
|
161
|
my $filename = delete $attrs{filename} // basename($path);
|
162
|
my $content_type = guess_media_type($filename);
|
163
|
|
164
|
logmsg(INFO => " " x $INDENT, "upload $path ($filename)");
|
165
|
panic($path) unless $path =~ /\A\//;
|
166
|
|
167
|
my $uri = $CONF->{askbot}->{apiurl} . $path;
|
168
|
my $t0 = Time::HiRes::gettimeofday();
|
169
|
my $resp = $ASKBOT->get($uri, Host => $CONF->{askbot}->{host}) or panic();
|
170
|
$ASKBOT_TIME += Time::HiRes::gettimeofday() - $t0;
|
171
|
$ASKBOT_COUNT++;
|
172
|
unless ($resp->is_success) {
|
173
|
my @error = ($resp->status_line(), " <", $uri, ">");
|
174
|
if ($resp->code() == 404) {
|
175
|
logmsg(WARN => @error);
|
176
|
return;
|
177
|
} else {
|
178
|
panic(@error);
|
179
|
}
|
180
|
}
|
181
|
|
182
|
# RFC 1867, using HTTP::Request::Common's form_ref
|
183
|
my @form_data = ( undef, Encode::encode("UTF-8", $filename) );
|
184
|
push @form_data, "Content-type" => $content_type
|
185
|
, Content => $resp->decoded_content();
|
186
|
|
187
|
my $t1 = Time::HiRes::gettimeofday();
|
188
|
$resp = $DISCOURSE->post( $CONF->{discourse}->{apiurl} . "/uploads.json",
|
189
|
"Api-Username" => Encode::encode("UTF-8", lc($username)),
|
190
|
Host => $CONF->{discourse}->{host},
|
191
|
Content_Type => "form-data",
|
192
|
Content => [ %attrs, synchronous => "true", "files[]" => \@form_data ]
|
193
|
);
|
194
|
$DISCOURSE_TIME += Time::HiRes::gettimeofday() - $t1;
|
195
|
$DISCOURSE_COUNT++;
|
196
|
|
197
|
return decode_json_response($DISCOURSE => $resp);
|
198
|
}
|
199
|
|
200
|
# username($ID)
|
201
|
# Returns the Discourse username associated with the AskBot user ID.
|
202
|
# The Discourse user is created if it doesn't already exist.
|
203
|
my %ASKBOT_PROFILES; # pre-existing AskBot profiles, incl. SSO linking information
|
204
|
my %ASKBOT_DUPLICATES;
|
205
|
my %DISCOURSE_PROFILES; # map AskBot user ID -> Discourse profile
|
206
|
sub username($);
|
207
|
sub username($) {
|
208
|
my $orig_id = shift;
|
209
|
my $id = $ASKBOT_DUPLICATES{$orig_id} // $orig_id;
|
210
|
|
211
|
state $sth_user_lookup = $DBH->prepare(q{
|
212
|
SELECT username FROM users WHERE id = ?
|
213
|
});
|
214
|
|
215
|
my $profile = $ASKBOT_PROFILES{$id} // fail("Can't find user #$id in users.json");
|
216
|
my $username = $profile->{discourse_username} // panic($id);
|
217
|
|
218
|
my ($u, $user_id);
|
219
|
{
|
220
|
my $p = $DISCOURSE_PROFILES{$id} // panic($id);
|
221
|
lock(%$p); # only need to lock that user's profile
|
222
|
return $username if exists $p->{user_id}; # already created
|
223
|
|
224
|
# user API is not localized, let's stick to english
|
225
|
$u = askbot_get("/en/api/v1/users/$id/");
|
226
|
|
227
|
my $msg = $profile->{username};
|
228
|
$msg .= " -> $username" if $username ne $profile->{username};
|
229
|
$msg .= ", duplicate #$orig_id" if $id != $orig_id;
|
230
|
logmsg(INFO => " " x $INDENT, "user #$id ($msg)");
|
231
|
|
232
|
my $email = $profile->{email} // panic();
|
233
|
if ($email =~ /\@(?:mailinator|trashmail)\.[a-z]+$/i
|
234
|
or $email !~ /\@[0-9a-z\-]+(?:\.[0-9a-z\-]+)+$/i) {
|
235
|
logmsg(WARN => "user #$id ($username) has invalid email address <$email>");
|
236
|
# create the account for content attribution but lock them out
|
237
|
$email = "noreply+$username\@documentfoundation.org";
|
238
|
}
|
239
|
my $r = discourse_req("system", POST => "/users.json", {
|
240
|
username => $username
|
241
|
, password => unpack("H*", urandom(32)) # XXX required even with SSO
|
242
|
, name => $profile->{name} // ""
|
243
|
, email => $email
|
244
|
, active => JSON::true
|
245
|
, approved => JSON::true
|
246
|
});
|
247
|
panic(JSON::->new->utf8->encode($r)) unless defined $r->{success} and $r->{success} == JSON::true;
|
248
|
$user_id = $r->{user_id} // panic($id);
|
249
|
|
250
|
$sth_user_lookup->bind_param(1, $user_id, SQL_INTEGER);
|
251
|
$sth_user_lookup->execute();
|
252
|
my ($username2) = $sth_user_lookup->fetchrow();
|
253
|
$sth_user_lookup->finish();
|
254
|
unless (defined $username2 and $username2 eq $username) {
|
255
|
# there is a race condition causing PostgreSQL to sometimes abort the transaction:
|
256
|
# Discourse reports the user as created but any attempt to use it as Api-Username fails
|
257
|
logmsg(SOFTFAIL => "Couldn't create user #$orig_id ($profile->{username}), retrying...");
|
258
|
return username($orig_id);
|
259
|
}
|
260
|
|
261
|
# assign a Trust Level to the newly created account, depending on the users's activity on
|
262
|
# AskBot, in order to bootstrap the site
|
263
|
my $tl = ($u->{reputation} >= 1000 and time - $u->{last_seen_at} <= 31557600) ? 4
|
264
|
: ($u->{reputation} >= 250 and $u->{answers} >= 100 and time - $u->{last_seen_at} <= 31557600) ? 3
|
265
|
: ($u->{answers} + $u->{comments} >= 100) ? 2
|
266
|
: ($u->{questions} + $u->{answers} + $u->{comments} >= 5) ? 1
|
267
|
: 0;
|
268
|
# leave non-returning (less than 10 between creation and last seen date) users at TL0
|
269
|
$tl = 0 if $u->{last_seen_at} - $u->{joined_at} < 864000;
|
270
|
discourse_req("system", PUT => "/admin/users/$user_id/trust_level", { level => $tl })
|
271
|
unless $tl == 0;
|
272
|
|
273
|
$p->{user_id} = $user_id;
|
274
|
$p->{trust_level} = $tl;
|
275
|
} # account was created and TL assigned, we can release the lock on $DISCOURSE_PROFILES{$id}
|
276
|
$INDENT++;
|
277
|
|
278
|
# preserve avatar
|
279
|
my %avatar_upload;
|
280
|
my $username_escaped = uri_escape_utf8(lc($username));
|
281
|
if ($u->{avatar} =~ m#^(?:(?:https?:)?//www\.gravatar\.com/avatar)?/#) {
|
282
|
my $path = "/user_avatar/$username_escaped/refresh_gravatar.json";
|
283
|
my $r = discourse_req($username, POST => $path, { username => $username });
|
284
|
$avatar_upload{upload_id} = $r->{gravatar_upload_id};
|
285
|
$avatar_upload{type} = "gravatar";
|
286
|
} elsif ($u->{avatar} =~ s,^(?:(?:https?:)?//ask\.libreoffice\.org)?(/[^/?#][^?#]*).*,$1,i) {
|
287
|
unless ($u->{avatar} eq "/m/default/media/images/nophoto.png") {
|
288
|
$u->{avatar} =~ s#^(/upfiles/avatars/[^/]+)/resized/[0-9]+/([^/]+)$#$1/$2#;
|
289
|
my $r = eval { upload($username, $u->{avatar}, type => "avatar", user_id => $user_id) };
|
290
|
if ($@) {
|
291
|
print STDERR $@; # already logged
|
292
|
} else {
|
293
|
$avatar_upload{upload_id} = $r->{id};
|
294
|
$avatar_upload{type} = "uploaded";
|
295
|
}
|
296
|
}
|
297
|
} else {
|
298
|
logmsg(WARN => "$username: unknown avatar URL $u->{avatar}");
|
299
|
}
|
300
|
|
301
|
if (%avatar_upload) {
|
302
|
my $errstr = "$username: Couldn't set avatar ($avatar_upload{type})";
|
303
|
if (defined $avatar_upload{upload_id}) {
|
304
|
my $path = "/u/$username_escaped/preferences/avatar/pick.json";
|
305
|
my $r = eval { discourse_req($username, PUT => $path, \%avatar_upload) };
|
306
|
unless (!$@ and defined $r->{success} and $r->{success} eq "OK") {
|
307
|
print STDERR $@ if $@; # already logged
|
308
|
$errstr .= ": " . $r->{message} if defined $r and defined $r->{message};
|
309
|
logmsg(SOFTFAIL => $errstr);
|
310
|
}
|
311
|
} else {
|
312
|
logmsg(WARN => $errstr);
|
313
|
}
|
314
|
}
|
315
|
|
316
|
# database surgery: set created and last seen dates from AskBot
|
317
|
state $sth_user_last = $DBH->prepare(q{
|
318
|
UPDATE users SET created_at = to_timestamp(?), last_seen_at = to_timestamp(?)
|
319
|
WHERE id = ?
|
320
|
});
|
321
|
$sth_user_last->bind_param(1, $u->{joined_at}, SQL_INTEGER);
|
322
|
$sth_user_last->bind_param(2, $u->{last_seen_at}, SQL_INTEGER);
|
323
|
$sth_user_last->bind_param(3, $user_id, SQL_INTEGER);
|
324
|
my $r = $sth_user_last->execute();
|
325
|
panic($r) unless $r == 1;
|
326
|
|
327
|
$INDENT--;
|
328
|
return $username;
|
329
|
}
|
330
|
|
331
|
# mangle_post_upload($USERNAME, $TEXT => $PATH, $INLINE)
|
332
|
# Return a Markdown string with given $TEXT and AskBot source link
|
333
|
# (attachment or screeshot). The source is downloaded and re-uploaded
|
334
|
# to Discourse.
|
335
|
sub mangle_post_upload($$$$) {
|
336
|
my ($username, $text, $path, $inline) = @_;
|
337
|
my $filename;
|
338
|
unless ($inline) {
|
339
|
# fix ugly C:\fakepath\something cf. https://www.mediawiki.org/wiki/Special:Code/MediaWiki/83225
|
340
|
$text =~ s/\AC:\\fakepath\\([^\\]+)\z/$1/;
|
341
|
my $ext = $path =~ /[^.\/\\]\.([a-zA-Z](?:-?[a-zA-Z0-9])*)\z/ ? $1 : undef;
|
342
|
$filename = !(defined $ext and $text =~ /[^.\/\\]\.\Q$ext\E\z/i) ? basename($path)
|
343
|
: $text =~ /\A[A-Z]:\\.*\\(.+)\z/ ? basename($1)
|
344
|
: basename($text);
|
345
|
$filename =~ tr[\t\n\f\r ][ ]s;
|
346
|
}
|
347
|
my $u = eval { upload($username, $path, type => "composer", filename => $filename) };
|
348
|
unless (!$@ and defined $u) {
|
349
|
# 404 or fatal error, can't do better
|
350
|
print STDERR $@ if $@; # already logged
|
351
|
logmsg(SOFTFAIL => "Couldn't re-upload $path");
|
352
|
return;
|
353
|
}
|
354
|
return $inline ? "![$text]($u->{short_url})" : "[$text|attachment]($u->{short_url})";
|
355
|
}
|
356
|
|
357
|
# mangle_post_internal_link($ID, $LINK, $FINAL, $REF)
|
358
|
# Return appropriate replacement for AskBot internal $LINK in post $ID.
|
359
|
# The 3rd argument is a boolean indicating whether the import is
|
360
|
# ongoing or if we're at the final phase.
|
361
|
# The 4rd argument is a reference to a variable that is is set to 1 if
|
362
|
# $FINAL is false and the mangling couldn't proceeed because the
|
363
|
# AskBot question hasn't been processed yet, or if $FINAL is true and
|
364
|
# a suitable substitute was found.
|
365
|
my %URLMAP :shared; # map AskBot post ID -> Discourse URL
|
366
|
my %CATEGORY_IDs; # map language code -> Discourse category
|
367
|
sub mangle_post_internal_link($$$$) {
|
368
|
my ($post_id, $link, $final, $ref_rv) = @_;
|
369
|
|
370
|
my $uri = $link;
|
371
|
my $hash = $uri =~ s/\#(.*)// ? $1 : "";
|
372
|
my $query = $uri =~ s/\?(.*)// ? $1 : "";
|
373
|
my $lang = $uri =~ m,^ /(?:question|user)s? (?:/|$),x ? "en"
|
374
|
: $uri =~ s,^/([\p{PosixAlnum}\-_]+)(/(?:question|user)s? (?:/|$)),$2,x ? $1
|
375
|
: $uri =~ s,^/([\p{PosixAlnum}\-_]+) /?$,/questions/,x ? $1
|
376
|
: undef;
|
377
|
|
378
|
my $newlink;
|
379
|
unless (defined $lang) {
|
380
|
return unless $uri eq "" or $uri eq "/"; # can't do better
|
381
|
return if $CONF->{discourse}->{baseurl} eq "https://ask.libreoffice.org" and $query eq "" and $hash eq "";
|
382
|
$newlink = "/"; # site root (ignore query and hash)
|
383
|
} elsif ($uri =~ s#^/users?/([0-9]+)(?:/|$)##) {
|
384
|
# don't create accounts during the final phase
|
385
|
return if $final;
|
386
|
my $username = eval { username($1) };
|
387
|
if ($@) {
|
388
|
print STDERR $@; # already logged
|
389
|
logmsg(SOFTFAIL => "Couldn't create user #$1");
|
390
|
return;
|
391
|
}
|
392
|
$newlink = "/u/" . uri_escape_utf8(lc($username)); # profile URL
|
393
|
} elsif ($uri =~ s#^/questions?/([0-9]+)(?:/|$)##) {
|
394
|
my $id = $1;
|
395
|
my $old_post_id =
|
396
|
$hash =~ /^post-id-([0-9]+)$/ ? $1
|
397
|
: $query =~ /(?:^|&)(?:answer|comment)=([0-9]+)(?:&|$)/ ? $1
|
398
|
: $id;
|
399
|
unless (defined ($newlink = $URLMAP{$old_post_id})) {
|
400
|
if (!$final) {
|
401
|
$$ref_rv = 1; # needs post mangling
|
402
|
} else {
|
403
|
logmsg(WARN => "#$post_id: Can't mangle <$link> (missing #$old_post_id)");
|
404
|
}
|
405
|
return;
|
406
|
}
|
407
|
} elsif ($uri =~ m#^/questions?/?$#) {
|
408
|
my $c = $CATEGORY_IDs{$lang} // return;
|
409
|
$newlink = "/c/$c->{slug}/$c->{id}";
|
410
|
} elsif ($uri =~ m#^/questions?/ask/?$#) {
|
411
|
my $c = $CATEGORY_IDs{$lang} // return;
|
412
|
$newlink = "/new-topic?category=$c->{slug}";
|
413
|
} elsif ($uri =~ m#^/users?/?$#) {
|
414
|
$newlink = "/u";
|
415
|
} else {
|
416
|
return;
|
417
|
}
|
418
|
$$ref_rv = 1 if $final; # modified
|
419
|
logmsg(DEBUG => "#$post_id: Mangling <$link> -> <$newlink>");
|
420
|
return $CONF->{discourse}->{baseurl} . $newlink;
|
421
|
}
|
422
|
|
423
|
# mangle_post_internal_links($ID, $REF_TEXT, [$FINAL])
|
424
|
# Mangle all internal question/answer/comment/user links. The 3rd
|
425
|
# argument is a boolean indicating whether the import is ongoing or if
|
426
|
# we're at the final phase.
|
427
|
# Return a boolean, which when $FINAL is false indicates whether the
|
428
|
# post needs post-mangling during the final phase, and when $FINAL is
|
429
|
# true indicates whether the input text was mangled.
|
430
|
sub mangle_post_internal_links($$;$) {
|
431
|
my ($id, $text, $final) = @_;
|
432
|
my $rv = 0;
|
433
|
|
434
|
$$text =~ s#\b (?i:(?:https?:)//ask\.libreoffice\.org)
|
435
|
( / [^\P{Graph}!"'\(\)<>\[\]\`\{\}]+? )
|
436
|
(?! [^\P{Graph}!"'\(\),\.<>\?\[\]\`\{\}] | [!',\.\?]\p{Graph} )
|
437
|
# mangle_post_internal_link($id, $1, $final => \$rv) // $& #egx;
|
438
|
|
439
|
if ($final or !$rv) {
|
440
|
# note: the regexp is intentionally loose
|
441
|
logmsg(WARN => "Post #$id contains dangling internal link <$_>") foreach
|
442
|
$$text =~ m#\b(?i:(?:https?:)//ask\.libreoffice\.org)/(?![ctu]/)\S+#g;
|
443
|
}
|
444
|
|
445
|
return $rv;
|
446
|
}
|
447
|
|
448
|
# mangle_post($AUTHOR, $POST)
|
449
|
# Mangle $POST->{text} in place:
|
450
|
# * fix attachments and screenshots;
|
451
|
# * fix link to other profiles, "normal" links, bare URLs, and @-addressing
|
452
|
# (the target profile is created if it doesn't exist); and
|
453
|
# * fix links to questions, answers and comments.
|
454
|
# Return a boolean indicating whether the post needs further mangling
|
455
|
# during the final phase.
|
456
|
sub mangle_post($$) {
|
457
|
my ($author, $p) = @_;
|
458
|
|
459
|
# fix attachments and screenshots (an empty link text is allowed for screenshots, and we also
|
460
|
# allow it for atttachment since we append "|attachment")
|
461
|
$p->{text} =~ s#!\[ ([^\[\]\(\)]*) \]\( (?i:(?:https?:)?//ask\.libreoffice\.org)? (/m/[\p{PosixAlnum}\-_]+/media/images/[^\[\]\(\)]+) \)#
|
462
|
mangle_post_upload("system", $1 => $2, 1) // $& #egx; # NOTE: discourse does deduplication
|
463
|
$p->{text} =~ s#(!)?\[ ([^\[\]\(\)]*) \]\( (?i:(?:https?:)?//ask\.libreoffice\.org)? (/upfiles/[^\[\]\(\)]+) \)#
|
464
|
mangle_post_upload($author, $2 => $3, $1) // $& #egx;
|
465
|
# NOTE: mangle_post_upload() can fail if it tries to upload the same document twice at the
|
466
|
# same time (race condition on uniqueness constraint) -- we could retry but we don't care so
|
467
|
# much since sidekiq will automatically fetch https://ask.libreoffice.org/upfiles/12345.png
|
468
|
# locally and update the post accordingly
|
469
|
# NOTE: the handful of bare links https://ask.libreoffice.org/upfiles/foobar.xyz are
|
470
|
# untouched hence will 404
|
471
|
|
472
|
# fix @-addressing; we can't match $p->{text} against /\@\w+/ here because AskBot
|
473
|
# usernames may contain spaces... so we check for <a href="..."/> in the HTML instead
|
474
|
my %user_ids;
|
475
|
$user_ids{$_} = 1 foreach $p->{summary} =~ m#<a\s(?:[^<>]*\s)*?href=['"]
|
476
|
(?i:(?:https?:)?//ask\.libreoffice\.org)? (?:/[\p{PosixAlnum}\-_]+)?
|
477
|
/users?/([0-9]+)[/'"]#gsx;
|
478
|
foreach my $id (keys %user_ids) {
|
479
|
# we don't want duplicate resolution here
|
480
|
my $profile = $ASKBOT_PROFILES{$id};
|
481
|
unless (defined $profile) {
|
482
|
logmsg(WARN => "#$id is not a known AskBot user, can't mangle its \@-addresses");
|
483
|
next;
|
484
|
}
|
485
|
my $old_username = $profile->{username} // panic($id);
|
486
|
my $new_username = eval { username($id) }; # might need to be created
|
487
|
if ($@) {
|
488
|
print STDERR $@; # already logged
|
489
|
logmsg(SOFTFAIL => "Couldn't create user #$id");
|
490
|
next;
|
491
|
}
|
492
|
$p->{text} =~ s/\@\Q$old_username\E (?! [\.\-]?[\p{Mark}_\p{Alnum}] )
|
493
|
/\@$new_username/gx
|
494
|
unless $old_username eq $new_username;
|
495
|
}
|
496
|
|
497
|
# explicitely prefix AskBot base URL so we don't have to do the work twice
|
498
|
$p->{text} =~ s#\[ ([^\[\]\(\)]+) \]\( (/[^\[\]\(\)]*) \)#[$1](https://ask.libreoffice.org$2)#gx;
|
499
|
|
500
|
# replace [URL](URL) links with a bare URL when possible so Discourse can use
|
501
|
# its own text for internal post links
|
502
|
$p->{text} =~ s#\[ ((?i:https?:)?//[^\[\]\(\)]+) \]\( \1 \)
|
503
|
(?! [!',\.\?]* [^\P{Graph}!"'\(\),\.<>\?\[\]\{\}] )#$1#gx;
|
504
|
|
505
|
return mangle_post_internal_links($p->{id} => \$p->{text});
|
506
|
}
|
507
|
|
508
|
# ts2iso8601($TIMESTAMP)
|
509
|
# Convert a timestamp to ISO 8601.
|
510
|
sub ts2iso8601($) {
|
511
|
my $ts = 0 + shift;
|
512
|
return POSIX::strftime("%Y-%m-%dT%H:%M:%SZ", gmtime($ts));
|
513
|
}
|
514
|
|
515
|
# replies(language_code => $LANG, topic_id => $ID, replies => [..], ..)
|
516
|
# Recursively process the given replies under some topic ID. The
|
517
|
# parent post is indicated by the optional reply_to_post_number
|
518
|
# attribute.
|
519
|
# Return the number of replies that couldn't be posted, if any
|
520
|
my @NEEDS_POST_MANGLING :shared;
|
521
|
sub replies(%);
|
522
|
sub replies(%) {
|
523
|
my %attrs = @_;
|
524
|
my $post_type = defined $attrs{reply_to_post_number} ? "comment" : "answer";
|
525
|
my $askbot_url = "/$attrs{language_code}/api/v1/${post_type}s/";
|
526
|
|
527
|
state $sth_set_vote_count = $DBH->prepare(q{
|
528
|
INSERT INTO post_custom_fields
|
529
|
(post_id, name, value, created_at, updated_at)
|
530
|
VALUES (?, 'vote_count', ?, NOW(), NOW())
|
531
|
});
|
532
|
|
533
|
my $num_errors = 0;
|
534
|
my $indent_base = $INDENT;
|
535
|
foreach my $id (sort {$a <=> $b} @{$attrs{replies}}) { # lowest ID first
|
536
|
logmsg(INFO => " " x $indent_base, "$post_type #$id");
|
537
|
$INDENT = $indent_base + 1;
|
538
|
|
539
|
my ($r, $t);
|
540
|
eval {
|
541
|
$r = askbot_get("$askbot_url$id/");
|
542
|
my $author = username($r->{author}->{id});
|
543
|
my $needs_post_mangling = mangle_post($author, $r);
|
544
|
|
545
|
$t = discourse_req($author, POST => "/posts.json", {
|
546
|
topic_id => $attrs{topic_id}
|
547
|
, reply_to_post_number => $attrs{reply_to_post_number}
|
548
|
, raw => $r->{text}
|
549
|
, created_at => ts2iso8601($r->{added_at})
|
550
|
, skip_validations => JSON::true
|
551
|
});
|
552
|
|
553
|
push @NEEDS_POST_MANGLING, $t->{id} if $needs_post_mangling;
|
554
|
};
|
555
|
if ($@) {
|
556
|
print STDERR $@; # already logged
|
557
|
logmsg(SOFTFAIL => "Couldn't post $post_type #$id");
|
558
|
$num_errors++;
|
559
|
next;
|
560
|
}
|
561
|
|
562
|
# no need to lock here, other threads will never store that ID
|
563
|
$URLMAP{$r->{id}} = "/t/$t->{topic_slug}/$t->{topic_id}/$t->{post_number}";
|
564
|
|
565
|
if ($post_type eq "answer") {
|
566
|
# database surgery: set vote count on answers (skip
|
567
|
# questions and comments as Discourse doesn't show these)
|
568
|
if ((my $score = $r->{score}) > 0) {
|
569
|
$sth_set_vote_count->bind_param(1, $t->{id}, SQL_INTEGER);
|
570
|
$sth_set_vote_count->bind_param(2, $score, SQL_INTEGER);
|
571
|
my $r = $sth_set_vote_count->execute();
|
572
|
panic($r) unless $r == 1;
|
573
|
}
|
574
|
if (defined $attrs{accepted_answer_id} and $attrs{accepted_answer_id} == $id) {
|
575
|
# XXX rate-limiting is not configurable, need to comment out
|
576
|
# "limit_accepts" in plugins/discourse-solved/plugin.rb:accept
|
577
|
logmsg(INFO => " " x $INDENT, "accept answer #$id");
|
578
|
$INDENT++;
|
579
|
my $r = eval { discourse_req($attrs{accepted_by}, POST => "/solution/accept",
|
580
|
{ id => $t->{id} }) };
|
581
|
if ($@) {
|
582
|
print STDERR $@; # already logged
|
583
|
logmsg(SOFTFAIL => "Couldn't accept answer #$id");
|
584
|
} elsif (!defined $r->{success} or $r->{success} ne "OK") {
|
585
|
panic(JSON::->new->utf8->encode($r));
|
586
|
}
|
587
|
$INDENT--;
|
588
|
}
|
589
|
}
|
590
|
|
591
|
# nested comments
|
592
|
$num_errors += replies(
|
593
|
language_code => $attrs{language_code}
|
594
|
, topic_id => $t->{topic_id}
|
595
|
, reply_to_post_number => $t->{post_number}
|
596
|
, replies => $r->{comment_ids}
|
597
|
) if @{$r->{comment_ids}};
|
598
|
}
|
599
|
return $num_errors;
|
600
|
}
|
601
|
|
602
|
|
603
|
#######################################################################
|
604
|
# Initialisation
|
605
|
|
606
|
logmsg(NOTICE => "Starting $SCRIPT_FILENAME");
|
607
|
my $TAG_GROUP_LIBO_COMPONENT = "LibreOffice component";
|
608
|
|
609
|
# Categories
|
610
|
useragent_init();
|
611
|
{
|
612
|
my $path = dirname($SCRIPT_FILENAME) . "/categories.json";
|
613
|
open my $fh, "<", $path or panic("open($path): $!");
|
614
|
my $categories = JSON::->new->utf8->decode( do { local $/ = undef; <$fh> } );
|
615
|
close $fh or panic("close: $!");
|
616
|
|
617
|
my %slugs;
|
618
|
my $r = discourse_req("system", GET => "/categories.json");
|
619
|
$slugs{$_->{slug}} = $_->{id} foreach @{$r->{category_list}->{categories}};
|
620
|
|
621
|
foreach my $c (@$categories) {
|
622
|
my $l = $c->{language_code} // panic();
|
623
|
$CATEGORY_IDs{$l} = { slug => $c->{slug} };
|
624
|
if (defined (my $id = $slugs{$c->{slug}})) {
|
625
|
# the category already exists
|
626
|
$CATEGORY_IDs{$l}->{id} = $id;
|
627
|
} else {
|
628
|
logmsg(NOTICE => "Creating category $c->{slug}");
|
629
|
my $r = discourse_req("system", POST => "/categories.json", {
|
630
|
name => $c->{name},
|
631
|
slug => $c->{slug},
|
632
|
custom_fields => { qa_enabled => JSON::true }
|
633
|
});
|
634
|
$CATEGORY_IDs{$l}->{id} = $r->{category}->{id} // panic();
|
635
|
}
|
636
|
}
|
637
|
|
638
|
$r = discourse_req("system", POST => "/tag_groups.json", {
|
639
|
tag_group => {
|
640
|
name => $TAG_GROUP_LIBO_COMPONENT,
|
641
|
tag_names => [ qw/common writer calc impress base draw math meta/ ],
|
642
|
permissions => { everyone => 1 },
|
643
|
one_per_topic => JSON::false
|
644
|
}
|
645
|
});
|
646
|
panic("Couldn't create group tag") unless defined $r->{tag_group};
|
647
|
|
648
|
# reorder (lexicographically) by slugs
|
649
|
logmsg(NOTICE => "Updating and ordering categories");
|
650
|
my (%categories_sorted, %positions);
|
651
|
my @categories_sorted = sort { $a->{slug} cmp $b->{slug} } @$categories;
|
652
|
foreach (my $i = 0; $i <= $#$categories; $i++) {
|
653
|
$categories_sorted{$categories_sorted[$i]->{slug}} = $i;
|
654
|
}
|
655
|
my $offset = 0;
|
656
|
$r = discourse_req("system", GET => "/categories.json");
|
657
|
foreach my $c (@{$r->{category_list}->{categories}}) {
|
658
|
my $s = $c->{slug} // panic();
|
659
|
my %h;
|
660
|
if ($s eq "uncategorized") {
|
661
|
$h{color} = "3AB54A";
|
662
|
$h{text_color} = "FFFFFF";
|
663
|
$positions{$c->{id}} = 0;
|
664
|
} elsif ($s eq "site-feedback") {
|
665
|
$positions{$c->{id}} = scalar(@$categories) + 1;
|
666
|
} elsif ($s eq "staff") {
|
667
|
$positions{$c->{id}} = scalar(@$categories) + 2;
|
668
|
} elsif ($s eq "lounge") {
|
669
|
$positions{$c->{id}} = scalar(@$categories) + 3;
|
670
|
} elsif (defined (my $i = $categories_sorted{$s})) {
|
671
|
$positions{$c->{id}} = $i + 1;
|
672
|
$h{num_featured_topics} = 1;
|
673
|
} else {
|
674
|
$positions{$c->{id}} = scalar(@$categories) + 4 + $offset;
|
675
|
$offset++;
|
676
|
}
|
677
|
next unless %h;
|
678
|
my $r = discourse_req("system", PUT => "/categories/$c->{id}.json", \%h);
|
679
|
panic("Couldn't update category") unless defined $r->{success} and $r->{success} eq "OK";
|
680
|
}
|
681
|
$r = discourse_req("system", POST => "/categories/reorder",
|
682
|
{ mapping => JSON::->new->encode(\%positions) });
|
683
|
panic("Couldn't reorder categories") unless defined $r->{success} and $r->{success} eq "OK";
|
684
|
}
|
685
|
|
686
|
# AskBot (and SSO linking information) user profiles
|
687
|
{
|
688
|
my $path = dirname($SCRIPT_FILENAME) . "/users.json";
|
689
|
logmsg(NOTICE => "Loading $path");
|
690
|
open my $fh, "<", $path or panic("open($path): $!");
|
691
|
my $profiles = JSON::->new->utf8->decode( do { local $/ = undef; <$fh> } );
|
692
|
close $fh or panic("close: $!");
|
693
|
|
694
|
# merge accounts with the same email address (into the account with lowest ID)
|
695
|
my (%emails, %duplicates);
|
696
|
foreach my $id (keys %$profiles) {
|
697
|
my $p = $profiles->{$id} // panic();
|
698
|
# `lc` is a bit YOLO but Discourse matches email addresses case-insensitively...
|
699
|
my %e = ( lc($p->{email}) => 1 );
|
700
|
$e{lc($_)} = 1 foreach @{$p->{sso_emails} // []};
|
701
|
foreach (keys %e) {
|
702
|
$emails{$_} //= [];
|
703
|
push @{$emails{$_}}, $id;
|
704
|
}
|
705
|
}
|
706
|
foreach my $e (keys %emails) {
|
707
|
my $ids = $emails{$e} // panic();
|
708
|
next unless $#$ids > 0;
|
709
|
my ($id, @duplicates) = sort {$a <=> $b} @$ids;
|
710
|
$duplicates{$id} = \@duplicates;
|
711
|
}
|
712
|
foreach my $id (sort {$a <=> $b} keys %duplicates) {
|
713
|
my $dups = $duplicates{$id};
|
714
|
my $username = $profiles->{$id}->{username} // panic();
|
715
|
logmsg(INFO => "Merging user(s) ", join (", ", map {"#$_"} @$dups),
|
716
|
" with #$id ($username)");
|
717
|
$ASKBOT_DUPLICATES{$_} = $id + 0 foreach @$dups;
|
718
|
}
|
719
|
|
720
|
my @reserved_usernames = # 'reserved_usernames' setting
|
721
|
qw/admin moderator administrator mod sys system community info
|
722
|
you name username user nickname discourse discourseorg
|
723
|
discourseforum support/;
|
724
|
|
725
|
my %usernames;
|
726
|
my @profile_ids = sort {$a <=> $b} keys %$profiles;
|
727
|
|
728
|
# given users #1 "FooBar" and #2 "whatever" with SSO username "foobar", #1 and #2
|
729
|
# are respectively renamed to "foobar1" and "foobar"; we assume users not in SSO
|
730
|
# aren't that active in the project and allow hijacking usernames for SSO users
|
731
|
$usernames{lc($profiles->{$_}->{sso_username})} //= $ASKBOT_DUPLICATES{$_} // $_
|
732
|
# no need to add the version without `lc` here, SSO usernames are case insensitive
|
733
|
foreach grep { defined $profiles->{$_}->{sso_username} } @profile_ids;
|
734
|
|
735
|
# AskBot doesn't have a username requirement but Discourse does (and usernames are matched
|
736
|
# case-insensitively); we try to preserve users on a first come first served basis: given users
|
737
|
# #1 "Foo", #2 "foo", #3 "bar", #4 "bar" then "foo" and "bar" are respectively attributed to #2
|
738
|
# (because the case was correct) and #3 while #1 and #4 are renamed "Foo1" and "bar1"
|
739
|
$usernames{$profiles->{$_}->{username}} //= $ASKBOT_DUPLICATES{$_} // $_ foreach @profile_ids;
|
740
|
$usernames{lc($profiles->{$_}->{username})} //= $ASKBOT_DUPLICATES{$_} // $_ foreach @profile_ids;
|
741
|
|
742
|
# populate %ASKBOT_PROFILES
|
743
|
foreach my $id (@profile_ids) {
|
744
|
my $p = $profiles->{$id} // panic();
|
745
|
my %p = ( username => $p->{username} );
|
746
|
$id += 0; # convert to integer
|
747
|
|
748
|
if (exists $ASKBOT_DUPLICATES{$id}) {
|
749
|
# keep AskBot username for @-addressing mangling but skip other attributes
|
750
|
$ASKBOT_PROFILES{$id} = \%p;
|
751
|
next;
|
752
|
}
|
753
|
|
754
|
# mangle AskBot/SSO username to comply with Discourse's stricter policy
|
755
|
# https://github.com/discourse/discourse/blob/master/app/models/username_validator.rb
|
756
|
my $username = $p->{sso_username} // $p->{username} // panic();
|
757
|
$username =~ s/^\@+//; # @foo -> foo
|
758
|
$username =~ s/\@.*//; # foo@example.net -> foo
|
759
|
$username =~ s/[^\p{Alnum}\p{Mark}._\-]//g; # UNICODE_INVALID_CHAR_PATTERN
|
760
|
$username =~ s/^[^\p{Alnum}\p{Mark}_]+//; # INVALID_LEADING_CHAR_PATTERN
|
761
|
$username =~ s/[^\p{Alnum}\p{Mark}]+$//; # INVALID_TRAILING_CHAR_PATTERN
|
762
|
$username =~ s/([\-_.])+/$1/g; # REPEATED_SPECIAL_CHAR_PATTERN
|
763
|
$username =~ s/\.(js|json|css|html?|xml|jpe?g|png|gif|bmp|ico|tiff?|woff2?)$/-$1/i; # CONFUSING_EXTENSIONS
|
764
|
$username = "unknown" if $username eq "";
|
765
|
|
766
|
# comparison is case insensitive
|
767
|
$username = "not-$username" if grep { lc($username) eq $_ } @reserved_usernames;
|
768
|
|
769
|
while (($usernames{lc($username)} // $id) != $id) {
|
770
|
# username is already taken, increment or append a number
|
771
|
$username .= "1" unless $username =~ s/([0-9]+)$/sprintf("%0*d", length($1), $1 + 1)/e;
|
772
|
}
|
773
|
|
774
|
$usernames{lc($username)} = $id;
|
775
|
$p{discourse_username} = $username;
|
776
|
$p{name} = $p->{name} if defined $p->{name} and $p->{name} ne "";
|
777
|
$p{email} = @{$p->{sso_emails} // []} ? $p->{sso_emails}->[0] : $p->{email};
|
778
|
$ASKBOT_PROFILES{$id} = \%p;
|
779
|
$DISCOURSE_PROFILES{$id} = shared_clone({});
|
780
|
}
|
781
|
|
782
|
# pre-existing users
|
783
|
$DISCOURSE_PROFILES{32931}->{user_id} = 1;
|
784
|
$DISCOURSE_PROFILES{32931}->{trust_level} = 1;
|
785
|
}
|
786
|
|
787
|
|
788
|
#######################################################################
|
789
|
# Import workers, each fetching work from a queue of questions
|
790
|
|
791
|
my $QUEUE = Thread::Queue::->new();
|
792
|
my ($TOPIC_lck, $CLOSE_lck) :shared;
|
793
|
sub import_questions($) {
|
794
|
$WORKER = shift;
|
795
|
useragent_init(); # re-init
|
796
|
|
797
|
my $conf_db = $CONF->{database} // panic();
|
798
|
my $dsn = "DBI:Pg:dbname=$conf_db->{dbname}; host=$conf_db->{host}";
|
799
|
$DBH = DBI::->connect( $dsn, $conf_db->{user} // "", $conf_db->{password} // "", {RaiseError => 1} );
|
800
|
|
801
|
my $sth_set_view_count = $DBH->prepare(q{
|
802
|
UPDATE topics SET views = ? WHERE id = ?
|
803
|
});
|
804
|
my $sth_set_closed_at_by = $DBH->prepare(q{
|
805
|
UPDATE posts SET created_at = to_timestamp(?)
|
806
|
WHERE topic_id = ? AND post_type != 1 AND action_code = 'closed.enabled'
|
807
|
});
|
808
|
|
809
|
my $elapsed = 0;
|
810
|
while (defined (my $q = $QUEUE->dequeue())) {
|
811
|
logmsg(INFO => "question #$q->{id}");
|
812
|
$INDENT = 1;
|
813
|
|
814
|
my @tags = grep # skip reserved names, cf. app/models/tag.rb
|
815
|
{ lc($_) ne "none" and lc($_) ne "constructor" }
|
816
|
# NOTE: discourse mangles tag names, in particular
|
817
|
# turns 'libreoffice-1.2.3' to 'libreoffice-123', cf.
|
818
|
# lib/discourse_tagging.rb:clean_tag()
|
819
|
@{$q->{tags}};
|
820
|
|
821
|
my $lang = $q->{language_code} // panic();
|
822
|
my $category = $CATEGORY_IDs{$lang} // panic($lang);
|
823
|
my $t0 = Time::HiRes::gettimeofday();
|
824
|
|
825
|
my ($author, $t);
|
826
|
eval {
|
827
|
$author = username($q->{author}->{id});
|
828
|
my $needs_post_mangling = mangle_post($author, $q);
|
829
|
|
830
|
# post topic
|
831
|
lock($TOPIC_lck);
|
832
|
$t = discourse_req($author, POST => "/posts.json", {
|
833
|
title => $q->{title}
|
834
|
, raw => $q->{text}
|
835
|
, category => $category->{id}
|
836
|
, tags => \@tags
|
837
|
, created_at => ts2iso8601($q->{added_at})
|
838
|
, skip_validations => JSON::true
|
839
|
});
|
840
|
push @NEEDS_POST_MANGLING, $t->{id} if $needs_post_mangling;
|
841
|
};
|
842
|
if ($@) {
|
843
|
print STDERR $@; # already logged
|
844
|
logmsg(SOFTFAIL => "Couldn't post question #$q->{id}");
|
845
|
next;
|
846
|
}
|
847
|
|
848
|
my $topic_id = $t->{topic_id} // panic();
|
849
|
my $topic_url = "/t/$t->{topic_slug}/$topic_id";
|
850
|
my $num_errors = 0;
|
851
|
|
852
|
# no need to lock here, other threads will never store that ID
|
853
|
$URLMAP{$q->{id}} = $topic_url;
|
854
|
|
855
|
# (recursively) process replies
|
856
|
$num_errors += replies(
|
857
|
language_code => $lang
|
858
|
, topic_id => $topic_id
|
859
|
, reply_to_post_number => $t->{post_number}
|
860
|
, replies => $q->{comment_ids}
|
861
|
) if @{$q->{comment_ids}};
|
862
|
|
863
|
$num_errors += replies(
|
864
|
language_code => $lang
|
865
|
, topic_id => $topic_id
|
866
|
# answers have null has reply_to_post_number attribute, see
|
867
|
# https://github.com/paviliondev/discourse-question-answer/blob/master/extensions/topic_extension.rb#L16
|
868
|
, reply_to_post_number => undef
|
869
|
, replies => $q->{answer_ids}
|
870
|
, accepted_answer_id => $q->{accepted_answer_id}
|
871
|
, accepted_by => $author # assume accepted by OP
|
872
|
) if @{$q->{answer_ids}};
|
873
|
|
874
|
if ($q->{closed} == JSON::true) {
|
875
|
logmsg(INFO => " " x $INDENT, "close question #$q->{id}");
|
876
|
$INDENT++;
|
877
|
my $id = $q->{closed_by}->{id} // panic();
|
878
|
my $username = eval { username($id) };
|
879
|
if ($@) {
|
880
|
print STDERR $@; # already logged
|
881
|
logmsg(SOFTFAIL => "Couldn't create user #$id");
|
882
|
} else {
|
883
|
{
|
884
|
my $p = $DISCOURSE_PROFILES{ $ASKBOT_DUPLICATES{$id} // $id } // panic();
|
885
|
lock(%$p); # only need to lock that user's profile
|
886
|
if ((my $tl = $p->{trust_level}) < 4) {
|
887
|
logmsg(DEBUG => "Temporarily promoting $username to TL4");
|
888
|
discourse_req("system", PUT => "/admin/users/$p->{user_id}/trust_level",
|
889
|
{ level => 4 });
|
890
|
$p->{trust_level} = 4;
|
891
|
$p->{real_trust_level} = $tl;
|
892
|
}
|
893
|
}
|
894
|
lock($CLOSE_lck);
|
895
|
my $r = discourse_req($username, PUT => "/t/$topic_id/status.json",
|
896
|
{ status=> "closed", enabled => "true" });
|
897
|
panic(JSON::->new->utf8->encode($r)) unless defined $r->{success} and $r->{success} eq "OK";
|
898
|
}
|
899
|
|
900
|
# database surgery: set closing date from AskBot
|
901
|
$sth_set_closed_at_by->bind_param(1, $q->{closed_at}, SQL_INTEGER);
|
902
|
$sth_set_closed_at_by->bind_param(2, $topic_id, SQL_INTEGER);
|
903
|
my $r = $sth_set_closed_at_by->execute();
|
904
|
panic($r) unless $r == 1;
|
905
|
$INDENT--;
|
906
|
}
|
907
|
|
908
|
if ((my $view_count = $q->{view_count}) > 0) {
|
909
|
# database surgery: set view count on the topic
|
910
|
$sth_set_view_count->bind_param(1, $view_count, SQL_INTEGER);
|
911
|
$sth_set_view_count->bind_param(2, $topic_id, SQL_INTEGER);
|
912
|
my $r = $sth_set_view_count->execute();
|
913
|
panic($r) unless $r == 1;
|
914
|
}
|
915
|
|
916
|
logmsg(WARN => "$topic_url (AskBot question #$q->{id}) has >=$num_errors missing replies!")
|
917
|
if $num_errors > 0;
|
918
|
$elapsed += Time::HiRes::gettimeofday() - $t0;
|
919
|
}
|
920
|
|
921
|
logmsg(INFO => sprintf("Worker stats: total active time %.2fs", $elapsed));
|
922
|
logmsg(INFO => sprintf("AskBot stats: %d requests, %.2fs total", $ASKBOT_COUNT, $ASKBOT_TIME));
|
923
|
logmsg(INFO => sprintf("Discourse stats: %d requests, %.2fs total", $DISCOURSE_COUNT, $DISCOURSE_TIME));
|
924
|
}
|
925
|
|
926
|
logmsg(NOTICE => "Spawning $NUM_WORKERS workers");
|
927
|
$QUEUE->limit = 4 * $NUM_WORKERS; # no need to prefetch pages if the workers can't keep up
|
928
|
threads->create(\&import_questions, $_) for 1 .. $NUM_WORKERS;
|
929
|
|
930
|
|
931
|
#######################################################################
|
932
|
# Queue up questions
|
933
|
# we process them ordered by ID so lower question IDs get lower topic IDs
|
934
|
|
935
|
#{
|
936
|
# my %question_ids = (
|
937
|
# 161330 => "en" # @-addressing with old username
|
938
|
# , 238673 => "en" # closed
|
939
|
# , 302629 => "en" # upvote, accepted answer
|
940
|
# , 303050 => "en" # screenshots, attachments (both with C:\fakepath\ and without)
|
941
|
# , 285734 => "ja" # URL to profile, unicode attachements
|
942
|
# , 161228 => "en" # question linked in the above, OneBox
|
943
|
# , 304617 => "en" # closed by someone with TL < 4
|
944
|
# , 161334 => "en" # Hello1 -> Hello2
|
945
|
# # duplicate accounts
|
946
|
# , 18673 => "en", 1505 => "en"
|
947
|
# , 25329 => "pt-br", 128011 => "pt-br"
|
948
|
# );
|
949
|
#
|
950
|
# foreach my $id (sort {$a <=> $b} keys %question_ids) {
|
951
|
# my $lang = $question_ids{$id};
|
952
|
# my $url = "/$lang/api/v1/questions/$id/";
|
953
|
# my $q = askbot_get($url);
|
954
|
# $q->{language_code} = $lang;
|
955
|
# $QUEUE->enqueue($q);
|
956
|
# }
|
957
|
#}
|
958
|
|
959
|
my $PAGE_NUM = 0; # number of most recent pages to process (0 means ALL)
|
960
|
my %PAGES;
|
961
|
foreach my $lang (keys %CATEGORY_IDs) {
|
962
|
my $url = "/$lang/api/v1/questions/";
|
963
|
|
964
|
my $offset = 1; # index of the first page to consider
|
965
|
unless ($PAGE_NUM == 0) {
|
966
|
my $h = askbot_get($url);
|
967
|
$offset = $h->{pages} - $PAGE_NUM + 1 if $PAGE_NUM < $h->{pages};
|
968
|
}
|
969
|
|
970
|
my $h = askbot_get("$url?sort=age-asc&page=$offset");
|
971
|
my @questions = sort { $a->{id} <=> $b->{id} } @{$h->{questions}}; # should be sorted already
|
972
|
logmsg(NOTICE => "Loaded /$lang/questions page $offset/$h->{pages}");
|
973
|
|
974
|
# buffer the page
|
975
|
$PAGES{$lang} = {
|
976
|
current_page => $offset
|
977
|
, pages => $h->{pages}
|
978
|
, questions => \@questions
|
979
|
, url => $url
|
980
|
} if @questions;
|
981
|
}
|
982
|
|
983
|
#my $COUNT = 1000;
|
984
|
MAINLOOP: while (%PAGES) {
|
985
|
# find language code with the lowest question ID
|
986
|
my ($alang, $blang) = sort { $PAGES{$a}->{questions}->[0]->{id} <=> $PAGES{$b}->{questions}->[0]->{id} }
|
987
|
keys %PAGES;
|
988
|
my $p = $PAGES{$alang};
|
989
|
|
990
|
# process questions from $alang that have a lower ID than any other language
|
991
|
# (ie, until we reach the first question from $blang)
|
992
|
my $questions = $p->{questions};
|
993
|
my $bound = defined $blang ? $PAGES{$blang}->{questions}->[0]->{id} : -1;
|
994
|
while ($bound < 0 or $questions->[0]->{id} < $bound) {
|
995
|
my $q = shift @$questions;
|
996
|
$q->{language_code} = $alang;
|
997
|
$QUEUE->enqueue($q);
|
998
|
#last MAINLOOP unless --$COUNT > 0;
|
999
|
next if @$questions; # $alang has some questions buffered still
|
1000
|
|
1001
|
unless ($p->{current_page} < $p->{pages}) {
|
1002
|
logmsg(NOTICE => "Done queuing /$alang/questions");
|
1003
|
delete $PAGES{$alang};
|
1004
|
last;
|
1005
|
}
|
1006
|
|
1007
|
# $alang's highest buffered question ID was lower that $blang's lowest, need to load another page
|
1008
|
my $offset = ++$p->{current_page};
|
1009
|
my $h = askbot_get($p->{url} . "?sort=age-asc&page=$offset");
|
1010
|
logmsg(NOTICE => "Loaded /$alang/questions page $offset/$h->{pages}");
|
1011
|
$p->{pages} = $h->{pages}; # update page count, there might be new posts
|
1012
|
|
1013
|
@$questions = sort { $a->{id} <=> $b->{id} } @{$h->{questions}}; # should be sorted already
|
1014
|
}
|
1015
|
}
|
1016
|
|
1017
|
|
1018
|
#######################################################################
|
1019
|
# Done!
|
1020
|
|
1021
|
$QUEUE->end();
|
1022
|
$_->join() foreach threads->list();
|
1023
|
logmsg(NOTICE => "Done importing");
|
1024
|
|
1025
|
logmsg(NOTICE => "Updating categories");
|
1026
|
foreach my $c (values %CATEGORY_IDs) {
|
1027
|
my $r = discourse_req("system", PUT => "/categories/$c->{id}.json", {
|
1028
|
num_featured_topics => 3 # restore default
|
1029
|
, email_in => "question+$c->{slug}\@ask.libreoffice.org"
|
1030
|
, required_tag_group_name => $TAG_GROUP_LIBO_COMPONENT
|
1031
|
, min_tags_from_required_group => 1
|
1032
|
, allow_global_tags => "true"
|
1033
|
});
|
1034
|
panic("Couldn't update category") unless defined $r->{success} and $r->{success} eq "OK";
|
1035
|
}
|
1036
|
|
1037
|
{
|
1038
|
# generate an AskBot question/answer/comment ID -> Discourse path for nginx rewriting
|
1039
|
my $pathmap = dirname($SCRIPT_FILENAME) . "/nginx.map";
|
1040
|
if (open my $fh, ">", $pathmap) {
|
1041
|
printf $fh "%-6d %s;\n", $_, $URLMAP{$_} foreach sort {$a <=> $b} keys %URLMAP;
|
1042
|
} else {
|
1043
|
logmsg(WARN => "Couldn't open $pathmap for writing");
|
1044
|
}
|
1045
|
}
|
1046
|
|
1047
|
|
1048
|
#######################################################################
|
1049
|
# Post processing
|
1050
|
|
1051
|
sub post_process($) {
|
1052
|
$WORKER = shift;
|
1053
|
while (defined (my $x = $QUEUE->dequeue())) {
|
1054
|
my $action = $x->{action} // panic();
|
1055
|
if ($action eq "mangle_post") {
|
1056
|
# try again to mangle internal links now that we have the full URL map
|
1057
|
my $id = $x->{id} // panic();
|
1058
|
logmsg(INFO => "Post-mangling post #$id");
|
1059
|
my $p = discourse_req("system", GET => "/posts/$id.json");
|
1060
|
|
1061
|
my $modified = mangle_post_internal_links($id => \$p->{raw}, 1);
|
1062
|
unless ($modified) {
|
1063
|
logmsg(INFO => "Not updating unchanged post #$id");
|
1064
|
next;
|
1065
|
}
|
1066
|
discourse_req("system", PUT => "/posts/$id.json", {
|
1067
|
raw => $p->{raw}
|
1068
|
, edit_reason => "Fix internal links (post AskBot migration)."
|
1069
|
# XXX the below requires patching app/controllers/posts_controller.rb
|
1070
|
, skip_validations => JSON::true
|
1071
|
, no_bump => JSON::true
|
1072
|
});
|
1073
|
}
|
1074
|
elsif ($action eq "trust_level") {
|
1075
|
# reset TL back to its original value (before we raised it to close the issue)
|
1076
|
my $id = $x->{id} // panic();
|
1077
|
my $p = $DISCOURSE_PROFILES{$id} // panic($id); # never a duplicate
|
1078
|
my $username = $ASKBOT_PROFILES{$id}->{discourse_username} // panic($id);
|
1079
|
my $level = $p->{real_trust_level} // panic();
|
1080
|
logmsg(INFO => "Demoting $username back to TL$level");
|
1081
|
discourse_req("system", PUT => "/admin/users/$p->{user_id}/trust_level", { level => $level });
|
1082
|
}
|
1083
|
else {
|
1084
|
panic($action);
|
1085
|
}
|
1086
|
}
|
1087
|
}
|
1088
|
|
1089
|
my @FIX_TRUST_LEVELS = grep { defined $DISCOURSE_PROFILES{$_}->{real_trust_level} }
|
1090
|
keys %DISCOURSE_PROFILES;
|
1091
|
if (@NEEDS_POST_MANGLING or @FIX_TRUST_LEVELS) {
|
1092
|
logmsg(NOTICE => "Starting post-processing");
|
1093
|
$QUEUE = Thread::Queue::->new();
|
1094
|
threads->create(\&post_process, $_) for 1 .. $NUM_WORKERS;
|
1095
|
|
1096
|
$QUEUE->enqueue({ id => $_, action => "mangle_post" }) foreach @NEEDS_POST_MANGLING;
|
1097
|
$QUEUE->enqueue({ id => $_, action => "trust_level" }) foreach @FIX_TRUST_LEVELS;
|
1098
|
$QUEUE->end();
|
1099
|
$_->join() foreach threads->list();
|
1100
|
}
|
1101
|
|
1102
|
|
1103
|
#######################################################################
|
1104
|
logmsg(NOTICE => "Finished!");
|