Project

General

Profile

Task #2952 » migrate.pl

migration script - Guilhem Moulin, 2021-07-31 01:46

 
1
#!/usr/bin/perl -T
2

    
3
use v5.10;
4
use strict;
5
use warnings;
6

    
7
use threads;
8
use threads::shared;
9
use Thread::Queue ();
10

    
11
BEGIN {
12
    # JSON::XS is not thread-safe...
13
    $ENV{PERL_JSON_BACKEND} = "Cpanel::JSON::XS";
14
}
15

    
16
use Config::Tiny ();
17
use Crypt::URandom qw/urandom/;
18
use DBI qw/:sql_types/;
19
use Encode ();
20
use File::Basename qw/basename dirname/;
21
use JSON ();
22
use LWP::MediaTypes qw/guess_media_type/;
23
use LWP::UserAgent ();
24
use POSIX ();
25
use URI::Escape qw/uri_escape_utf8/;
26
use Time::HiRes qw/usleep/;
27

    
28
my $SCRIPT_FILENAME = $0 =~ /\A(\p{Graph}+)\z/ ? $1 : die;
29
my $CONF = Config::Tiny::->read("$SCRIPT_FILENAME.conf") or die Config::Tiny::->errstr()."\n";
30
my ($DBH, $ASKBOT, $DISCOURSE, $WORKER, $INDENT);
31
my ($DISCOURSE_TIME, $DISCOURSE_COUNT, $ASKBOT_TIME, $ASKBOT_COUNT);
32
my $NUM_WORKERS = 8;
33

    
34
#######################################################################
35

    
36
# Logging
37
open my $LOG, ">", "$SCRIPT_FILENAME.log" or die "open($SCRIPT_FILENAME.log): $!";
38
sub logmsg($@) {
39
    my $level = shift;
40
    my @msg = @_;
41
    $_ = Encode::encode("UTF-8", $_) foreach @msg;
42
    my $level2 = $level eq "SOFTFAIL" ? "ERROR" : $level;
43
    unshift @msg, $level2, " ";
44
    unshift @msg, sprintf("[%0*d] ", length($NUM_WORKERS), $WORKER) if defined $WORKER;
45
    push @msg, "\n";
46
    $LOG->printflush( POSIX::strftime("%b %d %H:%M:%S ", localtime), @msg );
47
    if ($level eq "WARN" or $level eq "NOTICE" or $level eq "SOFTFAIL") {
48
        print STDERR @msg;
49
    } elsif ($level eq "PANIC" or $level eq "ERROR") {
50
        die @msg;
51
    }
52
}
53
sub fail(@) {
54
    return logmsg(ERROR => @_);
55
}
56
sub panic(@) {
57
    my @msg = @_;
58
    my @loc = caller;
59
    push @msg, " " if @msg;
60
    push @msg, "at line $loc[2] in $loc[1]";
61
    return logmsg(PANIC => @msg);
62
}
63

    
64
# useragent_init()
65
#   (Re-)init user agent, incl. connection cache.
66
sub useragent_init() {
67
    $ASKBOT = LWP::UserAgent::->new(
68
        keep_alive => 1,
69
        ssl_opts => { verify_hostname => 1, SSL_hostname => $CONF->{askbot}->{host} },
70
        default_headers => HTTP::Headers::->new()
71
    );
72
    $DISCOURSE = LWP::UserAgent::->new(
73
        keep_alive => 1,
74
        ssl_opts => { verify_hostname => 1 },
75
        default_headers => HTTP::Headers::->new(
76
            "Accept" => "application/json",
77
            "Api-Key" => $CONF->{discourse}->{apikey}
78
        )
79
    );
80
    ($DISCOURSE_TIME, $DISCOURSE_COUNT, $ASKBOT_TIME, $ASKBOT_COUNT) = (0,0,0,0);
81
}
82

    
83
# decode_json_response($AGENT => $RESPONSE)
84
#   Decode an application/json HTTP response.  If a 5XY HTTP response
85
#   was received, try again to send the request with a 1min timeout.
86
sub decode_json_response($$;$);
87
sub decode_json_response($$;$) {
88
    my ($ua, $resp, $timeout) = @_;
89

    
90
    if ($resp->is_success) {
91
        my $data = $resp->decoded_content();
92
        return JSON::->new->utf8->decode($data);
93
    }
94

    
95
    my $code = $resp->code();
96
    my $req = $resp->request();
97

    
98
    # random delay between 0.75 and 1.25 seconds to help solving race conditions
99
    # between threads
100
    $timeout = defined $timeout ? ($timeout * 2) : (int(rand(500_000)) + 750_000);
101
    if ($code >= 500 and $code <= 599 and $timeout <= 320_000_000) {
102
        # upon 5XY response codes, retry up to 5 minutes
103
        logmsg(INFO => "Received $code from <", $req->uri(), ">; ",
104
            "will retry in ", int($timeout/1000), "ms");
105
        usleep($timeout);
106
        return decode_json_response($ua => $ua->request($req), $timeout);
107
    }
108

    
109
    my @error = ($resp->status_line(), " ", $req->method(), " <", $req->uri(), ">");
110
    my $content_type = $resp->content_type();
111
    if ($content_type eq "application/json") {
112
        my $content = $resp->decoded_content();
113
        chomp $content;
114
        push @error, " ", $content;
115
    }
116
    fail(@error);
117
}
118

    
119
# askbot_get($URL)
120
#   Send a request to the AskBot API and decode the JSON response.
121
sub askbot_get($) {
122
    my $url = $CONF->{askbot}->{apiurl} . shift;
123
    my $t0 = Time::HiRes::gettimeofday();
124
    my $resp = $ASKBOT->get($url, Host => $CONF->{askbot}->{host}, Accept => "application/json");
125
    $ASKBOT_TIME += Time::HiRes::gettimeofday() - $t0;
126
    $ASKBOT_COUNT++;
127
    return decode_json_response($ASKBOT => $resp);
128
}
129

    
130
# discourse_req($USERNAME, $METHOD, $PATH, [$BODY])
131
#   Send a request to the Discourse API and decode the JSON response.
132
sub discourse_req($$$;$) {
133
    my ($username, $method, $path, $body) = @_;
134
    my $req = HTTP::Request::->new( $method => $CONF->{discourse}->{apiurl} . $path );
135
    $req->header( Host => $CONF->{discourse}->{host} );
136
    # the API doesn't mention case for usernames, but seems we need it
137
    # for some Unicode usernames...
138
    $req->header( "Api-Username" => Encode::encode("UTF-8", lc($username)) );
139
    if (defined $body) {
140
        $req->content_type("application/json");
141
        $req->content(JSON::->new->utf8->encode($body));
142
    }
143
    my $t0 = Time::HiRes::gettimeofday();
144
    my $resp = $DISCOURSE->request($req);
145
    $DISCOURSE_TIME += Time::HiRes::gettimeofday() - $t0;
146
    $DISCOURSE_COUNT++;
147
    return decode_json_response($DISCOURSE => $resp);
148
}
149

    
150
# upload($USERNAME, $PATH, type => $TYPE, ...)
151
#   Download $PATH from AskBot and re-upload to Discourse; returns the
152
#   upload response from the Discourse API, cf.
153
#   https://docs.discourse.org/#tag/Uploads/paths/~1uploads.json/post
154
#
155
#   If the download yields 404 then returns undef instead.
156
sub upload($$%) {
157
    my $username = shift;
158
    my $path = shift;
159
    my %attrs = @_;
160

    
161
    my $filename = delete $attrs{filename} // basename($path);
162
    my $content_type = guess_media_type($filename);
163

    
164
    logmsg(INFO => "    " x $INDENT, "upload $path ($filename)");
165
    panic($path) unless $path =~ /\A\//;
166

    
167
    my $uri = $CONF->{askbot}->{apiurl} . $path;
168
    my $t0 = Time::HiRes::gettimeofday();
169
    my $resp = $ASKBOT->get($uri, Host => $CONF->{askbot}->{host}) or panic();
170
    $ASKBOT_TIME += Time::HiRes::gettimeofday() - $t0;
171
    $ASKBOT_COUNT++;
172
    unless ($resp->is_success) {
173
        my @error = ($resp->status_line(), " <", $uri, ">");
174
        if ($resp->code() == 404) {
175
            logmsg(WARN => @error);
176
            return;
177
        } else {
178
            panic(@error);
179
        }
180
    }
181

    
182
    # RFC 1867, using HTTP::Request::Common's form_ref
183
    my @form_data = ( undef, Encode::encode("UTF-8", $filename) );
184
    push @form_data, "Content-type" => $content_type
185
                   ,  Content => $resp->decoded_content();
186

    
187
    my $t1 = Time::HiRes::gettimeofday();
188
    $resp = $DISCOURSE->post( $CONF->{discourse}->{apiurl} . "/uploads.json",
189
        "Api-Username" => Encode::encode("UTF-8", lc($username)),
190
        Host => $CONF->{discourse}->{host},
191
        Content_Type => "form-data",
192
        Content => [ %attrs, synchronous => "true", "files[]" => \@form_data ]
193
    );
194
    $DISCOURSE_TIME += Time::HiRes::gettimeofday() - $t1;
195
    $DISCOURSE_COUNT++;
196

    
197
    return decode_json_response($DISCOURSE => $resp);
198
}
199

    
200
# username($ID)
201
#   Returns the Discourse username associated with the AskBot user ID.
202
#   The Discourse user is created if it doesn't already exist.
203
my %ASKBOT_PROFILES; # pre-existing AskBot profiles, incl. SSO linking information
204
my %ASKBOT_DUPLICATES;
205
my %DISCOURSE_PROFILES; # map AskBot user ID -> Discourse profile
206
sub username($);
207
sub username($) {
208
    my $orig_id = shift;
209
    my $id = $ASKBOT_DUPLICATES{$orig_id} // $orig_id;
210

    
211
    state $sth_user_lookup = $DBH->prepare(q{
212
        SELECT username FROM users WHERE id = ?
213
    });
214

    
215
    my $profile = $ASKBOT_PROFILES{$id} // fail("Can't find user #$id in users.json");
216
    my $username = $profile->{discourse_username} // panic($id);
217

    
218
    my ($u, $user_id);
219
    {
220
        my $p = $DISCOURSE_PROFILES{$id} // panic($id);
221
        lock(%$p); # only need to lock that user's profile
222
        return $username if exists $p->{user_id}; # already created
223

    
224
        # user API is not localized, let's stick to english
225
        $u = askbot_get("/en/api/v1/users/$id/");
226

    
227
        my $msg = $profile->{username};
228
        $msg .= " -> $username" if $username ne $profile->{username};
229
        $msg .= ", duplicate #$orig_id" if $id != $orig_id;
230
        logmsg(INFO => "    " x $INDENT, "user #$id ($msg)");
231

    
232
        my $email = $profile->{email} // panic();
233
        if ($email =~ /\@(?:mailinator|trashmail)\.[a-z]+$/i
234
                or $email !~ /\@[0-9a-z\-]+(?:\.[0-9a-z\-]+)+$/i) {
235
            logmsg(WARN => "user #$id ($username) has invalid email address <$email>");
236
            # create the account for content attribution but lock them out
237
            $email = "noreply+$username\@documentfoundation.org";
238
        }
239
        my $r = discourse_req("system", POST => "/users.json", {
240
            username => $username
241
          , password => unpack("H*", urandom(32)) # XXX required even with SSO
242
          , name => $profile->{name} // ""
243
          , email => $email
244
          , active => JSON::true
245
          , approved => JSON::true
246
        });
247
        panic(JSON::->new->utf8->encode($r)) unless defined $r->{success} and $r->{success} == JSON::true;
248
        $user_id = $r->{user_id} // panic($id);
249

    
250
        $sth_user_lookup->bind_param(1, $user_id, SQL_INTEGER);
251
        $sth_user_lookup->execute();
252
        my ($username2) = $sth_user_lookup->fetchrow();
253
        $sth_user_lookup->finish();
254
        unless (defined $username2 and $username2 eq $username) {
255
            # there is a race condition causing PostgreSQL to sometimes abort the transaction:
256
            # Discourse reports the user as created but any attempt to use it as Api-Username fails
257
            logmsg(SOFTFAIL => "Couldn't create user #$orig_id ($profile->{username}), retrying...");
258
            return username($orig_id);
259
        }
260

    
261
        # assign a Trust Level to the newly created account, depending on the users's activity on
262
        # AskBot, in order to bootstrap the site
263
        my $tl = ($u->{reputation} >= 1000 and time - $u->{last_seen_at} <= 31557600) ? 4
264
               : ($u->{reputation} >= 250 and $u->{answers} >= 100 and time - $u->{last_seen_at} <= 31557600) ? 3
265
               : ($u->{answers} + $u->{comments} >= 100) ? 2
266
               : ($u->{questions} + $u->{answers} + $u->{comments} >= 5) ? 1
267
               : 0;
268
        # leave non-returning (less than 10 between creation and last seen date) users at TL0
269
        $tl = 0 if $u->{last_seen_at} - $u->{joined_at} < 864000;
270
        discourse_req("system", PUT => "/admin/users/$user_id/trust_level", { level => $tl })
271
            unless $tl == 0;
272

    
273
        $p->{user_id} = $user_id;
274
        $p->{trust_level} = $tl;
275
    } # account was created and TL assigned, we can release the lock on $DISCOURSE_PROFILES{$id}
276
    $INDENT++;
277

    
278
    # preserve avatar
279
    my %avatar_upload;
280
    my $username_escaped = uri_escape_utf8(lc($username));
281
    if ($u->{avatar} =~ m#^(?:(?:https?:)?//www\.gravatar\.com/avatar)?/#) {
282
        my $path = "/user_avatar/$username_escaped/refresh_gravatar.json";
283
        my $r = discourse_req($username, POST => $path, { username => $username });
284
        $avatar_upload{upload_id} = $r->{gravatar_upload_id};
285
        $avatar_upload{type} = "gravatar";
286
    } elsif ($u->{avatar} =~ s,^(?:(?:https?:)?//ask\.libreoffice\.org)?(/[^/?#][^?#]*).*,$1,i) {
287
        unless ($u->{avatar} eq "/m/default/media/images/nophoto.png") {
288
            $u->{avatar} =~ s#^(/upfiles/avatars/[^/]+)/resized/[0-9]+/([^/]+)$#$1/$2#;
289
            my $r = eval { upload($username, $u->{avatar}, type => "avatar", user_id => $user_id) };
290
            if ($@) {
291
                print STDERR $@; # already logged
292
            } else {
293
                $avatar_upload{upload_id} = $r->{id};
294
                $avatar_upload{type} = "uploaded";
295
            }
296
        }
297
    } else {
298
        logmsg(WARN => "$username: unknown avatar URL $u->{avatar}");
299
    }
300

    
301
    if (%avatar_upload) {
302
        my $errstr = "$username: Couldn't set avatar ($avatar_upload{type})";
303
        if (defined $avatar_upload{upload_id}) {
304
            my $path = "/u/$username_escaped/preferences/avatar/pick.json";
305
            my $r = eval { discourse_req($username, PUT => $path, \%avatar_upload) };
306
            unless (!$@ and defined $r->{success} and $r->{success} eq "OK") {
307
                print STDERR $@ if $@; # already logged
308
                $errstr .= ": " . $r->{message} if defined $r and defined $r->{message};
309
                logmsg(SOFTFAIL => $errstr);
310
            }
311
        } else {
312
            logmsg(WARN => $errstr);
313
        }
314
    }
315

    
316
    # database surgery: set created and last seen dates from AskBot
317
    state $sth_user_last = $DBH->prepare(q{
318
        UPDATE users SET created_at = to_timestamp(?), last_seen_at = to_timestamp(?)
319
        WHERE id = ?
320
    });
321
    $sth_user_last->bind_param(1, $u->{joined_at},    SQL_INTEGER);
322
    $sth_user_last->bind_param(2, $u->{last_seen_at}, SQL_INTEGER);
323
    $sth_user_last->bind_param(3, $user_id,           SQL_INTEGER);
324
    my $r = $sth_user_last->execute();
325
    panic($r) unless $r == 1;
326

    
327
    $INDENT--;
328
    return $username;
329
}
330

    
331
# mangle_post_upload($USERNAME, $TEXT => $PATH, $INLINE)
332
#   Return a Markdown string with given $TEXT and AskBot source link
333
#   (attachment or screeshot).  The source is downloaded and re-uploaded
334
#   to Discourse.
335
sub mangle_post_upload($$$$) {
336
    my ($username, $text, $path, $inline) = @_;
337
    my $filename;
338
    unless ($inline) {
339
        # fix ugly C:\fakepath\something cf. https://www.mediawiki.org/wiki/Special:Code/MediaWiki/83225
340
        $text =~ s/\AC:\\fakepath\\([^\\]+)\z/$1/;
341
        my $ext = $path =~ /[^.\/\\]\.([a-zA-Z](?:-?[a-zA-Z0-9])*)\z/ ? $1 : undef;
342
        $filename = !(defined $ext and $text =~ /[^.\/\\]\.\Q$ext\E\z/i) ? basename($path)
343
                  : $text =~ /\A[A-Z]:\\.*\\(.+)\z/ ? basename($1)
344
                  : basename($text);
345
        $filename =~ tr[\t\n\f\r ][     ]s;
346
    }
347
    my $u = eval { upload($username, $path, type => "composer", filename => $filename) };
348
    unless (!$@ and defined $u) {
349
        # 404 or fatal error, can't do better
350
        print STDERR $@ if $@; # already logged
351
        logmsg(SOFTFAIL => "Couldn't re-upload $path");
352
        return;
353
    }
354
    return $inline ? "![$text]($u->{short_url})" : "[$text|attachment]($u->{short_url})";
355
}
356

    
357
# mangle_post_internal_link($ID, $LINK, $FINAL, $REF)
358
#   Return appropriate replacement for AskBot internal $LINK in post $ID.
359
#   The 3rd argument is a boolean indicating whether the import is
360
#   ongoing or if we're at the final phase.
361
#   The 4rd argument is a reference to a variable that is is set to 1 if
362
#   $FINAL is false and the mangling couldn't proceeed because the
363
#   AskBot question hasn't been processed yet, or if $FINAL is true and
364
#   a suitable substitute was found.
365
my %URLMAP :shared; # map AskBot post ID -> Discourse URL
366
my %CATEGORY_IDs;   # map language code -> Discourse category
367
sub mangle_post_internal_link($$$$) {
368
    my ($post_id, $link, $final, $ref_rv) = @_;
369

    
370
    my $uri = $link;
371
    my $hash  = $uri =~ s/\#(.*)// ? $1 : "";
372
    my $query = $uri =~ s/\?(.*)// ? $1 : "";
373
    my $lang = $uri =~ m,^                        /(?:question|user)s? (?:/|$),x ? "en"
374
             : $uri =~ s,^/([\p{PosixAlnum}\-_]+)(/(?:question|user)s? (?:/|$)),$2,x ? $1
375
             : $uri =~ s,^/([\p{PosixAlnum}\-_]+) /?$,/questions/,x ? $1
376
             : undef;
377

    
378
    my $newlink;
379
    unless (defined $lang) {
380
        return unless $uri eq "" or $uri eq "/"; # can't do better
381
        return if $CONF->{discourse}->{baseurl} eq "https://ask.libreoffice.org" and $query eq "" and $hash eq "";
382
        $newlink = "/"; # site root (ignore query and hash)
383
    } elsif ($uri =~ s#^/users?/([0-9]+)(?:/|$)##) {
384
        # don't create accounts during the final phase
385
        return if $final;
386
        my $username = eval { username($1) };
387
        if ($@) {
388
            print STDERR $@; # already logged
389
            logmsg(SOFTFAIL => "Couldn't create user #$1");
390
            return;
391
        }
392
        $newlink = "/u/" . uri_escape_utf8(lc($username)); # profile URL
393
    } elsif ($uri =~ s#^/questions?/([0-9]+)(?:/|$)##) {
394
        my $id = $1;
395
        my $old_post_id =
396
            $hash  =~ /^post-id-([0-9]+)$/                        ? $1
397
          : $query =~ /(?:^|&)(?:answer|comment)=([0-9]+)(?:&|$)/ ? $1
398
          : $id;
399
        unless (defined ($newlink = $URLMAP{$old_post_id})) {
400
            if (!$final) {
401
                $$ref_rv = 1; # needs post mangling
402
            } else {
403
                logmsg(WARN => "#$post_id: Can't mangle <$link> (missing #$old_post_id)");
404
            }
405
            return;
406
        }
407
    } elsif ($uri =~ m#^/questions?/?$#) {
408
        my $c = $CATEGORY_IDs{$lang} // return;
409
        $newlink = "/c/$c->{slug}/$c->{id}";
410
    } elsif ($uri =~ m#^/questions?/ask/?$#) {
411
        my $c = $CATEGORY_IDs{$lang} // return;
412
        $newlink = "/new-topic?category=$c->{slug}";
413
    } elsif ($uri =~ m#^/users?/?$#) {
414
        $newlink = "/u";
415
    } else {
416
        return;
417
    }
418
    $$ref_rv = 1 if $final; # modified
419
    logmsg(DEBUG => "#$post_id: Mangling <$link> -> <$newlink>");
420
    return $CONF->{discourse}->{baseurl} . $newlink;
421
}
422

    
423
# mangle_post_internal_links($ID, $REF_TEXT, [$FINAL])
424
#   Mangle all internal question/answer/comment/user links.  The 3rd
425
#   argument is a boolean indicating whether the import is ongoing or if
426
#   we're at the final phase.
427
#   Return a boolean, which when $FINAL is false indicates whether the
428
#   post needs post-mangling during the final phase, and when $FINAL is
429
#   true indicates whether the input text was mangled.
430
sub mangle_post_internal_links($$;$) {
431
    my ($id, $text, $final) = @_;
432
    my $rv = 0;
433

    
434
    $$text =~ s#\b (?i:(?:https?:)//ask\.libreoffice\.org)
435
            ( / [^\P{Graph}!"'\(\)<>\[\]\`\{\}]+? )
436
            (?! [^\P{Graph}!"'\(\),\.<>\?\[\]\`\{\}] | [!',\.\?]\p{Graph} )
437
            # mangle_post_internal_link($id, $1, $final => \$rv) // $& #egx;
438

    
439
    if ($final or !$rv) {
440
        # note: the regexp is intentionally loose
441
        logmsg(WARN => "Post #$id contains dangling internal link <$_>") foreach
442
            $$text =~ m#\b(?i:(?:https?:)//ask\.libreoffice\.org)/(?![ctu]/)\S+#g;
443
    }
444

    
445
    return $rv;
446
}
447

    
448
# mangle_post($AUTHOR, $POST)
449
#   Mangle $POST->{text} in place:
450
#   * fix attachments and screenshots;
451
#   * fix link to other profiles, "normal" links, bare URLs, and @-addressing
452
#     (the target profile is created if it doesn't exist); and
453
#   * fix links to questions, answers and comments.
454
#   Return a boolean indicating whether the post needs further mangling
455
#   during the final phase.
456
sub mangle_post($$) {
457
    my ($author, $p) = @_;
458

    
459
    # fix attachments and screenshots (an empty link text is allowed for screenshots, and we also
460
    # allow it for atttachment since we append "|attachment")
461
    $p->{text} =~ s#!\[ ([^\[\]\(\)]*) \]\( (?i:(?:https?:)?//ask\.libreoffice\.org)? (/m/[\p{PosixAlnum}\-_]+/media/images/[^\[\]\(\)]+) \)#
462
            mangle_post_upload("system", $1 => $2, 1) // $& #egx; # NOTE: discourse does deduplication
463
    $p->{text} =~ s#(!)?\[ ([^\[\]\(\)]*) \]\( (?i:(?:https?:)?//ask\.libreoffice\.org)? (/upfiles/[^\[\]\(\)]+) \)#
464
            mangle_post_upload($author, $2 => $3, $1) // $& #egx;
465
    # NOTE: mangle_post_upload() can fail if it tries to upload the same document twice at the
466
    # same time (race condition on uniqueness constraint) -- we could retry but we don't care so
467
    # much since sidekiq will automatically fetch https://ask.libreoffice.org/upfiles/12345.png
468
    # locally and update the post accordingly
469
    # NOTE: the handful of bare links https://ask.libreoffice.org/upfiles/foobar.xyz are
470
    # untouched hence will 404
471

    
472
    # fix @-addressing; we can't match $p->{text} against /\@\w+/ here because AskBot
473
    # usernames may contain spaces... so we check for <a href="..."/> in the HTML instead
474
    my %user_ids;
475
    $user_ids{$_} = 1 foreach $p->{summary} =~ m#<a\s(?:[^<>]*\s)*?href=['"]
476
            (?i:(?:https?:)?//ask\.libreoffice\.org)? (?:/[\p{PosixAlnum}\-_]+)?
477
                /users?/([0-9]+)[/'"]#gsx;
478
    foreach my $id (keys %user_ids) {
479
        # we don't want duplicate resolution here
480
        my $profile = $ASKBOT_PROFILES{$id};
481
        unless (defined $profile) {
482
            logmsg(WARN => "#$id is not a known AskBot user, can't mangle its \@-addresses");
483
            next;
484
        }
485
        my $old_username = $profile->{username} // panic($id);
486
        my $new_username = eval { username($id) }; # might need to be created
487
        if ($@) {
488
            print STDERR $@; # already logged
489
            logmsg(SOFTFAIL => "Couldn't create user #$id");
490
            next;
491
        }
492
        $p->{text} =~ s/\@\Q$old_username\E (?! [\.\-]?[\p{Mark}_\p{Alnum}] )
493
                       /\@$new_username/gx
494
            unless $old_username eq $new_username;
495
    }
496

    
497
    # explicitely prefix AskBot base URL so we don't have to do the work twice
498
    $p->{text} =~ s#\[ ([^\[\]\(\)]+) \]\( (/[^\[\]\(\)]*) \)#[$1](https://ask.libreoffice.org$2)#gx;
499

    
500
    # replace [URL](URL) links with a bare URL when possible so Discourse can use
501
    # its own text for internal post links
502
    $p->{text} =~ s#\[ ((?i:https?:)?//[^\[\]\(\)]+) \]\( \1 \)
503
            (?! [!',\.\?]* [^\P{Graph}!"'\(\),\.<>\?\[\]\{\}] )#$1#gx;
504

    
505
    return mangle_post_internal_links($p->{id} => \$p->{text});
506
}
507

    
508
# ts2iso8601($TIMESTAMP)
509
#   Convert a timestamp to ISO 8601.
510
sub ts2iso8601($) {
511
    my $ts = 0 + shift;
512
    return POSIX::strftime("%Y-%m-%dT%H:%M:%SZ", gmtime($ts));
513
}
514

    
515
# replies(language_code => $LANG, topic_id => $ID, replies => [..], ..)
516
#   Recursively process the given replies under some topic ID.  The
517
#   parent post is indicated by the optional reply_to_post_number
518
#   attribute.
519
#   Return the number of replies that couldn't be posted, if any
520
my @NEEDS_POST_MANGLING :shared;
521
sub replies(%);
522
sub replies(%) {
523
    my %attrs = @_;
524
    my $post_type = defined $attrs{reply_to_post_number} ? "comment" : "answer";
525
    my $askbot_url = "/$attrs{language_code}/api/v1/${post_type}s/";
526

    
527
    state $sth_set_vote_count = $DBH->prepare(q{
528
        INSERT INTO post_custom_fields
529
            (post_id, name, value, created_at, updated_at)
530
            VALUES (?, 'vote_count', ?, NOW(), NOW())
531
    });
532

    
533
    my $num_errors = 0;
534
    my $indent_base = $INDENT;
535
    foreach my $id (sort {$a <=> $b} @{$attrs{replies}}) { # lowest ID first
536
        logmsg(INFO => "    " x $indent_base, "$post_type #$id");
537
        $INDENT = $indent_base + 1;
538

    
539
        my ($r, $t);
540
        eval {
541
            $r = askbot_get("$askbot_url$id/");
542
            my $author = username($r->{author}->{id});
543
            my $needs_post_mangling = mangle_post($author, $r);
544

    
545
            $t = discourse_req($author, POST => "/posts.json", {
546
                topic_id => $attrs{topic_id}
547
              , reply_to_post_number => $attrs{reply_to_post_number}
548
              , raw => $r->{text}
549
              , created_at => ts2iso8601($r->{added_at})
550
              , skip_validations => JSON::true
551
            });
552

    
553
            push @NEEDS_POST_MANGLING, $t->{id} if $needs_post_mangling;
554
        };
555
        if ($@) {
556
            print STDERR $@; # already logged
557
            logmsg(SOFTFAIL => "Couldn't post $post_type #$id");
558
            $num_errors++;
559
            next;
560
        }
561

    
562
        # no need to lock here, other threads will never store that ID
563
        $URLMAP{$r->{id}} = "/t/$t->{topic_slug}/$t->{topic_id}/$t->{post_number}";
564

    
565
        if ($post_type eq "answer") {
566
            # database surgery: set vote count on answers (skip
567
            # questions and comments as Discourse doesn't show these)
568
            if ((my $score = $r->{score}) > 0) {
569
                $sth_set_vote_count->bind_param(1, $t->{id}, SQL_INTEGER);
570
                $sth_set_vote_count->bind_param(2, $score,   SQL_INTEGER);
571
                my $r = $sth_set_vote_count->execute();
572
                panic($r) unless $r == 1;
573
            }
574
            if (defined $attrs{accepted_answer_id} and $attrs{accepted_answer_id} == $id) {
575
                # XXX rate-limiting is not configurable, need  to comment out
576
                # "limit_accepts" in plugins/discourse-solved/plugin.rb:accept
577
                logmsg(INFO => "    " x $INDENT, "accept answer #$id");
578
                $INDENT++;
579
                my $r = eval { discourse_req($attrs{accepted_by}, POST => "/solution/accept",
580
                                    { id => $t->{id} }) };
581
                if ($@) {
582
                    print STDERR $@; # already logged
583
                    logmsg(SOFTFAIL => "Couldn't accept answer #$id");
584
                } elsif (!defined $r->{success} or $r->{success} ne "OK") {
585
                    panic(JSON::->new->utf8->encode($r));
586
                }
587
                $INDENT--;
588
            }
589
        }
590

    
591
        # nested comments
592
        $num_errors += replies(
593
            language_code => $attrs{language_code}
594
          , topic_id => $t->{topic_id}
595
          , reply_to_post_number => $t->{post_number}
596
          , replies => $r->{comment_ids}
597
        ) if @{$r->{comment_ids}};
598
    }
599
    return $num_errors;
600
}
601

    
602

    
603
#######################################################################
604
# Initialisation
605

    
606
logmsg(NOTICE => "Starting $SCRIPT_FILENAME");
607
my $TAG_GROUP_LIBO_COMPONENT = "LibreOffice component";
608

    
609
# Categories
610
useragent_init();
611
{
612
    my $path = dirname($SCRIPT_FILENAME) . "/categories.json";
613
    open my $fh, "<", $path or panic("open($path): $!");
614
    my $categories = JSON::->new->utf8->decode( do { local $/ = undef; <$fh> } );
615
    close $fh or panic("close: $!");
616

    
617
    my %slugs;
618
    my $r = discourse_req("system", GET => "/categories.json");
619
    $slugs{$_->{slug}} = $_->{id} foreach @{$r->{category_list}->{categories}};
620

    
621
    foreach my $c (@$categories) {
622
        my $l = $c->{language_code} // panic();
623
        $CATEGORY_IDs{$l} = { slug => $c->{slug} };
624
        if (defined (my $id = $slugs{$c->{slug}})) {
625
            # the category already exists
626
            $CATEGORY_IDs{$l}->{id} = $id;
627
        } else {
628
            logmsg(NOTICE => "Creating category $c->{slug}");
629
            my $r = discourse_req("system", POST => "/categories.json", {
630
                name => $c->{name},
631
                slug => $c->{slug},
632
                custom_fields => { qa_enabled => JSON::true }
633
            });
634
            $CATEGORY_IDs{$l}->{id} = $r->{category}->{id} // panic();
635
        }
636
    }
637

    
638
    $r = discourse_req("system", POST => "/tag_groups.json", {
639
        tag_group => {
640
            name => $TAG_GROUP_LIBO_COMPONENT,
641
            tag_names => [ qw/common writer calc impress base draw math meta/ ],
642
            permissions => { everyone => 1 },
643
            one_per_topic => JSON::false
644
        }
645
    });
646
    panic("Couldn't create group tag") unless defined $r->{tag_group};
647

    
648
    # reorder (lexicographically) by slugs
649
    logmsg(NOTICE => "Updating and ordering categories");
650
    my (%categories_sorted, %positions);
651
    my @categories_sorted = sort { $a->{slug} cmp $b->{slug} } @$categories;
652
    foreach (my $i = 0; $i <= $#$categories; $i++) {
653
        $categories_sorted{$categories_sorted[$i]->{slug}} = $i;
654
    }
655
    my $offset = 0;
656
    $r = discourse_req("system", GET => "/categories.json");
657
    foreach my $c (@{$r->{category_list}->{categories}}) {
658
        my $s = $c->{slug} // panic();
659
        my %h;
660
        if ($s eq "uncategorized") {
661
            $h{color} = "3AB54A";
662
            $h{text_color} = "FFFFFF";
663
            $positions{$c->{id}} = 0;
664
        } elsif ($s eq "site-feedback") {
665
            $positions{$c->{id}} = scalar(@$categories) + 1;
666
        } elsif ($s eq "staff") {
667
            $positions{$c->{id}} = scalar(@$categories) + 2;
668
        } elsif ($s eq "lounge") {
669
            $positions{$c->{id}} = scalar(@$categories) + 3;
670
        } elsif (defined (my $i = $categories_sorted{$s})) {
671
            $positions{$c->{id}} = $i + 1;
672
            $h{num_featured_topics} = 1;
673
        } else {
674
            $positions{$c->{id}} = scalar(@$categories) + 4 + $offset;
675
            $offset++;
676
        }
677
        next unless %h;
678
        my $r = discourse_req("system", PUT => "/categories/$c->{id}.json", \%h);
679
        panic("Couldn't update category") unless defined $r->{success} and $r->{success} eq "OK";
680
    }
681
    $r = discourse_req("system", POST => "/categories/reorder",
682
        { mapping => JSON::->new->encode(\%positions) });
683
    panic("Couldn't reorder categories") unless defined $r->{success} and $r->{success} eq "OK";
684
}
685

    
686
# AskBot (and SSO linking information) user profiles
687
{
688
    my $path = dirname($SCRIPT_FILENAME) . "/users.json";
689
    logmsg(NOTICE => "Loading $path");
690
    open my $fh, "<", $path or panic("open($path): $!");
691
    my $profiles = JSON::->new->utf8->decode( do { local $/ = undef; <$fh> } );
692
    close $fh or panic("close: $!");
693

    
694
    # merge accounts with the same email address (into the account with lowest ID)
695
    my (%emails, %duplicates);
696
    foreach my $id (keys %$profiles) {
697
        my $p = $profiles->{$id} // panic();
698
        # `lc` is a bit YOLO but Discourse matches email addresses case-insensitively...
699
        my %e = ( lc($p->{email}) => 1 );
700
        $e{lc($_)} = 1 foreach @{$p->{sso_emails} // []};
701
        foreach (keys %e) {
702
            $emails{$_} //= [];
703
            push @{$emails{$_}}, $id;
704
        }
705
    }
706
    foreach my $e (keys %emails) {
707
        my $ids = $emails{$e} // panic();
708
        next unless $#$ids > 0;
709
        my ($id, @duplicates) = sort {$a <=> $b} @$ids;
710
        $duplicates{$id} = \@duplicates;
711
    }
712
    foreach my $id (sort {$a <=> $b} keys %duplicates) {
713
        my $dups = $duplicates{$id};
714
        my $username = $profiles->{$id}->{username} // panic();
715
        logmsg(INFO => "Merging user(s) ", join (", ", map {"#$_"} @$dups),
716
                            " with #$id ($username)");
717
        $ASKBOT_DUPLICATES{$_} = $id + 0 foreach @$dups;
718
    }
719

    
720
    my @reserved_usernames = # 'reserved_usernames' setting
721
        qw/admin moderator administrator mod sys system community info
722
           you name username user nickname discourse discourseorg
723
           discourseforum support/;
724

    
725
    my %usernames;
726
    my @profile_ids = sort {$a <=> $b} keys %$profiles;
727

    
728
    # given users #1 "FooBar" and #2 "whatever" with SSO username "foobar", #1 and #2
729
    # are respectively renamed to "foobar1" and "foobar"; we assume users not in SSO
730
    # aren't that active in the project and allow hijacking usernames for SSO users
731
    $usernames{lc($profiles->{$_}->{sso_username})} //= $ASKBOT_DUPLICATES{$_} // $_
732
        # no need to add the version without `lc` here, SSO usernames are case insensitive
733
        foreach grep { defined $profiles->{$_}->{sso_username} } @profile_ids;
734

    
735
    # AskBot doesn't have a username requirement but Discourse does (and usernames are matched
736
    # case-insensitively); we try to preserve users on a first come first served basis: given users
737
    # #1 "Foo", #2 "foo", #3 "bar", #4 "bar" then "foo" and "bar" are respectively attributed to #2
738
    # (because the case was correct) and #3 while #1 and #4 are renamed "Foo1" and "bar1"
739
    $usernames{$profiles->{$_}->{username}}     //= $ASKBOT_DUPLICATES{$_} // $_ foreach @profile_ids;
740
    $usernames{lc($profiles->{$_}->{username})} //= $ASKBOT_DUPLICATES{$_} // $_ foreach @profile_ids;
741

    
742
    # populate %ASKBOT_PROFILES
743
    foreach my $id (@profile_ids) {
744
        my $p = $profiles->{$id} // panic();
745
        my %p = ( username => $p->{username} );
746
        $id += 0; # convert to integer
747

    
748
        if (exists $ASKBOT_DUPLICATES{$id}) {
749
            # keep AskBot username for @-addressing mangling but skip other attributes
750
            $ASKBOT_PROFILES{$id} = \%p;
751
            next;
752
        }
753

    
754
        # mangle AskBot/SSO username to comply with Discourse's stricter policy
755
        # https://github.com/discourse/discourse/blob/master/app/models/username_validator.rb
756
        my $username = $p->{sso_username} // $p->{username} // panic();
757
        $username =~ s/^\@+//; # @foo -> foo
758
        $username =~ s/\@.*//; # foo@example.net -> foo
759
        $username =~ s/[^\p{Alnum}\p{Mark}._\-]//g; # UNICODE_INVALID_CHAR_PATTERN
760
        $username =~ s/^[^\p{Alnum}\p{Mark}_]+//;   # INVALID_LEADING_CHAR_PATTERN
761
        $username =~ s/[^\p{Alnum}\p{Mark}]+$//;    # INVALID_TRAILING_CHAR_PATTERN
762
        $username =~ s/([\-_.])+/$1/g;              # REPEATED_SPECIAL_CHAR_PATTERN
763
        $username =~ s/\.(js|json|css|html?|xml|jpe?g|png|gif|bmp|ico|tiff?|woff2?)$/-$1/i; # CONFUSING_EXTENSIONS
764
        $username = "unknown" if $username eq "";
765

    
766
        # comparison is case insensitive
767
        $username = "not-$username" if grep { lc($username) eq $_ } @reserved_usernames;
768

    
769
        while (($usernames{lc($username)} // $id) != $id) {
770
            # username is already taken, increment or append a number
771
            $username .= "1" unless $username =~ s/([0-9]+)$/sprintf("%0*d", length($1), $1 + 1)/e;
772
        }
773

    
774
        $usernames{lc($username)} = $id;
775
        $p{discourse_username} = $username;
776
        $p{name} = $p->{name} if defined $p->{name} and $p->{name} ne "";
777
        $p{email} = @{$p->{sso_emails} // []} ? $p->{sso_emails}->[0] : $p->{email};
778
        $ASKBOT_PROFILES{$id} = \%p;
779
        $DISCOURSE_PROFILES{$id} = shared_clone({});
780
    }
781

    
782
    # pre-existing users
783
    $DISCOURSE_PROFILES{32931}->{user_id} = 1;
784
    $DISCOURSE_PROFILES{32931}->{trust_level} = 1;
785
}
786

    
787

    
788
#######################################################################
789
# Import workers, each fetching work from a queue of questions
790

    
791
my $QUEUE = Thread::Queue::->new();
792
my ($TOPIC_lck, $CLOSE_lck) :shared;
793
sub import_questions($) {
794
    $WORKER = shift;
795
    useragent_init(); # re-init
796

    
797
    my $conf_db = $CONF->{database} // panic();
798
    my $dsn = "DBI:Pg:dbname=$conf_db->{dbname}; host=$conf_db->{host}";
799
    $DBH = DBI::->connect( $dsn, $conf_db->{user} // "", $conf_db->{password} // "", {RaiseError => 1} );
800

    
801
    my $sth_set_view_count = $DBH->prepare(q{
802
        UPDATE topics SET views = ? WHERE id = ?
803
    });
804
    my $sth_set_closed_at_by = $DBH->prepare(q{
805
        UPDATE posts SET created_at = to_timestamp(?)
806
        WHERE topic_id = ? AND post_type != 1 AND action_code = 'closed.enabled'
807
    });
808

    
809
    my $elapsed = 0;
810
    while (defined (my $q = $QUEUE->dequeue())) {
811
        logmsg(INFO => "question #$q->{id}");
812
        $INDENT = 1;
813

    
814
        my @tags = grep # skip reserved names, cf. app/models/tag.rb
815
                        { lc($_) ne "none" and lc($_) ne "constructor" }
816
                        # NOTE: discourse mangles tag names, in particular
817
                        # turns 'libreoffice-1.2.3' to 'libreoffice-123', cf.
818
                        # lib/discourse_tagging.rb:clean_tag()
819
                        @{$q->{tags}};
820

    
821
        my $lang = $q->{language_code} // panic();
822
        my $category = $CATEGORY_IDs{$lang} // panic($lang);
823
        my $t0 = Time::HiRes::gettimeofday();
824

    
825
        my ($author, $t);
826
        eval {
827
            $author = username($q->{author}->{id});
828
            my $needs_post_mangling = mangle_post($author, $q);
829

    
830
            # post topic
831
            lock($TOPIC_lck);
832
            $t = discourse_req($author, POST => "/posts.json", {
833
                title => $q->{title}
834
              , raw => $q->{text}
835
              , category => $category->{id}
836
              , tags => \@tags
837
              , created_at => ts2iso8601($q->{added_at})
838
              , skip_validations => JSON::true
839
            });
840
            push @NEEDS_POST_MANGLING, $t->{id} if $needs_post_mangling;
841
        };
842
        if ($@) {
843
            print STDERR $@; # already logged
844
            logmsg(SOFTFAIL => "Couldn't post question #$q->{id}");
845
            next;
846
        }
847

    
848
        my $topic_id = $t->{topic_id} // panic();
849
        my $topic_url = "/t/$t->{topic_slug}/$topic_id";
850
        my $num_errors = 0;
851

    
852
        # no need to lock here, other threads will never store that ID
853
        $URLMAP{$q->{id}} = $topic_url;
854

    
855
        # (recursively) process replies
856
        $num_errors += replies(
857
            language_code => $lang
858
          , topic_id => $topic_id
859
          , reply_to_post_number => $t->{post_number}
860
          , replies => $q->{comment_ids}
861
        ) if @{$q->{comment_ids}};
862

    
863
        $num_errors += replies(
864
            language_code => $lang
865
          , topic_id => $topic_id
866
            # answers have null has reply_to_post_number attribute, see
867
            # https://github.com/paviliondev/discourse-question-answer/blob/master/extensions/topic_extension.rb#L16
868
          , reply_to_post_number => undef
869
          , replies => $q->{answer_ids}
870
          , accepted_answer_id => $q->{accepted_answer_id}
871
          , accepted_by => $author # assume accepted by OP
872
        ) if @{$q->{answer_ids}};
873

    
874
        if ($q->{closed} == JSON::true) {
875
            logmsg(INFO => "    " x $INDENT, "close question #$q->{id}");
876
            $INDENT++;
877
            my $id = $q->{closed_by}->{id} // panic();
878
            my $username = eval { username($id) };
879
            if ($@) {
880
                print STDERR $@; # already logged
881
                logmsg(SOFTFAIL => "Couldn't create user #$id");
882
            } else {
883
                {
884
                    my $p = $DISCOURSE_PROFILES{ $ASKBOT_DUPLICATES{$id} // $id } // panic();
885
                    lock(%$p); # only need to lock that user's profile
886
                    if ((my $tl = $p->{trust_level}) < 4) {
887
                        logmsg(DEBUG => "Temporarily promoting $username to TL4");
888
                        discourse_req("system", PUT => "/admin/users/$p->{user_id}/trust_level",
889
                            { level => 4 });
890
                        $p->{trust_level} = 4;
891
                        $p->{real_trust_level} = $tl;
892
                    }
893
                }
894
                lock($CLOSE_lck);
895
                my $r = discourse_req($username, PUT => "/t/$topic_id/status.json",
896
                            { status=> "closed", enabled => "true" });
897
                panic(JSON::->new->utf8->encode($r)) unless defined $r->{success} and $r->{success} eq "OK";
898
            }
899

    
900
            # database surgery: set closing date from AskBot
901
            $sth_set_closed_at_by->bind_param(1, $q->{closed_at}, SQL_INTEGER);
902
            $sth_set_closed_at_by->bind_param(2, $topic_id,       SQL_INTEGER);
903
            my $r = $sth_set_closed_at_by->execute();
904
            panic($r) unless $r == 1;
905
            $INDENT--;
906
        }
907

    
908
        if ((my $view_count = $q->{view_count}) > 0) {
909
            # database surgery: set view count on the topic
910
            $sth_set_view_count->bind_param(1, $view_count, SQL_INTEGER);
911
            $sth_set_view_count->bind_param(2, $topic_id,   SQL_INTEGER);
912
            my $r = $sth_set_view_count->execute();
913
            panic($r) unless $r == 1;
914
        }
915

    
916
        logmsg(WARN => "$topic_url (AskBot question #$q->{id}) has >=$num_errors missing replies!")
917
            if $num_errors > 0;
918
        $elapsed += Time::HiRes::gettimeofday() - $t0;
919
    }
920

    
921
    logmsg(INFO => sprintf("Worker stats: total active time %.2fs", $elapsed));
922
    logmsg(INFO => sprintf("AskBot stats: %d requests, %.2fs total", $ASKBOT_COUNT, $ASKBOT_TIME));
923
    logmsg(INFO => sprintf("Discourse stats: %d requests, %.2fs total", $DISCOURSE_COUNT, $DISCOURSE_TIME));
924
}
925

    
926
logmsg(NOTICE => "Spawning $NUM_WORKERS workers");
927
$QUEUE->limit = 4 * $NUM_WORKERS; # no need to prefetch pages if the workers can't keep up
928
threads->create(\&import_questions, $_) for 1 .. $NUM_WORKERS;
929

    
930

    
931
#######################################################################
932
# Queue up questions
933
# we process them ordered by ID so lower question IDs get lower topic IDs
934

    
935
#{
936
#    my %question_ids = (
937
#        161330 => "en" # @-addressing with old username
938
#      , 238673 => "en" # closed
939
#      , 302629 => "en" # upvote, accepted answer
940
#      , 303050 => "en" # screenshots, attachments (both with C:\fakepath\ and without)
941
#      , 285734 => "ja" # URL to profile, unicode attachements
942
#      , 161228 => "en" # question linked in the above, OneBox
943
#      , 304617 => "en" # closed by someone with TL < 4
944
#      , 161334 => "en" # Hello1 -> Hello2
945
#      # duplicate accounts
946
#      , 18673 => "en", 1505 => "en"
947
#      , 25329 => "pt-br", 128011 => "pt-br"
948
#    );
949
#
950
#    foreach my $id (sort {$a <=> $b} keys %question_ids) {
951
#        my $lang = $question_ids{$id};
952
#        my $url = "/$lang/api/v1/questions/$id/";
953
#        my $q = askbot_get($url);
954
#        $q->{language_code} = $lang;
955
#        $QUEUE->enqueue($q);
956
#    }
957
#}
958

    
959
my $PAGE_NUM = 0; # number of most recent pages to process (0 means ALL)
960
my %PAGES;
961
foreach my $lang (keys %CATEGORY_IDs) {
962
    my $url = "/$lang/api/v1/questions/";
963

    
964
    my $offset = 1; # index of the first page to consider
965
    unless ($PAGE_NUM == 0) {
966
        my $h = askbot_get($url);
967
        $offset = $h->{pages} - $PAGE_NUM + 1 if $PAGE_NUM < $h->{pages};
968
    }
969

    
970
    my $h = askbot_get("$url?sort=age-asc&page=$offset");
971
    my @questions = sort { $a->{id} <=> $b->{id} } @{$h->{questions}}; # should be sorted already
972
    logmsg(NOTICE => "Loaded /$lang/questions page $offset/$h->{pages}");
973

    
974
    # buffer the page
975
    $PAGES{$lang} = {
976
        current_page => $offset
977
      , pages => $h->{pages}
978
      , questions => \@questions
979
      , url => $url
980
    } if @questions;
981
}
982

    
983
#my $COUNT = 1000;
984
MAINLOOP: while (%PAGES) {
985
    # find language code with the lowest question ID
986
    my ($alang, $blang) = sort { $PAGES{$a}->{questions}->[0]->{id} <=> $PAGES{$b}->{questions}->[0]->{id} }
987
                               keys %PAGES;
988
    my $p = $PAGES{$alang};
989

    
990
    # process questions from $alang that have a lower ID than any other language
991
    # (ie, until we reach the first question from $blang)
992
    my $questions = $p->{questions};
993
    my $bound = defined $blang ? $PAGES{$blang}->{questions}->[0]->{id} : -1;
994
    while ($bound < 0 or $questions->[0]->{id} < $bound) {
995
        my $q = shift @$questions;
996
        $q->{language_code} = $alang;
997
        $QUEUE->enqueue($q);
998
        #last MAINLOOP unless --$COUNT > 0;
999
        next if @$questions; # $alang has some questions buffered still
1000

    
1001
        unless ($p->{current_page} < $p->{pages}) {
1002
            logmsg(NOTICE => "Done queuing /$alang/questions");
1003
            delete $PAGES{$alang};
1004
            last;
1005
        }
1006

    
1007
        # $alang's highest buffered question ID was lower that $blang's lowest, need to load another page
1008
        my $offset = ++$p->{current_page};
1009
        my $h = askbot_get($p->{url} . "?sort=age-asc&page=$offset");
1010
        logmsg(NOTICE => "Loaded /$alang/questions page $offset/$h->{pages}");
1011
        $p->{pages} = $h->{pages}; # update page count, there might be new posts
1012

    
1013
        @$questions = sort { $a->{id} <=> $b->{id} } @{$h->{questions}}; # should be sorted already
1014
    }
1015
}
1016

    
1017

    
1018
#######################################################################
1019
# Done!
1020

    
1021
$QUEUE->end();
1022
$_->join() foreach threads->list();
1023
logmsg(NOTICE => "Done importing");
1024

    
1025
logmsg(NOTICE => "Updating categories");
1026
foreach my $c (values %CATEGORY_IDs) {
1027
    my $r = discourse_req("system", PUT => "/categories/$c->{id}.json", {
1028
        num_featured_topics => 3 # restore default
1029
      , email_in => "question+$c->{slug}\@ask.libreoffice.org"
1030
      , required_tag_group_name => $TAG_GROUP_LIBO_COMPONENT
1031
      , min_tags_from_required_group => 1
1032
      , allow_global_tags => "true"
1033
    });
1034
    panic("Couldn't update category") unless defined $r->{success} and $r->{success} eq "OK";
1035
}
1036

    
1037
{
1038
    # generate an AskBot question/answer/comment ID -> Discourse path for nginx rewriting
1039
    my $pathmap = dirname($SCRIPT_FILENAME) . "/nginx.map";
1040
    if (open my $fh, ">", $pathmap) {
1041
        printf $fh "%-6d %s;\n", $_, $URLMAP{$_} foreach sort {$a <=> $b} keys %URLMAP;
1042
    } else {
1043
        logmsg(WARN => "Couldn't open $pathmap for writing");
1044
    }
1045
}
1046

    
1047

    
1048
#######################################################################
1049
# Post processing
1050

    
1051
sub post_process($) {
1052
    $WORKER = shift;
1053
    while (defined (my $x = $QUEUE->dequeue())) {
1054
        my $action = $x->{action} // panic();
1055
        if ($action eq "mangle_post") {
1056
            # try again to mangle internal links now that we have the full URL map
1057
            my $id = $x->{id} // panic();
1058
            logmsg(INFO => "Post-mangling post #$id");
1059
            my $p = discourse_req("system", GET => "/posts/$id.json");
1060

    
1061
            my $modified = mangle_post_internal_links($id => \$p->{raw}, 1);
1062
            unless ($modified) {
1063
                logmsg(INFO => "Not updating unchanged post #$id");
1064
                next;
1065
            }
1066
            discourse_req("system", PUT => "/posts/$id.json", {
1067
                raw => $p->{raw}
1068
              , edit_reason => "Fix internal links (post AskBot migration)."
1069
              # XXX the below requires patching app/controllers/posts_controller.rb
1070
              , skip_validations => JSON::true
1071
              , no_bump => JSON::true
1072
            });
1073
        }
1074
        elsif ($action eq "trust_level") {
1075
            # reset TL back to its original value (before we raised it to close the issue)
1076
            my $id = $x->{id} // panic();
1077
            my $p = $DISCOURSE_PROFILES{$id} // panic($id); # never a duplicate
1078
            my $username = $ASKBOT_PROFILES{$id}->{discourse_username} // panic($id);
1079
            my $level = $p->{real_trust_level} // panic();
1080
            logmsg(INFO => "Demoting $username back to TL$level");
1081
            discourse_req("system", PUT => "/admin/users/$p->{user_id}/trust_level", { level => $level });
1082
        }
1083
        else {
1084
            panic($action);
1085
        }
1086
    }
1087
}
1088

    
1089
my @FIX_TRUST_LEVELS = grep { defined $DISCOURSE_PROFILES{$_}->{real_trust_level} }
1090
                            keys %DISCOURSE_PROFILES;
1091
if (@NEEDS_POST_MANGLING or @FIX_TRUST_LEVELS) {
1092
    logmsg(NOTICE => "Starting post-processing");
1093
    $QUEUE = Thread::Queue::->new();
1094
    threads->create(\&post_process, $_) for 1 .. $NUM_WORKERS;
1095

    
1096
    $QUEUE->enqueue({ id => $_, action => "mangle_post" }) foreach @NEEDS_POST_MANGLING;
1097
    $QUEUE->enqueue({ id => $_, action => "trust_level" }) foreach @FIX_TRUST_LEVELS;
1098
    $QUEUE->end();
1099
    $_->join() foreach threads->list();
1100
}
1101

    
1102

    
1103
#######################################################################
1104
logmsg(NOTICE => "Finished!");
(4-4/6)