Project

General

Profile

Task #2952 » migrate.pl

migration script - Guilhem Moulin, 2021-07-31 01:46

 
#!/usr/bin/perl -T

use v5.10;
use strict;
use warnings;

use threads;
use threads::shared;
use Thread::Queue ();

BEGIN {
# JSON::XS is not thread-safe...
$ENV{PERL_JSON_BACKEND} = "Cpanel::JSON::XS";
}

use Config::Tiny ();
use Crypt::URandom qw/urandom/;
use DBI qw/:sql_types/;
use Encode ();
use File::Basename qw/basename dirname/;
use JSON ();
use LWP::MediaTypes qw/guess_media_type/;
use LWP::UserAgent ();
use POSIX ();
use URI::Escape qw/uri_escape_utf8/;
use Time::HiRes qw/usleep/;

my $SCRIPT_FILENAME = $0 =~ /\A(\p{Graph}+)\z/ ? $1 : die;
my $CONF = Config::Tiny::->read("$SCRIPT_FILENAME.conf") or die Config::Tiny::->errstr()."\n";
my ($DBH, $ASKBOT, $DISCOURSE, $WORKER, $INDENT);
my ($DISCOURSE_TIME, $DISCOURSE_COUNT, $ASKBOT_TIME, $ASKBOT_COUNT);
my $NUM_WORKERS = 8;

#######################################################################

# Logging
open my $LOG, ">", "$SCRIPT_FILENAME.log" or die "open($SCRIPT_FILENAME.log): $!";
sub logmsg($@) {
my $level = shift;
my @msg = @_;
$_ = Encode::encode("UTF-8", $_) foreach @msg;
my $level2 = $level eq "SOFTFAIL" ? "ERROR" : $level;
unshift @msg, $level2, " ";
unshift @msg, sprintf("[%0*d] ", length($NUM_WORKERS), $WORKER) if defined $WORKER;
push @msg, "\n";
$LOG->printflush( POSIX::strftime("%b %d %H:%M:%S ", localtime), @msg );
if ($level eq "WARN" or $level eq "NOTICE" or $level eq "SOFTFAIL") {
print STDERR @msg;
} elsif ($level eq "PANIC" or $level eq "ERROR") {
die @msg;
}
}
sub fail(@) {
return logmsg(ERROR => @_);
}
sub panic(@) {
my @msg = @_;
my @loc = caller;
push @msg, " " if @msg;
push @msg, "at line $loc[2] in $loc[1]";
return logmsg(PANIC => @msg);
}

# useragent_init()
# (Re-)init user agent, incl. connection cache.
sub useragent_init() {
$ASKBOT = LWP::UserAgent::->new(
keep_alive => 1,
ssl_opts => { verify_hostname => 1, SSL_hostname => $CONF->{askbot}->{host} },
default_headers => HTTP::Headers::->new()
);
$DISCOURSE = LWP::UserAgent::->new(
keep_alive => 1,
ssl_opts => { verify_hostname => 1 },
default_headers => HTTP::Headers::->new(
"Accept" => "application/json",
"Api-Key" => $CONF->{discourse}->{apikey}
)
);
($DISCOURSE_TIME, $DISCOURSE_COUNT, $ASKBOT_TIME, $ASKBOT_COUNT) = (0,0,0,0);
}

# decode_json_response($AGENT => $RESPONSE)
# Decode an application/json HTTP response. If a 5XY HTTP response
# was received, try again to send the request with a 1min timeout.
sub decode_json_response($$;$);
sub decode_json_response($$;$) {
my ($ua, $resp, $timeout) = @_;

if ($resp->is_success) {
my $data = $resp->decoded_content();
return JSON::->new->utf8->decode($data);
}

my $code = $resp->code();
my $req = $resp->request();

# random delay between 0.75 and 1.25 seconds to help solving race conditions
# between threads
$timeout = defined $timeout ? ($timeout * 2) : (int(rand(500_000)) + 750_000);
if ($code >= 500 and $code <= 599 and $timeout <= 320_000_000) {
# upon 5XY response codes, retry up to 5 minutes
logmsg(INFO => "Received $code from <", $req->uri(), ">; ",
"will retry in ", int($timeout/1000), "ms");
usleep($timeout);
return decode_json_response($ua => $ua->request($req), $timeout);
}

my @error = ($resp->status_line(), " ", $req->method(), " <", $req->uri(), ">");
my $content_type = $resp->content_type();
if ($content_type eq "application/json") {
my $content = $resp->decoded_content();
chomp $content;
push @error, " ", $content;
}
fail(@error);
}

# askbot_get($URL)
# Send a request to the AskBot API and decode the JSON response.
sub askbot_get($) {
my $url = $CONF->{askbot}->{apiurl} . shift;
my $t0 = Time::HiRes::gettimeofday();
my $resp = $ASKBOT->get($url, Host => $CONF->{askbot}->{host}, Accept => "application/json");
$ASKBOT_TIME += Time::HiRes::gettimeofday() - $t0;
$ASKBOT_COUNT++;
return decode_json_response($ASKBOT => $resp);
}

# discourse_req($USERNAME, $METHOD, $PATH, [$BODY])
# Send a request to the Discourse API and decode the JSON response.
sub discourse_req($$$;$) {
my ($username, $method, $path, $body) = @_;
my $req = HTTP::Request::->new( $method => $CONF->{discourse}->{apiurl} . $path );
$req->header( Host => $CONF->{discourse}->{host} );
# the API doesn't mention case for usernames, but seems we need it
# for some Unicode usernames...
$req->header( "Api-Username" => Encode::encode("UTF-8", lc($username)) );
if (defined $body) {
$req->content_type("application/json");
$req->content(JSON::->new->utf8->encode($body));
}
my $t0 = Time::HiRes::gettimeofday();
my $resp = $DISCOURSE->request($req);
$DISCOURSE_TIME += Time::HiRes::gettimeofday() - $t0;
$DISCOURSE_COUNT++;
return decode_json_response($DISCOURSE => $resp);
}

# upload($USERNAME, $PATH, type => $TYPE, ...)
# Download $PATH from AskBot and re-upload to Discourse; returns the
# upload response from the Discourse API, cf.
# https://docs.discourse.org/#tag/Uploads/paths/~1uploads.json/post
#
# If the download yields 404 then returns undef instead.
sub upload($$%) {
my $username = shift;
my $path = shift;
my %attrs = @_;

my $filename = delete $attrs{filename} // basename($path);
my $content_type = guess_media_type($filename);

logmsg(INFO => " " x $INDENT, "upload $path ($filename)");
panic($path) unless $path =~ /\A\//;

my $uri = $CONF->{askbot}->{apiurl} . $path;
my $t0 = Time::HiRes::gettimeofday();
my $resp = $ASKBOT->get($uri, Host => $CONF->{askbot}->{host}) or panic();
$ASKBOT_TIME += Time::HiRes::gettimeofday() - $t0;
$ASKBOT_COUNT++;
unless ($resp->is_success) {
my @error = ($resp->status_line(), " <", $uri, ">");
if ($resp->code() == 404) {
logmsg(WARN => @error);
return;
} else {
panic(@error);
}
}

# RFC 1867, using HTTP::Request::Common's form_ref
my @form_data = ( undef, Encode::encode("UTF-8", $filename) );
push @form_data, "Content-type" => $content_type
, Content => $resp->decoded_content();

my $t1 = Time::HiRes::gettimeofday();
$resp = $DISCOURSE->post( $CONF->{discourse}->{apiurl} . "/uploads.json",
"Api-Username" => Encode::encode("UTF-8", lc($username)),
Host => $CONF->{discourse}->{host},
Content_Type => "form-data",
Content => [ %attrs, synchronous => "true", "files[]" => \@form_data ]
);
$DISCOURSE_TIME += Time::HiRes::gettimeofday() - $t1;
$DISCOURSE_COUNT++;

return decode_json_response($DISCOURSE => $resp);
}

# username($ID)
# Returns the Discourse username associated with the AskBot user ID.
# The Discourse user is created if it doesn't already exist.
my %ASKBOT_PROFILES; # pre-existing AskBot profiles, incl. SSO linking information
my %ASKBOT_DUPLICATES;
my %DISCOURSE_PROFILES; # map AskBot user ID -> Discourse profile
sub username($);
sub username($) {
my $orig_id = shift;
my $id = $ASKBOT_DUPLICATES{$orig_id} // $orig_id;

state $sth_user_lookup = $DBH->prepare(q{
SELECT username FROM users WHERE id = ?
});

my $profile = $ASKBOT_PROFILES{$id} // fail("Can't find user #$id in users.json");
my $username = $profile->{discourse_username} // panic($id);

my ($u, $user_id);
{
my $p = $DISCOURSE_PROFILES{$id} // panic($id);
lock(%$p); # only need to lock that user's profile
return $username if exists $p->{user_id}; # already created

# user API is not localized, let's stick to english
$u = askbot_get("/en/api/v1/users/$id/");

my $msg = $profile->{username};
$msg .= " -> $username" if $username ne $profile->{username};
$msg .= ", duplicate #$orig_id" if $id != $orig_id;
logmsg(INFO => " " x $INDENT, "user #$id ($msg)");

my $email = $profile->{email} // panic();
if ($email =~ /\@(?:mailinator|trashmail)\.[a-z]+$/i
or $email !~ /\@[0-9a-z\-]+(?:\.[0-9a-z\-]+)+$/i) {
logmsg(WARN => "user #$id ($username) has invalid email address <$email>");
# create the account for content attribution but lock them out
$email = "noreply+$username\@documentfoundation.org";
}
my $r = discourse_req("system", POST => "/users.json", {
username => $username
, password => unpack("H*", urandom(32)) # XXX required even with SSO
, name => $profile->{name} // ""
, email => $email
, active => JSON::true
, approved => JSON::true
});
panic(JSON::->new->utf8->encode($r)) unless defined $r->{success} and $r->{success} == JSON::true;
$user_id = $r->{user_id} // panic($id);

$sth_user_lookup->bind_param(1, $user_id, SQL_INTEGER);
$sth_user_lookup->execute();
my ($username2) = $sth_user_lookup->fetchrow();
$sth_user_lookup->finish();
unless (defined $username2 and $username2 eq $username) {
# there is a race condition causing PostgreSQL to sometimes abort the transaction:
# Discourse reports the user as created but any attempt to use it as Api-Username fails
logmsg(SOFTFAIL => "Couldn't create user #$orig_id ($profile->{username}), retrying...");
return username($orig_id);
}

# assign a Trust Level to the newly created account, depending on the users's activity on
# AskBot, in order to bootstrap the site
my $tl = ($u->{reputation} >= 1000 and time - $u->{last_seen_at} <= 31557600) ? 4
: ($u->{reputation} >= 250 and $u->{answers} >= 100 and time - $u->{last_seen_at} <= 31557600) ? 3
: ($u->{answers} + $u->{comments} >= 100) ? 2
: ($u->{questions} + $u->{answers} + $u->{comments} >= 5) ? 1
: 0;
# leave non-returning (less than 10 between creation and last seen date) users at TL0
$tl = 0 if $u->{last_seen_at} - $u->{joined_at} < 864000;
discourse_req("system", PUT => "/admin/users/$user_id/trust_level", { level => $tl })
unless $tl == 0;

$p->{user_id} = $user_id;
$p->{trust_level} = $tl;
} # account was created and TL assigned, we can release the lock on $DISCOURSE_PROFILES{$id}
$INDENT++;

# preserve avatar
my %avatar_upload;
my $username_escaped = uri_escape_utf8(lc($username));
if ($u->{avatar} =~ m#^(?:(?:https?:)?//www\.gravatar\.com/avatar)?/#) {
my $path = "/user_avatar/$username_escaped/refresh_gravatar.json";
my $r = discourse_req($username, POST => $path, { username => $username });
$avatar_upload{upload_id} = $r->{gravatar_upload_id};
$avatar_upload{type} = "gravatar";
} elsif ($u->{avatar} =~ s,^(?:(?:https?:)?//ask\.libreoffice\.org)?(/[^/?#][^?#]*).*,$1,i) {
unless ($u->{avatar} eq "/m/default/media/images/nophoto.png") {
$u->{avatar} =~ s#^(/upfiles/avatars/[^/]+)/resized/[0-9]+/([^/]+)$#$1/$2#;
my $r = eval { upload($username, $u->{avatar}, type => "avatar", user_id => $user_id) };
if ($@) {
print STDERR $@; # already logged
} else {
$avatar_upload{upload_id} = $r->{id};
$avatar_upload{type} = "uploaded";
}
}
} else {
logmsg(WARN => "$username: unknown avatar URL $u->{avatar}");
}

if (%avatar_upload) {
my $errstr = "$username: Couldn't set avatar ($avatar_upload{type})";
if (defined $avatar_upload{upload_id}) {
my $path = "/u/$username_escaped/preferences/avatar/pick.json";
my $r = eval { discourse_req($username, PUT => $path, \%avatar_upload) };
unless (!$@ and defined $r->{success} and $r->{success} eq "OK") {
print STDERR $@ if $@; # already logged
$errstr .= ": " . $r->{message} if defined $r and defined $r->{message};
logmsg(SOFTFAIL => $errstr);
}
} else {
logmsg(WARN => $errstr);
}
}

# database surgery: set created and last seen dates from AskBot
state $sth_user_last = $DBH->prepare(q{
UPDATE users SET created_at = to_timestamp(?), last_seen_at = to_timestamp(?)
WHERE id = ?
});
$sth_user_last->bind_param(1, $u->{joined_at}, SQL_INTEGER);
$sth_user_last->bind_param(2, $u->{last_seen_at}, SQL_INTEGER);
$sth_user_last->bind_param(3, $user_id, SQL_INTEGER);
my $r = $sth_user_last->execute();
panic($r) unless $r == 1;

$INDENT--;
return $username;
}

# mangle_post_upload($USERNAME, $TEXT => $PATH, $INLINE)
# Return a Markdown string with given $TEXT and AskBot source link
# (attachment or screeshot). The source is downloaded and re-uploaded
# to Discourse.
sub mangle_post_upload($$$$) {
my ($username, $text, $path, $inline) = @_;
my $filename;
unless ($inline) {
# fix ugly C:\fakepath\something cf. https://www.mediawiki.org/wiki/Special:Code/MediaWiki/83225
$text =~ s/\AC:\\fakepath\\([^\\]+)\z/$1/;
my $ext = $path =~ /[^.\/\\]\.([a-zA-Z](?:-?[a-zA-Z0-9])*)\z/ ? $1 : undef;
$filename = !(defined $ext and $text =~ /[^.\/\\]\.\Q$ext\E\z/i) ? basename($path)
: $text =~ /\A[A-Z]:\\.*\\(.+)\z/ ? basename($1)
: basename($text);
$filename =~ tr[\t\n\f\r ][ ]s;
}
my $u = eval { upload($username, $path, type => "composer", filename => $filename) };
unless (!$@ and defined $u) {
# 404 or fatal error, can't do better
print STDERR $@ if $@; # already logged
logmsg(SOFTFAIL => "Couldn't re-upload $path");
return;
}
return $inline ? "![$text]($u->{short_url})" : "[$text|attachment]($u->{short_url})";
}

# mangle_post_internal_link($ID, $LINK, $FINAL, $REF)
# Return appropriate replacement for AskBot internal $LINK in post $ID.
# The 3rd argument is a boolean indicating whether the import is
# ongoing or if we're at the final phase.
# The 4rd argument is a reference to a variable that is is set to 1 if
# $FINAL is false and the mangling couldn't proceeed because the
# AskBot question hasn't been processed yet, or if $FINAL is true and
# a suitable substitute was found.
my %URLMAP :shared; # map AskBot post ID -> Discourse URL
my %CATEGORY_IDs; # map language code -> Discourse category
sub mangle_post_internal_link($$$$) {
my ($post_id, $link, $final, $ref_rv) = @_;

my $uri = $link;
my $hash = $uri =~ s/\#(.*)// ? $1 : "";
my $query = $uri =~ s/\?(.*)// ? $1 : "";
my $lang = $uri =~ m,^ /(?:question|user)s? (?:/|$),x ? "en"
: $uri =~ s,^/([\p{PosixAlnum}\-_]+)(/(?:question|user)s? (?:/|$)),$2,x ? $1
: $uri =~ s,^/([\p{PosixAlnum}\-_]+) /?$,/questions/,x ? $1
: undef;

my $newlink;
unless (defined $lang) {
return unless $uri eq "" or $uri eq "/"; # can't do better
return if $CONF->{discourse}->{baseurl} eq "https://ask.libreoffice.org" and $query eq "" and $hash eq "";
$newlink = "/"; # site root (ignore query and hash)
} elsif ($uri =~ s#^/users?/([0-9]+)(?:/|$)##) {
# don't create accounts during the final phase
return if $final;
my $username = eval { username($1) };
if ($@) {
print STDERR $@; # already logged
logmsg(SOFTFAIL => "Couldn't create user #$1");
return;
}
$newlink = "/u/" . uri_escape_utf8(lc($username)); # profile URL
} elsif ($uri =~ s#^/questions?/([0-9]+)(?:/|$)##) {
my $id = $1;
my $old_post_id =
$hash =~ /^post-id-([0-9]+)$/ ? $1
: $query =~ /(?:^|&)(?:answer|comment)=([0-9]+)(?:&|$)/ ? $1
: $id;
unless (defined ($newlink = $URLMAP{$old_post_id})) {
if (!$final) {
$$ref_rv = 1; # needs post mangling
} else {
logmsg(WARN => "#$post_id: Can't mangle <$link> (missing #$old_post_id)");
}
return;
}
} elsif ($uri =~ m#^/questions?/?$#) {
my $c = $CATEGORY_IDs{$lang} // return;
$newlink = "/c/$c->{slug}/$c->{id}";
} elsif ($uri =~ m#^/questions?/ask/?$#) {
my $c = $CATEGORY_IDs{$lang} // return;
$newlink = "/new-topic?category=$c->{slug}";
} elsif ($uri =~ m#^/users?/?$#) {
$newlink = "/u";
} else {
return;
}
$$ref_rv = 1 if $final; # modified
logmsg(DEBUG => "#$post_id: Mangling <$link> -> <$newlink>");
return $CONF->{discourse}->{baseurl} . $newlink;
}

# mangle_post_internal_links($ID, $REF_TEXT, [$FINAL])
# Mangle all internal question/answer/comment/user links. The 3rd
# argument is a boolean indicating whether the import is ongoing or if
# we're at the final phase.
# Return a boolean, which when $FINAL is false indicates whether the
# post needs post-mangling during the final phase, and when $FINAL is
# true indicates whether the input text was mangled.
sub mangle_post_internal_links($$;$) {
my ($id, $text, $final) = @_;
my $rv = 0;

$$text =~ s#\b (?i:(?:https?:)//ask\.libreoffice\.org)
( / [^\P{Graph}!"'\(\)<>\[\]\`\{\}]+? )
(?! [^\P{Graph}!"'\(\),\.<>\?\[\]\`\{\}] | [!',\.\?]\p{Graph} )
# mangle_post_internal_link($id, $1, $final => \$rv) // $& #egx;

if ($final or !$rv) {
# note: the regexp is intentionally loose
logmsg(WARN => "Post #$id contains dangling internal link <$_>") foreach
$$text =~ m#\b(?i:(?:https?:)//ask\.libreoffice\.org)/(?![ctu]/)\S+#g;
}

return $rv;
}

# mangle_post($AUTHOR, $POST)
# Mangle $POST->{text} in place:
# * fix attachments and screenshots;
# * fix link to other profiles, "normal" links, bare URLs, and @-addressing
# (the target profile is created if it doesn't exist); and
# * fix links to questions, answers and comments.
# Return a boolean indicating whether the post needs further mangling
# during the final phase.
sub mangle_post($$) {
my ($author, $p) = @_;

# fix attachments and screenshots (an empty link text is allowed for screenshots, and we also
# allow it for atttachment since we append "|attachment")
$p->{text} =~ s#!\[ ([^\[\]\(\)]*) \]\( (?i:(?:https?:)?//ask\.libreoffice\.org)? (/m/[\p{PosixAlnum}\-_]+/media/images/[^\[\]\(\)]+) \)#
mangle_post_upload("system", $1 => $2, 1) // $& #egx; # NOTE: discourse does deduplication
$p->{text} =~ s#(!)?\[ ([^\[\]\(\)]*) \]\( (?i:(?:https?:)?//ask\.libreoffice\.org)? (/upfiles/[^\[\]\(\)]+) \)#
mangle_post_upload($author, $2 => $3, $1) // $& #egx;
# NOTE: mangle_post_upload() can fail if it tries to upload the same document twice at the
# same time (race condition on uniqueness constraint) -- we could retry but we don't care so
# much since sidekiq will automatically fetch https://ask.libreoffice.org/upfiles/12345.png
# locally and update the post accordingly
# NOTE: the handful of bare links https://ask.libreoffice.org/upfiles/foobar.xyz are
# untouched hence will 404

# fix @-addressing; we can't match $p->{text} against /\@\w+/ here because AskBot
# usernames may contain spaces... so we check for <a href="..."/> in the HTML instead
my %user_ids;
$user_ids{$_} = 1 foreach $p->{summary} =~ m#<a\s(?:[^<>]*\s)*?href=['"]
(?i:(?:https?:)?//ask\.libreoffice\.org)? (?:/[\p{PosixAlnum}\-_]+)?
/users?/([0-9]+)[/'"]#gsx;
foreach my $id (keys %user_ids) {
# we don't want duplicate resolution here
my $profile = $ASKBOT_PROFILES{$id};
unless (defined $profile) {
logmsg(WARN => "#$id is not a known AskBot user, can't mangle its \@-addresses");
next;
}
my $old_username = $profile->{username} // panic($id);
my $new_username = eval { username($id) }; # might need to be created
if ($@) {
print STDERR $@; # already logged
logmsg(SOFTFAIL => "Couldn't create user #$id");
next;
}
$p->{text} =~ s/\@\Q$old_username\E (?! [\.\-]?[\p{Mark}_\p{Alnum}] )
/\@$new_username/gx
unless $old_username eq $new_username;
}

# explicitely prefix AskBot base URL so we don't have to do the work twice
$p->{text} =~ s#\[ ([^\[\]\(\)]+) \]\( (/[^\[\]\(\)]*) \)#[$1](https://ask.libreoffice.org$2)#gx;

# replace [URL](URL) links with a bare URL when possible so Discourse can use
# its own text for internal post links
$p->{text} =~ s#\[ ((?i:https?:)?//[^\[\]\(\)]+) \]\( \1 \)
(?! [!',\.\?]* [^\P{Graph}!"'\(\),\.<>\?\[\]\{\}] )#$1#gx;

return mangle_post_internal_links($p->{id} => \$p->{text});
}

# ts2iso8601($TIMESTAMP)
# Convert a timestamp to ISO 8601.
sub ts2iso8601($) {
my $ts = 0 + shift;
return POSIX::strftime("%Y-%m-%dT%H:%M:%SZ", gmtime($ts));
}

# replies(language_code => $LANG, topic_id => $ID, replies => [..], ..)
# Recursively process the given replies under some topic ID. The
# parent post is indicated by the optional reply_to_post_number
# attribute.
# Return the number of replies that couldn't be posted, if any
my @NEEDS_POST_MANGLING :shared;
sub replies(%);
sub replies(%) {
my %attrs = @_;
my $post_type = defined $attrs{reply_to_post_number} ? "comment" : "answer";
my $askbot_url = "/$attrs{language_code}/api/v1/${post_type}s/";

state $sth_set_vote_count = $DBH->prepare(q{
INSERT INTO post_custom_fields
(post_id, name, value, created_at, updated_at)
VALUES (?, 'vote_count', ?, NOW(), NOW())
});

my $num_errors = 0;
my $indent_base = $INDENT;
foreach my $id (sort {$a <=> $b} @{$attrs{replies}}) { # lowest ID first
logmsg(INFO => " " x $indent_base, "$post_type #$id");
$INDENT = $indent_base + 1;

my ($r, $t);
eval {
$r = askbot_get("$askbot_url$id/");
my $author = username($r->{author}->{id});
my $needs_post_mangling = mangle_post($author, $r);

$t = discourse_req($author, POST => "/posts.json", {
topic_id => $attrs{topic_id}
, reply_to_post_number => $attrs{reply_to_post_number}
, raw => $r->{text}
, created_at => ts2iso8601($r->{added_at})
, skip_validations => JSON::true
});

push @NEEDS_POST_MANGLING, $t->{id} if $needs_post_mangling;
};
if ($@) {
print STDERR $@; # already logged
logmsg(SOFTFAIL => "Couldn't post $post_type #$id");
$num_errors++;
next;
}

# no need to lock here, other threads will never store that ID
$URLMAP{$r->{id}} = "/t/$t->{topic_slug}/$t->{topic_id}/$t->{post_number}";

if ($post_type eq "answer") {
# database surgery: set vote count on answers (skip
# questions and comments as Discourse doesn't show these)
if ((my $score = $r->{score}) > 0) {
$sth_set_vote_count->bind_param(1, $t->{id}, SQL_INTEGER);
$sth_set_vote_count->bind_param(2, $score, SQL_INTEGER);
my $r = $sth_set_vote_count->execute();
panic($r) unless $r == 1;
}
if (defined $attrs{accepted_answer_id} and $attrs{accepted_answer_id} == $id) {
# XXX rate-limiting is not configurable, need to comment out
# "limit_accepts" in plugins/discourse-solved/plugin.rb:accept
logmsg(INFO => " " x $INDENT, "accept answer #$id");
$INDENT++;
my $r = eval { discourse_req($attrs{accepted_by}, POST => "/solution/accept",
{ id => $t->{id} }) };
if ($@) {
print STDERR $@; # already logged
logmsg(SOFTFAIL => "Couldn't accept answer #$id");
} elsif (!defined $r->{success} or $r->{success} ne "OK") {
panic(JSON::->new->utf8->encode($r));
}
$INDENT--;
}
}

# nested comments
$num_errors += replies(
language_code => $attrs{language_code}
, topic_id => $t->{topic_id}
, reply_to_post_number => $t->{post_number}
, replies => $r->{comment_ids}
) if @{$r->{comment_ids}};
}
return $num_errors;
}


#######################################################################
# Initialisation

logmsg(NOTICE => "Starting $SCRIPT_FILENAME");
my $TAG_GROUP_LIBO_COMPONENT = "LibreOffice component";

# Categories
useragent_init();
{
my $path = dirname($SCRIPT_FILENAME) . "/categories.json";
open my $fh, "<", $path or panic("open($path): $!");
my $categories = JSON::->new->utf8->decode( do { local $/ = undef; <$fh> } );
close $fh or panic("close: $!");

my %slugs;
my $r = discourse_req("system", GET => "/categories.json");
$slugs{$_->{slug}} = $_->{id} foreach @{$r->{category_list}->{categories}};

foreach my $c (@$categories) {
my $l = $c->{language_code} // panic();
$CATEGORY_IDs{$l} = { slug => $c->{slug} };
if (defined (my $id = $slugs{$c->{slug}})) {
# the category already exists
$CATEGORY_IDs{$l}->{id} = $id;
} else {
logmsg(NOTICE => "Creating category $c->{slug}");
my $r = discourse_req("system", POST => "/categories.json", {
name => $c->{name},
slug => $c->{slug},
custom_fields => { qa_enabled => JSON::true }
});
$CATEGORY_IDs{$l}->{id} = $r->{category}->{id} // panic();
}
}

$r = discourse_req("system", POST => "/tag_groups.json", {
tag_group => {
name => $TAG_GROUP_LIBO_COMPONENT,
tag_names => [ qw/common writer calc impress base draw math meta/ ],
permissions => { everyone => 1 },
one_per_topic => JSON::false
}
});
panic("Couldn't create group tag") unless defined $r->{tag_group};

# reorder (lexicographically) by slugs
logmsg(NOTICE => "Updating and ordering categories");
my (%categories_sorted, %positions);
my @categories_sorted = sort { $a->{slug} cmp $b->{slug} } @$categories;
foreach (my $i = 0; $i <= $#$categories; $i++) {
$categories_sorted{$categories_sorted[$i]->{slug}} = $i;
}
my $offset = 0;
$r = discourse_req("system", GET => "/categories.json");
foreach my $c (@{$r->{category_list}->{categories}}) {
my $s = $c->{slug} // panic();
my %h;
if ($s eq "uncategorized") {
$h{color} = "3AB54A";
$h{text_color} = "FFFFFF";
$positions{$c->{id}} = 0;
} elsif ($s eq "site-feedback") {
$positions{$c->{id}} = scalar(@$categories) + 1;
} elsif ($s eq "staff") {
$positions{$c->{id}} = scalar(@$categories) + 2;
} elsif ($s eq "lounge") {
$positions{$c->{id}} = scalar(@$categories) + 3;
} elsif (defined (my $i = $categories_sorted{$s})) {
$positions{$c->{id}} = $i + 1;
$h{num_featured_topics} = 1;
} else {
$positions{$c->{id}} = scalar(@$categories) + 4 + $offset;
$offset++;
}
next unless %h;
my $r = discourse_req("system", PUT => "/categories/$c->{id}.json", \%h);
panic("Couldn't update category") unless defined $r->{success} and $r->{success} eq "OK";
}
$r = discourse_req("system", POST => "/categories/reorder",
{ mapping => JSON::->new->encode(\%positions) });
panic("Couldn't reorder categories") unless defined $r->{success} and $r->{success} eq "OK";
}

# AskBot (and SSO linking information) user profiles
{
my $path = dirname($SCRIPT_FILENAME) . "/users.json";
logmsg(NOTICE => "Loading $path");
open my $fh, "<", $path or panic("open($path): $!");
my $profiles = JSON::->new->utf8->decode( do { local $/ = undef; <$fh> } );
close $fh or panic("close: $!");

# merge accounts with the same email address (into the account with lowest ID)
my (%emails, %duplicates);
foreach my $id (keys %$profiles) {
my $p = $profiles->{$id} // panic();
# `lc` is a bit YOLO but Discourse matches email addresses case-insensitively...
my %e = ( lc($p->{email}) => 1 );
$e{lc($_)} = 1 foreach @{$p->{sso_emails} // []};
foreach (keys %e) {
$emails{$_} //= [];
push @{$emails{$_}}, $id;
}
}
foreach my $e (keys %emails) {
my $ids = $emails{$e} // panic();
next unless $#$ids > 0;
my ($id, @duplicates) = sort {$a <=> $b} @$ids;
$duplicates{$id} = \@duplicates;
}
foreach my $id (sort {$a <=> $b} keys %duplicates) {
my $dups = $duplicates{$id};
my $username = $profiles->{$id}->{username} // panic();
logmsg(INFO => "Merging user(s) ", join (", ", map {"#$_"} @$dups),
" with #$id ($username)");
$ASKBOT_DUPLICATES{$_} = $id + 0 foreach @$dups;
}

my @reserved_usernames = # 'reserved_usernames' setting
qw/admin moderator administrator mod sys system community info
you name username user nickname discourse discourseorg
discourseforum support/;

my %usernames;
my @profile_ids = sort {$a <=> $b} keys %$profiles;

# given users #1 "FooBar" and #2 "whatever" with SSO username "foobar", #1 and #2
# are respectively renamed to "foobar1" and "foobar"; we assume users not in SSO
# aren't that active in the project and allow hijacking usernames for SSO users
$usernames{lc($profiles->{$_}->{sso_username})} //= $ASKBOT_DUPLICATES{$_} // $_
# no need to add the version without `lc` here, SSO usernames are case insensitive
foreach grep { defined $profiles->{$_}->{sso_username} } @profile_ids;

# AskBot doesn't have a username requirement but Discourse does (and usernames are matched
# case-insensitively); we try to preserve users on a first come first served basis: given users
# #1 "Foo", #2 "foo", #3 "bar", #4 "bar" then "foo" and "bar" are respectively attributed to #2
# (because the case was correct) and #3 while #1 and #4 are renamed "Foo1" and "bar1"
$usernames{$profiles->{$_}->{username}} //= $ASKBOT_DUPLICATES{$_} // $_ foreach @profile_ids;
$usernames{lc($profiles->{$_}->{username})} //= $ASKBOT_DUPLICATES{$_} // $_ foreach @profile_ids;

# populate %ASKBOT_PROFILES
foreach my $id (@profile_ids) {
my $p = $profiles->{$id} // panic();
my %p = ( username => $p->{username} );
$id += 0; # convert to integer

if (exists $ASKBOT_DUPLICATES{$id}) {
# keep AskBot username for @-addressing mangling but skip other attributes
$ASKBOT_PROFILES{$id} = \%p;
next;
}

# mangle AskBot/SSO username to comply with Discourse's stricter policy
# https://github.com/discourse/discourse/blob/master/app/models/username_validator.rb
my $username = $p->{sso_username} // $p->{username} // panic();
$username =~ s/^\@+//; # @foo -> foo
$username =~ s/\@.*//; # foo@example.net -> foo
$username =~ s/[^\p{Alnum}\p{Mark}._\-]//g; # UNICODE_INVALID_CHAR_PATTERN
$username =~ s/^[^\p{Alnum}\p{Mark}_]+//; # INVALID_LEADING_CHAR_PATTERN
$username =~ s/[^\p{Alnum}\p{Mark}]+$//; # INVALID_TRAILING_CHAR_PATTERN
$username =~ s/([\-_.])+/$1/g; # REPEATED_SPECIAL_CHAR_PATTERN
$username =~ s/\.(js|json|css|html?|xml|jpe?g|png|gif|bmp|ico|tiff?|woff2?)$/-$1/i; # CONFUSING_EXTENSIONS
$username = "unknown" if $username eq "";

# comparison is case insensitive
$username = "not-$username" if grep { lc($username) eq $_ } @reserved_usernames;

while (($usernames{lc($username)} // $id) != $id) {
# username is already taken, increment or append a number
$username .= "1" unless $username =~ s/([0-9]+)$/sprintf("%0*d", length($1), $1 + 1)/e;
}

$usernames{lc($username)} = $id;
$p{discourse_username} = $username;
$p{name} = $p->{name} if defined $p->{name} and $p->{name} ne "";
$p{email} = @{$p->{sso_emails} // []} ? $p->{sso_emails}->[0] : $p->{email};
$ASKBOT_PROFILES{$id} = \%p;
$DISCOURSE_PROFILES{$id} = shared_clone({});
}

# pre-existing users
$DISCOURSE_PROFILES{32931}->{user_id} = 1;
$DISCOURSE_PROFILES{32931}->{trust_level} = 1;
}


#######################################################################
# Import workers, each fetching work from a queue of questions

my $QUEUE = Thread::Queue::->new();
my ($TOPIC_lck, $CLOSE_lck) :shared;
sub import_questions($) {
$WORKER = shift;
useragent_init(); # re-init

my $conf_db = $CONF->{database} // panic();
my $dsn = "DBI:Pg:dbname=$conf_db->{dbname}; host=$conf_db->{host}";
$DBH = DBI::->connect( $dsn, $conf_db->{user} // "", $conf_db->{password} // "", {RaiseError => 1} );

my $sth_set_view_count = $DBH->prepare(q{
UPDATE topics SET views = ? WHERE id = ?
});
my $sth_set_closed_at_by = $DBH->prepare(q{
UPDATE posts SET created_at = to_timestamp(?)
WHERE topic_id = ? AND post_type != 1 AND action_code = 'closed.enabled'
});

my $elapsed = 0;
while (defined (my $q = $QUEUE->dequeue())) {
logmsg(INFO => "question #$q->{id}");
$INDENT = 1;

my @tags = grep # skip reserved names, cf. app/models/tag.rb
{ lc($_) ne "none" and lc($_) ne "constructor" }
# NOTE: discourse mangles tag names, in particular
# turns 'libreoffice-1.2.3' to 'libreoffice-123', cf.
# lib/discourse_tagging.rb:clean_tag()
@{$q->{tags}};

my $lang = $q->{language_code} // panic();
my $category = $CATEGORY_IDs{$lang} // panic($lang);
my $t0 = Time::HiRes::gettimeofday();

my ($author, $t);
eval {
$author = username($q->{author}->{id});
my $needs_post_mangling = mangle_post($author, $q);

# post topic
lock($TOPIC_lck);
$t = discourse_req($author, POST => "/posts.json", {
title => $q->{title}
, raw => $q->{text}
, category => $category->{id}
, tags => \@tags
, created_at => ts2iso8601($q->{added_at})
, skip_validations => JSON::true
});
push @NEEDS_POST_MANGLING, $t->{id} if $needs_post_mangling;
};
if ($@) {
print STDERR $@; # already logged
logmsg(SOFTFAIL => "Couldn't post question #$q->{id}");
next;
}

my $topic_id = $t->{topic_id} // panic();
my $topic_url = "/t/$t->{topic_slug}/$topic_id";
my $num_errors = 0;

# no need to lock here, other threads will never store that ID
$URLMAP{$q->{id}} = $topic_url;

# (recursively) process replies
$num_errors += replies(
language_code => $lang
, topic_id => $topic_id
, reply_to_post_number => $t->{post_number}
, replies => $q->{comment_ids}
) if @{$q->{comment_ids}};

$num_errors += replies(
language_code => $lang
, topic_id => $topic_id
# answers have null has reply_to_post_number attribute, see
# https://github.com/paviliondev/discourse-question-answer/blob/master/extensions/topic_extension.rb#L16
, reply_to_post_number => undef
, replies => $q->{answer_ids}
, accepted_answer_id => $q->{accepted_answer_id}
, accepted_by => $author # assume accepted by OP
) if @{$q->{answer_ids}};

if ($q->{closed} == JSON::true) {
logmsg(INFO => " " x $INDENT, "close question #$q->{id}");
$INDENT++;
my $id = $q->{closed_by}->{id} // panic();
my $username = eval { username($id) };
if ($@) {
print STDERR $@; # already logged
logmsg(SOFTFAIL => "Couldn't create user #$id");
} else {
{
my $p = $DISCOURSE_PROFILES{ $ASKBOT_DUPLICATES{$id} // $id } // panic();
lock(%$p); # only need to lock that user's profile
if ((my $tl = $p->{trust_level}) < 4) {
logmsg(DEBUG => "Temporarily promoting $username to TL4");
discourse_req("system", PUT => "/admin/users/$p->{user_id}/trust_level",
{ level => 4 });
$p->{trust_level} = 4;
$p->{real_trust_level} = $tl;
}
}
lock($CLOSE_lck);
my $r = discourse_req($username, PUT => "/t/$topic_id/status.json",
{ status=> "closed", enabled => "true" });
panic(JSON::->new->utf8->encode($r)) unless defined $r->{success} and $r->{success} eq "OK";
}

# database surgery: set closing date from AskBot
$sth_set_closed_at_by->bind_param(1, $q->{closed_at}, SQL_INTEGER);
$sth_set_closed_at_by->bind_param(2, $topic_id, SQL_INTEGER);
my $r = $sth_set_closed_at_by->execute();
panic($r) unless $r == 1;
$INDENT--;
}

if ((my $view_count = $q->{view_count}) > 0) {
# database surgery: set view count on the topic
$sth_set_view_count->bind_param(1, $view_count, SQL_INTEGER);
$sth_set_view_count->bind_param(2, $topic_id, SQL_INTEGER);
my $r = $sth_set_view_count->execute();
panic($r) unless $r == 1;
}

logmsg(WARN => "$topic_url (AskBot question #$q->{id}) has >=$num_errors missing replies!")
if $num_errors > 0;
$elapsed += Time::HiRes::gettimeofday() - $t0;
}

logmsg(INFO => sprintf("Worker stats: total active time %.2fs", $elapsed));
logmsg(INFO => sprintf("AskBot stats: %d requests, %.2fs total", $ASKBOT_COUNT, $ASKBOT_TIME));
logmsg(INFO => sprintf("Discourse stats: %d requests, %.2fs total", $DISCOURSE_COUNT, $DISCOURSE_TIME));
}

logmsg(NOTICE => "Spawning $NUM_WORKERS workers");
$QUEUE->limit = 4 * $NUM_WORKERS; # no need to prefetch pages if the workers can't keep up
threads->create(\&import_questions, $_) for 1 .. $NUM_WORKERS;


#######################################################################
# Queue up questions
# we process them ordered by ID so lower question IDs get lower topic IDs

#{
# my %question_ids = (
# 161330 => "en" # @-addressing with old username
# , 238673 => "en" # closed
# , 302629 => "en" # upvote, accepted answer
# , 303050 => "en" # screenshots, attachments (both with C:\fakepath\ and without)
# , 285734 => "ja" # URL to profile, unicode attachements
# , 161228 => "en" # question linked in the above, OneBox
# , 304617 => "en" # closed by someone with TL < 4
# , 161334 => "en" # Hello1 -> Hello2
# # duplicate accounts
# , 18673 => "en", 1505 => "en"
# , 25329 => "pt-br", 128011 => "pt-br"
# );
#
# foreach my $id (sort {$a <=> $b} keys %question_ids) {
# my $lang = $question_ids{$id};
# my $url = "/$lang/api/v1/questions/$id/";
# my $q = askbot_get($url);
# $q->{language_code} = $lang;
# $QUEUE->enqueue($q);
# }
#}

my $PAGE_NUM = 0; # number of most recent pages to process (0 means ALL)
my %PAGES;
foreach my $lang (keys %CATEGORY_IDs) {
my $url = "/$lang/api/v1/questions/";

my $offset = 1; # index of the first page to consider
unless ($PAGE_NUM == 0) {
my $h = askbot_get($url);
$offset = $h->{pages} - $PAGE_NUM + 1 if $PAGE_NUM < $h->{pages};
}

my $h = askbot_get("$url?sort=age-asc&page=$offset");
my @questions = sort { $a->{id} <=> $b->{id} } @{$h->{questions}}; # should be sorted already
logmsg(NOTICE => "Loaded /$lang/questions page $offset/$h->{pages}");

# buffer the page
$PAGES{$lang} = {
current_page => $offset
, pages => $h->{pages}
, questions => \@questions
, url => $url
} if @questions;
}

#my $COUNT = 1000;
MAINLOOP: while (%PAGES) {
# find language code with the lowest question ID
my ($alang, $blang) = sort { $PAGES{$a}->{questions}->[0]->{id} <=> $PAGES{$b}->{questions}->[0]->{id} }
keys %PAGES;
my $p = $PAGES{$alang};

# process questions from $alang that have a lower ID than any other language
# (ie, until we reach the first question from $blang)
my $questions = $p->{questions};
my $bound = defined $blang ? $PAGES{$blang}->{questions}->[0]->{id} : -1;
while ($bound < 0 or $questions->[0]->{id} < $bound) {
my $q = shift @$questions;
$q->{language_code} = $alang;
$QUEUE->enqueue($q);
#last MAINLOOP unless --$COUNT > 0;
next if @$questions; # $alang has some questions buffered still

unless ($p->{current_page} < $p->{pages}) {
logmsg(NOTICE => "Done queuing /$alang/questions");
delete $PAGES{$alang};
last;
}

# $alang's highest buffered question ID was lower that $blang's lowest, need to load another page
my $offset = ++$p->{current_page};
my $h = askbot_get($p->{url} . "?sort=age-asc&page=$offset");
logmsg(NOTICE => "Loaded /$alang/questions page $offset/$h->{pages}");
$p->{pages} = $h->{pages}; # update page count, there might be new posts

@$questions = sort { $a->{id} <=> $b->{id} } @{$h->{questions}}; # should be sorted already
}
}


#######################################################################
# Done!

$QUEUE->end();
$_->join() foreach threads->list();
logmsg(NOTICE => "Done importing");

logmsg(NOTICE => "Updating categories");
foreach my $c (values %CATEGORY_IDs) {
my $r = discourse_req("system", PUT => "/categories/$c->{id}.json", {
num_featured_topics => 3 # restore default
, email_in => "question+$c->{slug}\@ask.libreoffice.org"
, required_tag_group_name => $TAG_GROUP_LIBO_COMPONENT
, min_tags_from_required_group => 1
, allow_global_tags => "true"
});
panic("Couldn't update category") unless defined $r->{success} and $r->{success} eq "OK";
}

{
# generate an AskBot question/answer/comment ID -> Discourse path for nginx rewriting
my $pathmap = dirname($SCRIPT_FILENAME) . "/nginx.map";
if (open my $fh, ">", $pathmap) {
printf $fh "%-6d %s;\n", $_, $URLMAP{$_} foreach sort {$a <=> $b} keys %URLMAP;
} else {
logmsg(WARN => "Couldn't open $pathmap for writing");
}
}


#######################################################################
# Post processing

sub post_process($) {
$WORKER = shift;
while (defined (my $x = $QUEUE->dequeue())) {
my $action = $x->{action} // panic();
if ($action eq "mangle_post") {
# try again to mangle internal links now that we have the full URL map
my $id = $x->{id} // panic();
logmsg(INFO => "Post-mangling post #$id");
my $p = discourse_req("system", GET => "/posts/$id.json");

my $modified = mangle_post_internal_links($id => \$p->{raw}, 1);
unless ($modified) {
logmsg(INFO => "Not updating unchanged post #$id");
next;
}
discourse_req("system", PUT => "/posts/$id.json", {
raw => $p->{raw}
, edit_reason => "Fix internal links (post AskBot migration)."
# XXX the below requires patching app/controllers/posts_controller.rb
, skip_validations => JSON::true
, no_bump => JSON::true
});
}
elsif ($action eq "trust_level") {
# reset TL back to its original value (before we raised it to close the issue)
my $id = $x->{id} // panic();
my $p = $DISCOURSE_PROFILES{$id} // panic($id); # never a duplicate
my $username = $ASKBOT_PROFILES{$id}->{discourse_username} // panic($id);
my $level = $p->{real_trust_level} // panic();
logmsg(INFO => "Demoting $username back to TL$level");
discourse_req("system", PUT => "/admin/users/$p->{user_id}/trust_level", { level => $level });
}
else {
panic($action);
}
}
}

my @FIX_TRUST_LEVELS = grep { defined $DISCOURSE_PROFILES{$_}->{real_trust_level} }
keys %DISCOURSE_PROFILES;
if (@NEEDS_POST_MANGLING or @FIX_TRUST_LEVELS) {
logmsg(NOTICE => "Starting post-processing");
$QUEUE = Thread::Queue::->new();
threads->create(\&post_process, $_) for 1 .. $NUM_WORKERS;

$QUEUE->enqueue({ id => $_, action => "mangle_post" }) foreach @NEEDS_POST_MANGLING;
$QUEUE->enqueue({ id => $_, action => "trust_level" }) foreach @FIX_TRUST_LEVELS;
$QUEUE->end();
$_->join() foreach threads->list();
}


#######################################################################
logmsg(NOTICE => "Finished!");
(4-4/6)