|
#!/usr/bin/perl -T
|
|
|
|
use v5.10;
|
|
use strict;
|
|
use warnings;
|
|
|
|
use threads;
|
|
use threads::shared;
|
|
use Thread::Queue ();
|
|
|
|
BEGIN {
|
|
# JSON::XS is not thread-safe...
|
|
$ENV{PERL_JSON_BACKEND} = "Cpanel::JSON::XS";
|
|
}
|
|
|
|
use Config::Tiny ();
|
|
use Crypt::URandom qw/urandom/;
|
|
use DBI qw/:sql_types/;
|
|
use Encode ();
|
|
use File::Basename qw/basename dirname/;
|
|
use JSON ();
|
|
use LWP::MediaTypes qw/guess_media_type/;
|
|
use LWP::UserAgent ();
|
|
use POSIX ();
|
|
use URI::Escape qw/uri_escape_utf8/;
|
|
use Time::HiRes qw/usleep/;
|
|
|
|
my $SCRIPT_FILENAME = $0 =~ /\A(\p{Graph}+)\z/ ? $1 : die;
|
|
my $CONF = Config::Tiny::->read("$SCRIPT_FILENAME.conf") or die Config::Tiny::->errstr()."\n";
|
|
my ($DBH, $ASKBOT, $DISCOURSE, $WORKER, $INDENT);
|
|
my ($DISCOURSE_TIME, $DISCOURSE_COUNT, $ASKBOT_TIME, $ASKBOT_COUNT);
|
|
my $NUM_WORKERS = 8;
|
|
|
|
#######################################################################
|
|
|
|
# Logging
|
|
open my $LOG, ">", "$SCRIPT_FILENAME.log" or die "open($SCRIPT_FILENAME.log): $!";
|
|
sub logmsg($@) {
|
|
my $level = shift;
|
|
my @msg = @_;
|
|
$_ = Encode::encode("UTF-8", $_) foreach @msg;
|
|
my $level2 = $level eq "SOFTFAIL" ? "ERROR" : $level;
|
|
unshift @msg, $level2, " ";
|
|
unshift @msg, sprintf("[%0*d] ", length($NUM_WORKERS), $WORKER) if defined $WORKER;
|
|
push @msg, "\n";
|
|
$LOG->printflush( POSIX::strftime("%b %d %H:%M:%S ", localtime), @msg );
|
|
if ($level eq "WARN" or $level eq "NOTICE" or $level eq "SOFTFAIL") {
|
|
print STDERR @msg;
|
|
} elsif ($level eq "PANIC" or $level eq "ERROR") {
|
|
die @msg;
|
|
}
|
|
}
|
|
sub fail(@) {
|
|
return logmsg(ERROR => @_);
|
|
}
|
|
sub panic(@) {
|
|
my @msg = @_;
|
|
my @loc = caller;
|
|
push @msg, " " if @msg;
|
|
push @msg, "at line $loc[2] in $loc[1]";
|
|
return logmsg(PANIC => @msg);
|
|
}
|
|
|
|
# useragent_init()
|
|
# (Re-)init user agent, incl. connection cache.
|
|
sub useragent_init() {
|
|
$ASKBOT = LWP::UserAgent::->new(
|
|
keep_alive => 1,
|
|
ssl_opts => { verify_hostname => 1, SSL_hostname => $CONF->{askbot}->{host} },
|
|
default_headers => HTTP::Headers::->new()
|
|
);
|
|
$DISCOURSE = LWP::UserAgent::->new(
|
|
keep_alive => 1,
|
|
ssl_opts => { verify_hostname => 1 },
|
|
default_headers => HTTP::Headers::->new(
|
|
"Accept" => "application/json",
|
|
"Api-Key" => $CONF->{discourse}->{apikey}
|
|
)
|
|
);
|
|
($DISCOURSE_TIME, $DISCOURSE_COUNT, $ASKBOT_TIME, $ASKBOT_COUNT) = (0,0,0,0);
|
|
}
|
|
|
|
# decode_json_response($AGENT => $RESPONSE)
|
|
# Decode an application/json HTTP response. If a 5XY HTTP response
|
|
# was received, try again to send the request with a 1min timeout.
|
|
sub decode_json_response($$;$);
|
|
sub decode_json_response($$;$) {
|
|
my ($ua, $resp, $timeout) = @_;
|
|
|
|
if ($resp->is_success) {
|
|
my $data = $resp->decoded_content();
|
|
return JSON::->new->utf8->decode($data);
|
|
}
|
|
|
|
my $code = $resp->code();
|
|
my $req = $resp->request();
|
|
|
|
# random delay between 0.75 and 1.25 seconds to help solving race conditions
|
|
# between threads
|
|
$timeout = defined $timeout ? ($timeout * 2) : (int(rand(500_000)) + 750_000);
|
|
if ($code >= 500 and $code <= 599 and $timeout <= 320_000_000) {
|
|
# upon 5XY response codes, retry up to 5 minutes
|
|
logmsg(INFO => "Received $code from <", $req->uri(), ">; ",
|
|
"will retry in ", int($timeout/1000), "ms");
|
|
usleep($timeout);
|
|
return decode_json_response($ua => $ua->request($req), $timeout);
|
|
}
|
|
|
|
my @error = ($resp->status_line(), " ", $req->method(), " <", $req->uri(), ">");
|
|
my $content_type = $resp->content_type();
|
|
if ($content_type eq "application/json") {
|
|
my $content = $resp->decoded_content();
|
|
chomp $content;
|
|
push @error, " ", $content;
|
|
}
|
|
fail(@error);
|
|
}
|
|
|
|
# askbot_get($URL)
|
|
# Send a request to the AskBot API and decode the JSON response.
|
|
sub askbot_get($) {
|
|
my $url = $CONF->{askbot}->{apiurl} . shift;
|
|
my $t0 = Time::HiRes::gettimeofday();
|
|
my $resp = $ASKBOT->get($url, Host => $CONF->{askbot}->{host}, Accept => "application/json");
|
|
$ASKBOT_TIME += Time::HiRes::gettimeofday() - $t0;
|
|
$ASKBOT_COUNT++;
|
|
return decode_json_response($ASKBOT => $resp);
|
|
}
|
|
|
|
# discourse_req($USERNAME, $METHOD, $PATH, [$BODY])
|
|
# Send a request to the Discourse API and decode the JSON response.
|
|
sub discourse_req($$$;$) {
|
|
my ($username, $method, $path, $body) = @_;
|
|
my $req = HTTP::Request::->new( $method => $CONF->{discourse}->{apiurl} . $path );
|
|
$req->header( Host => $CONF->{discourse}->{host} );
|
|
# the API doesn't mention case for usernames, but seems we need it
|
|
# for some Unicode usernames...
|
|
$req->header( "Api-Username" => Encode::encode("UTF-8", lc($username)) );
|
|
if (defined $body) {
|
|
$req->content_type("application/json");
|
|
$req->content(JSON::->new->utf8->encode($body));
|
|
}
|
|
my $t0 = Time::HiRes::gettimeofday();
|
|
my $resp = $DISCOURSE->request($req);
|
|
$DISCOURSE_TIME += Time::HiRes::gettimeofday() - $t0;
|
|
$DISCOURSE_COUNT++;
|
|
return decode_json_response($DISCOURSE => $resp);
|
|
}
|
|
|
|
# upload($USERNAME, $PATH, type => $TYPE, ...)
|
|
# Download $PATH from AskBot and re-upload to Discourse; returns the
|
|
# upload response from the Discourse API, cf.
|
|
# https://docs.discourse.org/#tag/Uploads/paths/~1uploads.json/post
|
|
#
|
|
# If the download yields 404 then returns undef instead.
|
|
sub upload($$%) {
|
|
my $username = shift;
|
|
my $path = shift;
|
|
my %attrs = @_;
|
|
|
|
my $filename = delete $attrs{filename} // basename($path);
|
|
my $content_type = guess_media_type($filename);
|
|
|
|
logmsg(INFO => " " x $INDENT, "upload $path ($filename)");
|
|
panic($path) unless $path =~ /\A\//;
|
|
|
|
my $uri = $CONF->{askbot}->{apiurl} . $path;
|
|
my $t0 = Time::HiRes::gettimeofday();
|
|
my $resp = $ASKBOT->get($uri, Host => $CONF->{askbot}->{host}) or panic();
|
|
$ASKBOT_TIME += Time::HiRes::gettimeofday() - $t0;
|
|
$ASKBOT_COUNT++;
|
|
unless ($resp->is_success) {
|
|
my @error = ($resp->status_line(), " <", $uri, ">");
|
|
if ($resp->code() == 404) {
|
|
logmsg(WARN => @error);
|
|
return;
|
|
} else {
|
|
panic(@error);
|
|
}
|
|
}
|
|
|
|
# RFC 1867, using HTTP::Request::Common's form_ref
|
|
my @form_data = ( undef, Encode::encode("UTF-8", $filename) );
|
|
push @form_data, "Content-type" => $content_type
|
|
, Content => $resp->decoded_content();
|
|
|
|
my $t1 = Time::HiRes::gettimeofday();
|
|
$resp = $DISCOURSE->post( $CONF->{discourse}->{apiurl} . "/uploads.json",
|
|
"Api-Username" => Encode::encode("UTF-8", lc($username)),
|
|
Host => $CONF->{discourse}->{host},
|
|
Content_Type => "form-data",
|
|
Content => [ %attrs, synchronous => "true", "files[]" => \@form_data ]
|
|
);
|
|
$DISCOURSE_TIME += Time::HiRes::gettimeofday() - $t1;
|
|
$DISCOURSE_COUNT++;
|
|
|
|
return decode_json_response($DISCOURSE => $resp);
|
|
}
|
|
|
|
# username($ID)
|
|
# Returns the Discourse username associated with the AskBot user ID.
|
|
# The Discourse user is created if it doesn't already exist.
|
|
my %ASKBOT_PROFILES; # pre-existing AskBot profiles, incl. SSO linking information
|
|
my %ASKBOT_DUPLICATES;
|
|
my %DISCOURSE_PROFILES; # map AskBot user ID -> Discourse profile
|
|
sub username($);
|
|
sub username($) {
|
|
my $orig_id = shift;
|
|
my $id = $ASKBOT_DUPLICATES{$orig_id} // $orig_id;
|
|
|
|
state $sth_user_lookup = $DBH->prepare(q{
|
|
SELECT username FROM users WHERE id = ?
|
|
});
|
|
|
|
my $profile = $ASKBOT_PROFILES{$id} // fail("Can't find user #$id in users.json");
|
|
my $username = $profile->{discourse_username} // panic($id);
|
|
|
|
my ($u, $user_id);
|
|
{
|
|
my $p = $DISCOURSE_PROFILES{$id} // panic($id);
|
|
lock(%$p); # only need to lock that user's profile
|
|
return $username if exists $p->{user_id}; # already created
|
|
|
|
# user API is not localized, let's stick to english
|
|
$u = askbot_get("/en/api/v1/users/$id/");
|
|
|
|
my $msg = $profile->{username};
|
|
$msg .= " -> $username" if $username ne $profile->{username};
|
|
$msg .= ", duplicate #$orig_id" if $id != $orig_id;
|
|
logmsg(INFO => " " x $INDENT, "user #$id ($msg)");
|
|
|
|
my $email = $profile->{email} // panic();
|
|
if ($email =~ /\@(?:mailinator|trashmail)\.[a-z]+$/i
|
|
or $email !~ /\@[0-9a-z\-]+(?:\.[0-9a-z\-]+)+$/i) {
|
|
logmsg(WARN => "user #$id ($username) has invalid email address <$email>");
|
|
# create the account for content attribution but lock them out
|
|
$email = "noreply+$username\@documentfoundation.org";
|
|
}
|
|
my $r = discourse_req("system", POST => "/users.json", {
|
|
username => $username
|
|
, password => unpack("H*", urandom(32)) # XXX required even with SSO
|
|
, name => $profile->{name} // ""
|
|
, email => $email
|
|
, active => JSON::true
|
|
, approved => JSON::true
|
|
});
|
|
panic(JSON::->new->utf8->encode($r)) unless defined $r->{success} and $r->{success} == JSON::true;
|
|
$user_id = $r->{user_id} // panic($id);
|
|
|
|
$sth_user_lookup->bind_param(1, $user_id, SQL_INTEGER);
|
|
$sth_user_lookup->execute();
|
|
my ($username2) = $sth_user_lookup->fetchrow();
|
|
$sth_user_lookup->finish();
|
|
unless (defined $username2 and $username2 eq $username) {
|
|
# there is a race condition causing PostgreSQL to sometimes abort the transaction:
|
|
# Discourse reports the user as created but any attempt to use it as Api-Username fails
|
|
logmsg(SOFTFAIL => "Couldn't create user #$orig_id ($profile->{username}), retrying...");
|
|
return username($orig_id);
|
|
}
|
|
|
|
# assign a Trust Level to the newly created account, depending on the users's activity on
|
|
# AskBot, in order to bootstrap the site
|
|
my $tl = ($u->{reputation} >= 1000 and time - $u->{last_seen_at} <= 31557600) ? 4
|
|
: ($u->{reputation} >= 250 and $u->{answers} >= 100 and time - $u->{last_seen_at} <= 31557600) ? 3
|
|
: ($u->{answers} + $u->{comments} >= 100) ? 2
|
|
: ($u->{questions} + $u->{answers} + $u->{comments} >= 5) ? 1
|
|
: 0;
|
|
# leave non-returning (less than 10 between creation and last seen date) users at TL0
|
|
$tl = 0 if $u->{last_seen_at} - $u->{joined_at} < 864000;
|
|
discourse_req("system", PUT => "/admin/users/$user_id/trust_level", { level => $tl })
|
|
unless $tl == 0;
|
|
|
|
$p->{user_id} = $user_id;
|
|
$p->{trust_level} = $tl;
|
|
} # account was created and TL assigned, we can release the lock on $DISCOURSE_PROFILES{$id}
|
|
$INDENT++;
|
|
|
|
# preserve avatar
|
|
my %avatar_upload;
|
|
my $username_escaped = uri_escape_utf8(lc($username));
|
|
if ($u->{avatar} =~ m#^(?:(?:https?:)?//www\.gravatar\.com/avatar)?/#) {
|
|
my $path = "/user_avatar/$username_escaped/refresh_gravatar.json";
|
|
my $r = discourse_req($username, POST => $path, { username => $username });
|
|
$avatar_upload{upload_id} = $r->{gravatar_upload_id};
|
|
$avatar_upload{type} = "gravatar";
|
|
} elsif ($u->{avatar} =~ s,^(?:(?:https?:)?//ask\.libreoffice\.org)?(/[^/?#][^?#]*).*,$1,i) {
|
|
unless ($u->{avatar} eq "/m/default/media/images/nophoto.png") {
|
|
$u->{avatar} =~ s#^(/upfiles/avatars/[^/]+)/resized/[0-9]+/([^/]+)$#$1/$2#;
|
|
my $r = eval { upload($username, $u->{avatar}, type => "avatar", user_id => $user_id) };
|
|
if ($@) {
|
|
print STDERR $@; # already logged
|
|
} else {
|
|
$avatar_upload{upload_id} = $r->{id};
|
|
$avatar_upload{type} = "uploaded";
|
|
}
|
|
}
|
|
} else {
|
|
logmsg(WARN => "$username: unknown avatar URL $u->{avatar}");
|
|
}
|
|
|
|
if (%avatar_upload) {
|
|
my $errstr = "$username: Couldn't set avatar ($avatar_upload{type})";
|
|
if (defined $avatar_upload{upload_id}) {
|
|
my $path = "/u/$username_escaped/preferences/avatar/pick.json";
|
|
my $r = eval { discourse_req($username, PUT => $path, \%avatar_upload) };
|
|
unless (!$@ and defined $r->{success} and $r->{success} eq "OK") {
|
|
print STDERR $@ if $@; # already logged
|
|
$errstr .= ": " . $r->{message} if defined $r and defined $r->{message};
|
|
logmsg(SOFTFAIL => $errstr);
|
|
}
|
|
} else {
|
|
logmsg(WARN => $errstr);
|
|
}
|
|
}
|
|
|
|
# database surgery: set created and last seen dates from AskBot
|
|
state $sth_user_last = $DBH->prepare(q{
|
|
UPDATE users SET created_at = to_timestamp(?), last_seen_at = to_timestamp(?)
|
|
WHERE id = ?
|
|
});
|
|
$sth_user_last->bind_param(1, $u->{joined_at}, SQL_INTEGER);
|
|
$sth_user_last->bind_param(2, $u->{last_seen_at}, SQL_INTEGER);
|
|
$sth_user_last->bind_param(3, $user_id, SQL_INTEGER);
|
|
my $r = $sth_user_last->execute();
|
|
panic($r) unless $r == 1;
|
|
|
|
$INDENT--;
|
|
return $username;
|
|
}
|
|
|
|
# mangle_post_upload($USERNAME, $TEXT => $PATH, $INLINE)
|
|
# Return a Markdown string with given $TEXT and AskBot source link
|
|
# (attachment or screeshot). The source is downloaded and re-uploaded
|
|
# to Discourse.
|
|
sub mangle_post_upload($$$$) {
|
|
my ($username, $text, $path, $inline) = @_;
|
|
my $filename;
|
|
unless ($inline) {
|
|
# fix ugly C:\fakepath\something cf. https://www.mediawiki.org/wiki/Special:Code/MediaWiki/83225
|
|
$text =~ s/\AC:\\fakepath\\([^\\]+)\z/$1/;
|
|
my $ext = $path =~ /[^.\/\\]\.([a-zA-Z](?:-?[a-zA-Z0-9])*)\z/ ? $1 : undef;
|
|
$filename = !(defined $ext and $text =~ /[^.\/\\]\.\Q$ext\E\z/i) ? basename($path)
|
|
: $text =~ /\A[A-Z]:\\.*\\(.+)\z/ ? basename($1)
|
|
: basename($text);
|
|
$filename =~ tr[\t\n\f\r ][ ]s;
|
|
}
|
|
my $u = eval { upload($username, $path, type => "composer", filename => $filename) };
|
|
unless (!$@ and defined $u) {
|
|
# 404 or fatal error, can't do better
|
|
print STDERR $@ if $@; # already logged
|
|
logmsg(SOFTFAIL => "Couldn't re-upload $path");
|
|
return;
|
|
}
|
|
return $inline ? "" : "[$text|attachment]($u->{short_url})";
|
|
}
|
|
|
|
# mangle_post_internal_link($ID, $LINK, $FINAL, $REF)
|
|
# Return appropriate replacement for AskBot internal $LINK in post $ID.
|
|
# The 3rd argument is a boolean indicating whether the import is
|
|
# ongoing or if we're at the final phase.
|
|
# The 4rd argument is a reference to a variable that is is set to 1 if
|
|
# $FINAL is false and the mangling couldn't proceeed because the
|
|
# AskBot question hasn't been processed yet, or if $FINAL is true and
|
|
# a suitable substitute was found.
|
|
my %URLMAP :shared; # map AskBot post ID -> Discourse URL
|
|
my %CATEGORY_IDs; # map language code -> Discourse category
|
|
sub mangle_post_internal_link($$$$) {
|
|
my ($post_id, $link, $final, $ref_rv) = @_;
|
|
|
|
my $uri = $link;
|
|
my $hash = $uri =~ s/\#(.*)// ? $1 : "";
|
|
my $query = $uri =~ s/\?(.*)// ? $1 : "";
|
|
my $lang = $uri =~ m,^ /(?:question|user)s? (?:/|$),x ? "en"
|
|
: $uri =~ s,^/([\p{PosixAlnum}\-_]+)(/(?:question|user)s? (?:/|$)),$2,x ? $1
|
|
: $uri =~ s,^/([\p{PosixAlnum}\-_]+) /?$,/questions/,x ? $1
|
|
: undef;
|
|
|
|
my $newlink;
|
|
unless (defined $lang) {
|
|
return unless $uri eq "" or $uri eq "/"; # can't do better
|
|
return if $CONF->{discourse}->{baseurl} eq "https://ask.libreoffice.org" and $query eq "" and $hash eq "";
|
|
$newlink = "/"; # site root (ignore query and hash)
|
|
} elsif ($uri =~ s#^/users?/([0-9]+)(?:/|$)##) {
|
|
# don't create accounts during the final phase
|
|
return if $final;
|
|
my $username = eval { username($1) };
|
|
if ($@) {
|
|
print STDERR $@; # already logged
|
|
logmsg(SOFTFAIL => "Couldn't create user #$1");
|
|
return;
|
|
}
|
|
$newlink = "/u/" . uri_escape_utf8(lc($username)); # profile URL
|
|
} elsif ($uri =~ s#^/questions?/([0-9]+)(?:/|$)##) {
|
|
my $id = $1;
|
|
my $old_post_id =
|
|
$hash =~ /^post-id-([0-9]+)$/ ? $1
|
|
: $query =~ /(?:^|&)(?:answer|comment)=([0-9]+)(?:&|$)/ ? $1
|
|
: $id;
|
|
unless (defined ($newlink = $URLMAP{$old_post_id})) {
|
|
if (!$final) {
|
|
$$ref_rv = 1; # needs post mangling
|
|
} else {
|
|
logmsg(WARN => "#$post_id: Can't mangle <$link> (missing #$old_post_id)");
|
|
}
|
|
return;
|
|
}
|
|
} elsif ($uri =~ m#^/questions?/?$#) {
|
|
my $c = $CATEGORY_IDs{$lang} // return;
|
|
$newlink = "/c/$c->{slug}/$c->{id}";
|
|
} elsif ($uri =~ m#^/questions?/ask/?$#) {
|
|
my $c = $CATEGORY_IDs{$lang} // return;
|
|
$newlink = "/new-topic?category=$c->{slug}";
|
|
} elsif ($uri =~ m#^/users?/?$#) {
|
|
$newlink = "/u";
|
|
} else {
|
|
return;
|
|
}
|
|
$$ref_rv = 1 if $final; # modified
|
|
logmsg(DEBUG => "#$post_id: Mangling <$link> -> <$newlink>");
|
|
return $CONF->{discourse}->{baseurl} . $newlink;
|
|
}
|
|
|
|
# mangle_post_internal_links($ID, $REF_TEXT, [$FINAL])
|
|
# Mangle all internal question/answer/comment/user links. The 3rd
|
|
# argument is a boolean indicating whether the import is ongoing or if
|
|
# we're at the final phase.
|
|
# Return a boolean, which when $FINAL is false indicates whether the
|
|
# post needs post-mangling during the final phase, and when $FINAL is
|
|
# true indicates whether the input text was mangled.
|
|
sub mangle_post_internal_links($$;$) {
|
|
my ($id, $text, $final) = @_;
|
|
my $rv = 0;
|
|
|
|
$$text =~ s#\b (?i:(?:https?:)//ask\.libreoffice\.org)
|
|
( / [^\P{Graph}!"'\(\)<>\[\]\`\{\}]+? )
|
|
(?! [^\P{Graph}!"'\(\),\.<>\?\[\]\`\{\}] | [!',\.\?]\p{Graph} )
|
|
# mangle_post_internal_link($id, $1, $final => \$rv) // $& #egx;
|
|
|
|
if ($final or !$rv) {
|
|
# note: the regexp is intentionally loose
|
|
logmsg(WARN => "Post #$id contains dangling internal link <$_>") foreach
|
|
$$text =~ m#\b(?i:(?:https?:)//ask\.libreoffice\.org)/(?![ctu]/)\S+#g;
|
|
}
|
|
|
|
return $rv;
|
|
}
|
|
|
|
# mangle_post($AUTHOR, $POST)
|
|
# Mangle $POST->{text} in place:
|
|
# * fix attachments and screenshots;
|
|
# * fix link to other profiles, "normal" links, bare URLs, and @-addressing
|
|
# (the target profile is created if it doesn't exist); and
|
|
# * fix links to questions, answers and comments.
|
|
# Return a boolean indicating whether the post needs further mangling
|
|
# during the final phase.
|
|
sub mangle_post($$) {
|
|
my ($author, $p) = @_;
|
|
|
|
# fix attachments and screenshots (an empty link text is allowed for screenshots, and we also
|
|
# allow it for atttachment since we append "|attachment")
|
|
$p->{text} =~ s#!\[ ([^\[\]\(\)]*) \]\( (?i:(?:https?:)?//ask\.libreoffice\.org)? (/m/[\p{PosixAlnum}\-_]+/media/images/[^\[\]\(\)]+) \)#
|
|
mangle_post_upload("system", $1 => $2, 1) // $& #egx; # NOTE: discourse does deduplication
|
|
$p->{text} =~ s#(!)?\[ ([^\[\]\(\)]*) \]\( (?i:(?:https?:)?//ask\.libreoffice\.org)? (/upfiles/[^\[\]\(\)]+) \)#
|
|
mangle_post_upload($author, $2 => $3, $1) // $& #egx;
|
|
# NOTE: mangle_post_upload() can fail if it tries to upload the same document twice at the
|
|
# same time (race condition on uniqueness constraint) -- we could retry but we don't care so
|
|
# much since sidekiq will automatically fetch https://ask.libreoffice.org/upfiles/12345.png
|
|
# locally and update the post accordingly
|
|
# NOTE: the handful of bare links https://ask.libreoffice.org/upfiles/foobar.xyz are
|
|
# untouched hence will 404
|
|
|
|
# fix @-addressing; we can't match $p->{text} against /\@\w+/ here because AskBot
|
|
# usernames may contain spaces... so we check for <a href="..."/> in the HTML instead
|
|
my %user_ids;
|
|
$user_ids{$_} = 1 foreach $p->{summary} =~ m#<a\s(?:[^<>]*\s)*?href=['"]
|
|
(?i:(?:https?:)?//ask\.libreoffice\.org)? (?:/[\p{PosixAlnum}\-_]+)?
|
|
/users?/([0-9]+)[/'"]#gsx;
|
|
foreach my $id (keys %user_ids) {
|
|
# we don't want duplicate resolution here
|
|
my $profile = $ASKBOT_PROFILES{$id};
|
|
unless (defined $profile) {
|
|
logmsg(WARN => "#$id is not a known AskBot user, can't mangle its \@-addresses");
|
|
next;
|
|
}
|
|
my $old_username = $profile->{username} // panic($id);
|
|
my $new_username = eval { username($id) }; # might need to be created
|
|
if ($@) {
|
|
print STDERR $@; # already logged
|
|
logmsg(SOFTFAIL => "Couldn't create user #$id");
|
|
next;
|
|
}
|
|
$p->{text} =~ s/\@\Q$old_username\E (?! [\.\-]?[\p{Mark}_\p{Alnum}] )
|
|
/\@$new_username/gx
|
|
unless $old_username eq $new_username;
|
|
}
|
|
|
|
# explicitely prefix AskBot base URL so we don't have to do the work twice
|
|
$p->{text} =~ s#\[ ([^\[\]\(\)]+) \]\( (/[^\[\]\(\)]*) \)#[$1](https://ask.libreoffice.org$2)#gx;
|
|
|
|
# replace [URL](URL) links with a bare URL when possible so Discourse can use
|
|
# its own text for internal post links
|
|
$p->{text} =~ s#\[ ((?i:https?:)?//[^\[\]\(\)]+) \]\( \1 \)
|
|
(?! [!',\.\?]* [^\P{Graph}!"'\(\),\.<>\?\[\]\{\}] )#$1#gx;
|
|
|
|
return mangle_post_internal_links($p->{id} => \$p->{text});
|
|
}
|
|
|
|
# ts2iso8601($TIMESTAMP)
|
|
# Convert a timestamp to ISO 8601.
|
|
sub ts2iso8601($) {
|
|
my $ts = 0 + shift;
|
|
return POSIX::strftime("%Y-%m-%dT%H:%M:%SZ", gmtime($ts));
|
|
}
|
|
|
|
# replies(language_code => $LANG, topic_id => $ID, replies => [..], ..)
|
|
# Recursively process the given replies under some topic ID. The
|
|
# parent post is indicated by the optional reply_to_post_number
|
|
# attribute.
|
|
# Return the number of replies that couldn't be posted, if any
|
|
my @NEEDS_POST_MANGLING :shared;
|
|
sub replies(%);
|
|
sub replies(%) {
|
|
my %attrs = @_;
|
|
my $post_type = defined $attrs{reply_to_post_number} ? "comment" : "answer";
|
|
my $askbot_url = "/$attrs{language_code}/api/v1/${post_type}s/";
|
|
|
|
state $sth_set_vote_count = $DBH->prepare(q{
|
|
INSERT INTO post_custom_fields
|
|
(post_id, name, value, created_at, updated_at)
|
|
VALUES (?, 'vote_count', ?, NOW(), NOW())
|
|
});
|
|
|
|
my $num_errors = 0;
|
|
my $indent_base = $INDENT;
|
|
foreach my $id (sort {$a <=> $b} @{$attrs{replies}}) { # lowest ID first
|
|
logmsg(INFO => " " x $indent_base, "$post_type #$id");
|
|
$INDENT = $indent_base + 1;
|
|
|
|
my ($r, $t);
|
|
eval {
|
|
$r = askbot_get("$askbot_url$id/");
|
|
my $author = username($r->{author}->{id});
|
|
my $needs_post_mangling = mangle_post($author, $r);
|
|
|
|
$t = discourse_req($author, POST => "/posts.json", {
|
|
topic_id => $attrs{topic_id}
|
|
, reply_to_post_number => $attrs{reply_to_post_number}
|
|
, raw => $r->{text}
|
|
, created_at => ts2iso8601($r->{added_at})
|
|
, skip_validations => JSON::true
|
|
});
|
|
|
|
push @NEEDS_POST_MANGLING, $t->{id} if $needs_post_mangling;
|
|
};
|
|
if ($@) {
|
|
print STDERR $@; # already logged
|
|
logmsg(SOFTFAIL => "Couldn't post $post_type #$id");
|
|
$num_errors++;
|
|
next;
|
|
}
|
|
|
|
# no need to lock here, other threads will never store that ID
|
|
$URLMAP{$r->{id}} = "/t/$t->{topic_slug}/$t->{topic_id}/$t->{post_number}";
|
|
|
|
if ($post_type eq "answer") {
|
|
# database surgery: set vote count on answers (skip
|
|
# questions and comments as Discourse doesn't show these)
|
|
if ((my $score = $r->{score}) > 0) {
|
|
$sth_set_vote_count->bind_param(1, $t->{id}, SQL_INTEGER);
|
|
$sth_set_vote_count->bind_param(2, $score, SQL_INTEGER);
|
|
my $r = $sth_set_vote_count->execute();
|
|
panic($r) unless $r == 1;
|
|
}
|
|
if (defined $attrs{accepted_answer_id} and $attrs{accepted_answer_id} == $id) {
|
|
# XXX rate-limiting is not configurable, need to comment out
|
|
# "limit_accepts" in plugins/discourse-solved/plugin.rb:accept
|
|
logmsg(INFO => " " x $INDENT, "accept answer #$id");
|
|
$INDENT++;
|
|
my $r = eval { discourse_req($attrs{accepted_by}, POST => "/solution/accept",
|
|
{ id => $t->{id} }) };
|
|
if ($@) {
|
|
print STDERR $@; # already logged
|
|
logmsg(SOFTFAIL => "Couldn't accept answer #$id");
|
|
} elsif (!defined $r->{success} or $r->{success} ne "OK") {
|
|
panic(JSON::->new->utf8->encode($r));
|
|
}
|
|
$INDENT--;
|
|
}
|
|
}
|
|
|
|
# nested comments
|
|
$num_errors += replies(
|
|
language_code => $attrs{language_code}
|
|
, topic_id => $t->{topic_id}
|
|
, reply_to_post_number => $t->{post_number}
|
|
, replies => $r->{comment_ids}
|
|
) if @{$r->{comment_ids}};
|
|
}
|
|
return $num_errors;
|
|
}
|
|
|
|
|
|
#######################################################################
|
|
# Initialisation
|
|
|
|
logmsg(NOTICE => "Starting $SCRIPT_FILENAME");
|
|
my $TAG_GROUP_LIBO_COMPONENT = "LibreOffice component";
|
|
|
|
# Categories
|
|
useragent_init();
|
|
{
|
|
my $path = dirname($SCRIPT_FILENAME) . "/categories.json";
|
|
open my $fh, "<", $path or panic("open($path): $!");
|
|
my $categories = JSON::->new->utf8->decode( do { local $/ = undef; <$fh> } );
|
|
close $fh or panic("close: $!");
|
|
|
|
my %slugs;
|
|
my $r = discourse_req("system", GET => "/categories.json");
|
|
$slugs{$_->{slug}} = $_->{id} foreach @{$r->{category_list}->{categories}};
|
|
|
|
foreach my $c (@$categories) {
|
|
my $l = $c->{language_code} // panic();
|
|
$CATEGORY_IDs{$l} = { slug => $c->{slug} };
|
|
if (defined (my $id = $slugs{$c->{slug}})) {
|
|
# the category already exists
|
|
$CATEGORY_IDs{$l}->{id} = $id;
|
|
} else {
|
|
logmsg(NOTICE => "Creating category $c->{slug}");
|
|
my $r = discourse_req("system", POST => "/categories.json", {
|
|
name => $c->{name},
|
|
slug => $c->{slug},
|
|
custom_fields => { qa_enabled => JSON::true }
|
|
});
|
|
$CATEGORY_IDs{$l}->{id} = $r->{category}->{id} // panic();
|
|
}
|
|
}
|
|
|
|
$r = discourse_req("system", POST => "/tag_groups.json", {
|
|
tag_group => {
|
|
name => $TAG_GROUP_LIBO_COMPONENT,
|
|
tag_names => [ qw/common writer calc impress base draw math meta/ ],
|
|
permissions => { everyone => 1 },
|
|
one_per_topic => JSON::false
|
|
}
|
|
});
|
|
panic("Couldn't create group tag") unless defined $r->{tag_group};
|
|
|
|
# reorder (lexicographically) by slugs
|
|
logmsg(NOTICE => "Updating and ordering categories");
|
|
my (%categories_sorted, %positions);
|
|
my @categories_sorted = sort { $a->{slug} cmp $b->{slug} } @$categories;
|
|
foreach (my $i = 0; $i <= $#$categories; $i++) {
|
|
$categories_sorted{$categories_sorted[$i]->{slug}} = $i;
|
|
}
|
|
my $offset = 0;
|
|
$r = discourse_req("system", GET => "/categories.json");
|
|
foreach my $c (@{$r->{category_list}->{categories}}) {
|
|
my $s = $c->{slug} // panic();
|
|
my %h;
|
|
if ($s eq "uncategorized") {
|
|
$h{color} = "3AB54A";
|
|
$h{text_color} = "FFFFFF";
|
|
$positions{$c->{id}} = 0;
|
|
} elsif ($s eq "site-feedback") {
|
|
$positions{$c->{id}} = scalar(@$categories) + 1;
|
|
} elsif ($s eq "staff") {
|
|
$positions{$c->{id}} = scalar(@$categories) + 2;
|
|
} elsif ($s eq "lounge") {
|
|
$positions{$c->{id}} = scalar(@$categories) + 3;
|
|
} elsif (defined (my $i = $categories_sorted{$s})) {
|
|
$positions{$c->{id}} = $i + 1;
|
|
$h{num_featured_topics} = 1;
|
|
} else {
|
|
$positions{$c->{id}} = scalar(@$categories) + 4 + $offset;
|
|
$offset++;
|
|
}
|
|
next unless %h;
|
|
my $r = discourse_req("system", PUT => "/categories/$c->{id}.json", \%h);
|
|
panic("Couldn't update category") unless defined $r->{success} and $r->{success} eq "OK";
|
|
}
|
|
$r = discourse_req("system", POST => "/categories/reorder",
|
|
{ mapping => JSON::->new->encode(\%positions) });
|
|
panic("Couldn't reorder categories") unless defined $r->{success} and $r->{success} eq "OK";
|
|
}
|
|
|
|
# AskBot (and SSO linking information) user profiles
|
|
{
|
|
my $path = dirname($SCRIPT_FILENAME) . "/users.json";
|
|
logmsg(NOTICE => "Loading $path");
|
|
open my $fh, "<", $path or panic("open($path): $!");
|
|
my $profiles = JSON::->new->utf8->decode( do { local $/ = undef; <$fh> } );
|
|
close $fh or panic("close: $!");
|
|
|
|
# merge accounts with the same email address (into the account with lowest ID)
|
|
my (%emails, %duplicates);
|
|
foreach my $id (keys %$profiles) {
|
|
my $p = $profiles->{$id} // panic();
|
|
# `lc` is a bit YOLO but Discourse matches email addresses case-insensitively...
|
|
my %e = ( lc($p->{email}) => 1 );
|
|
$e{lc($_)} = 1 foreach @{$p->{sso_emails} // []};
|
|
foreach (keys %e) {
|
|
$emails{$_} //= [];
|
|
push @{$emails{$_}}, $id;
|
|
}
|
|
}
|
|
foreach my $e (keys %emails) {
|
|
my $ids = $emails{$e} // panic();
|
|
next unless $#$ids > 0;
|
|
my ($id, @duplicates) = sort {$a <=> $b} @$ids;
|
|
$duplicates{$id} = \@duplicates;
|
|
}
|
|
foreach my $id (sort {$a <=> $b} keys %duplicates) {
|
|
my $dups = $duplicates{$id};
|
|
my $username = $profiles->{$id}->{username} // panic();
|
|
logmsg(INFO => "Merging user(s) ", join (", ", map {"#$_"} @$dups),
|
|
" with #$id ($username)");
|
|
$ASKBOT_DUPLICATES{$_} = $id + 0 foreach @$dups;
|
|
}
|
|
|
|
my @reserved_usernames = # 'reserved_usernames' setting
|
|
qw/admin moderator administrator mod sys system community info
|
|
you name username user nickname discourse discourseorg
|
|
discourseforum support/;
|
|
|
|
my %usernames;
|
|
my @profile_ids = sort {$a <=> $b} keys %$profiles;
|
|
|
|
# given users #1 "FooBar" and #2 "whatever" with SSO username "foobar", #1 and #2
|
|
# are respectively renamed to "foobar1" and "foobar"; we assume users not in SSO
|
|
# aren't that active in the project and allow hijacking usernames for SSO users
|
|
$usernames{lc($profiles->{$_}->{sso_username})} //= $ASKBOT_DUPLICATES{$_} // $_
|
|
# no need to add the version without `lc` here, SSO usernames are case insensitive
|
|
foreach grep { defined $profiles->{$_}->{sso_username} } @profile_ids;
|
|
|
|
# AskBot doesn't have a username requirement but Discourse does (and usernames are matched
|
|
# case-insensitively); we try to preserve users on a first come first served basis: given users
|
|
# #1 "Foo", #2 "foo", #3 "bar", #4 "bar" then "foo" and "bar" are respectively attributed to #2
|
|
# (because the case was correct) and #3 while #1 and #4 are renamed "Foo1" and "bar1"
|
|
$usernames{$profiles->{$_}->{username}} //= $ASKBOT_DUPLICATES{$_} // $_ foreach @profile_ids;
|
|
$usernames{lc($profiles->{$_}->{username})} //= $ASKBOT_DUPLICATES{$_} // $_ foreach @profile_ids;
|
|
|
|
# populate %ASKBOT_PROFILES
|
|
foreach my $id (@profile_ids) {
|
|
my $p = $profiles->{$id} // panic();
|
|
my %p = ( username => $p->{username} );
|
|
$id += 0; # convert to integer
|
|
|
|
if (exists $ASKBOT_DUPLICATES{$id}) {
|
|
# keep AskBot username for @-addressing mangling but skip other attributes
|
|
$ASKBOT_PROFILES{$id} = \%p;
|
|
next;
|
|
}
|
|
|
|
# mangle AskBot/SSO username to comply with Discourse's stricter policy
|
|
# https://github.com/discourse/discourse/blob/master/app/models/username_validator.rb
|
|
my $username = $p->{sso_username} // $p->{username} // panic();
|
|
$username =~ s/^\@+//; # @foo -> foo
|
|
$username =~ s/\@.*//; # foo@example.net -> foo
|
|
$username =~ s/[^\p{Alnum}\p{Mark}._\-]//g; # UNICODE_INVALID_CHAR_PATTERN
|
|
$username =~ s/^[^\p{Alnum}\p{Mark}_]+//; # INVALID_LEADING_CHAR_PATTERN
|
|
$username =~ s/[^\p{Alnum}\p{Mark}]+$//; # INVALID_TRAILING_CHAR_PATTERN
|
|
$username =~ s/([\-_.])+/$1/g; # REPEATED_SPECIAL_CHAR_PATTERN
|
|
$username =~ s/\.(js|json|css|html?|xml|jpe?g|png|gif|bmp|ico|tiff?|woff2?)$/-$1/i; # CONFUSING_EXTENSIONS
|
|
$username = "unknown" if $username eq "";
|
|
|
|
# comparison is case insensitive
|
|
$username = "not-$username" if grep { lc($username) eq $_ } @reserved_usernames;
|
|
|
|
while (($usernames{lc($username)} // $id) != $id) {
|
|
# username is already taken, increment or append a number
|
|
$username .= "1" unless $username =~ s/([0-9]+)$/sprintf("%0*d", length($1), $1 + 1)/e;
|
|
}
|
|
|
|
$usernames{lc($username)} = $id;
|
|
$p{discourse_username} = $username;
|
|
$p{name} = $p->{name} if defined $p->{name} and $p->{name} ne "";
|
|
$p{email} = @{$p->{sso_emails} // []} ? $p->{sso_emails}->[0] : $p->{email};
|
|
$ASKBOT_PROFILES{$id} = \%p;
|
|
$DISCOURSE_PROFILES{$id} = shared_clone({});
|
|
}
|
|
|
|
# pre-existing users
|
|
$DISCOURSE_PROFILES{32931}->{user_id} = 1;
|
|
$DISCOURSE_PROFILES{32931}->{trust_level} = 1;
|
|
}
|
|
|
|
|
|
#######################################################################
|
|
# Import workers, each fetching work from a queue of questions
|
|
|
|
my $QUEUE = Thread::Queue::->new();
|
|
my ($TOPIC_lck, $CLOSE_lck) :shared;
|
|
sub import_questions($) {
|
|
$WORKER = shift;
|
|
useragent_init(); # re-init
|
|
|
|
my $conf_db = $CONF->{database} // panic();
|
|
my $dsn = "DBI:Pg:dbname=$conf_db->{dbname}; host=$conf_db->{host}";
|
|
$DBH = DBI::->connect( $dsn, $conf_db->{user} // "", $conf_db->{password} // "", {RaiseError => 1} );
|
|
|
|
my $sth_set_view_count = $DBH->prepare(q{
|
|
UPDATE topics SET views = ? WHERE id = ?
|
|
});
|
|
my $sth_set_closed_at_by = $DBH->prepare(q{
|
|
UPDATE posts SET created_at = to_timestamp(?)
|
|
WHERE topic_id = ? AND post_type != 1 AND action_code = 'closed.enabled'
|
|
});
|
|
|
|
my $elapsed = 0;
|
|
while (defined (my $q = $QUEUE->dequeue())) {
|
|
logmsg(INFO => "question #$q->{id}");
|
|
$INDENT = 1;
|
|
|
|
my @tags = grep # skip reserved names, cf. app/models/tag.rb
|
|
{ lc($_) ne "none" and lc($_) ne "constructor" }
|
|
# NOTE: discourse mangles tag names, in particular
|
|
# turns 'libreoffice-1.2.3' to 'libreoffice-123', cf.
|
|
# lib/discourse_tagging.rb:clean_tag()
|
|
@{$q->{tags}};
|
|
|
|
my $lang = $q->{language_code} // panic();
|
|
my $category = $CATEGORY_IDs{$lang} // panic($lang);
|
|
my $t0 = Time::HiRes::gettimeofday();
|
|
|
|
my ($author, $t);
|
|
eval {
|
|
$author = username($q->{author}->{id});
|
|
my $needs_post_mangling = mangle_post($author, $q);
|
|
|
|
# post topic
|
|
lock($TOPIC_lck);
|
|
$t = discourse_req($author, POST => "/posts.json", {
|
|
title => $q->{title}
|
|
, raw => $q->{text}
|
|
, category => $category->{id}
|
|
, tags => \@tags
|
|
, created_at => ts2iso8601($q->{added_at})
|
|
, skip_validations => JSON::true
|
|
});
|
|
push @NEEDS_POST_MANGLING, $t->{id} if $needs_post_mangling;
|
|
};
|
|
if ($@) {
|
|
print STDERR $@; # already logged
|
|
logmsg(SOFTFAIL => "Couldn't post question #$q->{id}");
|
|
next;
|
|
}
|
|
|
|
my $topic_id = $t->{topic_id} // panic();
|
|
my $topic_url = "/t/$t->{topic_slug}/$topic_id";
|
|
my $num_errors = 0;
|
|
|
|
# no need to lock here, other threads will never store that ID
|
|
$URLMAP{$q->{id}} = $topic_url;
|
|
|
|
# (recursively) process replies
|
|
$num_errors += replies(
|
|
language_code => $lang
|
|
, topic_id => $topic_id
|
|
, reply_to_post_number => $t->{post_number}
|
|
, replies => $q->{comment_ids}
|
|
) if @{$q->{comment_ids}};
|
|
|
|
$num_errors += replies(
|
|
language_code => $lang
|
|
, topic_id => $topic_id
|
|
# answers have null has reply_to_post_number attribute, see
|
|
# https://github.com/paviliondev/discourse-question-answer/blob/master/extensions/topic_extension.rb#L16
|
|
, reply_to_post_number => undef
|
|
, replies => $q->{answer_ids}
|
|
, accepted_answer_id => $q->{accepted_answer_id}
|
|
, accepted_by => $author # assume accepted by OP
|
|
) if @{$q->{answer_ids}};
|
|
|
|
if ($q->{closed} == JSON::true) {
|
|
logmsg(INFO => " " x $INDENT, "close question #$q->{id}");
|
|
$INDENT++;
|
|
my $id = $q->{closed_by}->{id} // panic();
|
|
my $username = eval { username($id) };
|
|
if ($@) {
|
|
print STDERR $@; # already logged
|
|
logmsg(SOFTFAIL => "Couldn't create user #$id");
|
|
} else {
|
|
{
|
|
my $p = $DISCOURSE_PROFILES{ $ASKBOT_DUPLICATES{$id} // $id } // panic();
|
|
lock(%$p); # only need to lock that user's profile
|
|
if ((my $tl = $p->{trust_level}) < 4) {
|
|
logmsg(DEBUG => "Temporarily promoting $username to TL4");
|
|
discourse_req("system", PUT => "/admin/users/$p->{user_id}/trust_level",
|
|
{ level => 4 });
|
|
$p->{trust_level} = 4;
|
|
$p->{real_trust_level} = $tl;
|
|
}
|
|
}
|
|
lock($CLOSE_lck);
|
|
my $r = discourse_req($username, PUT => "/t/$topic_id/status.json",
|
|
{ status=> "closed", enabled => "true" });
|
|
panic(JSON::->new->utf8->encode($r)) unless defined $r->{success} and $r->{success} eq "OK";
|
|
}
|
|
|
|
# database surgery: set closing date from AskBot
|
|
$sth_set_closed_at_by->bind_param(1, $q->{closed_at}, SQL_INTEGER);
|
|
$sth_set_closed_at_by->bind_param(2, $topic_id, SQL_INTEGER);
|
|
my $r = $sth_set_closed_at_by->execute();
|
|
panic($r) unless $r == 1;
|
|
$INDENT--;
|
|
}
|
|
|
|
if ((my $view_count = $q->{view_count}) > 0) {
|
|
# database surgery: set view count on the topic
|
|
$sth_set_view_count->bind_param(1, $view_count, SQL_INTEGER);
|
|
$sth_set_view_count->bind_param(2, $topic_id, SQL_INTEGER);
|
|
my $r = $sth_set_view_count->execute();
|
|
panic($r) unless $r == 1;
|
|
}
|
|
|
|
logmsg(WARN => "$topic_url (AskBot question #$q->{id}) has >=$num_errors missing replies!")
|
|
if $num_errors > 0;
|
|
$elapsed += Time::HiRes::gettimeofday() - $t0;
|
|
}
|
|
|
|
logmsg(INFO => sprintf("Worker stats: total active time %.2fs", $elapsed));
|
|
logmsg(INFO => sprintf("AskBot stats: %d requests, %.2fs total", $ASKBOT_COUNT, $ASKBOT_TIME));
|
|
logmsg(INFO => sprintf("Discourse stats: %d requests, %.2fs total", $DISCOURSE_COUNT, $DISCOURSE_TIME));
|
|
}
|
|
|
|
logmsg(NOTICE => "Spawning $NUM_WORKERS workers");
|
|
$QUEUE->limit = 4 * $NUM_WORKERS; # no need to prefetch pages if the workers can't keep up
|
|
threads->create(\&import_questions, $_) for 1 .. $NUM_WORKERS;
|
|
|
|
|
|
#######################################################################
|
|
# Queue up questions
|
|
# we process them ordered by ID so lower question IDs get lower topic IDs
|
|
|
|
#{
|
|
# my %question_ids = (
|
|
# 161330 => "en" # @-addressing with old username
|
|
# , 238673 => "en" # closed
|
|
# , 302629 => "en" # upvote, accepted answer
|
|
# , 303050 => "en" # screenshots, attachments (both with C:\fakepath\ and without)
|
|
# , 285734 => "ja" # URL to profile, unicode attachements
|
|
# , 161228 => "en" # question linked in the above, OneBox
|
|
# , 304617 => "en" # closed by someone with TL < 4
|
|
# , 161334 => "en" # Hello1 -> Hello2
|
|
# # duplicate accounts
|
|
# , 18673 => "en", 1505 => "en"
|
|
# , 25329 => "pt-br", 128011 => "pt-br"
|
|
# );
|
|
#
|
|
# foreach my $id (sort {$a <=> $b} keys %question_ids) {
|
|
# my $lang = $question_ids{$id};
|
|
# my $url = "/$lang/api/v1/questions/$id/";
|
|
# my $q = askbot_get($url);
|
|
# $q->{language_code} = $lang;
|
|
# $QUEUE->enqueue($q);
|
|
# }
|
|
#}
|
|
|
|
my $PAGE_NUM = 0; # number of most recent pages to process (0 means ALL)
|
|
my %PAGES;
|
|
foreach my $lang (keys %CATEGORY_IDs) {
|
|
my $url = "/$lang/api/v1/questions/";
|
|
|
|
my $offset = 1; # index of the first page to consider
|
|
unless ($PAGE_NUM == 0) {
|
|
my $h = askbot_get($url);
|
|
$offset = $h->{pages} - $PAGE_NUM + 1 if $PAGE_NUM < $h->{pages};
|
|
}
|
|
|
|
my $h = askbot_get("$url?sort=age-asc&page=$offset");
|
|
my @questions = sort { $a->{id} <=> $b->{id} } @{$h->{questions}}; # should be sorted already
|
|
logmsg(NOTICE => "Loaded /$lang/questions page $offset/$h->{pages}");
|
|
|
|
# buffer the page
|
|
$PAGES{$lang} = {
|
|
current_page => $offset
|
|
, pages => $h->{pages}
|
|
, questions => \@questions
|
|
, url => $url
|
|
} if @questions;
|
|
}
|
|
|
|
#my $COUNT = 1000;
|
|
MAINLOOP: while (%PAGES) {
|
|
# find language code with the lowest question ID
|
|
my ($alang, $blang) = sort { $PAGES{$a}->{questions}->[0]->{id} <=> $PAGES{$b}->{questions}->[0]->{id} }
|
|
keys %PAGES;
|
|
my $p = $PAGES{$alang};
|
|
|
|
# process questions from $alang that have a lower ID than any other language
|
|
# (ie, until we reach the first question from $blang)
|
|
my $questions = $p->{questions};
|
|
my $bound = defined $blang ? $PAGES{$blang}->{questions}->[0]->{id} : -1;
|
|
while ($bound < 0 or $questions->[0]->{id} < $bound) {
|
|
my $q = shift @$questions;
|
|
$q->{language_code} = $alang;
|
|
$QUEUE->enqueue($q);
|
|
#last MAINLOOP unless --$COUNT > 0;
|
|
next if @$questions; # $alang has some questions buffered still
|
|
|
|
unless ($p->{current_page} < $p->{pages}) {
|
|
logmsg(NOTICE => "Done queuing /$alang/questions");
|
|
delete $PAGES{$alang};
|
|
last;
|
|
}
|
|
|
|
# $alang's highest buffered question ID was lower that $blang's lowest, need to load another page
|
|
my $offset = ++$p->{current_page};
|
|
my $h = askbot_get($p->{url} . "?sort=age-asc&page=$offset");
|
|
logmsg(NOTICE => "Loaded /$alang/questions page $offset/$h->{pages}");
|
|
$p->{pages} = $h->{pages}; # update page count, there might be new posts
|
|
|
|
@$questions = sort { $a->{id} <=> $b->{id} } @{$h->{questions}}; # should be sorted already
|
|
}
|
|
}
|
|
|
|
|
|
#######################################################################
|
|
# Done!
|
|
|
|
$QUEUE->end();
|
|
$_->join() foreach threads->list();
|
|
logmsg(NOTICE => "Done importing");
|
|
|
|
logmsg(NOTICE => "Updating categories");
|
|
foreach my $c (values %CATEGORY_IDs) {
|
|
my $r = discourse_req("system", PUT => "/categories/$c->{id}.json", {
|
|
num_featured_topics => 3 # restore default
|
|
, email_in => "question+$c->{slug}\@ask.libreoffice.org"
|
|
, required_tag_group_name => $TAG_GROUP_LIBO_COMPONENT
|
|
, min_tags_from_required_group => 1
|
|
, allow_global_tags => "true"
|
|
});
|
|
panic("Couldn't update category") unless defined $r->{success} and $r->{success} eq "OK";
|
|
}
|
|
|
|
{
|
|
# generate an AskBot question/answer/comment ID -> Discourse path for nginx rewriting
|
|
my $pathmap = dirname($SCRIPT_FILENAME) . "/nginx.map";
|
|
if (open my $fh, ">", $pathmap) {
|
|
printf $fh "%-6d %s;\n", $_, $URLMAP{$_} foreach sort {$a <=> $b} keys %URLMAP;
|
|
} else {
|
|
logmsg(WARN => "Couldn't open $pathmap for writing");
|
|
}
|
|
}
|
|
|
|
|
|
#######################################################################
|
|
# Post processing
|
|
|
|
sub post_process($) {
|
|
$WORKER = shift;
|
|
while (defined (my $x = $QUEUE->dequeue())) {
|
|
my $action = $x->{action} // panic();
|
|
if ($action eq "mangle_post") {
|
|
# try again to mangle internal links now that we have the full URL map
|
|
my $id = $x->{id} // panic();
|
|
logmsg(INFO => "Post-mangling post #$id");
|
|
my $p = discourse_req("system", GET => "/posts/$id.json");
|
|
|
|
my $modified = mangle_post_internal_links($id => \$p->{raw}, 1);
|
|
unless ($modified) {
|
|
logmsg(INFO => "Not updating unchanged post #$id");
|
|
next;
|
|
}
|
|
discourse_req("system", PUT => "/posts/$id.json", {
|
|
raw => $p->{raw}
|
|
, edit_reason => "Fix internal links (post AskBot migration)."
|
|
# XXX the below requires patching app/controllers/posts_controller.rb
|
|
, skip_validations => JSON::true
|
|
, no_bump => JSON::true
|
|
});
|
|
}
|
|
elsif ($action eq "trust_level") {
|
|
# reset TL back to its original value (before we raised it to close the issue)
|
|
my $id = $x->{id} // panic();
|
|
my $p = $DISCOURSE_PROFILES{$id} // panic($id); # never a duplicate
|
|
my $username = $ASKBOT_PROFILES{$id}->{discourse_username} // panic($id);
|
|
my $level = $p->{real_trust_level} // panic();
|
|
logmsg(INFO => "Demoting $username back to TL$level");
|
|
discourse_req("system", PUT => "/admin/users/$p->{user_id}/trust_level", { level => $level });
|
|
}
|
|
else {
|
|
panic($action);
|
|
}
|
|
}
|
|
}
|
|
|
|
my @FIX_TRUST_LEVELS = grep { defined $DISCOURSE_PROFILES{$_}->{real_trust_level} }
|
|
keys %DISCOURSE_PROFILES;
|
|
if (@NEEDS_POST_MANGLING or @FIX_TRUST_LEVELS) {
|
|
logmsg(NOTICE => "Starting post-processing");
|
|
$QUEUE = Thread::Queue::->new();
|
|
threads->create(\&post_process, $_) for 1 .. $NUM_WORKERS;
|
|
|
|
$QUEUE->enqueue({ id => $_, action => "mangle_post" }) foreach @NEEDS_POST_MANGLING;
|
|
$QUEUE->enqueue({ id => $_, action => "trust_level" }) foreach @FIX_TRUST_LEVELS;
|
|
$QUEUE->end();
|
|
$_->join() foreach threads->list();
|
|
}
|
|
|
|
|
|
#######################################################################
|
|
logmsg(NOTICE => "Finished!");
|