#!/usr/bin/env perl
use HTML::SimpleLinkExtor;
use File::Find qw(find);
use URI::Escape;
use warnings;

#$Id: linkextract.pl,v 1.4 2013/10/21 10:58:00 libox Exp $

# Command-line arguments (all three are required):
#   1. directory holding the exported static HTML
#   2. path of the file list to create for rsync --files-from
#   3. medium the list is built for (e.g. "dvd.de", "cd_windows.de")
my $dir        = shift;
my $rsynclist  = shift;
my $medium     = shift;
my $source_dir = '/var/www/sites/libreofficebox.org/';
my $file;
my $version;

die "no dir given" unless ($dir);
die "no filename for rsync given" unless ($rsynclist);
die "no medium given" unless ($medium);

# Each medium maps to the name of the file (below LO_installer_unpacked)
# whose first line holds the LibreOffice version to ship.
my %version_file_for = (
    'dvd.de'                => 'dvd',
    'cd_windows.de'         => 'dvd',
    'dvd36.de'              => 'dvd',
    'dvd-betrieb.de'        => 'dvd-betrieb',
    'cd_windows-betrieb.de' => 'dvd-betrieb',
);

# Media that ship the Windows installer only (no Linux packages).
my %is_windows_only = map { $_ => 1 } qw(cd_windows.de cd_windows-betrieb.de);

# Refuse unknown media up front: without a version file there is no version,
# and every installer path written to the rsync list would be wrong.
$file = $version_file_for{$medium};
die "unknown medium '$medium' (expected one of: "
    . join(', ', sort keys %version_file_for) . ")\n"
    unless defined $file;

my $version_path = '/home/libox/cms/assets/LO_installer_unpacked/' . $file;
open(my $version_fh, '<', $version_path)
    or die "cannot open $version_path: $!\n";
$version = <$version_fh>;
close $version_fh;

die "no version found in $version_path\n" unless defined $version;
# Strip the line ending (and any trailing blanks); otherwise the newline
# would be embedded in the paths below and split them into two entries.
$version =~ s/\s+\z//;
die "empty version in $version_path\n" unless length $version;

# Items not covered by link targets - separated by "\n".
my $installer_dir = 'assets/LO_installer_unpacked/' . $version;
my @extra_items   = ('assets/bin', $installer_dir . '/windows');
push @extra_items, $installer_dir . '/linux_deb/64/', $installer_dir . '/linux_deb/32/'
    unless $is_windows_only{$medium};
my $more_items = join("\n", @extra_items);



# $dir is the HTML directory as provided in the tarball from
# StaticExporter (exported with a relative base dir).
# $rsynclist is the file that will contain all the used files.
# Only files whose path matches this pattern are scanned for links.
my $pattern = 'html$';

# Link targets seen so far, stored as hash keys:
#   %assets - URI-unescaped links that start with "assets"
#   %themes - theme directory names taken from "themes/<name>/..." links
my (%assets, %themes);

# File::Find "wanted" callback: parses the current file with
# HTML::SimpleLinkExtor and records every asset link and theme name it
# references in %assets / %themes. Files not matching $pattern are skipped.
# Prints the file name and each raw link to STDOUT as it goes.
sub collect_links {
	/$pattern/ or return;

	my $html_file = $File::Find::name;
	my $extractor = HTML::SimpleLinkExtor->new();
	$extractor->parse_file($html_file);
	print "$html_file\n";

	my @found = $extractor->links;
	for my $link (@found) {
		print "link: $link\n";

		# Drop an optional leading "/" and any run of leading "xx/" segments.
		# NOTE(review): the dots are unescaped, so besides "../" this also
		# strips any other two-character directory (e.g. "de/") - confirm
		# whether that is intended before tightening it to "\.\.".
		$link =~ s/^\/?(..\/)*//;

		if ($link =~ /^assets/) {
			my $decoded = uri_unescape($link);
			$assets{$decoded} = 1;
		}

		# The substitution reduces "themes/<name>/..." to just "<name>".
		if ($link =~ s/^themes\/([^\/]+)\/.*/$1/) {
			$themes{$link} = 1;
		}
	}
}

# Walk the exported HTML tree; collect_links fills %assets and %themes.
# no_chdir keeps $_ equal to the full path while the callback runs.
find({ wanted => \&collect_links, no_chdir => 1 }, $dir);
print "Number of referenced files from assets: " . scalar(keys %assets) . "\n";

# Write the rsync file list. Keys are sorted so repeated runs over the same
# input produce an identical list.
open(my $include_fh, '>', $rsynclist)
    or die "cannot open $rsynclist for writing: $!\n";
for my $asset (sort keys %assets) {
    print {$include_fh} "$asset\n";
}
print "Used theme(s):\n";
for my $theme (sort keys %themes) {
    print "\t$theme\n";
    # Only the css and images directories of a theme are copied.
    print {$include_fh} "themes/$theme/css\n";
    print {$include_fh} "themes/$theme/images\n";
}

# Extra items that no link points at; terminate the last line as well.
print {$include_fh} "$more_items\n";

# Buffered write errors only surface on close, so check it.
close($include_fh) or die "cannot write $rsynclist: $!\n";

print STDERR "WARNING - MORE THAN ONE THEME!!\n" if keys(%themes) > 1;

print "now run the following command:\n";
print 'rsync -arv --dry-run --files-from=' . $rsynclist . " $source_dir " . $dir . "\n";
print "If everything looks OK, remove the --dry-run and run again to actually copy the files\n";
print "Extrahiert von: " . $medium . "\n";

