#!/usr/bin/perl -w

=head1 NAME

   convert.pl - Perl script to convert Word documents to Mediawiki

=head1 SYNOPSIS

   convert.pl WORDFILE

=head1 DESCRIPTION

    Convert Word documents to Mediawiki.

    Depends on:
     A) OpenOffice with the "Sun Wiki Publisher" plugin
     B) OpenOffice must be running "headless":
        soffice -accept="socket,port=8100;urp;" -norestore -headless
     C) Python OpenDocument Converter (PyODConverter)
        http://www.artofsolving.com/opensource/pyodconverter
     D) pywikipediabot - to automatically log in and upload files
        http://pywikipediabot.sourceforge.net/

    Workflow: 
     1. Convert .doc to .wiki (the Mediawiki-page)
     2. Convert .doc to .xml (to get pictures)
     3. Decode and save base64 encoded pictures found in the .xml-file
     4. Polish the .wiki page (include images)
     5. Log in and upload the files to the mediawiki

=head1 AUTHOR

    Written by: Lars Strand <lars@redpill-linpro.no>

=cut

use strict;
use MIME::Base64; 
use File::Basename;

# Site-specific settings -- adjust these for your installation.

# Working directory; the tmp/ and converted/ directories live beneath it.
my $BASEDIR = '..';

# Where the PyODConverter script (DocumentConverter.py) can be found.
my $DocumentConverter = './DocumentConverter.py';

# Default category that is appended to each generated wiki page.
my $category = '[[Category:word2mediawiki]]';

# Directory holding the pywikipediabot scripts that this script runs
# (login.py, upload.py and pagefromfile.py).
my $pywikibot = './pywikipediabot';



# The Word file to convert is expected as the first command line argument.
@ARGV or die "Usage: $0 WORDFILE\n";
my $inputfile = $ARGV[0];

# Both the converter script and the input document must exist as plain
# files; the converter is checked first.
foreach my $required ($DocumentConverter, $inputfile) {
    -f $required or die "ERROR: Unable to read file: $required\n";
}

# Start OpenOffice headless, accepting connections on port 8100.
# OOo is smart enough to figure out if it's already running, so the
# command is executed unconditionally, without first checking whether an
# instance is up (in case this script is called several times).
# NOTE(review): system() waits for the command to finish, so this relies on
# soffice returning to the caller -- confirm for the installed version.
my $soffice_command = 'soffice -accept="socket,port=8100;urp;" -norestore -headless';
system($soffice_command);

# Make sure the two directories the script works in exist:
#   $datadir   - temp files (the .xml dump and not yet classified images)
#   $resultdir - output: the .wiki page and the extracted images
# A missing directory is created with mode 0700; the script dies if that
# fails.
my $datadir   = "$BASEDIR/tmp";
my $resultdir = "$BASEDIR/converted";

foreach my $workdir ($datadir, $resultdir) {
    next if -d $workdir;
    print "$workdir does not exists, creating ..\n";
    # Name the directory in the error so the failure can be traced to a path.
    mkdir($workdir, 0700) or die "$0: couldn't create dir $workdir; $!\n";
}

# Split the input path into base name, directory and extension. The
# trailing extension (e.g. ".doc") is not part of $outputname, and every
# space in the base name is turned into '_'.
my ($outputname, $path, $suffix) = fileparse($inputfile, qr/\.[^.]*/);
$outputname =~ tr/ /_/;

# Do the actual conversion: PyODConverter is run twice, once to produce the
# Mediawiki page in $resultdir and once to produce the .xml file in $datadir
# (the .xml carries the base64 encoded pictures).
print "#############################\n";
print "## Converting $inputfile to .wiki and .xml using soffice..\n";
print "#############################\n";

foreach my $target ("$resultdir/$outputname.wiki", "$datadir/$outputname.xml") {
    # OOo is noisy - point STDERR at /dev/null while the converter runs and
    # restore it afterwards.
    open(my $saved_stderr, '>&', \*STDERR)
        or die "ERROR: Unable to duplicate STDERR: $!\n";
    open(STDERR, '>', '/dev/null')
        or die "ERROR: Unable to redirect STDERR to /dev/null: $!\n";

    # List form of system(): the arguments are handed to python as-is, so
    # quotes, '$' or other shell metacharacters in the file names are harmless.
    my $status     = system('python', $DocumentConverter, $inputfile, $target);
    my $exec_error = $!;    # remember why the launch failed, if it did

    open(STDERR, '>&', $saved_stderr)
        or die "ERROR: Unable to restore STDERR: $!\n";
    close $saved_stderr;

    # Stop here on failure; otherwise a stale file from an earlier run could
    # be picked up and uploaded as if it were the fresh conversion.
    die "ERROR: Unable to run python: $exec_error\n" if $status == -1;
    die "ERROR: Conversion of $inputfile to $target failed (exit status "
        . ($status >> 8) . ")\n" if $status != 0;
}

# Read the .xml file back in as the first step of turning the base64
# encoded images it contains into actual image files.
# Based on a hack from jonas at redpill-linpro.no
#
# The XML file content is one long line. It is cut at every double quote,
# and all the resulting pieces are collected in @content.
my @content;
{
    my $xmlpath = "$datadir/$outputname.xml";
    open(my $xml_fh, '<', $xmlpath)
        or die "ERROR: Unable to open file $xmlpath: $!\n";

    while ( my $xml_line = <$xml_fh> ) {
        push @content, split(/\"/, $xml_line);
    }

    close $xml_fh;
}

# Match for base64, decode and save image.
#
# Every element of @content that carries a "data:image/*;base64," payload is
# decoded into a temporary file in $datadir, classified with file(1) and
# moved into $resultdir with a .jpg or .png extension.
#
# $counter numbers the pictures from 1 in the order they are found, and
# @imglist is indexed by that number (slot 0 is never filled). A slot stays
# undefined when a picture has no decodable data or is neither JPEG nor PNG,
# so the pictures that follow keep their position number.
my $counter = 1;
my @imglist;
foreach my $chunk (@content) {
    next unless $chunk =~ m/data:image\/\*\;base64/;

    # Only use the capture when the match for the payload itself succeeded.
    if ( $chunk =~ m/data:image\/\*\;base64\,([0-9a-zA-Z\+\/=]+)/ ) {
        my $picturedata = $1;
        my $tmpfile     = "$datadir/${outputname}_${counter}.tmp";

        open(my $img_fh, '>', $tmpfile)
            or die "ERROR: Unable to open file $tmpfile: $!\n";
        binmode $img_fh;    # image data is binary
        print {$img_fh} decode_base64($picturedata);
        close $img_fh
            or die "ERROR: Unable to write file $tmpfile: $!\n";

        # Determine what kind of image file this is. file(1) is run without
        # a shell (list-form pipe) and with -b, so its answer does not echo
        # the file name: a document name containing "JPEG" or "PNG" can no
        # longer influence the match below.
        my $filtype = '';
        if ( open(my $file_fh, '-|', 'file', '-b', '--', $tmpfile) ) {
            my $answer = <$file_fh>;
            $filtype = $answer if defined $answer;
            close $file_fh;
        }
        else {
            warn "WARNING: Unable to run file(1) on $tmpfile: $!\n";
        }

        my $extension;
        if    ( $filtype =~ /JPEG/ ) { $extension = 'jpg'; }
        elsif ( $filtype =~ /PNG/ )  { $extension = 'png'; }

        if ( defined $extension ) {
            # Move the picture to the result directory under its final name.
            my $imgname = "${outputname}_${counter}.$extension";
            rename($tmpfile, "$resultdir/$imgname")
                or die "ERROR: Unable to move $tmpfile to $resultdir/$imgname: $!\n";
            print "Converting image: $resultdir/$imgname\n";
            $imglist[$counter] = $imgname;
        }
        else {
            warn "WARNING: Picture $counter is neither JPEG nor PNG; left in $tmpfile\n";
        }
    }
    else {
        warn "WARNING: Picture $counter has no base64 data; skipped\n";
    }

    $counter++;
}

# Load the converted .wiki file into @wikipage, one element per line, so the
# image file names can be filled in before it is written back.
my @wikipage;
{
    my $wikifile = "$resultdir/$outputname.wiki";
    open(my $wiki_in, '<', $wikifile)
        or die "ERROR: Unable to open file $wikifile: $!\n";
    @wikipage = <$wiki_in>;
    close $wiki_in;
}

# The wiki-bot needs a start and an end "tag" around the page for import.
# The start tag goes first, followed by the page title, which we want to be
# the same as the file name.
my $page_header = "XZXZ42\n''' $outputname '''\n";
unshift @wikipage, $page_header;

# At the end: the default category, a link to the original file (directories
# stripped from its name) and the end tag.
my $basefilename = fileparse($inputfile);
my $page_footer  = "$category\n\nConverted from: [[File:$basefilename]]\n\nYZYZ42\n";
push @wikipage, $page_footer;

# Modify and write the wiki page.
#
# Each empty "[[Image:]]" placeholder is filled with the next entry of
# @imglist, counting from slot 1 in order of appearance. A placeholder whose
# slot has no image (more placeholders than extracted pictures, or a picture
# that was not saved) is left untouched and reported with a warning.
$counter = 1;
{
    my $wiki_path = "$resultdir/$outputname.wiki";
    open(my $wiki_out, '>', $wiki_path)
        or die "ERROR: Unable to open file $wiki_path: $!\n";

    my $filled_in_line = 0;

    # Returns the replacement text for one placeholder and moves on to the
    # next image slot.
    my $next_image_tag = sub {
        my $slot  = $counter++;
        my $image = $imglist[$slot];
        if ( defined $image ) {
            $filled_in_line++;
            return "[[Image:$image]]";
        }
        warn "WARNING: No image available for [[Image:]] placeholder number $slot; placeholder left unchanged\n";
        return '[[Image:]]';
    };

    foreach my $line (@wikipage) {
        $filled_in_line = 0;

        # One left-to-right pass per line: replaced text is never scanned
        # again, so an unfilled placeholder cannot make this loop forever.
        $line =~ s/\[\[Image\:\]\]/$next_image_tag->()/ge;

        print "Rewrote wiki page with new Image tag: $line\n" if $filled_in_line;

        print {$wiki_out} $line;
    }

    # Buffered write errors only show up at close time.
    close $wiki_out
        or die "ERROR: Unable to write file $wiki_path: $!\n";
}

# Report that the conversion step is finished and where the page was written.
print "#############################\n";
print "## Conversion complete: $resultdir/$outputname.wiki\n";
print "#############################\n";

# Upload files to mediawiki using "pywikipediabot"
# (each banner line ends with a newline so the heading stands on its own line)
print "###########################\n";
print "## Log in and upload\n";
print "###########################\n";

# Log in by running pywikipediabot's login.py with -force -all.
# The commands below use the list form of system(): no shell is involved, so
# characters such as quotes, '$' or backticks in a file name are passed
# through literally instead of being interpreted.
print "## Logging in\n";
my $login_status = system('python', "$pywikibot/login.py", '-force', '-all');
warn "WARNING: login.py did not finish cleanly (status $login_status)\n"
    if $login_status != 0;

# Upload the original Word document
print "## Uploading: $inputfile\n";
print "## Exec: python $pywikibot/upload.py -noverify -keep \"$inputfile\" \"File uploaded by word2mediawiki (BOT)\" \n";
my $doc_upload_status = system(
    'python', "$pywikibot/upload.py", '-noverify', '-keep',
    $inputfile, 'File uploaded by word2mediawiki (BOT)',
);
warn "WARNING: upload of $inputfile did not finish cleanly (status $doc_upload_status)\n"
    if $doc_upload_status != 0;


# Upload pictures
shift(@imglist); # Remove imglist[0], since it's empty (we started the imglist at 1)

foreach my $img (@imglist) {
    # A slot is undefined when no picture was saved for that position;
    # there is nothing to upload for it.
    next unless defined $img;

    print "## Uploading: $img\n";
    print "## Exec: python $pywikibot/upload.py -noverify -keep $resultdir/$img \"Image uploaded by word2mediawiki (BOT)\" \n";

    # List form of system(): the image path reaches upload.py as one
    # argument, whatever characters the file name contains.
    my $img_upload_status = system(
        'python', "$pywikibot/upload.py", '-noverify', '-keep',
        "$resultdir/$img", 'Image uploaded by word2mediawiki (BOT)',
    );
    warn "WARNING: upload of $img did not finish cleanly (status $img_upload_status)\n"
        if $img_upload_status != 0;
}

# Upload the wiki page. pagefromfile.py is given the start and end tags that
# were wrapped around the page, and the path of the .wiki file.
print "## Uploading the wiki page\n";
print "## Exec: python $pywikibot/pagefromfile.py -start:XZXZ42 -end:YZYZ42 -safe -file:$resultdir/$outputname.wiki\n";

# List form of system(): the -file: argument is passed as a single word even
# if the page name contains characters the shell would treat specially.
my $page_upload_status = system(
    'python', "$pywikibot/pagefromfile.py",
    '-start:XZXZ42', '-end:YZYZ42', '-safe',
    "-file:$resultdir/$outputname.wiki",
);
warn "WARNING: pagefromfile.py did not finish cleanly (status $page_upload_status)\n"
    if $page_upload_status != 0;

# done and done
print "## Conversion and upload complete\n";
