Mediawiki/OAI mirror/pragmatic synchronisation/wikisync perl code

From The Science Media Network

Jump to: navigation, search

WARNING! This is extremely crude 'in-production' code, provided for illustration only. DO NOT USE!

This is terribly written, proof of concept code.

The point is merely to show that you can do pragmatic mediawiki synchronisation in relatively few lines, and that not much more is needed to do this properly. See Mediawiki/OAI mirror/pragmatic synchronisation for more discussion.

You'll need to also use the Mediawiki/OAI mirror/pragmatic synchronisation/checkpoint code, and amend the relevant line in the script below to point to it. I.e. in the script below you need to set these variables:

$getcheckpoint = ".... point to checkpoint script .... see above ... ";
$oaiextension = "......";
$pathtophp  = "......";

for the script to work.

#!/usr/bin/perl
 
# Not all of these are necessary:
use XML::RSS;
use Dumpvalue;
use XML::Simple;
use Data::Dumper;
use XML::LibXML;
use XML::LibXML::Iterator;
# use Tie::Hash;
use XML::LibXML::XPathContext;
# http://search.cpan.org/dist/XML-LibXML-XPathContext/XPathContext.pm
 
use MediaWiki::API;
 
# You need to set these three variables:
#   $getcheckpoint - the checkpoint shell script (it is run with `sh`)
#   $oaiextension  - directory the script changes into before running oaiUpdate.php
#   $pathtophp     - the php command used to run oaiUpdate.php
$getcheckpoint = ".... point to checkpoint script .... see above ... ";
$oaiextension = "......";
$pathtophp  = "......";


# Ask the checkpoint script for the time of the last synchronisation and turn
# its output ("<label>: <date> <time>") into "<date>T<time>Z".
$checkpoint = `sh $getcheckpoint`;
$checkpoint = '' unless defined $checkpoint;
$checkpoint =~ s/\n//;       # drop the first newline
$checkpoint =~ s/.*\: //;    # drop everything up to and including the last ": "

# Without this check a failing checkpoint script would silently produce the
# checkpoint "Z", and the rest of the run would work from a bogus timestamp.
die "Could not obtain a checkpoint from '$getcheckpoint' (no output).\n"
    if $checkpoint !~ /\S/;

$checkpoint =~ s/ /T/;       # the first space separates date and time
$checkpoint .= "Z";
print $checkpoint."\n";
 
# The client only ever reports changes since the checkpoint. We should
# probably record the revision id on client and server as well (TODO). The
# source always reports the last changed page, so we have a duplicate (TODO).
 
# TODO: Do something about files as well.
 
# Collect the pages the source wiki reports as changed since the checkpoint.
print "\n************* SOURCE *****************\n";
%pages  = get_updates("http://localhost:8888/wikimirror/source/index.php", $checkpoint);
%pages1 = %pages;    # copy of the source pages, taken before the checkpoint page is dropped

for my $name (keys %pages) {
    my $stamp = $pages{$name}{timestamp};
    print "$name ($pages{$name}{id}, $stamp)\n";

    # The source always returns the page at the checkpoint as well.
    # Record this, so that it can be removed later.
    next if $stamp ne $checkpoint;
    $checkpointmatch++;
    $checkpointpage = $name;
}

if ($checkpointmatch == 1) {
    print "REMOVING $checkpointpage\n";
    delete $pages{$checkpointpage};
} elsif ($checkpointmatch != 0) {
    # More than one page carries the checkpoint timestamp.
    print "There are no or several checpoint pages. $checkpointmatch\n";
    die("This isn't implemented yet.\n");
}

# Titles changed on the source (checkpoint page excluded).
@array1 = keys %pages;
 
# Collect the pages the mirror (client) wiki reports as changed.
# No timestamp is passed in this call.
# NOTE(review): get_updates keeps its "from=" argument in a package global, so
# the value set by the earlier source query appears to be reused here - confirm
# that this is intended.
print "\n************* CLIENT *****************\n";
%pages = get_updates("http://localhost:8888/wikimirror/mirror/index.php");
print "$_ ($pages{$_}{id}, $pages{$_}{timestamp})\n" for keys %pages;

# Titles changed on the mirror.
@array2 = keys %pages;
 
# Classify the locally changed pages. get_intersection reports its results in
# the globals @intersection and @difference.
print "\n************* COMPARISON *****************\n";

# Changed on both wikis: these need a merge.
get_intersection(\@array1, \@array2);
@possible_conflicts = @intersection;

# Changed on the mirror only: these can be pushed as they are.
get_intersection(\@possible_conflicts, \@array2);
@good_to_go = @difference;

for my $row (
    [ "Source = ", \@array1 ],
    [ "Mirror = ", \@array2 ],
    [ "Resolve= ", \@possible_conflicts ],
    [ "Good   = ", \@good_to_go ],
) {
    my ($label, $names) = @$row;
    print $label . join(", ", @$names) . "\n";
}
print "\n";
 
# Push local (mirror) edits to the source wiki before the mirror is refreshed.
#
# Pages edited only on the mirror (@good_to_go) are sent as they are. Pages
# edited on both wikis (@possible_conflicts) are merged with diff3 against the
# revision the source wiki had at the checkpoint, and the merge result is sent.
# %pages holds the mirror's pages here, %pages1 the source's. $conflicts is
# set to 1 when any merge reports a conflict; it is read after this block.
if (!@array2) {
    print "No local updates - proceed with mirror.\n";
} else {
    login();
    print "THERE ARE LOCAL UPDATES. I'll push these first.\n";
    if (!@possible_conflicts) {
        print "\tThere are no concurrent edits and thus no conflicts.\n";
    } else {
        print "\tThere are concurrent edits. Try to resolve automatically, or flag conflict.\n";
    }

    foreach my $title (@good_to_go) {
        # Get the local page, and send it!
        print "\tNon_Conflict\tSEND $title ($pages{$title}{id}, $pages{$title}{timestamp})\n";
        # This is not correct: we should pass the timestamp from the source
        # page. However, this might work if the timestamp doesn't have to
        # match exactly.
        # The local id and timestamp are recorded in the edit summary.
        # TODO: make sure that the edit was received by the source.
        my $summary = "ts=$pages{$title}{timestamp}, id=$pages{$title}{id}, conflict=none";
        send_page($title, $pages{$title}{timestamp}, $pages{$title}{text}, $summary);
    }

    # Write $content to $path, dying on any failure so that a missing file can
    # never be mistaken for an empty page later on.
    # NOTE(review): no encoding layer is applied, as in the original code -
    # confirm how non-ASCII page text should be handled.
    my $write_file = sub {
        my ($path, $content) = @_;
        open my $out, '>', $path or die "Cannot write $path: $!\n";
        print {$out} defined $content ? $content : '';
        close $out or die "Cannot close $path: $!\n";
    };

    # Return the whole content of $path.
    my $read_file = sub {
        my ($path) = @_;
        open my $in, '<', $path or die "Cannot read $path: $!\n";
        local $/;
        my $content = <$in>;
        close $in;
        return defined $content ? $content : '';
    };

    # Run "diff3 -m $mode" on the three copies of $file, writing the merge to
    # $outdir/$file. Returns the raw wait status: 0 = clean merge, 256 = diff3
    # exit code 1 (conflicts). Any other outcome means diff3 itself failed and
    # the output must not be uploaded.
    my $run_diff3 = sub {
        my ($mode, $file, $outdir) = @_;
        my $status = system "diff3 -m $mode 'temp_mirror/$file' 'temp_base/$file' 'temp_remote/$file' > '$outdir/$file'";
        die "diff3 $mode failed for $file (wait status $status)\n"
            if $status == -1 || ($status & 127) || ($status >> 8) > 1;
        return $status;
    };

    if (@possible_conflicts) {
        foreach my $dir (qw(temp_mirror temp_remote temp_base temp_out temp_out_res)) {
            next if -d $dir;
            mkdir $dir or die "Cannot create directory $dir: $!\n";
        }
    }

    my $conflict_no = 0;
    foreach my $title (@possible_conflicts) {
        print "\tConflict\tEXAMINE $title ($pages{$title}{id}, $pages{$title}{timestamp})\n";

        # Page titles may contain '/', quotes and other characters that are
        # unsafe in a file name or inside a shell command, so the temporary
        # files get a numbered, sanitised name instead of the raw title.
        (my $safe = $title) =~ s/[^A-Za-z0-9_-]/_/g;
        my $file = sprintf '%04d_%.100s', ++$conflict_no, $safe;

        $write_file->("temp_mirror/$file", $pages{$title}{text});
        $write_file->("temp_remote/$file", $pages1{$title}{text});

        # Retrieve the latest version before the checkpoint. If there is none,
        # the page has been co-created and the base is empty.
        # e.g. http://localhost:8888/wikimirror/source/api.php?action=query&pageids=1&prop=revisions&rvprop=timestamp|content&rvstart=2009-06-27T16:06:21Z&rvlimit=1
        my $reply = $mw->api( {
            action  => 'query',
            titles  => "$title",
            prop    => 'revisions',
            rvprop  => 'timestamp|content',
            rvstart => $checkpoint,
            rvlimit => 1 } )
            || die $mw->{error}->{code} . ': ' . $mw->{error}->{details};

        # One title was requested, so take whichever page entry came back
        # rather than assuming that its page id is 1.
        my ($page_info) = values %{ $reply->{query}{pages} || {} };
        my $base = '';
        if ($page_info && ref $page_info->{revisions} eq 'ARRAY' && @{ $page_info->{revisions} }) {
            my $content = $page_info->{revisions}[0]{'*'};
            $base = $content if defined $content;
        }
        $write_file->("temp_base/$file", $base);

        # We go for diff3 -E. However, you could do diff3 -e as well, and note
        # issues elsewhere. That would mean you lose some edits if you take no
        # action, but pages are always kept in a good state. Here we upload
        # the merged page (-E), followed by a resolved page (-e) with a note
        # that you should resolve and roll back.
        my $result  = $run_diff3->('-E', $file, 'temp_out');
        my $result1 = $run_diff3->('-e', $file, 'temp_out_res');
        $conflicts = 1 if $result != 0;
        print "RESOLUT: $result, $result1\n";

        my $text  = $read_file->("temp_out/$file");
        my $text1 = $read_file->("temp_out_res/$file");

        # This time we have the source timestamp: $pages1{$title}{timestamp}.
        my $purp = $result != 0
            ? ", purpose=use_this_revision_to_resolve_manually"
            : ", purpose=no_conflicting_edits";
        my $summary = "ts=$pages{$title}{timestamp}, id=$pages{$title}{id}, conflict=E/$result$purp";
        send_page($title, $pages1{$title}{timestamp}, $text, $summary);
        # Let's not send the regular page to the mirror - better if this comes
        # through the synchronisation. Need to change OAI to include the edit
        # summary. TODO
        #   send_page_m($title, $pages{$title}{timestamp}, $text, $summary);
        # TODO: We now need to notify the user who made the changes, e.g. via
        # their talk page.

        if ($result != 0) {
            $summary = "ts=$pages{$title}{timestamp}, id=$pages{$title}{id}, conflict=e/$result1, purpose=auto_removed_conflicts_USE_PREV_REV_TO_RESTORE";
            send_page($title, $pages1{$title}{timestamp}, $text1, $summary);
            send_page_m($title, $pages1{$title}{timestamp}, $text1, $summary);
        }
    }
}
# Finally run oaiUpdate.php from the OAI extension directory.
print "Now do update from repo...\n";
system("cd $oaiextension; $pathtophp oaiUpdate.php");

# $conflicts is set to 1 while pushing local edits if a merge had conflicts.
if ($conflicts == 1) {
    print "\nIMPORTANT: There were conflicts.\n"
        . "Please visit [[Category:Conflicts]] on either wiki and resolve them.\n";
}

exit;
 
# Create and log in the two API clients used by this script:
#   $mw  - talks to the source wiki's api.php
#   $mwm - talks to the mirror wiki's api.php
# Both are package globals. Dies with the API error code and details when a
# login fails.
sub login {
    my @targets = (
        [ \$mw,  'http://localhost:8888/wikimirror/source/api.php' ],
        [ \$mwm, 'http://localhost:8888/wikimirror/mirror/api.php' ],
    );

    for my $target (@targets) {
        my ($slot, $api_url) = @$target;

        # The global is assigned before the login attempt, as before.
        my $client = MediaWiki::API->new();
        $$slot = $client;
        $client->{config}->{api_url} = $api_url;
        $client->login( { lgname => 'WikiSysop', lgpassword => 'password' } )
            || die $client->{error}->{code} . ': ' . $client->{error}->{details};
    }
}
 
# Save a page on the mirror wiki through $mwm (the client set up by login).
#
# Arguments, in order:
#   $pagename  - title of the page to edit
#   $timestamp - passed as basetimestamp, to avoid edit conflicts
#   $text      - new page text
#   $summary   - edit summary
#
# Returns the result of the edit call; dies with the API error code and
# details when the edit fails.
sub send_page_m {
    # All four must be lexical: "my $a, $b" declares only $a, which used to
    # make this sub overwrite the caller's global $text, $timestamp, $summary.
    my ($pagename, $timestamp, $text, $summary) = @_;
    print "SUMM SUMM: $summary\n";
    my $result = $mwm->edit( {
        action        => 'edit',
        title         => $pagename,
        basetimestamp => $timestamp,    # to avoid edit conflicts
        summary       => $summary,
        text          => $text } )
        || die $mwm->{error}->{code} . ': ' . $mwm->{error}->{details};
    return $result;
}
# Save a page on the source wiki through $mw (the client set up by login).
#
# Arguments, in order:
#   $pagename  - title of the page to edit
#   $timestamp - passed as basetimestamp, to avoid edit conflicts
#   $text      - new page text
#   $summary   - edit summary
#
# Returns the result of the edit call; dies with the API error code and
# details when the edit fails.
sub send_page {
    # All four must be lexical: "my $a, $b" declares only $a, which used to
    # make this sub overwrite the caller's global $text, $timestamp, $summary.
    my ($pagename, $timestamp, $text, $summary) = @_;
    print "SUMM SUMM: $summary\n";
    my $result = $mw->edit( {
        action        => 'edit',
        title         => $pagename,
        basetimestamp => $timestamp,    # to avoid edit conflicts
        summary       => $summary,
        text          => $text } )
        || die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
    return $result;
}
 
 
 
 
# Compare two lists of strings.
#
# Arguments: two array references.
#
# Results are reported through three package globals, which are reset on
# every call:
#   @union        - every distinct element of either list
#   @intersection - elements present in both lists
#   @difference   - elements present in exactly one of the lists
# Elements are reported in order of first appearance. A value repeated within
# a single list counts only once for that list.
#
# Based on http://perl.active-venture.com/pod/perlfaq4-dataarrays.html
sub get_intersection {
    # Work on the references directly. The previous "my @array1, @array2;"
    # made only @array1 lexical, so the caller's global @array2 was overwritten.
    my ($first, $second) = @_;
    @union = @intersection = @difference = ();

    my %lists_containing;    # element => number of input lists that contain it
    my @order;               # distinct elements, in order of first appearance
    for my $list ($first, $second) {
        my %seen_here;
        for my $element (@{ $list || [] }) {
            next if $seen_here{$element}++;
            push @order, $element unless $lists_containing{$element}++;
        }
    }

    for my $element (@order) {
        push @union, $element;
        if ($lists_containing{$element} > 1) {
            push @intersection, $element;
        } else {
            push @difference, $element;
        }
    }
    return;
}
 
# Fetch the changed pages from a wiki's OAI repository.
#
# Arguments:
#   $wiki - URL of the wiki's index.php
#   $ts   - optional timestamp, sent to the repository as "from=$ts"
#
# Runs wget on "$wiki/Special:OAIRepository?verb=ListRecords..." and parses
# the reply. Returns a hash keyed by page title; each value is a hash with
# the keys text, id, timestamp and summary, taken from the page's revision.
# Dies when wget cannot be started or returns nothing.
sub get_updates {
    my ($wiki, $ts) = @_;

    # NOTE(review): $from is left as a package global, exactly as before, so a
    # call without $ts reuses the "from=" value of an earlier call. The mirror
    # query is made without a timestamp and may depend on this - confirm
    # before making $from lexical.
    $from = "from=$ts" if defined $ts && $ts ne "";
    my $repo = "$wiki/Special:OAIRepository?verb=ListRecords&metadataPrefix=mediawiki&$from";
    print $repo, "\n";

    # List-form pipe: the URL goes to wget as one argument, with no shell.
    open(my $wget, '-|', 'wget', '-O', '-', '-q', $repo)
        or die "Cannot run wget for $repo: $!\n";
    my $xml = do { local $/; <$wget> };
    close $wget;
    die "No data received from $repo\n" unless defined $xml && length $xml;

    my $parser = XML::LibXML->new;
    my $docu   = $parser->parse_string($xml);
    my $in     = XML::LibXML::XPathContext->new($docu);
    $in->registerNs('m', 'http://www.mediawiki.org/xml/export-0.3/');
    $in->registerNs('o', 'http://www.openarchives.org/OAI/2.0/');

    my %page;
    my %file;    # upload details; collected but not returned yet (TODO: files)
    foreach my $item ($in->findnodes("/o:OAI-PMH/o:ListRecords/o:record/o:metadata/m:mediawiki/m:page")) {
        # Note: the values are read through the XPath context, because
        # $item->findvalue doesn't work here.
        my $title = $in->findvalue("m:title", $item);

        # Page properties. Need revision / ts to update local register.
        # ISSUE: there is no summary returned in the OAI.... TODO
        $page{$title} = {
            text      => $in->findvalue("m:revision/m:text", $item),
            id        => $in->findvalue("m:revision/m:id", $item),
            timestamp => $in->findvalue("m:revision/m:timestamp", $item),
            summary   => $in->findvalue("m:revision/m:summary", $item),
        };

        # File properties:
        $file{$title} = {
            upload_src      => $in->findvalue("m:upload/m:src", $item),
            upload_ts       => $in->findvalue("m:upload/m:timestamp", $item),
            upload_filename => $in->findvalue("m:upload/m:filename", $item),
            upload_size     => $in->findvalue("m:upload/m:size", $item),
        };
    }
    return %page;
}
 
#<upload> 
#<timestamp>2009-06-20T12:25:18Z</timestamp> 
#<contributor><username>WikiSysop</username><id>1</id></contributor> 
#<comment/> 
#<filename>New_image.png</filename> 
#<src>http://localhost:8888/wikimirror/source/images/a/a6/New_image.png</src> 
#<size>7690</size> 
#</upload>
Create a book