Planning:KR Import Script - clc-wiki

#! /usr/bin/perl
 
# DESCRIPTION
# This script imports the KR2 Exercise solutions on Richard Heathfield's old
# unmaintained site into a mediawiki.
 
# WHAT IT DOES
# The main structural things it does are:
# * wikify the html tags and replace symbols like &amp
# * remove the colour highlighting from the source code since the wiki has 
#   its own colouriser
# * convert most of the "me", "myself", etc into Richard Heathfield and 
#   generally try to objectify the writing
# * add links to solution category numbers description page whereever they're 
#   mentioned
# It imports all content with a few exceptions: the Maintenance History, 
# Contributors, Bugs and Fixes, Copyright Issues, and Unsolved exercises pages 
# are ommitted; and the Naming Conventions page is rewritten.  One paragraph 
# is known to be skipped in an exercise solution - it was a reminder to add a 
# solution.
#
# The main non-structural functionality is a comparison between the source code
# within files in the zip archive and the source code within the html pages to
# identify any missing/different code in either.
 
# QUICKSTART
# Set the variables below.  The essential ones are $wikiuser and $wikipw; also
# make sure $browser exists or is set to an empty variable, $diffagent exists 
# or is a program that always returns true; and that the temporary directories
# and files are appropriate.
# Make sure that Perl is installed as well as the libwww-perl package on 
# which this script depends; you must be connected to the internet when 
# running the script as it downloads from RJH's original site.
 
# VERSION: 0.1.1 (the version of the script actually run against the live wiki)
# DATE  : 2 January 2006
# AUTHOR: netocrat@dodo.com.au
# HISTORY 
# *  2 Jan 2006: changed naming convention for all generated wiki pages to
#                prefix KR2_ and then ran the script against the wiki.
# * 28 Dec 2005: reworded a few comments prior to posting to the wiki;
#                the code itself is identical to the original version.
# * 16 Oct 2005: first release; emailed to RJH, MG and GP.
 
# CAVEATS
# It's very ad-hoc, certified organic and not designed for robust extension.
# Line width is occasionally much longer than 80 characters; style is very much
# "one big script" rather than well-structured into small functions.
# It's also inefficient - using multiple regexp parses of the same strings but
# ease of scripting and expression is favoured over the probably unnoticable 
# speed loss.
# Also there's little parsing of returned HTML pages for error messages - 
# that's left up to the user (set $browser below) - error handling's generally
# incomplete.
# Finally there's some lack of elegance - it's been a while since I've scripted
# Perl and I'm a bit out of touch with the subtleties esp. regexps.
 
use Cwd 'abs_path';
 
## BEGIN CONFIGURABLE VARIABLES
my $baseurl = "http://clc.flash-gordon.me.uk/mediawiki/index.php";
my $wikiuser = "TestUser";
my $wikipw = "test";
# No sanity check that $diffagent returns proper result codes is performed
# so make sure that it works!  The script will break if not - evidenced by a 
# slew of messages saying "unamtched code block".
my $diffagent = "diff -Bb";	# Path to diff command (including arguments) 
				# that compares ignoring whitespace and empty 
				# lines, returning 0 on difference and non-zero
				# on identical files.
				# If you don't care to use this functionality;
				# simply use a command that always returns true.
my $unzipagent = "unzip";	# Path to unzip command
my $httpdownloadagent = "wget";
my $dosubmit = 1;	# If true, submit pages to the wiki;
			# else do not submit pages i.e. a dry-run or for 
			# diff'ing
my $submitpermholder = 1; # If true, a "waiting permission from..." page will
			# be submitted to the wiki for pages containing 
			# content from authors whose permission has not yet
			# been obtained.  The page will not be recorded in the 
			# config file as having been submitted to the wiki.
my $verbose = 0;	# If true, show extra information (implies 
			# $summaryverbose and $showskippedsubmits)
my $summaryverbose = 0;	# If true, show all summary information (incl. missing
			# author permissions for a page, matching and unmatching 
			# blocks/source files for each exercise)
my $showskippedsubmits = 1; # If true and $dosubmit is also true, print a msg
			# when skipping a submit due to missing permissions
			# or a previous submission stored in config file 
my $askshowunmatchedblocks = 0;	# If set to true, will prompt whether to show 
			# unmatched blocks of code from webpages and whether
			# or not to ignore it (treat it as non-solution code)
			# Otherwise will only indicate when an unmatching block
			# is found
my $askshowfiles = 0;	# If true, will prompt whether to show unmatched
			# source files
			# Otherwise will only show which files are unmatched
my $forceconsent = 1;	# If true, the $autoconsent variable will be applied to 
			# _all_ authors, including those already in the hash 
			# due to the config file 
my $autoconsent = 1; 	# What to set the consent key ('perm') to for an author
			# NOT YET IN THE HASH - won't affect authors already
			# in the hash with 'perm' of zero unless $forceconsent 
			# is true
			# If zero, set consent to unknown
			# If negative, consent is assumed negative.
			# If positive, consent is assumed positive.
my $askunknownconsent = 1; # If true, prompt for consent on authors whose 
			# consent is as-yet unknown i.e. 0
my $askemailupdate = 0; # If true, as an author not in the hash
			# is encountered, or when querying for new permission 
			# for an author, the script will prompt for an
			# optional update of their email address
my $pauseperpage = 0;	# If true, will pause after each page sent to $browser
my $pauseonerr = 2;	# If true, will pause on problems/errors/
			# unexpected situations
			# If > 1, will only pause on more unexpected errors
my $pauseonsummary = 0;	# If true, pause on summaries
my $showframes = 0;	# If set, show a frame contrasting the submitted
			# wiki page with the original page on RH's site
			# with the page returned by the wiki submission.
			# Otherwise the page shown will be the wiki's
			# version as retrieved directly from the server
			# after edit (so proper stylesheeting will be 
			# shown)
my $browser = "firefox";#""; 	# Path to a browser
				# If set, pages submitted to the wiki and other
				# relevant pages will be opened in this browser
				# Should be a browser that opens in a separate 
				# window; preferably where separate calls don't
				# invoke a new one (with Firefox this is true
				# but pages are sometimes out-of-order or 
				# skipped without a pause as below)
my $browserpause = 7;	# Number of seconds to pause to give the browser
			# time to catch up without skipping pages
my $tmpdir = abs_path("/tmp/kr_wiki_import");
my $zipdir = "$tmpdir";
my $unzipdir = "$tmpdir/src";	# this should be an initially 
				# non-existent or empty directory to 
				# avoid globbing mix-ups
my $htmldir = "$tmpdir/html";
my $cookiefile = "$tmpdir/lwpcookies.txt";
my $tmpfile = "$tmpdir/tmp.html";
my $skipdownload = 0;	# If true, will not download the zipped srcfile 
			# (assumes it already exists locally)
			# The first time through, this and all "skip" 
			# variables should be zero - I may forget to change 
			# it when distributing the script
my $skipunzip = 0;	# If true, will not uzip the srcfiles 
			# (assumes this has already been done)
my $krbase = "http://users.powernet.co.uk/eton/kandr2";
my $zippedsource = "allc.zip";
## END CONFIGURABLE VARIABLES
 
# Read nested hashes from $configfile
# This file gets overwritten each time the script runs, so don't modify it if 
# you wish to keep the changes.
# For 'perm' key:
#  1 is "permission granted"
# -1 is "permission not granted"
#  0 is "permission unknown"
# Note that email is the updated email address if known.
my $configfile = abs_path("./import_kr.config");	# this file overwritten at script end
%authors;	# must be global to be available to $configfile
%ignblocks;	# must be global to be available to $configfile
%submittedpages;# must be global to be available to $configfile
if (-e $configfile) {
	require $configfile;
}
 
# Ask whether to save configfile on interrupt
$SIG{INT} = sub {
	print "Interrupt signal received - save configfile? ([y]/n) ";
	my $tmp = <STDIN>;
	$tmp =~ s/[\r\n]*//gsi;
	if ($tmp ne "n") {
		print "Saving config file\n";
		&writeconfigfile;
	} else { 
		print "NOT saving config file\n";
	}
	print "Exiting on interrupt\n";
	exit 1;
};
 
# Pages skipped due to missing permissions - to print out at end
my @skippedpages = ( );
 
use HTTP::Request::Common qw(POST GET);
use LWP::UserAgent;
use HTTP::Cookies;
use File::Copy;
 
# Init
print STDERR "Logging to stderr\n";
print STDERR "Asterisked items match a page opened in \$browser (if set)\n";
mkdir $tmpdir;
 
$zipdir = abs_path($zipdir);
# Download the zipped source from Richard Heathfield's original site
if (!$skipdownload) {
	mkdir $zipdir;
	chdir $zipdir || print STDERR "Failed to cd to $zipdir\n";
	system("$httpdownloadagent $krbase/$zippedsource");
}
 
$unzipdir = abs_path($unzipdir);
# Unzip the source code
if (!$skipunzip) {
	mkdir $unzipdir;
	chdir $unzipdir || print STDERR "Failed to cd to $unzipdir\n";
	system("$unzipagent $zipdir/$zippedsource");
}
 
$htmldir = abs_path($htmldir);
$cookiefile = abs_path($cookiefile);
$tmpfile = abs_path($tmpfile);
 
# List of files that get removed from the list as they are found to compare 
# equal to code in the html pages
(@srcfiles) = glob("$unzipdir/*");
 
# Create the HTTP user agent with cookie support
my $ua = LWP::UserAgent->new;
$ua->agent("MyApp/0.1 ");
$ua->cookie_jar(HTTP::Cookies->new(file => $cookiefile, autosave => 1));
mkdir $htmldir;
 
if ($dosubmit) {
	# Login to wiki
	my $req = POST "$baseurl?title=Special:Userlogin&action=submitlogin&returnto=Main_Page",
		[ wpName => $wikiuser, wpPassword => $wikipw, 
	  	wpLoginattempt => 'Log in' ];
	my $rc = &send($req);
	move($tmpfile, "$htmldir/loggedin.html");
	if (length($browser) > 0) {
		system("$browser $htmldir/loggedin.html &");
	}
	if (!$rc) {
		print STDERR $res->status_line, "\n";
		print STDERR "*Unable to login to wiki.\n";
		print STDERR "Exiting.\n";
		exit 1;
	}
	print STDERR "*Logged in to wiki\n";
	if ($pauseperpage) {
		&dopause;
	} elsif (length($browser) > 0) {
		sleep($browserpause);
	}
}
 
## Some statically generated pages
 
# A single category used for all pages
# Not needed to be explicitly included in solution pages since it is included by
# the template header
&submitwikipage("Category:KR2_Solutions", "'''The C Programming Language, 2nd Edition, by Kernighan and Ritchie'''<br>
'''Solutions collated and maintained by the [news://comp.lang.c comp.lang.c] community'''<br>
'''All contributions welcome'''<br>", "");
 
# Template header for exercise solution pages.  The template takes 4 parameters allowing easy formatting changes to be made to all solution page headings:
# {{{1}}} exercise number
# {{{2}}} page number
# {{{3}}} exercise description (wording from K&R)
# {{{4}}} contributor information
&submitwikipage("Template:KR2_Header", "'''The C Programming Language, 2nd Edition, by Kernighan and Ritchie'''<br>
'''Exercise {{{1}}} on page {{{2}}}'''<br><br>
''{{{3}}}''<!--Exercise description--><br><br>
''{{{4}}}''<!--Original contributing author information--><br><br>
[[Category:KR2_Solutions]]\n", "");
 
# Chapter index and general intro
&submitwikipage("KR2_Chapter_Index", "'''The C Programming Language, 2nd Edition, by Kernighan and Ritchie'''<br>
<br>
The content of this part of the wiki was seeded from [http://users.powernet.co.uk/eton/kandr2/index.html Richard Heathfield's solutions site].  That site was inaugurated on 1 January 2000 as a repository for definitive answers to the exercises in \"The C Programming Language\" by Kernighan and Ritchie (2nd edition) - ISBN 0-13-110362-8.   That site is no longer maintained.  It is intended that this part of the wiki inherit from that site the role of providing high quality solutions to K&R's exercises.<br>
<br>
'''Chapter Index For Exercise Solutions'''<br><br>
[[KR2_Chapter_1|Chapter 1 - A Tutorial Introduction]]<br>
[[KR2_Chapter_2|Chapter 2 - Types, Operators and Expressions]]<br>
[[KR2_Chapter_3|Chapter 3 - Control Flow]]<br>
[[KR2_Chapter_4|Chapter 4 - Functions and Program Structure]]<br>
[[KR2_Chapter_5|Chapter 5 - Pointers and Arrays]]<br>
[[KR2_Chapter_6|Chapter 6 - Structures]]<br>
[[KR2_Chapter_7|Chapter 7 - Input and Output]]<br>
[[KR2_Chapter_8|Chapter 8 - The UNIX System Interface]]<br>
<br>
[[KR2_Solution_Category_Numbers|Solution Category Types (Category 0, 1, 2 and 3)]]
[[Category:KR2_Solutions]]\n", "http://users.powernet.co.uk/eton/kandr2/index.html");
 
# Explanation of Category Numbers
&submitwikipage("KR2_Solution_Category_Numbers", "'''The C Programming Language, 2nd Edition, by Kernighan and Ritchie'''<br><br>
'''The Meanings of the Solution Category Numbers For Exercise Solutions As Used In This Wiki:'''<br>
<table>
<tr><td>0</td><td>ANSI/ISO C89 compliant. The example only uses the subset of C already covered at the point in the book at which the exercise appears.</td></tr>
<tr><td>1</td><td>ANSI/ISO C89 compliant. The example uses aspects of C which may not have been covered at the point in the book at which the exercise appears.</td></tr>
<tr><td>2</td><td>ANSI/ISO C99 compliant. The example only uses the subset of C already covered at the point in the book at which the exercise appears. It is compliant with the C99 standard (e.g. doesn't use implicit int ).</td></tr>
<tr><td>3</td><td>ANSI/ISO C99 compliant. The example uses some aspects of C which exist only in the C99 release of the language.</td></tr>
</table>
[[Category:KR2_Solutions]]", "http://users.powernet.co.uk/eton/kandr2/name.html");
 
# Undefined Behaviour page
&submitwikipage("Undefined_Behaviour", "Undefined behaviour is defined by the ISO/ANSI C Standard as:<br><br>
''behavior, upon use of a nonportable or erroneous program construct, of erroneous data, or of indeterminately valued objects, for which this International Standard imposes no requirements''<br>
<br>
''NOTE Possible undefined behavior ranges from ignoring the situation completely with unpredictable results, to behaving during translation or program execution in a documented manner characteristic of the environment (with or without the issuance of a diagnostic message), to terminating a translation or execution (with the issuance of a diagnostic message).''", "http://users.powernet.co.uk/eton/kandr2/undefined.html");
 
# Loop over RH website
# $ch => Chapter number
# $ex => Excercise number
for ($ch = 1; ; $ch++) {
	print STDERR "Processing chapter $ch\n";
	# Check whether chapter index page exists
	my $kr_url = "$krbase/krx$ch.html";
	my $req = GET $kr_url;
	my @exnums; # declare here so it's available to exercise loop
	if (!&send($req)) {
		print STDERR $res->status_line, "\n";
		print STDERR "Could not retrieve: $kr_url\n";
		print STDERR "Assuming end of chapters.\n";
		last;
	} else {
		## Non-rigorous wikifying of chapter index page using regexps
		# Remove all html tags prior to Chapter heading and subsequent
		# to (and including) the hit counter and preceding breaks
		(my $pg) = $res->content =~ /(<h2>\s*Chapter.*?)((<br>(\s*))*)<a href=index.html>/si;
		# General wikification
		$pg = &htmltowiki($pg);
		# Special case of ch 4 - referring to copyright - get wording to
		# come out as "when he was typing ... the problem occurred to 
		# Richard Heathfield ... fortunately for Richard, ..."
		if ($ch == 4) {
			$pg =~ s/ I / he /;
			$pg =~ s/(Heathfield.*?) Heathfield/$1/
		}
		$pg = &striphtmlcodeformatting($pg);
		$pg = "$pg\n[[Category:KR2_Solutions]]";
		# Get list of exercise numbers as linked to by the chapter index page
		(@exnums) = $pg =~ /KR2_Exercise_\d-(\d{2})/gsi;
		if ($verbose) {
			print STDERR "Exercise numbers: ", join(' ', @exnums), "\n";
		}
		# Special case of KR Polish Calculator source
		$pg =~ s/KR2_Exercise_4-00/KR2 Polish Calculator/gsi;
		# Check for any remaining links (likely internal)
		$links = gethrefs($pg);
		if (length($links) > 0) {
			print STDERR "Unconverted links remain in chapter $ch index wiki page:\n$links";
			if ($pauseonerr) { &dopause; }
		}
		# Submit chapter index page to wiki
		move($tmpfile, "$htmldir/krx$ch.html");
		&submitwikipage("KR2_Chapter_$ch", $pg, "$htmldir/krx$ch.html"); #ignore errors
	}
 
	# Loop over exercises in chapter
	foreach my $ex (@exnums) {
		print STDERR "Processing $ch.$ex\n";
		my $id = ((length($ex) == 1)?"0$ex":"$ex");
		$id = "$ch$id";
		my $k = 0;
		my $l = 0;
		# Get org page
		my $kr_url = "$krbase/krx$id.html";
		my $req = GET $kr_url;
		if (!&send($req)) {
			print STDERR $res->status_line, "\n";
			print STDERR "Could not retrieve: $kr_url\n";
			if ($pauseonerr) { &dopause; }
		}
		move($tmpfile, "$htmldir/krx$id.html");
 
		if ($ch == 4 && $ex == 0) {
			## Special case of KR2 Polish Calculator
			$res->content =~ /(.*?)<center>(.*?)<h3>(\s*)(.*?)(\s*)<\/h3>(.*?)<h4>(\s*)(.*?)(\s*)<\/h4>(.*?)<\/center>(((\s*)<br>(\s*))*)(.*?)((<br>(\s*))*)<a href=index.html>/si;
			my $heading = $4;
			my $subhead = $18;
			my $pg = $15;
			$heading = &convertsymbols($heading); # convert &amp; &lt; etc
			$subhead = &convertsymbols($subhead);
			$subhead = &striphtmlcodeformatting($subhead);
			my $pg = &convertsymbols($pg);
			$pg = &docodeconv($ch, $ex, $pg);
			$pg = "'''The C Programming Language, 2nd Edition, by Kernighan and Ritchie'''<br>\n'''$heading'''<br>\n''$subhead''<br><br>$pg\n";
			$pg = &htmltowiki($pg);
			$pg = "$pg\n[[Category:KR2_Solutions]]";
			my $wikititle = "KR2_Polish_Calculator";
			if (!&submitwikipage($wikititle, $pg,
					"$htmldir/krx$id.html")) {
				print STDERR "Skipping to next exercise\n";
			} 
			next;
		}
		## Non-rigorous wikifying of solution page using regexps
		$tmp = $res->content;
		# ch 2 ex 4 has nested italics tags in the question text
		$tmp =~ s/<i> string <\/i>/string/gsi;
		# ... as does ch 4 ex 1
		$tmp =~ s/<i> rightmost <\/i>/rightmost/gsi;
		# ... and ch 1 ex 22
		$tmp =~ s/<i> n <\/i> /''n''/gsi;
		# ... and ch 1 ex 20, but this one ends then restarts the 
		# italics as the above was probably also intended to do
		$tmp =~ s/<\/i> n <i>/''n''/gsi;
		# Strip unnecessary leading and trailing html and mark content
		$tmp =~ /(.*?)<center>(.*?)<h3>(\s*)Answer to Exercise(\s*)(\d*)-(\d*)(.*?)page (\d*)(.*?)<\/h3>(.*?)<h4>(\s*)(.*?)(\s*)<\/h4>(.*?)<\/center>(.*?)<i>(\s*)(.*?)(\s*)<\/i>(((\s*)<br>(\s*))*)(.*?)((<br>(\s*))*)<a href=index.html>/si;
		my $chpnum = $5;
		my $exnum = $6;
		my $pgnum = $8;
		my $question = $17;
		my $contributors = $12;
		my $pg = $23;
		# Special cases where heading has extra text in a different style
		if (($ch == 3 && $ex == 4) || ($ch == 1 && $ex == 20)
					 || ($ch == 6 && $ex == 3)) {
			$pg = "$15\n$pg";
		} elsif (!($ch == 8 && $ex == 1)) {
			$question = "$15 $question";
		}
		$question = &convertsymbols($question);
		# Remove &quot and end-of-line markers for author names
		$contributors =~ s/[\r\n]/ /gsi;
		$contributors =~ s/&quot;//gsi;
		$contributors = &convertsymbols($contributors);
		# Remove initial <br>s for contributors
		$contributors =~ s/^((\s*)(<br>*)(\s*))*//gsi;
		$pg = &convertsymbols($pg);
		# Strip code formatting from question text (for 5.13)
		$question = striphtmlcodeformatting($question);
		# Escape sequences of "=" and "|" with <nowiki> since they have
		# special meaning in parameters to templates
		$question =~ s/(=+)/<nowiki>$1<\/nowiki>/gsi;
		$question =~ s/(\|+)/<nowiki>$1<\/nowiki>/gsi;
		# Convert any <code><pre> tags to <C>
		$question =~ s/<code>(\s*)<pre>/<C>/gsi;
		$question =~ s/<\/pre>(\s*)<\/code>/<\/C>/gsi;
		# Remove end-of-line markers for questions but not within
		# the <C></C> enclosure
		# There's probably a way to do this in one single regexp 
		# but finding out how takes longer than implementing this
		my $end = 0;
		while ((my $start = index($question, "<C>", $end)) >= 0) {
			substr($question, $end, $start - $end) =~ s/[\r\n]/ /gsi;
			if (($end = index($question, "</C>", $start)) < 0) {
				$end = $start + length("<C>");
				last;
			}
			$end += length("</C>");
		}
		my $start = length($question);
		substr($question, $end, $start - $end) =~ s/[\r\n]/ /gsi;
 
		# Remove initial <br>s for question
		$question =~ s/^((\s*)(<br>*)(\s*))*//gsi;
		# Remove smiley which occurs inconsistently in one name (Flippant Squirrel)
		$contributors =~ s/ :-\)//gsi;
		$fmtnames = $contributors;
		$fmtnames =~ s/(.*?)<a(\s*)href=mailto:(.*?)>(\s*)(.*?)(\s*)<\/a>(.*?)/'$3' "$5" /gsi;
		(@names) = $fmtnames =~ /"(.*?)"/gsi;
		(@emails) = $fmtnames =~ /'(.*?)'/gsi;
		if ($#names != $#emails) {
			print STDERR "Number of contributor names does not match email addresses:\n";
			print STDERR "Names: @names\n";
			print STDERR "Emails: @emails\n";
			if ($pauseonerr) { &dopause; }
		}
 
		# Check / query author consent and email addresses
		my @nopermauthors = ( );
		for (my $i = 0; $i <= $#names; $i++) {
			my $aeu = 0;
			if (!exists($authors{$names[$i]}{'email'})) {
				$authors{$names[$i]}{'email'} = $emails[$i];
				$aeu = 1;
			}
			if (!exists($authors{$names[$i]}{'perm'}) || 
								$forceconsent) {
				$authors{$names[$i]}{'perm'} = $autoconsent;
			}
			if ($authors{$names[$i]}{'perm'} < 0) {
				$nopermauthors[$#nopermauthors+1] = $names[$i];
			} elsif ($authors{$names[$i]}{'perm'} == 0) {
				if (!$askunknownconsent) {
					$nopermauthors[$#nopermauthors+1] = $names[$i];
				} elsif (!$authors{$names[$i]}{'askedperm'}) {
					$aeu = 1;
					print "Permission unknown for author: ", $names[$i], " <", $authors{$names[$i]}{'email'}, ">\n";
					my $def = "u";
					print "Permission granted? (", ($def eq "y")?"[y]":"y", "/", ($def eq "n")?"[n]":"n", "/", ($def eq "u")?"[u]":"u", "/", ($def eq "q")?"[q]":"q", ") ";
					my $perm = <STDIN>;
					$perm =~ s/[\r\n]//gsi;
					if ($perm ne "y" && $perm ne "n" && $perm ne "u" && $perm ne "q") {
						$perm = $def;
					}
					if ($perm eq "q") {
						&writeconfigfile;
						exit 0;
					} elsif ($perm eq "n") {
						$authors{$names[$i]}{'perm'} = -1;
						$nopermauthors[$#nopermauthors+1] = $names[$i];
					} elsif ($perm eq "y") {
						$authors{$names[$i]}{'perm'} = 1;
					} else {
						$authors{$names[$i]}{'perm'} = 0;
						$nopermauthors[$#nopermauthors+1] = $names[$i];
					}
					$authors{$names[$i]}{'askedperm'} = 1;
				} else {
					$nopermauthors[$#nopermauthors+1] = $names[$i];
				}
			}
			if ($askemailupdate && $aeu) {
				print "Update email address for ", $names[$i], " [", $authors{$names[$i]}{'email'}, "]: ";
				my $newmail = <STDIN>;
				$newmail =~ s/[\r\n]//gsi;
				if ($newmail eq "q") {
					&writeconfigfile;
					exit 0;
				} 
				if (length($newmail) > 0) {
					$authors{$names[$i]}{'email'} = $newmail;
				}
			}
		}
 
		# Substitute updated email addresses
		for (my $i=0; $i <= $#names; $i++) {
			$contributors =~ s/$emails[$i]/$authors{$names[$i]}{'email'}/;
		}
 
		# Verify displayed chapter and exercise numbers (not verifying 
		# that page number is as displayed on chapter index page - that
		# would be a little too anal ;-)
		if ($chpnum != $ch) {
			print STDERR "Chapter number on page ($chpnum) not expected (expected $ch)\n";
			# don't pause for ch 4, ex 11 when $pauseonerr > 1
			if ($pauseonerr >= 1 && !($pauseonerr > 1 && $ch == 4 && $ex == 11)) { &dopause; }
		}
		if ($exnum != $ex) {
			print STDERR "Exercise number on page ($exnum) not expected (expected $ex)\n";
			# don't pause for ch 4, ex 11 when $pauseonerr > 1
			if ($pauseonerr >= 1 && !($pauseonerr > 1 && $ch == 4 && $ex == 11)) { &dopause; }
		}
		# Add template header to top of page
		if (length($contributors) < 1) { $contributors = " "; }
		$pg = "{{KR2_Header|$ch.$ex|$pgnum|$question|$contributors}}\n$pg";
 
		# Remove font colourisation from source code and compare with
		# unzipped src files
		$pg = &docodeconv($ch, $ex, $pg);
		# Wikify remaining html
		$pg = &htmltowiki($pg);
		# Unconvert the case where "me" doesn't refer to 
		# Richard Heathfield (quote of Chris Sidi)
		if ($ch == 1 && $ex == 18) {
			$pg =~ s/Richard Heathfield/me/;
		}
		# Fix up the case where "myself" and "my" should be converted better
		if ($ch == 1 && $ex == 10) {
			$pg =~ s/I wrote Richard Heathfield/Richard Heathfield wrote/;
			$pg =~ s/my solution/Richard Heathfield's solution/;
		}
 
		# Check for any remaining links (likely internal)
		$links = gethrefs($pg);
		if (length($links) > 0) {
			print STDERR "Unconverted links remain in $ch.$ex wiki page:\n$links";
			if ($pauseonerr) { &dopause; }
		}
 
		my $wikititle = "KR2_Exercise\_$ch\-$ex";
		# Handle cases where consent has not been obtained yet
		my $norecord = 0;
		if ($#nopermauthors >= 0) {
			push(@skippedpages, $wikititle);
			if ($verbose || $summaryverbose) {
				print STDERR "Permission to use the following author(s)'s material is unknown or has not been granted: ", join('; ', @nopermauthors), "\n";
			}
			if ($submitpermholder) {
				$pg = "This page not yet transferred from Richard Heathfield's original solutions site as permission to use the contributions of the following author(s) has not yet been obtained or has not been granted:<br><br>\n";
				foreach $auth (@nopermauthors) {
					$pg = "$pg\n$auth";
					if ($authors{$auth}{'perm'} == -1) {
						$pg = "$pg : Permission not granted<br>\n";
					} else {
						$pg = "$pg : Permission unknown; last known email address: $authors{$auth}{'email'}<br>\n";
					}
				}
				$norecord = 1;
			} elsif ($dosubmit) {
				if ($showskippedsubmits || $verbose || $summaryverbose) {
					print STDERR "NOT submitting $wikititle - missing contributor consent\n";
				}
				next; # Skip submit
			}
		}
 
		# Submit edit request
		if (!&submitwikipage($wikititle, 
				$pg, #"test bot submission $i $j $k $l",
				"$htmldir/krx$id.html", $norecord)) {
			print STDERR "Skipping to next exercise\n";
			next;
		} 
	}
}
 
# Print final list of unmatched srcfiles
if ($#srcfiles >= 0) {
	my $filelist = join("\t", @srcfiles);
	$filelist =~ s/$unzipdir\///gs;
	print STDERR "All unmatched unzipped source files:\n";
	print STDERR "$filelist\n";
}
 
# Print final list of pages not submitted due to lack of author consent
if ($#skippedpages >= 0) {
	print STDERR "$#skippedpages pages not submitted due to permission not granted/unknown:\n";
	print STDERR join("\t", @skippedpages), "\n";
}
 
# Write config file and exit
&writeconfigfile;
exit 0;
 
# Overwrite config file
# No warning before overwrite - make copy of config file if desired
sub writeconfigfile {
	foreach my $a (keys %authors) {
		delete($authors{$a}{'askedperm'});
	}
	open($F, ">$configfile"); 
	printndepthhash(\%ignblocks, "ignblocks", 3, 3, "", $F);
	printndepthhash(\%authors, "authors", 2, 2, "", $F);
	printndepthhash(\%submittedpages, "submittedpages", 1, 1, "", $F);
	print $F "1 # must return a true value for 'require'\n";
	close($F);
}
 
# Strip html colourised formatting from code blocks
sub striphtmlcodeformatting {
	$code = @_[0];
	# strip <font> tags
	$code =~ s/<(\/{0,1})font(.*?)>//igs;
	# strip bold tags
	$code =~ s/<(\/{0,1})b>//igs;
	return $code;
}
 
# Convert blocks of code in html to remove colourised formatting and
# compare to unzipped source files.  Wikify by placing inside <C></C>
# or just plain <pre></pre> for non-C blocks
#@_[0] => Chapter number
#@_[1] => Exercise number
#@_[2] => Content (may be partially wikified so long as html-formatted code 
#	  blocks aren't)
sub docodeconv {
	my $ch = shift; #@_[0];
	my $ex = shift; #@_[1];
	my $block = 0;
	my $pg = shift; #@_[2];
	my $begin = "<code>\n<pre>\n";
	my $bl = length($begin);
	my $end = "</pre>\n</code>\n";
	my $el = length($end);
	my $idx = 0;
	my $id = (length($ex) == 1)?"0$ex":$ex;
	my $nonmatchblocks = 0;
	my $igncount = 0;
	$id = "$ch$id";
	my (@files) = glob("$unzipdir/[kK][rR][xX]$id*.[cC]");
	my $fc = $#files + 1;
 
	if ($verbose) {
		print STDERR "Comparing webpage C code against ", $#files+1, " unzipped source files\n";
	}
 
	my $pause = 0;
	while (($idx = index($pg, $begin, $idx)) >= 0) {
		my $edx = index($pg, $end, $idx + $bl);
		if ($edx > 0) {
			my $code = substr($pg, $idx + $bl, $edx - $idx - $bl);
			$code = striphtmlcodeformatting($code);
 
			if ($ignblocks{$ch}{$ex}{$block+1} == 1) {
				$igncount++;
				if ($verbose) {
					print STDERR "Ignoring code block #", $block+1, "\n";
				}
			} else {
				# Compare to unzipped source files
				my $srcfilefromweb = "$tmpdir/$ch.$ex.$block.c";
				open(SRCFILE, ">$srcfilefromweb");
				print SRCFILE $code;
				close(SRCFILE);
				my $match = 0;
				for (my $ai = 0; $ai <= $#files; $ai++) {
					if (system("$diffagent $srcfilefromweb $files[$ai] >/dev/null") == 0) {
						if ($verbose) {
							print STDERR "Code block #", $block+1, " from webpage ",
								"compared equal to ", 
								substr($files[$ai], length($unzipdir)), "\n";
						}
						$match = 1;
						# Remove from global then local unzipped src file list
						# (in that order)
						my $ai2;
						for ($ai2 = 0; $ai2 <= $#srcfiles; $ai2++) {
							if ("$files[$ai]" eq "$srcfiles[$ai2]") {
								splice(@srcfiles, $ai2, 1);
								last;
							}
						}
						splice(@files, $ai, 1);
						last;
					}
				}
				if (!$match) {
					print STDERR "Code block #", $block+1, " did not match any unzipped source files (", $#files + 1, " files remaining)\n";
					if ($askshowunmatchedblocks) {
						my $def = "d";
						my $act = "";
						while ($act ne "c") {
							print "(D)isplay block, (i)gnore block for comparison purposes and continue or (c)ontinue without marking block as ignored? (", ($def eq "d")?"[d]":"d", ($def eq "i")?"[i]":"i", ($def eq "c")?"[c]":"c", ($def eq "q")?"[q]":"q", ") ";
							$act = <STDIN>;
							$act =~ s/[\r\n]//gsi;
							if ($act eq "q") {
								&writeconfigfile;
								exit 0;
							} elsif ($act ne "d" && $act ne "i" && $act ne "c") {
								$act = $def;
							}
							if ($act eq "d") {
								print "$code\n";
								$def = "c";
							}
							elsif ($act eq "i") {
								$ignblocks{$ch}{$ex}{$block+1} = 1;
								$igncount++;
								$nonmatchblocks--;
								print STDERR "OK, Ignoring code block #", $block+1, "\n";
								last; # redundant
							}
						}
					} elsif ($pauseonerr == 1 && !$pauseonsummary) {
						$pause = 1;
					}
					$nonmatchblocks++;
				}
			}
			if ($ignblocks{$ch}{$ex}{$block+1} == 1) {
				$code = "<pre>\n$code</pre>\n";
			} else {
				$code = "<C>\n$code</C>\n";
			}
			my $orglen = $el + $edx - $idx;
			substr($pg, $idx, $orglen) = $code;
			$edx = $idx + length($code);
			$block++;
		} else {
			print STDERR "Possibly unterminated html-formatted code block ($ch.$ex, block ", $blocks+1, "\n";
			if ($pauseonerr) { &dopause; }
		}
		$idx = $edx + 1;
	}
	if ($block <= 0) {
		print STDERR "No code blocks found on webpage\n";
		# Ch 4 ex 11 is missing
		if ($pauseonerr >= 1 && !($pauseonerr > 1 && $ch == 4 && $ex == 11)) { &dopause; }
	} else {
		if ($nonmatchblocks > 0) {
			print STDERR "Summary: $nonmatchblocks (of $block) UNMATCHED code block(s) on $ch.$ex webpage\n";
			if ($pauseonerr == 1) { $pause = 1; }
		} elsif ($verbose || $summaryverbose) {
			print STDERR "Summary: $block total code block(s) ", ($igncount > 0)?"($igncount IGNORED) ":"", "on $ch.$ex webpage\n";
			if ($pauseonsummary) {
				$pause = 1;
			}
		}
	}
	if ($#files >= 0) {
		my $filelist = join("\n", @files);
		$filelist =~ s/$unzipdir\///gs;
		print STDERR "Summary: ", $#files + 1, " (of $fc) source file(s) UNMATCHED against $ch.$ex webpage:\n";
		print STDERR "$filelist\n";
		if ($askshowfiles) {
			for ($i = 0; $i <= $#files; $i++) {
				my $def = "y";
				my $show = "";
				print "Display unmatched file ", $files[$i], "? (", ($def eq "y")?"[y]":"y", "/", ($def eq "n")?"[n]":"n", ($def eq "q")?"[q]":"q", ") ";
				$show = <STDIN>;
				$show =~ s/[\r\n]//gsi;
				if ($show eq "q") {
					&writeconfigfile;
					exit 0;
				} 
				if ($show eq "y" || ($show ne "n" && $def eq "y")) {
					open(F, $files[$i]);
					while (<F>) {
						print;
					}
					if (!$pauseonsummary) {
						$pause = 0;
						&dopause;
					}
				}
			}
		} elsif ($pauseonsummary || $pauseonerr == 1) {
			$pause = 1;
		}
	} elsif ($verbose || $summaryverbose) {
		print STDERR "Summary: $fc total file(s) matched against $ch.$ex webpage\n";
		if ($pauseonsummary) {
			$pause = 1;
		}
	}
	if ($pause) {
		&dopause;
	}
	return $pg;
}
 
# Uses filescope $dosubmit
# @_[0] => Title of wiki page
# @_[1] => Content of wiki page
# @_[2] => Name of html file containing original content located in $htmldir
#	   If not supplied, a frame will not be created and only the wiki pg
#	   will be sent to $browser
# @_[3] => Whether to skip recording of the page in the submittedpages hash
#	   (optional - default is record)
sub submitwikipage
{
	if (!$dosubmit) {
		return 1;
	}
 
	my $wikititle = @_[0];
	my $content = @_[1];
	my $norecord = @_[3];
	if ($submittedpages{$wikititle} == 1) {
		if ($verbose || $showskippedsubmits) {
			print STDERR "NOT submitting $wikititle - previously submitted\n";
		}
		return 1;
	}
	print STDERR "Start wiki edit submission on: $wikititle\n";
 
	# Get token for edit
	my $edttok_url = "$baseurl?title=$wikititle\&action=edit";
	my $req = GET $edttok_url;
	if (!&send($req)) {
		print STDERR $res->status_line, "\n";
		print STDERR "Could not get edit token for wiki: $edttok_url\n";
		return 0;
	}
	(local $edttok) = $res->content =~ /value="([0-9a-zA-Z]+)" name="wpEditToken"/;
	(local $edttime) = $res->content =~ /value="([0-9a-zA-Z]+)" name="wpEdittime"/;
#	print STDERR "edit token: $edttok\nedit time: $edttime\n";
	if (length($edttok) <= 0) {
		move($tmpfile, "$htmldir/tok.$id.html");
		if (length($browser) > 0) {
			system("$browser $htmldir/tok.$id.html");
		}
		print STDERR "*Regular expression did not extract an edit token from edit page\n";
		if ($pauseonerr) {
			&dopause;
		} elsif (length($browser) > 0) {
			sleep($browserpause);
		}
		return 0;
	}
 
	# Submit edit
	local $edt_url = "$baseurl?title=$wikititle\&action=submit";
	$req = POST $edt_url,
		[ wpSummary => "", wpSection => "", wpEdittime => $edttime,
		wpEditToken => $edttok, wpTextbox1 => $content,
		wpSave => "Save page" ];
	if (!&send($req)) {
		print STDERR $res->status_line, "\n";
		print STDERR "HTTP ERROR on submitting $title edit to wiki\n";
		return 0;
	}
	print STDERR "*Wiki edit ($wikititle) submitted\n";
	# Display in browser
	if (length($browser) > 0) {
		my $wikifile = "$htmldir/wiki.$wikititle.html";
		my $krfile = @_[2];
		my $frfile = "$htmldir/fr.$wikititle.html";
		move($tmpfile, $wikifile);
		if ($showframes) {
			if (length($krfile) > 0) {
				&makeframe($krfile, $wikifile, $frfile);
			} else {
				$frfile = $wikifile;
			}
		} else {
			$frfile = "$baseurl/$wikititle";
		}
		system("$browser $frfile &");
	}
	if ($norecord != 1) {
		$submittedpages{$wikititle} = 1;
	}
	if ($pauseperpage) {
		&dopause;
	} elsif (length($browser) > 0) {
		sleep($browserpause);
	}
	return 1;
}
 
# @_[0] => url of orig page
# @_[1] => url of wiki page
# @_[2] => url of frame to create
sub makeframe
{
	open(FRFILE, ">@_[2]");
	print FRFILE "<html>\n";
	print FRFILE "<head><title>fr</title></head>\n";
	print FRFILE "<frameset rows=\"50%,*\">\n";
	print FRFILE "<frame src=\"@_[1]\">\n";
	print FRFILE "<frame src=\"@_[0]\">\n";
	print FRFILE "</frameset>\n";
	print FRFILE "</html>\n";
	close(FRFILE);
}
 
# Send HTTP request to server and get response back; save content in $tmpfile
# @_[0] => $req
# Uses filescope $ua, $tmpfile; sets filescope $res - probably bad practice but I'm 
# not going to look up how to do pass by reference in Perl - this is easiest 
# to implement
sub send {
	# Pass request to the user agent and get a response back
	my $req = @_[0];
	my $i = 0;
	$res = $ua->request($req);
	if ($res->is_success) {
		open(FILE, ">$tmpfile");
		print FILE $res->content;
		close(FILE); 
		return 1;
	} else {
		# This needs to be revisited since afaict it's mostly
		# redundant, although not broken
		for ($i = 0; $i < 100; $i++ ) {
			if ($res->status_line == "302 Found") {
				# Temporary redirection - try again
				$req->uri($res->headers->header("Location"));
				$res = $ua->request($req);
				if ($res->is_success) {
					open(FILE, ">$tmpfile");
					print FILE $res->content;
					close(FILE); 
					return 1;
				} 
			} else {
				last;
			}
		}
	}
	if ($i > 1) {
		print STDERR $res->status_line, "\n";
		print STDERR "Too many redirects ($i)\n";
		print STDERR "Returned HTTP headers:\n";
		print STDERR $res->headers->as_string, "\n";
	}
	return 0;
}
 
# Uses global $pauseperpage and $pauseonerr
sub dopause {
	print "Paused; press enter (or 'q' then enter to save config file and quit)";
	my $tmp = <STDIN>;
	$tmp =~ s/[\r\n]//gsi;
	if ($tmp eq "q") {
		&writeconfigfile;
		print STDERR "'q' pressed - config file saved; exiting.\n";
		exit 0;
	}
}
 
sub convertsymbols {
	my $str = @_[0];
	# naively convert &lt; &gt; &quot; and &amp;
	$str =~ s/&lt;/</igs;
	$str =~ s/&gt;/>/igs;
	$str =~ s/&quot;/"/igs;
	$str =~ s/&amp;/&/igs;
	return $str;
}
 
# @_[0] => the content to modify as a string; modified content is returned
# Mostly minimal conversion so that cases needing special treatment can be 
# easily identified as they arise
sub htmltowiki {
	my $str = @_[0];
	# Replace all exercise references with internal wiki links
	# maintaining original link wording
	$str =~ s/<a(\s+)href(\s*)=(\s*)krx(\d)(\d{2}).html>(.*?)<\/a>/\[\[KR2_Exercise_$4-$5 \|$6\]\]/gsi;
	# Wikify external http links
	$str =~ s/<a(\s+)href(\s*)=(\s*)((http:\/\/).*?)(\s*)>(.*?)<\/a>/[\4 \6]/gsi;
	# bold
	$str =~ s/<b>(.*?)<\/b>/'''$1'''/gsi;
	# italics
	$str =~ s/<i>(.*?)<\/i>/''$1''/gsi;
	# remove indentation from headings since 
	# wiki interprets indentation as <pre></pre> and this conflicts
	foreach $tag ("h1", "h2", "h3", "h4") {
		my $start = 0;
		while (($start = index($str, "<$tag>", $start)) >= 0) {
			my $end = index($str, "</$tag>", $start);
			substr($str, $start, $end - $start) =~ s/\r{0,1}\n([ ]+)/ /gsi;
			$start = $start + length("<$tag>");
		}
	}
	# wikify heading markup
	$str =~ s/<h1>(\s*)(.*?)(\s*)<\/h1>/==$2==/gsi;
	$str =~ s/<h2>(\s*)(.*?)(\s*)<\/h2>/===$2===/gsi;
	$str =~ s/<h3>(\s*)(.*?)(\s*)<\/h3>/====$2====/gsi;
	$str =~ s/<h4>(\s*)(.*?)(\s*)<\/h4>/=====$2=====/gsi;
 
	# naively convert &lt; &gt; &quot; and &amp;
	$str = convertsymbols($str);
 
	$str =~ s/<a(\s*)href=mailto:(.*?)>(\s*)(.*?)(\s*)<\/a>/[mailto:$2 $4]/gsi;
 
	## Miscellaneous
 
	# all occurences of "me", "Me", "my good self" and "myself" to "Richard Heathfield"
	$str =~ s/\bme\b/Richard Heathfield/gsi;
	$str =~ s/\bmy good self\b/Richard Heathfield/gsi;
	$str =~ s/\bmyself\b/Richard Heathfield/gsi;
	$str =~ s/\bmine\b/Richard Heathfield's/gsi;
 
	# add wiki links to a description whereever a solution's category is 
	# mentioned
	$str =~ s/\b(cat((egory){0,1}) \d)\b/[[KR2_Solution_Category_Numbers|$1]]/gsi;
 
	# wrongly formed comp.lang.c link
	$str =~ s/<a(\s*)href=news:comp.lang.c>(.*?)<\/a>/[news:\/\/comp.lang.c comp.lang.c],/gsi;
 
	# internal undefined behaviour links
	$str =~ s/<a(\s*)href=undefined.html>(.*?)<\/a>/[[Undefined_Behaviour|$2]]/gsi;
 
	# 1.23 critique headings
	$str =~ s/<p>(\s*)(uncomment(.*?)(\s*)\((.*?)\))(\s*)<\/p>(\s*)<p>(((\s*)=)+)(\s*)<\/p>/=== $2 ===/gsi;
 
	return $str;
}
 
sub gethrefs {
	my $links = @_[0];
	# Remove everything except href links separated by newlines
	$links =~ s/(.*?)(((<a(\s+)href(\s*)=(\s*)(.*?)(\s*)>(.*?)<\/a>))*)(.*?)/$2/gsi;
	$links =~ s/<\/a>/<\/a>\n/gsi;
	return $links;
}
 
# Recursive function to print an n-depth hash so that it can be recreated by 
# running the output through a perl interpreter
# Probably could be made more sophisticated by actually checking whether a value
# is a hash reference or not
# Top-level call should specify @_[3] == @_[4] and @_[5] == ""
# @_[0] reference to the hash
# @_[2] name of the hash as a string
# @_[3] current depth of hash (if 0, print value not key)
# @_[4] total depth of hash
# @_[5] current prefix
# @_[6] filehandle to output to (can be STDOUT)
sub printndepthhash {
	my $href = shift;
	my $hname = shift;
	my $depth = shift;
	my $orgdepth = shift;
	my $prefix = shift;
	my $F = shift;
 
	if ($depth == $orgdepth) {
		$prefix = "$prefix\$$hname";
	}
 
	foreach $key (keys %$href) {
		my $p = "$prefix\{'$key'\}";
		if ($depth == 1) {
			print $F "$p = \'$$href{$key}\';\n";
		} else {
			printndepthhash($$href{$key}, $hname, $depth - 1, $orgdepth, $p, $F);
		}
	}
}
From clc-wiki

Views

Personal tools

Site_logo

Search

Content_hierarchy

Tools