dougalcampbell
diff --git a/wxrsplit.pl b/wxrsplit.pl
 #!/usr/bin/perl -w
 #
 # wxrsplit - Split a WordPress WXR file into multiple output files, each 
 #   with a maximum filesize.
 # 
 # NOTE: Because this tool attempts to keep items intact within each output
 #   file, it is possible to exceed the specified max filesize. Comments are
 #   contained within a post item, so a post with many comments could
 #   conceivably generate a very large item size. There probably is not a
 #   practical way around this.
 #
 # @author: Dougal Campbell <[email protected]>
 # @license: MIT / GPL2

 use strict;
 use Getopt::Long;

 ## Defaults
 # Input WXR file to split (overridden with -f).
 my $filename = 'output.wxr';
 # Can use 'K' for kilobytes or 'M' for megabytes (not case-sensitive)
 my $size = '2M';

 # Set by --help; when true the usage text is shown and the script stops.
 my $help = 0;

 # If we get this many bytes into the file without finding proof that it's
 # a WXR, abort. Since the entire WXR export of a fresh WP 2.5 install,
 # including the 'Hello, World' post and comment, is only about 5K, a 
 # value of 8K should be more than safe:
 my $hdrsz = '8192';

 ## Parse options
 my $result = GetOptions('f=s' => \$filename, 's=s' => \$size, 'help' => \$help);

 # GetOptions returns false when it could not process the command line
 # (unknown option, missing value). Stop here instead of silently running
 # with the defaults.
 die usage() unless $result;

 if ($help) {
 	die usage();
 }

 ## Options valid?
 # warn() returns true, so each chain prints the specific complaint and then
 # dies with the usage text.
 checkfile($filename) or warn "File '$filename' not found.\n" and die usage();
 my $filesize = parsesize($size) or warn "Filesize '$size' not recognized.\n" and die usage();

 # Three-argument open so the filename is used literally. The bareword
 # handle WXR is kept because getheader() reads from it.
 open WXR, '<', $filename or die "Could not open '$filename' for reading.\n";

 my $header = getheader() or die "Could not parse header. Is this a WXR file?\n";

 my $headersize = length($header);

 # input record separator: every read now returns the text up to and
 # including the next '<item>' tag.
 $/ = '<item>';

 ## Find the first item
 seek WXR, $headersize, 0;

 my $chunk = '';
 my $chunksize = 0;
 my $i = 1;
 my $file = '';

 # Build the output name for piece number N: 'foo.wxr' becomes 'foo-N.wxr'.
 # When the input name contains no '.wxr' the number is appended instead,
 # so an output name can never be the same as the input file.
 my $piecename = sub {
 	my $n = shift;
 	my $name = $filename;
 	$name =~ s/\.wxr/-$n.wxr/ or $name .= "-$n";
 	return $name;
 };

 # NOTE(review): every piece is written as $header . $chunk only, so whatever
 # follows the last </item> in the input ends up in the final piece alone.
 # Confirm the importer accepts the earlier pieces in that form.
 while (<WXR>) {
 	# Strip the trailing '<item>' separator from the record.
 	chomp;

 	## first chunk is probably just whitespace between the channel info
 	## and the start of the first item. Skip it, if so:
 	next unless m|</item>|s;

 	# Put back the opening tag that was consumed as the record separator.
 	my $item = "<item>\n" . $_;
 	my $itemsize = length($item);

 	# Flush the pending chunk when adding this item would reach the limit.
 	# An empty chunk is never flushed, so an oversized first item does not
 	# produce a header-only piece.
 	if ($chunk ne '' and ($headersize + $chunksize + $itemsize) >= $filesize) {
 		# Name the piece at write time so it always matches $i.
 		$file = $piecename->($i);
 		writechunk($header . $chunk, $file);

 		# Start the next piece empty; the current item is appended below
 		# exactly once.
 		$chunk = '';
 		++$i;
 	}

 	$chunk .= $item;
 	$chunksize = length($chunk);
 }

 ## Write final chunk.
 if ($chunk ne '') {
 	$file = $piecename->($i);
 	writechunk($header . $chunk, $file);
 }

 close WXR;

 print "Done.\n";

 ######
 # usage - Print the command-line help text on STDERR via warn().
 #
 # Returns a single newline so that callers can write `die usage()`: the
 # help text has already been shown, and die() then adds nothing but the
 # newline.
 sub usage {
 	my $help_text = <<USAGE;

 Usage: wxrsplit [opts] 
  Options:
    -f filename (defaults to 'output.wxr')
    -s SIZE (defaults to 2M)

 Split a WXR file into multiple pieces, keeping each piece below a given
 size. 
 USAGE
 	warn $help_text;

 	# suppress line number reporting from die()
 	return "\n";
 }

 # checkfile - Check that a path names an existing plain file.
 #
 # Takes the path to test. Returns that same path when the -f file test
 # succeeds, and undef otherwise.
 sub checkfile {
 	my ($path) = @_;

 	# Guard clause: anything that is not a plain file is rejected.
 	return undef unless -f $path;

 	return $path;
 }

 # parsesize - Convert a size option such as '2M', '512k' or '4096' into a
 # number of bytes.
 #
 # Takes a string made of decimal digits, optionally followed by one unit
 # letter: 'K' (times 1024) or 'M' (times 1024 * 1024), in either case.
 #
 # Returns the size in bytes. Returns nothing (undef in scalar context) when
 # the string is missing, is not in that form, or evaluates to zero.
 sub parsesize {
 	my $size = shift;
 	my $kilo = 1024;
 	my $meg = $kilo * $kilo;

 	return unless defined $size;

 	# Take the captures from the match's own return list. A failed match
 	# yields an empty list, so stale $1/$2 values left over from an earlier
 	# match in the caller can never be mistaken for a size.
 	my ($num, $unit) = $size =~ m/^(\d+)([km]?)$/i
 		or return;

 	# A size of zero is as useless as no size at all.
 	return unless $num > 0;

 	$unit = lc($unit);

 	my $mult = $unit eq 'm' ? $meg
 	         : $unit eq 'k' ? $kilo
 	         :                1;

 	return $num * $mult;
 }

 # getheader - Read the start of the WXR handle and return everything that
 # precedes the first <item> element.
 #
 # Reads up to $hdrsz bytes from the file-level WXR handle. Returns undef
 # when that text does not carry the WordPress export namespace, or when no
 # <item> tag (or nothing before it) is found in it. Once the namespace has
 # been seen, the handle is rewound to the start of the file.
 sub getheader {
 	my $buffer = '';
 	my $bytes = read(WXR, $buffer, $hdrsz);

 	## Is this really a WXR file?
 	my $looks_like_wxr =
 		$buffer =~ m|xmlns:wp="http://wordpress[.]org/export/\d+[.]\d+/"|s;
 	return undef unless $looks_like_wxr;

 	## Non-greedy match of everything up to the first <item> element.
 	## A failed match leaves $prefix undefined.
 	my ($prefix) = $buffer =~ m|^(.*?)<item>|s;

 	## rewind file
 	seek WXR, 0, 0;

 	return undef unless $prefix;
 	return $prefix;
 }

 # writechunk - Write one output piece to disk, replacing any existing file.
 #
 # Takes the complete text of the piece and the path of the file to create.
 #
 # Dies with a message naming the file when the file cannot be opened,
 # written or closed, so a failed piece is never reported as success.
 # Returns 1 on success.
 sub writechunk {
 	my ($text, $outfile) = @_;

 	# Three-argument open with a lexical handle: the path is taken
 	# literally and the handle cannot clash with one used elsewhere.
 	open my $out, '>', $outfile
 		or die "Could not open '$outfile' for writing: $!\n";

 	print {$out} $text
 		or die "Could not write to '$outfile': $!\n";

 	# Buffered write errors (for example a full disk) surface at close.
 	close $out
 		or die "Could not finish writing '$outfile': $!\n";

 	return 1;
 }