A Perl script to split WordPress WXR export files into multiple smaller files
#!/usr/bin/perl -w | |
# | |
# wxrsplit - Split a WordPress WXR file into multiple output files, each | |
# with a maximum filesize. | |
# | |
# NOTE: Because this tool attempts to keep items intact within each output | |
# file, it is possible to exceed the specified max filesize. Comments are | |
# contained within a post item, so a post with many comments could | |
# conceivably generate a very large item size. There probably is not a | |
# practical way around this. | |
# | |
# @author: Dougal Campbell <[email protected]> | |
# @license: MIT / GPL2 | |
use strict; | |
use Getopt::Long; | |
## Defaults
my $filename = 'output.wxr';
# Can use 'K' for kilobytes or 'M' for megabytes (not case-sensitive)
my $size = '2M';
my $help = 0;
# If we get this many bytes into the file without finding proof that it's
# a WXR, abort. Since the entire WXR export of a fresh WP 2.5 install,
# including the 'Hello, World' post and comment, is only about 5K, a
# value of 8K should be more than safe:
my $hdrsz = '8192';

## Parse options
my $result = GetOptions('f=s' => \$filename, 's=s' => \$size, 'help' => \$help);
# GetOptions returns false (after warning) when it meets an option it cannot
# parse; show the usage text instead of silently running with the defaults.
if ($help or not $result) {
    die usage();
}

## Options valid?
unless (checkfile($filename)) {
    warn "File '$filename' not found.\n";
    die usage();
}
my $filesize = parsesize($size);
unless ($filesize) {
    warn "Filesize '$size' not recognized.\n";
    die usage();
}

# WXR stays a bareword handle because getheader() reads from it by name.
# Three-argument open keeps odd characters in the filename from being
# interpreted as an open mode.
open(WXR, '<', $filename)
    or die "Could not open '$filename' for reading: $!\n";

my $header = getheader() or die "Could not parse header. Is this a WXR file?\n";
my $headersize = length($header);

# The header opens <rss> and <channel>. Only the last item record carries the
# file's own closing tags, so every earlier piece gets these appended to stay
# well-formed XML.
my $footer = "\n</channel>\n</rss>\n";
my $overhead = $headersize + length($footer);

{
    # Read the input one <item> at a time; localized so the custom record
    # separator does not outlive the splitting loop.
    local $/ = '<item>';

    ## Find the first item
    seek WXR, $headersize, 0;

    my $chunk = '';   # items collected for the piece currently being built
    my $i = 1;        # number of the piece currently being built

    while (my $record = <WXR>) {
        chomp $record;    # strips the trailing '<item>' separator

        ## first chunk is probably just whitespace between the channel info
        ## and the start of the first item. Skip it, if so:
        next unless $record =~ m|</item>|;

        my $item = "<item>\n" . $record;

        # Flush the pending piece when adding this item would reach the limit.
        # Never flush an empty piece: a single oversize item simply becomes an
        # oversize piece of its own.
        if (length($chunk)
            and ($overhead + length($chunk) + length($item)) >= $filesize) {
            writechunk($header . $chunk . $footer, chunkname($filename, $i));
            ++$i;
            $chunk = '';  # the current item is appended exactly once, below
        }
        $chunk .= $item;
    }

    ## Write final chunk. Its last record already ends with the closing tags
    ## from the input, so no footer is added. The name is computed here, from
    ## the current $i, so it can never collide with the piece flushed above.
    writechunk($header . $chunk, chunkname($filename, $i)) if length($chunk);
}

close WXR;
print "Done.\n";

# Build the name of output piece number $n for the input file $name:
#   export.wxr -> export-1.wxr   (same rule the script has always used)
#   export.xml -> export-1.xml
#   export     -> export-1
# The result always differs from $name, so a piece cannot overwrite the input.
sub chunkname {
    my ($name, $n) = @_;
    my $out = $name;
    return $out if $out =~ s/\.wxr/-$n.wxr/;
    return $out if $out =~ s/(\.[^.\/]+)\z/-$n$1/;
    return "$out-$n";
}
###### | |
# Print the command-line help to STDERR (via warn).
#
# Returns "\n" so that callers can write `die usage();` and have die() exit
# without appending a "at FILE line N" suffix.
sub usage {
    warn <<USAGE;
Usage: wxrsplit [opts]
Options:
  -f FILE   WXR file to split (defaults to 'output.wxr')
  -s SIZE   target size of each piece (defaults to 2M); a plain number of
            bytes, or a number followed by K or M (not case-sensitive)
  --help    show this message

Split a WXR file into multiple numbered pieces (export.wxr becomes
export-1.wxr, export-2.wxr, ...), aiming to keep each piece below the
given size. Items are never split, so a piece that holds one very large
item can exceed it.
USAGE
    return "\n"; # suppress line number reporting from die()
}
# Confirm that a path refers to an existing plain file.
#
# Takes the path to test. Hands the same path back when the -f file test
# succeeds, and undef for anything else (missing path, directory, ...).
sub checkfile {
    my ($path) = @_;
    return undef unless -f $path;
    return $path;
}
# Convert a human-friendly size such as '512', '10k' or '2M' into bytes.
#
# Accepts a positive whole number, optionally followed by a single K
# (x 1024) or M (x 1024 * 1024) suffix in either case. Trailing whitespace
# is tolerated.
#
# Returns the size in bytes, or nothing (undef in scalar context) when the
# input is undefined, malformed, uses an unknown suffix, or is zero.
sub parsesize {
    my $size = shift;
    return unless defined $size;

    my %multiplier_for = (
        ''  => 1,
        'k' => 1024,
        'm' => 1024 * 1024,
    );

    # The match result must be checked: after a failed match $1 and $2 still
    # hold whatever the caller's last successful match captured, which would
    # otherwise be mistaken for the size.
    return unless $size =~ m/\A(\d+)([km]?)\s*\z/i;
    my ($num, $unit) = ($1, lc $2);

    # A size of zero is as unusable as an unparseable one.
    return unless $num > 0;

    return $num * $multiplier_for{$unit};
}
# Extract the WXR header: everything in the input before the first <item>.
#
# Reads from the file-level WXR handle in blocks of $hdrsz bytes (both are
# set up by the main script). The WordPress export namespace declaration must
# appear within the first block, otherwise the input is not treated as a WXR
# file. The header itself may be longer than one block, so reading continues
# until the first <item> turns up or the input ends.
#
# The handle is always rewound to the start of the file before returning.
#
# Returns the header text, or undef when the input is empty or unreadable, is
# not a WXR file, or contains no <item> element.
sub getheader {
    my $buffer = '';

    # read() gives undef on error and 0 at end of file; both mean "no header".
    my $got = read(WXR, $buffer, $hdrsz);

    ## Is this really a WXR file?
    my $is_wxr = $got
        && $buffer =~ m|xmlns:wp="http://wordpress[.]org/export/\d+[.]\d+/"|;

    my $item_at = -1;
    if ($is_wxr) {
        my $from = 0;
        while (($item_at = index($buffer, '<item>', $from)) < 0) {
            # Resume the search a little before the old end of the buffer so
            # that an '<item>' tag split across two reads is still found.
            $from = length($buffer) > 5 ? length($buffer) - 5 : 0;
            my $more = read(WXR, $buffer, $hdrsz, length($buffer));
            last unless $more;
        }
    }

    ## rewind file
    seek WXR, 0, 0;

    return $item_at > 0 ? substr($buffer, 0, $item_at) : undef;
}
# Write one output piece to disk, replacing any existing file of that name.
#
# Takes the complete text of the piece and the path to write it to.
# Returns 1 on success. Dies with a message naming the file when it cannot be
# opened, written or closed, so that a failed write is never silently lost.
sub writechunk {
    my ($text, $outfile) = @_;

    # Three-argument open with a lexical handle: the filename is never parsed
    # for mode characters, and the handle cannot clash with other globals.
    open(my $out, '>', $outfile)
        or die "Could not open '$outfile' for writing: $!\n";
    print {$out} $text
        or die "Could not write to '$outfile': $!\n";
    # Buffered write errors (disk full, ...) only surface when closing.
    close($out)
        or die "Could not finish writing '$outfile': $!\n";

    return 1;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment