User:Hippietrail/wiktgrep.pl
Jump to navigation
Jump to search
#!/usr/bin/perl

# wiktgrep2 regexp
#
# Scans an English Wiktionary XML dump for articles whose name matches the
# regular expression given as the first argument and prints, for each match:
#
#   * the article name (as listed in the name index file),
#   * a "--------" separator,
#   * the entity-decoded content of the article's <text> element,
#   * another "--------" separator.
#
# Two pre-built index files (looked up in the current directory) are used to
# avoid scanning the whole dump:
#
#   <date>-all.txt   one article name per line, in dump order
#   <date>-off.raw   one 4-byte native unsigned integer per article, giving
#                    the offset in the (uncompressed) dump to seek to for the
#                    article on the same line of the name file
#
# The dump location is discovered by getconfig() below.
#
# Limitations: offsets are 32 bit, so positions past 4 GiB cannot be
# addressed; a bzip2-compressed dump can only be seeked forwards, which works
# because the index files are in dump order.

use strict;
use warnings;

use Path::Class;    # not standard!
use IO::Uncompress::Bunzip2 qw(bunzip2 $Bunzip2Error);
use HTML::Entities;

my $pattern = shift;
die "usage: $0 regexp\n" unless defined $pattern && length $pattern;

# Article names are decoded to characters when read, so decode the pattern
# too; utf8::decode leaves the string untouched if it is not valid UTF-8.
utf8::decode($pattern);
my $regexp = eval { qr/$pattern/ };
die "invalid regexp '$pattern': $@" unless defined $regexp;

my $config = getconfig();
die "no dump file found (check the 'dumppath' entry of the config directory)\n"
    unless defined $config->{'dumppath'} && defined $config->{'date'};

my $ext        = defined $config->{'ext'} ? $config->{'ext'} : '';
my $name_file  = $config->{'date'} . '-all.txt';
my $index_file = $config->{'date'} . '-off.raw';
my $df         = Path::Class::file(
    $config->{'dumppath'},
    'enwiktionary-' . $config->{'date'} . '-pages-articles.xml' . $ext
);

print "using dump file $df\n--------\n";

open(my $names_fh, '<', $name_file)  or die "no name file $name_file: $!";
open(my $index_fh, '<', $index_file) or die "no index file $index_file: $!";
open(my $dump_fh,  '<', "$df")       or die "no dump file $df: $!";

binmode(STDOUT, ':utf8');       # TODO experimental for windows console; make optional
binmode($names_fh, ':utf8');    # TODO experimental for windows console
binmode($index_fh);

# The dump is always read as raw bytes: byte offsets must stay meaningful and
# a bzip2 stream must not pass through a decoding layer.  Lines are decoded
# individually in read_dump_line().
binmode($dump_fh);

my $is_bzip2 = ("$df" =~ /\.bz2\z/) ? 1 : 0;
my $dumph;                      # plain file handle or Bunzip2 object

if ($is_bzip2) {
    print STDERR "** bzip2 compressed dump **\n";
    $dumph = IO::Uncompress::Bunzip2->new($dump_fh)
        or die "IO::Uncompress::Bunzip2 failed: $Bunzip2Error\n";
}
else {
    print STDERR "** uncompressed dump **\n";
    $dumph = $dump_fh;
}

my $article_index = -1;         # zero-based line number within the name file

while (my $name = <$names_fh>) {
    $article_index++;
    next unless $name =~ $regexp;

    print $name, "--------\n";

    my $offset = article_offset($index_fh, $article_index);

    if ($is_bzip2) {
        $dumph->seek($offset, 0) or die "can't seek compressed dump file";
    }
    else {
        seek($dumph, $offset, 0) or die "can't seek dump file";
    }

    print_article_text($dumph);

    print "\n--------\n";
}

exit;

##########################################

# article_offset($index_fh, $article_index)
#
# Returns the dump offset stored for the given zero-based article number.
# Each record in the index file is 4 bytes, unpacked as a native 32-bit
# unsigned integer.  Dies if the record cannot be read in full.
sub article_offset {
    my ($index_fh, $article_index) = @_;

    seek($index_fh, $article_index * 4, 0) or die "can't seek index file";

    my $raw;
    my $got = read($index_fh, $raw, 4);
    die "can't read index $article_index" unless defined $got && $got == 4;

    return unpack('L', $raw);
}

# read_dump_line($dumph)
#
# Reads the next line from the dump (plain handle or Bunzip2 object) and
# decodes it from UTF-8 in place.  Returns undef at end of input.
sub read_dump_line {
    my ($dumph) = @_;

    my $line = <$dumph>;
    utf8::decode($line) if defined $line;

    return $line;
}

# print_article_text($dumph)
#
# Starting at the current dump position, skips forward to the next <text>
# element and prints its content to STDOUT with HTML entities decoded.  The
# opening tag (with whatever attributes it carries) and everything from the
# closing </text> onwards are stripped, so the output does not end with a
# newline.  A self-closing <text ... /> prints nothing.  If the dump ends
# before the element is found or closed, a warning is issued and the sub
# returns instead of looping forever.
sub print_article_text {
    my ($dumph) = @_;

    # Skip the lines preceding the <text> element (title, ids, revision
    # metadata, ...).
    my $line;
    while (defined($line = read_dump_line($dumph))) {
        last if $line =~ /<text\b/;
    }
    unless (defined $line) {
        warn "no <text> element found before end of dump\n";
        return;
    }

    # An empty article body is written as a self-closing element; there is no
    # </text> to wait for.
    return if $line =~ m{<text\b[^>]*/>};

    # Drop the indentation and the opening tag, whatever its attributes.
    $line =~ s/\A\s*<text\b[^>]*>//;

    while (1) {
        # Markup inside the element is entity-encoded, so a literal </text>
        # can only be the closing tag.
        my $is_last = ($line =~ s{</text>.*\z}{}s);

        print decode_entities($line);
        return if $is_last;

        $line = read_dump_line($dumph);
        unless (defined $line) {
            warn "dump ended inside a <text> element\n";
            return;
        }
    }
}

# getconfig()
#
# Locates the '.mwconfig' directory in the current directory, the user's home
# directory or the script's directory (first hit wins) and reads the dump
# directory from the first line of the 'dumppath' file inside it.  The newest
# 'enwiktionary-YYYYMMDD-pages-articles.xml[.bz2]' in that directory is
# selected; for equal dates the uncompressed file is preferred.
#
# Returns a hash reference with the keys:
#   dumppath  directory named by the config (undef if it could not be read)
#   date      YYYYMMDD of the newest dump (undef if none was found)
#   ext       '.bz2' for a compressed dump, '' for an uncompressed one
#             (undef if none was found)
#
# Dies if no config directory exists; other problems are reported on STDOUT
# and leave the corresponding values undefined.
sub getconfig {
    require Cwd;
    require File::HomeDir;    # not standard!
    require FindBin;

    my $configname = '.mwconfig';
    my %config     = (
        'dumppath' => undef,
        'date'     => undef,
        'ext'      => undef,
    );

    my $configpath;
    for my $dir (Cwd::getcwd(), File::HomeDir->my_home, $FindBin::Bin) {
        next unless defined $dir;

        my $candidate = Path::Class::file($dir, $configname);
        if (-d "$candidate") {
            $configpath = $candidate;
            last;
        }
    }

    die "no config file in current dir, my home dir, or the script's dir"
        unless defined $configpath;

    print "using config file $configpath\n";

    my $configdumppath = Path::Class::file($configpath, 'dumppath');
    unless (-e "$configdumppath") {
        print "config doesn't contain dump path\n";
        return \%config;
    }

    my $dp_fh;
    unless (open($dp_fh, '<', "$configdumppath")) {
        print "can't open configdumppath\n";
        return \%config;
    }
    my $dumppath = <$dp_fh>;
    close $dp_fh;

    unless (defined $dumppath) {
        print "configdumppath is empty\n";
        return \%config;
    }
    $dumppath =~ s/[\r\n]+\z//;
    $config{'dumppath'} = $dumppath;

    unless (-d $dumppath) {
        print "specified dump file directory $dumppath doesn't exist\n";
        return \%config;
    }

    # Collect [ date, ext ] for every .xml and .xml.bz2 dump in the directory.
    my @dumps;
    if (opendir(my $dh, $dumppath)) {
        for my $entry (readdir $dh) {
            next
                unless $entry
                =~ /\Aenwiktionary-(\d{8})-pages-articles\.xml(\.bz2)?\z/;
            push @dumps, [ $1, defined $2 ? $2 : '' ];
        }
        closedir $dh;
    }
    else {
        print "can't read dump file directory $dumppath\n";
    }

    if (@dumps) {
        # Newest date first; on a tie '' sorts before '.bz2'.
        my ($newest) =
            sort { $b->[0] <=> $a->[0] || $a->[1] cmp $b->[1] } @dumps;
        ($config{'date'}, $config{'ext'}) = @{$newest};
    }

    return \%config;
}