#!/usr/bin/perl -CIO
use strict;
use warnings;

# Minimum number of occurrences of a word for it to be listed.
my $MIN_FREQ = 10;

# Regex matching a single letter (can be overridden to restrict to an
# appropriate alphabet).
my $LETTER_RE = '\w';

# Positional arguments: [MIN_FREQ [SCRIPT]].  A first argument starting with
# "-" (e.g. --help) is deliberately left in @ARGV, as is anything following
# SCRIPT.
if (@ARGV && $ARGV[0] !~ /^-/) {
    $MIN_FREQ = shift @ARGV;

    # Reject a non-numeric threshold up front; otherwise every numeric
    # comparison against it would emit an "isn't numeric" warning.
    require Scalar::Util;
    unless (Scalar::Util::looks_like_number($MIN_FREQ)) {
        warn "$0: MIN_FREQ must be a number, not '$MIN_FREQ' (try --help)\n";
        exit 1;
    }

    if (@ARGV) {
        my $script = shift @ARGV;
        if ($script eq 'ascii') {
            $LETTER_RE = '[a-z]';
        } elsif ($script eq 'latin1') {
            # U+00D7 (multiplication sign) and U+00F7 (division sign) sit in
            # the middle of the Latin-1 letter range but are not letters, so
            # they are excluded from the class.
            $LETTER_RE = '[a-z\x{c0}-\x{d6}\x{d8}-\x{f6}\x{f8}-\x{ff}]';
        } elsif ($script =~ /^[A-Za-z]+$/) {
            # A purely alphabetic argument is taken as a Unicode script name.
            # Perl handles script names case-insensitively.
            $LETTER_RE = "\\p{Script: $script}";
        } else {
            # Anything else is a user-supplied regex; wrap it in a
            # non-capturing group so it can be quantified as a unit.
            $LETTER_RE = "(?:$script)";
        }
    }
}

# Any argument still present in @ARGV here is either an option such as
# --help or something unexpected: print the usage message and stop.
if (@ARGV) {
    # An explicit --help is a successful run and goes to stdout; anything
    # else is a usage error, reported on stderr with a non-zero exit status.
    my $help_requested = $ARGV[0] eq '--help';
    my $out = $help_requested ? \*STDOUT : \*STDERR;

    # NB: this heredoc interpolates, so regex backslashes must be doubled
    # (a lone "\b" would be emitted as a backspace character).
    print {$out} <<"__END__";
Usage: $0 [MIN_FREQ [SCRIPT]]

MIN_FREQ is the minimum number of occurrences needed for a word to be
included.  The default is 10.

SCRIPT can be:

* A Unicode script name (matched case-insensitively).  The Perl regex
  used is: \\p{Script: ...} which generally seems suitable.  This has
  been tested for at least: "armenian", "devanagari", "greek", "latin",
  and "tamil".
* "ascii" to match only ASCII letters.
* "latin1" to match only Latin1 letters.
* A regular expression to match a letter in the script.

If not specified, the default regex used is: \\w

The script only considers words which match the regex \\b(?:\$SCRIPT)+\\b and
are not changed by the Perl "lc" function.

For example:

WikiExtractor.py XXwiki-latest-pages-articles.xml.bz2
cat text/*/wiki_*|$0 42 ascii > voc.txt

Extract a list of the most common words from a wikipedia dump.  In this
example, XX is the ISO 639-1 language code.
__END__
    exit($help_requested ? 0 : 1);
}

# Number of occurrences seen for each all-lower-case word in the input.
my %f;

# A word is a maximal run of letters delimited by word boundaries.  The
# pattern is compiled once here with qr// (rather than relying on the
# obsolescent /o flag), so a bad letter regex is reported before any input
# is consumed.
my $word_re = qr/\b$LETTER_RE+\b/;

while (my $line = <>) {
    # Blank out markup tags and character entities so that their contents
    # are not counted as words.
    $line =~ s!<[^>]*>! !g;
    $line =~ s!&[^; ]*;! !g;

    while ($line =~ /$word_re/g) {
        my $word = $&;
        # Only count words that lc leaves unchanged, i.e. skip anything
        # containing upper-case letters.  Perl lc is Unicode-aware for
        # strings with the UTF8 flag set (which it will be here).
        ++$f{$word} if lc($word) eq $word;
    }
}

# Print, in sorted order, every word that occurred at least $MIN_FREQ times.
for my $word (sort keys %f) {
    print "$word\n" if $f{$word} >= $MIN_FREQ;
    # To include the counts as well, print "$f{$word}\t$word\n" instead.
}
