#!/usr/bin/env perl
#
# $Id$ 
#
# generates in the current directory:
#  - unicode-blocks.h
#  - unicode-names.h
#  - unicode-nameslist.h
#  - unicode-unihan.h
#  - unicode-categories.h
#  - unicode-scripts.h
#  - unicode-versions.h
#
# usage: ./gen-guch-unicode-tables.pl UNICODE-VERSION UNICODE-DIRECTORY OUTPUT-DIRECTORY
# where DIRECTORY contains UnicodeData.txt Unihan.zip NamesList.txt Blocks.txt Scripts.txt
#
# NOTE! Some code copied from glib/glib/gen-unicode-tables.pl; keep in sync!

use strict;
use warnings;

use Env qw($PROG_UNZIP);
$PROG_UNZIP = "unzip" unless (defined $PROG_UNZIP);

$| = 1;  # flush stdout buffer

if (@ARGV != 3 && @ARGV != 4)
{
    $0 =~ s@.*/@@;
    die <<EOF

Usage: $0 UNICODE-VERSION UNICODE-DIRECTORY OUTPUT-DIRECTORY [--i18n]

DIRECTORY should contain the following Unicode data files:
UnicodeData.txt Unihan.zip NamesList.txt Blocks.txt Scripts.txt

which can be found at https://www.unicode.org/Public/UNIDATA/

EOF
}

my ($unicodedata_txt, $unihan_zip, $nameslist_txt, $blocks_txt, $scripts_txt, $versions_txt);

my $v = $ARGV[0];
my $d = $ARGV[1];
my $outdir = $ARGV[2];

my $gen_translatable_strings = 0;
if (@ARGV == 4)
{
    $gen_translatable_strings = 1 if ($ARGV[3] eq "--i18n") or die "Unknown option \"$ARGV[3]\"\n";
}

opendir (my $dir, $d) or die "Cannot open Unicode data dir $d: $!\n";
for my $f (readdir ($dir))
{
    $unicodedata_txt = "$d/$f" if ($f =~ /UnicodeData.*\.txt/);
    $unihan_zip = "$d/$f" if ($f =~ /Unihan.*\.zip/);
    $nameslist_txt = "$d/$f" if ($f =~ /NamesList.*\.txt/);
    $blocks_txt = "$d/$f" if ($f =~ /Blocks.*\.txt/);
    $scripts_txt = "$d/$f" if ($f =~ /Scripts.*\.txt/);
    $versions_txt = "$d/$f" if ($f =~ /DerivedAge.*\.txt/);
}

defined $unicodedata_txt or die "Did not find $d/UnicodeData.txt";
defined $unihan_zip or die "Did not find $d/Unihan.zip";
defined $nameslist_txt or die "Did not find $d/NamesList.txt";
defined $blocks_txt or die "Did not find $d/Blocks.txt";
defined $scripts_txt or die "Did not find $d/Scripts.txt";
defined $versions_txt or die "Did not find $d/DerivedAge.txt";

if ($gen_translatable_strings)
{
    process_translatable_strings ($blocks_txt, $scripts_txt);
}
else
{
    process_unicode_data_txt ($unicodedata_txt);
    process_nameslist_txt ($nameslist_txt);
    process_blocks_txt ($blocks_txt);
    process_scripts_txt ($scripts_txt);
    process_versions_txt ($versions_txt);
    process_unihan_zip ($unihan_zip);
}

exit;


#------------------------#

sub process_unicode_data_txt
{
    my ($unicodedata_txt) = @_;

    # part 1: names

    open (my $unicodedata, $unicodedata_txt) or die;
    open (my $out, "> $outdir/unicode-names.h") or die;

    print "processing $unicodedata_txt...";

    print $out "/* unicode-names.h */\n";
    print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
    print $out "/* Generated by $0 */\n";
    print $out "/* Generated from UCD version $v */\n\n";

    print $out "#ifndef UNICODE_NAMES_H\n";
    print $out "#define UNICODE_NAMES_H\n\n";

    print $out "#include <glib.h>\n\n";

    my @unicode_pairs;
    my %names;

    while (my $line = <$unicodedata>)
    {
        chomp $line;
        $line =~ /^([^;]+);([^;]+)/ or die;

        my $hex = $1;
        my $name = $2;

        # Skip items where we can easily reconstruct the name programmatically
        next if ($name =~ /^CJK UNIFIED IDEOGRAPH-[0-9A-F]{4,6}$/);
        next if ($name =~ /^CJK COMPATIBILITY IDEOGRAPH-[0-9A-F]{4,6}$/);
        next if ($name =~ /^TANGUT IDEOGRAPH-[0-9A-F]{4,6}$/);
        next if ($name =~ /^TANGUT COMPONENT-[0-9]+$/);
        next if ($name =~ /^KHITAN SMALL SCRIPT CHARACTER-[0-9A-F]+$/);

        # Skip unwanted items
        next if ($name =~ /^<.+, (First|Last)>$/);

        $names{$name} = 1;
        push @unicode_pairs, [$hex, $name];
    }

    print $out "static const char unicode_names_strings[] = \\\n";

    my $offset = 0;

    foreach my $name (sort keys %names) {
	print $out "  \"$name\\0\"\n";
	$names{$name} = $offset;
	$offset += length($name) + 1;
    }

    undef $offset;

    print $out ";\n";

    print $out "typedef struct _UnicodeName UnicodeName;\n\n";

    print $out "static const struct _UnicodeName\n";
    print $out "{\n";
    print $out "  gunichar index;\n";
    print $out "  guint32 name_offset;\n";
    print $out "} \n";
    print $out "unicode_names[] =\n";
    print $out "{\n";

    my $first_line = 1;

    foreach my $pair (@unicode_pairs) {
	if (!$first_line) {
	    print $out ",\n";
	} else {
	    $first_line = 0;
	}

	my ($hex, $name) = @{$pair};
	my $offset = $names{$name};
	print $out "  {0x$hex, $offset}";
    }

    print $out "\n};\n\n";

    print $out <<EOT;
static inline const char * unicode_name_get_name(const UnicodeName *entry)
{
  guint32 offset = entry->name_offset;
  return unicode_names_strings + offset;
}

EOT

    print $out "#endif  /* #ifndef UNICODE_NAMES_H */\n";

    undef %names;
    undef @unicode_pairs;

    close ($unicodedata);
    close ($out);

    # part 2: categories

    open ($unicodedata, $unicodedata_txt) or die;
    open ($out, "> $outdir/unicode-categories.h") or die;

    # Map general category code onto symbolic name.
    my %mappings =
    (
        # Normative.
        'Lu' => "G_UNICODE_UPPERCASE_LETTER",
        'Ll' => "G_UNICODE_LOWERCASE_LETTER",
        'Lt' => "G_UNICODE_TITLECASE_LETTER",
        'Mn' => "G_UNICODE_NON_SPACING_MARK",
        'Mc' => "G_UNICODE_COMBINING_MARK",
        'Me' => "G_UNICODE_ENCLOSING_MARK",
        'Nd' => "G_UNICODE_DECIMAL_NUMBER",
        'Nl' => "G_UNICODE_LETTER_NUMBER",
        'No' => "G_UNICODE_OTHER_NUMBER",
        'Zs' => "G_UNICODE_SPACE_SEPARATOR",
        'Zl' => "G_UNICODE_LINE_SEPARATOR",
        'Zp' => "G_UNICODE_PARAGRAPH_SEPARATOR",
        'Cc' => "G_UNICODE_CONTROL",
        'Cf' => "G_UNICODE_FORMAT",
        'Cs' => "G_UNICODE_SURROGATE",
        'Co' => "G_UNICODE_PRIVATE_USE",
        'Cn' => "G_UNICODE_UNASSIGNED",

        # Informative.
        'Lm' => "G_UNICODE_MODIFIER_LETTER",
        'Lo' => "G_UNICODE_OTHER_LETTER",
        'Pc' => "G_UNICODE_CONNECT_PUNCTUATION",
        'Pd' => "G_UNICODE_DASH_PUNCTUATION",
        'Ps' => "G_UNICODE_OPEN_PUNCTUATION",
        'Pe' => "G_UNICODE_CLOSE_PUNCTUATION",
        'Pi' => "G_UNICODE_INITIAL_PUNCTUATION",
        'Pf' => "G_UNICODE_FINAL_PUNCTUATION",
        'Po' => "G_UNICODE_OTHER_PUNCTUATION",
        'Sm' => "G_UNICODE_MATH_SYMBOL",
        'Sc' => "G_UNICODE_CURRENCY_SYMBOL",
        'Sk' => "G_UNICODE_MODIFIER_SYMBOL",
        'So' => "G_UNICODE_OTHER_SYMBOL"
    );

    # these shouldn't be -1
    my ($codepoint, $last_codepoint, $start_codepoint) = (-999, -999, -999);

    my ($category, $last_category) = ("G_FAKE1", "G_FAKE2");
    my ($started_range, $finished_range) = (undef, undef);

    print $out "/* unicode-categories.h */\n";
    print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
    print $out "/* Generated by $0 */\n";
    print $out "/* Generated from UCD version $v */\n\n";

    print $out "#ifndef UNICODE_CATEGORIES_H\n";
    print $out "#define UNICODE_CATEGORIES_H\n\n";

    print $out "#include <glib.h>\n\n";

    print $out "typedef struct _UnicodeCategory UnicodeCategory;\n\n";

    print $out "static const struct _UnicodeCategory\n";
    print $out "{\n";
    print $out "  gunichar start;\n";
    print $out "  gunichar end;\n";
    print $out "  GUnicodeType category;\n";
    print $out "}\n";
    print $out "unicode_categories[] =\n";
    print $out "{\n";

    while (my $line = <$unicodedata>)
    {
        $line =~ /^([0-9A-F]*);([^;]*);([^;]*);/ or die;
        my $codepoint = hex ($1);
        my $name = $2;
        my $category = $mappings{$3};

        if ($finished_range 
            or ($category ne $last_category) 
            or (not $started_range and $codepoint != $last_codepoint + 1))
        {
            if ($last_codepoint >= 0) {
                printf $out ("  { 0x%4.4X, 0x%4.4X, \%s },\n", $start_codepoint, $last_codepoint, $last_category);
            } 

            $start_codepoint = $codepoint;
        }

        if ($name =~ /^<.*First>$/) {
            $started_range = 1;
            $finished_range = undef;
        }
        elsif ($name =~ /^<.*Last>$/) {
            $started_range = undef;
            $finished_range = 1;
        }
        elsif ($finished_range) {
            $finished_range = undef;
        }

        $last_codepoint = $codepoint;
        $last_category = $category;
    }
    printf $out ("  { 0x%4.4X, 0x%4.4X, \%s },\n", $start_codepoint, $last_codepoint, $last_category);

    print $out "};\n\n";

    print $out "#endif  /* #ifndef UNICODE_CATEGORIES_H */\n";

    close ($out);
    print " done.\n";
}

#------------------------#

# XXX should do kFrequency too
sub process_unihan_zip
{
    my ($unihan_zip) = @_;

    print "processing $unihan_zip.";

    open (my $unihan, "$PROG_UNZIP -c '$unihan_zip' |") or die;
    open (my $out, "> $outdir/unicode-unihan.h") or die;

    print $out "/* unicode-unihan.h */\n";
    print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
    print $out "/* Generated by $0 */\n";
    print $out "/* Generated from UCD version $v */\n\n";

    print $out "#ifndef UNICODE_UNIHAN_H\n";
    print $out "#define UNICODE_UNIHAN_H\n\n";

    print $out "#include <glib.h>\n\n";

    print $out "typedef struct _Unihan Unihan;\n\n";

    print $out "static const struct _Unihan\n";
    print $out "{\n";
    print $out "  gunichar index;\n";
    print $out "  gint32 kDefinition;\n";
    print $out "  gint32 kCantonese;\n";
    print $out "  gint32 kMandarin;\n";
    print $out "  gint32 kTang;\n";
    print $out "  gint32 kKorean;\n";
    print $out "  gint32 kJapaneseKun;\n";
    print $out "  gint32 kJapaneseOn;\n";
    print $out "  gint32 kHangul;\n";
    print $out "  gint32 kVietnamese;\n";
    print $out "} \n";
    print $out "unihan[] =\n";
    print $out "{\n";

    my @strings;
    my $offset = 0;

    my $wc = 0;
    my ($kDefinition, $kCantonese, $kMandarin, $kTang, $kKorean, $kJapaneseKun, $kJapaneseOn, $kHangul, $kVietnamese);

    my $i = 0;
    while (my $line = <$unihan>)
    {
        chomp $line;
        $line =~ /^U\+([0-9A-F]+)\s+([^\s]+)\s+(.+)$/ or next;

        my $new_wc = hex ($1);
        my $field = $2;

        my $value = $3;
        $value =~ s/\\/\\\\/g;
        $value =~ s/\"/\\"/g;

        if ($new_wc != $wc)
        {
            if (defined $kDefinition or defined $kCantonese or defined $kMandarin 
                or defined $kTang or defined $kKorean or defined $kJapaneseKun
                or defined $kJapaneseOn or defined $kHangul or defined $kVietnamese)
            {
                printf $out ("  { 0x%04X, \%d, \%d, \%d, \%d, \%d, \%d, \%d, \%d, \%d },\n",
                             $wc,
                             (defined($kDefinition) ? $kDefinition : -1),
                             (defined($kCantonese) ? $kCantonese: -1),
                             (defined($kMandarin) ? $kMandarin : -1),
                             (defined($kTang) ? $kTang : -1),
                             (defined($kKorean) ? $kKorean : -1),
                             (defined($kJapaneseKun) ? $kJapaneseKun : -1),
                             (defined($kJapaneseOn) ? $kJapaneseOn : -1),
                             (defined($kHangul) ? $kHangul : -1),
                             (defined($kVietnamese) ? $kVietnamese : -1));
            }

            $wc = $new_wc;

            undef $kDefinition;
            undef $kCantonese;
            undef $kMandarin;
            undef $kTang;
            undef $kKorean;
            undef $kJapaneseKun;
            undef $kJapaneseOn;
            undef $kHangul;
            undef $kVietnamese;
        }

        for my $f (qw(kDefinition kCantonese kMandarin
                     kTang kKorean kJapaneseKun kJapaneseOn kHangul kVietnamese)) {

            if ($field eq $f) {
	        push @strings, $value;
		my $last_offset = $offset;
		$offset += length($value) + 1;
		$value = $last_offset;
		last;
	    }
	}

        if ($field eq "kDefinition") {
            $kDefinition = $value;
        }
        elsif ($field eq "kCantonese") {
            $kCantonese = $value;
        }
        elsif ($field eq "kMandarin") {
            $kMandarin = $value;
        }
        elsif ($field eq "kTang") {
            $kTang = $value;
        }
        elsif ($field eq "kKorean") {
            $kKorean = $value;
        }
        elsif ($field eq "kJapaneseKun") {
            $kJapaneseKun = $value;
        }
        elsif ($field eq "kJapaneseOn") {
            $kJapaneseOn = $value;
        }
        elsif ($field eq "kHangul") {
            $kHangul = $value;
        }
        elsif ($field eq "kVietnamese") {
            $kVietnamese = $value;
        }

        if ($i++ % 32768 == 0) {
            print ".";
        }
    }

    print $out "};\n\n";

    print $out "static const char unihan_strings[] = \\\n";

    for my $s (@strings) {
	print $out "  \"$s\\0\"\n";
    }
    print $out ";\n\n";

    print $out "static const Unihan *_get_unihan (gunichar uc)\n;";

    for my $name (qw(kDefinition kCantonese kMandarin
		    kTang kKorean kJapaneseKun kJapaneseOn kHangul kVietnamese)) {
    print $out <<EOT;

static inline const char * unihan_get_$name (const Unihan *uh)
{
    gint32 offset = uh->$name;
    if (offset == -1)
      return NULL;
    return unihan_strings + offset;
}

const gchar * 
gucharmap_get_unicode_$name (gunichar uc)
{
  const Unihan *uh = _get_unihan (uc);
  if (uh == NULL)
    return NULL;
  else
    return unihan_get_$name (uh);
}

EOT
    }

    print $out "#endif  /* #ifndef UNICODE_UNIHAN_H */\n";

    close ($unihan);
    close ($out);

    print " done.\n";
}

#------------------------#

# $nameslist_hash = 
# {
#     0x0027 => { '=' => { 
#                          'index'  => 30, 
#                          'values' => [ 'APOSTROPHE-QUOTE', 'APL quote' ]
#                        }
#                 '*' => {
#                          'index'  => 50,
#                          'values' => [ 'neutral (vertical) glyph with mixed usage',
#                                        '2019 is preferred for apostrophe',
#                                        'preferred characters in English for paired quotation marks are 2018 & 2019'
#                                      ]
#                         }
#                  # etc
#                }
#     # etc 
# };
# 

sub print_names_list
{
    my ($out, $nameslist_hash, $token, $variable_name) = @_;

    print $out "static const char ", $variable_name, "_strings[] = \n";

    my @names_pairs;
    my %names_offsets;
    my $offset = 0;

    for my $wc (sort {$a <=> $b} keys %{$nameslist_hash})
    {
        next if not exists $nameslist_hash->{$wc}->{$token};
        for my $value (@{$nameslist_hash->{$wc}->{$token}->{'values'}}) {
            push @names_pairs, [$wc, $value];
            next if exists $names_offsets{$value};

            $names_offsets{$value} = $offset;
            $offset += length($value) + 1;

            my $printvalue = $value;
            $printvalue =~ s/\\/\\\\/g;
            $printvalue =~ s/\"/\\"/g;

            printf $out (qq/  "\%s\\0"\n/, $printvalue);
        }
    }

    print $out "  ;\n\n";

    print $out "static const UnicharStringIndex ", $variable_name, "[] = \n";
    print $out "{\n";
    foreach my $pair (@names_pairs) {
	my ($wc, $value) = @{$pair};
        printf $out (qq/  { 0x%04X, %d },\n/, $wc, $names_offsets{$value});
    }
    print $out "  { (gunichar)(-1), 0 } /* end marker */ \n";
    print $out "};\n\n";
}

sub process_nameslist_txt
{
    my ($nameslist_txt) = @_;

    open (my $nameslist, $nameslist_txt) or die;

    print "processing $nameslist_txt...";

    my ($equal_i, $ex_i, $star_i, $pound_i, $colon_i) = (0, 0, 0, 0, 0);
    my $wc = 0;

    my $nameslist_hash;
    my $in_multiline_comment = 0;
    my $seen_v = 0;

    while (my $line = <$nameslist>)
    {
        if ($in_multiline_comment && $line =~ /^\t/)
        {
            next;
        }

        chomp ($line);

        $in_multiline_comment = 0;

        if ($line =~ /^@\+/)
        {
            $in_multiline_comment = 1;
            next;
        }
        elsif ($line =~ /^@@@\tThe Unicode Standard ([0-9]+\.[0-9]+\.[0-9]+)$/)
        {
            die "$d contains unicode data for version $1 but version $v is required" unless $1 eq $v;
            $seen_v = 1;
        }
        elsif ($line =~ /^@/)
        {
            next;
        }
        elsif ($line =~ /^([0-9A-F]+)/)
        {
            $wc = hex ($1);
        }
        elsif ($line =~ /^\s+=\s+(.+)$/)
        {
            my $value = $1;

            if (not defined $nameslist_hash->{$wc}->{'='}->{'index'}) {
                $nameslist_hash->{$wc}->{'='}->{'index'} = $equal_i;
            }
            push (@{$nameslist_hash->{$wc}->{'='}->{'values'}}, $value);

            $equal_i++;
        }
        elsif ($line =~ /^\s+\*\s+(.+)$/)
        {
            my $value = $1;

            if (not defined $nameslist_hash->{$wc}->{'*'}->{'index'}) {
                $nameslist_hash->{$wc}->{'*'}->{'index'} = $star_i;
            }
            push (@{$nameslist_hash->{$wc}->{'*'}->{'values'}}, $value);

            $star_i++;
        }
        elsif ($line =~ /^\s+#\s+(.+)$/)
        {
            my $value = $1;

            if (not defined $nameslist_hash->{$wc}->{'#'}->{'index'}) {
                $nameslist_hash->{$wc}->{'#'}->{'index'} = $pound_i;
            }
            push (@{$nameslist_hash->{$wc}->{'#'}->{'values'}}, $value);

            $pound_i++;
        }
        elsif ($line =~ /^\s+:\s+(.+)$/)
        {
            my $value = $1;

            if (not defined $nameslist_hash->{$wc}->{':'}->{'index'}) {
                $nameslist_hash->{$wc}->{':'}->{'index'} = $colon_i;
            }
            push (@{$nameslist_hash->{$wc}->{':'}->{'values'}}, $value);

            $colon_i++;
        }
        elsif ($line =~ /^\s+x\s+.*?([0-9A-F]{4,6})\)$/)  # this one is different
        {
            my $value = hex ($1);

            if (not defined $nameslist_hash->{$wc}->{'x'}->{'index'}) {
                $nameslist_hash->{$wc}->{'x'}->{'index'} = $ex_i;
            }
            push (@{$nameslist_hash->{$wc}->{'x'}->{'values'}}, $value);

            $ex_i++;
        }
    }

    close ($nameslist);

    die "Unicode version marker not found in $nameslist_txt" unless $seen_v;

    open (my $out, "> $outdir/unicode-nameslist.h") or die;

    print $out "/* unicode-nameslist.h */\n";
    print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
    print $out "/* Generated by $0 */\n";
    print $out "/* Generated from UCD version $v */\n\n";

    print $out "#ifndef UNICODE_NAMESLIST_H\n";
    print $out "#define UNICODE_NAMESLIST_H\n\n";

    print $out "#include <glib.h>\n\n";

    print $out "typedef struct _UnicharStringIndex UnicharStringIndex;\n";
    print $out "typedef struct _UnicharUnichar UnicharUnichar;\n";
    print $out "typedef struct _NamesList NamesList;\n\n";

    print $out "struct _UnicharStringIndex\n";
    print $out "{\n";
    print $out "  gunichar index;\n";
    print $out "  guint32 string_index;\n";
    print $out "}; \n\n";

    print $out "struct _UnicharUnichar\n";
    print $out "{\n";
    print $out "  gunichar index;\n";
    print $out "  gunichar value;\n";
    print $out "}; \n\n";

    print $out "struct _NamesList\n";
    print $out "{\n";
    print $out "  gunichar index;\n";
    print $out "  gint16 equals_index;  /* -1 means */\n";
    print $out "  gint16 stars_index;   /* this character */\n";
    print $out "  gint16 exes_index;    /* doesn't */\n";
    print $out "  gint16 pounds_index;  /* have any */\n";
    print $out "  gint16 colons_index;\n";
    print $out "};\n\n";

    print_names_list($out, $nameslist_hash, '=', "names_list_equals");
    print_names_list($out, $nameslist_hash, '*', "names_list_stars");
    print_names_list($out, $nameslist_hash, '#', "names_list_pounds");
    print_names_list($out, $nameslist_hash, ':', "names_list_colons");

    print $out "static const UnicharUnichar names_list_exes[] = \n";
    print $out "{\n";
    for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
    {
        next if not exists $nameslist_hash->{$wc}->{'x'};
        for my $value (@{$nameslist_hash->{$wc}->{'x'}->{'values'}}) {
            printf $out (qq/  { 0x%04X, 0x%04X },\n/, $wc, $value);
        }
    }
    print $out "  { (gunichar)(-1), 0 }\n";
    print $out "};\n\n";

    print $out "static const NamesList names_list[] =\n";
    print $out "{\n";
    for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
    {
        my $eq    = exists $nameslist_hash->{$wc}->{'='}->{'index'} ? $nameslist_hash->{$wc}->{'='}->{'index'} : -1;
        my $star  = exists $nameslist_hash->{$wc}->{'*'}->{'index'} ? $nameslist_hash->{$wc}->{'*'}->{'index'} : -1;
        my $ex    = exists $nameslist_hash->{$wc}->{'x'}->{'index'} ? $nameslist_hash->{$wc}->{'x'}->{'index'} : -1;
        my $pound = exists $nameslist_hash->{$wc}->{'#'}->{'index'} ? $nameslist_hash->{$wc}->{'#'}->{'index'} : -1;
        my $colon = exists $nameslist_hash->{$wc}->{':'}->{'index'} ? $nameslist_hash->{$wc}->{':'}->{'index'} : -1;

        printf $out ("  { 0x%04X, \%d, \%d, \%d, \%d, \%d },\n", $wc, $eq, $star, $ex, $pound, $colon);
    }
    print $out "};\n\n";

    print $out "#endif  /* #ifndef UNICODE_NAMESLIST_H */\n";

    close ($out);

    print " done.\n";
}

#------------------------#

sub read_blocks_txt
{
    my ($blocks_txt, $blocks) = @_;

    # Override script names
    my %block_overrides =
    (
      "NKo" => "N\'Ko"
    );

    open (my $blocks_file, $blocks_txt) or die;

    my $offset = 0;

    while (my $line = <$blocks_file>)
    {
        $line =~ /^([0-9A-F]+)\.\.([0-9A-F]+); (.+)$/ or next;

        my ($start,$end,$block) = ($1, $2, $3);

        if (exists $block_overrides{$block}) {
                $block = $block_overrides{$block};
        }

        push @$blocks, [$start, $end, $block, $offset];
        $offset += length($block) + 1;
    }

    close ($blocks_file);
}

sub process_blocks_txt
{
    my ($blocks_txt) = @_;

    print "processing $blocks_txt...";

    open (my $out, "> $outdir/unicode-blocks.h") or die;

    print $out "/* unicode-blocks.h */\n";
    print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
    print $out "/* Generated by $0 */\n";
    print $out "/* Generated from UCD version $v */\n\n";

    print $out "#ifndef UNICODE_BLOCKS_H\n";
    print $out "#define UNICODE_BLOCKS_H\n\n";

    print $out "#include <glib.h>\n";

    my @blocks;
    read_blocks_txt ($blocks_txt, \@blocks);

    print $out "static const char unicode_blocks_strings[] =\n";
    foreach my $block (@blocks)
    {
        my ($start, $end, $name, $offset) = @{$block};
        print $out qq/  "$name\\0"\n/;
    }
    print $out "  ;\n\n";

    print $out "typedef struct _UnicodeBlock UnicodeBlock;\n";
    print $out "\n";
    print $out "static const struct _UnicodeBlock\n";
    print $out "{\n";
    print $out "  gunichar start;\n";
    print $out "  gunichar end;\n";
    print $out "  guint16 block_name_index;\n";
    print $out "}\n";
    print $out "unicode_blocks[] =\n";
    print $out "{\n";
    foreach my $block (@blocks)
    {
        my ($start, $end, $name, $offset) = @{$block};
        print $out qq/  { 0x$start, 0x$end, $offset },\n/;
    }
    print $out "};\n\n";

    print $out "#endif  /* #ifndef UNICODE_BLOCKS_H */\n";

    close ($out);

    print " done.\n";
}

#------------------------#

sub read_scripts_txt
{
    my ($scripts_txt, $script_hash, $scripts) = @_;

    # Override script names
    my %script_overrides =
    (
      "Nko" => "N\'Ko"
    );

    open (my $scripts_file, $scripts_txt) or die;

    while (my $line = <$scripts_file>)
    {
        my ($start, $end, $raw_script);

        if ($line =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\S+)/)
        {
            $start = hex ($1);
            $end = hex ($2);
            $raw_script = $3;
        }
        elsif ($line =~ /^([0-9A-F]+)\s+;\s+(\S+)/)
        {
            $start = hex ($1);
            $end = $start;
            $raw_script = $2;
        }
        else 
        {
            next;
        }

        my $script = $raw_script;
        $script =~ tr/_/ /;
        $script =~ s/(\w+)/\u\L$1/g;

        if (exists $script_overrides{$script}) {
                $script = $script_overrides{$script};
        }

        $script_hash->{$start} = { 'end' => $end, 'script' => $script };
        $scripts->{$script} = 1;
    }

    close ($scripts_file);

    # Adds Common to make sure works with UCD <= 4.0.0
    $scripts->{"Common"} = 1; 

    # Add Unknown (for code points not explicitly listed as script)
    $scripts->{"Unknown"} = 1;
}

sub process_scripts_txt
{
    my ($scripts_txt) = @_;

    print "processing $scripts_txt...";

    my %script_hash;
    my %scripts;

    read_scripts_txt ($scripts_txt, \%script_hash, \%scripts);

    open (my $out, "> $outdir/unicode-scripts.h") or die;

    print $out "/* unicode-scripts.h */\n";
    print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
    print $out "/* Generated by $0 */\n";
    print $out "/* Generated from UCD version $v */\n\n";

    print $out "#ifndef UNICODE_SCRIPTS_H\n";
    print $out "#define UNICODE_SCRIPTS_H\n\n";

    print $out "#include <glib.h>\n";

    print $out "typedef struct _UnicodeScript UnicodeScript;\n\n";

    print $out "static const gchar unicode_script_list_strings[] =\n";
    my $offset = 0;
    my $i = 0;
    my %script_offsets;
    for my $script (sort keys %scripts)
    {
        printf $out (qq/  "\%s\\0"\n/, $script);
        $scripts{$script} = $i;
        $i++;
	$script_offsets{$script} = $offset;
	$offset += length($script) + 1;
    }
    print $out "  ;\n\n";
    undef $offset;

    print $out "static const guint16 unicode_script_list_offsets[] =\n";
    print $out "{\n";
    for my $script (sort keys %scripts)
    {
        printf $out (qq/  \%d,\n/, $script_offsets{$script});
    }
    print $out "};\n\n";

    print $out "static const struct _UnicodeScript\n";
    print $out "{\n";
    print $out "  gunichar start;\n";
    print $out "  gunichar end;\n";
    print $out "  guint8 script_index;   /* index into unicode_script_list_offsets */\n";
    print $out "}\n";
    print $out "unicode_scripts[] =\n";
    print $out "{\n";
    for my $start (sort { $a <=> $b } keys %script_hash) 
    {
        printf $out (qq/  { 0x%04X, 0x%04X, \%2d },\n/, 
                     $start, $script_hash{$start}->{'end'}, $scripts{$script_hash{$start}->{'script'}});
    }
    print $out "};\n\n";

    print $out "#endif  /* #ifndef UNICODE_SCRIPTS_H */\n";

    close ($out);
    print " done.\n";
}

#------------------------#

sub process_translatable_strings
{
    my ($blocks_txt, $scripts_txt) = @_;

    print "processing $blocks_txt and $scripts_txt...";

    my @blocks;
    read_blocks_txt ($blocks_txt, \@blocks);

    my %script_hash;
    my %scripts;

    read_scripts_txt ($scripts_txt, \%script_hash, \%scripts);

    open (my $out, "> $outdir/unicode-i18n.h") or die;

    print $out "unicode-i18n.h for extraction by gettext\n";
    print $out "THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN.\n";
    print $out "Generated by $0\n";
    print $out "Generated from UCD version $v\n\n";

    foreach my $block (@blocks)
    {
        my ($start, $end, $name, $offset) = @{$block};
        print $out qq/N_("$name")\n/;
    }

    print $out "\n";

    my $i = 0;
    for my $script (sort keys %scripts)
    {
        print $out qq/N_("$script")\n/;
    }

    close ($out);
    print " done.\n";
}

#------------------------#

sub process_versions_txt
{
    my ($versions_txt) = @_;

    my %version_hash;
    my %versions;

    open (my $versions, $versions_txt) or die;
    open (my $out, "> $outdir/unicode-versions.h") or die;

    print "processing $versions_txt...";

    while (my $line = <$versions>)
    {
        my ($start, $end, $raw_version);

        if ($line =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\S+)/)
        {
            $start = hex ($1);
            $end = hex ($2);
            $raw_version = $3;
        }
        elsif ($line =~ /^([0-9A-F]+)\s+;\s+(\S+)/)
        {
            $start = hex ($1);
            $end = $start;
            $raw_version = $2;
        }
        else 
        {
            next;
        }

        my $version = $raw_version;
        $version =~ tr/_/ /;
        $version =~ s/(\w+)/\u\L$1/g;

        $versions{$version} = 1;

        $version =~ s/\./_/g;
        $version_hash{$start} = { 'end' => $end, 'version' => $version };
    }

    close ($versions);

    print $out "/* unicode-versions.h */\n";
    print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
    print $out "/* Generated by $0 */\n";
    print $out "/* Generated from UCD version $v */\n\n";

    print $out "#ifndef UNICODE_VERSIONS_H\n";
    print $out "#define UNICODE_VERSIONS_H\n\n";

    print $out "#include <glib.h>\n";

    print $out "typedef struct {\n";
    print $out "  gunichar start;\n";
    print $out "  gunichar end;\n";
    print $out "  GucharmapUnicodeVersion version;\n";
    print $out "} UnicodeVersion;\n\n";

    print $out "static const UnicodeVersion unicode_versions[] =\n";
    print $out "{\n";
    for my $start (sort { $a <=> $b } keys %version_hash)
    {
        printf $out (qq/  { 0x%04X, 0x%04X, GUCHARMAP_UNICODE_VERSION_\%s },\n/,
                     $start, $version_hash{$start}->{'end'}, $version_hash{$start}->{'version'});
    }
    print $out "};\n\n";

    print $out "static const gchar unicode_version_strings[] =\n";
    my $offset = 0;
    my %version_offsets;
    for my $version (sort { $a <=> $b } keys %versions)
    {
        printf $out (qq/  "\%s\\0"\n/, $version);
	$version_offsets{$version} = $offset;
	$offset += length($version) + 1;
    }
    print $out "  ;\n\n";
    undef $offset;

    print $out "static const guint16 unicode_version_string_offsets[] =\n";
    print $out "{\n";
    for my $version (sort { $a <=> $b } keys %versions)
    {
        printf $out (qq/  \%d,\n/, $version_offsets{$version});
    }
    print $out "};\n\n";

    print $out "#endif  /* #ifndef UNICODE_VERSIONS_H */\n";

    close ($out);
    print " done.\n";
}
