#!/usr/bin/perl -w

#########################################################################
#
#  Generate utf-8 conversion table for EPrints 2.3.3
#
#  Copyright (c) 2004 Laszlo Csirmaz
#
#########################################################################

=pod

=head1 NAME

B<generate_utf8> - utf-8 conversion table for EPrints 2.3.3

=head1 SYNOPSIS

B<generate_utf8>

=head1 DESCRIPTION

Dump a conversion table for EPrints 2.3.3 to the standard output. 
The table consists of three columns: the first one holds the UTF-8
characters which are considered to be letters, latin digits, and
hyphens. The second column is the lower-case equivalent, and the 
third column is the plain latin equivalent. The third column might 
have more than one character.

The table is used in the full-text indexing process. Only sequences
consisting of "letters" are considered. Every constituent character is
replaced by its lower-case equivalent. Furthermore, each word is
accompanied by its latin equivalent. When searching for an accented
word the result will be exact; however, when searching without accents
the result includes all accented versions.

The table is generated from UnicodeData.txt, supplied with the perl
distribution. Entries with the "L" flag set are considered letters.
The character description is consulted to check for a LATIN letter and
to decide which is the corresponding unaccented version. 

=head1 AUTHOR

Laszlo Csirmaz, csirmaz@renyi.hu

=head1 VERSION

Version 1.0

=cut

use Unicode::String qw( utf8 uchr utf16 );

use strict;

## Locate UnicodeData.txt: look for the first directory in @INC that
## contains it under the unicore/ subdirectory.
my $file = "unicore/UnicodeData.txt";
my $path;

foreach my $dir ( @INC ){
    if( -f "$dir/$file" ){
        $path = "$dir/$file";
        last;
    }
}

## Fail with a clear message instead of blindly trying the last @INC entry.
defined $path
    or die "Cannot find $file in any \@INC directory\n";

open( my $ucd, '<', $path )
    or die "Cannot open $path: $!\n";

## Everything printed below forms the source of a perl sub which returns
## a reference to the conversion hash.
print "sub UnicodeConvSub { my %hash = (\n";

## digits
foreach my $digit ( qw( 0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 ) ){
    print_line( $digit, $digit, chr(hex("0x".$digit)), 'DIGIT' );
}

while( my $line = <$ucd> ){
    chomp $line;
## split drops trailing empty fields; the appended ";;x" sentinel keeps
## the lower-case mapping (field 13) defined for a full-length record
## even when the last fields of that record are empty.
    my @field = split( /;/, $line.";;x" );
    my( $code, $name, $category, $flag, $lower ) = @field[ 0, 1, 2, 4, 13 ];

## test for hyphens: every character of category Pd is mapped to 002D
    if( $category eq "Pd" ){
        print_line( $code, '002D', '-', $name );
    }

## skip if it is not a letter
## NOTE(review): in UnicodeData.txt field 4 is the bidirectional class,
## not the general category - confirm that this is the intended test.
    next if( $flag ne "L" );
## skip if in the description there is no LATIN
    next if( $name !~ /LATIN/ );
## we try to figure out what type of letter is it.
## after the LETTER keyword, skip over modifier words such as
##     SHARP DOTLESS LONG OPEN AFRICAN TURNED REVERSED SCRIPT CAPITAL
##     BARRED SQUAT SIDEWAYS
## (the authoritative list is the pattern in skip_magic_words).
## if the next sequence is one or two letters long, it is included.
## these chars are not included: THORN, ETH, KRA, ENG, TONE, SCHWA
##     ESH, EZH, TWO, INVERTED, WYNN, DENTAL, LATERAL
## after the LIGATURE keyword, skip over LONG; keep FFI,FFL and two
##      char sequences
    my $ltr = "";
    my $rest;
    if( $name =~ /LIGATURE ([A-Z]+)\s*(.*)$/ ){
        ( $ltr, $rest ) = ( $1, $2 );
        ## "LONG S T": drop LONG and keep the first following letter ...
        if( $ltr =~ /LONG/ ){ $ltr = $rest; $ltr =~ s/[^A-Z].*$//; }
        ## ... but two separate single letters are glued together
        if( $rest =~ /^([A-Z])\s+([A-Z])$/ ){
            $ltr = $1.$2;
        }
    }
    if( $name =~ /LETTER ([A-Z]+)\s*(.*)$/ ){
        ( $ltr, $rest ) = skip_magic_words( $1, $2 );
        ## a second LETTER keyword (e.g. "D WITH SMALL LETTER Z")
        ## contributes a second letter
        if( $rest =~ /LETTER ([A-Z]+)\s*(.*)$/ ){
            my( $ltr2 ) = skip_magic_words( $1, $2 );
            $ltr .= $ltr2;
        }
    }
    next if( $ltr !~ /^(FFI|FFL|..?)$/ );
## the lower case equivalent is field 13; when that is empty the
## character itself is used
    print_line( $code, $lower eq "" ? $code : $lower, $ltr, $name );
}

print "); return \\%hash; }\n";
close( $ucd );

exit 0;

########################################################################
#
#  sub skip_magic_words( $first, $rest )
#
#  $first is one word of a character name, $rest is the text following
#  it.  While $first is one of the "magic" modifier words (SHARP,
#  DOTLESS, LONG, ...), it is dropped and replaced by the next run of
#  capital letters found in $rest.
#
#  Returns the remaining ($first,$rest) pair.  Both are empty strings
#  when the words run out before a non-magic word is found.
#
########################################################################

sub skip_magic_words
{
    my($first,$rest) = @_;

    # The match is anchored so that only a whole magic word is skipped,
    # never a longer word which merely contains one.  STRETCHED is the
    # spelling used in the Unicode names (LATIN LETTER STRETCHED C); the
    # former STRECHED spelling could never match it.
    my $magic =
     qr/^(?:SHARP|DOTLESS|LONG|OPEN|AFRICAN|TURNED|REVERSED|SCRIPT|CAPITAL|BARRED|SQUAT|STRETCHED|SIDEWAYS|CLOSED|SMALL)$/;

    while( $first =~ $magic )
    {
        if( $rest =~ /([A-Z]+)\s*(.*)$/ )
        {
            # shift to the next word of the name
            ($first,$rest) = ($1,$2);
        }
        else
        {
            # nothing left: report "no letter found"
            $first = $rest = "";
        }
    }
    return ($first,$rest);
}

########################################################################
#
#  sub print_line( $hex_char, $hex_lower_case, $latin, $comment )
#
#  Print one entry of the conversion table to the standard output:
#
#     'CHAR' => [ 'LOWER', 'latin'], #HEX comment
#
#  $hex_char and $hex_lower_case are code points written in hexadecimal;
#  they are turned into characters by uchr (imported from
#  Unicode::String).  $latin is printed in lower case.
#
########################################################################

sub print_line {

    my ( $hex_char, $hex_lower_case, $latin, $comment ) = @_;

    # the hash key and the first array element are the characters themselves
    my $char  = uchr( hex( "0x".$hex_char ) );
    my $lower = uchr( hex( "0x".$hex_lower_case ) );

    print "'", $char, "' => [ '", $lower, "', '", lc($latin), "'], #",
          $hex_char, " ", $comment, "\n";
}