#!/usr/bin/perl -w
#########################################################################
#
# Generate utf-8 conversion table for EPrints 2.3.3
#
# Copyright (c) 2004 Laszlo Csirmaz
#
#########################################################################

=pod

=head1 NAME

B - utf-8 conversion table for EPrints 2.3.3

=head1 SYNOPSIS

B

=head1 DESCRIPTION

Dump a conversion table for EPrints 2.3.3 to the standard output. The
table consists of three columns: the first column holds the UTF-8
characters that are considered to be letters, Latin digits, or hyphens.
The second column is the lower-case equivalent, and the third column is
the plain Latin equivalent. The third column might have more than one
character.

The table is used in the full-text indexing process. Only sequences
consisting of "letters" are considered. Every constituent character is
replaced by its lower-case equivalent. Furthermore, each word is
accompanied by its Latin equivalent. When searching for an accented word
the result is exact; when searching without accents, the result includes
all accented versions.

The table is generated from UnicodeData.txt, supplied with the perl
distribution. Entries with the "L" flag set are considered letters. The
character description is consulted to check whether the character is a
LATIN letter, and to decide which is the corresponding unaccented
version.

=head1 AUTHOR

Laszlo Csirmaz, csirmaz@renyi.hu

=head1 VERSION

Version 1.0

=cut

use strict;
use Unicode::String qw( utf8 uchr utf16 );

## Locate UnicodeData.txt: take the first @INC directory that has it.
my $file = "unicore/UnicodeData.txt";
my $path;
foreach my $inc ( @INC ) {
    next if ref $inc;    # @INC may hold hooks (references), not only directories
    if ( -f "$inc/$file" ) {
        $path = "$inc/$file";
        last;
    }
}
defined $path or die "Cannot open UnicodeData.txt file\n";
open( my $ucd, '<', $path )
    or die "Cannot open UnicodeData.txt file: $path: $!\n";

## The output is the source of a sub returning a reference to the table.
print "sub UnicodeConvSub { my %hash = (\n";

## digits: each one is its own lower-case and its own latin equivalent
foreach my $digit ( qw( 0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 ) ) {
    print_line( $digit, $digit, chr( hex( "0x" . $digit ) ), 'DIGIT' );
}

## One record per line; fields are separated by semicolons.
while ( my $line = <$ucd> ) {
    chomp $line;

    ## split drops trailing empty fields, so pad the record with a
    ## non-empty sentinel to keep every column addressable.
    my @field = split( /;/, $line . ";;x" );

    ## test for hyphens: every dash character (category Pd) maps to '-'
    if ( $field[2] eq "Pd" ) {
        print_line( $field[0], '002D', '-', $field[1] );
    }

    ## skip if it is not a letter
    ## NOTE(review): per UAX #44, column 4 of UnicodeData.txt is the
    ## bidirectional class ("L" = left-to-right), while the general
    ## category is column 2 -- behavior kept as is, confirm the intent.
    next if ( $field[4] ne "L" );

    ## skip if in the description there is no LATIN
    next if ( $field[1] !~ /LATIN/ );

    ## we try to figure out what type of letter it is.
    ## after the LETTER keyword, skip over modifier words such as
    ##    SHARP DOTLESS LONG OPEN AFRICAN TURNED REVERSED SCRIPT CAPITAL
    ##    BARRED SQUAT SIDEWAYS
    ## (the exact list lives in skip_magic_words).
    ## if the next sequence is one or two letters long, it is included.
    ## these chars are not included: THORN, ETH, KRA, ENG, TONE, SCHWA
    ##    ESH, EZH, TWO, INVERTED, WYNN, DENTAL, LATERAL
    ## after the LIGATURE keyword, skip over LONG; keep FFI, FFL and two
    ##    char sequences
    my $ltr  = "";
    my $rest = "";
    if ( $field[1] =~ /LIGATURE ([A-Z]+)\s*(.*)$/ ) {
        $ltr  = $1;
        $rest = $2;
        if ( $ltr =~ /LONG/ ) {
            ## drop LONG: keep only the first word of what follows
            $ltr = $rest;
            $ltr =~ s/[^A-Z].*$//;
        }
        if ( $rest =~ /^([A-Z])\s+([A-Z])$/ ) {
            ## two single letters, e.g. "S T" becomes "ST"
            $ltr = $1 . $2;
        }
    }
    if ( $field[1] =~ /LETTER ([A-Z]+)\s*(.*)$/ ) {
        ( $ltr, $rest ) = skip_magic_words( $1, $2 );
        if ( $rest =~ /LETTER ([A-Z]+)\s*(.*)$/ ) {
            ## a second LETTER keyword: append its letter to the first
            my $ltr2;
            ( $ltr2, $rest ) = skip_magic_words( $1, $2 );
            $ltr .= $ltr2;
        }
    }

    ## only FFI, FFL and one or two character results make it to the table
    next if ( $ltr !~ /^(FFI|FFL|..?)$/ );

    ## get the lower case equivalent: column 13 holds it; when that
    ## column is empty the character itself is used.
    print_line( $field[0], $field[13] eq "" ? $field[0] : $field[13],
        $ltr, $field[1] );
}

print "); return \\%hash; }\n";
close( $ucd );
exit 0;

########################################################################
#
# sub skip_magic_words( $first, $rest)
#
# skip magic words in $first.
# Return the remaining ($first,$rest) pair
#
#   $first - the word taken from the character name
#   $rest  - the part of the name that follows $first
#
# While $first contains one of the magic (modifier) words, it is
# replaced by the next upper-case word found in $rest, and $rest by
# whatever follows that word.  When $rest has no word left, both
# values become empty strings.
#
########################################################################
sub skip_magic_words {
    my ( $first, $rest ) = @_;

    # STRET?CHED accepts the Unicode spelling STRETCHED (as in
    # "LATIN LETTER STRETCHED C") as well as the historical STRECHED.
    while ( $first =~ /SHARP|DOTLESS|LONG|OPEN|AFRICAN|TURNED|REVERSED|SCRIPT|CAPITAL|BARRED|SQUAT|STRET?CHED|SIDEWAYS|CLOSED|SMALL/ ) {
        if ( $rest =~ /([A-Z]+)\s*(.*)$/ ) {
            $first = $1;
            $rest  = $2;
        }
        else {
            # nothing left to promote: give up with empty values
            $first = $rest = "";
        }
    }
    return ( $first, $rest );
}

########################################################################
#
# sub print_line( $hex_char, $hex_lower_case, $latin, $comment )
#
# print out one line for the conversion table.
#
#   $hex_char       - code point of the character, as a hex string
#   $hex_lower_case - code point of its lower-case form, as a hex string
#   $latin          - latin equivalent; printed lower-cased
#   $comment        - text appended, after the hex code, as a comment
#
# The line has the form
#   'char' => [ 'lower', 'latin'], #HEX comment
#
########################################################################
sub print_line {
    my ( $chr, $lwc, $latin, $comment ) = @_;
    print "'", uchr( hex( "0x" . $chr ) ),
        "' => [ '", uchr( hex( "0x" . $lwc ) ),
        "', '", lc($latin),
        "'], #", $chr, " ", $comment, "\n";
}