2000-12-13 17:47:13 -05:00
|
|
|
#! /usr/bin/perl
|
|
|
|
|
2001-12-28 23:31:10 -05:00
|
|
|
# Copyright (C) 2000, 2001 Free Software Foundation
|
2000-12-13 17:47:13 -05:00
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation; either version 2, or (at your option)
|
|
|
|
# any later version.
|
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program; if not, write to the Free Software
|
|
|
|
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
|
|
|
# 02111-1307, USA.
|
|
|
|
|
|
|
|
# gen-table.pl - Generate tables for gcj from Unicode data.
|
|
|
|
# Usage: perl gen-table.pl DATA-FILE
|
2001-12-28 17:27:29 -05:00
|
|
|
#
|
2001-12-28 23:31:10 -05:00
|
|
|
# You can find the Unicode data file here:
|
|
|
|
# ftp://www.unicode.org/Public/3.0-Update1/UnicodeData-3.0.1.txt
|
|
|
|
# Please update this URL when this program is used with a more
|
|
|
|
# recent version of the table. Note that this table cannot be
|
|
|
|
# distributed with gcc.
|
|
|
|
# This program should not be re-run indiscriminately. Care must be
|
|
|
|
# taken that what it generates is in sync with the Java specification.
|
2000-12-13 17:47:13 -05:00
|
|
|
|
|
|
|
# Names of fields in Unicode data table.
|
|
|
|
$CODE = 0;
|
|
|
|
$NAME = 1;
|
|
|
|
$CATEGORY = 2;
|
|
|
|
$COMBINING_CLASSES = 3;
|
|
|
|
$BIDI_CATEGORY = 4;
|
|
|
|
$DECOMPOSITION = 5;
|
|
|
|
$DECIMAL_VALUE = 6;
|
|
|
|
$DIGIT_VALUE = 7;
|
|
|
|
$NUMERIC_VALUE = 8;
|
|
|
|
$MIRRORED = 9;
|
|
|
|
$OLD_NAME = 10;
|
|
|
|
$COMMENT = 11;
|
|
|
|
$UPPER = 12;
|
|
|
|
$LOWER = 13;
|
|
|
|
$TITLE = 14;
|
|
|
|
|
|
|
|
# Start of special-cased gaps in Unicode data table.
|
|
|
|
%gaps = (
|
|
|
|
0x4e00 => "CJK",
|
|
|
|
0xac00 => "Hangul",
|
|
|
|
0xd800 => "Unassigned High Surrogate",
|
|
|
|
0xdb80 => "Private Use High Surrogate",
|
|
|
|
0xdc00 => "Low Surrogate",
|
|
|
|
0xe000 => "Private Use"
|
|
|
|
);
|
|
|
|
|
|
|
|
# This lists control characters which are also considered whitespace.
|
|
|
|
# This is a somewhat odd list, taken from the JCL definition of
|
|
|
|
# Character.isIdentifierIgnorable.
|
|
|
|
%whitespace_controls =
|
|
|
|
(
|
|
|
|
0x0009 => 1,
|
|
|
|
0x000a => 1,
|
|
|
|
0x000b => 1,
|
|
|
|
0x000c => 1,
|
|
|
|
0x000d => 1,
|
|
|
|
0x001c => 1,
|
|
|
|
0x001d => 1,
|
|
|
|
0x001e => 1,
|
|
|
|
0x001f => 1
|
|
|
|
);
|
|
|
|
|
|
|
|
open (INPUT, "< $ARGV[0]") || exit 1;
|
|
|
|
|
|
|
|
$last_code = -1;
|
|
|
|
while (<INPUT>)
|
|
|
|
{
|
|
|
|
chop;
|
|
|
|
@fields = split (';', $_, 30);
|
|
|
|
if ($#fields != 14)
|
|
|
|
{
|
|
|
|
print STDERR "Entry for $fields[$CODE] has wrong number of fields\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
$code = hex ($fields[$CODE]);
|
2001-12-28 23:31:10 -05:00
|
|
|
last if $code > 0xffff;
|
2000-12-13 17:47:13 -05:00
|
|
|
if ($code > $last_code + 1)
|
|
|
|
{
|
|
|
|
# Found a gap.
|
|
|
|
if (defined $gaps{$code})
|
|
|
|
{
|
|
|
|
# Fill the gap with the last character read.
|
|
|
|
@gfields = @fields;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
# The gap represents undefined characters. Only the type
|
|
|
|
# matters.
|
|
|
|
@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
|
|
|
|
'', '', '', '');
|
|
|
|
}
|
|
|
|
for (++$last_code; $last_code < $code; ++$last_code)
|
|
|
|
{
|
|
|
|
$gfields{$CODE} = sprintf ("%04x", $last_code);
|
|
|
|
&process_one ($last_code, @gfields);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
&process_one ($code, @fields);
|
|
|
|
$last_code = $code;
|
|
|
|
}
|
|
|
|
|
|
|
|
close (INPUT);
|
|
|
|
|
|
|
|
@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
|
|
|
|
'', '', '', '');
|
|
|
|
for (++$last_code; $last_code < 0x10000; ++$last_code)
|
|
|
|
{
|
|
|
|
$gfields{$CODE} = sprintf ("%04x", $last_code);
|
|
|
|
&process_one ($last_code, @gfields);
|
|
|
|
}
|
|
|
|
--$last_code; # Want last to be 0xFFFF.
|
|
|
|
|
|
|
|
&print_tables ($last_code);
|
|
|
|
|
|
|
|
exit 0;
|
|
|
|
|
|
|
|
# Process a single character.
|
|
|
|
sub process_one
|
|
|
|
{
|
|
|
|
my ($code, @fields) = @_;
|
|
|
|
|
|
|
|
my $value = '';
|
|
|
|
my $type = $fields[$CATEGORY];
|
|
|
|
|
|
|
|
# See if the character is a valid identifier start.
|
|
|
|
if ($type =~ /L./ # Letter
|
|
|
|
|| $type eq 'Pc' # Connecting punctuation
|
|
|
|
|| $type eq 'Sc') # Currency symbol
|
|
|
|
{
|
|
|
|
$value = 'LETTER_START';
|
|
|
|
}
|
|
|
|
|
|
|
|
# See if the character is a valid identifier member.
|
|
|
|
if ($type =~ /L./ # Letter
|
|
|
|
|| $type eq 'Pc' # Connecting punctuation
|
|
|
|
|| $type eq 'Sc' # Currency symbol
|
|
|
|
|| $type =~ /N[dl]/ # Number: decimal or letter
|
|
|
|
|| $type =~ /M[nc]/ # Mark: non-spacing or combining
|
|
|
|
|| ($type eq 'Cc' # Certain controls
|
|
|
|
&& ! defined $whitespace_controls{$code})
|
|
|
|
|| ($code >= 0x200c # Join controls
|
|
|
|
&& $code <= 0x200f)
|
|
|
|
|| ($code >= 0x202a # Bidi controls -- note that there
|
|
|
|
# is a typo in the JCL where these are
|
|
|
|
# concerned.
|
|
|
|
&& $code <= 0x202e)
|
|
|
|
|| ($code >= 0x206a # Format controls
|
|
|
|
&& $code <= 0x206f)
|
|
|
|
|| $code == 0xfeff) # ZWNBSP
|
|
|
|
{
|
|
|
|
if ($value eq '')
|
|
|
|
{
|
|
|
|
$value = 'LETTER_PART';
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
$value = 'LETTER_PART | ' . $value;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if ($value eq '')
|
|
|
|
{
|
|
|
|
$value = '0';
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
$value = '(' . $value . ')';
|
|
|
|
}
|
|
|
|
|
|
|
|
$map[$code] = $value;
|
|
|
|
}
|
|
|
|
|
|
|
|
sub print_tables
|
|
|
|
{
|
|
|
|
my ($last) = @_;
|
|
|
|
|
|
|
|
local ($bytes_out) = 0;
|
|
|
|
|
|
|
|
open (OUT, "> chartables.h");
|
|
|
|
|
|
|
|
print OUT "/* This file is automatically generated. DO NOT EDIT!\n";
|
|
|
|
print OUT " Instead, edit gen-table.pl and re-run. */\n\n";
|
|
|
|
|
2001-05-25 21:31:47 -04:00
|
|
|
print OUT "#ifndef GCC_CHARTABLES_H\n";
|
|
|
|
print OUT "#define GCC_CHARTABLES_H\n\n";
|
2000-12-13 17:47:13 -05:00
|
|
|
|
|
|
|
print OUT "#define LETTER_START 1\n";
|
|
|
|
print OUT "#define LETTER_PART 2\n\n";
|
|
|
|
|
|
|
|
for ($count = 0; $count <= $last; $count += 256)
|
|
|
|
{
|
2001-12-28 17:27:29 -05:00
|
|
|
$row[$count / 256] = &print_row ($count, '(char *) ', 'const char', 1,
|
2000-12-13 17:47:13 -05:00
|
|
|
'page');
|
|
|
|
}
|
|
|
|
|
2001-12-28 17:27:29 -05:00
|
|
|
print OUT "static const char *const type_table[256] = {\n";
|
2000-12-13 17:47:13 -05:00
|
|
|
for ($count = 0; $count <= $last; $count += 256)
|
|
|
|
{
|
|
|
|
print OUT ",\n" if $count > 0;
|
|
|
|
print OUT " ", $row[$count / 256];
|
|
|
|
$bytes_out += 4;
|
|
|
|
}
|
|
|
|
print OUT "\n};\n\n";
|
|
|
|
|
2001-05-25 21:31:47 -04:00
|
|
|
print OUT "#endif /* ! GCC_CHARTABLES_H */\n";
|
2000-12-13 17:47:13 -05:00
|
|
|
|
|
|
|
close (OUT);
|
|
|
|
|
|
|
|
printf "Generated %d bytes\n", $bytes_out;
|
|
|
|
}
|
|
|
|
|
|
|
|
# Print a single "row" of a two-level table.
|
|
|
|
sub print_row
|
|
|
|
{
|
|
|
|
my ($start, $def_pfx, $typname, $typsize, $name) = @_;
|
|
|
|
|
|
|
|
my ($i);
|
|
|
|
my (@values);
|
|
|
|
my ($flag) = 1;
|
|
|
|
my ($off);
|
|
|
|
for ($off = 0; $off < 256; ++$off)
|
|
|
|
{
|
|
|
|
$values[$off] = $map[$off + $start];
|
|
|
|
if ($values[$off] ne $values[0])
|
|
|
|
{
|
|
|
|
$flag = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if ($flag)
|
|
|
|
{
|
|
|
|
return $def_pfx . $values[0];
|
|
|
|
}
|
|
|
|
|
|
|
|
printf OUT "static %s %s%d[256] = {\n ", $typname, $name, $start / 256;
|
|
|
|
my ($column) = 2;
|
|
|
|
for ($i = $start; $i < $start + 256; ++$i)
|
|
|
|
{
|
|
|
|
print OUT ", "
|
|
|
|
if $i > $start;
|
|
|
|
my ($text) = $values[$i - $start];
|
|
|
|
if (length ($text) + $column + 2 > 78)
|
|
|
|
{
|
|
|
|
print OUT "\n ";
|
|
|
|
$column = 2;
|
|
|
|
}
|
|
|
|
print OUT $text;
|
|
|
|
$column += length ($text) + 2;
|
|
|
|
}
|
|
|
|
print OUT "\n};\n\n";
|
|
|
|
|
|
|
|
$bytes_out += 256 * $typsize;
|
|
|
|
|
|
|
|
return sprintf "%s%d", $name, $start / 256;
|
|
|
|
}
|