8sa1-gcc/libjava/gnu/gcj/convert/Output_UTF8.java

122 lines
3.2 KiB
Java
Raw Normal View History

/* Copyright (C) 1999, 2000 Free Software Foundation

This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.convert;
/**
* Convert Unicode to UTF8.
* @author Per Bothner <bothner@cygnus.com>
* @date Match 1999.
*/
1999-04-07 10:42:40 -04:00
public class Output_UTF8 extends UnicodeToBytes
{
public String getName() { return "UTF8"; }
/** True if a surrogate pair should be emitted as a single UTF8 sequence.
* Otherwise, a surrogate pair is treated as two separate characters.
* Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
public boolean standardUTF8 = true;
1999-04-07 10:42:40 -04:00
// Saves the previous char if it was a high-surrogate.
char hi_part;
// Value of incomplete character.
1999-04-07 10:42:40 -04:00
int value;
// Number of continuation bytes still to emit.
int bytes_todo;
public int write (char[] inbuffer, int inpos, int inlength)
{
int start_pos = inpos;
int avail = buf.length - count;
for (;;)
{
if (avail == 0 || (inlength == 0 && bytes_todo == 0))
1999-04-07 10:42:40 -04:00
break;
// The algorithm is made more complicated because we want to write
1999-04-07 10:42:40 -04:00
// at least one byte in the output buffer, if there is room for
// that byte, and at least one input character is available.
// This makes the code more robust, since client code will
// always "make progress", even in the complicated cases,
// where the output buffer only has room for only *part* of a
// multi-byte sequence, or the input char buffer only has half
// of a surrogate pair (when standardUTF8 is set), or both.
// Handle continuation characters we did not have room for before.
if (bytes_todo > 0)
{
do
{
bytes_todo--;
buf[count++] = (byte)
(((value >> (bytes_todo * 6)) & 0x3F) | 0x80);
1999-04-07 10:42:40 -04:00
avail--;
}
while (bytes_todo > 0 && avail > 0);
continue;
}
1999-04-07 10:42:40 -04:00
char ch = inbuffer[inpos++];
inlength--;
if ((hi_part != 0 && (ch <= 0xDBFF || ch > 0xDFFF))
|| (hi_part == 0 && ch >= 0xDC00 && ch <= 0xDFFF))
{
// If the previous character was a high surrogate, and we
// don't now have a low surrogate, we print the high
// surrogate as an isolated character. If this character
// is a low surrogate and we didn't previously see a high
// surrogate, we do the same thing.
--inpos;
++inlength;
buf[count++] = (byte) (0xE0 | (hi_part >> 12));
value = hi_part;
hi_part = 0;
avail--;
bytes_todo = 2;
}
else if (ch < 128 && (ch != 0 || standardUTF8))
1999-04-07 10:42:40 -04:00
{
avail--;
buf[count++] = (byte) ch;
}
else if (ch <= 0x07FF)
{
buf[count++] = (byte) (0xC0 | (ch >> 6));
avail--;
value = ch;
bytes_todo = 1;
1999-04-07 10:42:40 -04:00
}
else if (ch >= 0xD800 && ch <= 0xDFFF && standardUTF8)
{
if (ch <= 0xDBFF) // High surrogates
{
// Just save the high surrogate until the next
// character comes along.
1999-04-07 10:42:40 -04:00
hi_part = ch;
}
else // Low surrogates
{
value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
buf[count++] = (byte) (0xF0 | (value >> 18));
1999-04-07 10:42:40 -04:00
bytes_todo = 3;
hi_part = 0;
1999-04-07 10:42:40 -04:00
}
}
else
{
buf[count++] = (byte) (0xE0 | (ch >> 12));
value = ch;
avail--;
bytes_todo = 2;
}
}
return inpos - start_pos;
}
}