diff --git a/gcc/ChangeLog b/gcc/ChangeLog index b2867b12c6d..b5f465adfb9 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,39 @@ +Mon Jul 20 16:16:38 1998 Dave Brolley + + * configure.in (enable_c_mbchar): New configure option. + (extra_cpp_objs): Always available now. + + * cexp.y (mbchar.h): #include it. + (yylex): Handle Multibyte characters in character literals. + + * cccp.c (mbchar.h): #include it. + (main): Set character set based on LANG environment variable. + (rescan): Handle multibyte characters in comments. + (skip_if_group): See above. + (validate_else): See above. + (skip_to_end_of_comment): See above. + (macarg1): See above. + (discard_comments): See above. + (rescan): Handle multibyte characters in string and character literals. + (collect_expansion): See above. + (skip_quoted_string): See above. + (macroexpand): See above. + (macarg1): See above. + (discard_comments): See above. + (change_newlines): See above. + + * c-lex.c (mbchar.h): #include it. + (GET_ENVIRONMENT): New macro. + (init_lex): Set character set based on LANG environment variable. + (yylex): Handle multibyte characters in character literals. + (yylex): Handle multibyte characters in string literals. + + * Makefile.in (mbchar.o): New target. + (cccp$(exeext)): @extra_cpp_objs@ is always available. + (cppmain$(exeext)): @extra_cpp_objs@ is always available. + + * mbchar.[ch]: New files for multibyte character handling. + Mon Jul 20 01:11:11 1998 David S. 
Miller * jump.c (jump_optimize): When simplifying noop moves and diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 39784e17b17..ea0a0b52e9a 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -641,7 +641,8 @@ OBJS = toplev.o version.o tree.o print-tree.o stor-layout.o fold-const.o \ regclass.o local-alloc.o global.o reload.o reload1.o caller-save.o gcse.o \ insn-peep.o reorg.o $(SCHED_PREFIX)sched.o final.o recog.o reg-stack.o \ insn-opinit.o insn-recog.o insn-extract.o insn-output.o insn-emit.o \ - profile.o insn-attrtab.o $(out_object_file) getpwd.o $(EXTRA_OBJS) convert.o + profile.o insn-attrtab.o $(out_object_file) getpwd.o $(EXTRA_OBJS) convert.o \ + mbchar.o # GEN files are listed separately, so they can be built before doing parallel # makes for cc1 or cc1plus. Otherwise sequent parallel make attempts to load @@ -1275,13 +1276,14 @@ c-lang.o : c-lang.c $(CONFIG_H) system.h $(TREE_H) c-tree.h c-lex.h toplev.h \ output.h c-lex.o : c-lex.c $(CONFIG_H) system.h $(TREE_H) $(RTL_H) c-lex.h c-tree.h \ $(srcdir)/c-parse.h input.h flags.h $(srcdir)/c-gperf.h c-pragma.h \ - toplev.h output.h + toplev.h output.h mbchar.h c-aux-info.o : c-aux-info.c $(CONFIG_H) system.h $(TREE_H) c-tree.h flags.h c-convert.o : c-convert.c $(CONFIG_H) system.h $(TREE_H) flags.h toplev.h c-pragma.o: c-pragma.c $(CONFIG_H) system.h $(RTL_H) $(TREE_H) except.h \ function.h defaults.h c-pragma.h toplev.h c-iterate.o: c-iterate.c $(CONFIG_H) system.h $(TREE_H) $(RTL_H) c-tree.h \ flags.h toplev.h $(EXPR_H) +mbchar.o: $(CONFIG_H) system.h gansidecl.h mbchar.h collect2$(exeext): collect2.o tlink.o hash.o cplus-dem.o underscore.o \ version.o choose-temp.o mkstemp.o $(LIBDEPS) @@ -1816,15 +1818,16 @@ $(HOST_PREFIX_1): cpp$(exeext): $(CCCP)$(exeext) -rm -f cpp$(exeext) $(LN) $(CCCP)$(exeext) cpp$(exeext) -cccp$(exeext): cccp.o cexp.o version.o prefix.o $(LIBDEPS) - $(CC) $(ALL_CFLAGS) $(LDFLAGS) -o $@ cccp.o cexp.o prefix.o \ - version.o $(LIBS) +cccp$(exeext): cccp.o cexp.o version.o 
prefix.o mbchar.o @extra_cpp_objs@ $(LIBDEPS) + $(CC) $(ALL_CFLAGS) $(LDFLAGS) -o $@ cccp.o cexp.o prefix.o mbchar.o \ + version.o @extra_cpp_objs@ $(LIBS) cexp.o: $(srcdir)/cexp.c $(CONFIG_H) system.h gansidecl.h $(CC) $(ALL_CFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) -c $(srcdir)/cexp.c $(srcdir)/cexp.c: $(srcdir)/cexp.y cd $(srcdir); $(BISON) -o cexp.c cexp.y -cccp.o: cccp.c $(CONFIG_H) pcp.h version.c config.status system.h gansidecl.h +cccp.o: cccp.c $(CONFIG_H) pcp.h version.c config.status system.h gansidecl.h \ + mbchar.h $(CC) $(ALL_CFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ -DGCC_INCLUDE_DIR=\"$(libsubdir)/include\" \ -DGPLUSPLUS_INCLUDE_DIR=\"$(gxx_include_dir)\" \ @@ -1835,8 +1838,9 @@ cccp.o: cccp.c $(CONFIG_H) pcp.h version.c config.status system.h gansidecl.h -c `echo $(srcdir)/cccp.c | sed 's,^\./,,'` cppmain$(exeext): cppmain.o cpplib.o cpphash.o cppalloc.o cpperror.o cppexp.o \ - prefix.o version.o $(LIBDEPS) + prefix.o version.o mbchar.o @extra_cpp_objs@ $(LIBDEPS) $(CC) $(ALL_CFLAGS) $(LDFLAGS) -o $@ cppmain.o cpplib.o cpphash.o \ + mbchar.o @extra_cpp_objs@ \ cppalloc.o cpperror.o cppexp.o prefix.o version.o $(LIBS) cppmain.o: cppmain.c $(CONFIG_H) cpplib.h system.h gansidecl.h diff --git a/gcc/c-lex.c b/gcc/c-lex.c index f82ad761d31..f4f4a12d1e8 100644 --- a/gcc/c-lex.c +++ b/gcc/c-lex.c @@ -33,16 +33,14 @@ Boston, MA 02111-1307, USA. */ #include "c-pragma.h" #include "toplev.h" -/* MULTIBYTE_CHARS support only works for native compilers. - ??? Ideally what we want is to model widechar support after - the current floating point support. */ -#ifdef CROSS_COMPILE -#undef MULTIBYTE_CHARS -#endif - #ifdef MULTIBYTE_CHARS +#include "mbchar.h" #include + +#ifndef GET_ENVIRONMENT +#define GET_ENVIRONMENT(ENV_VALUE,ENV_NAME) ((ENV_VALUE) = getenv (ENV_NAME)) #endif +#endif /* MULTIBYTE_CHARS */ #if USE_CPPLIB #include "cpplib.h" @@ -232,6 +230,7 @@ init_lex () #ifdef MULTIBYTE_CHARS /* Change to the native locale for multibyte conversions. 
*/ setlocale (LC_CTYPE, ""); + GET_ENVIRONMENT (literal_codeset, "LANG"); #endif maxtoken = 40; @@ -1795,30 +1794,27 @@ yylex () { register int result = 0; register int num_chars = 0; + int chars_seen = 0; unsigned width = TYPE_PRECISION (char_type_node); int max_chars; - - if (wide_flag) - { - width = WCHAR_TYPE_SIZE; #ifdef MULTIBYTE_CHARS - max_chars = MB_CUR_MAX; -#else - max_chars = 1; + int longest_char = local_mb_cur_max (); + (void) local_mbtowc (NULL_PTR, NULL_PTR, 0); #endif - } - else - max_chars = TYPE_PRECISION (integer_type_node) / width; + + max_chars = TYPE_PRECISION (integer_type_node) / width; + if (wide_flag) + width = WCHAR_TYPE_SIZE; while (1) { tryagain: - c = GETC(); if (c == '\'' || c == EOF) break; + ++chars_seen; if (c == '\\') { int ignore = 0; @@ -1839,18 +1835,76 @@ yylex () pedwarn ("ANSI C forbids newline in character constant"); lineno++; } -#ifdef MAP_CHARACTER else - c = MAP_CHARACTER (c); + { +#ifdef MULTIBYTE_CHARS + wchar_t wc; + int i; + int char_len = -1; + for (i = 1; i <= longest_char; ++i) + { + if (i > maxtoken - 4) + extend_token_buffer (token_buffer); + + token_buffer[i] = c; + char_len = local_mbtowc (& wc, + token_buffer + 1, + i); + if (char_len != -1) + break; + c = GETC (); + } + if (char_len > 1) + { + /* mbtowc sometimes needs an extra char before accepting */ + if (char_len < i) + UNGETC (c); + if (! wide_flag) + { + /* Merge character into result; ignore excess chars. */ + for (i = 1; i <= char_len; ++i) + { + if (i > max_chars) + break; + if (width < HOST_BITS_PER_INT) + result = (result << width) + | (token_buffer[i] + & ((1 << width) - 1)); + else + result = token_buffer[i]; + } + num_chars += char_len; + goto tryagain; + } + c = wc; + } + else + { + if (char_len == -1) + warning ("Ignoring invalid multibyte character"); + if (wide_flag) + c = wc; +#ifdef MAP_CHARACTER + else + c = MAP_CHARACTER (c); #endif + } +#else /* ! MULTIBYTE_CHARS */ +#ifdef MAP_CHARACTER + c = MAP_CHARACTER (c); +#endif +#endif /* ! 
MULTIBYTE_CHARS */ + } - num_chars++; - if (num_chars > maxtoken - 4) - extend_token_buffer (token_buffer); - - token_buffer[num_chars] = c; + if (wide_flag) + { + if (chars_seen == 1) /* only keep the first one */ + result = c; + goto tryagain; + } /* Merge character into result; ignore excess chars. */ + num_chars += (width / TYPE_PRECISION (char_type_node)); if (num_chars < max_chars + 1) { if (width < HOST_BITS_PER_INT) @@ -1860,19 +1914,16 @@ yylex () } } - token_buffer[num_chars + 1] = '\''; - token_buffer[num_chars + 2] = 0; - if (c != '\'') error ("malformatted character constant"); - else if (num_chars == 0) + else if (chars_seen == 0) error ("empty character constant"); else if (num_chars > max_chars) { num_chars = max_chars; error ("character constant too long"); } - else if (num_chars != 1 && ! flag_traditional && warn_multichar) + else if (chars_seen != 1 && ! flag_traditional && warn_multichar) warning ("multi-character character constant"); /* If char type is signed, sign-extend the constant. */ @@ -1897,22 +1948,6 @@ yylex () } else { -#ifdef MULTIBYTE_CHARS - /* Set the initial shift state and convert the next sequence. */ - result = 0; - /* In all locales L'\0' is zero and mbtowc will return zero, - so don't use it. */ - if (num_chars > 1 - || (num_chars == 1 && token_buffer[1] != '\0')) - { - wchar_t wc; - (void) mbtowc (NULL_PTR, NULL_PTR, 0); - if (mbtowc (& wc, token_buffer + 1, num_chars) == num_chars) - result = wc; - else - warning ("Ignoring invalid multibyte character"); - } -#endif yylval.ttype = build_int_2 (result, 0); TREE_TYPE (yylval.ttype) = wchar_type_node; } @@ -1924,7 +1959,13 @@ yylex () case '"': string_constant: { - c = GETC(); + unsigned width = wide_flag ? 
WCHAR_TYPE_SIZE + : TYPE_PRECISION (char_type_node); +#ifdef MULTIBYTE_CHARS + int longest_char = local_mb_cur_max (); + (void) local_mbtowc (NULL_PTR, NULL_PTR, 0); +#endif + c = GETC (); p = token_buffer + 1; while (c != '"' && c >= 0) @@ -1935,9 +1976,8 @@ yylex () c = readescape (&ignore); if (ignore) goto skipnewline; - if (!wide_flag - && TYPE_PRECISION (char_type_node) < HOST_BITS_PER_INT - && c >= (1 << TYPE_PRECISION (char_type_node))) + if (width < HOST_BITS_PER_INT + && (unsigned) c >= (1 << width)) pedwarn ("escape sequence out of range for character"); } else if (c == '\n') @@ -1946,15 +1986,94 @@ yylex () pedwarn ("ANSI C forbids newline in string constant"); lineno++; } + else + { +#ifdef MULTIBYTE_CHARS + wchar_t wc; + int i; + int char_len = -1; + for (i = 0; i < longest_char; ++i) + { + if (p + i == token_buffer + maxtoken) + p = extend_token_buffer (p); + p[i] = c; - if (p == token_buffer + maxtoken) - p = extend_token_buffer (p); - *p++ = c; + char_len = local_mbtowc (& wc, p, i + 1); + if (char_len != -1) + break; + c = GETC (); + } + if (char_len == -1) + warning ("Ignoring invalid multibyte character"); + else + { + /* mbtowc sometimes needs an extra char before accepting */ + if (char_len <= i) + UNGETC (c); + if (wide_flag) + { + *(wchar_t *)p = wc; + p += sizeof (wc); + } + else + p += (i + 1); + c = GETC (); + continue; + } +#endif /* MULTIBYTE_CHARS */ + } + + /* Add this single character into the buffer either as a wchar_t + or as a single byte. 
*/ + if (wide_flag) + { + unsigned width = TYPE_PRECISION (char_type_node); + unsigned bytemask = (1 << width) - 1; + int byte; + + if (p + WCHAR_BYTES >= token_buffer + maxtoken) + p = extend_token_buffer (p); + + for (byte = 0; byte < WCHAR_BYTES; ++byte) + { + int value; + if (byte >= sizeof (c)) + value = 0; + else + value = (c >> (byte * width)) & bytemask; + if (BYTES_BIG_ENDIAN) + p[WCHAR_BYTES - byte - 1] = value; + else + p[byte] = value; + } + p += WCHAR_BYTES; + } + else + { + if (p == token_buffer + maxtoken) + p = extend_token_buffer (p); + *p++ = c; + } skipnewline: - c = GETC(); + c = GETC (); + } + + /* Terminate the string value, either with a single byte zero + or with a wide zero. */ + if (wide_flag) + { + if (p + WCHAR_BYTES >= token_buffer + maxtoken) + p = extend_token_buffer (p); + bzero (p, WCHAR_BYTES); + p += WCHAR_BYTES; + } + else + { + if (p == token_buffer + maxtoken) + p = extend_token_buffer (p); + *p++ = 0; } - *p = 0; if (c < 0) error ("Unterminated string constant"); @@ -1964,52 +2083,27 @@ yylex () if (wide_flag) { - /* If this is a L"..." wide-string, convert the multibyte string - to a wide character string. */ - char *widep = (char *) alloca ((p - token_buffer) * WCHAR_BYTES); - int len; - -#ifdef MULTIBYTE_CHARS - len = mbstowcs ((wchar_t *) widep, token_buffer + 1, p - token_buffer); - if (len < 0 || len >= (p - token_buffer)) - { - warning ("Ignoring invalid multibyte string"); - len = 0; - } - bzero (widep + (len * WCHAR_BYTES), WCHAR_BYTES); -#else - { - char *wp, *cp; - - wp = widep + (BYTES_BIG_ENDIAN ? 
WCHAR_BYTES - 1 : 0); - bzero (widep, (p - token_buffer) * WCHAR_BYTES); - for (cp = token_buffer + 1; cp < p; cp++) - *wp = *cp, wp += WCHAR_BYTES; - len = p - token_buffer - 1; - } -#endif - yylval.ttype = build_string ((len + 1) * WCHAR_BYTES, widep); + yylval.ttype = build_string (p - (token_buffer + 1), + token_buffer + 1); TREE_TYPE (yylval.ttype) = wchar_array_type_node; value = STRING; } else if (objc_flag) { /* Return an Objective-C @"..." constant string object. */ - yylval.ttype = build_objc_string (p - token_buffer, + yylval.ttype = build_objc_string (p - (token_buffer + 1), token_buffer + 1); TREE_TYPE (yylval.ttype) = char_array_type_node; value = OBJC_STRING; } else { - yylval.ttype = build_string (p - token_buffer, token_buffer + 1); + yylval.ttype = build_string (p - (token_buffer + 1), + token_buffer + 1); TREE_TYPE (yylval.ttype) = char_array_type_node; value = STRING; } - *p++ = '"'; - *p = 0; - break; } diff --git a/gcc/cccp.c b/gcc/cccp.c index 1bd7649ea7c..55b6e68b7ab 100644 --- a/gcc/cccp.c +++ b/gcc/cccp.c @@ -45,6 +45,11 @@ typedef unsigned char U_CHAR; #include "gansidecl.h" #include "pcp.h" +#ifdef MULTIBYTE_CHARS +#include "mbchar.h" +#include +#endif /* MULTIBYTE_CHARS */ + #ifndef GET_ENVIRONMENT #define GET_ENVIRONMENT(ENV_VALUE,ENV_NAME) ENV_VALUE = getenv (ENV_NAME) #endif @@ -1308,6 +1313,12 @@ main (argc, argv) bzero ((char *) pend_assertions, argc * sizeof (char *)); bzero ((char *) pend_includes, argc * sizeof (char *)); +#ifdef MULTIBYTE_CHARS + /* Change to the native locale for multibyte conversions. */ + setlocale (LC_CTYPE, ""); + GET_ENVIRONMENT (literal_codeset, "LANG"); +#endif + /* Process switches and find input file name. 
*/ for (i = 1; i < argc; i++) { @@ -2774,9 +2785,27 @@ do { ip = &instack[indepth]; \ bp += 2; else if (*bp == '/' && bp[1] == '*') { bp += 2; - while (!(*bp == '*' && bp[1] == '/')) - bp++; - bp += 2; + while (1) + { + if (*bp == '*') + { + if (bp[1] == '/') + { + bp += 2; + break; + } + } + else + { +#ifdef MULTIBYTE_CHARS + int length; + length = local_mblen (bp, limit - bp); + if (length > 1) + bp += (length - 1); +#endif + } + bp++; + } } /* There is no point in trying to deal with C++ // comments here, because if there is one, then this # must be part of the @@ -2937,6 +2966,24 @@ do { ip = &instack[indepth]; \ if (ibp[-1] == c) goto while2end; break; +#ifdef MULTIBYTE_CHARS + default: + { + int length; + --ibp; + length = local_mblen (ibp, limit - ibp); + if (length > 0) + { + --obp; + bcopy (ibp, obp, length); + obp += length; + ibp += length; + } + else + ++ibp; + } + break; +#endif } } while2end: @@ -2983,6 +3030,15 @@ do { ip = &instack[indepth]; \ *obp++ = '\n'; ++op->lineno; } + else + { +#ifdef MULTIBYTE_CHARS + int length; + length = local_mblen (ibp, limit - ibp); + if (length > 1) + ibp += (length - 1); +#endif + } } break; } @@ -3071,6 +3127,16 @@ do { ip = &instack[indepth]; \ goto limit_reached; } break; +#ifdef MULTIBYTE_CHARS + default: + { + int length; + length = local_mblen (ibp, limit - ibp); + if (length > 1) + ibp += (length - 1); + } + break; +#endif } } comment_end: @@ -3433,11 +3499,27 @@ randomchar: break; } } - if (*ibp == '\n') { + else if (*ibp == '\n') { /* Newline in a file. Count it. */ ++ip->lineno; ++op->lineno; } + else + { +#ifdef MULTIBYTE_CHARS + int length; + length = local_mblen (ibp, limit - ibp); + if (length > 1) + { + if (put_out_comments) + { + bcopy (ibp, obp, length - 1); + obp += length - 1; + } + ibp += (length - 1); + } +#endif + } if (put_out_comments) *obp++ = *ibp; } @@ -3448,9 +3530,32 @@ randomchar: } else if (! 
traditional) { *obp++ = ' '; } - for (ibp += 2; *ibp != '\n' || ibp[-1] == '\\'; ibp++) - if (put_out_comments) - *obp++ = *ibp; + for (ibp += 2; ; ibp++) + { + if (*ibp == '\n') + { + if (ibp[-1] != '\\') + break; + } + else + { +#ifdef MULTIBYTE_CHARS + int length; + length = local_mblen (ibp, limit - ibp); + if (length > 1) + { + if (put_out_comments) + { + bcopy (ibp, obp, length - 1); + obp += length - 1; + } + ibp += (length - 1); + } +#endif + } + if (put_out_comments) + *obp++ = *ibp; + } } else break; } @@ -6186,6 +6291,25 @@ collect_expansion (buf, end, nargs, arglist) } } +#ifdef MULTIBYTE_CHARS + /* Handle multibyte characters inside string and character literals. */ + if (expected_delimiter != '\0') + { + int length; + --p; + length = local_mblen (p, limit - p); + if (length > 1) + { + --exp_p; + bcopy (p, exp_p, length); + p += length; + exp_p += length; + continue; + } + ++p; + } +#endif + /* Handle the start of a symbol. */ if (is_idchar[c] && nargs > 0) { U_CHAR *id_beg = p - 1; @@ -7412,9 +7536,27 @@ skip_if_group (ip, any, op) bp += 2; else if (*bp == '/' && bp[1] == '*') { bp += 2; - while (!(*bp == '*' && bp[1] == '/')) - bp++; - bp += 2; + while (1) + { + if (*bp == '*') + { + if (bp[1] == '/') + { + bp += 2; + break; + } + } + else + { +#ifdef MULTIBYTE_CHARS + int length; + length = local_mblen (bp, endb - bp); + if (length > 1) + bp += (length - 1); +#endif + } + bp++; + } } /* There is no point in trying to deal with C++ // comments here, because if there is one, then this # must be part of the @@ -7458,6 +7600,15 @@ skip_if_group (ip, any, op) if (bp[1] == '/') break; } + else + { +#ifdef MULTIBYTE_CHARS + int length; + length = local_mblen (bp, endb - bp); + if (length > 1) + bp += (length - 1); +#endif + } } bp += 2; } else if (bp[1] == '/' && cplusplus_comments) { @@ -7469,6 +7620,15 @@ skip_if_group (ip, any, op) warning ("multiline `//' comment"); ip->lineno++; } + else + { +#ifdef MULTIBYTE_CHARS + int length; + length = local_mblen 
(bp, endb - bp); + if (length > 1) + bp += (length - 1); +#endif + } } } else break; @@ -7764,6 +7924,15 @@ validate_else (p, limit) break; } } + else + { +#ifdef MULTIBYTE_CHARS + int length; + length = local_mblen (p, limit - p); + if (length > 1) + p += (length - 1); +#endif + } } } else if (cplusplus_comments && p[1] == '/') @@ -7817,6 +7986,22 @@ skip_to_end_of_comment (ip, line_counter, nowarn) if (op) ++op->lineno; } + else + { +#ifdef MULTIBYTE_CHARS + int length; + length = local_mblen (bp, limit - bp); + if (length > 1) + { + if (op) + { + bcopy (bp, op->bufp, length - 1); + op->bufp += (length - 1); + } + bp += (length - 1); + } +#endif + } if (op) *op->bufp++ = *bp; } @@ -7854,6 +8039,23 @@ skip_to_end_of_comment (ip, line_counter, nowarn) return bp; } break; +#ifdef MULTIBYTE_CHARS + default: + { + int length; + bp--; + length = local_mblen (bp, limit - bp); + if (length <= 0) + length = 1; + if (op) + { + op->bufp--; + bcopy (bp, op->bufp, length); + op->bufp += length; + } + bp += length; + } +#endif } } @@ -7944,6 +8146,16 @@ skip_quoted_string (bp, limit, start_line, count_newlines, backslash_newlines_p, } } else if (c == match) break; +#ifdef MULTIBYTE_CHARS + { + int length; + --bp; + length = local_mblen (bp, limit - bp); + if (length <= 0) + length = 1; + bp += length; + } +#endif } return bp; } @@ -8381,9 +8593,23 @@ macroexpand (hp, op) else { if (c == '\\') escaped = 1; - if (in_string) { + else if (in_string) { if (c == in_string) in_string = 0; + else + { +#ifdef MULTIBYTE_CHARS + int length; + length = local_mblen (arg->raw + i, arglen - i); + if (length > 1) + { + bcopy (arg->raw + i, xbuf + totlen, length); + i += length - 1; + totlen += length; + continue; + } +#endif + } } else if (c == '\"' || c == '\'') in_string = c; } @@ -8717,6 +8943,15 @@ macarg1 (start, limit, macro, depthptr, newlines, comments, rest_args) break; } } + else + { +#ifdef MULTIBYTE_CHARS + int length; + length = local_mblen (bp, limit - bp); + if (length > 1) + 
bp += (length - 1); +#endif + } } } else if (bp[1] == '/' && cplusplus_comments) { *comments = 1; @@ -8728,6 +8963,15 @@ macarg1 (start, limit, macro, depthptr, newlines, comments, rest_args) if (warn_comments) warning ("multiline `//' comment"); } + else + { +#ifdef MULTIBYTE_CHARS + int length; + length = local_mblen (bp, limit - bp); + if (length > 1) + bp += (length - 1); +#endif + } } } break; @@ -8751,6 +8995,15 @@ macarg1 (start, limit, macro, depthptr, newlines, comments, rest_args) if (quotec == '\'') break; } + else + { +#ifdef MULTIBYTE_CHARS + int length; + length = local_mblen (bp, limit - bp); + if (length > 1) + bp += (length - 1); +#endif + } } } break; @@ -8828,8 +9081,23 @@ discard_comments (start, length, newlines) /* Comments are equivalent to spaces. */ obp[-1] = ' '; ibp++; - while (ibp < limit && (*ibp != '\n' || ibp[-1] == '\\')) - ibp++; + while (ibp < limit) + { + if (*ibp == '\n') + { + if (ibp[-1] != '\\') + break; + } + else + { +#ifdef MULTIBYTE_CHARS + int length = local_mblen (ibp, limit - ibp); + if (length > 1) + ibp += (length - 1); +#endif + } + ibp++; + } break; } if (ibp[0] != '*' || ibp + 1 >= limit) @@ -8849,6 +9117,14 @@ discard_comments (start, length, newlines) break; } } + else + { +#ifdef MULTIBYTE_CHARS + int length = local_mblen (ibp, limit - ibp); + if (length > 1) + ibp += (length - 1); +#endif + } } break; @@ -8863,9 +9139,12 @@ discard_comments (start, length, newlines) *obp++ = c = *ibp++; if (c == quotec) break; - if (c == '\n' && quotec == '\'') - break; - if (c == '\\') { + if (c == '\n') + { + if (quotec == '\'') + break; + } + else if (c == '\\') { if (ibp < limit && *ibp == '\n') { ibp++; obp--; @@ -8876,6 +9155,23 @@ discard_comments (start, length, newlines) *obp++ = *ibp++; } } + else + { +#ifdef MULTIBYTE_CHARS + int length; + ibp--; + length = local_mblen (ibp, limit - ibp); + if (length > 1) + { + obp--; + bcopy (ibp, obp, length); + ibp += length; + obp += length; + } + else + ibp++; +#endif + } } } 
break; @@ -8925,10 +9221,33 @@ change_newlines (start, length) int quotec = c; while (ibp < limit) { *obp++ = c = *ibp++; - if (c == quotec && ibp[-2] != '\\') - break; - if (c == '\n' && quotec == '\'') - break; + if (c == quotec) + { + if (ibp[-2] != '\\') + break; + } + else if (c == '\n') + { + if (quotec == '\'') + break; + } + else + { +#ifdef MULTIBYTE_CHARS + int length; + ibp--; + length = local_mblen (ibp, limit - ibp); + if (length > 1) + { + obp--; + bcopy (ibp, obp, length); + ibp += length; + obp += length; + } + else + ibp++; +#endif + } } } break; diff --git a/gcc/cexp.y b/gcc/cexp.y index 6280aedeb80..5d36329e7e9 100644 --- a/gcc/cexp.y +++ b/gcc/cexp.y @@ -39,12 +39,12 @@ Boston, MA 02111-1307, USA. #include "system.h" #include /* #define YYDEBUG 1 */ +#include "gansidecl.h" #ifdef MULTIBYTE_CHARS +#include "mbchar.h" #include -#endif - -#include "gansidecl.h" +#endif /* MULTIBYTE_CHARS */ typedef unsigned char U_CHAR; @@ -641,23 +641,18 @@ yylex () { register HOST_WIDE_INT result = 0; register int num_chars = 0; + int chars_seen = 0; unsigned width = MAX_CHAR_TYPE_SIZE; int max_chars; - char *token_buffer; - - if (wide_flag) - { - width = MAX_WCHAR_TYPE_SIZE; #ifdef MULTIBYTE_CHARS - max_chars = MB_CUR_MAX; -#else - max_chars = 1; + int longest_char = local_mb_cur_max (); + char *token_buffer = (char *) alloca (longest_char); + (void) local_mbtowc (NULL_PTR, NULL_PTR, 0); #endif - } - else - max_chars = MAX_LONG_TYPE_SIZE / width; - token_buffer = (char *) alloca (max_chars + 1); + max_chars = MAX_LONG_TYPE_SIZE / width; + if (wide_flag) + width = MAX_WCHAR_TYPE_SIZE; while (1) { @@ -666,44 +661,96 @@ yylex () if (c == '\'' || c == EOF) break; + ++chars_seen; if (c == '\\') { c = parse_escape (&lexptr, mask); } + else + { +#ifdef MULTIBYTE_CHARS + wchar_t wc; + int i; + int char_len = -1; + for (i = 1; i <= longest_char; ++i) + { + token_buffer[i - 1] = c; + char_len = local_mbtowc (& wc, token_buffer, i); + if (char_len != -1) + break; + c = 
*lexptr++; + } + if (char_len > 1) + { + /* mbtowc sometimes needs an extra char before accepting */ + if (char_len < i) + lexptr--; + if (! wide_flag) + { + /* Merge character into result; ignore excess chars. */ + for (i = 1; i <= char_len; ++i) + { + if (i > max_chars) + break; + if (width < HOST_BITS_PER_INT) + result = (result << width) + | (token_buffer[i - 1] + & ((1 << width) - 1)); + else + result = token_buffer[i - 1]; + } + num_chars += char_len; + continue; + } + } + else + { + if (char_len == -1) + warning ("Ignoring invalid multibyte character"); + } + if (wide_flag) + c = wc; +#endif /* ! MULTIBYTE_CHARS */ + } - num_chars++; + if (wide_flag) + { + if (chars_seen == 1) /* only keep the first one */ + result = c; + continue; + } /* Merge character into result; ignore excess chars. */ + num_chars++; if (num_chars <= max_chars) { - if (width < HOST_BITS_PER_WIDE_INT) - result = (result << width) | c; + if (width < HOST_BITS_PER_INT) + result = (result << width) | (c & ((1 << width) - 1)); else result = c; - token_buffer[num_chars - 1] = c; } } - token_buffer[num_chars] = 0; - if (c != '\'') error ("malformatted character constant"); - else if (num_chars == 0) + else if (chars_seen == 0) error ("empty character constant"); else if (num_chars > max_chars) { num_chars = max_chars; error ("character constant too long"); } - else if (num_chars != 1 && ! traditional) + else if (chars_seen != 1 && ! traditional) warning ("multi-character character constant"); /* If char type is signed, sign-extend the constant. */ if (! wide_flag) { int num_bits = num_chars * width; - - if (lookup ((U_CHAR *) "__CHAR_UNSIGNED__", + if (num_bits == 0) + /* We already got an error; avoid invalid shift. 
*/ + yylval.integer.value = 0; + else if (lookup ((U_CHAR *) "__CHAR_UNSIGNED__", sizeof ("__CHAR_UNSIGNED__") - 1, -1) || ((result >> (num_bits - 1)) & 1) == 0) yylval.integer.value @@ -716,22 +763,6 @@ yylex () } else { -#ifdef MULTIBYTE_CHARS - /* Set the initial shift state and convert the next sequence. */ - result = 0; - /* In all locales L'\0' is zero and mbtowc will return zero, - so don't use it. */ - if (num_chars > 1 - || (num_chars == 1 && token_buffer[0] != '\0')) - { - wchar_t wc; - (void) mbtowc (NULL_PTR, NULL_PTR, 0); - if (mbtowc (& wc, token_buffer, num_chars) == num_chars) - result = wc; - else - pedwarn ("Ignoring invalid multibyte character"); - } -#endif yylval.integer.value = result; } } diff --git a/gcc/configure.in b/gcc/configure.in index 6791547ec87..b4c1aca12d7 100644 --- a/gcc/configure.in +++ b/gcc/configure.in @@ -84,7 +84,7 @@ AC_DEFINE(ENABLE_CHECKING) # Enable use of cpplib for C. cpp_main=cccp AC_ARG_ENABLE(c-cpplib, -[ --enable-c-cpplib Use cpplib for C.], +[ --enable-c-cpplib Use cpplib for C and C++.], if [[[ x$enable_c_cpplib != xno ]]]; then extra_c_objs="${extra_c_objs} cpplib.o cppexp.o cpphash.o cpperror.o" extra_c_objs="${extra_c_objs} prefix.o" @@ -93,6 +93,13 @@ if [[[ x$enable_c_cpplib != xno ]]]; then cpp_main=cppmain fi) +# Enable Multibyte Characters for C/C++ +AC_ARG_ENABLE(c-mbchar, +[ --enable-c-mbchar Enable multibyte characters for C and C++.], +if [[[ x$enable_c_mbchar != xno ]]]; then + extra_c_flags=-DMULTIBYTE_CHARS=1 +fi) + # Enable Haifa scheduler. AC_ARG_ENABLE(haifa, [ --enable-haifa Use the experimental scheduler. 
@@ -193,6 +200,9 @@ AC_CHECK_FUNCS(strtoul bsearch strerror putenv popen bcopy bzero bcmp \ index rindex strchr strrchr kill getrlimit setrlimit atoll atoq \ sysconf isascii gettimeofday) +# Make sure wchar_t is available +#AC_CHECK_TYPE(wchar_t, unsigned int) + GCC_FUNC_VFPRINTF_DOPRNT GCC_FUNC_PRINTF_PTR @@ -3585,6 +3595,7 @@ AC_SUBST(extra_programs) AC_SUBST(extra_parts) AC_SUBST(extra_c_objs) AC_SUBST(extra_cxx_objs) +AC_SUBST(extra_cpp_objs) AC_SUBST(extra_c_flags) AC_SUBST(extra_objs) AC_SUBST(host_extra_gcc_objs) diff --git a/gcc/invoke.texi b/gcc/invoke.texi index 8056b8478bb..3b3ad4313db 100644 --- a/gcc/invoke.texi +++ b/gcc/invoke.texi @@ -5964,8 +5964,9 @@ the language standard. You should not need to use these options yourself. @cindex environment variables This section describes several environment variables that affect how GNU -CC operates. They work by specifying directories or prefixes to use -when searching for various kinds of files. +CC operates. Some of them work by specifying directories or prefixes to use +when searching for various kinds of files. Some are used to specify other +aspects of the compilation environment. @ifclear INTERNALS Note that you can also specify places to search using options such as @@ -6065,6 +6066,28 @@ which case the Make rules are written to that file, guessing the target name from the source file name. Or the value can have the form @samp{@var{file} @var{target}}, in which case the rules are written to file @var{file} using @var{target} as the target name. + +@item LANG +@findex LANG +@cindex locale definition +This variable is used to pass locale information to the compiler. One way in +which this information is used is to determine the character set to be used +when character literals, string literals and comments are parsed in C and C++. +When the compiler is configured to allow multibyte characters, +the following values for @code{LANG} are recognized: + +@table @code +@item C-JIS +Recognize JIS characters. 
+@item C-SJIS
+Recognize SJIS characters.
+@item C-EUCJP
+Recognize EUCJP characters.
+@end table
+
+If @code{LANG} is not defined, or if it has some other value, then the
+compiler will use mblen and mbtowc as defined by the default locale to
+recognize and translate multibyte characters.
 @end table
 
 @node Running Protoize
diff --git a/gcc/mbchar.c b/gcc/mbchar.c
new file mode 100644
index 00000000000..d54a49749ce
--- /dev/null
+++ b/gcc/mbchar.c
@@ -0,0 +1,328 @@
+/* Multibyte Character Functions.
+   Copyright (C) 1998 Free Software Foundation, Inc.
+
+This file is part of GNU CC.
+
+GNU CC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GNU CC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU CC; see the file COPYING.  If not, write to
+the Free Software Foundation, 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+/* These functions are used to manipulate multibyte characters.  */
+
+/* Note regarding cross compilation:
+
+   In general translation of multibyte characters to wide characters can
+   only work in a native compiler since the translation function (mbtowc)
+   needs to know about both the source and target character encoding.  However,
+   this particular implementation for JIS, SJIS and EUCJP source characters
+   will work for any compiler with a newlib target. Other targets may also
+   work provided that their wchar_t implementation is 2 bytes and the encoding
+   leaves the source character values unchanged (except for removing the
+   state shifting markers).  */
+
+#ifdef MULTIBYTE_CHARS
+#include "config.h"
+#include "system.h"
+#include "gansidecl.h"
+#include "mbchar.h"
+#include <locale.h>
+
+/* Input classes; these index the columns of the JIS tables below.
+   JIS_C_NUM counts the classes and is not itself a class.  */
+typedef enum
+{
+  ESCAPE, DOLLAR, BRACKET, AT, B, J, NUL, JIS_CHAR, OTHER, JIS_C_NUM
+} JIS_CHAR_TYPE;
+
+/* Decoder states; these index the rows of the JIS tables below.
+   JIS_S_NUM counts the states and is not itself a state.  */
+typedef enum
+{
+  ASCII, A_ESC, A_ESC_DL, JIS, JIS_1, JIS_2, J_ESC, J_ESC_BR,
+  J2_ESC, J2_ESC_BR, INV, JIS_S_NUM
+} JIS_STATE;
+
+/* What local_mbtowc does once it has classified an input byte.  */
+typedef enum
+{
+  COPYA, COPYJ, COPYJ2, MAKE_A, MAKE_J, NOOP, EMPTY, ERROR
+} JIS_ACTION;
+
+/*****************************************************************************
+ * state/action tables for processing JIS encoding
+ * Where possible, switches to JIS are grouped with the JIS characters
+ * that follow them and switches to ASCII are grouped with preceding JIS
+ * characters.
+ * Thus, maximum returned length is:
+ *   3 (switch to JIS) + 2 (JIS characters) + 3 (switch back to ASCII) = 8.
+ * No row is written for INV: every transition into INV is paired with
+ * the ERROR action, which returns before the state is used again.
+ * A switch to ASCII that is met on its own (row J_ESC_BR) uses MAKE_A so
+ * that the COPYA which follows reads the character after the switch.
+ *****************************************************************************/
+static JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = {
+/*             ESCAPE  DOLLAR    BRACKET    AT     B      J      NUL    JIS_CHAR OTHER */
+/*ASCII*/    { A_ESC,  ASCII,    ASCII,     ASCII, ASCII, ASCII, ASCII, ASCII,   ASCII },
+/*A_ESC*/    { ASCII,  A_ESC_DL, ASCII,     ASCII, ASCII, ASCII, ASCII, ASCII,   ASCII },
+/*A_ESC_DL*/ { ASCII,  ASCII,    ASCII,     JIS,   JIS,   ASCII, ASCII, ASCII,   ASCII },
+/*JIS*/      { J_ESC,  JIS_1,    JIS_1,     JIS_1, JIS_1, JIS_1, INV,   JIS_1,   INV },
+/*JIS_1*/    { INV,    JIS_2,    JIS_2,     JIS_2, JIS_2, JIS_2, INV,   JIS_2,   INV },
+/*JIS_2*/    { J2_ESC, JIS,      JIS,       JIS,   JIS,   JIS,   INV,   JIS,     JIS },
+/*J_ESC*/    { INV,    INV,      J_ESC_BR,  INV,   INV,   INV,   INV,   INV,     INV },
+/*J_ESC_BR*/ { INV,    INV,      INV,       INV,   ASCII, ASCII, INV,   INV,     INV },
+/*J2_ESC*/   { INV,    INV,      J2_ESC_BR, INV,   INV,   INV,   INV,   INV,     INV },
+/*J2_ESC_BR*/{ INV,    INV,      INV,       INV,   ASCII, ASCII, INV,   INV,     INV },
+};
+
+static JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = {
+/*             ESCAPE DOLLAR  BRACKET AT      B       J       NUL    JIS_CHAR OTHER */
+/*ASCII */    {NOOP,  COPYA,  COPYA,  COPYA,  COPYA,  COPYA,  EMPTY, COPYA,   COPYA },
+/*A_ESC */    {COPYA, NOOP,   COPYA,  COPYA,  COPYA,  COPYA,  COPYA, COPYA,   COPYA },
+/*A_ESC_DL */ {COPYA, COPYA,  COPYA,  MAKE_J, MAKE_J, COPYA,  COPYA, COPYA,   COPYA },
+/*JIS */      {NOOP,  NOOP,   NOOP,   NOOP,   NOOP,   NOOP,   ERROR, NOOP,    ERROR },
+/*JIS_1 */    {ERROR, NOOP,   NOOP,   NOOP,   NOOP,   NOOP,   ERROR, NOOP,    ERROR },
+/*JIS_2 */    {NOOP,  COPYJ2, COPYJ2, COPYJ2, COPYJ2, COPYJ2, ERROR, COPYJ2,  COPYJ2 },
+/*J_ESC */    {ERROR, ERROR,  NOOP,   ERROR,  ERROR,  ERROR,  ERROR, ERROR,   ERROR },
+/*J_ESC_BR */ {ERROR, ERROR,  ERROR,  ERROR,  MAKE_A, MAKE_A, ERROR, ERROR,   ERROR },
+/*J2_ESC */   {ERROR, ERROR,  NOOP,   ERROR,  ERROR,  ERROR,  ERROR, ERROR,   ERROR },
+/*J2_ESC_BR*/ {ERROR, ERROR,  ERROR,  ERROR,  COPYJ,  COPYJ,  ERROR, ERROR,   ERROR },
+};
+
+
+/* Name of the character set used for literals and comments.  NULL or a
+   name of at most one character selects the host's own mbtowc.  */
+char *literal_codeset = NULL;
+
+/* Convert the multibyte character starting at S, examining at most N
+   bytes, according to literal_codeset.  If PWC is not NULL the wide
+   character is stored in *PWC.
+
+   Returns the number of bytes consumed (0 if the first byte examined is
+   NUL), or -1 if the bytes are invalid or N is too small to hold a
+   complete character.  If S is NULL, returns nonzero when the encoding
+   is state-dependent and zero otherwise; for C-JIS this also resets the
+   saved shift state to ASCII.  */
+
+int
+local_mbtowc (pwc, s, n)
+     wchar_t *pwc;
+     const char *s;
+     size_t n;
+{
+  /* Shift state carried from one call to the next; only C-JIS uses it.  */
+  static JIS_STATE save_state = ASCII;
+  JIS_STATE curr_state = save_state;
+  const unsigned char *t = (const unsigned char *) s;
+
+  if (s != NULL && n == 0)
+    return -1;
+
+  if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
+    {
+      /* This must be the "C" locale or unknown locale -- fall thru */
+    }
+  else if (! strcmp (literal_codeset, "C-SJIS"))
+    {
+      int char1;
+      if (s == NULL)
+        return 0;  /* not state-dependent */
+      char1 = *t;
+      if (ISSJIS1 (char1))
+        {
+          int char2;
+          /* Check the length before reading the trail byte so that we
+             never look past the N bytes the caller supplied.  */
+          if (n <= 1)
+            return -1;
+          char2 = t[1];
+          if (ISSJIS2 (char2))
+            {
+              if (pwc != NULL)
+                *pwc = (((wchar_t)*t) << 8) + (wchar_t)(*(t+1));
+              return 2;
+            }
+          return -1;
+        }
+      if (pwc != NULL)
+        *pwc = (wchar_t)*t;
+      if (*t == '\0')
+        return 0;
+      return 1;
+    }
+  else if (! strcmp (literal_codeset, "C-EUCJP"))
+    {
+      int char1;
+      if (s == NULL)
+        return 0;  /* not state-dependent */
+      char1 = *t;
+      if (ISEUCJP (char1))
+        {
+          int char2;
+          /* As for C-SJIS: check the length before the trail byte.  */
+          if (n <= 1)
+            return -1;
+          char2 = t[1];
+          if (ISEUCJP (char2))
+            {
+              if (pwc != NULL)
+                *pwc = (((wchar_t)*t) << 8) + (wchar_t)(*(t+1));
+              return 2;
+            }
+          return -1;
+        }
+      if (pwc != NULL)
+        *pwc = (wchar_t)*t;
+      if (*t == '\0')
+        return 0;
+      return 1;
+    }
+  else if (! strcmp (literal_codeset, "C-JIS"))
+    {
+      JIS_ACTION action;
+      JIS_CHAR_TYPE ch;
+      const unsigned char *ptr;
+      size_t i;
+      int curr_ch;
+
+      if (s == NULL)
+        {
+          save_state = ASCII;
+          return 1;  /* state-dependent */
+        }
+
+      /* PTR marks the first byte of the character itself; it moves past
+         a leading switch sequence when MAKE_A or MAKE_J is seen.  */
+      ptr = t;
+
+      for (i = 0; i < n; ++i)
+        {
+          curr_ch = t[i];
+          switch (curr_ch)
+            {
+            case JIS_ESC_CHAR:
+              ch = ESCAPE;
+              break;
+            case '$':
+              ch = DOLLAR;
+              break;
+            case '@':
+              ch = AT;
+              break;
+            case '(':
+              ch = BRACKET;
+              break;
+            case 'B':
+              ch = B;
+              break;
+            case 'J':
+              ch = J;
+              break;
+            case '\0':
+              ch = NUL;
+              break;
+            default:
+              if (ISJIS (curr_ch))
+                ch = JIS_CHAR;
+              else
+                ch = OTHER;
+            }
+
+          action = JIS_action_table[curr_state][ch];
+          curr_state = JIS_state_table[curr_state][ch];
+
+          switch (action)
+            {
+            case NOOP:
+              break;
+            case EMPTY:
+              if (pwc != NULL)
+                *pwc = (wchar_t)0;
+              save_state = curr_state;
+              return (int) i;
+            case COPYA:
+              if (pwc != NULL)
+                *pwc = (wchar_t)*ptr;
+              save_state = curr_state;
+              return (int) (i + 1);
+            case COPYJ:
+              if (pwc != NULL)
+                *pwc = (((wchar_t)*ptr) << 8) + (wchar_t)(*(ptr+1));
+              save_state = curr_state;
+              return (int) (i + 1);
+            case COPYJ2:
+              if (pwc != NULL)
+                *pwc = (((wchar_t)*ptr) << 8) + (wchar_t)(*(ptr+1));
+              save_state = curr_state;
+              return (int) (ptr - t) + 2;
+            case MAKE_A:
+            case MAKE_J:
+              /* PTR and T have the same type, so no cast is wanted.  */
+              ptr = t + i + 1;
+              break;
+            case ERROR:
+            default:
+              return -1;
+            }
+        }
+
+      return -1;  /* n < bytes needed */
+    }
+
+#ifdef CROSS_COMPILE
+  if (s == NULL)
+    return 0;  /* not state-dependent */
+  /* Read the byte through T so that one with the high bit set is not
+     sign extended, and report NUL as length 0 as the branches above do.  */
+  if (pwc != NULL)
+    *pwc = (wchar_t) *t;
+  return *t == '\0' ? 0 : 1;
+#else
+  /* This must be the "C" locale or unknown locale.  */
+  return mbtowc (pwc, s, n);
+#endif
+}
+
+/* Return what local_mbtowc returns for the N bytes at S, without storing
+   a wide character.  */
+
+int
+local_mblen (s, n)
+     const char *s;
+     size_t n;
+{
+  return local_mbtowc (NULL, s, n);
+}
+
+/* Return the largest number of bytes local_mbtowc may need to examine for
+   one character in the encoding named by literal_codeset.  */
+
+int
+local_mb_cur_max ()
+{
+  if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
+    ;
+  else if (! strcmp (literal_codeset, "C-SJIS"))
+    return 2;
+  else if (! strcmp (literal_codeset, "C-EUCJP"))
+    return 2;
+  else if (! strcmp (literal_codeset, "C-JIS"))
+    return 8; /* 3 + 2 + 3 */
+
+#ifdef CROSS_COMPILE
+  return 1;
+#else
+  return MB_CUR_MAX;
+#endif
+}
+#endif /* MULTIBYTE_CHARS */
diff --git a/gcc/mbchar.h b/gcc/mbchar.h
new file mode 100644
index 00000000000..a4b82c0558d
--- /dev/null
+++ b/gcc/mbchar.h
@@ -0,0 +1,29 @@
+/* mbchar.h - Various declarations for functions found in mbchar.c
+   Copyright (C) 1998 Free Software Foundation, Inc.
+ */
+
+#ifndef __GCC_MBCHAR_H__
+#define __GCC_MBCHAR_H__
+
+#ifdef MULTIBYTE_CHARS
+/* escape character used for JIS encoding */
+#define JIS_ESC_CHAR 0x1b
+
+/* Byte range tests used by local_mbtowc.  Each argument is evaluated more
+   than once, so do not pass an expression with side effects.  */
+#define ISSJIS1(c) (((c) >= 0x81 && (c) <= 0x9f) || ((c) >= 0xe0 && (c) <= 0xef))
+#define ISSJIS2(c) (((c) >= 0x40 && (c) <= 0x7e) || ((c) >= 0x80 && (c) <= 0xfc))
+#define ISEUCJP(c) ((c) >= 0xa1 && (c) <= 0xfe)
+#define ISJIS(c) ((c) >= 0x21 && (c) <= 0x7e)
+
+/* Replacements for mbtowc, mblen and MB_CUR_MAX that honor
+   literal_codeset; see mbchar.c.  */
+int local_mbtowc PROTO ((wchar_t *, const char *, size_t));
+int local_mblen PROTO ((const char *, size_t));
+int local_mb_cur_max PROTO ((void));
+
+/* The locale being used for multibyte characters in string/char literals.  */
+extern char *literal_codeset;
+#endif /* MULTIBYTE_CHARS */
+
+#endif /* __GCC_MBCHAR_H__ */