1 /* 2 * Copyright (C) 1999-2001 Free Software Foundation, Inc. 3 * This file is part of the GNU LIBICONV Library. 4 * 5 * The GNU LIBICONV Library is free software; you can redistribute it 6 * and/or modify it under the terms of the GNU Library General Public 7 * License as published by the Free Software Foundation; either version 2 8 * of the License, or (at your option) any later version. 9 * 10 * The GNU LIBICONV Library is distributed in the hope that it will be 11 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Library General Public License for more details. 14 * 15 * You should have received a copy of the GNU Library General Public 16 * License along with the GNU LIBICONV Library; see the file COPYING.LIB. 17 * If not, write to the Free Software Foundation, Inc., 51 Franklin Street, 18 * Fifth Floor, Boston, MA 02110-1301, USA. 19 */ 20 21 /* 22 * ISO-2022-JP-2 23 */ 24 25 /* Specification: RFC 1554 */ 26 /* ESC '(' 'I' for JISX0201 Katakana is an extension not found in RFC 1554 or 27 CJK.INF, but implemented in glibc-2.1 and qt-2.0. */ 28 29 #define ESC 0x1b 30 31 /* 32 * The state is composed of one of the following values 33 */ 34 #define STATE_ASCII 0 35 #define STATE_JISX0201ROMAN 1 36 #define STATE_JISX0201KATAKANA 2 37 #define STATE_JISX0208 3 38 #define STATE_JISX0212 4 39 #define STATE_GB2312 5 40 #define STATE_KSC5601 6 41 /* 42 * and one of the following values, << 8 43 */ 44 #define STATE_G2_NONE 0 45 #define STATE_G2_ISO8859_1 1 46 #define STATE_G2_ISO8859_7 2 47 48 #define SPLIT_STATE \ 49 unsigned int state1 = state & 0xff, state2 = state >> 8 50 #define COMBINE_STATE \ 51 state = (state2 << 8) | state1 52 53 static int 54 iso2022_jp2_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) 55 { 56 state_t state = conv->istate; 57 SPLIT_STATE; 58 int count = 0; 59 unsigned char c; 60 for (;;) { 61 c = *s; 62 if (c == ESC) { 63 if (n < count+3) 64 goto none; 65 if (s[1] == '(') { 66 if (s[2] == 'B') { 67 state1 = STATE_ASCII; 68 s += 3; count += 3; 69 if (n < count+1) 70 goto none; 71 continue; 72 } 73 if (s[2] == 'J') { 74 state1 = STATE_JISX0201ROMAN; 75 s += 3; count += 3; 76 if (n < count+1) 77 goto none; 78 continue; 79 } 80 if (s[2] == 'I') { 81 state1 = STATE_JISX0201KATAKANA; 82 s += 3; count += 3; 83 if (n < count+1) 84 goto none; 85 continue; 86 } 87 return RET_ILSEQ; 88 } 89 if (s[1] == '$') { 90 if (s[2] == '@' || s[2] == 'B') { 91 /* We don't distinguish JIS X 0208-1978 and JIS X 0208-1983. */ 92 state1 = STATE_JISX0208; 93 s += 3; count += 3; 94 if (n < count+1) 95 goto none; 96 continue; 97 } 98 if (s[2] == 'A') { 99 state1 = STATE_GB2312; 100 s += 3; count += 3; 101 if (n < count+1) 102 goto none; 103 continue; 104 } 105 if (s[2] == '(') { 106 if (n < count+4) 107 goto none; 108 if (s[3] == 'D') { 109 state1 = STATE_JISX0212; 110 s += 4; count += 4; 111 if (n < count+1) 112 goto none; 113 continue; 114 } 115 if (s[3] == 'C') { 116 state1 = STATE_KSC5601; 117 s += 4; count += 4; 118 if (n < count+1) 119 goto none; 120 continue; 121 } 122 return RET_ILSEQ; 123 } 124 return RET_ILSEQ; 125 } 126 if (s[1] == '.') { 127 if (n < count+3) 128 goto none; 129 if (s[2] == 'A') { 130 state2 = STATE_G2_ISO8859_1; 131 s += 3; count += 3; 132 if (n < count+1) 133 goto none; 134 continue; 135 } 136 if (s[2] == 'F') { 137 state2 = STATE_G2_ISO8859_7; 138 s += 3; count += 3; 139 if (n < count+1) 140 goto none; 141 continue; 142 } 143 return RET_ILSEQ; 144 } 145 if (s[1] == 'N') { 146 switch (state2) { 147 case STATE_G2_NONE: 148 return RET_ILSEQ; 149 case STATE_G2_ISO8859_1: 150 if (s[2] < 0x80) { 151 unsigned char buf = s[2]+0x80; 152 int ret = iso8859_1_mbtowc(conv,pwc,&buf,1); 153 if (ret == RET_ILSEQ) 154 return RET_ILSEQ; 155 if (ret != 1) abort(); 156 COMBINE_STATE; 157 conv->istate = state; 158 return count+3; 159 } else 160 return RET_ILSEQ; 161 case STATE_G2_ISO8859_7: 162 if (s[2] < 0x80) { 163 unsigned char buf = s[2]+0x80; 164 int ret = iso8859_7_mbtowc(conv,pwc,&buf,1); 165 if (ret == RET_ILSEQ) 166 return RET_ILSEQ; 167 if (ret != 1) abort(); 168 COMBINE_STATE; 169 conv->istate = state; 170 return count+3; 171 } else 172 return RET_ILSEQ; 173 default: abort(); 174 } 175 } 176 return RET_ILSEQ; 177 } 178 break; 179 } 180 switch (state1) { 181 case STATE_ASCII: 182 if (c < 0x80) { 183 int ret = ascii_mbtowc(conv,pwc,s,1); 184 if (ret == RET_ILSEQ) 185 return RET_ILSEQ; 186 if (ret != 1) abort(); 187 if (*pwc == 0x000a || *pwc == 0x000d) 188 state2 = STATE_G2_NONE; 189 COMBINE_STATE; 190 conv->istate = state; 191 return count+1; 192 } else 193 return RET_ILSEQ; 194 case STATE_JISX0201ROMAN: 195 if (c < 0x80) { 196 int ret = jisx0201_mbtowc(conv,pwc,s,1); 197 if (ret == RET_ILSEQ) 198 return RET_ILSEQ; 199 if (ret != 1) abort(); 200 if (*pwc == 0x000a || *pwc == 0x000d) 201 state2 = STATE_G2_NONE; 202 COMBINE_STATE; 203 conv->istate = state; 204 return count+1; 205 } else 206 return RET_ILSEQ; 207 case STATE_JISX0201KATAKANA: 208 if (c < 0x80) { 209 unsigned char buf = c+0x80; 210 int ret = jisx0201_mbtowc(conv,pwc,&buf,1); 211 if (ret == RET_ILSEQ) 212 return RET_ILSEQ; 213 if (ret != 1) abort(); 214 COMBINE_STATE; 215 conv->istate = state; 216 return count+1; 217 } else 218 return RET_ILSEQ; 219 case STATE_JISX0208: 220 if (n < count+2) 221 goto none; 222 if (s[0] < 0x80 && s[1] < 0x80) { 223 int ret = jisx0208_mbtowc(conv,pwc,s,2); 224 if (ret == RET_ILSEQ) 225 return RET_ILSEQ; 226 if (ret != 2) abort(); 227 COMBINE_STATE; 228 conv->istate = state; 229 return count+2; 230 } else 231 return RET_ILSEQ; 232 case STATE_JISX0212: 233 if (n < count+2) 234 goto none; 235 if (s[0] < 0x80 && s[1] < 0x80) { 236 int ret = jisx0212_mbtowc(conv,pwc,s,2); 237 if (ret == RET_ILSEQ) 238 return RET_ILSEQ; 239 if (ret != 2) abort(); 240 COMBINE_STATE; 241 conv->istate = state; 242 return count+2; 243 } else 244 return RET_ILSEQ; 245 case STATE_GB2312: 246 if (n < count+2) 247 goto none; 248 if (s[0] < 0x80 && s[1] < 0x80) { 249 int ret = gb2312_mbtowc(conv,pwc,s,2); 250 if (ret == RET_ILSEQ) 251 return RET_ILSEQ; 252 if (ret != 2) abort(); 253 COMBINE_STATE; 254 conv->istate = state; 255 return count+2; 256 } else 257 return RET_ILSEQ; 258 case STATE_KSC5601: 259 if (n < count+2) 260 goto none; 261 if (s[0] < 0x80 && s[1] < 0x80) { 262 int ret = ksc5601_mbtowc(conv,pwc,s,2); 263 if (ret == RET_ILSEQ) 264 return RET_ILSEQ; 265 if (ret != 2) abort(); 266 COMBINE_STATE; 267 conv->istate = state; 268 return count+2; 269 } else 270 return RET_ILSEQ; 271 default: abort(); 272 } 273 274 none: 275 COMBINE_STATE; 276 conv->istate = state; 277 return RET_TOOFEW(count); 278 } 279 280 #undef COMBINE_STATE 281 #undef SPLIT_STATE 282 283 /* 284 * The state can also contain one of the following values, << 16. 285 * Values >= STATE_TAG_LANGUAGE are temporary tag parsing states. 286 */ 287 #define STATE_TAG_NONE 0 288 #define STATE_TAG_LANGUAGE 4 289 #define STATE_TAG_LANGUAGE_j 5 290 #define STATE_TAG_LANGUAGE_ja 1 291 #define STATE_TAG_LANGUAGE_k 6 292 #define STATE_TAG_LANGUAGE_ko 2 293 #define STATE_TAG_LANGUAGE_z 7 294 #define STATE_TAG_LANGUAGE_zh 3 295 296 #define SPLIT_STATE \ 297 unsigned int state1 = state & 0xff, state2 = (state >> 8) & 0xff, state3 = state >> 16 298 #define COMBINE_STATE \ 299 state = (state3 << 16) | (state2 << 8) | state1 300 301 static int 302 iso2022_jp2_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) 303 { 304 state_t state = conv->ostate; 305 SPLIT_STATE; 306 unsigned char buf[2]; 307 int ret; 308 /* This defines the conversion preferences depending on the current 309 langauge tag. */ 310 enum conversion { none = 0, european, japanese, chinese, korean, other }; 311 static const unsigned int conversion_lists[STATE_TAG_LANGUAGE] = { 312 /* STATE_TAG_NONE */ 313 japanese + (european << 3) + (chinese << 6) + (korean << 9) + (other << 12), 314 /* STATE_TAG_LANGUAGE_ja */ 315 japanese + (european << 3) + (chinese << 6) + (korean << 9) + (other << 12), 316 /* STATE_TAG_LANGUAGE_ko */ 317 korean + (european << 3) + (japanese << 6) + (chinese << 9) + (other << 12), 318 /* STATE_TAG_LANGUAGE_zh */ 319 chinese + (european << 3) + (japanese << 6) + (korean << 9) + (other << 12) 320 }; 321 unsigned int conversion_list; 322 323 /* Handle Unicode tag characters (range U+E0000..U+E007F). */ 324 if ((wc >> 7) == (0xe0000 >> 7)) { 325 char c = wc & 0x7f; 326 if (c >= 'A' && c <= 'Z') 327 c += 'a'-'A'; 328 switch (c) { 329 case 0x01: 330 state3 = STATE_TAG_LANGUAGE; 331 COMBINE_STATE; 332 conv->ostate = state; 333 return 0; 334 case 'j': 335 if (state3 == STATE_TAG_LANGUAGE) { 336 state3 = STATE_TAG_LANGUAGE_j; 337 COMBINE_STATE; 338 conv->ostate = state; 339 return 0; 340 } 341 break; 342 case 'a': 343 if (state3 == STATE_TAG_LANGUAGE_j) { 344 state3 = STATE_TAG_LANGUAGE_ja; 345 COMBINE_STATE; 346 conv->ostate = state; 347 return 0; 348 } 349 break; 350 case 'k': 351 if (state3 == STATE_TAG_LANGUAGE) { 352 state3 = STATE_TAG_LANGUAGE_k; 353 COMBINE_STATE; 354 conv->ostate = state; 355 return 0; 356 } 357 break; 358 case 'o': 359 if (state3 == STATE_TAG_LANGUAGE_k) { 360 state3 = STATE_TAG_LANGUAGE_ko; 361 COMBINE_STATE; 362 conv->ostate = state; 363 return 0; 364 } 365 break; 366 case 'z': 367 if (state3 == STATE_TAG_LANGUAGE) { 368 state3 = STATE_TAG_LANGUAGE_z; 369 COMBINE_STATE; 370 conv->ostate = state; 371 return 0; 372 } 373 break; 374 case 'h': 375 if (state3 == STATE_TAG_LANGUAGE_z) { 376 state3 = STATE_TAG_LANGUAGE_zh; 377 COMBINE_STATE; 378 conv->ostate = state; 379 return 0; 380 } 381 break; 382 case 0x7f: 383 state3 = STATE_TAG_NONE; 384 COMBINE_STATE; 385 conv->ostate = state; 386 return 0; 387 default: 388 break; 389 } 390 /* Other tag characters reset the tag parsing state or are ignored. */ 391 if (state3 >= STATE_TAG_LANGUAGE) 392 state3 = STATE_TAG_NONE; 393 COMBINE_STATE; 394 conv->ostate = state; 395 return 0; 396 } 397 if (state3 >= STATE_TAG_LANGUAGE) 398 state3 = STATE_TAG_NONE; 399 400 /* Try ASCII. */ 401 ret = ascii_wctomb(conv,buf,wc,1); 402 if (ret != RET_ILUNI) { 403 if (ret != 1) abort(); 404 if (buf[0] < 0x80) { 405 int count = (state1 == STATE_ASCII ? 1 : 4); 406 if (n < count) 407 return RET_TOOSMALL; 408 if (state1 != STATE_ASCII) { 409 r[0] = ESC; 410 r[1] = '('; 411 r[2] = 'B'; 412 r += 3; 413 state1 = STATE_ASCII; 414 } 415 r[0] = buf[0]; 416 if (wc == 0x000a || wc == 0x000d) 417 state2 = STATE_G2_NONE; 418 COMBINE_STATE; 419 conv->ostate = state; 420 return count; 421 } 422 } 423 424 conversion_list = conversion_lists[state3]; 425 426 do { 427 switch (conversion_list & ((1 << 3) - 1)) { 428 429 case european: 430 431 /* Try ISO-8859-1. */ 432 ret = iso8859_1_wctomb(conv,buf,wc,1); 433 if (ret != RET_ILUNI) { 434 if (ret != 1) abort(); 435 if (buf[0] >= 0x80) { 436 int count = (state2 == STATE_G2_ISO8859_1 ? 3 : 6); 437 if (n < count) 438 return RET_TOOSMALL; 439 if (state2 != STATE_G2_ISO8859_1) { 440 r[0] = ESC; 441 r[1] = '.'; 442 r[2] = 'A'; 443 r += 3; 444 state2 = STATE_G2_ISO8859_1; 445 } 446 r[0] = ESC; 447 r[1] = 'N'; 448 r[2] = buf[0]-0x80; 449 COMBINE_STATE; 450 conv->ostate = state; 451 return count; 452 } 453 } 454 455 /* Try ISO-8859-7. */ 456 ret = iso8859_7_wctomb(conv,buf,wc,1); 457 if (ret != RET_ILUNI) { 458 if (ret != 1) abort(); 459 if (buf[0] >= 0x80) { 460 int count = (state2 == STATE_G2_ISO8859_7 ? 3 : 6); 461 if (n < count) 462 return RET_TOOSMALL; 463 if (state2 != STATE_G2_ISO8859_7) { 464 r[0] = ESC; 465 r[1] = '.'; 466 r[2] = 'F'; 467 r += 3; 468 state2 = STATE_G2_ISO8859_7; 469 } 470 r[0] = ESC; 471 r[1] = 'N'; 472 r[2] = buf[0]-0x80; 473 COMBINE_STATE; 474 conv->ostate = state; 475 return count; 476 } 477 } 478 479 break; 480 481 case japanese: 482 483 /* Try JIS X 0201-1976 Roman. */ 484 ret = jisx0201_wctomb(conv,buf,wc,1); 485 if (ret != RET_ILUNI) { 486 if (ret != 1) abort(); 487 if (buf[0] < 0x80) { 488 int count = (state1 == STATE_JISX0201ROMAN ? 1 : 4); 489 if (n < count) 490 return RET_TOOSMALL; 491 if (state1 != STATE_JISX0201ROMAN) { 492 r[0] = ESC; 493 r[1] = '('; 494 r[2] = 'J'; 495 r += 3; 496 state1 = STATE_JISX0201ROMAN; 497 } 498 r[0] = buf[0]; 499 if (wc == 0x000a || wc == 0x000d) 500 state2 = STATE_G2_NONE; 501 COMBINE_STATE; 502 conv->ostate = state; 503 return count; 504 } 505 } 506 507 /* Try JIS X 0208-1990 in place of JIS X 0208-1978 and 508 JIS X 0208-1983. */ 509 ret = jisx0208_wctomb(conv,buf,wc,2); 510 if (ret != RET_ILUNI) { 511 if (ret != 2) abort(); 512 if (buf[0] < 0x80 && buf[1] < 0x80) { 513 int count = (state1 == STATE_JISX0208 ? 2 : 5); 514 if (n < count) 515 return RET_TOOSMALL; 516 if (state1 != STATE_JISX0208) { 517 r[0] = ESC; 518 r[1] = '$'; 519 r[2] = 'B'; 520 r += 3; 521 state1 = STATE_JISX0208; 522 } 523 r[0] = buf[0]; 524 r[1] = buf[1]; 525 COMBINE_STATE; 526 conv->ostate = state; 527 return count; 528 } 529 } 530 531 /* Try JIS X 0212-1990. */ 532 ret = jisx0212_wctomb(conv,buf,wc,2); 533 if (ret != RET_ILUNI) { 534 if (ret != 2) abort(); 535 if (buf[0] < 0x80 && buf[1] < 0x80) { 536 int count = (state1 == STATE_JISX0212 ? 2 : 6); 537 if (n < count) 538 return RET_TOOSMALL; 539 if (state1 != STATE_JISX0212) { 540 r[0] = ESC; 541 r[1] = '$'; 542 r[2] = '('; 543 r[3] = 'D'; 544 r += 4; 545 state1 = STATE_JISX0212; 546 } 547 r[0] = buf[0]; 548 r[1] = buf[1]; 549 COMBINE_STATE; 550 conv->ostate = state; 551 return count; 552 } 553 } 554 555 break; 556 557 case chinese: 558 559 /* Try GB 2312-1980. */ 560 ret = gb2312_wctomb(conv,buf,wc,2); 561 if (ret != RET_ILUNI) { 562 if (ret != 2) abort(); 563 if (buf[0] < 0x80 && buf[1] < 0x80) { 564 int count = (state1 == STATE_GB2312 ? 2 : 5); 565 if (n < count) 566 return RET_TOOSMALL; 567 if (state1 != STATE_GB2312) { 568 r[0] = ESC; 569 r[1] = '$'; 570 r[2] = 'A'; 571 r += 3; 572 state1 = STATE_GB2312; 573 } 574 r[0] = buf[0]; 575 r[1] = buf[1]; 576 COMBINE_STATE; 577 conv->ostate = state; 578 return count; 579 } 580 } 581 582 break; 583 584 case korean: 585 586 /* Try KS C 5601-1992. */ 587 ret = ksc5601_wctomb(conv,buf,wc,2); 588 if (ret != RET_ILUNI) { 589 if (ret != 2) abort(); 590 if (buf[0] < 0x80 && buf[1] < 0x80) { 591 int count = (state1 == STATE_KSC5601 ? 2 : 6); 592 if (n < count) 593 return RET_TOOSMALL; 594 if (state1 != STATE_KSC5601) { 595 r[0] = ESC; 596 r[1] = '$'; 597 r[2] = '('; 598 r[3] = 'C'; 599 r += 4; 600 state1 = STATE_KSC5601; 601 } 602 r[0] = buf[0]; 603 r[1] = buf[1]; 604 COMBINE_STATE; 605 conv->ostate = state; 606 return count; 607 } 608 } 609 610 break; 611 612 case other: 613 614 /* Try JIS X 0201-1976 Kana. This is not officially part of 615 ISO-2022-JP-2, according to RFC 1554. Therefore we try this 616 only after all other attempts. */ 617 ret = jisx0201_wctomb(conv,buf,wc,1); 618 if (ret != RET_ILUNI) { 619 if (ret != 1) abort(); 620 if (buf[0] >= 0x80) { 621 int count = (state1 == STATE_JISX0201KATAKANA ? 1 : 4); 622 if (n < count) 623 return RET_TOOSMALL; 624 if (state1 != STATE_JISX0201KATAKANA) { 625 r[0] = ESC; 626 r[1] = '('; 627 r[2] = 'I'; 628 r += 3; 629 state1 = STATE_JISX0201KATAKANA; 630 } 631 r[0] = buf[0]-0x80; 632 COMBINE_STATE; 633 conv->ostate = state; 634 return count; 635 } 636 } 637 638 break; 639 640 default: 641 abort(); 642 } 643 644 conversion_list = conversion_list >> 3; 645 } while (conversion_list != 0); 646 647 return RET_ILUNI; 648 } 649 650 static int 651 iso2022_jp2_reset (conv_t conv, unsigned char *r, int n) 652 { 653 state_t state = conv->ostate; 654 SPLIT_STATE; 655 (void)state2; 656 (void)state3; 657 if (state1 != STATE_ASCII) { 658 if (n < 3) 659 return RET_TOOSMALL; 660 r[0] = ESC; 661 r[1] = '('; 662 r[2] = 'B'; 663 /* conv->ostate = 0; will be done by the caller */ 664 return 3; 665 } else 666 return 0; 667 } 668 669 #undef COMBINE_STATE 670 #undef SPLIT_STATE 671 #undef STATE_TAG_LANGUAGE_zh 672 #undef STATE_TAG_LANGUAGE_z 673 #undef STATE_TAG_LANGUAGE_ko 674 #undef STATE_TAG_LANGUAGE_k 675 #undef STATE_TAG_LANGUAGE_ja 676 #undef STATE_TAG_LANGUAGE_j 677 #undef STATE_TAG_LANGUAGE 678 #undef STATE_TAG_NONE 679 #undef STATE_G2_ISO8859_7 680 #undef STATE_G2_ISO8859_1 681 #undef STATE_G2_NONE 682 #undef STATE_KSC5601 683 #undef STATE_GB2312 684 #undef STATE_JISX0212 685 #undef STATE_JISX0208 686 #undef STATE_JISX0201KATAKANA 687 #undef STATE_JISX0201ROMAN 688 #undef STATE_ASCII 689