1 /* 2 * Copyright 2002-2009, Haiku, Inc. All rights reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Michael Wilber 7 * Axel Dörfler, axeld@pinc-software.de 8 */ 9 10 11 #include "STXTTranslator.h" 12 #include "STXTView.h" 13 14 #include <Catalog.h> 15 #include <CharacterSet.h> 16 #include <CharacterSetRoster.h> 17 #include <MimeType.h> 18 #include <String.h> 19 #include <UTF8.h> 20 21 #include <algorithm> 22 #include <new> 23 #include <string.h> 24 #include <stdio.h> 25 #include <stdint.h> 26 27 28 using namespace BPrivate; 29 using namespace std; 30 31 #undef B_TRANSLATE_CONTEXT 32 #define B_TRANSLATE_CONTEXT "STXTTranslator" 33 34 #define READ_BUFFER_SIZE 32768 35 #define DATA_BUFFER_SIZE 256 36 37 // The input formats that this translator supports. 38 static const translation_format sInputFormats[] = { 39 { 40 B_TRANSLATOR_TEXT, 41 B_TRANSLATOR_TEXT, 42 TEXT_IN_QUALITY, 43 TEXT_IN_CAPABILITY, 44 "text/plain", 45 "Plain text file" 46 }, 47 { 48 B_STYLED_TEXT_FORMAT, 49 B_TRANSLATOR_TEXT, 50 STXT_IN_QUALITY, 51 STXT_IN_CAPABILITY, 52 "text/x-vnd.Be-stxt", 53 "Be styled text file" 54 } 55 }; 56 57 // The output formats that this translator supports. 
58 static const translation_format sOutputFormats[] = { 59 { 60 B_TRANSLATOR_TEXT, 61 B_TRANSLATOR_TEXT, 62 TEXT_OUT_QUALITY, 63 TEXT_OUT_CAPABILITY, 64 "text/plain", 65 "Plain text file" 66 }, 67 { 68 B_STYLED_TEXT_FORMAT, 69 B_TRANSLATOR_TEXT, 70 STXT_OUT_QUALITY, 71 STXT_OUT_CAPABILITY, 72 "text/x-vnd.Be-stxt", 73 "Be styled text file" 74 } 75 }; 76 77 // Default settings for the Translator 78 static const TranSetting sDefaultSettings[] = { 79 {B_TRANSLATOR_EXT_HEADER_ONLY, TRAN_SETTING_BOOL, false}, 80 {B_TRANSLATOR_EXT_DATA_ONLY, TRAN_SETTING_BOOL, false} 81 }; 82 83 const uint32 kNumInputFormats = sizeof(sInputFormats) / sizeof(translation_format); 84 const uint32 kNumOutputFormats = sizeof(sOutputFormats) / sizeof(translation_format); 85 const uint32 kNumDefaultSettings = sizeof(sDefaultSettings) / sizeof(TranSetting); 86 87 // --------------------------------------------------------------- 88 // make_nth_translator 89 // 90 // Creates a STXTTranslator object to be used by BTranslatorRoster 91 // 92 // Preconditions: 93 // 94 // Parameters: n, The translator to return. Since 95 // STXTTranslator only publishes one 96 // translator, it only returns a 97 // STXTTranslator if n == 0 98 // 99 // you, The image_id of the add-on that 100 // contains code (not used). 101 // 102 // flags, Has no meaning yet, should be 0. 103 // 104 // Postconditions: 105 // 106 // Returns: NULL if n is not zero, 107 // a new STXTTranslator if n is zero 108 // --------------------------------------------------------------- 109 BTranslator * 110 make_nth_translator(int32 n, image_id you, uint32 flags, ...) 111 { 112 if (!n) 113 return new (std::nothrow) STXTTranslator(); 114 115 return NULL; 116 } 117 118 119 // #pragma mark - ascmagic.c from the BSD file tool 120 /* 121 * The following code has been taken from version 4.17 of the BSD file tool, 122 * file ascmagic.c, modified for our purpose. 123 */ 124 125 /* 126 * Copyright (c) Ian F. Darwin 1986-1995. 
127 * Software written by Ian F. Darwin and others; 128 * maintained 1995-present by Christos Zoulas and others. 129 * 130 * Redistribution and use in source and binary forms, with or without 131 * modification, are permitted provided that the following conditions 132 * are met: 133 * 1. Redistributions of source code must retain the above copyright 134 * notice immediately at the beginning of the file, without modification, 135 * this list of conditions, and the following disclaimer. 136 * 2. Redistributions in binary form must reproduce the above copyright 137 * notice, this list of conditions and the following disclaimer in the 138 * documentation and/or other materials provided with the distribution. 139 * 140 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 141 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 142 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 143 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR 144 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 145 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 146 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 147 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 148 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 149 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 150 * SUCH DAMAGE. 151 */ 152 /* 153 * ASCII magic -- file types that we know based on keywords 154 * that can appear anywhere in the file. 
155 * bool found = false; 156 if (subtypeMimeSpecific != NULL) { 157 mimeType->SetTo(subtypeMimeSpecific); 158 if (mimeType->IsInstalled()) 159 found = true; 160 } 161 if (!found && subtypeMimeGeneric != NULL) { 162 mimeType->SetTo(subtypeMimeGeneric); 163 if (mimeType->IsInstalled()) 164 found = true; 165 } 166 if (!found) 167 mimeType->SetTo("text/plain"); 168 169 * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000, 170 * to handle character codes other than ASCII on a unified basis. 171 * 172 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit 173 * international characters, now subsumed into this file. 174 */ 175 176 #include <stdio.h> 177 #include <string.h> 178 #include <memory.h> 179 #include <ctype.h> 180 #include <stdlib.h> 181 #include <unistd.h> 182 #include "names.h" 183 184 typedef unsigned long my_unichar; 185 186 #define MAXLINELEN 300 /* longest sane line length */ 187 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \ 188 || (x) == 0x85 || (x) == '\f') 189 190 static int looks_ascii(const unsigned char *, size_t, my_unichar *, size_t *); 191 static int looks_utf8(const unsigned char *, size_t, my_unichar *, size_t *); 192 static int looks_unicode(const unsigned char *, size_t, my_unichar *, size_t *); 193 static int looks_latin1(const unsigned char *, size_t, my_unichar *, size_t *); 194 static int looks_extended(const unsigned char *, size_t, my_unichar *, size_t *); 195 static void from_ebcdic(const unsigned char *, size_t, unsigned char *); 196 static int ascmatch(const unsigned char *, const my_unichar *, size_t); 197 198 199 static int 200 file_ascmagic(const unsigned char *buf, size_t nbytes, BMimeType* mimeType, 201 const char*& encoding) 202 { 203 size_t i; 204 unsigned char *nbuf = NULL; 205 my_unichar *ubuf = NULL; 206 size_t ulen; 207 struct names *p; 208 int rv = -1; 209 210 const char *code = NULL; 211 encoding = NULL; 212 const char *type = NULL; 213 const char *subtype = 
NULL; 214 const char *subtypeMimeGeneric = NULL; 215 const char *subtypeMimeSpecific = NULL; 216 217 int has_escapes = 0; 218 int has_backspace = 0; 219 int seen_cr = 0; 220 221 int n_crlf = 0; 222 int n_lf = 0; 223 int n_cr = 0; 224 int n_nel = 0; 225 226 int last_line_end = -1; 227 int has_long_lines = 0; 228 229 if ((nbuf = (unsigned char*)malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL) 230 goto done; 231 if ((ubuf = (my_unichar*)malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL) 232 goto done; 233 234 /* 235 * Then try to determine whether it's any character code we can 236 * identify. Each of these tests, if it succeeds, will leave 237 * the text converted into one-my_unichar-per-character Unicode in 238 * ubuf, and the number of characters converted in ulen. 239 */ 240 if (nbytes == 0) { 241 code = "UTF-8 Unicode"; 242 encoding = NULL; // "UTF-8"; 243 type = "text"; 244 rv = 1; 245 } else if (looks_ascii(buf, nbytes, ubuf, &ulen)) { 246 code = "ASCII"; 247 encoding = NULL; //"us-ascii"; 248 type = "text"; 249 if (nbytes == 1) { 250 // no further tests 251 rv = 1; 252 } 253 } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) { 254 code = "UTF-8 Unicode"; 255 encoding = NULL; // "UTF-8"; 256 type = "text"; 257 } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) { 258 if (i == 1) { 259 code = "Little-endian UTF-16 Unicode"; 260 encoding = "UTF-16"; 261 } else { 262 code = "Big-endian UTF-16 Unicode"; 263 encoding = "UTF-16"; 264 } 265 266 type = "character data"; 267 } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) { 268 code = "ISO-8859"; 269 type = "text"; 270 encoding = "iso-8859-1"; 271 } else if (looks_extended(buf, nbytes, ubuf, &ulen)) { 272 code = "Non-ISO extended-ASCII"; 273 type = "text"; 274 encoding = "unknown"; 275 } else { 276 from_ebcdic(buf, nbytes, nbuf); 277 278 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) { 279 code = "EBCDIC"; 280 type = "character data"; 281 encoding = "ebcdic"; 282 } else if (looks_latin1(nbuf, nbytes, ubuf, 
&ulen)) { 283 code = "International EBCDIC"; 284 type = "character data"; 285 encoding = "ebcdic"; 286 } else { 287 rv = 0; 288 goto done; /* doesn't look like text at all */ 289 } 290 } 291 292 if (nbytes <= 1) { 293 if (rv == -1) 294 rv = 0; 295 goto done; 296 } 297 298 /* 299 * for troff, look for . + letter + letter or .\"; 300 * this must be done to disambiguate tar archives' ./file 301 * and other trash from real troff input. 302 * 303 * I believe Plan 9 troff allows non-ASCII characters in the names 304 * of macros, so this test might possibly fail on such a file. 305 */ 306 if (*ubuf == '.') { 307 my_unichar *tp = ubuf + 1; 308 309 while (ISSPC(*tp)) 310 ++tp; /* skip leading whitespace */ 311 if ((tp[0] == '\\' && tp[1] == '\"') || 312 (isascii((unsigned char)tp[0]) && 313 isalnum((unsigned char)tp[0]) && 314 isascii((unsigned char)tp[1]) && 315 isalnum((unsigned char)tp[1]) && 316 ISSPC(tp[2]))) { 317 subtypeMimeGeneric = "text/x-source-code"; 318 subtypeMimeSpecific = "text/troff"; 319 subtype = "troff or preprocessor input"; 320 goto subtype_identified; 321 } 322 } 323 324 if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) { 325 subtypeMimeGeneric = "text/x-source-code"; 326 subtypeMimeSpecific = "text/fortran"; 327 subtype = "fortran program"; 328 goto subtype_identified; 329 } 330 331 /* look for tokens from names.h - this is expensive! 
*/ 332 333 i = 0; 334 while (i < ulen) { 335 size_t end; 336 337 /* 338 * skip past any leading space 339 */ 340 while (i < ulen && ISSPC(ubuf[i])) 341 i++; 342 if (i >= ulen) 343 break; 344 345 /* 346 * find the next whitespace 347 */ 348 for (end = i + 1; end < nbytes; end++) 349 if (ISSPC(ubuf[end])) 350 break; 351 352 /* 353 * compare the word thus isolated against the token list 354 */ 355 for (p = names; p < names + NNAMES; p++) { 356 if (ascmatch((const unsigned char *)p->name, ubuf + i, 357 end - i)) { 358 subtype = types[p->type].human; 359 subtypeMimeGeneric = types[p->type].generic_mime; 360 subtypeMimeSpecific = types[p->type].specific_mime; 361 goto subtype_identified; 362 } 363 } 364 365 i = end; 366 } 367 368 subtype_identified: 369 370 /* 371 * Now try to discover other details about the file. 372 */ 373 for (i = 0; i < ulen; i++) { 374 if (ubuf[i] == '\n') { 375 if (seen_cr) 376 n_crlf++; 377 else 378 n_lf++; 379 last_line_end = i; 380 } else if (seen_cr) 381 n_cr++; 382 383 seen_cr = (ubuf[i] == '\r'); 384 if (seen_cr) 385 last_line_end = i; 386 387 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */ 388 n_nel++; 389 last_line_end = i; 390 } 391 392 /* If this line is _longer_ than MAXLINELEN, remember it. */ 393 if ((int)i > last_line_end + MAXLINELEN) 394 has_long_lines = 1; 395 396 if (ubuf[i] == '\033') 397 has_escapes = 1; 398 if (ubuf[i] == '\b') 399 has_backspace = 1; 400 } 401 402 rv = 1; 403 done: 404 if (nbuf) 405 free(nbuf); 406 if (ubuf) 407 free(ubuf); 408 409 if (rv) { 410 // If we have identified the subtype, return it, otherwise just 411 // text/plain. 
412 413 bool found = false; 414 if (subtypeMimeSpecific != NULL) { 415 mimeType->SetTo(subtypeMimeSpecific); 416 if (mimeType->IsInstalled()) 417 found = true; 418 } 419 if (!found && subtypeMimeGeneric != NULL) { 420 mimeType->SetTo(subtypeMimeGeneric); 421 if (mimeType->IsInstalled()) 422 found = true; 423 } 424 if (!found) 425 mimeType->SetTo("text/plain"); 426 } 427 428 return rv; 429 } 430 431 static int 432 ascmatch(const unsigned char *s, const my_unichar *us, size_t ulen) 433 { 434 size_t i; 435 436 for (i = 0; i < ulen; i++) { 437 if (s[i] != us[i]) 438 return 0; 439 } 440 441 if (s[i]) 442 return 0; 443 else 444 return 1; 445 } 446 447 /* 448 * This table reflects a particular philosophy about what constitutes 449 * "text," and there is room for disagreement about it. 450 * 451 * Version 3.31 of the file command considered a file to be ASCII if 452 * each of its characters was approved by either the isascii() or 453 * isalpha() function. On most systems, this would mean that any 454 * file consisting only of characters in the range 0x00 ... 0x7F 455 * would be called ASCII text, but many systems might reasonably 456 * consider some characters outside this range to be alphabetic, 457 * so the file command would call such characters ASCII. It might 458 * have been more accurate to call this "considered textual on the 459 * local system" than "ASCII." 460 * 461 * It considered a file to be "International language text" if each 462 * of its characters was either an ASCII printing character (according 463 * to the real ASCII standard, not the above test), a character in 464 * the range 0x80 ... 0xFF, or one of the following control characters: 465 * backspace, tab, line feed, vertical tab, form feed, carriage return, 466 * escape. No attempt was made to determine the language in which files 467 * of this type were written. 
468 * 469 * 470 * The table below considers a file to be ASCII if all of its characters 471 * are either ASCII printing characters (again, according to the X3.4 472 * standard, not isascii()) or any of the following controls: bell, 473 * backspace, tab, line feed, form feed, carriage return, esc, nextline. 474 * 475 * I include bell because some programs (particularly shell scripts) 476 * use it literally, even though it is rare in normal text. I exclude 477 * vertical tab because it never seems to be used in real text. I also 478 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85), 479 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline 480 * character to. It might be more appropriate to include it in the 8859 481 * set instead of the ASCII set, but it's got to be included in *something* 482 * we recognize or EBCDIC files aren't going to be considered textual. 483 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek 484 * and Latin characters, so these should possibly be allowed. But they 485 * make a real mess on VT100-style displays if they're not paired properly, 486 * so we are probably better off not calling them text. 487 * 488 * A file is considered to be ISO-8859 text if its characters are all 489 * either ASCII, according to the above definition, or printing characters 490 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF. 491 * 492 * Finally, a file is considered to be international text from some other 493 * character code if its characters are all either ISO-8859 (according to 494 * the above definition) or characters in the range 0x80 ... 0x9F, which 495 * ISO-8859 considers to be control characters but the IBM PC and Macintosh 496 * consider to be printing characters. 
497 */ 498 499 #define F 0 /* character never appears in text */ 500 #define T 1 /* character appears in plain ASCII text */ 501 #define I 2 /* character appears in ISO-8859 text */ 502 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 503 504 static char text_chars[256] = { 505 /* BEL BS HT LF FF CR */ 506 F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */ 507 /* ESC */ 508 F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ 509 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ 510 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ 511 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ 512 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ 513 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ 514 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ 515 /* NEL */ 516 X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ 517 X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ 518 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */ 519 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ 520 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ 521 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ 522 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ 523 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ 524 }; 525 526 static int 527 looks_ascii(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, 528 size_t *ulen) 529 { 530 int i; 531 532 *ulen = 0; 533 534 for (i = 0; i < (int)nbytes; i++) { 535 int t = text_chars[buf[i]]; 536 537 if (t != T) 538 return 0; 539 540 ubuf[(*ulen)++] = buf[i]; 541 } 542 543 return 1; 544 } 545 546 static int 547 looks_latin1(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen) 548 { 549 int i; 550 551 *ulen = 0; 552 553 for (i = 0; i < (int)nbytes; i++) { 554 int t = text_chars[buf[i]]; 555 556 if (t != T && t != I) 557 return 0; 558 559 ubuf[(*ulen)++] = buf[i]; 560 } 
561 562 return 1; 563 } 564 565 static int 566 looks_extended(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, 567 size_t *ulen) 568 { 569 int i; 570 571 *ulen = 0; 572 573 for (i = 0; i < (int)nbytes; i++) { 574 int t = text_chars[buf[i]]; 575 576 if (t != T && t != I && t != X) 577 return 0; 578 579 ubuf[(*ulen)++] = buf[i]; 580 } 581 582 return 1; 583 } 584 585 static int 586 looks_utf8(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen) 587 { 588 int i, n; 589 my_unichar c; 590 int gotone = 0; 591 592 *ulen = 0; 593 594 for (i = 0; i < (int)nbytes; i++) { 595 if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ 596 /* 597 * Even if the whole file is valid UTF-8 sequences, 598 * still reject it if it uses weird control characters. 599 */ 600 601 if (text_chars[buf[i]] != T) 602 return 0; 603 604 ubuf[(*ulen)++] = buf[i]; 605 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ 606 return 0; 607 } else { /* 11xxxxxx begins UTF-8 */ 608 int following; 609 610 if ((buf[i] & 0x20) == 0) { /* 110xxxxx */ 611 c = buf[i] & 0x1f; 612 following = 1; 613 } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */ 614 c = buf[i] & 0x0f; 615 following = 2; 616 } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */ 617 c = buf[i] & 0x07; 618 following = 3; 619 } else if ((buf[i] & 0x04) == 0) { /* 111110xx */ 620 c = buf[i] & 0x03; 621 following = 4; 622 } else if ((buf[i] & 0x02) == 0) { /* 1111110x */ 623 c = buf[i] & 0x01; 624 following = 5; 625 } else 626 return 0; 627 628 for (n = 0; n < following; n++) { 629 i++; 630 if (i >= (int)nbytes) 631 goto done; 632 633 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) 634 return 0; 635 636 c = (c << 6) + (buf[i] & 0x3f); 637 } 638 639 ubuf[(*ulen)++] = c; 640 gotone = 1; 641 } 642 } 643 done: 644 return gotone; /* don't claim it's UTF-8 if it's all 7-bit */ 645 } 646 647 static int 648 looks_unicode(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, 649 size_t *ulen) 650 { 651 int bigend; 
652 int i; 653 654 if (nbytes < 2) 655 return 0; 656 657 if (buf[0] == 0xff && buf[1] == 0xfe) 658 bigend = 0; 659 else if (buf[0] == 0xfe && buf[1] == 0xff) 660 bigend = 1; 661 else 662 return 0; 663 664 *ulen = 0; 665 666 for (i = 2; i + 1 < (int)nbytes; i += 2) { 667 /* XXX fix to properly handle chars > 65536 */ 668 669 if (bigend) 670 ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i]; 671 else 672 ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1]; 673 674 if (ubuf[*ulen - 1] == 0xfffe) 675 return 0; 676 if (ubuf[*ulen - 1] < 128 && 677 text_chars[(size_t)ubuf[*ulen - 1]] != T) 678 return 0; 679 } 680 681 return 1 + bigend; 682 } 683 684 #undef F 685 #undef T 686 #undef I 687 #undef X 688 689 /* 690 * This table maps each EBCDIC character to an (8-bit extended) ASCII 691 * character, as specified in the rationale for the dd(1) command in 692 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard. 693 * 694 * Unfortunately it does not seem to correspond exactly to any of the 695 * five variants of EBCDIC documented in IBM's _Enterprise Systems 696 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh 697 * Edition, July, 1999, pp. I-1 - I-4. 698 * 699 * Fortunately, though, all versions of EBCDIC, including this one, agree 700 * on most of the printing characters that also appear in (7-bit) ASCII. 701 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all. 702 * 703 * Fortunately too, there is general agreement that codes 0x00 through 704 * 0x3F represent control characters, 0x41 a nonbreaking space, and the 705 * remainder printing characters. 706 * 707 * This is sufficient to allow us to identify EBCDIC text and to distinguish 708 * between old-style and internationalized examples of text. 
/*
 * This table maps each EBCDIC character to an (8-bit extended) ASCII
 * character, as specified in the rationale for the dd(1) command in
 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
 *
 * Unfortunately it does not seem to correspond exactly to any of the
 * five variants of EBCDIC documented in IBM's _Enterprise Systems
 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
 * Edition, July, 1999, pp. I-1 - I-4.
 *
 * Fortunately, though, all versions of EBCDIC, including this one, agree
 * on most of the printing characters that also appear in (7-bit) ASCII.
 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
 *
 * Fortunately too, there is general agreement that codes 0x00 through
 * 0x3F represent control characters, 0x41 a nonbreaking space, and the
 * remainder printing characters.
 *
 * This is sufficient to allow us to identify EBCDIC text and to distinguish
 * between old-style and internationalized examples of text.
 */

static const unsigned char ebcdic_to_ascii[] = {
  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};

#ifdef notdef
/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality.  It comes from
 *
 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 */

static unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif

/*
 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
 * out[] must have room for nbytes bytes; nothing is written if nbytes is 0.
 */
static void
from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
{
	// size_t index: an int index would be wrong for sizes beyond INT_MAX
	for (size_t i = 0; i < nbytes; i++)
		out[i] = ebcdic_to_ascii[buf[i]];
}


//	#pragma mark -
784 785 \param header the STXT stream header read in by Identify() or Translate() 786 \param inSource the stream with the STXT data 787 \param outInfo information about the type of data from inSource is stored here 788 \param outType the desired output type for the data in inSource 789 \param ptxtheader if this is not NULL, the TEXT header from 790 inSource is copied to it 791 */ 792 status_t 793 identify_stxt_header(const TranslatorStyledTextStreamHeader &header, 794 BPositionIO *inSource, translator_info *outInfo, uint32 outType, 795 TranslatorStyledTextTextHeader *ptxtheader = NULL) 796 { 797 const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader); 798 const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader); 799 800 uint8 buffer[max(ktxtsize, kstylsize)]; 801 802 // Check the TEXT header 803 TranslatorStyledTextTextHeader txtheader; 804 if (inSource->Read(buffer, ktxtsize) != ktxtsize) 805 return B_NO_TRANSLATOR; 806 807 memcpy(&txtheader, buffer, ktxtsize); 808 if (swap_data(B_UINT32_TYPE, &txtheader, ktxtsize, 809 B_SWAP_BENDIAN_TO_HOST) != B_OK) 810 return B_ERROR; 811 812 if (txtheader.header.magic != 'TEXT' 813 || txtheader.header.header_size != sizeof(TranslatorStyledTextTextHeader) 814 || txtheader.charset != B_UNICODE_UTF8) 815 return B_NO_TRANSLATOR; 816 817 // skip the text data 818 off_t seekresult, pos; 819 pos = header.header.header_size + txtheader.header.header_size 820 + txtheader.header.data_size; 821 seekresult = inSource->Seek(txtheader.header.data_size, 822 SEEK_CUR); 823 if (seekresult < pos) 824 return B_NO_TRANSLATOR; 825 if (seekresult > pos) 826 return B_ERROR; 827 828 // check the STYL header (not all STXT files have this) 829 ssize_t read = 0; 830 TranslatorStyledTextStyleHeader stylheader; 831 read = inSource->Read(buffer, kstylsize); 832 if (read < 0) 833 return read; 834 if (read != kstylsize && read != 0) 835 return B_NO_TRANSLATOR; 836 837 // If there is a STYL header 838 if (read == kstylsize) { 839 
memcpy(&stylheader, buffer, kstylsize); 840 if (swap_data(B_UINT32_TYPE, &stylheader, kstylsize, 841 B_SWAP_BENDIAN_TO_HOST) != B_OK) 842 return B_ERROR; 843 844 if (stylheader.header.magic != 'STYL' 845 || stylheader.header.header_size != 846 sizeof(TranslatorStyledTextStyleHeader)) 847 return B_NO_TRANSLATOR; 848 } 849 850 // if output TEXT header is supplied, fill it with data 851 if (ptxtheader) { 852 ptxtheader->header.magic = txtheader.header.magic; 853 ptxtheader->header.header_size = txtheader.header.header_size; 854 ptxtheader->header.data_size = txtheader.header.data_size; 855 ptxtheader->charset = txtheader.charset; 856 } 857 858 // return information about the data in the stream 859 outInfo->type = B_STYLED_TEXT_FORMAT; 860 outInfo->group = B_TRANSLATOR_TEXT; 861 outInfo->quality = STXT_IN_QUALITY; 862 outInfo->capability = STXT_IN_CAPABILITY; 863 strcpy(outInfo->name, B_TRANSLATE("Be styled text file")); 864 strcpy(outInfo->MIME, "text/x-vnd.Be-stxt"); 865 866 return B_OK; 867 } 868 869 870 /*! 871 Determines if the data in \a inSource is of the UTF8 plain 872 873 \param data buffer containing data already read (must be at 874 least DATA_BUFFER_SIZE bytes large) 875 \param nread number of bytes that have already been read from the stream 876 \param header the STXT stream header read in by Identify() or Translate() 877 \param inSource the stream with the STXT data 878 \param outInfo information about the type of data from inSource is stored here 879 \param outType the desired output type for the data in inSource 880 */ 881 status_t 882 identify_text(uint8* data, int32 bytesRead, BPositionIO* source, 883 translator_info* outInfo, uint32 outType, const char*& encoding) 884 { 885 ssize_t readLater = source->Read(data + bytesRead, DATA_BUFFER_SIZE - bytesRead); 886 if (readLater < B_OK) 887 return B_NO_TRANSLATOR; 888 889 bytesRead += readLater; 890 891 // TODO: identify encoding as possible! 
892 BMimeType type; 893 if (!file_ascmagic((const unsigned char*)data, bytesRead, &type, encoding)) 894 return B_NO_TRANSLATOR; 895 896 float capability = TEXT_IN_CAPABILITY; 897 if (bytesRead < 20) 898 capability = .1f; 899 900 // return information about the data in the stream 901 outInfo->type = B_TRANSLATOR_TEXT; 902 outInfo->group = B_TRANSLATOR_TEXT; 903 outInfo->quality = TEXT_IN_QUALITY; 904 outInfo->capability = capability; 905 906 char description[B_MIME_TYPE_LENGTH]; 907 if (type.GetLongDescription(description) == B_OK) 908 strlcpy(outInfo->name, description, sizeof(outInfo->name)); 909 else 910 strlcpy(outInfo->name, B_TRANSLATE("Plain text file"), 911 sizeof(outInfo->name)); 912 913 //strlcpy(outInfo->MIME, type.Type(), sizeof(outInfo->MIME)); 914 strcpy(outInfo->MIME, "text/plain"); 915 return B_OK; 916 } 917 918 919 // --------------------------------------------------------------- 920 // translate_from_stxt 921 // 922 // Translates the data in inSource to the type outType and stores 923 // the translated data in outDestination. 
924 // 925 // Preconditions: 926 // 927 // Parameters: inSource, the data to be translated 928 // 929 // outDestination, where the translated data is 930 // put 931 // 932 // outType, the type to convert inSource to 933 // 934 // txtheader, the TEXT header from inSource 935 // 936 // 937 // Postconditions: 938 // 939 // Returns: B_BAD_VALUE, if outType is invalid 940 // 941 // B_NO_TRANSLATOR, if this translator doesn't understand the data 942 // 943 // B_ERROR, if there was an error allocating memory or converting 944 // data 945 // 946 // B_OK, if all went well 947 // --------------------------------------------------------------- 948 status_t 949 translate_from_stxt(BPositionIO *inSource, BPositionIO *outDestination, 950 uint32 outType, const TranslatorStyledTextTextHeader &txtheader) 951 { 952 if (inSource->Seek(0, SEEK_SET) != 0) 953 return B_ERROR; 954 955 const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader); 956 const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader); 957 958 bool btoplain; 959 if (outType == B_TRANSLATOR_TEXT) 960 btoplain = true; 961 else if (outType == B_STYLED_TEXT_FORMAT) 962 btoplain = false; 963 else 964 return B_BAD_VALUE; 965 966 uint8 buffer[READ_BUFFER_SIZE]; 967 ssize_t nread = 0, nwritten = 0, nreed = 0, ntotalread = 0; 968 969 // skip to the actual text data when outputting a 970 // plain text file 971 if (btoplain) { 972 if (inSource->Seek(kstxtsize + ktxtsize, SEEK_CUR) != 973 kstxtsize + ktxtsize) 974 return B_ERROR; 975 } 976 977 // Read data from inSource 978 // When outputing B_TRANSLATOR_TEXT, the loop stops when all of 979 // the text data has been read and written. 980 // When outputting B_STYLED_TEXT_FORMAT, the loop stops when all 981 // of the data from inSource has been read and written. 
982 if (btoplain) 983 nreed = min((size_t)READ_BUFFER_SIZE, 984 txtheader.header.data_size - ntotalread); 985 else 986 nreed = READ_BUFFER_SIZE; 987 nread = inSource->Read(buffer, nreed); 988 while (nread > 0) { 989 nwritten = outDestination->Write(buffer, nread); 990 if (nwritten != nread) 991 return B_ERROR; 992 993 if (btoplain) { 994 ntotalread += nread; 995 nreed = min((size_t)READ_BUFFER_SIZE, 996 txtheader.header.data_size - ntotalread); 997 } else 998 nreed = READ_BUFFER_SIZE; 999 nread = inSource->Read(buffer, nreed); 1000 } 1001 1002 if (btoplain && static_cast<ssize_t>(txtheader.header.data_size) != 1003 ntotalread) 1004 // If not all of the text data was able to be read... 1005 return B_NO_TRANSLATOR; 1006 else 1007 return B_OK; 1008 } 1009 1010 // --------------------------------------------------------------- 1011 // output_headers 1012 // 1013 // Outputs the Stream and Text headers from the B_STYLED_TEXT_FORMAT 1014 // to outDestination, setting the data_size member of the text header 1015 // to text_data_size 1016 // 1017 // Preconditions: 1018 // 1019 // Parameters: outDestination, where the translated data is 1020 // put 1021 // 1022 // text_data_size, number of bytes in data section 1023 // of the TEXT header 1024 // 1025 // 1026 // Postconditions: 1027 // 1028 // Returns: 1029 // 1030 // B_ERROR, if there was an error writing to outDestination or 1031 // an error with converting the byte order 1032 // 1033 // B_OK, if all went well 1034 // --------------------------------------------------------------- 1035 status_t 1036 output_headers(BPositionIO *outDestination, uint32 text_data_size) 1037 { 1038 const int32 kHeadersSize = sizeof(TranslatorStyledTextStreamHeader) + 1039 sizeof(TranslatorStyledTextTextHeader); 1040 status_t result; 1041 TranslatorStyledTextStreamHeader stxtheader; 1042 TranslatorStyledTextTextHeader txtheader; 1043 1044 uint8 buffer[kHeadersSize]; 1045 1046 stxtheader.header.magic = 'STXT'; 1047 stxtheader.header.header_size = 
sizeof(TranslatorStyledTextStreamHeader); 1048 stxtheader.header.data_size = 0; 1049 stxtheader.version = 100; 1050 memcpy(buffer, &stxtheader, stxtheader.header.header_size); 1051 1052 txtheader.header.magic = 'TEXT'; 1053 txtheader.header.header_size = sizeof(TranslatorStyledTextTextHeader); 1054 txtheader.header.data_size = text_data_size; 1055 txtheader.charset = B_UNICODE_UTF8; 1056 memcpy(buffer + stxtheader.header.header_size, &txtheader, 1057 txtheader.header.header_size); 1058 1059 // write out headers in Big Endian byte order 1060 result = swap_data(B_UINT32_TYPE, buffer, kHeadersSize, 1061 B_SWAP_HOST_TO_BENDIAN); 1062 if (result == B_OK) { 1063 ssize_t nwritten = 0; 1064 nwritten = outDestination->Write(buffer, kHeadersSize); 1065 if (nwritten != kHeadersSize) 1066 return B_ERROR; 1067 else 1068 return B_OK; 1069 } 1070 1071 return result; 1072 } 1073 1074 // --------------------------------------------------------------- 1075 // output_styles 1076 // 1077 // Writes out the actual style information into outDestination 1078 // using the data from pflatRunArray 1079 // 1080 // Preconditions: 1081 // 1082 // Parameters: outDestination, where the translated data is 1083 // put 1084 // 1085 // text_size, size in bytes of the text in 1086 // outDestination 1087 // 1088 // data_size, size of pflatRunArray 1089 // 1090 // Postconditions: 1091 // 1092 // Returns: 1093 // 1094 // B_ERROR, if there was an error writing to outDestination or 1095 // an error with converting the byte order 1096 // 1097 // B_OK, if all went well 1098 // --------------------------------------------------------------- 1099 status_t 1100 output_styles(BPositionIO *outDestination, uint32 text_size, 1101 uint8 *pflatRunArray, ssize_t data_size) 1102 { 1103 const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader); 1104 1105 uint8 buffer[kstylsize]; 1106 1107 // output STYL header 1108 TranslatorStyledTextStyleHeader stylheader; 1109 stylheader.header.magic = 'STYL'; 1110 
stylheader.header.header_size = 1111 sizeof(TranslatorStyledTextStyleHeader); 1112 stylheader.header.data_size = data_size; 1113 stylheader.apply_offset = 0; 1114 stylheader.apply_length = text_size; 1115 1116 memcpy(buffer, &stylheader, kstylsize); 1117 if (swap_data(B_UINT32_TYPE, buffer, kstylsize, 1118 B_SWAP_HOST_TO_BENDIAN) != B_OK) 1119 return B_ERROR; 1120 if (outDestination->Write(buffer, kstylsize) != kstylsize) 1121 return B_ERROR; 1122 1123 // output actual style information 1124 if (outDestination->Write(pflatRunArray, 1125 data_size) != data_size) 1126 return B_ERROR; 1127 1128 return B_OK; 1129 } 1130 1131 1132 /*! 1133 Convert the plain text (UTF8) from inSource to plain or 1134 styled text in outDestination 1135 */ 1136 status_t 1137 translate_from_text(BPositionIO* source, const char* encoding, bool forceEncoding, 1138 BPositionIO* destination, uint32 outType) 1139 { 1140 if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT) 1141 return B_BAD_VALUE; 1142 1143 // find the length of the text 1144 off_t size = source->Seek(0, SEEK_END); 1145 if (size < 0) 1146 return (status_t)size; 1147 if (size > UINT32_MAX && outType == B_STYLED_TEXT_FORMAT) 1148 return B_NOT_SUPPORTED; 1149 1150 status_t status = source->Seek(0, SEEK_SET); 1151 if (status < B_OK) 1152 return status; 1153 1154 if (outType == B_STYLED_TEXT_FORMAT) { 1155 // output styled text headers 1156 status = output_headers(destination, (uint32)size); 1157 if (status != B_OK) 1158 return status; 1159 } 1160 1161 class MallocBuffer { 1162 public: 1163 MallocBuffer() : fBuffer(NULL), fSize(0) {} 1164 ~MallocBuffer() { free(fBuffer); } 1165 1166 void* Buffer() { return fBuffer; } 1167 size_t Size() const { return fSize; } 1168 1169 status_t 1170 Allocate(size_t size) 1171 { 1172 fBuffer = malloc(size); 1173 if (fBuffer != NULL) { 1174 fSize = size; 1175 return B_OK; 1176 } 1177 return B_NO_MEMORY; 1178 } 1179 1180 private: 1181 void* fBuffer; 1182 size_t fSize; 1183 } 
encodingBuffer; 1184 BMallocIO encodingIO; 1185 uint32 encodingID = 0; 1186 // defaults to UTF-8 or no encoding 1187 1188 BNode* node = dynamic_cast<BNode*>(source); 1189 if (node != NULL) { 1190 // determine encoding, if available 1191 const BCharacterSet* characterSet = NULL; 1192 bool hasAttribute = false; 1193 if (encoding != NULL && !forceEncoding) { 1194 BString name; 1195 if (node->ReadAttrString("be:encoding", &name) == B_OK) { 1196 encoding = name.String(); 1197 hasAttribute = true; 1198 } else { 1199 int32 value; 1200 ssize_t bytesRead = node->ReadAttr("be:encoding", B_INT32_TYPE, 0, 1201 &value, sizeof(value)); 1202 if (bytesRead == (ssize_t)sizeof(value)) { 1203 hasAttribute = true; 1204 if (value != 65535) 1205 characterSet = BCharacterSetRoster::GetCharacterSetByConversionID(value); 1206 } 1207 } 1208 } else { 1209 hasAttribute = true; 1210 // we don't write the encoding in this case 1211 } 1212 if (characterSet == NULL && encoding != NULL) 1213 characterSet = BCharacterSetRoster::FindCharacterSetByName(encoding); 1214 1215 if (characterSet != NULL) { 1216 encodingID = characterSet->GetConversionID(); 1217 encodingBuffer.Allocate(READ_BUFFER_SIZE * 4); 1218 } 1219 1220 if (!hasAttribute && encoding != NULL) { 1221 // add encoding attribute, so that someone opening the file can 1222 // retrieve it for persistance 1223 node->WriteAttr("be:encoding", B_STRING_TYPE, 0, encoding, 1224 strlen(encoding)); 1225 } 1226 } 1227 1228 off_t outputSize = 0; 1229 ssize_t bytesRead; 1230 int32 state = 0; 1231 1232 // output the actual text part of the data 1233 do { 1234 uint8 buffer[READ_BUFFER_SIZE]; 1235 bytesRead = source->Read(buffer, READ_BUFFER_SIZE); 1236 if (bytesRead < B_OK) 1237 return bytesRead; 1238 if (bytesRead == 0) 1239 break; 1240 1241 if (encodingBuffer.Size() == 0) { 1242 // default, no encoding 1243 ssize_t bytesWritten = destination->Write(buffer, bytesRead); 1244 if (bytesWritten != bytesRead) { 1245 if (bytesWritten < B_OK) 1246 return 
bytesWritten; 1247 1248 return B_ERROR; 1249 } 1250 1251 outputSize += bytesRead; 1252 } else { 1253 // decode text file to UTF-8 1254 char* pos = (char*)buffer; 1255 int32 encodingLength = encodingIO.BufferLength(); 1256 int32 bytesLeft = bytesRead; 1257 int32 bytes; 1258 do { 1259 encodingLength = READ_BUFFER_SIZE * 4; 1260 bytes = bytesLeft; 1261 1262 status = convert_to_utf8(encodingID, pos, &bytes, 1263 (char*)encodingBuffer.Buffer(), &encodingLength, &state); 1264 if (status < B_OK) 1265 return status; 1266 1267 ssize_t bytesWritten = destination->Write(encodingBuffer.Buffer(), 1268 encodingLength); 1269 if (bytesWritten < encodingLength) { 1270 if (bytesWritten < B_OK) 1271 return bytesWritten; 1272 1273 return B_ERROR; 1274 } 1275 1276 pos += bytes; 1277 bytesLeft -= bytes; 1278 outputSize += encodingLength; 1279 } while (encodingLength > 0 && bytesLeft > 0); 1280 } 1281 } while (bytesRead > 0); 1282 1283 if (outType != B_STYLED_TEXT_FORMAT) 1284 return B_OK; 1285 1286 if (encodingBuffer.Size() != 0 && size != outputSize) { 1287 if (outputSize > UINT32_MAX) 1288 return B_NOT_SUPPORTED; 1289 1290 // we need to update the header as the decoded text size has changed 1291 status = destination->Seek(0, SEEK_SET); 1292 if (status == B_OK) 1293 status = output_headers(destination, (uint32)outputSize); 1294 if (status == B_OK) 1295 status = destination->Seek(0, SEEK_END); 1296 1297 if (status < B_OK) 1298 return status; 1299 } 1300 1301 // Read file attributes if outputting styled data 1302 // and source is a BNode object 1303 1304 if (node == NULL) 1305 return B_OK; 1306 1307 // Try to read styles - we only propagate an error if the actual on-disk 1308 // data is likely to be okay 1309 1310 const char *kAttrName = "styles"; 1311 attr_info info; 1312 if (node->GetAttrInfo(kAttrName, &info) != B_OK) 1313 return B_OK; 1314 1315 if (info.type != B_RAW_TYPE || info.size < 160) { 1316 // styles seem to be broken, but since we got the text, 1317 // we don't propagate the 
error 1318 return B_OK; 1319 } 1320 1321 uint8* flatRunArray = new (std::nothrow) uint8[info.size]; 1322 if (flatRunArray == NULL) 1323 return B_NO_MEMORY; 1324 1325 bytesRead = node->ReadAttr(kAttrName, B_RAW_TYPE, 0, flatRunArray, info.size); 1326 if (bytesRead != info.size) 1327 return B_OK; 1328 1329 output_styles(destination, size, flatRunArray, info.size); 1330 1331 delete[] flatRunArray; 1332 return B_OK; 1333 } 1334 1335 1336 // #pragma mark - 1337 1338 1339 STXTTranslator::STXTTranslator() 1340 : BaseTranslator(B_TRANSLATE("StyledEdit files"), 1341 B_TRANSLATE("StyledEdit files translator"), 1342 STXT_TRANSLATOR_VERSION, 1343 sInputFormats, kNumInputFormats, 1344 sOutputFormats, kNumOutputFormats, 1345 "STXTTranslator_Settings", 1346 sDefaultSettings, kNumDefaultSettings, 1347 B_TRANSLATOR_TEXT, B_STYLED_TEXT_FORMAT) 1348 { 1349 } 1350 1351 1352 STXTTranslator::~STXTTranslator() 1353 { 1354 } 1355 1356 1357 status_t 1358 STXTTranslator::Identify(BPositionIO *inSource, 1359 const translation_format *inFormat, BMessage *ioExtension, 1360 translator_info *outInfo, uint32 outType) 1361 { 1362 if (!outType) 1363 outType = B_TRANSLATOR_TEXT; 1364 if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT) 1365 return B_NO_TRANSLATOR; 1366 1367 const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader); 1368 1369 uint8 buffer[DATA_BUFFER_SIZE]; 1370 status_t nread = 0; 1371 // Read in the header to determine 1372 // if the data is supported 1373 nread = inSource->Read(buffer, kstxtsize); 1374 if (nread < 0) 1375 return nread; 1376 1377 // read in enough data to fill the stream header 1378 if (nread == kstxtsize) { 1379 TranslatorStyledTextStreamHeader header; 1380 memcpy(&header, buffer, kstxtsize); 1381 if (swap_data(B_UINT32_TYPE, &header, kstxtsize, 1382 B_SWAP_BENDIAN_TO_HOST) != B_OK) 1383 return B_ERROR; 1384 1385 if (header.header.magic == B_STYLED_TEXT_FORMAT 1386 && header.header.header_size == (int32)kstxtsize 1387 && 
header.header.data_size == 0 1388 && header.version == 100) 1389 return identify_stxt_header(header, inSource, outInfo, outType); 1390 } 1391 1392 // if the data is not styled text, check if it is plain text 1393 const char* encoding; 1394 return identify_text(buffer, nread, inSource, outInfo, outType, encoding); 1395 } 1396 1397 1398 status_t 1399 STXTTranslator::Translate(BPositionIO* source, const translator_info* info, 1400 BMessage* ioExtension, uint32 outType, BPositionIO* outDestination) 1401 { 1402 if (!outType) 1403 outType = B_TRANSLATOR_TEXT; 1404 if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT) 1405 return B_NO_TRANSLATOR; 1406 1407 const ssize_t headerSize = sizeof(TranslatorStyledTextStreamHeader); 1408 uint8 buffer[DATA_BUFFER_SIZE]; 1409 status_t result; 1410 translator_info outInfo; 1411 // Read in the header to determine 1412 // if the data is supported 1413 ssize_t bytesRead = source->Read(buffer, headerSize); 1414 if (bytesRead < 0) 1415 return bytesRead; 1416 1417 // read in enough data to fill the stream header 1418 if (bytesRead == headerSize) { 1419 TranslatorStyledTextStreamHeader header; 1420 memcpy(&header, buffer, headerSize); 1421 if (swap_data(B_UINT32_TYPE, &header, headerSize, 1422 B_SWAP_BENDIAN_TO_HOST) != B_OK) 1423 return B_ERROR; 1424 1425 if (header.header.magic == B_STYLED_TEXT_FORMAT 1426 && header.header.header_size == sizeof(TranslatorStyledTextStreamHeader) 1427 && header.header.data_size == 0 1428 && header.version == 100) { 1429 TranslatorStyledTextTextHeader textHeader; 1430 result = identify_stxt_header(header, source, &outInfo, outType, 1431 &textHeader); 1432 if (result != B_OK) 1433 return result; 1434 1435 return translate_from_stxt(source, outDestination, outType, textHeader); 1436 } 1437 } 1438 1439 // if the data is not styled text, check if it is ASCII text 1440 bool forceEncoding = false; 1441 const char* encoding = NULL; 1442 result = identify_text(buffer, bytesRead, source, &outInfo, 
outType, encoding); 1443 if (result != B_OK) 1444 return result; 1445 1446 if (ioExtension != NULL) { 1447 const char* value; 1448 if (ioExtension->FindString("be:encoding", &value) == B_OK 1449 && value[0]) { 1450 // override encoding 1451 encoding = value; 1452 forceEncoding = true; 1453 } 1454 } 1455 1456 return translate_from_text(source, encoding, forceEncoding, outDestination, outType); 1457 } 1458 1459 1460 BView * 1461 STXTTranslator::NewConfigView(TranslatorSettings *settings) 1462 { 1463 return new STXTView(BRect(0, 0, 225, 175), 1464 B_TRANSLATE("STXTTranslator Settings"), 1465 B_FOLLOW_ALL, B_WILL_DRAW, settings); 1466 } 1467 1468