1 /* 2 * Copyright 2002-2008, Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Michael Wilber 7 * Axel Dörfler, axeld@pinc-software.de 8 */ 9 10 11 #include "STXTTranslator.h" 12 #include "STXTView.h" 13 14 #include <CharacterSet.h> 15 #include <CharacterSetRoster.h> 16 #include <MimeType.h> 17 #include <String.h> 18 #include <UTF8.h> 19 20 #include <algorithm> 21 #include <new> 22 #include <string.h> 23 #include <stdio.h> 24 #include <stdint.h> 25 26 27 using namespace BPrivate; 28 using namespace std; 29 30 31 #define READ_BUFFER_SIZE 32768 32 #define DATA_BUFFER_SIZE 256 33 34 // The input formats that this translator supports. 35 translation_format gInputFormats[] = { 36 { 37 B_TRANSLATOR_TEXT, 38 B_TRANSLATOR_TEXT, 39 TEXT_IN_QUALITY, 40 TEXT_IN_CAPABILITY, 41 "text/plain", 42 "Plain text file" 43 }, 44 { 45 B_STYLED_TEXT_FORMAT, 46 B_TRANSLATOR_TEXT, 47 STXT_IN_QUALITY, 48 STXT_IN_CAPABILITY, 49 "text/x-vnd.Be-stxt", 50 "Be styled text file" 51 } 52 }; 53 54 // The output formats that this translator supports. 55 translation_format gOutputFormats[] = { 56 { 57 B_TRANSLATOR_TEXT, 58 B_TRANSLATOR_TEXT, 59 TEXT_OUT_QUALITY, 60 TEXT_OUT_CAPABILITY, 61 "text/plain", 62 "Plain text file" 63 }, 64 { 65 B_STYLED_TEXT_FORMAT, 66 B_TRANSLATOR_TEXT, 67 STXT_OUT_QUALITY, 68 STXT_OUT_CAPABILITY, 69 "text/x-vnd.Be-stxt", 70 "Be styled text file" 71 } 72 }; 73 74 // Default settings for the Translator 75 TranSetting gDefaultSettings[] = { 76 {B_TRANSLATOR_EXT_HEADER_ONLY, TRAN_SETTING_BOOL, false}, 77 {B_TRANSLATOR_EXT_DATA_ONLY, TRAN_SETTING_BOOL, false} 78 }; 79 80 // --------------------------------------------------------------- 81 // make_nth_translator 82 // 83 // Creates a STXTTranslator object to be used by BTranslatorRoster 84 // 85 // Preconditions: 86 // 87 // Parameters: n, The translator to return. 
Since 88 // STXTTranslator only publishes one 89 // translator, it only returns a 90 // STXTTranslator if n == 0 91 // 92 // you, The image_id of the add-on that 93 // contains code (not used). 94 // 95 // flags, Has no meaning yet, should be 0. 96 // 97 // Postconditions: 98 // 99 // Returns: NULL if n is not zero, 100 // a new STXTTranslator if n is zero 101 // --------------------------------------------------------------- 102 BTranslator * 103 make_nth_translator(int32 n, image_id you, uint32 flags, ...) 104 { 105 if (!n) 106 return new (std::nothrow) STXTTranslator(); 107 108 return NULL; 109 } 110 111 112 // #pragma mark - ascmagic.c from the BSD file tool 113 /* 114 * The following code has been taken from version 4.17 of the BSD file tool, 115 * file ascmagic.c, modified for our purpose. 116 */ 117 118 /* 119 * Copyright (c) Ian F. Darwin 1986-1995. 120 * Software written by Ian F. Darwin and others; 121 * maintained 1995-present by Christos Zoulas and others. 122 * 123 * Redistribution and use in source and binary forms, with or without 124 * modification, are permitted provided that the following conditions 125 * are met: 126 * 1. Redistributions of source code must retain the above copyright 127 * notice immediately at the beginning of the file, without modification, 128 * this list of conditions, and the following disclaimer. 129 * 2. Redistributions in binary form must reproduce the above copyright 130 * notice, this list of conditions and the following disclaimer in the 131 * documentation and/or other materials provided with the distribution. 132 * 133 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 134 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 135 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 136 * ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * ASCII magic -- file types that we know based on keywords
 * that can appear anywhere in the file.
 *
 * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
 * to handle character codes other than ASCII on a unified basis.
 *
 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
 * international characters, now subsumed into this file.
167 */ 168 169 #include <stdio.h> 170 #include <string.h> 171 #include <memory.h> 172 #include <ctype.h> 173 #include <stdlib.h> 174 #include <unistd.h> 175 #include "names.h" 176 177 typedef unsigned long my_unichar; 178 179 #define MAXLINELEN 300 /* longest sane line length */ 180 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \ 181 || (x) == 0x85 || (x) == '\f') 182 183 static int looks_ascii(const unsigned char *, size_t, my_unichar *, size_t *); 184 static int looks_utf8(const unsigned char *, size_t, my_unichar *, size_t *); 185 static int looks_unicode(const unsigned char *, size_t, my_unichar *, size_t *); 186 static int looks_latin1(const unsigned char *, size_t, my_unichar *, size_t *); 187 static int looks_extended(const unsigned char *, size_t, my_unichar *, size_t *); 188 static void from_ebcdic(const unsigned char *, size_t, unsigned char *); 189 static int ascmatch(const unsigned char *, const my_unichar *, size_t); 190 191 192 static int 193 file_ascmagic(const unsigned char *buf, size_t nbytes, BMimeType* mimeType, 194 const char*& encoding) 195 { 196 size_t i; 197 unsigned char *nbuf = NULL; 198 my_unichar *ubuf = NULL; 199 size_t ulen; 200 struct names *p; 201 int rv = -1; 202 203 const char *code = NULL; 204 encoding = NULL; 205 const char *type = NULL; 206 const char *subtype = NULL; 207 const char *subtypeMimeGeneric = NULL; 208 const char *subtypeMimeSpecific = NULL; 209 210 int has_escapes = 0; 211 int has_backspace = 0; 212 int seen_cr = 0; 213 214 int n_crlf = 0; 215 int n_lf = 0; 216 int n_cr = 0; 217 int n_nel = 0; 218 219 int last_line_end = -1; 220 int has_long_lines = 0; 221 222 if ((nbuf = (unsigned char*)malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL) 223 goto done; 224 if ((ubuf = (my_unichar*)malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL) 225 goto done; 226 227 /* 228 * Then try to determine whether it's any character code we can 229 * identify. 
Each of these tests, if it succeeds, will leave 230 * the text converted into one-my_unichar-per-character Unicode in 231 * ubuf, and the number of characters converted in ulen. 232 */ 233 if (nbytes == 0) { 234 code = "UTF-8 Unicode"; 235 encoding = NULL; // "UTF-8"; 236 type = "text"; 237 rv = 1; 238 } else if (looks_ascii(buf, nbytes, ubuf, &ulen)) { 239 code = "ASCII"; 240 encoding = NULL; //"us-ascii"; 241 type = "text"; 242 if (nbytes == 1) { 243 // no further tests 244 rv = 1; 245 } 246 } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) { 247 code = "UTF-8 Unicode"; 248 encoding = NULL; // "UTF-8"; 249 type = "text"; 250 } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) { 251 if (i == 1) { 252 code = "Little-endian UTF-16 Unicode"; 253 encoding = "UTF-16"; 254 } else { 255 code = "Big-endian UTF-16 Unicode"; 256 encoding = "UTF-16"; 257 } 258 259 type = "character data"; 260 } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) { 261 code = "ISO-8859"; 262 type = "text"; 263 encoding = "iso-8859-1"; 264 } else if (looks_extended(buf, nbytes, ubuf, &ulen)) { 265 code = "Non-ISO extended-ASCII"; 266 type = "text"; 267 encoding = "unknown"; 268 } else { 269 from_ebcdic(buf, nbytes, nbuf); 270 271 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) { 272 code = "EBCDIC"; 273 type = "character data"; 274 encoding = "ebcdic"; 275 } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) { 276 code = "International EBCDIC"; 277 type = "character data"; 278 encoding = "ebcdic"; 279 } else { 280 rv = 0; 281 goto done; /* doesn't look like text at all */ 282 } 283 } 284 285 if (nbytes <= 1) { 286 if (rv == -1) 287 rv = 0; 288 goto done; 289 } 290 291 /* 292 * for troff, look for . + letter + letter or .\"; 293 * this must be done to disambiguate tar archives' ./file 294 * and other trash from real troff input. 295 * 296 * I believe Plan 9 troff allows non-ASCII characters in the names 297 * of macros, so this test might possibly fail on such a file. 
298 */ 299 if (*ubuf == '.') { 300 my_unichar *tp = ubuf + 1; 301 302 while (ISSPC(*tp)) 303 ++tp; /* skip leading whitespace */ 304 if ((tp[0] == '\\' && tp[1] == '\"') || 305 (isascii((unsigned char)tp[0]) && 306 isalnum((unsigned char)tp[0]) && 307 isascii((unsigned char)tp[1]) && 308 isalnum((unsigned char)tp[1]) && 309 ISSPC(tp[2]))) { 310 subtypeMimeGeneric = "text/x-source-code"; 311 subtypeMimeSpecific = "text/troff"; 312 subtype = "troff or preprocessor input"; 313 goto subtype_identified; 314 } 315 } 316 317 if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) { 318 subtypeMimeGeneric = "text/x-source-code"; 319 subtypeMimeSpecific = "text/fortran"; 320 subtype = "fortran program"; 321 goto subtype_identified; 322 } 323 324 /* look for tokens from names.h - this is expensive! */ 325 326 i = 0; 327 while (i < ulen) { 328 size_t end; 329 330 /* 331 * skip past any leading space 332 */ 333 while (i < ulen && ISSPC(ubuf[i])) 334 i++; 335 if (i >= ulen) 336 break; 337 338 /* 339 * find the next whitespace 340 */ 341 for (end = i + 1; end < nbytes; end++) 342 if (ISSPC(ubuf[end])) 343 break; 344 345 /* 346 * compare the word thus isolated against the token list 347 */ 348 for (p = names; p < names + NNAMES; p++) { 349 if (ascmatch((const unsigned char *)p->name, ubuf + i, 350 end - i)) { 351 subtype = types[p->type].human; 352 subtypeMimeGeneric = types[p->type].generic_mime; 353 subtypeMimeSpecific = types[p->type].specific_mime; 354 goto subtype_identified; 355 } 356 } 357 358 i = end; 359 } 360 361 subtype_identified: 362 363 /* 364 * Now try to discover other details about the file. 
365 */ 366 for (i = 0; i < ulen; i++) { 367 if (ubuf[i] == '\n') { 368 if (seen_cr) 369 n_crlf++; 370 else 371 n_lf++; 372 last_line_end = i; 373 } else if (seen_cr) 374 n_cr++; 375 376 seen_cr = (ubuf[i] == '\r'); 377 if (seen_cr) 378 last_line_end = i; 379 380 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */ 381 n_nel++; 382 last_line_end = i; 383 } 384 385 /* If this line is _longer_ than MAXLINELEN, remember it. */ 386 if ((int)i > last_line_end + MAXLINELEN) 387 has_long_lines = 1; 388 389 if (ubuf[i] == '\033') 390 has_escapes = 1; 391 if (ubuf[i] == '\b') 392 has_backspace = 1; 393 } 394 395 rv = 1; 396 done: 397 if (nbuf) 398 free(nbuf); 399 if (ubuf) 400 free(ubuf); 401 402 if (rv) { 403 // If we have identified the subtype, return it, otherwise just 404 // text/plain. 405 406 bool found = false; 407 if (subtypeMimeSpecific != NULL) { 408 mimeType->SetTo(subtypeMimeSpecific); 409 if (mimeType->IsInstalled()) 410 found = true; 411 } 412 if (!found && subtypeMimeGeneric != NULL) { 413 mimeType->SetTo(subtypeMimeGeneric); 414 if (mimeType->IsInstalled()) 415 found = true; 416 } 417 if (!found) 418 mimeType->SetTo("text/plain"); 419 } 420 421 return rv; 422 } 423 424 static int 425 ascmatch(const unsigned char *s, const my_unichar *us, size_t ulen) 426 { 427 size_t i; 428 429 for (i = 0; i < ulen; i++) { 430 if (s[i] != us[i]) 431 return 0; 432 } 433 434 if (s[i]) 435 return 0; 436 else 437 return 1; 438 } 439 440 /* 441 * This table reflects a particular philosophy about what constitutes 442 * "text," and there is room for disagreement about it. 443 * 444 * Version 3.31 of the file command considered a file to be ASCII if 445 * each of its characters was approved by either the isascii() or 446 * isalpha() function. On most systems, this would mean that any 447 * file consisting only of characters in the range 0x00 ... 
0x7F 448 * would be called ASCII text, but many systems might reasonably 449 * consider some characters outside this range to be alphabetic, 450 * so the file command would call such characters ASCII. It might 451 * have been more accurate to call this "considered textual on the 452 * local system" than "ASCII." 453 * 454 * It considered a file to be "International language text" if each 455 * of its characters was either an ASCII printing character (according 456 * to the real ASCII standard, not the above test), a character in 457 * the range 0x80 ... 0xFF, or one of the following control characters: 458 * backspace, tab, line feed, vertical tab, form feed, carriage return, 459 * escape. No attempt was made to determine the language in which files 460 * of this type were written. 461 * 462 * 463 * The table below considers a file to be ASCII if all of its characters 464 * are either ASCII printing characters (again, according to the X3.4 465 * standard, not isascii()) or any of the following controls: bell, 466 * backspace, tab, line feed, form feed, carriage return, esc, nextline. 467 * 468 * I include bell because some programs (particularly shell scripts) 469 * use it literally, even though it is rare in normal text. I exclude 470 * vertical tab because it never seems to be used in real text. I also 471 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85), 472 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline 473 * character to. It might be more appropriate to include it in the 8859 474 * set instead of the ASCII set, but it's got to be included in *something* 475 * we recognize or EBCDIC files aren't going to be considered textual. 476 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek 477 * and Latin characters, so these should possibly be allowed. But they 478 * make a real mess on VT100-style displays if they're not paired properly, 479 * so we are probably better off not calling them text. 
480 * 481 * A file is considered to be ISO-8859 text if its characters are all 482 * either ASCII, according to the above definition, or printing characters 483 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF. 484 * 485 * Finally, a file is considered to be international text from some other 486 * character code if its characters are all either ISO-8859 (according to 487 * the above definition) or characters in the range 0x80 ... 0x9F, which 488 * ISO-8859 considers to be control characters but the IBM PC and Macintosh 489 * consider to be printing characters. 490 */ 491 492 #define F 0 /* character never appears in text */ 493 #define T 1 /* character appears in plain ASCII text */ 494 #define I 2 /* character appears in ISO-8859 text */ 495 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 496 497 static char text_chars[256] = { 498 /* BEL BS HT LF FF CR */ 499 F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */ 500 /* ESC */ 501 F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ 502 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ 503 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ 504 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ 505 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ 506 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ 507 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ 508 /* NEL */ 509 X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ 510 X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ 511 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */ 512 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ 513 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ 514 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ 515 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ 516 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ 517 }; 518 519 static int 520 looks_ascii(const 
unsigned char *buf, size_t nbytes, my_unichar *ubuf, 521 size_t *ulen) 522 { 523 int i; 524 525 *ulen = 0; 526 527 for (i = 0; i < (int)nbytes; i++) { 528 int t = text_chars[buf[i]]; 529 530 if (t != T) 531 return 0; 532 533 ubuf[(*ulen)++] = buf[i]; 534 } 535 536 return 1; 537 } 538 539 static int 540 looks_latin1(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen) 541 { 542 int i; 543 544 *ulen = 0; 545 546 for (i = 0; i < (int)nbytes; i++) { 547 int t = text_chars[buf[i]]; 548 549 if (t != T && t != I) 550 return 0; 551 552 ubuf[(*ulen)++] = buf[i]; 553 } 554 555 return 1; 556 } 557 558 static int 559 looks_extended(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, 560 size_t *ulen) 561 { 562 int i; 563 564 *ulen = 0; 565 566 for (i = 0; i < (int)nbytes; i++) { 567 int t = text_chars[buf[i]]; 568 569 if (t != T && t != I && t != X) 570 return 0; 571 572 ubuf[(*ulen)++] = buf[i]; 573 } 574 575 return 1; 576 } 577 578 static int 579 looks_utf8(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen) 580 { 581 int i, n; 582 my_unichar c; 583 int gotone = 0; 584 585 *ulen = 0; 586 587 for (i = 0; i < (int)nbytes; i++) { 588 if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ 589 /* 590 * Even if the whole file is valid UTF-8 sequences, 591 * still reject it if it uses weird control characters. 
592 */ 593 594 if (text_chars[buf[i]] != T) 595 return 0; 596 597 ubuf[(*ulen)++] = buf[i]; 598 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ 599 return 0; 600 } else { /* 11xxxxxx begins UTF-8 */ 601 int following; 602 603 if ((buf[i] & 0x20) == 0) { /* 110xxxxx */ 604 c = buf[i] & 0x1f; 605 following = 1; 606 } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */ 607 c = buf[i] & 0x0f; 608 following = 2; 609 } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */ 610 c = buf[i] & 0x07; 611 following = 3; 612 } else if ((buf[i] & 0x04) == 0) { /* 111110xx */ 613 c = buf[i] & 0x03; 614 following = 4; 615 } else if ((buf[i] & 0x02) == 0) { /* 1111110x */ 616 c = buf[i] & 0x01; 617 following = 5; 618 } else 619 return 0; 620 621 for (n = 0; n < following; n++) { 622 i++; 623 if (i >= (int)nbytes) 624 goto done; 625 626 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) 627 return 0; 628 629 c = (c << 6) + (buf[i] & 0x3f); 630 } 631 632 ubuf[(*ulen)++] = c; 633 gotone = 1; 634 } 635 } 636 done: 637 return gotone; /* don't claim it's UTF-8 if it's all 7-bit */ 638 } 639 640 static int 641 looks_unicode(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, 642 size_t *ulen) 643 { 644 int bigend; 645 int i; 646 647 if (nbytes < 2) 648 return 0; 649 650 if (buf[0] == 0xff && buf[1] == 0xfe) 651 bigend = 0; 652 else if (buf[0] == 0xfe && buf[1] == 0xff) 653 bigend = 1; 654 else 655 return 0; 656 657 *ulen = 0; 658 659 for (i = 2; i + 1 < (int)nbytes; i += 2) { 660 /* XXX fix to properly handle chars > 65536 */ 661 662 if (bigend) 663 ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i]; 664 else 665 ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1]; 666 667 if (ubuf[*ulen - 1] == 0xfffe) 668 return 0; 669 if (ubuf[*ulen - 1] < 128 && 670 text_chars[(size_t)ubuf[*ulen - 1]] != T) 671 return 0; 672 } 673 674 return 1 + bigend; 675 } 676 677 #undef F 678 #undef T 679 #undef I 680 #undef X 681 682 /* 683 * This table maps each EBCDIC character to an (8-bit extended) ASCII 684 * 
character, as specified in the rationale for the dd(1) command in 685 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard. 686 * 687 * Unfortunately it does not seem to correspond exactly to any of the 688 * five variants of EBCDIC documented in IBM's _Enterprise Systems 689 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh 690 * Edition, July, 1999, pp. I-1 - I-4. 691 * 692 * Fortunately, though, all versions of EBCDIC, including this one, agree 693 * on most of the printing characters that also appear in (7-bit) ASCII. 694 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all. 695 * 696 * Fortunately too, there is general agreement that codes 0x00 through 697 * 0x3F represent control characters, 0x41 a nonbreaking space, and the 698 * remainder printing characters. 699 * 700 * This is sufficient to allow us to identify EBCDIC text and to distinguish 701 * between old-style and internationalized examples of text. 702 */ 703 704 static unsigned char ebcdic_to_ascii[] = { 705 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 706 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, 707 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, 708 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, 709 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|', 710 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~', 711 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?', 712 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"', 713 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201, 714 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208, 715 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215, 716 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231, 
717 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237, 718 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243, 719 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249, 720 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255 721 }; 722 723 #ifdef notdef 724 /* 725 * The following EBCDIC-to-ASCII table may relate more closely to reality, 726 * or at least to modern reality. It comes from 727 * 728 * http://ftp.s390.ibm.com/products/oe/bpxqp9.html 729 * 730 * and maps the characters of EBCDIC code page 1047 (the code used for 731 * Unix-derived software on IBM's 390 systems) to the corresponding 732 * characters from ISO 8859-1. 733 * 734 * If this table is used instead of the above one, some of the special 735 * cases for the NEL character can be taken out of the code. 736 */ 737 738 static unsigned char ebcdic_1047_to_8859[] = { 739 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, 740 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, 741 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07, 742 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A, 743 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C, 744 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E, 745 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F, 746 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22, 747 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1, 748 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4, 749 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE, 750 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7, 751 
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5, 752 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF, 753 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5, 754 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F 755 }; 756 #endif 757 758 /* 759 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII. 760 */ 761 static void 762 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out) 763 { 764 int i; 765 766 for (i = 0; i < (int)nbytes; i++) { 767 out[i] = ebcdic_to_ascii[buf[i]]; 768 } 769 } 770 771 772 // #pragma mark - 773 774 775 /*! 776 Determines if the data in inSource is of the STXT format. 777 778 \param header the STXT stream header read in by Identify() or Translate() 779 \param inSource the stream with the STXT data 780 \param outInfo information about the type of data from inSource is stored here 781 \param outType the desired output type for the data in inSource 782 \param ptxtheader if this is not NULL, the TEXT header from 783 inSource is copied to it 784 */ 785 status_t 786 identify_stxt_header(const TranslatorStyledTextStreamHeader &header, 787 BPositionIO *inSource, translator_info *outInfo, uint32 outType, 788 TranslatorStyledTextTextHeader *ptxtheader = NULL) 789 { 790 const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader); 791 const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader); 792 793 uint8 buffer[max(ktxtsize, kstylsize)]; 794 795 // Check the TEXT header 796 TranslatorStyledTextTextHeader txtheader; 797 if (inSource->Read(buffer, ktxtsize) != ktxtsize) 798 return B_NO_TRANSLATOR; 799 800 memcpy(&txtheader, buffer, ktxtsize); 801 if (swap_data(B_UINT32_TYPE, &txtheader, ktxtsize, 802 B_SWAP_BENDIAN_TO_HOST) != B_OK) 803 return B_ERROR; 804 805 if (txtheader.header.magic != 'TEXT' 806 || txtheader.header.header_size != sizeof(TranslatorStyledTextTextHeader) 807 || 
txtheader.charset != B_UNICODE_UTF8) 808 return B_NO_TRANSLATOR; 809 810 // skip the text data 811 off_t seekresult, pos; 812 pos = header.header.header_size + txtheader.header.header_size 813 + txtheader.header.data_size; 814 seekresult = inSource->Seek(txtheader.header.data_size, 815 SEEK_CUR); 816 if (seekresult < pos) 817 return B_NO_TRANSLATOR; 818 if (seekresult > pos) 819 return B_ERROR; 820 821 // check the STYL header (not all STXT files have this) 822 ssize_t read = 0; 823 TranslatorStyledTextStyleHeader stylheader; 824 read = inSource->Read(buffer, kstylsize); 825 if (read < 0) 826 return read; 827 if (read != kstylsize && read != 0) 828 return B_NO_TRANSLATOR; 829 830 // If there is a STYL header 831 if (read == kstylsize) { 832 memcpy(&stylheader, buffer, kstylsize); 833 if (swap_data(B_UINT32_TYPE, &stylheader, kstylsize, 834 B_SWAP_BENDIAN_TO_HOST) != B_OK) 835 return B_ERROR; 836 837 if (stylheader.header.magic != 'STYL' 838 || stylheader.header.header_size != 839 sizeof(TranslatorStyledTextStyleHeader)) 840 return B_NO_TRANSLATOR; 841 } 842 843 // if output TEXT header is supplied, fill it with data 844 if (ptxtheader) { 845 ptxtheader->header.magic = txtheader.header.magic; 846 ptxtheader->header.header_size = txtheader.header.header_size; 847 ptxtheader->header.data_size = txtheader.header.data_size; 848 ptxtheader->charset = txtheader.charset; 849 } 850 851 // return information about the data in the stream 852 outInfo->type = B_STYLED_TEXT_FORMAT; 853 outInfo->group = B_TRANSLATOR_TEXT; 854 outInfo->quality = STXT_IN_QUALITY; 855 outInfo->capability = STXT_IN_CAPABILITY; 856 strcpy(outInfo->name, "Be styled text file"); 857 strcpy(outInfo->MIME, "text/x-vnd.Be-stxt"); 858 859 return B_OK; 860 } 861 862 863 /*! 
864 Determines if the data in \a inSource is of the UTF8 plain 865 866 \param data buffer containing data already read (must be at 867 least DATA_BUFFER_SIZE bytes large) 868 \param nread number of bytes that have already been read from the stream 869 \param header the STXT stream header read in by Identify() or Translate() 870 \param inSource the stream with the STXT data 871 \param outInfo information about the type of data from inSource is stored here 872 \param outType the desired output type for the data in inSource 873 */ 874 status_t 875 identify_text(uint8* data, int32 bytesRead, BPositionIO* source, 876 translator_info* outInfo, uint32 outType, const char*& encoding) 877 { 878 ssize_t readLater = source->Read(data + bytesRead, DATA_BUFFER_SIZE - bytesRead); 879 if (readLater < B_OK) 880 return B_NO_TRANSLATOR; 881 882 bytesRead += readLater; 883 884 // TODO: identify encoding as possible! 885 BMimeType type; 886 if (!file_ascmagic((const unsigned char*)data, bytesRead, &type, encoding)) 887 return B_NO_TRANSLATOR; 888 889 float capability = TEXT_IN_CAPABILITY; 890 if (bytesRead < 20) 891 capability = .1f; 892 893 // return information about the data in the stream 894 outInfo->type = B_TRANSLATOR_TEXT; 895 outInfo->group = B_TRANSLATOR_TEXT; 896 outInfo->quality = TEXT_IN_QUALITY; 897 outInfo->capability = capability; 898 899 char description[B_MIME_TYPE_LENGTH]; 900 if (type.GetLongDescription(description) == B_OK) 901 strlcpy(outInfo->name, description, sizeof(outInfo->name)); 902 else 903 strlcpy(outInfo->name, "Plain text file", sizeof(outInfo->name)); 904 905 //strlcpy(outInfo->MIME, type.Type(), sizeof(outInfo->MIME)); 906 strcpy(outInfo->MIME, "text/plain"); 907 return B_OK; 908 } 909 910 911 // --------------------------------------------------------------- 912 // translate_from_stxt 913 // 914 // Translates the data in inSource to the type outType and stores 915 // the translated data in outDestination. 
916 // 917 // Preconditions: 918 // 919 // Parameters: inSource, the data to be translated 920 // 921 // outDestination, where the translated data is 922 // put 923 // 924 // outType, the type to convert inSource to 925 // 926 // txtheader, the TEXT header from inSource 927 // 928 // 929 // Postconditions: 930 // 931 // Returns: B_BAD_VALUE, if outType is invalid 932 // 933 // B_NO_TRANSLATOR, if this translator doesn't understand the data 934 // 935 // B_ERROR, if there was an error allocating memory or converting 936 // data 937 // 938 // B_OK, if all went well 939 // --------------------------------------------------------------- 940 status_t 941 translate_from_stxt(BPositionIO *inSource, BPositionIO *outDestination, 942 uint32 outType, const TranslatorStyledTextTextHeader &txtheader) 943 { 944 if (inSource->Seek(0, SEEK_SET) != 0) 945 return B_ERROR; 946 947 const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader); 948 const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader); 949 950 bool btoplain; 951 if (outType == B_TRANSLATOR_TEXT) 952 btoplain = true; 953 else if (outType == B_STYLED_TEXT_FORMAT) 954 btoplain = false; 955 else 956 return B_BAD_VALUE; 957 958 uint8 buffer[READ_BUFFER_SIZE]; 959 ssize_t nread = 0, nwritten = 0, nreed = 0, ntotalread = 0; 960 961 // skip to the actual text data when outputting a 962 // plain text file 963 if (btoplain) { 964 if (inSource->Seek(kstxtsize + ktxtsize, SEEK_CUR) != 965 kstxtsize + ktxtsize) 966 return B_ERROR; 967 } 968 969 // Read data from inSource 970 // When outputing B_TRANSLATOR_TEXT, the loop stops when all of 971 // the text data has been read and written. 972 // When outputting B_STYLED_TEXT_FORMAT, the loop stops when all 973 // of the data from inSource has been read and written. 
974 if (btoplain) 975 nreed = min((size_t)READ_BUFFER_SIZE, 976 txtheader.header.data_size - ntotalread); 977 else 978 nreed = READ_BUFFER_SIZE; 979 nread = inSource->Read(buffer, nreed); 980 while (nread > 0) { 981 nwritten = outDestination->Write(buffer, nread); 982 if (nwritten != nread) 983 return B_ERROR; 984 985 if (btoplain) { 986 ntotalread += nread; 987 nreed = min((size_t)READ_BUFFER_SIZE, 988 txtheader.header.data_size - ntotalread); 989 } else 990 nreed = READ_BUFFER_SIZE; 991 nread = inSource->Read(buffer, nreed); 992 } 993 994 if (btoplain && static_cast<ssize_t>(txtheader.header.data_size) != 995 ntotalread) 996 // If not all of the text data was able to be read... 997 return B_NO_TRANSLATOR; 998 else 999 return B_OK; 1000 } 1001 1002 // --------------------------------------------------------------- 1003 // output_headers 1004 // 1005 // Outputs the Stream and Text headers from the B_STYLED_TEXT_FORMAT 1006 // to outDestination, setting the data_size member of the text header 1007 // to text_data_size 1008 // 1009 // Preconditions: 1010 // 1011 // Parameters: outDestination, where the translated data is 1012 // put 1013 // 1014 // text_data_size, number of bytes in data section 1015 // of the TEXT header 1016 // 1017 // 1018 // Postconditions: 1019 // 1020 // Returns: 1021 // 1022 // B_ERROR, if there was an error writing to outDestination or 1023 // an error with converting the byte order 1024 // 1025 // B_OK, if all went well 1026 // --------------------------------------------------------------- 1027 status_t 1028 output_headers(BPositionIO *outDestination, uint32 text_data_size) 1029 { 1030 const int32 kHeadersSize = sizeof(TranslatorStyledTextStreamHeader) + 1031 sizeof(TranslatorStyledTextTextHeader); 1032 status_t result; 1033 TranslatorStyledTextStreamHeader stxtheader; 1034 TranslatorStyledTextTextHeader txtheader; 1035 1036 uint8 buffer[kHeadersSize]; 1037 1038 stxtheader.header.magic = 'STXT'; 1039 stxtheader.header.header_size = 
sizeof(TranslatorStyledTextStreamHeader); 1040 stxtheader.header.data_size = 0; 1041 stxtheader.version = 100; 1042 memcpy(buffer, &stxtheader, stxtheader.header.header_size); 1043 1044 txtheader.header.magic = 'TEXT'; 1045 txtheader.header.header_size = sizeof(TranslatorStyledTextTextHeader); 1046 txtheader.header.data_size = text_data_size; 1047 txtheader.charset = B_UNICODE_UTF8; 1048 memcpy(buffer + stxtheader.header.header_size, &txtheader, 1049 txtheader.header.header_size); 1050 1051 // write out headers in Big Endian byte order 1052 result = swap_data(B_UINT32_TYPE, buffer, kHeadersSize, 1053 B_SWAP_HOST_TO_BENDIAN); 1054 if (result == B_OK) { 1055 ssize_t nwritten = 0; 1056 nwritten = outDestination->Write(buffer, kHeadersSize); 1057 if (nwritten != kHeadersSize) 1058 return B_ERROR; 1059 else 1060 return B_OK; 1061 } 1062 1063 return result; 1064 } 1065 1066 // --------------------------------------------------------------- 1067 // output_styles 1068 // 1069 // Writes out the actual style information into outDestination 1070 // using the data from pflatRunArray 1071 // 1072 // Preconditions: 1073 // 1074 // Parameters: outDestination, where the translated data is 1075 // put 1076 // 1077 // text_size, size in bytes of the text in 1078 // outDestination 1079 // 1080 // data_size, size of pflatRunArray 1081 // 1082 // Postconditions: 1083 // 1084 // Returns: 1085 // 1086 // B_ERROR, if there was an error writing to outDestination or 1087 // an error with converting the byte order 1088 // 1089 // B_OK, if all went well 1090 // --------------------------------------------------------------- 1091 status_t 1092 output_styles(BPositionIO *outDestination, uint32 text_size, 1093 uint8 *pflatRunArray, ssize_t data_size) 1094 { 1095 const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader); 1096 1097 uint8 buffer[kstylsize]; 1098 1099 // output STYL header 1100 TranslatorStyledTextStyleHeader stylheader; 1101 stylheader.header.magic = 'STYL'; 1102 
stylheader.header.header_size = 1103 sizeof(TranslatorStyledTextStyleHeader); 1104 stylheader.header.data_size = data_size; 1105 stylheader.apply_offset = 0; 1106 stylheader.apply_length = text_size; 1107 1108 memcpy(buffer, &stylheader, kstylsize); 1109 if (swap_data(B_UINT32_TYPE, buffer, kstylsize, 1110 B_SWAP_HOST_TO_BENDIAN) != B_OK) 1111 return B_ERROR; 1112 if (outDestination->Write(buffer, kstylsize) != kstylsize) 1113 return B_ERROR; 1114 1115 // output actual style information 1116 if (outDestination->Write(pflatRunArray, 1117 data_size) != data_size) 1118 return B_ERROR; 1119 1120 return B_OK; 1121 } 1122 1123 1124 /*! 1125 Convert the plain text (UTF8) from inSource to plain or 1126 styled text in outDestination 1127 */ 1128 status_t 1129 translate_from_text(BPositionIO* source, const char* encoding, bool forceEncoding, 1130 BPositionIO* destination, uint32 outType) 1131 { 1132 if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT) 1133 return B_BAD_VALUE; 1134 1135 // find the length of the text 1136 off_t size = source->Seek(0, SEEK_END); 1137 if (size < 0) 1138 return (status_t)size; 1139 if (size > UINT32_MAX && outType == B_STYLED_TEXT_FORMAT) 1140 return B_NOT_SUPPORTED; 1141 1142 status_t status = source->Seek(0, SEEK_SET); 1143 if (status < B_OK) 1144 return status; 1145 1146 if (outType == B_STYLED_TEXT_FORMAT) { 1147 // output styled text headers 1148 status = output_headers(destination, (uint32)size); 1149 if (status != B_OK) 1150 return status; 1151 } 1152 1153 class MallocBuffer { 1154 public: 1155 MallocBuffer() : fBuffer(NULL), fSize(0) {} 1156 ~MallocBuffer() { free(fBuffer); } 1157 1158 void* Buffer() { return fBuffer; } 1159 size_t Size() const { return fSize; } 1160 1161 status_t 1162 Allocate(size_t size) 1163 { 1164 fBuffer = malloc(size); 1165 if (fBuffer != NULL) { 1166 fSize = size; 1167 return B_OK; 1168 } 1169 return B_NO_MEMORY; 1170 } 1171 1172 private: 1173 void* fBuffer; 1174 size_t fSize; 1175 } 
encodingBuffer; 1176 BMallocIO encodingIO; 1177 uint32 encodingID = 0; 1178 // defaults to UTF-8 or no encoding 1179 1180 BNode* node = dynamic_cast<BNode*>(source); 1181 if (node != NULL) { 1182 // determine encoding, if available 1183 const BCharacterSet* characterSet = NULL; 1184 bool hasAttribute = false; 1185 if (encoding != NULL && !forceEncoding) { 1186 BString name; 1187 if (node->ReadAttrString("be:encoding", &name) == B_OK) { 1188 encoding = name.String(); 1189 hasAttribute = true; 1190 } else { 1191 int32 value; 1192 ssize_t bytesRead = node->ReadAttr("be:encoding", B_INT32_TYPE, 0, 1193 &value, sizeof(value)); 1194 if (bytesRead == (ssize_t)sizeof(value)) { 1195 hasAttribute = true; 1196 if (value != 65535) 1197 characterSet = BCharacterSetRoster::GetCharacterSetByConversionID(value); 1198 } 1199 } 1200 } else { 1201 hasAttribute = true; 1202 // we don't write the encoding in this case 1203 } 1204 if (characterSet == NULL && encoding != NULL) 1205 characterSet = BCharacterSetRoster::FindCharacterSetByName(encoding); 1206 1207 if (characterSet != NULL) { 1208 encodingID = characterSet->GetConversionID(); 1209 encodingBuffer.Allocate(READ_BUFFER_SIZE * 4); 1210 } 1211 1212 if (!hasAttribute && encoding != NULL) { 1213 // add encoding attribute, so that someone opening the file can 1214 // retrieve it for persistance 1215 node->WriteAttr("be:encoding", B_STRING_TYPE, 0, encoding, 1216 strlen(encoding)); 1217 } 1218 } 1219 1220 off_t outputSize = 0; 1221 ssize_t bytesRead; 1222 int32 state = 0; 1223 1224 // output the actual text part of the data 1225 do { 1226 uint8 buffer[READ_BUFFER_SIZE]; 1227 bytesRead = source->Read(buffer, READ_BUFFER_SIZE); 1228 if (bytesRead < B_OK) 1229 return bytesRead; 1230 if (bytesRead == 0) 1231 break; 1232 1233 if (encodingBuffer.Size() == 0) { 1234 // default, no encoding 1235 ssize_t bytesWritten = destination->Write(buffer, bytesRead); 1236 if (bytesWritten != bytesRead) { 1237 if (bytesWritten < B_OK) 1238 return 
bytesWritten; 1239 1240 return B_ERROR; 1241 } 1242 1243 outputSize += bytesRead; 1244 } else { 1245 // decode text file to UTF-8 1246 char* pos = (char*)buffer; 1247 int32 encodingLength = encodingIO.BufferLength(); 1248 int32 bytesLeft = bytesRead; 1249 int32 bytes; 1250 do { 1251 encodingLength = READ_BUFFER_SIZE * 4; 1252 bytes = bytesLeft; 1253 1254 status = convert_to_utf8(encodingID, pos, &bytes, 1255 (char*)encodingBuffer.Buffer(), &encodingLength, &state); 1256 if (status < B_OK) 1257 return status; 1258 1259 ssize_t bytesWritten = destination->Write(encodingBuffer.Buffer(), 1260 encodingLength); 1261 if (bytesWritten < encodingLength) { 1262 if (bytesWritten < B_OK) 1263 return bytesWritten; 1264 1265 return B_ERROR; 1266 } 1267 1268 pos += bytes; 1269 bytesLeft -= bytes; 1270 outputSize += encodingLength; 1271 } while (encodingLength > 0 && bytesLeft > 0); 1272 } 1273 } while (bytesRead > 0); 1274 1275 if (outType != B_STYLED_TEXT_FORMAT) 1276 return B_OK; 1277 1278 if (encodingBuffer.Size() != 0 && size != outputSize) { 1279 if (outputSize > UINT32_MAX) 1280 return B_NOT_SUPPORTED; 1281 1282 // we need to update the header as the decoded text size has changed 1283 status = destination->Seek(0, SEEK_SET); 1284 if (status == B_OK) 1285 status = output_headers(destination, (uint32)outputSize); 1286 if (status == B_OK) 1287 status = destination->Seek(0, SEEK_END); 1288 1289 if (status < B_OK) 1290 return status; 1291 } 1292 1293 // Read file attributes if outputting styled data 1294 // and source is a BNode object 1295 1296 if (node == NULL) 1297 return B_OK; 1298 1299 // Try to read styles - we only propagate an error if the actual on-disk 1300 // data is likely to be okay 1301 1302 const char *kAttrName = "styles"; 1303 attr_info info; 1304 if (node->GetAttrInfo(kAttrName, &info) != B_OK) 1305 return B_OK; 1306 1307 if (info.type != B_RAW_TYPE || info.size < 160) { 1308 // styles seem to be broken, but since we got the text, 1309 // we don't propagate the 
error 1310 return B_OK; 1311 } 1312 1313 uint8* flatRunArray = new (std::nothrow) uint8[info.size]; 1314 if (flatRunArray == NULL) 1315 return B_NO_MEMORY; 1316 1317 bytesRead = node->ReadAttr(kAttrName, B_RAW_TYPE, 0, flatRunArray, info.size); 1318 if (bytesRead != info.size) 1319 return B_OK; 1320 1321 output_styles(destination, size, flatRunArray, info.size); 1322 1323 delete[] flatRunArray; 1324 return B_OK; 1325 } 1326 1327 1328 // #pragma mark - 1329 1330 1331 STXTTranslator::STXTTranslator() 1332 : BaseTranslator("StyledEdit Files", "StyledEdit files translator", 1333 STXT_TRANSLATOR_VERSION, 1334 gInputFormats, sizeof(gInputFormats) / sizeof(translation_format), 1335 gOutputFormats, sizeof(gOutputFormats) / sizeof(translation_format), 1336 "STXTTranslator_Settings", 1337 gDefaultSettings, sizeof(gDefaultSettings) / sizeof(TranSetting), 1338 B_TRANSLATOR_TEXT, B_STYLED_TEXT_FORMAT) 1339 { 1340 } 1341 1342 1343 STXTTranslator::~STXTTranslator() 1344 { 1345 } 1346 1347 1348 status_t 1349 STXTTranslator::Identify(BPositionIO *inSource, 1350 const translation_format *inFormat, BMessage *ioExtension, 1351 translator_info *outInfo, uint32 outType) 1352 { 1353 if (!outType) 1354 outType = B_TRANSLATOR_TEXT; 1355 if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT) 1356 return B_NO_TRANSLATOR; 1357 1358 const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader); 1359 1360 uint8 buffer[DATA_BUFFER_SIZE]; 1361 status_t nread = 0; 1362 // Read in the header to determine 1363 // if the data is supported 1364 nread = inSource->Read(buffer, kstxtsize); 1365 if (nread < 0) 1366 return nread; 1367 1368 // read in enough data to fill the stream header 1369 if (nread == kstxtsize) { 1370 TranslatorStyledTextStreamHeader header; 1371 memcpy(&header, buffer, kstxtsize); 1372 if (swap_data(B_UINT32_TYPE, &header, kstxtsize, 1373 B_SWAP_BENDIAN_TO_HOST) != B_OK) 1374 return B_ERROR; 1375 1376 if (header.header.magic == B_STYLED_TEXT_FORMAT 1377 && 
header.header.header_size == (int32)kstxtsize 1378 && header.header.data_size == 0 1379 && header.version == 100) 1380 return identify_stxt_header(header, inSource, outInfo, outType); 1381 } 1382 1383 // if the data is not styled text, check if it is plain text 1384 const char* encoding; 1385 return identify_text(buffer, nread, inSource, outInfo, outType, encoding); 1386 } 1387 1388 1389 status_t 1390 STXTTranslator::Translate(BPositionIO* source, const translator_info* info, 1391 BMessage* ioExtension, uint32 outType, BPositionIO* outDestination) 1392 { 1393 if (!outType) 1394 outType = B_TRANSLATOR_TEXT; 1395 if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT) 1396 return B_NO_TRANSLATOR; 1397 1398 const ssize_t headerSize = sizeof(TranslatorStyledTextStreamHeader); 1399 uint8 buffer[DATA_BUFFER_SIZE]; 1400 status_t result; 1401 translator_info outInfo; 1402 // Read in the header to determine 1403 // if the data is supported 1404 ssize_t bytesRead = source->Read(buffer, headerSize); 1405 if (bytesRead < 0) 1406 return bytesRead; 1407 1408 // read in enough data to fill the stream header 1409 if (bytesRead == headerSize) { 1410 TranslatorStyledTextStreamHeader header; 1411 memcpy(&header, buffer, headerSize); 1412 if (swap_data(B_UINT32_TYPE, &header, headerSize, 1413 B_SWAP_BENDIAN_TO_HOST) != B_OK) 1414 return B_ERROR; 1415 1416 if (header.header.magic == B_STYLED_TEXT_FORMAT 1417 && header.header.header_size == sizeof(TranslatorStyledTextStreamHeader) 1418 && header.header.data_size == 0 1419 && header.version == 100) { 1420 TranslatorStyledTextTextHeader textHeader; 1421 result = identify_stxt_header(header, source, &outInfo, outType, 1422 &textHeader); 1423 if (result != B_OK) 1424 return result; 1425 1426 return translate_from_stxt(source, outDestination, outType, textHeader); 1427 } 1428 } 1429 1430 // if the data is not styled text, check if it is ASCII text 1431 bool forceEncoding = false; 1432 const char* encoding = NULL; 1433 result = 
identify_text(buffer, bytesRead, source, &outInfo, outType, encoding); 1434 if (result != B_OK) 1435 return result; 1436 1437 if (ioExtension != NULL) { 1438 const char* value; 1439 if (ioExtension->FindString("be:encoding", &value) == B_OK 1440 && value[0]) { 1441 // override encoding 1442 encoding = value; 1443 forceEncoding = true; 1444 } 1445 } 1446 1447 return translate_from_text(source, encoding, forceEncoding, outDestination, outType); 1448 } 1449 1450 1451 BView * 1452 STXTTranslator::NewConfigView(TranslatorSettings *settings) 1453 { 1454 return new STXTView(BRect(0, 0, 225, 175), "STXTTranslator Settings", 1455 B_FOLLOW_ALL, B_WILL_DRAW, settings); 1456 } 1457 1458