/*
 * Copyright 2002-2007, Haiku, Inc. All Rights Reserved.
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *		Michael Wilber
 *		Axel Dörfler, axeld@pinc-software.de
 */


#include "STXTTranslator.h"
#include "STXTView.h"

#include <CharacterSet.h>
#include <CharacterSetRoster.h>
#include <MimeType.h>
#include <String.h>
#include <UTF8.h>

#include <new>
#include <string.h>
#include <stdio.h>
#include <stdint.h>


using namespace BPrivate;


// Size in bytes of the stack buffer used to copy stream data in chunks
// (see translate_from_stxt()).
#define READ_BUFFER_SIZE 32768
// Size in bytes of the sample buffer that identify_text() fills and inspects
// when deciding whether a stream is plain text.
#define DATA_BUFFER_SIZE 256

// The input formats that this translator supports.
// Each entry lists, in order: format type, format group, quality,
// capability, MIME type and a human readable name.
// NOTE(review): field order taken from the initializers below; confirm
// against the translation_format declaration.
translation_format gInputFormats[] = {
	{
		B_TRANSLATOR_TEXT,
		B_TRANSLATOR_TEXT,
		TEXT_IN_QUALITY,
		TEXT_IN_CAPABILITY,
		"text/plain",
		"Plain text file"
	},
	{
		B_STYLED_TEXT_FORMAT,
		B_TRANSLATOR_TEXT,
		STXT_IN_QUALITY,
		STXT_IN_CAPABILITY,
		"text/x-vnd.Be-stxt",
		"Be styled text file"
	}
};

// The output formats that this translator supports.
// Same layout as gInputFormats, using the *_OUT_* quality/capability values.
translation_format gOutputFormats[] = {
	{
		B_TRANSLATOR_TEXT,
		B_TRANSLATOR_TEXT,
		TEXT_OUT_QUALITY,
		TEXT_OUT_CAPABILITY,
		"text/plain",
		"Plain text file"
	},
	{
		B_STYLED_TEXT_FORMAT,
		B_TRANSLATOR_TEXT,
		STXT_OUT_QUALITY,
		STXT_OUT_CAPABILITY,
		"text/x-vnd.Be-stxt",
		"Be styled text file"
	}
};

// Default settings for the Translator
// Both boolean extension settings (header-only, data-only) default to false.
TranSetting gDefaultSettings[] = {
	{B_TRANSLATOR_EXT_HEADER_ONLY, TRAN_SETTING_BOOL, false},
	{B_TRANSLATOR_EXT_DATA_ONLY, TRAN_SETTING_BOOL, false}
};

// ---------------------------------------------------------------
// make_nth_translator
//
// Creates a STXTTranslator object to be used by BTranslatorRoster
//
// Preconditions:
//
// Parameters: n,		The translator to return.
Since 86 // STXTTranslator only publishes one 87 // translator, it only returns a 88 // STXTTranslator if n == 0 89 // 90 // you, The image_id of the add-on that 91 // contains code (not used). 92 // 93 // flags, Has no meaning yet, should be 0. 94 // 95 // Postconditions: 96 // 97 // Returns: NULL if n is not zero, 98 // a new STXTTranslator if n is zero 99 // --------------------------------------------------------------- 100 BTranslator * 101 make_nth_translator(int32 n, image_id you, uint32 flags, ...) 102 { 103 if (!n) 104 return new (std::nothrow) STXTTranslator(); 105 106 return NULL; 107 } 108 109 110 // #pragma mark - ascmagic.c from the BSD file tool 111 /* 112 * The following code has been taken from version 4.17 of the BSD file tool, 113 * file ascmagic.c, modified for our purpose. 114 */ 115 116 /* 117 * Copyright (c) Ian F. Darwin 1986-1995. 118 * Software written by Ian F. Darwin and others; 119 * maintained 1995-present by Christos Zoulas and others. 120 * 121 * Redistribution and use in source and binary forms, with or without 122 * modification, are permitted provided that the following conditions 123 * are met: 124 * 1. Redistributions of source code must retain the above copyright 125 * notice immediately at the beginning of the file, without modification, 126 * this list of conditions, and the following disclaimer. 127 * 2. Redistributions in binary form must reproduce the above copyright 128 * notice, this list of conditions and the following disclaimer in the 129 * documentation and/or other materials provided with the distribution. 130 * 131 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 132 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 133 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 134 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR 135 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 136 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 137 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 138 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 139 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 140 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 141 * SUCH DAMAGE. 142 */ 143 /* 144 * ASCII magic -- file types that we know based on keywords 145 * that can appear anywhere in the file. 146 * 147 * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000, 148 * to handle character codes other than ASCII on a unified basis. 149 * 150 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit 151 * international characters, now subsumed into this file. 152 */ 153 154 #include <stdio.h> 155 #include <string.h> 156 #include <memory.h> 157 #include <ctype.h> 158 #include <stdlib.h> 159 #include <unistd.h> 160 #include "names.h" 161 162 typedef unsigned long my_unichar; 163 164 #define MAXLINELEN 300 /* longest sane line length */ 165 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \ 166 || (x) == 0x85 || (x) == '\f') 167 168 static int looks_ascii(const unsigned char *, size_t, my_unichar *, size_t *); 169 static int looks_utf8(const unsigned char *, size_t, my_unichar *, size_t *); 170 static int looks_unicode(const unsigned char *, size_t, my_unichar *, size_t *); 171 static int looks_latin1(const unsigned char *, size_t, my_unichar *, size_t *); 172 static int looks_extended(const unsigned char *, size_t, my_unichar *, size_t *); 173 static void from_ebcdic(const unsigned char *, size_t, unsigned char *); 174 static int ascmatch(const unsigned char *, const my_unichar *, size_t); 175 176 177 static int 178 file_ascmagic(const unsigned char *buf, 
size_t nbytes, BMimeType* mimeType, 179 const char*& encoding) 180 { 181 size_t i; 182 unsigned char *nbuf = NULL; 183 my_unichar *ubuf = NULL; 184 size_t ulen; 185 struct names *p; 186 int rv = -1; 187 188 const char *code = NULL; 189 encoding = NULL; 190 const char *type = NULL; 191 const char *subtype = NULL; 192 const char *subtype_mime = NULL; 193 194 int has_escapes = 0; 195 int has_backspace = 0; 196 int seen_cr = 0; 197 198 int n_crlf = 0; 199 int n_lf = 0; 200 int n_cr = 0; 201 int n_nel = 0; 202 203 int last_line_end = -1; 204 int has_long_lines = 0; 205 206 if ((nbuf = (unsigned char*)malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL) 207 goto done; 208 if ((ubuf = (my_unichar*)malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL) 209 goto done; 210 211 /* 212 * Then try to determine whether it's any character code we can 213 * identify. Each of these tests, if it succeeds, will leave 214 * the text converted into one-my_unichar-per-character Unicode in 215 * ubuf, and the number of characters converted in ulen. 
216 */ 217 if (looks_ascii(buf, nbytes, ubuf, &ulen)) { 218 code = "ASCII"; 219 encoding = NULL; //"us-ascii"; 220 type = "text"; 221 } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) { 222 code = "UTF-8 Unicode"; 223 encoding = NULL; // "UTF-8"; 224 type = "text"; 225 } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) { 226 if (i == 1) { 227 code = "Little-endian UTF-16 Unicode"; 228 encoding = "UTF-16"; 229 } else { 230 code = "Big-endian UTF-16 Unicode"; 231 encoding = "UTF-16"; 232 } 233 234 type = "character data"; 235 } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) { 236 code = "ISO-8859"; 237 type = "text"; 238 encoding = "iso-8859-1"; 239 } else if (looks_extended(buf, nbytes, ubuf, &ulen)) { 240 code = "Non-ISO extended-ASCII"; 241 type = "text"; 242 encoding = "unknown"; 243 } else { 244 from_ebcdic(buf, nbytes, nbuf); 245 246 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) { 247 code = "EBCDIC"; 248 type = "character data"; 249 encoding = "ebcdic"; 250 } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) { 251 code = "International EBCDIC"; 252 type = "character data"; 253 encoding = "ebcdic"; 254 } else { 255 rv = 0; 256 goto done; /* doesn't look like text at all */ 257 } 258 } 259 260 if (nbytes <= 1) { 261 rv = 0; 262 goto done; 263 } 264 265 /* 266 * for troff, look for . + letter + letter or .\"; 267 * this must be done to disambiguate tar archives' ./file 268 * and other trash from real troff input. 269 * 270 * I believe Plan 9 troff allows non-ASCII characters in the names 271 * of macros, so this test might possibly fail on such a file. 
272 */ 273 if (*ubuf == '.') { 274 my_unichar *tp = ubuf + 1; 275 276 while (ISSPC(*tp)) 277 ++tp; /* skip leading whitespace */ 278 if ((tp[0] == '\\' && tp[1] == '\"') || 279 (isascii((unsigned char)tp[0]) && 280 isalnum((unsigned char)tp[0]) && 281 isascii((unsigned char)tp[1]) && 282 isalnum((unsigned char)tp[1]) && 283 ISSPC(tp[2]))) { 284 subtype_mime = "text/troff"; 285 subtype = "troff or preprocessor input"; 286 goto subtype_identified; 287 } 288 } 289 290 if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) { 291 subtype_mime = "text/fortran"; 292 subtype = "fortran program"; 293 goto subtype_identified; 294 } 295 296 /* look for tokens from names.h - this is expensive! */ 297 298 i = 0; 299 while (i < ulen) { 300 size_t end; 301 302 /* 303 * skip past any leading space 304 */ 305 while (i < ulen && ISSPC(ubuf[i])) 306 i++; 307 if (i >= ulen) 308 break; 309 310 /* 311 * find the next whitespace 312 */ 313 for (end = i + 1; end < nbytes; end++) 314 if (ISSPC(ubuf[end])) 315 break; 316 317 /* 318 * compare the word thus isolated against the token list 319 */ 320 for (p = names; p < names + NNAMES; p++) { 321 if (ascmatch((const unsigned char *)p->name, ubuf + i, 322 end - i)) { 323 subtype = types[p->type].human; 324 subtype_mime = types[p->type].mime; 325 goto subtype_identified; 326 } 327 } 328 329 i = end; 330 } 331 332 subtype_identified: 333 334 /* 335 * Now try to discover other details about the file. 336 */ 337 for (i = 0; i < ulen; i++) { 338 if (ubuf[i] == '\n') { 339 if (seen_cr) 340 n_crlf++; 341 else 342 n_lf++; 343 last_line_end = i; 344 } else if (seen_cr) 345 n_cr++; 346 347 seen_cr = (ubuf[i] == '\r'); 348 if (seen_cr) 349 last_line_end = i; 350 351 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */ 352 n_nel++; 353 last_line_end = i; 354 } 355 356 /* If this line is _longer_ than MAXLINELEN, remember it. 
*/ 357 if ((int)i > last_line_end + MAXLINELEN) 358 has_long_lines = 1; 359 360 if (ubuf[i] == '\033') 361 has_escapes = 1; 362 if (ubuf[i] == '\b') 363 has_backspace = 1; 364 } 365 366 rv = 1; 367 done: 368 if (nbuf) 369 free(nbuf); 370 if (ubuf) 371 free(ubuf); 372 373 if (rv) { 374 // If we have identified the subtype, return it, otherwise just 375 // text/plain. 376 if (subtype_mime) 377 mimeType->SetTo(subtype_mime); 378 else 379 mimeType->SetTo("text/plain"); 380 } 381 382 return rv; 383 } 384 385 static int 386 ascmatch(const unsigned char *s, const my_unichar *us, size_t ulen) 387 { 388 size_t i; 389 390 for (i = 0; i < ulen; i++) { 391 if (s[i] != us[i]) 392 return 0; 393 } 394 395 if (s[i]) 396 return 0; 397 else 398 return 1; 399 } 400 401 /* 402 * This table reflects a particular philosophy about what constitutes 403 * "text," and there is room for disagreement about it. 404 * 405 * Version 3.31 of the file command considered a file to be ASCII if 406 * each of its characters was approved by either the isascii() or 407 * isalpha() function. On most systems, this would mean that any 408 * file consisting only of characters in the range 0x00 ... 0x7F 409 * would be called ASCII text, but many systems might reasonably 410 * consider some characters outside this range to be alphabetic, 411 * so the file command would call such characters ASCII. It might 412 * have been more accurate to call this "considered textual on the 413 * local system" than "ASCII." 414 * 415 * It considered a file to be "International language text" if each 416 * of its characters was either an ASCII printing character (according 417 * to the real ASCII standard, not the above test), a character in 418 * the range 0x80 ... 0xFF, or one of the following control characters: 419 * backspace, tab, line feed, vertical tab, form feed, carriage return, 420 * escape. No attempt was made to determine the language in which files 421 * of this type were written. 
 *
 *
 * The table below considers a file to be ASCII if all of its characters
 * are either ASCII printing characters (again, according to the X3.4
 * standard, not isascii()) or any of the following controls: bell,
 * backspace, tab, line feed, form feed, carriage return, esc, nextline.
 *
 * I include bell because some programs (particularly shell scripts)
 * use it literally, even though it is rare in normal text.  I exclude
 * vertical tab because it never seems to be used in real text.  I also
 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 * character to.  It might be more appropriate to include it in the 8859
 * set instead of the ASCII set, but it's got to be included in *something*
 * we recognize or EBCDIC files aren't going to be considered textual.
 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 * and Latin characters, so these should possibly be allowed.  But they
 * make a real mess on VT100-style displays if they're not paired properly,
 * so we are probably better off not calling them text.
 *
 * A file is considered to be ISO-8859 text if its characters are all
 * either ASCII, according to the above definition, or printing characters
 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
 *
 * Finally, a file is considered to be international text from some other
 * character code if its characters are all either ISO-8859 (according to
 * the above definition) or characters in the range 0x80 ... 0x9F, which
 * ISO-8859 considers to be control characters but the IBM PC and Macintosh
 * consider to be printing characters.
451 */ 452 453 #define F 0 /* character never appears in text */ 454 #define T 1 /* character appears in plain ASCII text */ 455 #define I 2 /* character appears in ISO-8859 text */ 456 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 457 458 static char text_chars[256] = { 459 /* BEL BS HT LF FF CR */ 460 F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */ 461 /* ESC */ 462 F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ 463 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ 464 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ 465 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ 466 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ 467 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ 468 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ 469 /* NEL */ 470 X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ 471 X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ 472 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */ 473 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ 474 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ 475 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ 476 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ 477 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ 478 }; 479 480 static int 481 looks_ascii(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, 482 size_t *ulen) 483 { 484 int i; 485 486 *ulen = 0; 487 488 for (i = 0; i < (int)nbytes; i++) { 489 int t = text_chars[buf[i]]; 490 491 if (t != T) 492 return 0; 493 494 ubuf[(*ulen)++] = buf[i]; 495 } 496 497 return 1; 498 } 499 500 static int 501 looks_latin1(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen) 502 { 503 int i; 504 505 *ulen = 0; 506 507 for (i = 0; i < (int)nbytes; i++) { 508 int t = text_chars[buf[i]]; 509 510 if (t != T && t != I) 511 return 0; 512 513 ubuf[(*ulen)++] = buf[i]; 514 } 
515 516 return 1; 517 } 518 519 static int 520 looks_extended(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, 521 size_t *ulen) 522 { 523 int i; 524 525 *ulen = 0; 526 527 for (i = 0; i < (int)nbytes; i++) { 528 int t = text_chars[buf[i]]; 529 530 if (t != T && t != I && t != X) 531 return 0; 532 533 ubuf[(*ulen)++] = buf[i]; 534 } 535 536 return 1; 537 } 538 539 static int 540 looks_utf8(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen) 541 { 542 int i, n; 543 my_unichar c; 544 int gotone = 0; 545 546 *ulen = 0; 547 548 for (i = 0; i < (int)nbytes; i++) { 549 if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ 550 /* 551 * Even if the whole file is valid UTF-8 sequences, 552 * still reject it if it uses weird control characters. 553 */ 554 555 if (text_chars[buf[i]] != T) 556 return 0; 557 558 ubuf[(*ulen)++] = buf[i]; 559 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ 560 return 0; 561 } else { /* 11xxxxxx begins UTF-8 */ 562 int following; 563 564 if ((buf[i] & 0x20) == 0) { /* 110xxxxx */ 565 c = buf[i] & 0x1f; 566 following = 1; 567 } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */ 568 c = buf[i] & 0x0f; 569 following = 2; 570 } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */ 571 c = buf[i] & 0x07; 572 following = 3; 573 } else if ((buf[i] & 0x04) == 0) { /* 111110xx */ 574 c = buf[i] & 0x03; 575 following = 4; 576 } else if ((buf[i] & 0x02) == 0) { /* 1111110x */ 577 c = buf[i] & 0x01; 578 following = 5; 579 } else 580 return 0; 581 582 for (n = 0; n < following; n++) { 583 i++; 584 if (i >= (int)nbytes) 585 goto done; 586 587 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) 588 return 0; 589 590 c = (c << 6) + (buf[i] & 0x3f); 591 } 592 593 ubuf[(*ulen)++] = c; 594 gotone = 1; 595 } 596 } 597 done: 598 return gotone; /* don't claim it's UTF-8 if it's all 7-bit */ 599 } 600 601 static int 602 looks_unicode(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, 603 size_t *ulen) 604 { 605 int bigend; 
606 int i; 607 608 if (nbytes < 2) 609 return 0; 610 611 if (buf[0] == 0xff && buf[1] == 0xfe) 612 bigend = 0; 613 else if (buf[0] == 0xfe && buf[1] == 0xff) 614 bigend = 1; 615 else 616 return 0; 617 618 *ulen = 0; 619 620 for (i = 2; i + 1 < (int)nbytes; i += 2) { 621 /* XXX fix to properly handle chars > 65536 */ 622 623 if (bigend) 624 ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i]; 625 else 626 ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1]; 627 628 if (ubuf[*ulen - 1] == 0xfffe) 629 return 0; 630 if (ubuf[*ulen - 1] < 128 && 631 text_chars[(size_t)ubuf[*ulen - 1]] != T) 632 return 0; 633 } 634 635 return 1 + bigend; 636 } 637 638 #undef F 639 #undef T 640 #undef I 641 #undef X 642 643 /* 644 * This table maps each EBCDIC character to an (8-bit extended) ASCII 645 * character, as specified in the rationale for the dd(1) command in 646 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard. 647 * 648 * Unfortunately it does not seem to correspond exactly to any of the 649 * five variants of EBCDIC documented in IBM's _Enterprise Systems 650 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh 651 * Edition, July, 1999, pp. I-1 - I-4. 652 * 653 * Fortunately, though, all versions of EBCDIC, including this one, agree 654 * on most of the printing characters that also appear in (7-bit) ASCII. 655 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all. 656 * 657 * Fortunately too, there is general agreement that codes 0x00 through 658 * 0x3F represent control characters, 0x41 a nonbreaking space, and the 659 * remainder printing characters. 660 * 661 * This is sufficient to allow us to identify EBCDIC text and to distinguish 662 * between old-style and internationalized examples of text. 
 */

/*
 * Indexed by the EBCDIC byte value (16 rows of 16 entries, one row per high
 * nibble); each entry is the corresponding 8-bit extended ASCII byte.
 * from_ebcdic() indexes it with raw input bytes.
 */
static unsigned char ebcdic_to_ascii[] = {
  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};

#ifdef notdef
/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality.  It comes from
 *
 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
697 */ 698 699 static unsigned char ebcdic_1047_to_8859[] = { 700 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, 701 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, 702 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07, 703 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A, 704 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C, 705 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E, 706 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F, 707 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22, 708 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1, 709 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4, 710 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE, 711 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7, 712 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5, 713 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF, 714 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5, 715 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F 716 }; 717 #endif 718 719 /* 720 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII. 721 */ 722 static void 723 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out) 724 { 725 int i; 726 727 for (i = 0; i < (int)nbytes; i++) { 728 out[i] = ebcdic_to_ascii[buf[i]]; 729 } 730 } 731 732 733 // #pragma mark - 734 735 736 /*! 737 Determines if the data in inSource is of the STXT format. 
738 739 \param header the STXT stream header read in by Identify() or Translate() 740 \param inSource the stream with the STXT data 741 \param outInfo information about the type of data from inSource is stored here 742 \param outType the desired output type for the data in inSource 743 \param ptxtheader if this is not NULL, the TEXT header from 744 inSource is copied to it 745 */ 746 status_t 747 identify_stxt_header(const TranslatorStyledTextStreamHeader &header, 748 BPositionIO *inSource, translator_info *outInfo, uint32 outType, 749 TranslatorStyledTextTextHeader *ptxtheader = NULL) 750 { 751 const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader); 752 const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader); 753 754 uint8 buffer[max(ktxtsize, kstylsize)]; 755 756 // Check the TEXT header 757 TranslatorStyledTextTextHeader txtheader; 758 if (inSource->Read(buffer, ktxtsize) != ktxtsize) 759 return B_NO_TRANSLATOR; 760 761 memcpy(&txtheader, buffer, ktxtsize); 762 if (swap_data(B_UINT32_TYPE, &txtheader, ktxtsize, 763 B_SWAP_BENDIAN_TO_HOST) != B_OK) 764 return B_ERROR; 765 766 if (txtheader.header.magic != 'TEXT' 767 || txtheader.header.header_size != sizeof(TranslatorStyledTextTextHeader) 768 || txtheader.charset != B_UNICODE_UTF8) 769 return B_NO_TRANSLATOR; 770 771 // skip the text data 772 off_t seekresult, pos; 773 pos = header.header.header_size + txtheader.header.header_size 774 + txtheader.header.data_size; 775 seekresult = inSource->Seek(txtheader.header.data_size, 776 SEEK_CUR); 777 if (seekresult < pos) 778 return B_NO_TRANSLATOR; 779 if (seekresult > pos) 780 return B_ERROR; 781 782 // check the STYL header (not all STXT files have this) 783 ssize_t read = 0; 784 TranslatorStyledTextStyleHeader stylheader; 785 read = inSource->Read(buffer, kstylsize); 786 if (read < 0) 787 return read; 788 if (read != kstylsize && read != 0) 789 return B_NO_TRANSLATOR; 790 791 // If there is a STYL header 792 if (read == kstylsize) { 793 
memcpy(&stylheader, buffer, kstylsize); 794 if (swap_data(B_UINT32_TYPE, &stylheader, kstylsize, 795 B_SWAP_BENDIAN_TO_HOST) != B_OK) 796 return B_ERROR; 797 798 if (stylheader.header.magic != 'STYL' 799 || stylheader.header.header_size != 800 sizeof(TranslatorStyledTextStyleHeader)) 801 return B_NO_TRANSLATOR; 802 } 803 804 // if output TEXT header is supplied, fill it with data 805 if (ptxtheader) { 806 ptxtheader->header.magic = txtheader.header.magic; 807 ptxtheader->header.header_size = txtheader.header.header_size; 808 ptxtheader->header.data_size = txtheader.header.data_size; 809 ptxtheader->charset = txtheader.charset; 810 } 811 812 // return information about the data in the stream 813 outInfo->type = B_STYLED_TEXT_FORMAT; 814 outInfo->group = B_TRANSLATOR_TEXT; 815 outInfo->quality = STXT_IN_QUALITY; 816 outInfo->capability = STXT_IN_CAPABILITY; 817 strcpy(outInfo->name, "Be styled text file"); 818 strcpy(outInfo->MIME, "text/x-vnd.Be-stxt"); 819 820 return B_OK; 821 } 822 823 824 /*! 825 Determines if the data in \a inSource is of the UTF8 plain 826 827 \param data buffer containing data already read (must be at 828 least DATA_BUFFER_SIZE bytes large) 829 \param nread number of bytes that have already been read from the stream 830 \param header the STXT stream header read in by Identify() or Translate() 831 \param inSource the stream with the STXT data 832 \param outInfo information about the type of data from inSource is stored here 833 \param outType the desired output type for the data in inSource 834 */ 835 status_t 836 identify_text(uint8* data, int32 bytesRead, BPositionIO* source, 837 translator_info* outInfo, uint32 outType, const char*& encoding) 838 { 839 ssize_t readLater = source->Read(data + bytesRead, DATA_BUFFER_SIZE - bytesRead); 840 if (readLater < B_OK) 841 return B_NO_TRANSLATOR; 842 843 bytesRead += readLater; 844 845 // TODO: identify encoding as possible! 
846 BMimeType type; 847 if (!file_ascmagic((const unsigned char*)data, bytesRead, &type, encoding)) 848 return B_NO_TRANSLATOR; 849 850 float capability = TEXT_IN_CAPABILITY; 851 if (bytesRead < 20) 852 capability = .1f; 853 854 // return information about the data in the stream 855 outInfo->type = B_TRANSLATOR_TEXT; 856 outInfo->group = B_TRANSLATOR_TEXT; 857 outInfo->quality = TEXT_IN_QUALITY; 858 outInfo->capability = capability; 859 860 char description[B_MIME_TYPE_LENGTH]; 861 if (type.GetLongDescription(description) == B_OK) 862 strlcpy(outInfo->name, description, sizeof(outInfo->name)); 863 else 864 strlcpy(outInfo->name, "Plain text file", sizeof(outInfo->name)); 865 866 //strlcpy(outInfo->MIME, type.Type(), sizeof(outInfo->MIME)); 867 strcpy(outInfo->MIME, "text/plain"); 868 return B_OK; 869 } 870 871 872 // --------------------------------------------------------------- 873 // translate_from_stxt 874 // 875 // Translates the data in inSource to the type outType and stores 876 // the translated data in outDestination. 
877 // 878 // Preconditions: 879 // 880 // Parameters: inSource, the data to be translated 881 // 882 // outDestination, where the translated data is 883 // put 884 // 885 // outType, the type to convert inSource to 886 // 887 // txtheader, the TEXT header from inSource 888 // 889 // 890 // Postconditions: 891 // 892 // Returns: B_BAD_VALUE, if outType is invalid 893 // 894 // B_NO_TRANSLATOR, if this translator doesn't understand the data 895 // 896 // B_ERROR, if there was an error allocating memory or converting 897 // data 898 // 899 // B_OK, if all went well 900 // --------------------------------------------------------------- 901 status_t 902 translate_from_stxt(BPositionIO *inSource, BPositionIO *outDestination, 903 uint32 outType, const TranslatorStyledTextTextHeader &txtheader) 904 { 905 if (inSource->Seek(0, SEEK_SET) != 0) 906 return B_ERROR; 907 908 const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader); 909 const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader); 910 911 bool btoplain; 912 if (outType == B_TRANSLATOR_TEXT) 913 btoplain = true; 914 else if (outType == B_STYLED_TEXT_FORMAT) 915 btoplain = false; 916 else 917 return B_BAD_VALUE; 918 919 uint8 buffer[READ_BUFFER_SIZE]; 920 ssize_t nread = 0, nwritten = 0, nreed = 0, ntotalread = 0; 921 922 // skip to the actual text data when outputting a 923 // plain text file 924 if (btoplain) { 925 if (inSource->Seek(kstxtsize + ktxtsize, SEEK_CUR) != 926 kstxtsize + ktxtsize) 927 return B_ERROR; 928 } 929 930 // Read data from inSource 931 // When outputing B_TRANSLATOR_TEXT, the loop stops when all of 932 // the text data has been read and written. 933 // When outputting B_STYLED_TEXT_FORMAT, the loop stops when all 934 // of the data from inSource has been read and written. 
935 if (btoplain) 936 nreed = min(READ_BUFFER_SIZE, 937 txtheader.header.data_size - ntotalread); 938 else 939 nreed = READ_BUFFER_SIZE; 940 nread = inSource->Read(buffer, nreed); 941 while (nread > 0) { 942 nwritten = outDestination->Write(buffer, nread); 943 if (nwritten != nread) 944 return B_ERROR; 945 946 if (btoplain) { 947 ntotalread += nread; 948 nreed = min(READ_BUFFER_SIZE, 949 txtheader.header.data_size - ntotalread); 950 } else 951 nreed = READ_BUFFER_SIZE; 952 nread = inSource->Read(buffer, nreed); 953 } 954 955 if (btoplain && static_cast<ssize_t>(txtheader.header.data_size) != 956 ntotalread) 957 // If not all of the text data was able to be read... 958 return B_NO_TRANSLATOR; 959 else 960 return B_OK; 961 } 962 963 // --------------------------------------------------------------- 964 // output_headers 965 // 966 // Outputs the Stream and Text headers from the B_STYLED_TEXT_FORMAT 967 // to outDestination, setting the data_size member of the text header 968 // to text_data_size 969 // 970 // Preconditions: 971 // 972 // Parameters: outDestination, where the translated data is 973 // put 974 // 975 // text_data_size, number of bytes in data section 976 // of the TEXT header 977 // 978 // 979 // Postconditions: 980 // 981 // Returns: 982 // 983 // B_ERROR, if there was an error writing to outDestination or 984 // an error with converting the byte order 985 // 986 // B_OK, if all went well 987 // --------------------------------------------------------------- 988 status_t 989 output_headers(BPositionIO *outDestination, uint32 text_data_size) 990 { 991 const int32 kHeadersSize = sizeof(TranslatorStyledTextStreamHeader) + 992 sizeof(TranslatorStyledTextTextHeader); 993 status_t result; 994 TranslatorStyledTextStreamHeader stxtheader; 995 TranslatorStyledTextTextHeader txtheader; 996 997 uint8 buffer[kHeadersSize]; 998 999 stxtheader.header.magic = 'STXT'; 1000 stxtheader.header.header_size = sizeof(TranslatorStyledTextStreamHeader); 1001 
stxtheader.header.data_size = 0; 1002 stxtheader.version = 100; 1003 memcpy(buffer, &stxtheader, stxtheader.header.header_size); 1004 1005 txtheader.header.magic = 'TEXT'; 1006 txtheader.header.header_size = sizeof(TranslatorStyledTextTextHeader); 1007 txtheader.header.data_size = text_data_size; 1008 txtheader.charset = B_UNICODE_UTF8; 1009 memcpy(buffer + stxtheader.header.header_size, &txtheader, 1010 txtheader.header.header_size); 1011 1012 // write out headers in Big Endian byte order 1013 result = swap_data(B_UINT32_TYPE, buffer, kHeadersSize, 1014 B_SWAP_HOST_TO_BENDIAN); 1015 if (result == B_OK) { 1016 ssize_t nwritten = 0; 1017 nwritten = outDestination->Write(buffer, kHeadersSize); 1018 if (nwritten != kHeadersSize) 1019 return B_ERROR; 1020 else 1021 return B_OK; 1022 } 1023 1024 return result; 1025 } 1026 1027 // --------------------------------------------------------------- 1028 // output_styles 1029 // 1030 // Writes out the actual style information into outDestination 1031 // using the data from pflatRunArray 1032 // 1033 // Preconditions: 1034 // 1035 // Parameters: outDestination, where the translated data is 1036 // put 1037 // 1038 // text_size, size in bytes of the text in 1039 // outDestination 1040 // 1041 // data_size, size of pflatRunArray 1042 // 1043 // Postconditions: 1044 // 1045 // Returns: 1046 // 1047 // B_ERROR, if there was an error writing to outDestination or 1048 // an error with converting the byte order 1049 // 1050 // B_OK, if all went well 1051 // --------------------------------------------------------------- 1052 status_t 1053 output_styles(BPositionIO *outDestination, uint32 text_size, 1054 uint8 *pflatRunArray, ssize_t data_size) 1055 { 1056 const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader); 1057 1058 uint8 buffer[kstylsize]; 1059 1060 // output STYL header 1061 TranslatorStyledTextStyleHeader stylheader; 1062 stylheader.header.magic = 'STYL'; 1063 stylheader.header.header_size = 1064 
sizeof(TranslatorStyledTextStyleHeader); 1065 stylheader.header.data_size = data_size; 1066 stylheader.apply_offset = 0; 1067 stylheader.apply_length = text_size; 1068 1069 memcpy(buffer, &stylheader, kstylsize); 1070 if (swap_data(B_UINT32_TYPE, buffer, kstylsize, 1071 B_SWAP_HOST_TO_BENDIAN) != B_OK) 1072 return B_ERROR; 1073 if (outDestination->Write(buffer, kstylsize) != kstylsize) 1074 return B_ERROR; 1075 1076 // output actual style information 1077 if (outDestination->Write(pflatRunArray, 1078 data_size) != data_size) 1079 return B_ERROR; 1080 1081 return B_OK; 1082 } 1083 1084 1085 /*! 1086 Convert the plain text (UTF8) from inSource to plain or 1087 styled text in outDestination 1088 */ 1089 status_t 1090 translate_from_text(BPositionIO* source, const char* encoding, bool forceEncoding, 1091 BPositionIO* destination, uint32 outType) 1092 { 1093 if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT) 1094 return B_BAD_VALUE; 1095 1096 // find the length of the text 1097 off_t size = source->Seek(0, SEEK_END); 1098 if (size < 0) 1099 return (status_t)size; 1100 if (size > UINT32_MAX && outType == B_STYLED_TEXT_FORMAT) 1101 return B_NOT_SUPPORTED; 1102 1103 status_t status = source->Seek(0, SEEK_SET); 1104 if (status < B_OK) 1105 return status; 1106 1107 if (outType == B_STYLED_TEXT_FORMAT) { 1108 // output styled text headers 1109 status = output_headers(destination, (uint32)size); 1110 if (status != B_OK) 1111 return status; 1112 } 1113 1114 class MallocBuffer { 1115 public: 1116 MallocBuffer() : fBuffer(NULL), fSize(0) {} 1117 ~MallocBuffer() { free(fBuffer); } 1118 1119 void* Buffer() { return fBuffer; } 1120 size_t Size() const { return fSize; } 1121 1122 status_t 1123 Allocate(size_t size) 1124 { 1125 fBuffer = malloc(size); 1126 if (fBuffer != NULL) { 1127 fSize = size; 1128 return B_OK; 1129 } 1130 return B_NO_MEMORY; 1131 } 1132 1133 private: 1134 void* fBuffer; 1135 size_t fSize; 1136 } encodingBuffer; 1137 BMallocIO encodingIO; 1138 
uint32 encodingID = 0; 1139 // defaults to UTF-8 or no encoding 1140 1141 BNode* node = dynamic_cast<BNode*>(source); 1142 if (node != NULL) { 1143 // determine encoding, if available 1144 const BCharacterSet* characterSet = NULL; 1145 bool hasAttribute = false; 1146 if (encoding != NULL && !forceEncoding) { 1147 BString name; 1148 if (node->ReadAttrString("be:encoding", &name) == B_OK) { 1149 encoding = name.String(); 1150 hasAttribute = true; 1151 } else { 1152 int32 value; 1153 ssize_t bytesRead = node->ReadAttr("be:encoding", B_INT32_TYPE, 0, 1154 &value, sizeof(value)); 1155 if (bytesRead == (ssize_t)sizeof(value)) { 1156 hasAttribute = true; 1157 if (value != 65535) 1158 characterSet = BCharacterSetRoster::GetCharacterSetByConversionID(value); 1159 } 1160 } 1161 } else { 1162 hasAttribute = true; 1163 // we don't write the encoding in this case 1164 } 1165 if (characterSet == NULL && encoding != NULL) 1166 characterSet = BCharacterSetRoster::FindCharacterSetByName(encoding); 1167 1168 if (characterSet != NULL) { 1169 encodingID = characterSet->GetConversionID(); 1170 encodingBuffer.Allocate(READ_BUFFER_SIZE * 4); 1171 } 1172 1173 if (!hasAttribute && encoding != NULL) { 1174 // add encoding attribute, so that someone opening the file can 1175 // retrieve it for persistance 1176 node->WriteAttr("be:encoding", B_STRING_TYPE, 0, encoding, 1177 strlen(encoding)); 1178 } 1179 } 1180 1181 off_t outputSize = 0; 1182 ssize_t bytesRead; 1183 int32 state = 0; 1184 1185 // output the actual text part of the data 1186 do { 1187 uint8 buffer[READ_BUFFER_SIZE]; 1188 bytesRead = source->Read(buffer, READ_BUFFER_SIZE); 1189 if (bytesRead < B_OK) 1190 return bytesRead; 1191 if (bytesRead == 0) 1192 break; 1193 1194 if (encodingBuffer.Size() == 0) { 1195 // default, no encoding 1196 ssize_t bytesWritten = destination->Write(buffer, bytesRead); 1197 if (bytesWritten != bytesRead) { 1198 if (bytesWritten < B_OK) 1199 return bytesWritten; 1200 1201 return B_ERROR; 1202 } 1203 
1204 outputSize += bytesRead; 1205 } else { 1206 // decode text file to UTF-8 1207 char* pos = (char*)buffer; 1208 int32 encodingLength = encodingIO.BufferLength(); 1209 int32 bytesLeft = bytesRead; 1210 int32 bytes; 1211 do { 1212 encodingLength = READ_BUFFER_SIZE * 4; 1213 bytes = bytesLeft; 1214 1215 status = convert_to_utf8(encodingID, pos, &bytes, 1216 (char*)encodingBuffer.Buffer(), &encodingLength, &state); 1217 if (status < B_OK) 1218 return status; 1219 1220 ssize_t bytesWritten = destination->Write(encodingBuffer.Buffer(), 1221 encodingLength); 1222 if (bytesWritten < encodingLength) { 1223 if (bytesWritten < B_OK) 1224 return bytesWritten; 1225 1226 return B_ERROR; 1227 } 1228 1229 pos += bytes; 1230 bytesLeft -= bytes; 1231 outputSize += encodingLength; 1232 } while (encodingLength > 0 && bytesLeft > 0); 1233 } 1234 } while (bytesRead > 0); 1235 1236 if (outType != B_STYLED_TEXT_FORMAT) 1237 return B_OK; 1238 1239 if (encodingBuffer.Size() != 0 && size != outputSize) { 1240 if (outputSize > UINT32_MAX) 1241 return B_NOT_SUPPORTED; 1242 1243 // we need to update the header as the decoded text size has changed 1244 status = destination->Seek(0, SEEK_SET); 1245 if (status == B_OK) 1246 status = output_headers(destination, (uint32)outputSize); 1247 if (status == B_OK) 1248 status = destination->Seek(0, SEEK_END); 1249 1250 if (status < B_OK) 1251 return status; 1252 } 1253 1254 // Read file attributes if outputting styled data 1255 // and source is a BNode object 1256 1257 if (node == NULL) 1258 return B_OK; 1259 1260 // Try to read styles - we only propagate an error if the actual on-disk 1261 // data is likely to be okay 1262 1263 const char *kAttrName = "styles"; 1264 attr_info info; 1265 if (node->GetAttrInfo(kAttrName, &info) != B_OK) 1266 return B_OK; 1267 1268 if (info.type != B_RAW_TYPE || info.size < 160) { 1269 // styles seem to be broken, but since we got the text, 1270 // we don't propagate the error 1271 return B_OK; 1272 } 1273 1274 uint8* 
flatRunArray = new (std::nothrow) uint8[info.size]; 1275 if (flatRunArray == NULL) 1276 return B_NO_MEMORY; 1277 1278 bytesRead = node->ReadAttr(kAttrName, B_RAW_TYPE, 0, flatRunArray, info.size); 1279 if (bytesRead != info.size) 1280 return B_OK; 1281 1282 output_styles(destination, size, flatRunArray, info.size); 1283 1284 delete[] flatRunArray; 1285 return B_OK; 1286 } 1287 1288 1289 // #pragma mark - 1290 1291 1292 STXTTranslator::STXTTranslator() 1293 : BaseTranslator("StyledEdit Files", "StyledEdit files translator", 1294 STXT_TRANSLATOR_VERSION, 1295 gInputFormats, sizeof(gInputFormats) / sizeof(translation_format), 1296 gOutputFormats, sizeof(gOutputFormats) / sizeof(translation_format), 1297 "STXTTranslator_Settings", 1298 gDefaultSettings, sizeof(gDefaultSettings) / sizeof(TranSetting), 1299 B_TRANSLATOR_TEXT, B_STYLED_TEXT_FORMAT) 1300 { 1301 } 1302 1303 1304 STXTTranslator::~STXTTranslator() 1305 { 1306 } 1307 1308 1309 status_t 1310 STXTTranslator::Identify(BPositionIO *inSource, 1311 const translation_format *inFormat, BMessage *ioExtension, 1312 translator_info *outInfo, uint32 outType) 1313 { 1314 if (!outType) 1315 outType = B_TRANSLATOR_TEXT; 1316 if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT) 1317 return B_NO_TRANSLATOR; 1318 1319 const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader); 1320 1321 uint8 buffer[DATA_BUFFER_SIZE]; 1322 status_t nread = 0; 1323 // Read in the header to determine 1324 // if the data is supported 1325 nread = inSource->Read(buffer, kstxtsize); 1326 if (nread < 0) 1327 return nread; 1328 1329 // read in enough data to fill the stream header 1330 if (nread == kstxtsize) { 1331 TranslatorStyledTextStreamHeader header; 1332 memcpy(&header, buffer, kstxtsize); 1333 if (swap_data(B_UINT32_TYPE, &header, kstxtsize, 1334 B_SWAP_BENDIAN_TO_HOST) != B_OK) 1335 return B_ERROR; 1336 1337 if (header.header.magic == B_STYLED_TEXT_FORMAT 1338 && header.header.header_size == (int32)kstxtsize 1339 && 
header.header.data_size == 0 1340 && header.version == 100) 1341 return identify_stxt_header(header, inSource, outInfo, outType); 1342 } 1343 1344 // if the data is not styled text, check if it is plain text 1345 const char* encoding; 1346 return identify_text(buffer, nread, inSource, outInfo, outType, encoding); 1347 } 1348 1349 1350 status_t 1351 STXTTranslator::Translate(BPositionIO* source, const translator_info* info, 1352 BMessage* ioExtension, uint32 outType, BPositionIO* outDestination) 1353 { 1354 if (!outType) 1355 outType = B_TRANSLATOR_TEXT; 1356 if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT) 1357 return B_NO_TRANSLATOR; 1358 1359 const ssize_t headerSize = sizeof(TranslatorStyledTextStreamHeader); 1360 uint8 buffer[DATA_BUFFER_SIZE]; 1361 status_t result; 1362 translator_info outInfo; 1363 // Read in the header to determine 1364 // if the data is supported 1365 ssize_t bytesRead = source->Read(buffer, headerSize); 1366 if (bytesRead < 0) 1367 return bytesRead; 1368 1369 // read in enough data to fill the stream header 1370 if (bytesRead == headerSize) { 1371 TranslatorStyledTextStreamHeader header; 1372 memcpy(&header, buffer, headerSize); 1373 if (swap_data(B_UINT32_TYPE, &header, headerSize, 1374 B_SWAP_BENDIAN_TO_HOST) != B_OK) 1375 return B_ERROR; 1376 1377 if (header.header.magic == B_STYLED_TEXT_FORMAT 1378 && header.header.header_size == sizeof(TranslatorStyledTextStreamHeader) 1379 && header.header.data_size == 0 1380 && header.version == 100) { 1381 TranslatorStyledTextTextHeader textHeader; 1382 result = identify_stxt_header(header, source, &outInfo, outType, 1383 &textHeader); 1384 if (result != B_OK) 1385 return result; 1386 1387 return translate_from_stxt(source, outDestination, outType, textHeader); 1388 } 1389 } 1390 1391 // if the data is not styled text, check if it is ASCII text 1392 bool forceEncoding = false; 1393 const char* encoding = NULL; 1394 result = identify_text(buffer, bytesRead, source, &outInfo, 
outType, encoding); 1395 if (result != B_OK) 1396 return result; 1397 1398 if (ioExtension != NULL) { 1399 const char* value; 1400 if (ioExtension->FindString("be:encoding", &value) == B_OK 1401 && value[0]) { 1402 // override encoding 1403 encoding = value; 1404 forceEncoding = true; 1405 } 1406 } 1407 1408 return translate_from_text(source, encoding, forceEncoding, outDestination, outType); 1409 } 1410 1411 1412 BView * 1413 STXTTranslator::NewConfigView(TranslatorSettings *settings) 1414 { 1415 return new STXTView(BRect(0, 0, 225, 175), "STXTTranslator Settings", 1416 B_FOLLOW_ALL, B_WILL_DRAW, settings); 1417 } 1418 1419