xref: /haiku/src/add-ons/translators/stxt/STXTTranslator.cpp (revision 020cbad9d40235a2c50a81a42d69912a5ff8fbc4)
1 /*
2  * Copyright 2002-2007, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Michael Wilber
7  *		Axel Dörfler, axeld@pinc-software.de
8  */
9 
10 
11 #include "STXTTranslator.h"
12 #include "STXTView.h"
13 
14 #include <CharacterSet.h>
15 #include <CharacterSetRoster.h>
16 #include <MimeType.h>
17 #include <String.h>
18 #include <UTF8.h>
19 
20 #include <new>
21 #include <string.h>
22 #include <stdio.h>
23 #include <stdint.h>
24 
25 
26 using namespace BPrivate;
27 
28 
29 #define READ_BUFFER_SIZE 32768
30 #define DATA_BUFFER_SIZE 256
31 
// The input formats that this translator supports.
// Two entries: plain text ("text/plain") and Be styled text
// ("text/x-vnd.Be-stxt"); both belong to the B_TRANSLATOR_TEXT group.
translation_format gInputFormats[] = {
	// plain text
	{
		B_TRANSLATOR_TEXT,
		B_TRANSLATOR_TEXT,
		TEXT_IN_QUALITY,
		TEXT_IN_CAPABILITY,
		"text/plain",
		"Plain text file"
	},
	// Be styled text (STXT)
	{
		B_STYLED_TEXT_FORMAT,
		B_TRANSLATOR_TEXT,
		STXT_IN_QUALITY,
		STXT_IN_CAPABILITY,
		"text/x-vnd.Be-stxt",
		"Be styled text file"
	}
};
51 
// The output formats that this translator supports.
// Mirrors the input list: plain text ("text/plain") and Be styled text
// ("text/x-vnd.Be-stxt"), using the *_OUT_* quality/capability constants.
translation_format gOutputFormats[] = {
	// plain text
	{
		B_TRANSLATOR_TEXT,
		B_TRANSLATOR_TEXT,
		TEXT_OUT_QUALITY,
		TEXT_OUT_CAPABILITY,
		"text/plain",
		"Plain text file"
	},
	// Be styled text (STXT)
	{
		B_STYLED_TEXT_FORMAT,
		B_TRANSLATOR_TEXT,
		STXT_OUT_QUALITY,
		STXT_OUT_CAPABILITY,
		"text/x-vnd.Be-stxt",
		"Be styled text file"
	}
};
71 
// Default settings for the Translator
// Both entries are boolean settings that default to false.
TranSetting gDefaultSettings[] = {
	{B_TRANSLATOR_EXT_HEADER_ONLY, TRAN_SETTING_BOOL, false},
	{B_TRANSLATOR_EXT_DATA_ONLY, TRAN_SETTING_BOOL, false}
};
77 
78 // ---------------------------------------------------------------
79 // make_nth_translator
80 //
81 // Creates a STXTTranslator object to be used by BTranslatorRoster
82 //
83 // Preconditions:
84 //
85 // Parameters: n,		The translator to return. Since
86 //						STXTTranslator only publishes one
87 //						translator, it only returns a
88 //						STXTTranslator if n == 0
89 //
90 //             you, 	The image_id of the add-on that
91 //						contains code (not used).
92 //
93 //             flags,	Has no meaning yet, should be 0.
94 //
95 // Postconditions:
96 //
97 // Returns: NULL if n is not zero,
98 //          a new STXTTranslator if n is zero
99 // ---------------------------------------------------------------
100 BTranslator *
101 make_nth_translator(int32 n, image_id you, uint32 flags, ...)
102 {
103 	if (!n)
104 		return new (std::nothrow) STXTTranslator();
105 
106 	return NULL;
107 }
108 
109 
110 // #pragma mark - ascmagic.c from the BSD file tool
111 /*
112  * The following code has been taken from version 4.17 of the BSD file tool,
113  * file ascmagic.c, modified for our purpose.
114  */
115 
116 /*
117  * Copyright (c) Ian F. Darwin 1986-1995.
118  * Software written by Ian F. Darwin and others;
119  * maintained 1995-present by Christos Zoulas and others.
120  *
121  * Redistribution and use in source and binary forms, with or without
122  * modification, are permitted provided that the following conditions
123  * are met:
124  * 1. Redistributions of source code must retain the above copyright
125  *    notice immediately at the beginning of the file, without modification,
126  *    this list of conditions, and the following disclaimer.
127  * 2. Redistributions in binary form must reproduce the above copyright
128  *    notice, this list of conditions and the following disclaimer in the
129  *    documentation and/or other materials provided with the distribution.
130  *
131  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
132  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
133  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
134  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
135  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
136  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
137  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
138  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
139  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
140  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
141  * SUCH DAMAGE.
142  */
143 /*
144  * ASCII magic -- file types that we know based on keywords
145  * that can appear anywhere in the file.
146  *
147  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
148  * to handle character codes other than ASCII on a unified basis.
149  *
150  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
151  * international characters, now subsumed into this file.
152  */
153 
154 #include <stdio.h>
155 #include <string.h>
156 #include <memory.h>
157 #include <ctype.h>
158 #include <stdlib.h>
159 #include <unistd.h>
160 #include "names.h"
161 
/* One decoded character; the looks_*() detectors fill arrays of these. */
typedef unsigned long my_unichar;

#define MAXLINELEN 300	/* longest sane line length */
/*
 * True for the characters treated as whitespace here: space, tab, carriage
 * return, line feed, NEL (0x85) and form feed.  Note that the argument is
 * evaluated several times.
 */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
		  || (x) == 0x85 || (x) == '\f')

/*
 * Encoding detectors: each takes (input bytes, byte count, output character
 * array, output character count) and returns non-zero on a match.
 */
static int looks_ascii(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_utf8(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_unicode(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_latin1(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_extended(const unsigned char *, size_t, my_unichar *, size_t *);
/* Translates EBCDIC input bytes into the given output buffer. */
static void from_ebcdic(const unsigned char *, size_t, unsigned char *);
/* Compares a NUL-terminated keyword against a run of decoded characters. */
static int ascmatch(const unsigned char *, const my_unichar *, size_t);

176 
177 static int
178 file_ascmagic(const unsigned char *buf, size_t nbytes, BMimeType* mimeType,
179 	const char*& encoding)
180 {
181 	size_t i;
182 	unsigned char *nbuf = NULL;
183 	my_unichar *ubuf = NULL;
184 	size_t ulen;
185 	struct names *p;
186 	int rv = -1;
187 
188 	const char *code = NULL;
189 	encoding = NULL;
190 	const char *type = NULL;
191 	const char *subtype = NULL;
192 	const char *subtype_mime = NULL;
193 
194 	int has_escapes = 0;
195 	int has_backspace = 0;
196 	int seen_cr = 0;
197 
198 	int n_crlf = 0;
199 	int n_lf = 0;
200 	int n_cr = 0;
201 	int n_nel = 0;
202 
203 	int last_line_end = -1;
204 	int has_long_lines = 0;
205 
206 	if ((nbuf = (unsigned char*)malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL)
207 		goto done;
208 	if ((ubuf = (my_unichar*)malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL)
209 		goto done;
210 
211 	/*
212 	 * Then try to determine whether it's any character code we can
213 	 * identify.  Each of these tests, if it succeeds, will leave
214 	 * the text converted into one-my_unichar-per-character Unicode in
215 	 * ubuf, and the number of characters converted in ulen.
216 	 */
217 	if (nbytes == 0) {
218 		code = "UTF-8 Unicode";
219 		encoding = NULL; // "UTF-8";
220 		type = "text";
221 		rv = 1;
222 	} else if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
223 		code = "ASCII";
224 		encoding = NULL; //"us-ascii";
225 		type = "text";
226 		if (nbytes == 1) {
227 			// no further tests
228 			rv = 1;
229 		}
230 	} else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
231 		code = "UTF-8 Unicode";
232 		encoding = NULL; // "UTF-8";
233 		type = "text";
234 	} else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
235 		if (i == 1) {
236 			code = "Little-endian UTF-16 Unicode";
237 			encoding = "UTF-16";
238 		} else {
239 			code = "Big-endian UTF-16 Unicode";
240 			encoding = "UTF-16";
241 		}
242 
243 		type = "character data";
244 	} else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
245 		code = "ISO-8859";
246 		type = "text";
247 		encoding = "iso-8859-1";
248 	} else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
249 		code = "Non-ISO extended-ASCII";
250 		type = "text";
251 		encoding = "unknown";
252 	} else {
253 		from_ebcdic(buf, nbytes, nbuf);
254 
255 		if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
256 			code = "EBCDIC";
257 			type = "character data";
258 			encoding = "ebcdic";
259 		} else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
260 			code = "International EBCDIC";
261 			type = "character data";
262 			encoding = "ebcdic";
263 		} else {
264 			rv = 0;
265 			goto done;  /* doesn't look like text at all */
266 		}
267 	}
268 
269 	if (nbytes <= 1) {
270 		if (rv == -1)
271 			rv = 0;
272 		goto done;
273 	}
274 
275 	/*
276 	 * for troff, look for . + letter + letter or .\";
277 	 * this must be done to disambiguate tar archives' ./file
278 	 * and other trash from real troff input.
279 	 *
280 	 * I believe Plan 9 troff allows non-ASCII characters in the names
281 	 * of macros, so this test might possibly fail on such a file.
282 	 */
283 	if (*ubuf == '.') {
284 		my_unichar *tp = ubuf + 1;
285 
286 		while (ISSPC(*tp))
287 			++tp;	/* skip leading whitespace */
288 		if ((tp[0] == '\\' && tp[1] == '\"') ||
289 		    (isascii((unsigned char)tp[0]) &&
290 		     isalnum((unsigned char)tp[0]) &&
291 		     isascii((unsigned char)tp[1]) &&
292 		     isalnum((unsigned char)tp[1]) &&
293 		     ISSPC(tp[2]))) {
294 			subtype_mime = "text/troff";
295 			subtype = "troff or preprocessor input";
296 			goto subtype_identified;
297 		}
298 	}
299 
300 	if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
301 		subtype_mime = "text/fortran";
302 		subtype = "fortran program";
303 		goto subtype_identified;
304 	}
305 
306 	/* look for tokens from names.h - this is expensive! */
307 
308 	i = 0;
309 	while (i < ulen) {
310 		size_t end;
311 
312 		/*
313 		 * skip past any leading space
314 		 */
315 		while (i < ulen && ISSPC(ubuf[i]))
316 			i++;
317 		if (i >= ulen)
318 			break;
319 
320 		/*
321 		 * find the next whitespace
322 		 */
323 		for (end = i + 1; end < nbytes; end++)
324 			if (ISSPC(ubuf[end]))
325 				break;
326 
327 		/*
328 		 * compare the word thus isolated against the token list
329 		 */
330 		for (p = names; p < names + NNAMES; p++) {
331 			if (ascmatch((const unsigned char *)p->name, ubuf + i,
332 			    end - i)) {
333 				subtype = types[p->type].human;
334 				subtype_mime = types[p->type].mime;
335 				goto subtype_identified;
336 			}
337 		}
338 
339 		i = end;
340 	}
341 
342 subtype_identified:
343 
344 	/*
345 	 * Now try to discover other details about the file.
346 	 */
347 	for (i = 0; i < ulen; i++) {
348 		if (ubuf[i] == '\n') {
349 			if (seen_cr)
350 				n_crlf++;
351 			else
352 				n_lf++;
353 			last_line_end = i;
354 		} else if (seen_cr)
355 			n_cr++;
356 
357 		seen_cr = (ubuf[i] == '\r');
358 		if (seen_cr)
359 			last_line_end = i;
360 
361 		if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
362 			n_nel++;
363 			last_line_end = i;
364 		}
365 
366 		/* If this line is _longer_ than MAXLINELEN, remember it. */
367 		if ((int)i > last_line_end + MAXLINELEN)
368 			has_long_lines = 1;
369 
370 		if (ubuf[i] == '\033')
371 			has_escapes = 1;
372 		if (ubuf[i] == '\b')
373 			has_backspace = 1;
374 	}
375 
376 	rv = 1;
377 done:
378 	if (nbuf)
379 		free(nbuf);
380 	if (ubuf)
381 		free(ubuf);
382 
383 	if (rv) {
384 		// If we have identified the subtype, return it, otherwise just
385 		// text/plain.
386 		if (subtype_mime)
387 			mimeType->SetTo(subtype_mime);
388 		else
389 			mimeType->SetTo("text/plain");
390 	}
391 
392 	return rv;
393 }
394 
/*
 * Compares the NUL-terminated keyword s against the ulen characters
 * starting at us.
 *
 * Returns 1 if the keyword consists of exactly those ulen characters,
 * 0 otherwise.
 */
static int
ascmatch(const unsigned char *s, const my_unichar *us, size_t ulen)
{
	size_t i;

	for (i = 0; i < ulen; i++) {
		// Stop at the keyword's terminator: if the text contains a 0
		// code point, comparing on would read beyond the end of s.
		if (s[i] == '\0' || s[i] != us[i])
			return 0;
	}

	// The keyword must end exactly where the word ends.
	return s[i] == '\0';
}
410 
411 /*
412  * This table reflects a particular philosophy about what constitutes
413  * "text," and there is room for disagreement about it.
414  *
415  * Version 3.31 of the file command considered a file to be ASCII if
416  * each of its characters was approved by either the isascii() or
417  * isalpha() function.  On most systems, this would mean that any
418  * file consisting only of characters in the range 0x00 ... 0x7F
419  * would be called ASCII text, but many systems might reasonably
420  * consider some characters outside this range to be alphabetic,
421  * so the file command would call such characters ASCII.  It might
422  * have been more accurate to call this "considered textual on the
423  * local system" than "ASCII."
424  *
425  * It considered a file to be "International language text" if each
426  * of its characters was either an ASCII printing character (according
427  * to the real ASCII standard, not the above test), a character in
428  * the range 0x80 ... 0xFF, or one of the following control characters:
429  * backspace, tab, line feed, vertical tab, form feed, carriage return,
430  * escape.  No attempt was made to determine the language in which files
431  * of this type were written.
432  *
433  *
434  * The table below considers a file to be ASCII if all of its characters
435  * are either ASCII printing characters (again, according to the X3.4
436  * standard, not isascii()) or any of the following controls: bell,
437  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
438  *
439  * I include bell because some programs (particularly shell scripts)
440  * use it literally, even though it is rare in normal text.  I exclude
441  * vertical tab because it never seems to be used in real text.  I also
442  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
443  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
444  * character to.  It might be more appropriate to include it in the 8859
445  * set instead of the ASCII set, but it's got to be included in *something*
446  * we recognize or EBCDIC files aren't going to be considered textual.
447  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
448  * and Latin characters, so these should possibly be allowed.  But they
449  * make a real mess on VT100-style displays if they're not paired properly,
450  * so we are probably better off not calling them text.
451  *
452  * A file is considered to be ISO-8859 text if its characters are all
453  * either ASCII, according to the above definition, or printing characters
454  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
455  *
456  * Finally, a file is considered to be international text from some other
457  * character code if its characters are all either ISO-8859 (according to
458  * the above definition) or characters in the range 0x80 ... 0x9F, which
459  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
460  * consider to be printing characters.
461  */
462 
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Classification of every byte value, indexed by the byte itself.
 * The table is only ever read, hence const.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
489 
490 static int
491 looks_ascii(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
492     size_t *ulen)
493 {
494 	int i;
495 
496 	*ulen = 0;
497 
498 	for (i = 0; i < (int)nbytes; i++) {
499 		int t = text_chars[buf[i]];
500 
501 		if (t != T)
502 			return 0;
503 
504 		ubuf[(*ulen)++] = buf[i];
505 	}
506 
507 	return 1;
508 }
509 
510 static int
511 looks_latin1(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
512 {
513 	int i;
514 
515 	*ulen = 0;
516 
517 	for (i = 0; i < (int)nbytes; i++) {
518 		int t = text_chars[buf[i]];
519 
520 		if (t != T && t != I)
521 			return 0;
522 
523 		ubuf[(*ulen)++] = buf[i];
524 	}
525 
526 	return 1;
527 }
528 
529 static int
530 looks_extended(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
531     size_t *ulen)
532 {
533 	int i;
534 
535 	*ulen = 0;
536 
537 	for (i = 0; i < (int)nbytes; i++) {
538 		int t = text_chars[buf[i]];
539 
540 		if (t != T && t != I && t != X)
541 			return 0;
542 
543 		ubuf[(*ulen)++] = buf[i];
544 	}
545 
546 	return 1;
547 }
548 
549 static int
550 looks_utf8(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
551 {
552 	int i, n;
553 	my_unichar c;
554 	int gotone = 0;
555 
556 	*ulen = 0;
557 
558 	for (i = 0; i < (int)nbytes; i++) {
559 		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
560 			/*
561 			 * Even if the whole file is valid UTF-8 sequences,
562 			 * still reject it if it uses weird control characters.
563 			 */
564 
565 			if (text_chars[buf[i]] != T)
566 				return 0;
567 
568 			ubuf[(*ulen)++] = buf[i];
569 		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
570 			return 0;
571 		} else {			   /* 11xxxxxx begins UTF-8 */
572 			int following;
573 
574 			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
575 				c = buf[i] & 0x1f;
576 				following = 1;
577 			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
578 				c = buf[i] & 0x0f;
579 				following = 2;
580 			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
581 				c = buf[i] & 0x07;
582 				following = 3;
583 			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
584 				c = buf[i] & 0x03;
585 				following = 4;
586 			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
587 				c = buf[i] & 0x01;
588 				following = 5;
589 			} else
590 				return 0;
591 
592 			for (n = 0; n < following; n++) {
593 				i++;
594 				if (i >= (int)nbytes)
595 					goto done;
596 
597 				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
598 					return 0;
599 
600 				c = (c << 6) + (buf[i] & 0x3f);
601 			}
602 
603 			ubuf[(*ulen)++] = c;
604 			gotone = 1;
605 		}
606 	}
607 done:
608 	return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
609 }
610 
611 static int
612 looks_unicode(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
613     size_t *ulen)
614 {
615 	int bigend;
616 	int i;
617 
618 	if (nbytes < 2)
619 		return 0;
620 
621 	if (buf[0] == 0xff && buf[1] == 0xfe)
622 		bigend = 0;
623 	else if (buf[0] == 0xfe && buf[1] == 0xff)
624 		bigend = 1;
625 	else
626 		return 0;
627 
628 	*ulen = 0;
629 
630 	for (i = 2; i + 1 < (int)nbytes; i += 2) {
631 		/* XXX fix to properly handle chars > 65536 */
632 
633 		if (bigend)
634 			ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
635 		else
636 			ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
637 
638 		if (ubuf[*ulen - 1] == 0xfffe)
639 			return 0;
640 		if (ubuf[*ulen - 1] < 128 &&
641 		    text_chars[(size_t)ubuf[*ulen - 1]] != T)
642 			return 0;
643 	}
644 
645 	return 1 + bigend;
646 }
647 
648 #undef F
649 #undef T
650 #undef I
651 #undef X
652 
653 /*
654  * This table maps each EBCDIC character to an (8-bit extended) ASCII
655  * character, as specified in the rationale for the dd(1) command in
656  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
657  *
658  * Unfortunately it does not seem to correspond exactly to any of the
659  * five variants of EBCDIC documented in IBM's _Enterprise Systems
660  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
661  * Edition, July, 1999, pp. I-1 - I-4.
662  *
663  * Fortunately, though, all versions of EBCDIC, including this one, agree
664  * on most of the printing characters that also appear in (7-bit) ASCII.
665  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
666  *
667  * Fortunately too, there is general agreement that codes 0x00 through
668  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
669  * remainder printing characters.
670  *
671  * This is sufficient to allow us to identify EBCDIC text and to distinguish
672  * between old-style and internationalized examples of text.
673  */
674 
/* Indexed by the EBCDIC byte value; the table is only ever read, hence const. */
static const unsigned char ebcdic_to_ascii[] = {
  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};
693 
694 #ifdef notdef
695 /*
696  * The following EBCDIC-to-ASCII table may relate more closely to reality,
697  * or at least to modern reality.  It comes from
698  *
699  *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
700  *
701  * and maps the characters of EBCDIC code page 1047 (the code used for
702  * Unix-derived software on IBM's 390 systems) to the corresponding
703  * characters from ISO 8859-1.
704  *
705  * If this table is used instead of the above one, some of the special
706  * cases for the NEL character can be taken out of the code.
707  */
708 
709 static unsigned char ebcdic_1047_to_8859[] = {
710 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
711 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
712 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
713 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
714 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
715 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
716 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
717 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
718 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
719 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
720 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
721 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
722 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
723 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
724 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
725 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
726 };
727 #endif
728 
729 /*
730  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
731  */
732 static void
733 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
734 {
735 	int i;
736 
737 	for (i = 0; i < (int)nbytes; i++) {
738 		out[i] = ebcdic_to_ascii[buf[i]];
739 	}
740 }
741 
742 
743 //	#pragma mark -
744 
745 
746 /*!
747 	Determines if the data in inSource is of the STXT format.
748 
749 	\param header the STXT stream header read in by Identify() or Translate()
750 	\param inSource the stream with the STXT data
751 	\param outInfo information about the type of data from inSource is stored here
752 	\param outType the desired output type for the data in inSource
753 	\param ptxtheader if this is not NULL, the TEXT header from
754 		inSource is copied to it
755 */
756 status_t
757 identify_stxt_header(const TranslatorStyledTextStreamHeader &header,
758 	BPositionIO *inSource, translator_info *outInfo, uint32 outType,
759 	TranslatorStyledTextTextHeader *ptxtheader = NULL)
760 {
761 	const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader);
762 	const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader);
763 
764 	uint8 buffer[max(ktxtsize, kstylsize)];
765 
766 	// Check the TEXT header
767 	TranslatorStyledTextTextHeader txtheader;
768 	if (inSource->Read(buffer, ktxtsize) != ktxtsize)
769 		return B_NO_TRANSLATOR;
770 
771 	memcpy(&txtheader, buffer, ktxtsize);
772 	if (swap_data(B_UINT32_TYPE, &txtheader, ktxtsize,
773 		B_SWAP_BENDIAN_TO_HOST) != B_OK)
774 		return B_ERROR;
775 
776 	if (txtheader.header.magic != 'TEXT'
777 		|| txtheader.header.header_size != sizeof(TranslatorStyledTextTextHeader)
778 		|| txtheader.charset != B_UNICODE_UTF8)
779 		return B_NO_TRANSLATOR;
780 
781 	// skip the text data
782 	off_t seekresult, pos;
783 	pos = header.header.header_size + txtheader.header.header_size
784 		+ txtheader.header.data_size;
785 	seekresult = inSource->Seek(txtheader.header.data_size,
786 		SEEK_CUR);
787 	if (seekresult < pos)
788 		return B_NO_TRANSLATOR;
789 	if (seekresult > pos)
790 		return B_ERROR;
791 
792 	// check the STYL header (not all STXT files have this)
793 	ssize_t read = 0;
794 	TranslatorStyledTextStyleHeader stylheader;
795 	read = inSource->Read(buffer, kstylsize);
796 	if (read < 0)
797 		return read;
798 	if (read != kstylsize && read != 0)
799 		return B_NO_TRANSLATOR;
800 
801 	// If there is a STYL header
802 	if (read == kstylsize) {
803 		memcpy(&stylheader, buffer, kstylsize);
804 		if (swap_data(B_UINT32_TYPE, &stylheader, kstylsize,
805 			B_SWAP_BENDIAN_TO_HOST) != B_OK)
806 			return B_ERROR;
807 
808 		if (stylheader.header.magic != 'STYL'
809 			|| stylheader.header.header_size !=
810 				sizeof(TranslatorStyledTextStyleHeader))
811 			return B_NO_TRANSLATOR;
812 	}
813 
814 	// if output TEXT header is supplied, fill it with data
815 	if (ptxtheader) {
816 		ptxtheader->header.magic = txtheader.header.magic;
817 		ptxtheader->header.header_size = txtheader.header.header_size;
818 		ptxtheader->header.data_size = txtheader.header.data_size;
819 		ptxtheader->charset = txtheader.charset;
820 	}
821 
822 	// return information about the data in the stream
823 	outInfo->type = B_STYLED_TEXT_FORMAT;
824 	outInfo->group = B_TRANSLATOR_TEXT;
825 	outInfo->quality = STXT_IN_QUALITY;
826 	outInfo->capability = STXT_IN_CAPABILITY;
827 	strcpy(outInfo->name, "Be styled text file");
828 	strcpy(outInfo->MIME, "text/x-vnd.Be-stxt");
829 
830 	return B_OK;
831 }
832 
833 
834 /*!
835 	Determines if the data in \a inSource is of the UTF8 plain
836 
837 	\param data buffer containing data already read (must be at
838 		least DATA_BUFFER_SIZE bytes large)
839 	\param nread number of bytes that have already been read from the stream
840 	\param header the STXT stream header read in by Identify() or Translate()
841 	\param inSource the stream with the STXT data
842 	\param outInfo information about the type of data from inSource is stored here
843 	\param outType the desired output type for the data in inSource
844 */
845 status_t
846 identify_text(uint8* data, int32 bytesRead, BPositionIO* source,
847 	translator_info* outInfo, uint32 outType, const char*& encoding)
848 {
849 	ssize_t readLater = source->Read(data + bytesRead, DATA_BUFFER_SIZE - bytesRead);
850 	if (readLater < B_OK)
851 		return B_NO_TRANSLATOR;
852 
853 	bytesRead += readLater;
854 
855 	// TODO: identify encoding as possible!
856 	BMimeType type;
857 	if (!file_ascmagic((const unsigned char*)data, bytesRead, &type, encoding))
858 		return B_NO_TRANSLATOR;
859 
860 	float capability = TEXT_IN_CAPABILITY;
861 	if (bytesRead < 20)
862 		capability = .1f;
863 
864 	// return information about the data in the stream
865 	outInfo->type = B_TRANSLATOR_TEXT;
866 	outInfo->group = B_TRANSLATOR_TEXT;
867 	outInfo->quality = TEXT_IN_QUALITY;
868 	outInfo->capability = capability;
869 
870 	char description[B_MIME_TYPE_LENGTH];
871 	if (type.GetLongDescription(description) == B_OK)
872 		strlcpy(outInfo->name, description, sizeof(outInfo->name));
873 	else
874 		strlcpy(outInfo->name, "Plain text file", sizeof(outInfo->name));
875 
876 	//strlcpy(outInfo->MIME, type.Type(), sizeof(outInfo->MIME));
877 	strcpy(outInfo->MIME, "text/plain");
878 	return B_OK;
879 }
880 
881 
882 // ---------------------------------------------------------------
883 // translate_from_stxt
884 //
885 // Translates the data in inSource to the type outType and stores
886 // the translated data in outDestination.
887 //
888 // Preconditions:
889 //
890 // Parameters:	inSource,	the data to be translated
891 //
892 //				outDestination,	where the translated data is
893 //								put
894 //
895 //				outType,	the type to convert inSource to
896 //
897 //				txtheader, 	the TEXT header from inSource
898 //
899 //
900 // Postconditions:
901 //
902 // Returns: B_BAD_VALUE, if outType is invalid
903 //
904 // B_NO_TRANSLATOR, if this translator doesn't understand the data
905 //
906 // B_ERROR, if there was an error allocating memory or converting
907 //          data
908 //
909 // B_OK, if all went well
910 // ---------------------------------------------------------------
911 status_t
912 translate_from_stxt(BPositionIO *inSource, BPositionIO *outDestination,
913 		uint32 outType, const TranslatorStyledTextTextHeader &txtheader)
914 {
915 	if (inSource->Seek(0, SEEK_SET) != 0)
916 		return B_ERROR;
917 
918 	const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader);
919 	const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader);
920 
921 	bool btoplain;
922 	if (outType == B_TRANSLATOR_TEXT)
923 		btoplain = true;
924 	else if (outType == B_STYLED_TEXT_FORMAT)
925 		btoplain = false;
926 	else
927 		return B_BAD_VALUE;
928 
929 	uint8 buffer[READ_BUFFER_SIZE];
930 	ssize_t nread = 0, nwritten = 0, nreed = 0, ntotalread = 0;
931 
932 	// skip to the actual text data when outputting a
933 	// plain text file
934 	if (btoplain) {
935 		if (inSource->Seek(kstxtsize + ktxtsize, SEEK_CUR) !=
936 			kstxtsize + ktxtsize)
937 			return B_ERROR;
938 	}
939 
940 	// Read data from inSource
941 	// When outputing B_TRANSLATOR_TEXT, the loop stops when all of
942 	// the text data has been read and written.
943 	// When outputting B_STYLED_TEXT_FORMAT, the loop stops when all
944 	// of the data from inSource has been read and written.
945 	if (btoplain)
946 		nreed = min(READ_BUFFER_SIZE,
947 			txtheader.header.data_size - ntotalread);
948 	else
949 		nreed = READ_BUFFER_SIZE;
950 	nread = inSource->Read(buffer, nreed);
951 	while (nread > 0) {
952 		nwritten = outDestination->Write(buffer, nread);
953 		if (nwritten != nread)
954 			return B_ERROR;
955 
956 		if (btoplain) {
957 			ntotalread += nread;
958 			nreed = min(READ_BUFFER_SIZE,
959 				txtheader.header.data_size - ntotalread);
960 		} else
961 			nreed = READ_BUFFER_SIZE;
962 		nread = inSource->Read(buffer, nreed);
963 	}
964 
965 	if (btoplain && static_cast<ssize_t>(txtheader.header.data_size) !=
966 		ntotalread)
967 		// If not all of the text data was able to be read...
968 		return B_NO_TRANSLATOR;
969 	else
970 		return B_OK;
971 }
972 
973 // ---------------------------------------------------------------
974 // output_headers
975 //
976 // Outputs the Stream and Text headers from the B_STYLED_TEXT_FORMAT
977 // to outDestination, setting the data_size member of the text header
978 // to text_data_size
979 //
980 // Preconditions:
981 //
982 // Parameters:	outDestination,	where the translated data is
983 //								put
984 //
985 //				text_data_size, number of bytes in data section
986 //							    of the TEXT header
987 //
988 //
989 // Postconditions:
990 //
991 // Returns:
992 //
993 // B_ERROR, if there was an error writing to outDestination or
994 // 	an error with converting the byte order
995 //
996 // B_OK, if all went well
997 // ---------------------------------------------------------------
998 status_t
999 output_headers(BPositionIO *outDestination, uint32 text_data_size)
1000 {
1001 	const int32 kHeadersSize = sizeof(TranslatorStyledTextStreamHeader) +
1002 		sizeof(TranslatorStyledTextTextHeader);
1003 	status_t result;
1004 	TranslatorStyledTextStreamHeader stxtheader;
1005 	TranslatorStyledTextTextHeader txtheader;
1006 
1007 	uint8 buffer[kHeadersSize];
1008 
1009 	stxtheader.header.magic = 'STXT';
1010 	stxtheader.header.header_size = sizeof(TranslatorStyledTextStreamHeader);
1011 	stxtheader.header.data_size = 0;
1012 	stxtheader.version = 100;
1013 	memcpy(buffer, &stxtheader, stxtheader.header.header_size);
1014 
1015 	txtheader.header.magic = 'TEXT';
1016 	txtheader.header.header_size = sizeof(TranslatorStyledTextTextHeader);
1017 	txtheader.header.data_size = text_data_size;
1018 	txtheader.charset = B_UNICODE_UTF8;
1019 	memcpy(buffer + stxtheader.header.header_size, &txtheader,
1020 		txtheader.header.header_size);
1021 
1022 	// write out headers in Big Endian byte order
1023 	result = swap_data(B_UINT32_TYPE, buffer, kHeadersSize,
1024 		B_SWAP_HOST_TO_BENDIAN);
1025 	if (result == B_OK) {
1026 		ssize_t nwritten = 0;
1027 		nwritten = outDestination->Write(buffer, kHeadersSize);
1028 		if (nwritten != kHeadersSize)
1029 			return B_ERROR;
1030 		else
1031 			return B_OK;
1032 	}
1033 
1034 	return result;
1035 }
1036 
1037 // ---------------------------------------------------------------
1038 // output_styles
1039 //
1040 // Writes out the actual style information into outDestination
1041 // using the data from pflatRunArray
1042 //
1043 // Preconditions:
1044 //
1045 // Parameters:	outDestination,	where the translated data is
1046 //								put
1047 //
1048 //				text_size,		size in bytes of the text in
1049 //								outDestination
1050 //
1051 //				data_size,		size of pflatRunArray
1052 //
1053 // Postconditions:
1054 //
1055 // Returns:
1056 //
1057 // B_ERROR, if there was an error writing to outDestination or
1058 // 	an error with converting the byte order
1059 //
1060 // B_OK, if all went well
1061 // ---------------------------------------------------------------
1062 status_t
1063 output_styles(BPositionIO *outDestination, uint32 text_size,
1064 	uint8 *pflatRunArray, ssize_t data_size)
1065 {
1066 	const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader);
1067 
1068 	uint8 buffer[kstylsize];
1069 
1070 	// output STYL header
1071 	TranslatorStyledTextStyleHeader stylheader;
1072 	stylheader.header.magic = 'STYL';
1073 	stylheader.header.header_size =
1074 		sizeof(TranslatorStyledTextStyleHeader);
1075 	stylheader.header.data_size = data_size;
1076 	stylheader.apply_offset = 0;
1077 	stylheader.apply_length = text_size;
1078 
1079 	memcpy(buffer, &stylheader, kstylsize);
1080 	if (swap_data(B_UINT32_TYPE, buffer, kstylsize,
1081 		B_SWAP_HOST_TO_BENDIAN) != B_OK)
1082 		return B_ERROR;
1083 	if (outDestination->Write(buffer, kstylsize) != kstylsize)
1084 		return B_ERROR;
1085 
1086 	// output actual style information
1087 	if (outDestination->Write(pflatRunArray,
1088 		data_size) != data_size)
1089 		return B_ERROR;
1090 
1091 	return B_OK;
1092 }
1093 
1094 
1095 /*!
1096 	Convert the plain text (UTF8) from inSource to plain or
1097 	styled text in outDestination
1098 */
1099 status_t
1100 translate_from_text(BPositionIO* source, const char* encoding, bool forceEncoding,
1101 	BPositionIO* destination, uint32 outType)
1102 {
1103 	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1104 		return B_BAD_VALUE;
1105 
1106 	// find the length of the text
1107 	off_t size = source->Seek(0, SEEK_END);
1108 	if (size < 0)
1109 		return (status_t)size;
1110 	if (size > UINT32_MAX && outType == B_STYLED_TEXT_FORMAT)
1111 		return B_NOT_SUPPORTED;
1112 
1113 	status_t status = source->Seek(0, SEEK_SET);
1114 	if (status < B_OK)
1115 		return status;
1116 
1117 	if (outType == B_STYLED_TEXT_FORMAT) {
1118 		// output styled text headers
1119 		status = output_headers(destination, (uint32)size);
1120 		if (status != B_OK)
1121 			return status;
1122 	}
1123 
1124 	class MallocBuffer {
1125 		public:
1126 			MallocBuffer() : fBuffer(NULL), fSize(0) {}
1127 			~MallocBuffer() { free(fBuffer); }
1128 
1129 			void* Buffer() { return fBuffer; }
1130 			size_t Size() const { return fSize; }
1131 
1132 			status_t
1133 			Allocate(size_t size)
1134 			{
1135 				fBuffer = malloc(size);
1136 				if (fBuffer != NULL) {
1137 					fSize = size;
1138 					return B_OK;
1139 				}
1140 				return B_NO_MEMORY;
1141 			}
1142 
1143 		private:
1144 			void*	fBuffer;
1145 			size_t	fSize;
1146 	} encodingBuffer;
1147 	BMallocIO encodingIO;
1148 	uint32 encodingID = 0;
1149 		// defaults to UTF-8 or no encoding
1150 
1151 	BNode* node = dynamic_cast<BNode*>(source);
1152 	if (node != NULL) {
1153 		// determine encoding, if available
1154 		const BCharacterSet* characterSet = NULL;
1155 		bool hasAttribute = false;
1156 		if (encoding != NULL && !forceEncoding) {
1157 			BString name;
1158 			if (node->ReadAttrString("be:encoding", &name) == B_OK) {
1159 				encoding = name.String();
1160 				hasAttribute = true;
1161 			} else {
1162 				int32 value;
1163 				ssize_t bytesRead = node->ReadAttr("be:encoding", B_INT32_TYPE, 0,
1164 					&value, sizeof(value));
1165 				if (bytesRead == (ssize_t)sizeof(value)) {
1166 					hasAttribute = true;
1167 					if (value != 65535)
1168 						characterSet = BCharacterSetRoster::GetCharacterSetByConversionID(value);
1169 				}
1170 			}
1171 		} else {
1172 			hasAttribute = true;
1173 				// we don't write the encoding in this case
1174 		}
1175 		if (characterSet == NULL && encoding != NULL)
1176 			characterSet = BCharacterSetRoster::FindCharacterSetByName(encoding);
1177 
1178 		if (characterSet != NULL) {
1179 			encodingID = characterSet->GetConversionID();
1180 			encodingBuffer.Allocate(READ_BUFFER_SIZE * 4);
1181 		}
1182 
1183 		if (!hasAttribute && encoding != NULL) {
1184 			// add encoding attribute, so that someone opening the file can
1185 			// retrieve it for persistance
1186 			node->WriteAttr("be:encoding", B_STRING_TYPE, 0, encoding,
1187 				strlen(encoding));
1188 		}
1189 	}
1190 
1191 	off_t outputSize = 0;
1192 	ssize_t bytesRead;
1193 	int32 state = 0;
1194 
1195 	// output the actual text part of the data
1196 	do {
1197 		uint8 buffer[READ_BUFFER_SIZE];
1198 		bytesRead = source->Read(buffer, READ_BUFFER_SIZE);
1199 		if (bytesRead < B_OK)
1200 			return bytesRead;
1201 		if (bytesRead == 0)
1202 			break;
1203 
1204 		if (encodingBuffer.Size() == 0) {
1205 			// default, no encoding
1206 			ssize_t bytesWritten = destination->Write(buffer, bytesRead);
1207 			if (bytesWritten != bytesRead) {
1208 				if (bytesWritten < B_OK)
1209 					return bytesWritten;
1210 
1211 				return B_ERROR;
1212 			}
1213 
1214 			outputSize += bytesRead;
1215 		} else {
1216 			// decode text file to UTF-8
1217 			char* pos = (char*)buffer;
1218 			int32 encodingLength = encodingIO.BufferLength();
1219 			int32 bytesLeft = bytesRead;
1220 			int32 bytes;
1221 			do {
1222 				encodingLength = READ_BUFFER_SIZE * 4;
1223 				bytes = bytesLeft;
1224 
1225 				status = convert_to_utf8(encodingID, pos, &bytes,
1226 					(char*)encodingBuffer.Buffer(), &encodingLength, &state);
1227 				if (status < B_OK)
1228 					return status;
1229 
1230 				ssize_t bytesWritten = destination->Write(encodingBuffer.Buffer(),
1231 					encodingLength);
1232 				if (bytesWritten < encodingLength) {
1233 					if (bytesWritten < B_OK)
1234 						return bytesWritten;
1235 
1236 					return B_ERROR;
1237 				}
1238 
1239 				pos += bytes;
1240 				bytesLeft -= bytes;
1241 				outputSize += encodingLength;
1242 			} while (encodingLength > 0 && bytesLeft > 0);
1243 		}
1244 	} while (bytesRead > 0);
1245 
1246 	if (outType != B_STYLED_TEXT_FORMAT)
1247 		return B_OK;
1248 
1249 	if (encodingBuffer.Size() != 0 && size != outputSize) {
1250 		if (outputSize > UINT32_MAX)
1251 			return B_NOT_SUPPORTED;
1252 
1253 		// we need to update the header as the decoded text size has changed
1254 		status = destination->Seek(0, SEEK_SET);
1255 		if (status == B_OK)
1256 			status = output_headers(destination, (uint32)outputSize);
1257 		if (status == B_OK)
1258 			status = destination->Seek(0, SEEK_END);
1259 
1260 		if (status < B_OK)
1261 			return status;
1262 	}
1263 
1264 	// Read file attributes if outputting styled data
1265 	// and source is a BNode object
1266 
1267 	if (node == NULL)
1268 		return B_OK;
1269 
1270 	// Try to read styles - we only propagate an error if the actual on-disk
1271 	// data is likely to be okay
1272 
1273 	const char *kAttrName = "styles";
1274 	attr_info info;
1275 	if (node->GetAttrInfo(kAttrName, &info) != B_OK)
1276 		return B_OK;
1277 
1278 	if (info.type != B_RAW_TYPE || info.size < 160) {
1279 		// styles seem to be broken, but since we got the text,
1280 		// we don't propagate the error
1281 		return B_OK;
1282 	}
1283 
1284 	uint8* flatRunArray = new (std::nothrow) uint8[info.size];
1285 	if (flatRunArray == NULL)
1286 		return B_NO_MEMORY;
1287 
1288 	bytesRead = node->ReadAttr(kAttrName, B_RAW_TYPE, 0, flatRunArray, info.size);
1289 	if (bytesRead != info.size)
1290 		return B_OK;
1291 
1292 	output_styles(destination, size, flatRunArray, info.size);
1293 
1294 	delete[] flatRunArray;
1295 	return B_OK;
1296 }
1297 
1298 
1299 //	#pragma mark -
1300 
1301 
1302 STXTTranslator::STXTTranslator()
1303 	: BaseTranslator("StyledEdit Files", "StyledEdit files translator",
1304 		STXT_TRANSLATOR_VERSION,
1305 		gInputFormats, sizeof(gInputFormats) / sizeof(translation_format),
1306 		gOutputFormats, sizeof(gOutputFormats) / sizeof(translation_format),
1307 		"STXTTranslator_Settings",
1308 		gDefaultSettings, sizeof(gDefaultSettings) / sizeof(TranSetting),
1309 		B_TRANSLATOR_TEXT, B_STYLED_TEXT_FORMAT)
1310 {
1311 }
1312 
1313 
1314 STXTTranslator::~STXTTranslator()
1315 {
1316 }
1317 
1318 
1319 status_t
1320 STXTTranslator::Identify(BPositionIO *inSource,
1321 	const translation_format *inFormat, BMessage *ioExtension,
1322 	translator_info *outInfo, uint32 outType)
1323 {
1324 	if (!outType)
1325 		outType = B_TRANSLATOR_TEXT;
1326 	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1327 		return B_NO_TRANSLATOR;
1328 
1329 	const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader);
1330 
1331 	uint8 buffer[DATA_BUFFER_SIZE];
1332 	status_t nread = 0;
1333 	// Read in the header to determine
1334 	// if the data is supported
1335 	nread = inSource->Read(buffer, kstxtsize);
1336 	if (nread < 0)
1337 		return nread;
1338 
1339 	// read in enough data to fill the stream header
1340 	if (nread == kstxtsize) {
1341 		TranslatorStyledTextStreamHeader header;
1342 		memcpy(&header, buffer, kstxtsize);
1343 		if (swap_data(B_UINT32_TYPE, &header, kstxtsize,
1344 				B_SWAP_BENDIAN_TO_HOST) != B_OK)
1345 			return B_ERROR;
1346 
1347 		if (header.header.magic == B_STYLED_TEXT_FORMAT
1348 			&& header.header.header_size == (int32)kstxtsize
1349 			&& header.header.data_size == 0
1350 			&& header.version == 100)
1351 			return identify_stxt_header(header, inSource, outInfo, outType);
1352 	}
1353 
1354 	// if the data is not styled text, check if it is plain text
1355 	const char* encoding;
1356 	return identify_text(buffer, nread, inSource, outInfo, outType, encoding);
1357 }
1358 
1359 
1360 status_t
1361 STXTTranslator::Translate(BPositionIO* source, const translator_info* info,
1362 	BMessage* ioExtension, uint32 outType, BPositionIO* outDestination)
1363 {
1364 	if (!outType)
1365 		outType = B_TRANSLATOR_TEXT;
1366 	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1367 		return B_NO_TRANSLATOR;
1368 
1369 	const ssize_t headerSize = sizeof(TranslatorStyledTextStreamHeader);
1370 	uint8 buffer[DATA_BUFFER_SIZE];
1371 	status_t result;
1372 	translator_info outInfo;
1373 	// Read in the header to determine
1374 	// if the data is supported
1375 	ssize_t bytesRead = source->Read(buffer, headerSize);
1376 	if (bytesRead < 0)
1377 		return bytesRead;
1378 
1379 	// read in enough data to fill the stream header
1380 	if (bytesRead == headerSize) {
1381 		TranslatorStyledTextStreamHeader header;
1382 		memcpy(&header, buffer, headerSize);
1383 		if (swap_data(B_UINT32_TYPE, &header, headerSize,
1384 				B_SWAP_BENDIAN_TO_HOST) != B_OK)
1385 			return B_ERROR;
1386 
1387 		if (header.header.magic == B_STYLED_TEXT_FORMAT
1388 			&& header.header.header_size == sizeof(TranslatorStyledTextStreamHeader)
1389 			&& header.header.data_size == 0
1390 			&& header.version == 100) {
1391 			TranslatorStyledTextTextHeader textHeader;
1392 			result = identify_stxt_header(header, source, &outInfo, outType,
1393 				&textHeader);
1394 			if (result != B_OK)
1395 				return result;
1396 
1397 			return translate_from_stxt(source, outDestination, outType, textHeader);
1398 		}
1399 	}
1400 
1401 	// if the data is not styled text, check if it is ASCII text
1402 	bool forceEncoding = false;
1403 	const char* encoding = NULL;
1404 	result = identify_text(buffer, bytesRead, source, &outInfo, outType, encoding);
1405 	if (result != B_OK)
1406 		return result;
1407 
1408 	if (ioExtension != NULL) {
1409 		const char* value;
1410 		if (ioExtension->FindString("be:encoding", &value) == B_OK
1411 			&& value[0]) {
1412 			// override encoding
1413 			encoding = value;
1414 			forceEncoding = true;
1415 		}
1416 	}
1417 
1418 	return translate_from_text(source, encoding, forceEncoding, outDestination, outType);
1419 }
1420 
1421 
1422 BView *
1423 STXTTranslator::NewConfigView(TranslatorSettings *settings)
1424 {
1425 	return new STXTView(BRect(0, 0, 225, 175), "STXTTranslator Settings",
1426 		B_FOLLOW_ALL, B_WILL_DRAW, settings);
1427 }
1428 
1429