xref: /haiku/src/add-ons/translators/stxt/STXTTranslator.cpp (revision 1acbe440b8dd798953bec31d18ee589aa3f71b73)
1 /*
2  * Copyright 2002-2007, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Michael Wilber
7  *		Axel Dörfler, axeld@pinc-software.de
8  */
9 
10 
11 #include "STXTTranslator.h"
12 #include "STXTView.h"
13 
14 #include <CharacterSet.h>
15 #include <CharacterSetRoster.h>
16 #include <MimeType.h>
17 #include <String.h>
18 #include <UTF8.h>
19 
20 #include <new>
21 #include <string.h>
22 #include <stdio.h>
23 #include <stdint.h>
24 
25 
26 using namespace BPrivate;
27 
28 
// Size in bytes of the stack buffer translate_from_stxt() uses to copy data
// from the input stream to the output stream.
#define READ_BUFFER_SIZE 32768
// Upper bound on the number of bytes identify_text() gathers in its data
// buffer before sniffing the content.
#define DATA_BUFFER_SIZE 256
31 
// The input formats that this translator supports.
// Each entry is presumably laid out as type, group, quality, capability,
// MIME type, human readable name -- TODO confirm against the
// translation_format declaration. The second entry carries the same values
// that identify_stxt_header() reports through its translator_info.
translation_format gInputFormats[] = {
	// plain text
	{
		B_TRANSLATOR_TEXT,
		B_TRANSLATOR_TEXT,
		TEXT_IN_QUALITY,
		TEXT_IN_CAPABILITY,
		"text/plain",
		"Plain text file"
	},
	// Be styled text (STXT)
	{
		B_STYLED_TEXT_FORMAT,
		B_TRANSLATOR_TEXT,
		STXT_IN_QUALITY,
		STXT_IN_CAPABILITY,
		"text/x-vnd.Be-stxt",
		"Be styled text file"
	}
};
51 
// The output formats that this translator supports.
// Same two formats as the input table, but with the *_OUT_* quality and
// capability constants. Entry layout is presumably type, group, quality,
// capability, MIME type, human readable name -- TODO confirm against the
// translation_format declaration.
translation_format gOutputFormats[] = {
	// plain text
	{
		B_TRANSLATOR_TEXT,
		B_TRANSLATOR_TEXT,
		TEXT_OUT_QUALITY,
		TEXT_OUT_CAPABILITY,
		"text/plain",
		"Plain text file"
	},
	// Be styled text (STXT)
	{
		B_STYLED_TEXT_FORMAT,
		B_TRANSLATOR_TEXT,
		STXT_OUT_QUALITY,
		STXT_OUT_CAPABILITY,
		"text/x-vnd.Be-stxt",
		"Be styled text file"
	}
};
71 
// Default settings for the Translator
// Both boolean settings (header-only and data-only) start out as false.
// Entry layout is presumably name, setting type, default value -- TODO
// confirm against the TranSetting declaration.
TranSetting gDefaultSettings[] = {
	{B_TRANSLATOR_EXT_HEADER_ONLY, TRAN_SETTING_BOOL, false},
	{B_TRANSLATOR_EXT_DATA_ONLY, TRAN_SETTING_BOOL, false}
};
77 
78 // ---------------------------------------------------------------
79 // make_nth_translator
80 //
81 // Creates a STXTTranslator object to be used by BTranslatorRoster
82 //
83 // Preconditions:
84 //
85 // Parameters: n,		The translator to return. Since
86 //						STXTTranslator only publishes one
87 //						translator, it only returns a
88 //						STXTTranslator if n == 0
89 //
90 //             you, 	The image_id of the add-on that
91 //						contains code (not used).
92 //
93 //             flags,	Has no meaning yet, should be 0.
94 //
95 // Postconditions:
96 //
97 // Returns: NULL if n is not zero,
98 //          a new STXTTranslator if n is zero
99 // ---------------------------------------------------------------
100 BTranslator *
101 make_nth_translator(int32 n, image_id you, uint32 flags, ...)
102 {
103 	if (!n)
104 		return new (std::nothrow) STXTTranslator();
105 
106 	return NULL;
107 }
108 
109 
110 // #pragma mark - ascmagic.c from the BSD file tool
111 /*
112  * The following code has been taken from version 4.17 of the BSD file tool,
113  * file ascmagic.c, modified for our purpose.
114  */
115 
116 /*
117  * Copyright (c) Ian F. Darwin 1986-1995.
118  * Software written by Ian F. Darwin and others;
119  * maintained 1995-present by Christos Zoulas and others.
120  *
121  * Redistribution and use in source and binary forms, with or without
122  * modification, are permitted provided that the following conditions
123  * are met:
124  * 1. Redistributions of source code must retain the above copyright
125  *    notice immediately at the beginning of the file, without modification,
126  *    this list of conditions, and the following disclaimer.
127  * 2. Redistributions in binary form must reproduce the above copyright
128  *    notice, this list of conditions and the following disclaimer in the
129  *    documentation and/or other materials provided with the distribution.
130  *
131  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
132  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
133  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
134  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
135  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
136  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
137  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
138  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
139  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
140  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
141  * SUCH DAMAGE.
142  */
143 /*
144  * ASCII magic -- file types that we know based on keywords
145  * that can appear anywhere in the file.
146  *
147  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
148  * to handle character codes other than ASCII on a unified basis.
149  *
150  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
151  * international characters, now subsumed into this file.
152  */
153 
154 #include <stdio.h>
155 #include <string.h>
156 #include <memory.h>
157 #include <ctype.h>
158 #include <stdlib.h>
159 #include <unistd.h>
160 #include "names.h"
161 
/* One decoded character; the looks_*() tests store one of these per character. */
typedef unsigned long my_unichar;

#define MAXLINELEN 300	/* longest sane line length */
/* True for blank, tab, CR, LF, form feed and the 0x85 "next line" control. */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
		  || (x) == 0x85 || (x) == '\f')

/*
 * Character set probes: each takes (input bytes, byte count, output unichar
 * buffer, out: number of unichars written) and returns non-zero on a match.
 */
static int looks_ascii(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_utf8(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_unicode(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_latin1(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_extended(const unsigned char *, size_t, my_unichar *, size_t *);
/* Byte-wise EBCDIC to ASCII translation of a buffer. */
static void from_ebcdic(const unsigned char *, size_t, unsigned char *);
/* Compares a NUL-terminated token against a run of unichars. */
static int ascmatch(const unsigned char *, const my_unichar *, size_t);
175 
176 
177 static int
178 file_ascmagic(const unsigned char *buf, size_t nbytes, BMimeType* mimeType,
179 	const char*& encoding)
180 {
181 	size_t i;
182 	unsigned char *nbuf = NULL;
183 	my_unichar *ubuf = NULL;
184 	size_t ulen;
185 	struct names *p;
186 	int rv = -1;
187 
188 	const char *code = NULL;
189 	encoding = NULL;
190 	const char *type = NULL;
191 	const char *subtype = NULL;
192 	const char *subtype_mime = NULL;
193 
194 	int has_escapes = 0;
195 	int has_backspace = 0;
196 	int seen_cr = 0;
197 
198 	int n_crlf = 0;
199 	int n_lf = 0;
200 	int n_cr = 0;
201 	int n_nel = 0;
202 
203 	int last_line_end = -1;
204 	int has_long_lines = 0;
205 
206 	if ((nbuf = (unsigned char*)malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL)
207 		goto done;
208 	if ((ubuf = (my_unichar*)malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL)
209 		goto done;
210 
211 	/*
212 	 * Then try to determine whether it's any character code we can
213 	 * identify.  Each of these tests, if it succeeds, will leave
214 	 * the text converted into one-my_unichar-per-character Unicode in
215 	 * ubuf, and the number of characters converted in ulen.
216 	 */
217 	if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
218 		code = "ASCII";
219 		encoding = NULL; //"us-ascii";
220 		type = "text";
221 	} else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
222 		code = "UTF-8 Unicode";
223 		encoding = NULL; // "UTF-8";
224 		type = "text";
225 	} else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
226 		if (i == 1) {
227 			code = "Little-endian UTF-16 Unicode";
228 			encoding = "UTF-16";
229 		} else {
230 			code = "Big-endian UTF-16 Unicode";
231 			encoding = "UTF-16";
232 		}
233 
234 		type = "character data";
235 	} else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
236 		code = "ISO-8859";
237 		type = "text";
238 		encoding = "iso-8859-1";
239 	} else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
240 		code = "Non-ISO extended-ASCII";
241 		type = "text";
242 		encoding = "unknown";
243 	} else {
244 		from_ebcdic(buf, nbytes, nbuf);
245 
246 		if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
247 			code = "EBCDIC";
248 			type = "character data";
249 			encoding = "ebcdic";
250 		} else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
251 			code = "International EBCDIC";
252 			type = "character data";
253 			encoding = "ebcdic";
254 		} else {
255 			rv = 0;
256 			goto done;  /* doesn't look like text at all */
257 		}
258 	}
259 
260 	if (nbytes <= 1) {
261 		rv = 0;
262 		goto done;
263 	}
264 
265 	/*
266 	 * for troff, look for . + letter + letter or .\";
267 	 * this must be done to disambiguate tar archives' ./file
268 	 * and other trash from real troff input.
269 	 *
270 	 * I believe Plan 9 troff allows non-ASCII characters in the names
271 	 * of macros, so this test might possibly fail on such a file.
272 	 */
273 	if (*ubuf == '.') {
274 		my_unichar *tp = ubuf + 1;
275 
276 		while (ISSPC(*tp))
277 			++tp;	/* skip leading whitespace */
278 		if ((tp[0] == '\\' && tp[1] == '\"') ||
279 		    (isascii((unsigned char)tp[0]) &&
280 		     isalnum((unsigned char)tp[0]) &&
281 		     isascii((unsigned char)tp[1]) &&
282 		     isalnum((unsigned char)tp[1]) &&
283 		     ISSPC(tp[2]))) {
284 			subtype_mime = "text/troff";
285 			subtype = "troff or preprocessor input";
286 			goto subtype_identified;
287 		}
288 	}
289 
290 	if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
291 		subtype_mime = "text/fortran";
292 		subtype = "fortran program";
293 		goto subtype_identified;
294 	}
295 
296 	/* look for tokens from names.h - this is expensive! */
297 
298 	i = 0;
299 	while (i < ulen) {
300 		size_t end;
301 
302 		/*
303 		 * skip past any leading space
304 		 */
305 		while (i < ulen && ISSPC(ubuf[i]))
306 			i++;
307 		if (i >= ulen)
308 			break;
309 
310 		/*
311 		 * find the next whitespace
312 		 */
313 		for (end = i + 1; end < nbytes; end++)
314 			if (ISSPC(ubuf[end]))
315 				break;
316 
317 		/*
318 		 * compare the word thus isolated against the token list
319 		 */
320 		for (p = names; p < names + NNAMES; p++) {
321 			if (ascmatch((const unsigned char *)p->name, ubuf + i,
322 			    end - i)) {
323 				subtype = types[p->type].human;
324 				subtype_mime = types[p->type].mime;
325 				goto subtype_identified;
326 			}
327 		}
328 
329 		i = end;
330 	}
331 
332 subtype_identified:
333 
334 	/*
335 	 * Now try to discover other details about the file.
336 	 */
337 	for (i = 0; i < ulen; i++) {
338 		if (ubuf[i] == '\n') {
339 			if (seen_cr)
340 				n_crlf++;
341 			else
342 				n_lf++;
343 			last_line_end = i;
344 		} else if (seen_cr)
345 			n_cr++;
346 
347 		seen_cr = (ubuf[i] == '\r');
348 		if (seen_cr)
349 			last_line_end = i;
350 
351 		if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
352 			n_nel++;
353 			last_line_end = i;
354 		}
355 
356 		/* If this line is _longer_ than MAXLINELEN, remember it. */
357 		if ((int)i > last_line_end + MAXLINELEN)
358 			has_long_lines = 1;
359 
360 		if (ubuf[i] == '\033')
361 			has_escapes = 1;
362 		if (ubuf[i] == '\b')
363 			has_backspace = 1;
364 	}
365 
366 	rv = 1;
367 done:
368 	if (nbuf)
369 		free(nbuf);
370 	if (ubuf)
371 		free(ubuf);
372 
373 	if (rv) {
374 		// If we have identified the subtype, return it, otherwise just
375 		// text/plain.
376 		if (subtype_mime)
377 			mimeType->SetTo(subtype_mime);
378 		else
379 			mimeType->SetTo("text/plain");
380 	}
381 
382 	return rv;
383 }
384 
385 static int
386 ascmatch(const unsigned char *s, const my_unichar *us, size_t ulen)
387 {
388 	size_t i;
389 
390 	for (i = 0; i < ulen; i++) {
391 		if (s[i] != us[i])
392 			return 0;
393 	}
394 
395 	if (s[i])
396 		return 0;
397 	else
398 		return 1;
399 }
400 
401 /*
402  * This table reflects a particular philosophy about what constitutes
403  * "text," and there is room for disagreement about it.
404  *
405  * Version 3.31 of the file command considered a file to be ASCII if
406  * each of its characters was approved by either the isascii() or
407  * isalpha() function.  On most systems, this would mean that any
408  * file consisting only of characters in the range 0x00 ... 0x7F
409  * would be called ASCII text, but many systems might reasonably
410  * consider some characters outside this range to be alphabetic,
411  * so the file command would call such characters ASCII.  It might
412  * have been more accurate to call this "considered textual on the
413  * local system" than "ASCII."
414  *
415  * It considered a file to be "International language text" if each
416  * of its characters was either an ASCII printing character (according
417  * to the real ASCII standard, not the above test), a character in
418  * the range 0x80 ... 0xFF, or one of the following control characters:
419  * backspace, tab, line feed, vertical tab, form feed, carriage return,
420  * escape.  No attempt was made to determine the language in which files
421  * of this type were written.
422  *
423  *
424  * The table below considers a file to be ASCII if all of its characters
425  * are either ASCII printing characters (again, according to the X3.4
426  * standard, not isascii()) or any of the following controls: bell,
427  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
428  *
429  * I include bell because some programs (particularly shell scripts)
430  * use it literally, even though it is rare in normal text.  I exclude
431  * vertical tab because it never seems to be used in real text.  I also
432  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
433  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
434  * character to.  It might be more appropriate to include it in the 8859
435  * set instead of the ASCII set, but it's got to be included in *something*
436  * we recognize or EBCDIC files aren't going to be considered textual.
437  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
438  * and Latin characters, so these should possibly be allowed.  But they
439  * make a real mess on VT100-style displays if they're not paired properly,
440  * so we are probably better off not calling them text.
441  *
442  * A file is considered to be ISO-8859 text if its characters are all
443  * either ASCII, according to the above definition, or printing characters
444  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
445  *
446  * Finally, a file is considered to be international text from some other
447  * character code if its characters are all either ISO-8859 (according to
448  * the above definition) or characters in the range 0x80 ... 0x9F, which
449  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
450  * consider to be printing characters.
451  */
452 
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Classification of every possible byte value, indexed by the byte itself.
 * The table is only ever read, hence const.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
        /*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
479 
480 static int
481 looks_ascii(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
482     size_t *ulen)
483 {
484 	int i;
485 
486 	*ulen = 0;
487 
488 	for (i = 0; i < (int)nbytes; i++) {
489 		int t = text_chars[buf[i]];
490 
491 		if (t != T)
492 			return 0;
493 
494 		ubuf[(*ulen)++] = buf[i];
495 	}
496 
497 	return 1;
498 }
499 
500 static int
501 looks_latin1(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
502 {
503 	int i;
504 
505 	*ulen = 0;
506 
507 	for (i = 0; i < (int)nbytes; i++) {
508 		int t = text_chars[buf[i]];
509 
510 		if (t != T && t != I)
511 			return 0;
512 
513 		ubuf[(*ulen)++] = buf[i];
514 	}
515 
516 	return 1;
517 }
518 
519 static int
520 looks_extended(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
521     size_t *ulen)
522 {
523 	int i;
524 
525 	*ulen = 0;
526 
527 	for (i = 0; i < (int)nbytes; i++) {
528 		int t = text_chars[buf[i]];
529 
530 		if (t != T && t != I && t != X)
531 			return 0;
532 
533 		ubuf[(*ulen)++] = buf[i];
534 	}
535 
536 	return 1;
537 }
538 
539 static int
540 looks_utf8(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
541 {
542 	int i, n;
543 	my_unichar c;
544 	int gotone = 0;
545 
546 	*ulen = 0;
547 
548 	for (i = 0; i < (int)nbytes; i++) {
549 		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
550 			/*
551 			 * Even if the whole file is valid UTF-8 sequences,
552 			 * still reject it if it uses weird control characters.
553 			 */
554 
555 			if (text_chars[buf[i]] != T)
556 				return 0;
557 
558 			ubuf[(*ulen)++] = buf[i];
559 		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
560 			return 0;
561 		} else {			   /* 11xxxxxx begins UTF-8 */
562 			int following;
563 
564 			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
565 				c = buf[i] & 0x1f;
566 				following = 1;
567 			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
568 				c = buf[i] & 0x0f;
569 				following = 2;
570 			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
571 				c = buf[i] & 0x07;
572 				following = 3;
573 			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
574 				c = buf[i] & 0x03;
575 				following = 4;
576 			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
577 				c = buf[i] & 0x01;
578 				following = 5;
579 			} else
580 				return 0;
581 
582 			for (n = 0; n < following; n++) {
583 				i++;
584 				if (i >= (int)nbytes)
585 					goto done;
586 
587 				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
588 					return 0;
589 
590 				c = (c << 6) + (buf[i] & 0x3f);
591 			}
592 
593 			ubuf[(*ulen)++] = c;
594 			gotone = 1;
595 		}
596 	}
597 done:
598 	return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
599 }
600 
601 static int
602 looks_unicode(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
603     size_t *ulen)
604 {
605 	int bigend;
606 	int i;
607 
608 	if (nbytes < 2)
609 		return 0;
610 
611 	if (buf[0] == 0xff && buf[1] == 0xfe)
612 		bigend = 0;
613 	else if (buf[0] == 0xfe && buf[1] == 0xff)
614 		bigend = 1;
615 	else
616 		return 0;
617 
618 	*ulen = 0;
619 
620 	for (i = 2; i + 1 < (int)nbytes; i += 2) {
621 		/* XXX fix to properly handle chars > 65536 */
622 
623 		if (bigend)
624 			ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
625 		else
626 			ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
627 
628 		if (ubuf[*ulen - 1] == 0xfffe)
629 			return 0;
630 		if (ubuf[*ulen - 1] < 128 &&
631 		    text_chars[(size_t)ubuf[*ulen - 1]] != T)
632 			return 0;
633 	}
634 
635 	return 1 + bigend;
636 }
637 
638 #undef F
639 #undef T
640 #undef I
641 #undef X
642 
643 /*
644  * This table maps each EBCDIC character to an (8-bit extended) ASCII
645  * character, as specified in the rationale for the dd(1) command in
646  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
647  *
648  * Unfortunately it does not seem to correspond exactly to any of the
649  * five variants of EBCDIC documented in IBM's _Enterprise Systems
650  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
651  * Edition, July, 1999, pp. I-1 - I-4.
652  *
653  * Fortunately, though, all versions of EBCDIC, including this one, agree
654  * on most of the printing characters that also appear in (7-bit) ASCII.
655  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
656  *
657  * Fortunately too, there is general agreement that codes 0x00 through
658  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
659  * remainder printing characters.
660  *
661  * This is sufficient to allow us to identify EBCDIC text and to distinguish
662  * between old-style and internationalized examples of text.
663  */
664 
/*
 * Indexed by the EBCDIC byte, yields the (8-bit extended) ASCII byte.
 * The table is only ever read, hence const.
 */
static const unsigned char ebcdic_to_ascii[] = {
  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};
683 
/*
 * Everything up to the matching #endif is compiled only when "notdef" is
 * defined; none of the code visible in this part of the file references
 * the table.
 */
#ifdef notdef
/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality.  It comes from
 *
 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 */

static unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif
718 
719 /*
720  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
721  */
722 static void
723 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
724 {
725 	int i;
726 
727 	for (i = 0; i < (int)nbytes; i++) {
728 		out[i] = ebcdic_to_ascii[buf[i]];
729 	}
730 }
731 
732 
733 //	#pragma mark -
734 
735 
736 /*!
737 	Determines if the data in inSource is of the STXT format.
738 
739 	\param header the STXT stream header read in by Identify() or Translate()
740 	\param inSource the stream with the STXT data
741 	\param outInfo information about the type of data from inSource is stored here
742 	\param outType the desired output type for the data in inSource
743 	\param ptxtheader if this is not NULL, the TEXT header from
744 		inSource is copied to it
745 */
746 status_t
747 identify_stxt_header(const TranslatorStyledTextStreamHeader &header,
748 	BPositionIO *inSource, translator_info *outInfo, uint32 outType,
749 	TranslatorStyledTextTextHeader *ptxtheader = NULL)
750 {
751 	const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader);
752 	const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader);
753 
754 	uint8 buffer[max(ktxtsize, kstylsize)];
755 
756 	// Check the TEXT header
757 	TranslatorStyledTextTextHeader txtheader;
758 	if (inSource->Read(buffer, ktxtsize) != ktxtsize)
759 		return B_NO_TRANSLATOR;
760 
761 	memcpy(&txtheader, buffer, ktxtsize);
762 	if (swap_data(B_UINT32_TYPE, &txtheader, ktxtsize,
763 		B_SWAP_BENDIAN_TO_HOST) != B_OK)
764 		return B_ERROR;
765 
766 	if (txtheader.header.magic != 'TEXT'
767 		|| txtheader.header.header_size != sizeof(TranslatorStyledTextTextHeader)
768 		|| txtheader.charset != B_UNICODE_UTF8)
769 		return B_NO_TRANSLATOR;
770 
771 	// skip the text data
772 	off_t seekresult, pos;
773 	pos = header.header.header_size + txtheader.header.header_size
774 		+ txtheader.header.data_size;
775 	seekresult = inSource->Seek(txtheader.header.data_size,
776 		SEEK_CUR);
777 	if (seekresult < pos)
778 		return B_NO_TRANSLATOR;
779 	if (seekresult > pos)
780 		return B_ERROR;
781 
782 	// check the STYL header (not all STXT files have this)
783 	ssize_t read = 0;
784 	TranslatorStyledTextStyleHeader stylheader;
785 	read = inSource->Read(buffer, kstylsize);
786 	if (read < 0)
787 		return read;
788 	if (read != kstylsize && read != 0)
789 		return B_NO_TRANSLATOR;
790 
791 	// If there is a STYL header
792 	if (read == kstylsize) {
793 		memcpy(&stylheader, buffer, kstylsize);
794 		if (swap_data(B_UINT32_TYPE, &stylheader, kstylsize,
795 			B_SWAP_BENDIAN_TO_HOST) != B_OK)
796 			return B_ERROR;
797 
798 		if (stylheader.header.magic != 'STYL'
799 			|| stylheader.header.header_size !=
800 				sizeof(TranslatorStyledTextStyleHeader))
801 			return B_NO_TRANSLATOR;
802 	}
803 
804 	// if output TEXT header is supplied, fill it with data
805 	if (ptxtheader) {
806 		ptxtheader->header.magic = txtheader.header.magic;
807 		ptxtheader->header.header_size = txtheader.header.header_size;
808 		ptxtheader->header.data_size = txtheader.header.data_size;
809 		ptxtheader->charset = txtheader.charset;
810 	}
811 
812 	// return information about the data in the stream
813 	outInfo->type = B_STYLED_TEXT_FORMAT;
814 	outInfo->group = B_TRANSLATOR_TEXT;
815 	outInfo->quality = STXT_IN_QUALITY;
816 	outInfo->capability = STXT_IN_CAPABILITY;
817 	strcpy(outInfo->name, "Be styled text file");
818 	strcpy(outInfo->MIME, "text/x-vnd.Be-stxt");
819 
820 	return B_OK;
821 }
822 
823 
824 /*!
825 	Determines if the data in \a inSource is of the UTF8 plain
826 
827 	\param data buffer containing data already read (must be at
828 		least DATA_BUFFER_SIZE bytes large)
829 	\param nread number of bytes that have already been read from the stream
830 	\param header the STXT stream header read in by Identify() or Translate()
831 	\param inSource the stream with the STXT data
832 	\param outInfo information about the type of data from inSource is stored here
833 	\param outType the desired output type for the data in inSource
834 */
835 status_t
836 identify_text(uint8* data, int32 bytesRead, BPositionIO* source,
837 	translator_info* outInfo, uint32 outType, const char*& encoding)
838 {
839 	ssize_t readLater = source->Read(data + bytesRead, DATA_BUFFER_SIZE - bytesRead);
840 	if (readLater < B_OK)
841 		return B_NO_TRANSLATOR;
842 
843 	bytesRead += readLater;
844 
845 	// TODO: identify encoding as possible!
846 	BMimeType type;
847 	if (!file_ascmagic((const unsigned char*)data, bytesRead, &type, encoding))
848 		return B_NO_TRANSLATOR;
849 
850 	float capability = TEXT_IN_CAPABILITY;
851 	if (bytesRead < 20)
852 		capability = .1f;
853 
854 	// return information about the data in the stream
855 	outInfo->type = B_TRANSLATOR_TEXT;
856 	outInfo->group = B_TRANSLATOR_TEXT;
857 	outInfo->quality = TEXT_IN_QUALITY;
858 	outInfo->capability = capability;
859 
860 	char description[B_MIME_TYPE_LENGTH];
861 	if (type.GetLongDescription(description) == B_OK)
862 		strlcpy(outInfo->name, description, sizeof(outInfo->name));
863 	else
864 		strlcpy(outInfo->name, "Plain text file", sizeof(outInfo->name));
865 
866 	//strlcpy(outInfo->MIME, type.Type(), sizeof(outInfo->MIME));
867 	strcpy(outInfo->MIME, "text/plain");
868 	return B_OK;
869 }
870 
871 
872 // ---------------------------------------------------------------
873 // translate_from_stxt
874 //
875 // Translates the data in inSource to the type outType and stores
876 // the translated data in outDestination.
877 //
878 // Preconditions:
879 //
880 // Parameters:	inSource,	the data to be translated
881 //
882 //				outDestination,	where the translated data is
883 //								put
884 //
885 //				outType,	the type to convert inSource to
886 //
887 //				txtheader, 	the TEXT header from inSource
888 //
889 //
890 // Postconditions:
891 //
892 // Returns: B_BAD_VALUE, if outType is invalid
893 //
894 // B_NO_TRANSLATOR, if this translator doesn't understand the data
895 //
896 // B_ERROR, if there was an error allocating memory or converting
897 //          data
898 //
899 // B_OK, if all went well
900 // ---------------------------------------------------------------
901 status_t
902 translate_from_stxt(BPositionIO *inSource, BPositionIO *outDestination,
903 		uint32 outType, const TranslatorStyledTextTextHeader &txtheader)
904 {
905 	if (inSource->Seek(0, SEEK_SET) != 0)
906 		return B_ERROR;
907 
908 	const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader);
909 	const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader);
910 
911 	bool btoplain;
912 	if (outType == B_TRANSLATOR_TEXT)
913 		btoplain = true;
914 	else if (outType == B_STYLED_TEXT_FORMAT)
915 		btoplain = false;
916 	else
917 		return B_BAD_VALUE;
918 
919 	uint8 buffer[READ_BUFFER_SIZE];
920 	ssize_t nread = 0, nwritten = 0, nreed = 0, ntotalread = 0;
921 
922 	// skip to the actual text data when outputting a
923 	// plain text file
924 	if (btoplain) {
925 		if (inSource->Seek(kstxtsize + ktxtsize, SEEK_CUR) !=
926 			kstxtsize + ktxtsize)
927 			return B_ERROR;
928 	}
929 
930 	// Read data from inSource
931 	// When outputing B_TRANSLATOR_TEXT, the loop stops when all of
932 	// the text data has been read and written.
933 	// When outputting B_STYLED_TEXT_FORMAT, the loop stops when all
934 	// of the data from inSource has been read and written.
935 	if (btoplain)
936 		nreed = min(READ_BUFFER_SIZE,
937 			txtheader.header.data_size - ntotalread);
938 	else
939 		nreed = READ_BUFFER_SIZE;
940 	nread = inSource->Read(buffer, nreed);
941 	while (nread > 0) {
942 		nwritten = outDestination->Write(buffer, nread);
943 		if (nwritten != nread)
944 			return B_ERROR;
945 
946 		if (btoplain) {
947 			ntotalread += nread;
948 			nreed = min(READ_BUFFER_SIZE,
949 				txtheader.header.data_size - ntotalread);
950 		} else
951 			nreed = READ_BUFFER_SIZE;
952 		nread = inSource->Read(buffer, nreed);
953 	}
954 
955 	if (btoplain && static_cast<ssize_t>(txtheader.header.data_size) !=
956 		ntotalread)
957 		// If not all of the text data was able to be read...
958 		return B_NO_TRANSLATOR;
959 	else
960 		return B_OK;
961 }
962 
963 // ---------------------------------------------------------------
964 // output_headers
965 //
966 // Outputs the Stream and Text headers from the B_STYLED_TEXT_FORMAT
967 // to outDestination, setting the data_size member of the text header
968 // to text_data_size
969 //
970 // Preconditions:
971 //
972 // Parameters:	outDestination,	where the translated data is
973 //								put
974 //
975 //				text_data_size, number of bytes in data section
976 //							    of the TEXT header
977 //
978 //
979 // Postconditions:
980 //
981 // Returns:
982 //
983 // B_ERROR, if there was an error writing to outDestination or
984 // 	an error with converting the byte order
985 //
986 // B_OK, if all went well
987 // ---------------------------------------------------------------
988 status_t
989 output_headers(BPositionIO *outDestination, uint32 text_data_size)
990 {
991 	const int32 kHeadersSize = sizeof(TranslatorStyledTextStreamHeader) +
992 		sizeof(TranslatorStyledTextTextHeader);
993 	status_t result;
994 	TranslatorStyledTextStreamHeader stxtheader;
995 	TranslatorStyledTextTextHeader txtheader;
996 
997 	uint8 buffer[kHeadersSize];
998 
999 	stxtheader.header.magic = 'STXT';
1000 	stxtheader.header.header_size = sizeof(TranslatorStyledTextStreamHeader);
1001 	stxtheader.header.data_size = 0;
1002 	stxtheader.version = 100;
1003 	memcpy(buffer, &stxtheader, stxtheader.header.header_size);
1004 
1005 	txtheader.header.magic = 'TEXT';
1006 	txtheader.header.header_size = sizeof(TranslatorStyledTextTextHeader);
1007 	txtheader.header.data_size = text_data_size;
1008 	txtheader.charset = B_UNICODE_UTF8;
1009 	memcpy(buffer + stxtheader.header.header_size, &txtheader,
1010 		txtheader.header.header_size);
1011 
1012 	// write out headers in Big Endian byte order
1013 	result = swap_data(B_UINT32_TYPE, buffer, kHeadersSize,
1014 		B_SWAP_HOST_TO_BENDIAN);
1015 	if (result == B_OK) {
1016 		ssize_t nwritten = 0;
1017 		nwritten = outDestination->Write(buffer, kHeadersSize);
1018 		if (nwritten != kHeadersSize)
1019 			return B_ERROR;
1020 		else
1021 			return B_OK;
1022 	}
1023 
1024 	return result;
1025 }
1026 
1027 // ---------------------------------------------------------------
1028 // output_styles
1029 //
1030 // Writes out the actual style information into outDestination
1031 // using the data from pflatRunArray
1032 //
1033 // Preconditions:
1034 //
1035 // Parameters:	outDestination,	where the translated data is
1036 //								put
1037 //
1038 //				text_size,		size in bytes of the text in
1039 //								outDestination
1040 //
1041 //				data_size,		size of pflatRunArray
1042 //
1043 // Postconditions:
1044 //
1045 // Returns:
1046 //
1047 // B_ERROR, if there was an error writing to outDestination or
1048 // 	an error with converting the byte order
1049 //
1050 // B_OK, if all went well
1051 // ---------------------------------------------------------------
1052 status_t
1053 output_styles(BPositionIO *outDestination, uint32 text_size,
1054 	uint8 *pflatRunArray, ssize_t data_size)
1055 {
1056 	const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader);
1057 
1058 	uint8 buffer[kstylsize];
1059 
1060 	// output STYL header
1061 	TranslatorStyledTextStyleHeader stylheader;
1062 	stylheader.header.magic = 'STYL';
1063 	stylheader.header.header_size =
1064 		sizeof(TranslatorStyledTextStyleHeader);
1065 	stylheader.header.data_size = data_size;
1066 	stylheader.apply_offset = 0;
1067 	stylheader.apply_length = text_size;
1068 
1069 	memcpy(buffer, &stylheader, kstylsize);
1070 	if (swap_data(B_UINT32_TYPE, buffer, kstylsize,
1071 		B_SWAP_HOST_TO_BENDIAN) != B_OK)
1072 		return B_ERROR;
1073 	if (outDestination->Write(buffer, kstylsize) != kstylsize)
1074 		return B_ERROR;
1075 
1076 	// output actual style information
1077 	if (outDestination->Write(pflatRunArray,
1078 		data_size) != data_size)
1079 		return B_ERROR;
1080 
1081 	return B_OK;
1082 }
1083 
1084 
1085 /*!
1086 	Convert the plain text (UTF8) from inSource to plain or
1087 	styled text in outDestination
1088 */
1089 status_t
1090 translate_from_text(BPositionIO* source, const char* encoding, bool forceEncoding,
1091 	BPositionIO* destination, uint32 outType)
1092 {
1093 	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1094 		return B_BAD_VALUE;
1095 
1096 	// find the length of the text
1097 	off_t size = source->Seek(0, SEEK_END);
1098 	if (size < 0)
1099 		return (status_t)size;
1100 	if (size > UINT32_MAX && outType == B_STYLED_TEXT_FORMAT)
1101 		return B_NOT_SUPPORTED;
1102 
1103 	status_t status = source->Seek(0, SEEK_SET);
1104 	if (status < B_OK)
1105 		return status;
1106 
1107 	if (outType == B_STYLED_TEXT_FORMAT) {
1108 		// output styled text headers
1109 		status = output_headers(destination, (uint32)size);
1110 		if (status != B_OK)
1111 			return status;
1112 	}
1113 
1114 	class MallocBuffer {
1115 		public:
1116 			MallocBuffer() : fBuffer(NULL), fSize(0) {}
1117 			~MallocBuffer() { free(fBuffer); }
1118 
1119 			void* Buffer() { return fBuffer; }
1120 			size_t Size() const { return fSize; }
1121 
1122 			status_t
1123 			Allocate(size_t size)
1124 			{
1125 				fBuffer = malloc(size);
1126 				if (fBuffer != NULL) {
1127 					fSize = size;
1128 					return B_OK;
1129 				}
1130 				return B_NO_MEMORY;
1131 			}
1132 
1133 		private:
1134 			void*	fBuffer;
1135 			size_t	fSize;
1136 	} encodingBuffer;
1137 	BMallocIO encodingIO;
1138 	uint32 encodingID = 0;
1139 		// defaults to UTF-8 or no encoding
1140 
1141 	BNode* node = dynamic_cast<BNode*>(source);
1142 	if (node != NULL) {
1143 		// determine encoding, if available
1144 		const BCharacterSet* characterSet = NULL;
1145 		bool hasAttribute = false;
1146 		if (encoding != NULL && !forceEncoding) {
1147 			BString name;
1148 			if (node->ReadAttrString("be:encoding", &name) == B_OK) {
1149 				encoding = name.String();
1150 				hasAttribute = true;
1151 			} else {
1152 				int32 value;
1153 				ssize_t bytesRead = node->ReadAttr("be:encoding", B_INT32_TYPE, 0,
1154 					&value, sizeof(value));
1155 				if (bytesRead == (ssize_t)sizeof(value)) {
1156 					hasAttribute = true;
1157 					if (value != 65535)
1158 						characterSet = BCharacterSetRoster::GetCharacterSetByConversionID(value);
1159 				}
1160 			}
1161 		} else {
1162 			hasAttribute = true;
1163 				// we don't write the encoding in this case
1164 		}
1165 		if (characterSet == NULL && encoding != NULL)
1166 			characterSet = BCharacterSetRoster::FindCharacterSetByName(encoding);
1167 
1168 		if (characterSet != NULL) {
1169 			encodingID = characterSet->GetConversionID();
1170 			encodingBuffer.Allocate(READ_BUFFER_SIZE * 4);
1171 		}
1172 
1173 		if (!hasAttribute && encoding != NULL) {
1174 			// add encoding attribute, so that someone opening the file can
1175 			// retrieve it for persistance
1176 			node->WriteAttr("be:encoding", B_STRING_TYPE, 0, encoding,
1177 				strlen(encoding));
1178 		}
1179 	}
1180 
1181 	off_t outputSize = 0;
1182 	ssize_t bytesRead;
1183 	int32 state = 0;
1184 
1185 	// output the actual text part of the data
1186 	do {
1187 		uint8 buffer[READ_BUFFER_SIZE];
1188 		bytesRead = source->Read(buffer, READ_BUFFER_SIZE);
1189 		if (bytesRead < B_OK)
1190 			return bytesRead;
1191 		if (bytesRead == 0)
1192 			break;
1193 
1194 		if (encodingBuffer.Size() == 0) {
1195 			// default, no encoding
1196 			ssize_t bytesWritten = destination->Write(buffer, bytesRead);
1197 			if (bytesWritten != bytesRead) {
1198 				if (bytesWritten < B_OK)
1199 					return bytesWritten;
1200 
1201 				return B_ERROR;
1202 			}
1203 
1204 			outputSize += bytesRead;
1205 		} else {
1206 			// decode text file to UTF-8
1207 			char* pos = (char*)buffer;
1208 			int32 encodingLength = encodingIO.BufferLength();
1209 			int32 bytesLeft = bytesRead;
1210 			int32 bytes;
1211 			do {
1212 				encodingLength = READ_BUFFER_SIZE * 4;
1213 				bytes = bytesLeft;
1214 
1215 				status = convert_to_utf8(encodingID, pos, &bytes,
1216 					(char*)encodingBuffer.Buffer(), &encodingLength, &state);
1217 				if (status < B_OK)
1218 					return status;
1219 
1220 				ssize_t bytesWritten = destination->Write(encodingBuffer.Buffer(),
1221 					encodingLength);
1222 				if (bytesWritten < encodingLength) {
1223 					if (bytesWritten < B_OK)
1224 						return bytesWritten;
1225 
1226 					return B_ERROR;
1227 				}
1228 
1229 				pos += bytes;
1230 				bytesLeft -= bytes;
1231 				outputSize += encodingLength;
1232 			} while (encodingLength > 0 && bytesLeft > 0);
1233 		}
1234 	} while (bytesRead > 0);
1235 
1236 	if (outType != B_STYLED_TEXT_FORMAT)
1237 		return B_OK;
1238 
1239 	if (encodingBuffer.Size() != 0 && size != outputSize) {
1240 		if (outputSize > UINT32_MAX)
1241 			return B_NOT_SUPPORTED;
1242 
1243 		// we need to update the header as the decoded text size has changed
1244 		status = destination->Seek(0, SEEK_SET);
1245 		if (status == B_OK)
1246 			status = output_headers(destination, (uint32)outputSize);
1247 		if (status == B_OK)
1248 			status = destination->Seek(0, SEEK_END);
1249 
1250 		if (status < B_OK)
1251 			return status;
1252 	}
1253 
1254 	// Read file attributes if outputting styled data
1255 	// and source is a BNode object
1256 
1257 	if (node == NULL)
1258 		return B_OK;
1259 
1260 	// Try to read styles - we only propagate an error if the actual on-disk
1261 	// data is likely to be okay
1262 
1263 	const char *kAttrName = "styles";
1264 	attr_info info;
1265 	if (node->GetAttrInfo(kAttrName, &info) != B_OK)
1266 		return B_OK;
1267 
1268 	if (info.type != B_RAW_TYPE || info.size < 160) {
1269 		// styles seem to be broken, but since we got the text,
1270 		// we don't propagate the error
1271 		return B_OK;
1272 	}
1273 
1274 	uint8* flatRunArray = new (std::nothrow) uint8[info.size];
1275 	if (flatRunArray == NULL)
1276 		return B_NO_MEMORY;
1277 
1278 	bytesRead = node->ReadAttr(kAttrName, B_RAW_TYPE, 0, flatRunArray, info.size);
1279 	if (bytesRead != info.size)
1280 		return B_OK;
1281 
1282 	output_styles(destination, size, flatRunArray, info.size);
1283 
1284 	delete[] flatRunArray;
1285 	return B_OK;
1286 }
1287 
1288 
1289 //	#pragma mark -
1290 
1291 
1292 STXTTranslator::STXTTranslator()
1293 	: BaseTranslator("StyledEdit Files", "StyledEdit files translator",
1294 		STXT_TRANSLATOR_VERSION,
1295 		gInputFormats, sizeof(gInputFormats) / sizeof(translation_format),
1296 		gOutputFormats, sizeof(gOutputFormats) / sizeof(translation_format),
1297 		"STXTTranslator_Settings",
1298 		gDefaultSettings, sizeof(gDefaultSettings) / sizeof(TranSetting),
1299 		B_TRANSLATOR_TEXT, B_STYLED_TEXT_FORMAT)
1300 {
1301 }
1302 
1303 
1304 STXTTranslator::~STXTTranslator()
1305 {
1306 }
1307 
1308 
1309 status_t
1310 STXTTranslator::Identify(BPositionIO *inSource,
1311 	const translation_format *inFormat, BMessage *ioExtension,
1312 	translator_info *outInfo, uint32 outType)
1313 {
1314 	if (!outType)
1315 		outType = B_TRANSLATOR_TEXT;
1316 	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1317 		return B_NO_TRANSLATOR;
1318 
1319 	const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader);
1320 
1321 	uint8 buffer[DATA_BUFFER_SIZE];
1322 	status_t nread = 0;
1323 	// Read in the header to determine
1324 	// if the data is supported
1325 	nread = inSource->Read(buffer, kstxtsize);
1326 	if (nread < 0)
1327 		return nread;
1328 
1329 	// read in enough data to fill the stream header
1330 	if (nread == kstxtsize) {
1331 		TranslatorStyledTextStreamHeader header;
1332 		memcpy(&header, buffer, kstxtsize);
1333 		if (swap_data(B_UINT32_TYPE, &header, kstxtsize,
1334 				B_SWAP_BENDIAN_TO_HOST) != B_OK)
1335 			return B_ERROR;
1336 
1337 		if (header.header.magic == B_STYLED_TEXT_FORMAT
1338 			&& header.header.header_size == (int32)kstxtsize
1339 			&& header.header.data_size == 0
1340 			&& header.version == 100)
1341 			return identify_stxt_header(header, inSource, outInfo, outType);
1342 	}
1343 
1344 	// if the data is not styled text, check if it is plain text
1345 	const char* encoding;
1346 	return identify_text(buffer, nread, inSource, outInfo, outType, encoding);
1347 }
1348 
1349 
1350 status_t
1351 STXTTranslator::Translate(BPositionIO* source, const translator_info* info,
1352 	BMessage* ioExtension, uint32 outType, BPositionIO* outDestination)
1353 {
1354 	if (!outType)
1355 		outType = B_TRANSLATOR_TEXT;
1356 	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1357 		return B_NO_TRANSLATOR;
1358 
1359 	const ssize_t headerSize = sizeof(TranslatorStyledTextStreamHeader);
1360 	uint8 buffer[DATA_BUFFER_SIZE];
1361 	status_t result;
1362 	translator_info outInfo;
1363 	// Read in the header to determine
1364 	// if the data is supported
1365 	ssize_t bytesRead = source->Read(buffer, headerSize);
1366 	if (bytesRead < 0)
1367 		return bytesRead;
1368 
1369 	// read in enough data to fill the stream header
1370 	if (bytesRead == headerSize) {
1371 		TranslatorStyledTextStreamHeader header;
1372 		memcpy(&header, buffer, headerSize);
1373 		if (swap_data(B_UINT32_TYPE, &header, headerSize,
1374 				B_SWAP_BENDIAN_TO_HOST) != B_OK)
1375 			return B_ERROR;
1376 
1377 		if (header.header.magic == B_STYLED_TEXT_FORMAT
1378 			&& header.header.header_size == sizeof(TranslatorStyledTextStreamHeader)
1379 			&& header.header.data_size == 0
1380 			&& header.version == 100) {
1381 			TranslatorStyledTextTextHeader textHeader;
1382 			result = identify_stxt_header(header, source, &outInfo, outType,
1383 				&textHeader);
1384 			if (result != B_OK)
1385 				return result;
1386 
1387 			return translate_from_stxt(source, outDestination, outType, textHeader);
1388 		}
1389 	}
1390 
1391 	// if the data is not styled text, check if it is ASCII text
1392 	bool forceEncoding = false;
1393 	const char* encoding = NULL;
1394 	result = identify_text(buffer, bytesRead, source, &outInfo, outType, encoding);
1395 	if (result != B_OK)
1396 		return result;
1397 
1398 	if (ioExtension != NULL) {
1399 		const char* value;
1400 		if (ioExtension->FindString("be:encoding", &value) == B_OK
1401 			&& value[0]) {
1402 			// override encoding
1403 			encoding = value;
1404 			forceEncoding = true;
1405 		}
1406 	}
1407 
1408 	return translate_from_text(source, encoding, forceEncoding, outDestination, outType);
1409 }
1410 
1411 
1412 BView *
1413 STXTTranslator::NewConfigView(TranslatorSettings *settings)
1414 {
1415 	return new STXTView(BRect(0, 0, 225, 175), "STXTTranslator Settings",
1416 		B_FOLLOW_ALL, B_WILL_DRAW, settings);
1417 }
1418 
1419