xref: /haiku/src/add-ons/translators/stxt/STXTTranslator.cpp (revision b30304acc8c37e678a1bf66976d15bdab103f931)
1 /*
2  * Copyright 2002-2008, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Michael Wilber
7  *		Axel Dörfler, axeld@pinc-software.de
8  */
9 
10 
11 #include "STXTTranslator.h"
12 #include "STXTView.h"
13 
14 #include <CharacterSet.h>
15 #include <CharacterSetRoster.h>
16 #include <MimeType.h>
17 #include <String.h>
18 #include <UTF8.h>
19 
20 #include <new>
21 #include <string.h>
22 #include <stdio.h>
23 #include <stdint.h>
24 
25 
26 using namespace BPrivate;
27 
28 
29 #define READ_BUFFER_SIZE 32768
30 #define DATA_BUFFER_SIZE 256
31 
32 // The input formats that this translator supports.
33 translation_format gInputFormats[] = {
34 	{
35 		B_TRANSLATOR_TEXT,
36 		B_TRANSLATOR_TEXT,
37 		TEXT_IN_QUALITY,
38 		TEXT_IN_CAPABILITY,
39 		"text/plain",
40 		"Plain text file"
41 	},
42 	{
43 		B_STYLED_TEXT_FORMAT,
44 		B_TRANSLATOR_TEXT,
45 		STXT_IN_QUALITY,
46 		STXT_IN_CAPABILITY,
47 		"text/x-vnd.Be-stxt",
48 		"Be styled text file"
49 	}
50 };
51 
52 // The output formats that this translator supports.
53 translation_format gOutputFormats[] = {
54 	{
55 		B_TRANSLATOR_TEXT,
56 		B_TRANSLATOR_TEXT,
57 		TEXT_OUT_QUALITY,
58 		TEXT_OUT_CAPABILITY,
59 		"text/plain",
60 		"Plain text file"
61 	},
62 	{
63 		B_STYLED_TEXT_FORMAT,
64 		B_TRANSLATOR_TEXT,
65 		STXT_OUT_QUALITY,
66 		STXT_OUT_CAPABILITY,
67 		"text/x-vnd.Be-stxt",
68 		"Be styled text file"
69 	}
70 };
71 
72 // Default settings for the Translator
73 TranSetting gDefaultSettings[] = {
74 	{B_TRANSLATOR_EXT_HEADER_ONLY, TRAN_SETTING_BOOL, false},
75 	{B_TRANSLATOR_EXT_DATA_ONLY, TRAN_SETTING_BOOL, false}
76 };
77 
78 // ---------------------------------------------------------------
79 // make_nth_translator
80 //
81 // Creates a STXTTranslator object to be used by BTranslatorRoster
82 //
83 // Preconditions:
84 //
85 // Parameters: n,		The translator to return. Since
86 //						STXTTranslator only publishes one
87 //						translator, it only returns a
88 //						STXTTranslator if n == 0
89 //
90 //             you, 	The image_id of the add-on that
91 //						contains code (not used).
92 //
93 //             flags,	Has no meaning yet, should be 0.
94 //
95 // Postconditions:
96 //
97 // Returns: NULL if n is not zero,
98 //          a new STXTTranslator if n is zero
99 // ---------------------------------------------------------------
100 BTranslator *
101 make_nth_translator(int32 n, image_id you, uint32 flags, ...)
102 {
103 	if (!n)
104 		return new (std::nothrow) STXTTranslator();
105 
106 	return NULL;
107 }
108 
109 
110 // #pragma mark - ascmagic.c from the BSD file tool
111 /*
112  * The following code has been taken from version 4.17 of the BSD file tool,
113  * file ascmagic.c, modified for our purpose.
114  */
115 
116 /*
117  * Copyright (c) Ian F. Darwin 1986-1995.
118  * Software written by Ian F. Darwin and others;
119  * maintained 1995-present by Christos Zoulas and others.
120  *
121  * Redistribution and use in source and binary forms, with or without
122  * modification, are permitted provided that the following conditions
123  * are met:
124  * 1. Redistributions of source code must retain the above copyright
125  *    notice immediately at the beginning of the file, without modification,
126  *    this list of conditions, and the following disclaimer.
127  * 2. Redistributions in binary form must reproduce the above copyright
128  *    notice, this list of conditions and the following disclaimer in the
129  *    documentation and/or other materials provided with the distribution.
130  *
131  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
132  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
133  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
134  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
135  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
136  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
137  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
138  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
139  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
140  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
141  * SUCH DAMAGE.
142  */
143 /*
144  * ASCII magic -- file types that we know based on keywords
145  * that can appear anywhere in the file.
 *
160  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
161  * to handle character codes other than ASCII on a unified basis.
162  *
163  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
164  * international characters, now subsumed into this file.
165  */
166 
167 #include <stdio.h>
168 #include <string.h>
169 #include <memory.h>
170 #include <ctype.h>
171 #include <stdlib.h>
172 #include <unistd.h>
173 #include "names.h"
174 
/* One decoded character; the looks_*() tests fill arrays of these. */
typedef unsigned long my_unichar;

/* longest sane line length */
#define MAXLINELEN 300

/* True for space, tab, CR, LF, form feed and the "next line" code 0x85. */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
		  || (x) == 0x85 || (x) == '\f')

/* Character-code tests: decode buf into ubuf and store the count in *ulen. */
static int looks_ascii(const unsigned char *buf, size_t nbytes,
	my_unichar *ubuf, size_t *ulen);
static int looks_utf8(const unsigned char *buf, size_t nbytes,
	my_unichar *ubuf, size_t *ulen);
static int looks_unicode(const unsigned char *buf, size_t nbytes,
	my_unichar *ubuf, size_t *ulen);
static int looks_latin1(const unsigned char *buf, size_t nbytes,
	my_unichar *ubuf, size_t *ulen);
static int looks_extended(const unsigned char *buf, size_t nbytes,
	my_unichar *ubuf, size_t *ulen);

/* Helpers used by file_ascmagic(). */
static void from_ebcdic(const unsigned char *buf, size_t nbytes,
	unsigned char *out);
static int ascmatch(const unsigned char *s, const my_unichar *us,
	size_t ulen);
189 
190 static int
191 file_ascmagic(const unsigned char *buf, size_t nbytes, BMimeType* mimeType,
192 	const char*& encoding)
193 {
194 	size_t i;
195 	unsigned char *nbuf = NULL;
196 	my_unichar *ubuf = NULL;
197 	size_t ulen;
198 	struct names *p;
199 	int rv = -1;
200 
201 	const char *code = NULL;
202 	encoding = NULL;
203 	const char *type = NULL;
204 	const char *subtype = NULL;
205 	const char *subtypeMimeGeneric = NULL;
206 	const char *subtypeMimeSpecific = NULL;
207 
208 	int has_escapes = 0;
209 	int has_backspace = 0;
210 	int seen_cr = 0;
211 
212 	int n_crlf = 0;
213 	int n_lf = 0;
214 	int n_cr = 0;
215 	int n_nel = 0;
216 
217 	int last_line_end = -1;
218 	int has_long_lines = 0;
219 
220 	if ((nbuf = (unsigned char*)malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL)
221 		goto done;
222 	if ((ubuf = (my_unichar*)malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL)
223 		goto done;
224 
225 	/*
226 	 * Then try to determine whether it's any character code we can
227 	 * identify.  Each of these tests, if it succeeds, will leave
228 	 * the text converted into one-my_unichar-per-character Unicode in
229 	 * ubuf, and the number of characters converted in ulen.
230 	 */
231 	if (nbytes == 0) {
232 		code = "UTF-8 Unicode";
233 		encoding = NULL; // "UTF-8";
234 		type = "text";
235 		rv = 1;
236 	} else if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
237 		code = "ASCII";
238 		encoding = NULL; //"us-ascii";
239 		type = "text";
240 		if (nbytes == 1) {
241 			// no further tests
242 			rv = 1;
243 		}
244 	} else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
245 		code = "UTF-8 Unicode";
246 		encoding = NULL; // "UTF-8";
247 		type = "text";
248 	} else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
249 		if (i == 1) {
250 			code = "Little-endian UTF-16 Unicode";
251 			encoding = "UTF-16";
252 		} else {
253 			code = "Big-endian UTF-16 Unicode";
254 			encoding = "UTF-16";
255 		}
256 
257 		type = "character data";
258 	} else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
259 		code = "ISO-8859";
260 		type = "text";
261 		encoding = "iso-8859-1";
262 	} else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
263 		code = "Non-ISO extended-ASCII";
264 		type = "text";
265 		encoding = "unknown";
266 	} else {
267 		from_ebcdic(buf, nbytes, nbuf);
268 
269 		if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
270 			code = "EBCDIC";
271 			type = "character data";
272 			encoding = "ebcdic";
273 		} else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
274 			code = "International EBCDIC";
275 			type = "character data";
276 			encoding = "ebcdic";
277 		} else {
278 			rv = 0;
279 			goto done;  /* doesn't look like text at all */
280 		}
281 	}
282 
283 	if (nbytes <= 1) {
284 		if (rv == -1)
285 			rv = 0;
286 		goto done;
287 	}
288 
289 	/*
290 	 * for troff, look for . + letter + letter or .\";
291 	 * this must be done to disambiguate tar archives' ./file
292 	 * and other trash from real troff input.
293 	 *
294 	 * I believe Plan 9 troff allows non-ASCII characters in the names
295 	 * of macros, so this test might possibly fail on such a file.
296 	 */
297 	if (*ubuf == '.') {
298 		my_unichar *tp = ubuf + 1;
299 
300 		while (ISSPC(*tp))
301 			++tp;	/* skip leading whitespace */
302 		if ((tp[0] == '\\' && tp[1] == '\"') ||
303 		    (isascii((unsigned char)tp[0]) &&
304 		     isalnum((unsigned char)tp[0]) &&
305 		     isascii((unsigned char)tp[1]) &&
306 		     isalnum((unsigned char)tp[1]) &&
307 		     ISSPC(tp[2]))) {
308 		    subtypeMimeGeneric = "text/x-source-code";
309 			subtypeMimeSpecific = "text/troff";
310 			subtype = "troff or preprocessor input";
311 			goto subtype_identified;
312 		}
313 	}
314 
315 	if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
316 		subtypeMimeGeneric = "text/x-source-code";
317 		subtypeMimeSpecific = "text/fortran";
318 		subtype = "fortran program";
319 		goto subtype_identified;
320 	}
321 
322 	/* look for tokens from names.h - this is expensive! */
323 
324 	i = 0;
325 	while (i < ulen) {
326 		size_t end;
327 
328 		/*
329 		 * skip past any leading space
330 		 */
331 		while (i < ulen && ISSPC(ubuf[i]))
332 			i++;
333 		if (i >= ulen)
334 			break;
335 
336 		/*
337 		 * find the next whitespace
338 		 */
339 		for (end = i + 1; end < nbytes; end++)
340 			if (ISSPC(ubuf[end]))
341 				break;
342 
343 		/*
344 		 * compare the word thus isolated against the token list
345 		 */
346 		for (p = names; p < names + NNAMES; p++) {
347 			if (ascmatch((const unsigned char *)p->name, ubuf + i,
348 			    end - i)) {
349 				subtype = types[p->type].human;
350 				subtypeMimeGeneric = types[p->type].generic_mime;
351 				subtypeMimeSpecific = types[p->type].specific_mime;
352 				goto subtype_identified;
353 			}
354 		}
355 
356 		i = end;
357 	}
358 
359 subtype_identified:
360 
361 	/*
362 	 * Now try to discover other details about the file.
363 	 */
364 	for (i = 0; i < ulen; i++) {
365 		if (ubuf[i] == '\n') {
366 			if (seen_cr)
367 				n_crlf++;
368 			else
369 				n_lf++;
370 			last_line_end = i;
371 		} else if (seen_cr)
372 			n_cr++;
373 
374 		seen_cr = (ubuf[i] == '\r');
375 		if (seen_cr)
376 			last_line_end = i;
377 
378 		if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
379 			n_nel++;
380 			last_line_end = i;
381 		}
382 
383 		/* If this line is _longer_ than MAXLINELEN, remember it. */
384 		if ((int)i > last_line_end + MAXLINELEN)
385 			has_long_lines = 1;
386 
387 		if (ubuf[i] == '\033')
388 			has_escapes = 1;
389 		if (ubuf[i] == '\b')
390 			has_backspace = 1;
391 	}
392 
393 	rv = 1;
394 done:
395 	if (nbuf)
396 		free(nbuf);
397 	if (ubuf)
398 		free(ubuf);
399 
400 	if (rv) {
401 		// If we have identified the subtype, return it, otherwise just
402 		// text/plain.
403 
404 		bool found = false;
405 		if (subtypeMimeSpecific != NULL) {
406 			mimeType->SetTo(subtypeMimeSpecific);
407 			if (mimeType->IsInstalled())
408 				found = true;
409 		}
410 		if (!found && subtypeMimeGeneric != NULL) {
411 			mimeType->SetTo(subtypeMimeGeneric);
412 			if (mimeType->IsInstalled())
413 				found = true;
414 		}
415 		if (!found)
416 			mimeType->SetTo("text/plain");
417 	}
418 
419 	return rv;
420 }
421 
422 static int
423 ascmatch(const unsigned char *s, const my_unichar *us, size_t ulen)
424 {
425 	size_t i;
426 
427 	for (i = 0; i < ulen; i++) {
428 		if (s[i] != us[i])
429 			return 0;
430 	}
431 
432 	if (s[i])
433 		return 0;
434 	else
435 		return 1;
436 }
437 
438 /*
439  * This table reflects a particular philosophy about what constitutes
440  * "text," and there is room for disagreement about it.
441  *
442  * Version 3.31 of the file command considered a file to be ASCII if
443  * each of its characters was approved by either the isascii() or
444  * isalpha() function.  On most systems, this would mean that any
445  * file consisting only of characters in the range 0x00 ... 0x7F
446  * would be called ASCII text, but many systems might reasonably
447  * consider some characters outside this range to be alphabetic,
448  * so the file command would call such characters ASCII.  It might
449  * have been more accurate to call this "considered textual on the
450  * local system" than "ASCII."
451  *
452  * It considered a file to be "International language text" if each
453  * of its characters was either an ASCII printing character (according
454  * to the real ASCII standard, not the above test), a character in
455  * the range 0x80 ... 0xFF, or one of the following control characters:
456  * backspace, tab, line feed, vertical tab, form feed, carriage return,
457  * escape.  No attempt was made to determine the language in which files
458  * of this type were written.
459  *
460  *
461  * The table below considers a file to be ASCII if all of its characters
462  * are either ASCII printing characters (again, according to the X3.4
463  * standard, not isascii()) or any of the following controls: bell,
464  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
465  *
466  * I include bell because some programs (particularly shell scripts)
467  * use it literally, even though it is rare in normal text.  I exclude
468  * vertical tab because it never seems to be used in real text.  I also
469  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
470  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
471  * character to.  It might be more appropriate to include it in the 8859
472  * set instead of the ASCII set, but it's got to be included in *something*
473  * we recognize or EBCDIC files aren't going to be considered textual.
474  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
475  * and Latin characters, so these should possibly be allowed.  But they
476  * make a real mess on VT100-style displays if they're not paired properly,
477  * so we are probably better off not calling them text.
478  *
479  * A file is considered to be ISO-8859 text if its characters are all
480  * either ASCII, according to the above definition, or printing characters
481  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
482  *
483  * Finally, a file is considered to be international text from some other
484  * character code if its characters are all either ISO-8859 (according to
485  * the above definition) or characters in the range 0x80 ... 0x9F, which
486  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
487  * consider to be printing characters.
488  */
489 
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Classification of every byte value, indexed by the byte itself.
 * The table is only ever read, hence const.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
516 
517 static int
518 looks_ascii(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
519     size_t *ulen)
520 {
521 	int i;
522 
523 	*ulen = 0;
524 
525 	for (i = 0; i < (int)nbytes; i++) {
526 		int t = text_chars[buf[i]];
527 
528 		if (t != T)
529 			return 0;
530 
531 		ubuf[(*ulen)++] = buf[i];
532 	}
533 
534 	return 1;
535 }
536 
537 static int
538 looks_latin1(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
539 {
540 	int i;
541 
542 	*ulen = 0;
543 
544 	for (i = 0; i < (int)nbytes; i++) {
545 		int t = text_chars[buf[i]];
546 
547 		if (t != T && t != I)
548 			return 0;
549 
550 		ubuf[(*ulen)++] = buf[i];
551 	}
552 
553 	return 1;
554 }
555 
556 static int
557 looks_extended(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
558     size_t *ulen)
559 {
560 	int i;
561 
562 	*ulen = 0;
563 
564 	for (i = 0; i < (int)nbytes; i++) {
565 		int t = text_chars[buf[i]];
566 
567 		if (t != T && t != I && t != X)
568 			return 0;
569 
570 		ubuf[(*ulen)++] = buf[i];
571 	}
572 
573 	return 1;
574 }
575 
576 static int
577 looks_utf8(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
578 {
579 	int i, n;
580 	my_unichar c;
581 	int gotone = 0;
582 
583 	*ulen = 0;
584 
585 	for (i = 0; i < (int)nbytes; i++) {
586 		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
587 			/*
588 			 * Even if the whole file is valid UTF-8 sequences,
589 			 * still reject it if it uses weird control characters.
590 			 */
591 
592 			if (text_chars[buf[i]] != T)
593 				return 0;
594 
595 			ubuf[(*ulen)++] = buf[i];
596 		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
597 			return 0;
598 		} else {			   /* 11xxxxxx begins UTF-8 */
599 			int following;
600 
601 			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
602 				c = buf[i] & 0x1f;
603 				following = 1;
604 			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
605 				c = buf[i] & 0x0f;
606 				following = 2;
607 			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
608 				c = buf[i] & 0x07;
609 				following = 3;
610 			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
611 				c = buf[i] & 0x03;
612 				following = 4;
613 			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
614 				c = buf[i] & 0x01;
615 				following = 5;
616 			} else
617 				return 0;
618 
619 			for (n = 0; n < following; n++) {
620 				i++;
621 				if (i >= (int)nbytes)
622 					goto done;
623 
624 				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
625 					return 0;
626 
627 				c = (c << 6) + (buf[i] & 0x3f);
628 			}
629 
630 			ubuf[(*ulen)++] = c;
631 			gotone = 1;
632 		}
633 	}
634 done:
635 	return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
636 }
637 
638 static int
639 looks_unicode(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
640     size_t *ulen)
641 {
642 	int bigend;
643 	int i;
644 
645 	if (nbytes < 2)
646 		return 0;
647 
648 	if (buf[0] == 0xff && buf[1] == 0xfe)
649 		bigend = 0;
650 	else if (buf[0] == 0xfe && buf[1] == 0xff)
651 		bigend = 1;
652 	else
653 		return 0;
654 
655 	*ulen = 0;
656 
657 	for (i = 2; i + 1 < (int)nbytes; i += 2) {
658 		/* XXX fix to properly handle chars > 65536 */
659 
660 		if (bigend)
661 			ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
662 		else
663 			ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
664 
665 		if (ubuf[*ulen - 1] == 0xfffe)
666 			return 0;
667 		if (ubuf[*ulen - 1] < 128 &&
668 		    text_chars[(size_t)ubuf[*ulen - 1]] != T)
669 			return 0;
670 	}
671 
672 	return 1 + bigend;
673 }
674 
675 #undef F
676 #undef T
677 #undef I
678 #undef X
679 
680 /*
681  * This table maps each EBCDIC character to an (8-bit extended) ASCII
682  * character, as specified in the rationale for the dd(1) command in
683  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
684  *
685  * Unfortunately it does not seem to correspond exactly to any of the
686  * five variants of EBCDIC documented in IBM's _Enterprise Systems
687  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
688  * Edition, July, 1999, pp. I-1 - I-4.
689  *
690  * Fortunately, though, all versions of EBCDIC, including this one, agree
691  * on most of the printing characters that also appear in (7-bit) ASCII.
692  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
693  *
694  * Fortunately too, there is general agreement that codes 0x00 through
695  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
696  * remainder printing characters.
697  *
698  * This is sufficient to allow us to identify EBCDIC text and to distinguish
699  * between old-style and internationalized examples of text.
700  */
701 
/* Indexed by the EBCDIC byte; read-only, so it is const. */
static const unsigned char ebcdic_to_ascii[256] = {
  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};
720 
721 #ifdef notdef
722 /*
723  * The following EBCDIC-to-ASCII table may relate more closely to reality,
724  * or at least to modern reality.  It comes from
725  *
726  *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
727  *
728  * and maps the characters of EBCDIC code page 1047 (the code used for
729  * Unix-derived software on IBM's 390 systems) to the corresponding
730  * characters from ISO 8859-1.
731  *
732  * If this table is used instead of the above one, some of the special
733  * cases for the NEL character can be taken out of the code.
 *
 * NOTE(review): this table is compiled only when `notdef` is defined, and
 * from_ebcdic() reads ebcdic_to_ascii, not this table.  It appears to be
 * kept for reference only -- confirm before relying on it.
734  */
735 
736 static unsigned char ebcdic_1047_to_8859[] = {
737 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
738 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
739 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
740 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
741 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
742 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
743 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
744 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
745 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
746 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
747 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
748 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
749 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
750 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
751 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
752 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
753 };
754 #endif
755 
756 /*
757  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
758  */
759 static void
760 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
761 {
762 	int i;
763 
764 	for (i = 0; i < (int)nbytes; i++) {
765 		out[i] = ebcdic_to_ascii[buf[i]];
766 	}
767 }
768 
769 
770 //	#pragma mark -
771 
772 
773 /*!
774 	Determines if the data in inSource is of the STXT format.
775 
776 	\param header the STXT stream header read in by Identify() or Translate()
777 	\param inSource the stream with the STXT data
778 	\param outInfo information about the type of data from inSource is stored here
779 	\param outType the desired output type for the data in inSource
780 	\param ptxtheader if this is not NULL, the TEXT header from
781 		inSource is copied to it
782 */
783 status_t
784 identify_stxt_header(const TranslatorStyledTextStreamHeader &header,
785 	BPositionIO *inSource, translator_info *outInfo, uint32 outType,
786 	TranslatorStyledTextTextHeader *ptxtheader = NULL)
787 {
788 	const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader);
789 	const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader);
790 
791 	uint8 buffer[max(ktxtsize, kstylsize)];
792 
793 	// Check the TEXT header
794 	TranslatorStyledTextTextHeader txtheader;
795 	if (inSource->Read(buffer, ktxtsize) != ktxtsize)
796 		return B_NO_TRANSLATOR;
797 
798 	memcpy(&txtheader, buffer, ktxtsize);
799 	if (swap_data(B_UINT32_TYPE, &txtheader, ktxtsize,
800 		B_SWAP_BENDIAN_TO_HOST) != B_OK)
801 		return B_ERROR;
802 
803 	if (txtheader.header.magic != 'TEXT'
804 		|| txtheader.header.header_size != sizeof(TranslatorStyledTextTextHeader)
805 		|| txtheader.charset != B_UNICODE_UTF8)
806 		return B_NO_TRANSLATOR;
807 
808 	// skip the text data
809 	off_t seekresult, pos;
810 	pos = header.header.header_size + txtheader.header.header_size
811 		+ txtheader.header.data_size;
812 	seekresult = inSource->Seek(txtheader.header.data_size,
813 		SEEK_CUR);
814 	if (seekresult < pos)
815 		return B_NO_TRANSLATOR;
816 	if (seekresult > pos)
817 		return B_ERROR;
818 
819 	// check the STYL header (not all STXT files have this)
820 	ssize_t read = 0;
821 	TranslatorStyledTextStyleHeader stylheader;
822 	read = inSource->Read(buffer, kstylsize);
823 	if (read < 0)
824 		return read;
825 	if (read != kstylsize && read != 0)
826 		return B_NO_TRANSLATOR;
827 
828 	// If there is a STYL header
829 	if (read == kstylsize) {
830 		memcpy(&stylheader, buffer, kstylsize);
831 		if (swap_data(B_UINT32_TYPE, &stylheader, kstylsize,
832 			B_SWAP_BENDIAN_TO_HOST) != B_OK)
833 			return B_ERROR;
834 
835 		if (stylheader.header.magic != 'STYL'
836 			|| stylheader.header.header_size !=
837 				sizeof(TranslatorStyledTextStyleHeader))
838 			return B_NO_TRANSLATOR;
839 	}
840 
841 	// if output TEXT header is supplied, fill it with data
842 	if (ptxtheader) {
843 		ptxtheader->header.magic = txtheader.header.magic;
844 		ptxtheader->header.header_size = txtheader.header.header_size;
845 		ptxtheader->header.data_size = txtheader.header.data_size;
846 		ptxtheader->charset = txtheader.charset;
847 	}
848 
849 	// return information about the data in the stream
850 	outInfo->type = B_STYLED_TEXT_FORMAT;
851 	outInfo->group = B_TRANSLATOR_TEXT;
852 	outInfo->quality = STXT_IN_QUALITY;
853 	outInfo->capability = STXT_IN_CAPABILITY;
854 	strcpy(outInfo->name, "Be styled text file");
855 	strcpy(outInfo->MIME, "text/x-vnd.Be-stxt");
856 
857 	return B_OK;
858 }
859 
860 
861 /*!
862 	Determines if the data in \a inSource is of the UTF8 plain
863 
864 	\param data buffer containing data already read (must be at
865 		least DATA_BUFFER_SIZE bytes large)
866 	\param nread number of bytes that have already been read from the stream
867 	\param header the STXT stream header read in by Identify() or Translate()
868 	\param inSource the stream with the STXT data
869 	\param outInfo information about the type of data from inSource is stored here
870 	\param outType the desired output type for the data in inSource
871 */
872 status_t
873 identify_text(uint8* data, int32 bytesRead, BPositionIO* source,
874 	translator_info* outInfo, uint32 outType, const char*& encoding)
875 {
876 	ssize_t readLater = source->Read(data + bytesRead, DATA_BUFFER_SIZE - bytesRead);
877 	if (readLater < B_OK)
878 		return B_NO_TRANSLATOR;
879 
880 	bytesRead += readLater;
881 
882 	// TODO: identify encoding as possible!
883 	BMimeType type;
884 	if (!file_ascmagic((const unsigned char*)data, bytesRead, &type, encoding))
885 		return B_NO_TRANSLATOR;
886 
887 	float capability = TEXT_IN_CAPABILITY;
888 	if (bytesRead < 20)
889 		capability = .1f;
890 
891 	// return information about the data in the stream
892 	outInfo->type = B_TRANSLATOR_TEXT;
893 	outInfo->group = B_TRANSLATOR_TEXT;
894 	outInfo->quality = TEXT_IN_QUALITY;
895 	outInfo->capability = capability;
896 
897 	char description[B_MIME_TYPE_LENGTH];
898 	if (type.GetLongDescription(description) == B_OK)
899 		strlcpy(outInfo->name, description, sizeof(outInfo->name));
900 	else
901 		strlcpy(outInfo->name, "Plain text file", sizeof(outInfo->name));
902 
903 	//strlcpy(outInfo->MIME, type.Type(), sizeof(outInfo->MIME));
904 	strcpy(outInfo->MIME, "text/plain");
905 	return B_OK;
906 }
907 
908 
909 // ---------------------------------------------------------------
910 // translate_from_stxt
911 //
912 // Translates the data in inSource to the type outType and stores
913 // the translated data in outDestination.
914 //
915 // Preconditions:
916 //
917 // Parameters:	inSource,	the data to be translated
918 //
919 //				outDestination,	where the translated data is
920 //								put
921 //
922 //				outType,	the type to convert inSource to
923 //
924 //				txtheader, 	the TEXT header from inSource
925 //
926 //
927 // Postconditions:
928 //
929 // Returns: B_BAD_VALUE, if outType is invalid
930 //
931 // B_NO_TRANSLATOR, if this translator doesn't understand the data
932 //
933 // B_ERROR, if there was an error allocating memory or converting
934 //          data
935 //
936 // B_OK, if all went well
937 // ---------------------------------------------------------------
938 status_t
939 translate_from_stxt(BPositionIO *inSource, BPositionIO *outDestination,
940 		uint32 outType, const TranslatorStyledTextTextHeader &txtheader)
941 {
942 	if (inSource->Seek(0, SEEK_SET) != 0)
943 		return B_ERROR;
944 
945 	const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader);
946 	const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader);
947 
948 	bool btoplain;
949 	if (outType == B_TRANSLATOR_TEXT)
950 		btoplain = true;
951 	else if (outType == B_STYLED_TEXT_FORMAT)
952 		btoplain = false;
953 	else
954 		return B_BAD_VALUE;
955 
956 	uint8 buffer[READ_BUFFER_SIZE];
957 	ssize_t nread = 0, nwritten = 0, nreed = 0, ntotalread = 0;
958 
959 	// skip to the actual text data when outputting a
960 	// plain text file
961 	if (btoplain) {
962 		if (inSource->Seek(kstxtsize + ktxtsize, SEEK_CUR) !=
963 			kstxtsize + ktxtsize)
964 			return B_ERROR;
965 	}
966 
967 	// Read data from inSource
968 	// When outputing B_TRANSLATOR_TEXT, the loop stops when all of
969 	// the text data has been read and written.
970 	// When outputting B_STYLED_TEXT_FORMAT, the loop stops when all
971 	// of the data from inSource has been read and written.
972 	if (btoplain)
973 		nreed = min(READ_BUFFER_SIZE,
974 			txtheader.header.data_size - ntotalread);
975 	else
976 		nreed = READ_BUFFER_SIZE;
977 	nread = inSource->Read(buffer, nreed);
978 	while (nread > 0) {
979 		nwritten = outDestination->Write(buffer, nread);
980 		if (nwritten != nread)
981 			return B_ERROR;
982 
983 		if (btoplain) {
984 			ntotalread += nread;
985 			nreed = min(READ_BUFFER_SIZE,
986 				txtheader.header.data_size - ntotalread);
987 		} else
988 			nreed = READ_BUFFER_SIZE;
989 		nread = inSource->Read(buffer, nreed);
990 	}
991 
992 	if (btoplain && static_cast<ssize_t>(txtheader.header.data_size) !=
993 		ntotalread)
994 		// If not all of the text data was able to be read...
995 		return B_NO_TRANSLATOR;
996 	else
997 		return B_OK;
998 }
999 
1000 // ---------------------------------------------------------------
1001 // output_headers
1002 //
1003 // Outputs the Stream and Text headers from the B_STYLED_TEXT_FORMAT
1004 // to outDestination, setting the data_size member of the text header
1005 // to text_data_size
1006 //
1007 // Preconditions:
1008 //
1009 // Parameters:	outDestination,	where the translated data is
1010 //								put
1011 //
1012 //				text_data_size, number of bytes in data section
1013 //							    of the TEXT header
1014 //
1015 //
1016 // Postconditions:
1017 //
1018 // Returns:
1019 //
1020 // B_ERROR, if there was an error writing to outDestination or
1021 // 	an error with converting the byte order
1022 //
1023 // B_OK, if all went well
1024 // ---------------------------------------------------------------
1025 status_t
1026 output_headers(BPositionIO *outDestination, uint32 text_data_size)
1027 {
1028 	const int32 kHeadersSize = sizeof(TranslatorStyledTextStreamHeader) +
1029 		sizeof(TranslatorStyledTextTextHeader);
1030 	status_t result;
1031 	TranslatorStyledTextStreamHeader stxtheader;
1032 	TranslatorStyledTextTextHeader txtheader;
1033 
1034 	uint8 buffer[kHeadersSize];
1035 
1036 	stxtheader.header.magic = 'STXT';
1037 	stxtheader.header.header_size = sizeof(TranslatorStyledTextStreamHeader);
1038 	stxtheader.header.data_size = 0;
1039 	stxtheader.version = 100;
1040 	memcpy(buffer, &stxtheader, stxtheader.header.header_size);
1041 
1042 	txtheader.header.magic = 'TEXT';
1043 	txtheader.header.header_size = sizeof(TranslatorStyledTextTextHeader);
1044 	txtheader.header.data_size = text_data_size;
1045 	txtheader.charset = B_UNICODE_UTF8;
1046 	memcpy(buffer + stxtheader.header.header_size, &txtheader,
1047 		txtheader.header.header_size);
1048 
1049 	// write out headers in Big Endian byte order
1050 	result = swap_data(B_UINT32_TYPE, buffer, kHeadersSize,
1051 		B_SWAP_HOST_TO_BENDIAN);
1052 	if (result == B_OK) {
1053 		ssize_t nwritten = 0;
1054 		nwritten = outDestination->Write(buffer, kHeadersSize);
1055 		if (nwritten != kHeadersSize)
1056 			return B_ERROR;
1057 		else
1058 			return B_OK;
1059 	}
1060 
1061 	return result;
1062 }
1063 
1064 // ---------------------------------------------------------------
1065 // output_styles
1066 //
1067 // Writes out the actual style information into outDestination
1068 // using the data from pflatRunArray
1069 //
1070 // Preconditions:
1071 //
1072 // Parameters:	outDestination,	where the translated data is
1073 //								put
1074 //
1075 //				text_size,		size in bytes of the text in
1076 //								outDestination
1077 //
1078 //				data_size,		size of pflatRunArray
1079 //
1080 // Postconditions:
1081 //
1082 // Returns:
1083 //
1084 // B_ERROR, if there was an error writing to outDestination or
1085 // 	an error with converting the byte order
1086 //
1087 // B_OK, if all went well
1088 // ---------------------------------------------------------------
1089 status_t
1090 output_styles(BPositionIO *outDestination, uint32 text_size,
1091 	uint8 *pflatRunArray, ssize_t data_size)
1092 {
1093 	const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader);
1094 
1095 	uint8 buffer[kstylsize];
1096 
1097 	// output STYL header
1098 	TranslatorStyledTextStyleHeader stylheader;
1099 	stylheader.header.magic = 'STYL';
1100 	stylheader.header.header_size =
1101 		sizeof(TranslatorStyledTextStyleHeader);
1102 	stylheader.header.data_size = data_size;
1103 	stylheader.apply_offset = 0;
1104 	stylheader.apply_length = text_size;
1105 
1106 	memcpy(buffer, &stylheader, kstylsize);
1107 	if (swap_data(B_UINT32_TYPE, buffer, kstylsize,
1108 		B_SWAP_HOST_TO_BENDIAN) != B_OK)
1109 		return B_ERROR;
1110 	if (outDestination->Write(buffer, kstylsize) != kstylsize)
1111 		return B_ERROR;
1112 
1113 	// output actual style information
1114 	if (outDestination->Write(pflatRunArray,
1115 		data_size) != data_size)
1116 		return B_ERROR;
1117 
1118 	return B_OK;
1119 }
1120 
1121 
1122 /*!
1123 	Convert the plain text (UTF8) from inSource to plain or
1124 	styled text in outDestination
1125 */
1126 status_t
1127 translate_from_text(BPositionIO* source, const char* encoding, bool forceEncoding,
1128 	BPositionIO* destination, uint32 outType)
1129 {
1130 	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1131 		return B_BAD_VALUE;
1132 
1133 	// find the length of the text
1134 	off_t size = source->Seek(0, SEEK_END);
1135 	if (size < 0)
1136 		return (status_t)size;
1137 	if (size > UINT32_MAX && outType == B_STYLED_TEXT_FORMAT)
1138 		return B_NOT_SUPPORTED;
1139 
1140 	status_t status = source->Seek(0, SEEK_SET);
1141 	if (status < B_OK)
1142 		return status;
1143 
1144 	if (outType == B_STYLED_TEXT_FORMAT) {
1145 		// output styled text headers
1146 		status = output_headers(destination, (uint32)size);
1147 		if (status != B_OK)
1148 			return status;
1149 	}
1150 
1151 	class MallocBuffer {
1152 		public:
1153 			MallocBuffer() : fBuffer(NULL), fSize(0) {}
1154 			~MallocBuffer() { free(fBuffer); }
1155 
1156 			void* Buffer() { return fBuffer; }
1157 			size_t Size() const { return fSize; }
1158 
1159 			status_t
1160 			Allocate(size_t size)
1161 			{
1162 				fBuffer = malloc(size);
1163 				if (fBuffer != NULL) {
1164 					fSize = size;
1165 					return B_OK;
1166 				}
1167 				return B_NO_MEMORY;
1168 			}
1169 
1170 		private:
1171 			void*	fBuffer;
1172 			size_t	fSize;
1173 	} encodingBuffer;
1174 	BMallocIO encodingIO;
1175 	uint32 encodingID = 0;
1176 		// defaults to UTF-8 or no encoding
1177 
1178 	BNode* node = dynamic_cast<BNode*>(source);
1179 	if (node != NULL) {
1180 		// determine encoding, if available
1181 		const BCharacterSet* characterSet = NULL;
1182 		bool hasAttribute = false;
1183 		if (encoding != NULL && !forceEncoding) {
1184 			BString name;
1185 			if (node->ReadAttrString("be:encoding", &name) == B_OK) {
1186 				encoding = name.String();
1187 				hasAttribute = true;
1188 			} else {
1189 				int32 value;
1190 				ssize_t bytesRead = node->ReadAttr("be:encoding", B_INT32_TYPE, 0,
1191 					&value, sizeof(value));
1192 				if (bytesRead == (ssize_t)sizeof(value)) {
1193 					hasAttribute = true;
1194 					if (value != 65535)
1195 						characterSet = BCharacterSetRoster::GetCharacterSetByConversionID(value);
1196 				}
1197 			}
1198 		} else {
1199 			hasAttribute = true;
1200 				// we don't write the encoding in this case
1201 		}
1202 		if (characterSet == NULL && encoding != NULL)
1203 			characterSet = BCharacterSetRoster::FindCharacterSetByName(encoding);
1204 
1205 		if (characterSet != NULL) {
1206 			encodingID = characterSet->GetConversionID();
1207 			encodingBuffer.Allocate(READ_BUFFER_SIZE * 4);
1208 		}
1209 
1210 		if (!hasAttribute && encoding != NULL) {
1211 			// add encoding attribute, so that someone opening the file can
1212 			// retrieve it for persistance
1213 			node->WriteAttr("be:encoding", B_STRING_TYPE, 0, encoding,
1214 				strlen(encoding));
1215 		}
1216 	}
1217 
1218 	off_t outputSize = 0;
1219 	ssize_t bytesRead;
1220 	int32 state = 0;
1221 
1222 	// output the actual text part of the data
1223 	do {
1224 		uint8 buffer[READ_BUFFER_SIZE];
1225 		bytesRead = source->Read(buffer, READ_BUFFER_SIZE);
1226 		if (bytesRead < B_OK)
1227 			return bytesRead;
1228 		if (bytesRead == 0)
1229 			break;
1230 
1231 		if (encodingBuffer.Size() == 0) {
1232 			// default, no encoding
1233 			ssize_t bytesWritten = destination->Write(buffer, bytesRead);
1234 			if (bytesWritten != bytesRead) {
1235 				if (bytesWritten < B_OK)
1236 					return bytesWritten;
1237 
1238 				return B_ERROR;
1239 			}
1240 
1241 			outputSize += bytesRead;
1242 		} else {
1243 			// decode text file to UTF-8
1244 			char* pos = (char*)buffer;
1245 			int32 encodingLength = encodingIO.BufferLength();
1246 			int32 bytesLeft = bytesRead;
1247 			int32 bytes;
1248 			do {
1249 				encodingLength = READ_BUFFER_SIZE * 4;
1250 				bytes = bytesLeft;
1251 
1252 				status = convert_to_utf8(encodingID, pos, &bytes,
1253 					(char*)encodingBuffer.Buffer(), &encodingLength, &state);
1254 				if (status < B_OK)
1255 					return status;
1256 
1257 				ssize_t bytesWritten = destination->Write(encodingBuffer.Buffer(),
1258 					encodingLength);
1259 				if (bytesWritten < encodingLength) {
1260 					if (bytesWritten < B_OK)
1261 						return bytesWritten;
1262 
1263 					return B_ERROR;
1264 				}
1265 
1266 				pos += bytes;
1267 				bytesLeft -= bytes;
1268 				outputSize += encodingLength;
1269 			} while (encodingLength > 0 && bytesLeft > 0);
1270 		}
1271 	} while (bytesRead > 0);
1272 
1273 	if (outType != B_STYLED_TEXT_FORMAT)
1274 		return B_OK;
1275 
1276 	if (encodingBuffer.Size() != 0 && size != outputSize) {
1277 		if (outputSize > UINT32_MAX)
1278 			return B_NOT_SUPPORTED;
1279 
1280 		// we need to update the header as the decoded text size has changed
1281 		status = destination->Seek(0, SEEK_SET);
1282 		if (status == B_OK)
1283 			status = output_headers(destination, (uint32)outputSize);
1284 		if (status == B_OK)
1285 			status = destination->Seek(0, SEEK_END);
1286 
1287 		if (status < B_OK)
1288 			return status;
1289 	}
1290 
1291 	// Read file attributes if outputting styled data
1292 	// and source is a BNode object
1293 
1294 	if (node == NULL)
1295 		return B_OK;
1296 
1297 	// Try to read styles - we only propagate an error if the actual on-disk
1298 	// data is likely to be okay
1299 
1300 	const char *kAttrName = "styles";
1301 	attr_info info;
1302 	if (node->GetAttrInfo(kAttrName, &info) != B_OK)
1303 		return B_OK;
1304 
1305 	if (info.type != B_RAW_TYPE || info.size < 160) {
1306 		// styles seem to be broken, but since we got the text,
1307 		// we don't propagate the error
1308 		return B_OK;
1309 	}
1310 
1311 	uint8* flatRunArray = new (std::nothrow) uint8[info.size];
1312 	if (flatRunArray == NULL)
1313 		return B_NO_MEMORY;
1314 
1315 	bytesRead = node->ReadAttr(kAttrName, B_RAW_TYPE, 0, flatRunArray, info.size);
1316 	if (bytesRead != info.size)
1317 		return B_OK;
1318 
1319 	output_styles(destination, size, flatRunArray, info.size);
1320 
1321 	delete[] flatRunArray;
1322 	return B_OK;
1323 }
1324 
1325 
1326 //	#pragma mark -
1327 
1328 
1329 STXTTranslator::STXTTranslator()
1330 	: BaseTranslator("StyledEdit Files", "StyledEdit files translator",
1331 		STXT_TRANSLATOR_VERSION,
1332 		gInputFormats, sizeof(gInputFormats) / sizeof(translation_format),
1333 		gOutputFormats, sizeof(gOutputFormats) / sizeof(translation_format),
1334 		"STXTTranslator_Settings",
1335 		gDefaultSettings, sizeof(gDefaultSettings) / sizeof(TranSetting),
1336 		B_TRANSLATOR_TEXT, B_STYLED_TEXT_FORMAT)
1337 {
1338 }
1339 
1340 
1341 STXTTranslator::~STXTTranslator()
1342 {
1343 }
1344 
1345 
1346 status_t
1347 STXTTranslator::Identify(BPositionIO *inSource,
1348 	const translation_format *inFormat, BMessage *ioExtension,
1349 	translator_info *outInfo, uint32 outType)
1350 {
1351 	if (!outType)
1352 		outType = B_TRANSLATOR_TEXT;
1353 	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1354 		return B_NO_TRANSLATOR;
1355 
1356 	const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader);
1357 
1358 	uint8 buffer[DATA_BUFFER_SIZE];
1359 	status_t nread = 0;
1360 	// Read in the header to determine
1361 	// if the data is supported
1362 	nread = inSource->Read(buffer, kstxtsize);
1363 	if (nread < 0)
1364 		return nread;
1365 
1366 	// read in enough data to fill the stream header
1367 	if (nread == kstxtsize) {
1368 		TranslatorStyledTextStreamHeader header;
1369 		memcpy(&header, buffer, kstxtsize);
1370 		if (swap_data(B_UINT32_TYPE, &header, kstxtsize,
1371 				B_SWAP_BENDIAN_TO_HOST) != B_OK)
1372 			return B_ERROR;
1373 
1374 		if (header.header.magic == B_STYLED_TEXT_FORMAT
1375 			&& header.header.header_size == (int32)kstxtsize
1376 			&& header.header.data_size == 0
1377 			&& header.version == 100)
1378 			return identify_stxt_header(header, inSource, outInfo, outType);
1379 	}
1380 
1381 	// if the data is not styled text, check if it is plain text
1382 	const char* encoding;
1383 	return identify_text(buffer, nread, inSource, outInfo, outType, encoding);
1384 }
1385 
1386 
1387 status_t
1388 STXTTranslator::Translate(BPositionIO* source, const translator_info* info,
1389 	BMessage* ioExtension, uint32 outType, BPositionIO* outDestination)
1390 {
1391 	if (!outType)
1392 		outType = B_TRANSLATOR_TEXT;
1393 	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1394 		return B_NO_TRANSLATOR;
1395 
1396 	const ssize_t headerSize = sizeof(TranslatorStyledTextStreamHeader);
1397 	uint8 buffer[DATA_BUFFER_SIZE];
1398 	status_t result;
1399 	translator_info outInfo;
1400 	// Read in the header to determine
1401 	// if the data is supported
1402 	ssize_t bytesRead = source->Read(buffer, headerSize);
1403 	if (bytesRead < 0)
1404 		return bytesRead;
1405 
1406 	// read in enough data to fill the stream header
1407 	if (bytesRead == headerSize) {
1408 		TranslatorStyledTextStreamHeader header;
1409 		memcpy(&header, buffer, headerSize);
1410 		if (swap_data(B_UINT32_TYPE, &header, headerSize,
1411 				B_SWAP_BENDIAN_TO_HOST) != B_OK)
1412 			return B_ERROR;
1413 
1414 		if (header.header.magic == B_STYLED_TEXT_FORMAT
1415 			&& header.header.header_size == sizeof(TranslatorStyledTextStreamHeader)
1416 			&& header.header.data_size == 0
1417 			&& header.version == 100) {
1418 			TranslatorStyledTextTextHeader textHeader;
1419 			result = identify_stxt_header(header, source, &outInfo, outType,
1420 				&textHeader);
1421 			if (result != B_OK)
1422 				return result;
1423 
1424 			return translate_from_stxt(source, outDestination, outType, textHeader);
1425 		}
1426 	}
1427 
1428 	// if the data is not styled text, check if it is ASCII text
1429 	bool forceEncoding = false;
1430 	const char* encoding = NULL;
1431 	result = identify_text(buffer, bytesRead, source, &outInfo, outType, encoding);
1432 	if (result != B_OK)
1433 		return result;
1434 
1435 	if (ioExtension != NULL) {
1436 		const char* value;
1437 		if (ioExtension->FindString("be:encoding", &value) == B_OK
1438 			&& value[0]) {
1439 			// override encoding
1440 			encoding = value;
1441 			forceEncoding = true;
1442 		}
1443 	}
1444 
1445 	return translate_from_text(source, encoding, forceEncoding, outDestination, outType);
1446 }
1447 
1448 
1449 BView *
1450 STXTTranslator::NewConfigView(TranslatorSettings *settings)
1451 {
1452 	return new STXTView(BRect(0, 0, 225, 175), "STXTTranslator Settings",
1453 		B_FOLLOW_ALL, B_WILL_DRAW, settings);
1454 }
1455 
1456