xref: /haiku/src/add-ons/translators/stxt/STXTTranslator.cpp (revision 99d027cd0238c1d86da86d7c3f4200509ccc61a6)
1 /*
2  * Copyright 2002-2009, Haiku, Inc. All rights reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Michael Wilber
7  *		Axel Dörfler, axeld@pinc-software.de
8  */
9 
10 
11 #include "STXTTranslator.h"
12 #include "STXTView.h"
13 
14 #include <Catalog.h>
15 #include <CharacterSet.h>
16 #include <CharacterSetRoster.h>
17 #include <MimeType.h>
18 #include <String.h>
19 #include <UTF8.h>
20 
21 #include <algorithm>
22 #include <new>
23 #include <string.h>
24 #include <stdio.h>
25 #include <stdint.h>
26 
27 
28 using namespace BPrivate;
29 using namespace std;
30 
31 #undef B_TRANSLATION_CONTEXT
32 #define B_TRANSLATION_CONTEXT "STXTTranslator"
33 
34 #define READ_BUFFER_SIZE 32768
35 #define DATA_BUFFER_SIZE 256
36 
// The input formats that this translator supports: plain text
// ("text/plain") and Be styled text ("text/x-vnd.Be-stxt"). Both entries
// use B_TRANSLATOR_TEXT as their second member.
// NOTE(review): the initializers are presumably in translation_format
// member order (type, group, quality, capability, MIME, name) -- confirm
// against the struct declaration.
static const translation_format sInputFormats[] = {
	{
		B_TRANSLATOR_TEXT,
		B_TRANSLATOR_TEXT,
		TEXT_IN_QUALITY,
		TEXT_IN_CAPABILITY,
		"text/plain",
		"Plain text file"
	},
	{
		B_STYLED_TEXT_FORMAT,
		B_TRANSLATOR_TEXT,
		STXT_IN_QUALITY,
		STXT_IN_CAPABILITY,
		"text/x-vnd.Be-stxt",
		"Be styled text file"
	}
};
56 
// The output formats that this translator supports: the same two formats
// as on the input side (plain text and Be styled text), but rated with the
// *_OUT_QUALITY / *_OUT_CAPABILITY constants.
// NOTE(review): the initializers are presumably in translation_format
// member order (type, group, quality, capability, MIME, name) -- confirm
// against the struct declaration.
static const translation_format sOutputFormats[] = {
	{
		B_TRANSLATOR_TEXT,
		B_TRANSLATOR_TEXT,
		TEXT_OUT_QUALITY,
		TEXT_OUT_CAPABILITY,
		"text/plain",
		"Plain text file"
	},
	{
		B_STYLED_TEXT_FORMAT,
		B_TRANSLATOR_TEXT,
		STXT_OUT_QUALITY,
		STXT_OUT_CAPABILITY,
		"text/x-vnd.Be-stxt",
		"Be styled text file"
	}
};
76 
// Default settings for the Translator: two boolean settings
// (B_TRANSLATOR_EXT_HEADER_ONLY and B_TRANSLATOR_EXT_DATA_ONLY), both
// initialized to false.
static const TranSetting sDefaultSettings[] = {
	{B_TRANSLATOR_EXT_HEADER_ONLY, TRAN_SETTING_BOOL, false},
	{B_TRANSLATOR_EXT_DATA_ONLY, TRAN_SETTING_BOOL, false}
};
82 
83 const uint32 kNumInputFormats = sizeof(sInputFormats) / sizeof(translation_format);
84 const uint32 kNumOutputFormats = sizeof(sOutputFormats) / sizeof(translation_format);
85 const uint32 kNumDefaultSettings = sizeof(sDefaultSettings) / sizeof(TranSetting);
86 
87 // ---------------------------------------------------------------
88 // make_nth_translator
89 //
90 // Creates a STXTTranslator object to be used by BTranslatorRoster
91 //
92 // Preconditions:
93 //
94 // Parameters: n,		The translator to return. Since
95 //						STXTTranslator only publishes one
96 //						translator, it only returns a
97 //						STXTTranslator if n == 0
98 //
99 //             you, 	The image_id of the add-on that
100 //						contains code (not used).
101 //
102 //             flags,	Has no meaning yet, should be 0.
103 //
104 // Postconditions:
105 //
106 // Returns: NULL if n is not zero,
107 //          a new STXTTranslator if n is zero
108 // ---------------------------------------------------------------
109 BTranslator *
110 make_nth_translator(int32 n, image_id you, uint32 flags, ...)
111 {
112 	if (!n)
113 		return new (std::nothrow) STXTTranslator();
114 
115 	return NULL;
116 }
117 
118 
119 // #pragma mark - ascmagic.c from the BSD file tool
120 /*
121  * The following code has been taken from version 4.17 of the BSD file tool,
122  * file ascmagic.c, modified for our purpose.
123  */
124 
125 /*
126  * Copyright (c) Ian F. Darwin 1986-1995.
127  * Software written by Ian F. Darwin and others;
128  * maintained 1995-present by Christos Zoulas and others.
129  *
130  * Redistribution and use in source and binary forms, with or without
131  * modification, are permitted provided that the following conditions
132  * are met:
133  * 1. Redistributions of source code must retain the above copyright
134  *    notice immediately at the beginning of the file, without modification,
135  *    this list of conditions, and the following disclaimer.
136  * 2. Redistributions in binary form must reproduce the above copyright
137  *    notice, this list of conditions and the following disclaimer in the
138  *    documentation and/or other materials provided with the distribution.
139  *
140  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
141  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
142  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
143  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
144  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
145  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
146  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
147  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
148  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
149  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
150  * SUCH DAMAGE.
151  */
152 /*
153  * ASCII magic -- file types that we know based on keywords
154  * that can appear anywhere in the file.
 *
169  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
170  * to handle character codes other than ASCII on a unified basis.
171  *
172  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
173  * international characters, now subsumed into this file.
174  */
175 
176 #include <stdio.h>
177 #include <string.h>
178 #include <memory.h>
179 #include <ctype.h>
180 #include <stdlib.h>
181 #include <unistd.h>
182 #include "names.h"
183 
// Holds one decoded character: the looks_*() functions store either a raw
// byte value or a decoded UTF-8/UTF-16 value per element.
typedef unsigned long my_unichar;

#define MAXLINELEN 300	/* longest sane line length */
// True for the characters treated as whitespace here: space, tab, CR, LF,
// the "next line" control 0x85, and form feed.
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
		  || (x) == 0x85 || (x) == '\f')

// Character-set probes. Each takes (input bytes, byte count, output
// character buffer, output character count) and returns non-zero when the
// input matches; looks_unicode() returns 1 for little-endian and 2 for
// big-endian data.
static int looks_ascii(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_utf8(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_unicode(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_latin1(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_extended(const unsigned char *, size_t, my_unichar *, size_t *);
// Translates an EBCDIC byte buffer into the given output buffer.
static void from_ebcdic(const unsigned char *, size_t, unsigned char *);
// Compares a NUL-terminated token against a word of the given length.
static int ascmatch(const unsigned char *, const my_unichar *, size_t);
198 
199 static int
200 file_ascmagic(const unsigned char *buf, size_t nbytes, BMimeType* mimeType,
201 	const char*& encoding)
202 {
203 	size_t i;
204 	unsigned char *nbuf = NULL;
205 	my_unichar *ubuf = NULL;
206 	size_t ulen;
207 	struct names *p;
208 	int rv = -1;
209 
210 	const char *code = NULL;
211 	encoding = NULL;
212 	const char *type = NULL;
213 	const char *subtype = NULL;
214 	const char *subtypeMimeGeneric = NULL;
215 	const char *subtypeMimeSpecific = NULL;
216 
217 	int has_escapes = 0;
218 	int has_backspace = 0;
219 	int seen_cr = 0;
220 
221 	int n_crlf = 0;
222 	int n_lf = 0;
223 	int n_cr = 0;
224 	int n_nel = 0;
225 
226 	int last_line_end = -1;
227 	int has_long_lines = 0;
228 
229 	if ((nbuf = (unsigned char*)malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL)
230 		goto done;
231 	if ((ubuf = (my_unichar*)malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL)
232 		goto done;
233 
234 	/*
235 	 * Then try to determine whether it's any character code we can
236 	 * identify.  Each of these tests, if it succeeds, will leave
237 	 * the text converted into one-my_unichar-per-character Unicode in
238 	 * ubuf, and the number of characters converted in ulen.
239 	 */
240 	if (nbytes == 0) {
241 		code = "UTF-8 Unicode";
242 		encoding = NULL; // "UTF-8";
243 		type = "text";
244 		rv = 1;
245 	} else if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
246 		code = "ASCII";
247 		encoding = NULL; //"us-ascii";
248 		type = "text";
249 		if (nbytes == 1) {
250 			// no further tests
251 			rv = 1;
252 		}
253 	} else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
254 		code = "UTF-8 Unicode";
255 		encoding = NULL; // "UTF-8";
256 		type = "text";
257 	} else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
258 		if (i == 1) {
259 			code = "Little-endian UTF-16 Unicode";
260 			encoding = "UTF-16";
261 		} else {
262 			code = "Big-endian UTF-16 Unicode";
263 			encoding = "UTF-16";
264 		}
265 
266 		type = "character data";
267 	} else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
268 		code = "ISO-8859";
269 		type = "text";
270 		encoding = "iso-8859-1";
271 	} else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
272 		code = "Non-ISO extended-ASCII";
273 		type = "text";
274 		encoding = "unknown";
275 	} else {
276 		from_ebcdic(buf, nbytes, nbuf);
277 
278 		if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
279 			code = "EBCDIC";
280 			type = "character data";
281 			encoding = "ebcdic";
282 		} else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
283 			code = "International EBCDIC";
284 			type = "character data";
285 			encoding = "ebcdic";
286 		} else {
287 			rv = 0;
288 			goto done;  /* doesn't look like text at all */
289 		}
290 	}
291 
292 	if (nbytes <= 1) {
293 		if (rv == -1)
294 			rv = 0;
295 		goto done;
296 	}
297 
298 	/*
299 	 * for troff, look for . + letter + letter or .\";
300 	 * this must be done to disambiguate tar archives' ./file
301 	 * and other trash from real troff input.
302 	 *
303 	 * I believe Plan 9 troff allows non-ASCII characters in the names
304 	 * of macros, so this test might possibly fail on such a file.
305 	 */
306 	if (*ubuf == '.') {
307 		my_unichar *tp = ubuf + 1;
308 
309 		while (ISSPC(*tp))
310 			++tp;	/* skip leading whitespace */
311 		if ((tp[0] == '\\' && tp[1] == '\"') ||
312 		    (isascii((unsigned char)tp[0]) &&
313 		     isalnum((unsigned char)tp[0]) &&
314 		     isascii((unsigned char)tp[1]) &&
315 		     isalnum((unsigned char)tp[1]) &&
316 		     ISSPC(tp[2]))) {
317 		    subtypeMimeGeneric = "text/x-source-code";
318 			subtypeMimeSpecific = "text/troff";
319 			subtype = "troff or preprocessor input";
320 			goto subtype_identified;
321 		}
322 	}
323 
324 	if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
325 		subtypeMimeGeneric = "text/x-source-code";
326 		subtypeMimeSpecific = "text/fortran";
327 		subtype = "fortran program";
328 		goto subtype_identified;
329 	}
330 
331 	/* look for tokens from names.h - this is expensive! */
332 
333 	i = 0;
334 	while (i < ulen) {
335 		size_t end;
336 
337 		/*
338 		 * skip past any leading space
339 		 */
340 		while (i < ulen && ISSPC(ubuf[i]))
341 			i++;
342 		if (i >= ulen)
343 			break;
344 
345 		/*
346 		 * find the next whitespace
347 		 */
348 		for (end = i + 1; end < nbytes; end++)
349 			if (ISSPC(ubuf[end]))
350 				break;
351 
352 		/*
353 		 * compare the word thus isolated against the token list
354 		 */
355 		for (p = names; p < names + NNAMES; p++) {
356 			if (ascmatch((const unsigned char *)p->name, ubuf + i,
357 			    end - i)) {
358 				subtype = types[p->type].human;
359 				subtypeMimeGeneric = types[p->type].generic_mime;
360 				subtypeMimeSpecific = types[p->type].specific_mime;
361 				goto subtype_identified;
362 			}
363 		}
364 
365 		i = end;
366 	}
367 
368 subtype_identified:
369 
370 	/*
371 	 * Now try to discover other details about the file.
372 	 */
373 	for (i = 0; i < ulen; i++) {
374 		if (ubuf[i] == '\n') {
375 			if (seen_cr)
376 				n_crlf++;
377 			else
378 				n_lf++;
379 			last_line_end = i;
380 		} else if (seen_cr)
381 			n_cr++;
382 
383 		seen_cr = (ubuf[i] == '\r');
384 		if (seen_cr)
385 			last_line_end = i;
386 
387 		if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
388 			n_nel++;
389 			last_line_end = i;
390 		}
391 
392 		/* If this line is _longer_ than MAXLINELEN, remember it. */
393 		if ((int)i > last_line_end + MAXLINELEN)
394 			has_long_lines = 1;
395 
396 		if (ubuf[i] == '\033')
397 			has_escapes = 1;
398 		if (ubuf[i] == '\b')
399 			has_backspace = 1;
400 	}
401 
402 	rv = 1;
403 done:
404 	if (nbuf)
405 		free(nbuf);
406 	if (ubuf)
407 		free(ubuf);
408 
409 	if (rv) {
410 		// If we have identified the subtype, return it, otherwise just
411 		// text/plain.
412 
413 		bool found = false;
414 		if (subtypeMimeSpecific != NULL) {
415 			mimeType->SetTo(subtypeMimeSpecific);
416 			if (mimeType->IsInstalled())
417 				found = true;
418 		}
419 		if (!found && subtypeMimeGeneric != NULL) {
420 			mimeType->SetTo(subtypeMimeGeneric);
421 			if (mimeType->IsInstalled())
422 				found = true;
423 		}
424 		if (!found)
425 			mimeType->SetTo("text/plain");
426 	}
427 
428 	return rv;
429 }
430 
431 static int
432 ascmatch(const unsigned char *s, const my_unichar *us, size_t ulen)
433 {
434 	size_t i;
435 
436 	for (i = 0; i < ulen; i++) {
437 		if (s[i] != us[i])
438 			return 0;
439 	}
440 
441 	if (s[i])
442 		return 0;
443 	else
444 		return 1;
445 }
446 
447 /*
448  * This table reflects a particular philosophy about what constitutes
449  * "text," and there is room for disagreement about it.
450  *
451  * Version 3.31 of the file command considered a file to be ASCII if
452  * each of its characters was approved by either the isascii() or
453  * isalpha() function.  On most systems, this would mean that any
454  * file consisting only of characters in the range 0x00 ... 0x7F
455  * would be called ASCII text, but many systems might reasonably
456  * consider some characters outside this range to be alphabetic,
457  * so the file command would call such characters ASCII.  It might
458  * have been more accurate to call this "considered textual on the
459  * local system" than "ASCII."
460  *
461  * It considered a file to be "International language text" if each
462  * of its characters was either an ASCII printing character (according
463  * to the real ASCII standard, not the above test), a character in
464  * the range 0x80 ... 0xFF, or one of the following control characters:
465  * backspace, tab, line feed, vertical tab, form feed, carriage return,
466  * escape.  No attempt was made to determine the language in which files
467  * of this type were written.
468  *
469  *
470  * The table below considers a file to be ASCII if all of its characters
471  * are either ASCII printing characters (again, according to the X3.4
472  * standard, not isascii()) or any of the following controls: bell,
473  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
474  *
475  * I include bell because some programs (particularly shell scripts)
476  * use it literally, even though it is rare in normal text.  I exclude
477  * vertical tab because it never seems to be used in real text.  I also
478  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
479  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
480  * character to.  It might be more appropriate to include it in the 8859
481  * set instead of the ASCII set, but it's got to be included in *something*
482  * we recognize or EBCDIC files aren't going to be considered textual.
483  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
484  * and Latin characters, so these should possibly be allowed.  But they
485  * make a real mess on VT100-style displays if they're not paired properly,
486  * so we are probably better off not calling them text.
487  *
488  * A file is considered to be ISO-8859 text if its characters are all
489  * either ASCII, according to the above definition, or printing characters
490  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
491  *
492  * Finally, a file is considered to be international text from some other
493  * character code if its characters are all either ISO-8859 (according to
494  * the above definition) or characters in the range 0x80 ... 0x9F, which
495  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
496  * consider to be printing characters.
497  */
498 
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Classification of every byte value, indexed by the byte itself.
 * The table is only ever read, hence const.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
        /*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
525 
526 static int
527 looks_ascii(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
528     size_t *ulen)
529 {
530 	int i;
531 
532 	*ulen = 0;
533 
534 	for (i = 0; i < (int)nbytes; i++) {
535 		int t = text_chars[buf[i]];
536 
537 		if (t != T)
538 			return 0;
539 
540 		ubuf[(*ulen)++] = buf[i];
541 	}
542 
543 	return 1;
544 }
545 
546 static int
547 looks_latin1(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
548 {
549 	int i;
550 
551 	*ulen = 0;
552 
553 	for (i = 0; i < (int)nbytes; i++) {
554 		int t = text_chars[buf[i]];
555 
556 		if (t != T && t != I)
557 			return 0;
558 
559 		ubuf[(*ulen)++] = buf[i];
560 	}
561 
562 	return 1;
563 }
564 
565 static int
566 looks_extended(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
567     size_t *ulen)
568 {
569 	int i;
570 
571 	*ulen = 0;
572 
573 	for (i = 0; i < (int)nbytes; i++) {
574 		int t = text_chars[buf[i]];
575 
576 		if (t != T && t != I && t != X)
577 			return 0;
578 
579 		ubuf[(*ulen)++] = buf[i];
580 	}
581 
582 	return 1;
583 }
584 
585 static int
586 looks_utf8(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
587 {
588 	int i, n;
589 	my_unichar c;
590 	int gotone = 0;
591 
592 	*ulen = 0;
593 
594 	for (i = 0; i < (int)nbytes; i++) {
595 		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
596 			/*
597 			 * Even if the whole file is valid UTF-8 sequences,
598 			 * still reject it if it uses weird control characters.
599 			 */
600 
601 			if (text_chars[buf[i]] != T)
602 				return 0;
603 
604 			ubuf[(*ulen)++] = buf[i];
605 		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
606 			return 0;
607 		} else {			   /* 11xxxxxx begins UTF-8 */
608 			int following;
609 
610 			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
611 				c = buf[i] & 0x1f;
612 				following = 1;
613 			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
614 				c = buf[i] & 0x0f;
615 				following = 2;
616 			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
617 				c = buf[i] & 0x07;
618 				following = 3;
619 			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
620 				c = buf[i] & 0x03;
621 				following = 4;
622 			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
623 				c = buf[i] & 0x01;
624 				following = 5;
625 			} else
626 				return 0;
627 
628 			for (n = 0; n < following; n++) {
629 				i++;
630 				if (i >= (int)nbytes)
631 					goto done;
632 
633 				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
634 					return 0;
635 
636 				c = (c << 6) + (buf[i] & 0x3f);
637 			}
638 
639 			ubuf[(*ulen)++] = c;
640 			gotone = 1;
641 		}
642 	}
643 done:
644 	return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
645 }
646 
647 static int
648 looks_unicode(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
649     size_t *ulen)
650 {
651 	int bigend;
652 	int i;
653 
654 	if (nbytes < 2)
655 		return 0;
656 
657 	if (buf[0] == 0xff && buf[1] == 0xfe)
658 		bigend = 0;
659 	else if (buf[0] == 0xfe && buf[1] == 0xff)
660 		bigend = 1;
661 	else
662 		return 0;
663 
664 	*ulen = 0;
665 
666 	for (i = 2; i + 1 < (int)nbytes; i += 2) {
667 		/* XXX fix to properly handle chars > 65536 */
668 
669 		if (bigend)
670 			ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
671 		else
672 			ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
673 
674 		if (ubuf[*ulen - 1] == 0xfffe)
675 			return 0;
676 		if (ubuf[*ulen - 1] < 128 &&
677 		    text_chars[(size_t)ubuf[*ulen - 1]] != T)
678 			return 0;
679 	}
680 
681 	return 1 + bigend;
682 }
683 
684 #undef F
685 #undef T
686 #undef I
687 #undef X
688 
689 /*
690  * This table maps each EBCDIC character to an (8-bit extended) ASCII
691  * character, as specified in the rationale for the dd(1) command in
692  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
693  *
694  * Unfortunately it does not seem to correspond exactly to any of the
695  * five variants of EBCDIC documented in IBM's _Enterprise Systems
696  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
697  * Edition, July, 1999, pp. I-1 - I-4.
698  *
699  * Fortunately, though, all versions of EBCDIC, including this one, agree
700  * on most of the printing characters that also appear in (7-bit) ASCII.
701  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
702  *
703  * Fortunately too, there is general agreement that codes 0x00 through
704  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
705  * remainder printing characters.
706  *
707  * This is sufficient to allow us to identify EBCDIC text and to distinguish
708  * between old-style and internationalized examples of text.
709  */
710 
/*
 * Indexed by EBCDIC byte value; yields the (8-bit extended) ASCII value.
 * The table is only ever read, hence const.
 */
static const unsigned char ebcdic_to_ascii[] = {
  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};
729 
730 #ifdef notdef
731 /*
732  * The following EBCDIC-to-ASCII table may relate more closely to reality,
733  * or at least to modern reality.  It comes from
734  *
735  *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
736  *
737  * and maps the characters of EBCDIC code page 1047 (the code used for
738  * Unix-derived software on IBM's 390 systems) to the corresponding
739  * characters from ISO 8859-1.
740  *
741  * If this table is used instead of the above one, some of the special
742  * cases for the NEL character can be taken out of the code.
743  */
744 
// Alternative conversion table, indexed by EBCDIC byte value. It is
// compiled out (it sits inside the "#ifdef notdef" above) and nothing in
// the visible code refers to it.
static unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
763 #endif
764 
765 /*
766  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
767  */
768 static void
769 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
770 {
771 	int i;
772 
773 	for (i = 0; i < (int)nbytes; i++) {
774 		out[i] = ebcdic_to_ascii[buf[i]];
775 	}
776 }
777 
778 
779 //	#pragma mark -
780 
781 
782 /*!
783 	Determines if the data in inSource is of the STXT format.
784 
785 	\param header the STXT stream header read in by Identify() or Translate()
786 	\param inSource the stream with the STXT data
787 	\param outInfo information about the type of data from inSource is stored here
788 	\param outType the desired output type for the data in inSource
789 	\param ptxtheader if this is not NULL, the TEXT header from
790 		inSource is copied to it
791 */
792 status_t
793 identify_stxt_header(const TranslatorStyledTextStreamHeader &header,
794 	BPositionIO *inSource, translator_info *outInfo, uint32 outType,
795 	TranslatorStyledTextTextHeader *ptxtheader = NULL)
796 {
797 	const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader);
798 	const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader);
799 
800 	uint8 buffer[max(ktxtsize, kstylsize)];
801 
802 	// Check the TEXT header
803 	TranslatorStyledTextTextHeader txtheader;
804 	if (inSource->Read(buffer, ktxtsize) != ktxtsize)
805 		return B_NO_TRANSLATOR;
806 
807 	memcpy(&txtheader, buffer, ktxtsize);
808 	if (swap_data(B_UINT32_TYPE, &txtheader, ktxtsize,
809 		B_SWAP_BENDIAN_TO_HOST) != B_OK)
810 		return B_ERROR;
811 
812 	if (txtheader.header.magic != 'TEXT'
813 		|| txtheader.header.header_size != sizeof(TranslatorStyledTextTextHeader)
814 		|| txtheader.charset != B_UNICODE_UTF8)
815 		return B_NO_TRANSLATOR;
816 
817 	// skip the text data
818 	off_t seekresult, pos;
819 	pos = header.header.header_size + txtheader.header.header_size
820 		+ txtheader.header.data_size;
821 	seekresult = inSource->Seek(txtheader.header.data_size,
822 		SEEK_CUR);
823 	if (seekresult < pos)
824 		return B_NO_TRANSLATOR;
825 	if (seekresult > pos)
826 		return B_ERROR;
827 
828 	// check the STYL header (not all STXT files have this)
829 	ssize_t read = 0;
830 	TranslatorStyledTextStyleHeader stylheader;
831 	read = inSource->Read(buffer, kstylsize);
832 	if (read < 0)
833 		return read;
834 	if (read != kstylsize && read != 0)
835 		return B_NO_TRANSLATOR;
836 
837 	// If there is a STYL header
838 	if (read == kstylsize) {
839 		memcpy(&stylheader, buffer, kstylsize);
840 		if (swap_data(B_UINT32_TYPE, &stylheader, kstylsize,
841 			B_SWAP_BENDIAN_TO_HOST) != B_OK)
842 			return B_ERROR;
843 
844 		if (stylheader.header.magic != 'STYL'
845 			|| stylheader.header.header_size !=
846 				sizeof(TranslatorStyledTextStyleHeader))
847 			return B_NO_TRANSLATOR;
848 	}
849 
850 	// if output TEXT header is supplied, fill it with data
851 	if (ptxtheader) {
852 		ptxtheader->header.magic = txtheader.header.magic;
853 		ptxtheader->header.header_size = txtheader.header.header_size;
854 		ptxtheader->header.data_size = txtheader.header.data_size;
855 		ptxtheader->charset = txtheader.charset;
856 	}
857 
858 	// return information about the data in the stream
859 	outInfo->type = B_STYLED_TEXT_FORMAT;
860 	outInfo->group = B_TRANSLATOR_TEXT;
861 	outInfo->quality = STXT_IN_QUALITY;
862 	outInfo->capability = STXT_IN_CAPABILITY;
863 	strlcpy(outInfo->name, B_TRANSLATE("Be styled text file"),
864 		sizeof(outInfo->name));
865 	strcpy(outInfo->MIME, "text/x-vnd.Be-stxt");
866 
867 	return B_OK;
868 }
869 
870 
871 /*!
872 	Determines if the data in \a inSource is of the UTF8 plain
873 
874 	\param data buffer containing data already read (must be at
875 		least DATA_BUFFER_SIZE bytes large)
876 	\param nread number of bytes that have already been read from the stream
877 	\param header the STXT stream header read in by Identify() or Translate()
878 	\param inSource the stream with the STXT data
879 	\param outInfo information about the type of data from inSource is stored here
880 	\param outType the desired output type for the data in inSource
881 */
882 status_t
883 identify_text(uint8* data, int32 bytesRead, BPositionIO* source,
884 	translator_info* outInfo, uint32 outType, const char*& encoding)
885 {
886 	ssize_t readLater = source->Read(data + bytesRead, DATA_BUFFER_SIZE - bytesRead);
887 	if (readLater < B_OK)
888 		return B_NO_TRANSLATOR;
889 
890 	bytesRead += readLater;
891 
892 	// TODO: identify encoding as possible!
893 	BMimeType type;
894 	if (!file_ascmagic((const unsigned char*)data, bytesRead, &type, encoding))
895 		return B_NO_TRANSLATOR;
896 
897 	float capability = TEXT_IN_CAPABILITY;
898 	if (bytesRead < 20)
899 		capability = .1f;
900 
901 	// return information about the data in the stream
902 	outInfo->type = B_TRANSLATOR_TEXT;
903 	outInfo->group = B_TRANSLATOR_TEXT;
904 	outInfo->quality = TEXT_IN_QUALITY;
905 	outInfo->capability = capability;
906 
907 	char description[B_MIME_TYPE_LENGTH];
908 	if (type.GetLongDescription(description) == B_OK)
909 		strlcpy(outInfo->name, description, sizeof(outInfo->name));
910 	else
911 		strlcpy(outInfo->name, B_TRANSLATE("Plain text file"),
912 			sizeof(outInfo->name));
913 
914 	//strlcpy(outInfo->MIME, type.Type(), sizeof(outInfo->MIME));
915 	strcpy(outInfo->MIME, "text/plain");
916 	return B_OK;
917 }
918 
919 
920 // ---------------------------------------------------------------
921 // translate_from_stxt
922 //
923 // Translates the data in inSource to the type outType and stores
924 // the translated data in outDestination.
925 //
926 // Preconditions:
927 //
928 // Parameters:	inSource,	the data to be translated
929 //
930 //				outDestination,	where the translated data is
931 //								put
932 //
933 //				outType,	the type to convert inSource to
934 //
935 //				txtheader, 	the TEXT header from inSource
936 //
937 //
938 // Postconditions:
939 //
940 // Returns: B_BAD_VALUE, if outType is invalid
941 //
942 // B_NO_TRANSLATOR, if this translator doesn't understand the data
943 //
944 // B_ERROR, if there was an error allocating memory or converting
945 //          data
946 //
947 // B_OK, if all went well
948 // ---------------------------------------------------------------
949 status_t
950 translate_from_stxt(BPositionIO *inSource, BPositionIO *outDestination,
951 		uint32 outType, const TranslatorStyledTextTextHeader &txtheader)
952 {
953 	if (inSource->Seek(0, SEEK_SET) != 0)
954 		return B_ERROR;
955 
956 	const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader);
957 	const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader);
958 
959 	bool btoplain;
960 	if (outType == B_TRANSLATOR_TEXT)
961 		btoplain = true;
962 	else if (outType == B_STYLED_TEXT_FORMAT)
963 		btoplain = false;
964 	else
965 		return B_BAD_VALUE;
966 
967 	uint8 buffer[READ_BUFFER_SIZE];
968 	ssize_t nread = 0, nwritten = 0, nreed = 0, ntotalread = 0;
969 
970 	// skip to the actual text data when outputting a
971 	// plain text file
972 	if (btoplain) {
973 		if (inSource->Seek(kstxtsize + ktxtsize, SEEK_CUR) !=
974 			kstxtsize + ktxtsize)
975 			return B_ERROR;
976 	}
977 
978 	// Read data from inSource
979 	// When outputing B_TRANSLATOR_TEXT, the loop stops when all of
980 	// the text data has been read and written.
981 	// When outputting B_STYLED_TEXT_FORMAT, the loop stops when all
982 	// of the data from inSource has been read and written.
983 	if (btoplain)
984 		nreed = min((size_t)READ_BUFFER_SIZE,
985 			(size_t)txtheader.header.data_size - ntotalread);
986 	else
987 		nreed = READ_BUFFER_SIZE;
988 	nread = inSource->Read(buffer, nreed);
989 	while (nread > 0) {
990 		nwritten = outDestination->Write(buffer, nread);
991 		if (nwritten != nread)
992 			return B_ERROR;
993 
994 		if (btoplain) {
995 			ntotalread += nread;
996 			nreed = min((size_t)READ_BUFFER_SIZE,
997 				(size_t)txtheader.header.data_size - ntotalread);
998 		} else
999 			nreed = READ_BUFFER_SIZE;
1000 		nread = inSource->Read(buffer, nreed);
1001 	}
1002 
1003 	if (btoplain && static_cast<ssize_t>(txtheader.header.data_size) !=
1004 		ntotalread)
1005 		// If not all of the text data was able to be read...
1006 		return B_NO_TRANSLATOR;
1007 	else
1008 		return B_OK;
1009 }
1010 
1011 // ---------------------------------------------------------------
1012 // output_headers
1013 //
1014 // Outputs the Stream and Text headers from the B_STYLED_TEXT_FORMAT
1015 // to outDestination, setting the data_size member of the text header
1016 // to text_data_size
1017 //
1018 // Preconditions:
1019 //
1020 // Parameters:	outDestination,	where the translated data is
1021 //								put
1022 //
1023 //				text_data_size, number of bytes in data section
1024 //							    of the TEXT header
1025 //
1026 //
1027 // Postconditions:
1028 //
1029 // Returns:
1030 //
1031 // B_ERROR, if there was an error writing to outDestination or
1032 // 	an error with converting the byte order
1033 //
1034 // B_OK, if all went well
1035 // ---------------------------------------------------------------
1036 status_t
1037 output_headers(BPositionIO *outDestination, uint32 text_data_size)
1038 {
1039 	const int32 kHeadersSize = sizeof(TranslatorStyledTextStreamHeader) +
1040 		sizeof(TranslatorStyledTextTextHeader);
1041 	status_t result;
1042 	TranslatorStyledTextStreamHeader stxtheader;
1043 	TranslatorStyledTextTextHeader txtheader;
1044 
1045 	uint8 buffer[kHeadersSize];
1046 
1047 	stxtheader.header.magic = 'STXT';
1048 	stxtheader.header.header_size = sizeof(TranslatorStyledTextStreamHeader);
1049 	stxtheader.header.data_size = 0;
1050 	stxtheader.version = 100;
1051 	memcpy(buffer, &stxtheader, stxtheader.header.header_size);
1052 
1053 	txtheader.header.magic = 'TEXT';
1054 	txtheader.header.header_size = sizeof(TranslatorStyledTextTextHeader);
1055 	txtheader.header.data_size = text_data_size;
1056 	txtheader.charset = B_UNICODE_UTF8;
1057 	memcpy(buffer + stxtheader.header.header_size, &txtheader,
1058 		txtheader.header.header_size);
1059 
1060 	// write out headers in Big Endian byte order
1061 	result = swap_data(B_UINT32_TYPE, buffer, kHeadersSize,
1062 		B_SWAP_HOST_TO_BENDIAN);
1063 	if (result == B_OK) {
1064 		ssize_t nwritten = 0;
1065 		nwritten = outDestination->Write(buffer, kHeadersSize);
1066 		if (nwritten != kHeadersSize)
1067 			return B_ERROR;
1068 		else
1069 			return B_OK;
1070 	}
1071 
1072 	return result;
1073 }
1074 
1075 // ---------------------------------------------------------------
1076 // output_styles
1077 //
1078 // Writes out the actual style information into outDestination
1079 // using the data from pflatRunArray
1080 //
1081 // Preconditions:
1082 //
1083 // Parameters:	outDestination,	where the translated data is
1084 //								put
1085 //
1086 //				text_size,		size in bytes of the text in
1087 //								outDestination
1088 //
1089 //				data_size,		size of pflatRunArray
1090 //
1091 // Postconditions:
1092 //
1093 // Returns:
1094 //
1095 // B_ERROR, if there was an error writing to outDestination or
1096 // 	an error with converting the byte order
1097 //
1098 // B_OK, if all went well
1099 // ---------------------------------------------------------------
1100 status_t
1101 output_styles(BPositionIO *outDestination, uint32 text_size,
1102 	uint8 *pflatRunArray, ssize_t data_size)
1103 {
1104 	const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader);
1105 
1106 	uint8 buffer[kstylsize];
1107 
1108 	// output STYL header
1109 	TranslatorStyledTextStyleHeader stylheader;
1110 	stylheader.header.magic = 'STYL';
1111 	stylheader.header.header_size =
1112 		sizeof(TranslatorStyledTextStyleHeader);
1113 	stylheader.header.data_size = data_size;
1114 	stylheader.apply_offset = 0;
1115 	stylheader.apply_length = text_size;
1116 
1117 	memcpy(buffer, &stylheader, kstylsize);
1118 	if (swap_data(B_UINT32_TYPE, buffer, kstylsize,
1119 		B_SWAP_HOST_TO_BENDIAN) != B_OK)
1120 		return B_ERROR;
1121 	if (outDestination->Write(buffer, kstylsize) != kstylsize)
1122 		return B_ERROR;
1123 
1124 	// output actual style information
1125 	if (outDestination->Write(pflatRunArray,
1126 		data_size) != data_size)
1127 		return B_ERROR;
1128 
1129 	return B_OK;
1130 }
1131 
1132 
1133 /*!
1134 	Convert the plain text (UTF8) from inSource to plain or
1135 	styled text in outDestination
1136 */
1137 status_t
1138 translate_from_text(BPositionIO* source, const char* encoding, bool forceEncoding,
1139 	BPositionIO* destination, uint32 outType)
1140 {
1141 	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1142 		return B_BAD_VALUE;
1143 
1144 	// find the length of the text
1145 	off_t size = source->Seek(0, SEEK_END);
1146 	if (size < 0)
1147 		return (status_t)size;
1148 	if (size > UINT32_MAX && outType == B_STYLED_TEXT_FORMAT)
1149 		return B_NOT_SUPPORTED;
1150 
1151 	status_t status = source->Seek(0, SEEK_SET);
1152 	if (status < B_OK)
1153 		return status;
1154 
1155 	if (outType == B_STYLED_TEXT_FORMAT) {
1156 		// output styled text headers
1157 		status = output_headers(destination, (uint32)size);
1158 		if (status != B_OK)
1159 			return status;
1160 	}
1161 
1162 	class MallocBuffer {
1163 		public:
1164 			MallocBuffer() : fBuffer(NULL), fSize(0) {}
1165 			~MallocBuffer() { free(fBuffer); }
1166 
1167 			void* Buffer() { return fBuffer; }
1168 			size_t Size() const { return fSize; }
1169 
1170 			status_t
1171 			Allocate(size_t size)
1172 			{
1173 				fBuffer = malloc(size);
1174 				if (fBuffer != NULL) {
1175 					fSize = size;
1176 					return B_OK;
1177 				}
1178 				return B_NO_MEMORY;
1179 			}
1180 
1181 		private:
1182 			void*	fBuffer;
1183 			size_t	fSize;
1184 	} encodingBuffer;
1185 	BMallocIO encodingIO;
1186 	uint32 encodingID = 0;
1187 		// defaults to UTF-8 or no encoding
1188 
1189 	BNode* node = dynamic_cast<BNode*>(source);
1190 	if (node != NULL) {
1191 		// determine encoding, if available
1192 		const BCharacterSet* characterSet = NULL;
1193 		bool hasAttribute = false;
1194 		if (encoding != NULL && !forceEncoding) {
1195 			BString name;
1196 			if (node->ReadAttrString("be:encoding", &name) == B_OK) {
1197 				encoding = name.String();
1198 				hasAttribute = true;
1199 			} else {
1200 				int32 value;
1201 				ssize_t bytesRead = node->ReadAttr("be:encoding", B_INT32_TYPE, 0,
1202 					&value, sizeof(value));
1203 				if (bytesRead == (ssize_t)sizeof(value)) {
1204 					hasAttribute = true;
1205 					if (value != 65535)
1206 						characterSet = BCharacterSetRoster::GetCharacterSetByConversionID(value);
1207 				}
1208 			}
1209 		} else {
1210 			hasAttribute = true;
1211 				// we don't write the encoding in this case
1212 		}
1213 		if (characterSet == NULL && encoding != NULL)
1214 			characterSet = BCharacterSetRoster::FindCharacterSetByName(encoding);
1215 
1216 		if (characterSet != NULL) {
1217 			encodingID = characterSet->GetConversionID();
1218 			encodingBuffer.Allocate(READ_BUFFER_SIZE * 4);
1219 		}
1220 
1221 		if (!hasAttribute && encoding != NULL) {
1222 			// add encoding attribute, so that someone opening the file can
1223 			// retrieve it for persistance
1224 			node->WriteAttr("be:encoding", B_STRING_TYPE, 0, encoding,
1225 				strlen(encoding));
1226 		}
1227 	}
1228 
1229 	off_t outputSize = 0;
1230 	ssize_t bytesRead;
1231 	int32 state = 0;
1232 
1233 	// output the actual text part of the data
1234 	do {
1235 		uint8 buffer[READ_BUFFER_SIZE];
1236 		bytesRead = source->Read(buffer, READ_BUFFER_SIZE);
1237 		if (bytesRead < B_OK)
1238 			return bytesRead;
1239 		if (bytesRead == 0)
1240 			break;
1241 
1242 		if (encodingBuffer.Size() == 0) {
1243 			// default, no encoding
1244 			ssize_t bytesWritten = destination->Write(buffer, bytesRead);
1245 			if (bytesWritten != bytesRead) {
1246 				if (bytesWritten < B_OK)
1247 					return bytesWritten;
1248 
1249 				return B_ERROR;
1250 			}
1251 
1252 			outputSize += bytesRead;
1253 		} else {
1254 			// decode text file to UTF-8
1255 			char* pos = (char*)buffer;
1256 			int32 encodingLength = encodingIO.BufferLength();
1257 			int32 bytesLeft = bytesRead;
1258 			int32 bytes;
1259 			do {
1260 				encodingLength = READ_BUFFER_SIZE * 4;
1261 				bytes = bytesLeft;
1262 
1263 				status = convert_to_utf8(encodingID, pos, &bytes,
1264 					(char*)encodingBuffer.Buffer(), &encodingLength, &state);
1265 				if (status < B_OK)
1266 					return status;
1267 
1268 				ssize_t bytesWritten = destination->Write(encodingBuffer.Buffer(),
1269 					encodingLength);
1270 				if (bytesWritten < encodingLength) {
1271 					if (bytesWritten < B_OK)
1272 						return bytesWritten;
1273 
1274 					return B_ERROR;
1275 				}
1276 
1277 				pos += bytes;
1278 				bytesLeft -= bytes;
1279 				outputSize += encodingLength;
1280 			} while (encodingLength > 0 && bytesLeft > 0);
1281 		}
1282 	} while (bytesRead > 0);
1283 
1284 	if (outType != B_STYLED_TEXT_FORMAT)
1285 		return B_OK;
1286 
1287 	if (encodingBuffer.Size() != 0 && size != outputSize) {
1288 		if (outputSize > UINT32_MAX)
1289 			return B_NOT_SUPPORTED;
1290 
1291 		// we need to update the header as the decoded text size has changed
1292 		status = destination->Seek(0, SEEK_SET);
1293 		if (status == B_OK)
1294 			status = output_headers(destination, (uint32)outputSize);
1295 		if (status == B_OK)
1296 			status = destination->Seek(0, SEEK_END);
1297 
1298 		if (status < B_OK)
1299 			return status;
1300 	}
1301 
1302 	// Read file attributes if outputting styled data
1303 	// and source is a BNode object
1304 
1305 	if (node == NULL)
1306 		return B_OK;
1307 
1308 	// Try to read styles - we only propagate an error if the actual on-disk
1309 	// data is likely to be okay
1310 
1311 	const char *kAttrName = "styles";
1312 	attr_info info;
1313 	if (node->GetAttrInfo(kAttrName, &info) != B_OK)
1314 		return B_OK;
1315 
1316 	if (info.type != B_RAW_TYPE || info.size < 160) {
1317 		// styles seem to be broken, but since we got the text,
1318 		// we don't propagate the error
1319 		return B_OK;
1320 	}
1321 
1322 	uint8* flatRunArray = new (std::nothrow) uint8[info.size];
1323 	if (flatRunArray == NULL)
1324 		return B_NO_MEMORY;
1325 
1326 	bytesRead = node->ReadAttr(kAttrName, B_RAW_TYPE, 0, flatRunArray, info.size);
1327 	if (bytesRead != info.size)
1328 		return B_OK;
1329 
1330 	output_styles(destination, size, flatRunArray, info.size);
1331 
1332 	delete[] flatRunArray;
1333 	return B_OK;
1334 }
1335 
1336 
1337 //	#pragma mark -
1338 
1339 
1340 STXTTranslator::STXTTranslator()
1341 	: BaseTranslator(B_TRANSLATE("StyledEdit files"),
1342 		B_TRANSLATE("StyledEdit file translator"),
1343 		STXT_TRANSLATOR_VERSION,
1344 		sInputFormats, kNumInputFormats,
1345 		sOutputFormats, kNumOutputFormats,
1346 		"STXTTranslator_Settings",
1347 		sDefaultSettings, kNumDefaultSettings,
1348 		B_TRANSLATOR_TEXT, B_STYLED_TEXT_FORMAT)
1349 {
1350 }
1351 
1352 
1353 STXTTranslator::~STXTTranslator()
1354 {
1355 }
1356 
1357 
1358 status_t
1359 STXTTranslator::Identify(BPositionIO *inSource,
1360 	const translation_format *inFormat, BMessage *ioExtension,
1361 	translator_info *outInfo, uint32 outType)
1362 {
1363 	if (!outType)
1364 		outType = B_TRANSLATOR_TEXT;
1365 	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1366 		return B_NO_TRANSLATOR;
1367 
1368 	const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader);
1369 
1370 	uint8 buffer[DATA_BUFFER_SIZE];
1371 	status_t nread = 0;
1372 	// Read in the header to determine
1373 	// if the data is supported
1374 	nread = inSource->Read(buffer, kstxtsize);
1375 	if (nread < 0)
1376 		return nread;
1377 
1378 	// read in enough data to fill the stream header
1379 	if (nread == kstxtsize) {
1380 		TranslatorStyledTextStreamHeader header;
1381 		memcpy(&header, buffer, kstxtsize);
1382 		if (swap_data(B_UINT32_TYPE, &header, kstxtsize,
1383 				B_SWAP_BENDIAN_TO_HOST) != B_OK)
1384 			return B_ERROR;
1385 
1386 		if (header.header.magic == B_STYLED_TEXT_FORMAT
1387 			&& header.header.header_size == (int32)kstxtsize
1388 			&& header.header.data_size == 0
1389 			&& header.version == 100)
1390 			return identify_stxt_header(header, inSource, outInfo, outType);
1391 	}
1392 
1393 	// if the data is not styled text, check if it is plain text
1394 	const char* encoding;
1395 	return identify_text(buffer, nread, inSource, outInfo, outType, encoding);
1396 }
1397 
1398 
1399 status_t
1400 STXTTranslator::Translate(BPositionIO* source, const translator_info* info,
1401 	BMessage* ioExtension, uint32 outType, BPositionIO* outDestination)
1402 {
1403 	if (!outType)
1404 		outType = B_TRANSLATOR_TEXT;
1405 	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1406 		return B_NO_TRANSLATOR;
1407 
1408 	const ssize_t headerSize = sizeof(TranslatorStyledTextStreamHeader);
1409 	uint8 buffer[DATA_BUFFER_SIZE];
1410 	status_t result;
1411 	translator_info outInfo;
1412 	// Read in the header to determine
1413 	// if the data is supported
1414 	ssize_t bytesRead = source->Read(buffer, headerSize);
1415 	if (bytesRead < 0)
1416 		return bytesRead;
1417 
1418 	// read in enough data to fill the stream header
1419 	if (bytesRead == headerSize) {
1420 		TranslatorStyledTextStreamHeader header;
1421 		memcpy(&header, buffer, headerSize);
1422 		if (swap_data(B_UINT32_TYPE, &header, headerSize,
1423 				B_SWAP_BENDIAN_TO_HOST) != B_OK)
1424 			return B_ERROR;
1425 
1426 		if (header.header.magic == B_STYLED_TEXT_FORMAT
1427 			&& header.header.header_size == sizeof(TranslatorStyledTextStreamHeader)
1428 			&& header.header.data_size == 0
1429 			&& header.version == 100) {
1430 			TranslatorStyledTextTextHeader textHeader;
1431 			result = identify_stxt_header(header, source, &outInfo, outType,
1432 				&textHeader);
1433 			if (result != B_OK)
1434 				return result;
1435 
1436 			return translate_from_stxt(source, outDestination, outType, textHeader);
1437 		}
1438 	}
1439 
1440 	// if the data is not styled text, check if it is ASCII text
1441 	bool forceEncoding = false;
1442 	const char* encoding = NULL;
1443 	result = identify_text(buffer, bytesRead, source, &outInfo, outType, encoding);
1444 	if (result != B_OK)
1445 		return result;
1446 
1447 	if (ioExtension != NULL) {
1448 		const char* value;
1449 		if (ioExtension->FindString("be:encoding", &value) == B_OK
1450 			&& value[0]) {
1451 			// override encoding
1452 			encoding = value;
1453 			forceEncoding = true;
1454 		}
1455 	}
1456 
1457 	return translate_from_text(source, encoding, forceEncoding, outDestination, outType);
1458 }
1459 
1460 
1461 BView *
1462 STXTTranslator::NewConfigView(TranslatorSettings *settings)
1463 {
1464 	return new STXTView(BRect(0, 0, 225, 175),
1465 		B_TRANSLATE("STXTTranslator Settings"),
1466 		B_FOLLOW_ALL, B_WILL_DRAW, settings);
1467 }
1468 
1469