xref: /haiku/src/add-ons/translators/stxt/STXTTranslator.cpp (revision adb0d19d561947362090081e81d90dde59142026)
1 /*
2  * Copyright 2002-2008, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Michael Wilber
7  *		Axel Dörfler, axeld@pinc-software.de
8  */
9 
10 
11 #include "STXTTranslator.h"
12 #include "STXTView.h"
13 
14 #include <CharacterSet.h>
15 #include <CharacterSetRoster.h>
16 #include <MimeType.h>
17 #include <String.h>
18 #include <UTF8.h>
19 
20 #include <algorithm>
21 #include <new>
22 #include <string.h>
23 #include <stdio.h>
24 #include <stdint.h>
25 
26 
27 using namespace BPrivate;
28 using namespace std;
29 
30 
31 #define READ_BUFFER_SIZE 32768
32 #define DATA_BUFFER_SIZE 256
33 
// The input formats that this translator supports.
// Each entry is one translation_format record; judging by the values, the
// fields are (in order) the format type, the format group, the quality and
// capability ratings, the MIME type and a human-readable name -- confirm
// against the translation_format declaration.
translation_format gInputFormats[] = {
	// plain text
	{
		B_TRANSLATOR_TEXT,
		B_TRANSLATOR_TEXT,
		TEXT_IN_QUALITY,
		TEXT_IN_CAPABILITY,
		"text/plain",
		"Plain text file"
	},
	// Be styled text (STXT)
	{
		B_STYLED_TEXT_FORMAT,
		B_TRANSLATOR_TEXT,
		STXT_IN_QUALITY,
		STXT_IN_CAPABILITY,
		"text/x-vnd.Be-stxt",
		"Be styled text file"
	}
};
53 
// The output formats that this translator supports.
// The entries mirror gInputFormats, but use the *_OUT_* quality and
// capability constants.
translation_format gOutputFormats[] = {
	// plain text
	{
		B_TRANSLATOR_TEXT,
		B_TRANSLATOR_TEXT,
		TEXT_OUT_QUALITY,
		TEXT_OUT_CAPABILITY,
		"text/plain",
		"Plain text file"
	},
	// Be styled text (STXT)
	{
		B_STYLED_TEXT_FORMAT,
		B_TRANSLATOR_TEXT,
		STXT_OUT_QUALITY,
		STXT_OUT_CAPABILITY,
		"text/x-vnd.Be-stxt",
		"Be styled text file"
	}
};
73 
// Default settings for the Translator
// Both extension flags (header only / data only) are boolean settings that
// default to false.
TranSetting gDefaultSettings[] = {
	{B_TRANSLATOR_EXT_HEADER_ONLY, TRAN_SETTING_BOOL, false},
	{B_TRANSLATOR_EXT_DATA_ONLY, TRAN_SETTING_BOOL, false}
};
79 
80 // ---------------------------------------------------------------
81 // make_nth_translator
82 //
83 // Creates a STXTTranslator object to be used by BTranslatorRoster
84 //
85 // Preconditions:
86 //
87 // Parameters: n,		The translator to return. Since
88 //						STXTTranslator only publishes one
89 //						translator, it only returns a
90 //						STXTTranslator if n == 0
91 //
92 //             you, 	The image_id of the add-on that
93 //						contains code (not used).
94 //
95 //             flags,	Has no meaning yet, should be 0.
96 //
97 // Postconditions:
98 //
99 // Returns: NULL if n is not zero,
100 //          a new STXTTranslator if n is zero
101 // ---------------------------------------------------------------
102 BTranslator *
103 make_nth_translator(int32 n, image_id you, uint32 flags, ...)
104 {
105 	if (!n)
106 		return new (std::nothrow) STXTTranslator();
107 
108 	return NULL;
109 }
110 
111 
112 // #pragma mark - ascmagic.c from the BSD file tool
113 /*
114  * The following code has been taken from version 4.17 of the BSD file tool,
115  * file ascmagic.c, modified for our purpose.
116  */
117 
118 /*
119  * Copyright (c) Ian F. Darwin 1986-1995.
120  * Software written by Ian F. Darwin and others;
121  * maintained 1995-present by Christos Zoulas and others.
122  *
123  * Redistribution and use in source and binary forms, with or without
124  * modification, are permitted provided that the following conditions
125  * are met:
126  * 1. Redistributions of source code must retain the above copyright
127  *    notice immediately at the beginning of the file, without modification,
128  *    this list of conditions, and the following disclaimer.
129  * 2. Redistributions in binary form must reproduce the above copyright
130  *    notice, this list of conditions and the following disclaimer in the
131  *    documentation and/or other materials provided with the distribution.
132  *
133  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
134  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
135  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
136  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
137  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
138  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
139  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
140  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
141  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
142  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
143  * SUCH DAMAGE.
144  */
145 /*
146  * ASCII magic -- file types that we know based on keywords
147  * that can appear anywhere in the file.
 *
162  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
163  * to handle character codes other than ASCII on a unified basis.
164  *
165  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
166  * international characters, now subsumed into this file.
167  */
168 
169 #include <stdio.h>
170 #include <string.h>
171 #include <memory.h>
172 #include <ctype.h>
173 #include <stdlib.h>
174 #include <unistd.h>
175 #include "names.h"
176 
/* One decoded character; the looks_*() tests fill buffers of these. */
typedef unsigned long my_unichar;

#define MAXLINELEN 300	/* longest sane line length */
/* True for the characters treated as whitespace, including NEL (0x85). */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
		  || (x) == 0x85 || (x) == '\f')

/*
 * Character code tests: each takes (input bytes, byte count, output buffer,
 * output length) and returns non-zero if the input matches the code.
 */
static int looks_ascii(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_utf8(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_unicode(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_latin1(const unsigned char *, size_t, my_unichar *, size_t *);
static int looks_extended(const unsigned char *, size_t, my_unichar *, size_t *);
/* Translates an EBCDIC byte buffer into (extended) ASCII. */
static void from_ebcdic(const unsigned char *, size_t, unsigned char *);
/* Compares a NUL-terminated keyword against a my_unichar word. */
static int ascmatch(const unsigned char *, const my_unichar *, size_t);
190 
191 
192 static int
193 file_ascmagic(const unsigned char *buf, size_t nbytes, BMimeType* mimeType,
194 	const char*& encoding)
195 {
196 	size_t i;
197 	unsigned char *nbuf = NULL;
198 	my_unichar *ubuf = NULL;
199 	size_t ulen;
200 	struct names *p;
201 	int rv = -1;
202 
203 	const char *code = NULL;
204 	encoding = NULL;
205 	const char *type = NULL;
206 	const char *subtype = NULL;
207 	const char *subtypeMimeGeneric = NULL;
208 	const char *subtypeMimeSpecific = NULL;
209 
210 	int has_escapes = 0;
211 	int has_backspace = 0;
212 	int seen_cr = 0;
213 
214 	int n_crlf = 0;
215 	int n_lf = 0;
216 	int n_cr = 0;
217 	int n_nel = 0;
218 
219 	int last_line_end = -1;
220 	int has_long_lines = 0;
221 
222 	if ((nbuf = (unsigned char*)malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL)
223 		goto done;
224 	if ((ubuf = (my_unichar*)malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL)
225 		goto done;
226 
227 	/*
228 	 * Then try to determine whether it's any character code we can
229 	 * identify.  Each of these tests, if it succeeds, will leave
230 	 * the text converted into one-my_unichar-per-character Unicode in
231 	 * ubuf, and the number of characters converted in ulen.
232 	 */
233 	if (nbytes == 0) {
234 		code = "UTF-8 Unicode";
235 		encoding = NULL; // "UTF-8";
236 		type = "text";
237 		rv = 1;
238 	} else if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
239 		code = "ASCII";
240 		encoding = NULL; //"us-ascii";
241 		type = "text";
242 		if (nbytes == 1) {
243 			// no further tests
244 			rv = 1;
245 		}
246 	} else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
247 		code = "UTF-8 Unicode";
248 		encoding = NULL; // "UTF-8";
249 		type = "text";
250 	} else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
251 		if (i == 1) {
252 			code = "Little-endian UTF-16 Unicode";
253 			encoding = "UTF-16";
254 		} else {
255 			code = "Big-endian UTF-16 Unicode";
256 			encoding = "UTF-16";
257 		}
258 
259 		type = "character data";
260 	} else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
261 		code = "ISO-8859";
262 		type = "text";
263 		encoding = "iso-8859-1";
264 	} else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
265 		code = "Non-ISO extended-ASCII";
266 		type = "text";
267 		encoding = "unknown";
268 	} else {
269 		from_ebcdic(buf, nbytes, nbuf);
270 
271 		if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
272 			code = "EBCDIC";
273 			type = "character data";
274 			encoding = "ebcdic";
275 		} else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
276 			code = "International EBCDIC";
277 			type = "character data";
278 			encoding = "ebcdic";
279 		} else {
280 			rv = 0;
281 			goto done;  /* doesn't look like text at all */
282 		}
283 	}
284 
285 	if (nbytes <= 1) {
286 		if (rv == -1)
287 			rv = 0;
288 		goto done;
289 	}
290 
291 	/*
292 	 * for troff, look for . + letter + letter or .\";
293 	 * this must be done to disambiguate tar archives' ./file
294 	 * and other trash from real troff input.
295 	 *
296 	 * I believe Plan 9 troff allows non-ASCII characters in the names
297 	 * of macros, so this test might possibly fail on such a file.
298 	 */
299 	if (*ubuf == '.') {
300 		my_unichar *tp = ubuf + 1;
301 
302 		while (ISSPC(*tp))
303 			++tp;	/* skip leading whitespace */
304 		if ((tp[0] == '\\' && tp[1] == '\"') ||
305 		    (isascii((unsigned char)tp[0]) &&
306 		     isalnum((unsigned char)tp[0]) &&
307 		     isascii((unsigned char)tp[1]) &&
308 		     isalnum((unsigned char)tp[1]) &&
309 		     ISSPC(tp[2]))) {
310 		    subtypeMimeGeneric = "text/x-source-code";
311 			subtypeMimeSpecific = "text/troff";
312 			subtype = "troff or preprocessor input";
313 			goto subtype_identified;
314 		}
315 	}
316 
317 	if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
318 		subtypeMimeGeneric = "text/x-source-code";
319 		subtypeMimeSpecific = "text/fortran";
320 		subtype = "fortran program";
321 		goto subtype_identified;
322 	}
323 
324 	/* look for tokens from names.h - this is expensive! */
325 
326 	i = 0;
327 	while (i < ulen) {
328 		size_t end;
329 
330 		/*
331 		 * skip past any leading space
332 		 */
333 		while (i < ulen && ISSPC(ubuf[i]))
334 			i++;
335 		if (i >= ulen)
336 			break;
337 
338 		/*
339 		 * find the next whitespace
340 		 */
341 		for (end = i + 1; end < nbytes; end++)
342 			if (ISSPC(ubuf[end]))
343 				break;
344 
345 		/*
346 		 * compare the word thus isolated against the token list
347 		 */
348 		for (p = names; p < names + NNAMES; p++) {
349 			if (ascmatch((const unsigned char *)p->name, ubuf + i,
350 			    end - i)) {
351 				subtype = types[p->type].human;
352 				subtypeMimeGeneric = types[p->type].generic_mime;
353 				subtypeMimeSpecific = types[p->type].specific_mime;
354 				goto subtype_identified;
355 			}
356 		}
357 
358 		i = end;
359 	}
360 
361 subtype_identified:
362 
363 	/*
364 	 * Now try to discover other details about the file.
365 	 */
366 	for (i = 0; i < ulen; i++) {
367 		if (ubuf[i] == '\n') {
368 			if (seen_cr)
369 				n_crlf++;
370 			else
371 				n_lf++;
372 			last_line_end = i;
373 		} else if (seen_cr)
374 			n_cr++;
375 
376 		seen_cr = (ubuf[i] == '\r');
377 		if (seen_cr)
378 			last_line_end = i;
379 
380 		if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
381 			n_nel++;
382 			last_line_end = i;
383 		}
384 
385 		/* If this line is _longer_ than MAXLINELEN, remember it. */
386 		if ((int)i > last_line_end + MAXLINELEN)
387 			has_long_lines = 1;
388 
389 		if (ubuf[i] == '\033')
390 			has_escapes = 1;
391 		if (ubuf[i] == '\b')
392 			has_backspace = 1;
393 	}
394 
395 	rv = 1;
396 done:
397 	if (nbuf)
398 		free(nbuf);
399 	if (ubuf)
400 		free(ubuf);
401 
402 	if (rv) {
403 		// If we have identified the subtype, return it, otherwise just
404 		// text/plain.
405 
406 		bool found = false;
407 		if (subtypeMimeSpecific != NULL) {
408 			mimeType->SetTo(subtypeMimeSpecific);
409 			if (mimeType->IsInstalled())
410 				found = true;
411 		}
412 		if (!found && subtypeMimeGeneric != NULL) {
413 			mimeType->SetTo(subtypeMimeGeneric);
414 			if (mimeType->IsInstalled())
415 				found = true;
416 		}
417 		if (!found)
418 			mimeType->SetTo("text/plain");
419 	}
420 
421 	return rv;
422 }
423 
424 static int
425 ascmatch(const unsigned char *s, const my_unichar *us, size_t ulen)
426 {
427 	size_t i;
428 
429 	for (i = 0; i < ulen; i++) {
430 		if (s[i] != us[i])
431 			return 0;
432 	}
433 
434 	if (s[i])
435 		return 0;
436 	else
437 		return 1;
438 }
439 
440 /*
441  * This table reflects a particular philosophy about what constitutes
442  * "text," and there is room for disagreement about it.
443  *
444  * Version 3.31 of the file command considered a file to be ASCII if
445  * each of its characters was approved by either the isascii() or
446  * isalpha() function.  On most systems, this would mean that any
447  * file consisting only of characters in the range 0x00 ... 0x7F
448  * would be called ASCII text, but many systems might reasonably
449  * consider some characters outside this range to be alphabetic,
450  * so the file command would call such characters ASCII.  It might
451  * have been more accurate to call this "considered textual on the
452  * local system" than "ASCII."
453  *
454  * It considered a file to be "International language text" if each
455  * of its characters was either an ASCII printing character (according
456  * to the real ASCII standard, not the above test), a character in
457  * the range 0x80 ... 0xFF, or one of the following control characters:
458  * backspace, tab, line feed, vertical tab, form feed, carriage return,
459  * escape.  No attempt was made to determine the language in which files
460  * of this type were written.
461  *
462  *
463  * The table below considers a file to be ASCII if all of its characters
464  * are either ASCII printing characters (again, according to the X3.4
465  * standard, not isascii()) or any of the following controls: bell,
466  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
467  *
468  * I include bell because some programs (particularly shell scripts)
469  * use it literally, even though it is rare in normal text.  I exclude
470  * vertical tab because it never seems to be used in real text.  I also
471  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
472  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
473  * character to.  It might be more appropriate to include it in the 8859
474  * set instead of the ASCII set, but it's got to be included in *something*
475  * we recognize or EBCDIC files aren't going to be considered textual.
476  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
477  * and Latin characters, so these should possibly be allowed.  But they
478  * make a real mess on VT100-style displays if they're not paired properly,
479  * so we are probably better off not calling them text.
480  *
481  * A file is considered to be ISO-8859 text if its characters are all
482  * either ASCII, according to the above definition, or printing characters
483  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
484  *
485  * Finally, a file is considered to be international text from some other
486  * character code if its characters are all either ISO-8859 (according to
487  * the above definition) or characters in the range 0x80 ... 0x9F, which
488  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
489  * consider to be printing characters.
490  */
491 
/*
 * Character classes stored in text_chars[].  Each class is tested for by
 * the looks_*() functions below; the macros are #undef'd again after them.
 */
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/* Class of every byte value, indexed by the byte; 16 entries per row. */
static char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
        /*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
518 
519 static int
520 looks_ascii(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
521     size_t *ulen)
522 {
523 	int i;
524 
525 	*ulen = 0;
526 
527 	for (i = 0; i < (int)nbytes; i++) {
528 		int t = text_chars[buf[i]];
529 
530 		if (t != T)
531 			return 0;
532 
533 		ubuf[(*ulen)++] = buf[i];
534 	}
535 
536 	return 1;
537 }
538 
539 static int
540 looks_latin1(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
541 {
542 	int i;
543 
544 	*ulen = 0;
545 
546 	for (i = 0; i < (int)nbytes; i++) {
547 		int t = text_chars[buf[i]];
548 
549 		if (t != T && t != I)
550 			return 0;
551 
552 		ubuf[(*ulen)++] = buf[i];
553 	}
554 
555 	return 1;
556 }
557 
558 static int
559 looks_extended(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
560     size_t *ulen)
561 {
562 	int i;
563 
564 	*ulen = 0;
565 
566 	for (i = 0; i < (int)nbytes; i++) {
567 		int t = text_chars[buf[i]];
568 
569 		if (t != T && t != I && t != X)
570 			return 0;
571 
572 		ubuf[(*ulen)++] = buf[i];
573 	}
574 
575 	return 1;
576 }
577 
578 static int
579 looks_utf8(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
580 {
581 	int i, n;
582 	my_unichar c;
583 	int gotone = 0;
584 
585 	*ulen = 0;
586 
587 	for (i = 0; i < (int)nbytes; i++) {
588 		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
589 			/*
590 			 * Even if the whole file is valid UTF-8 sequences,
591 			 * still reject it if it uses weird control characters.
592 			 */
593 
594 			if (text_chars[buf[i]] != T)
595 				return 0;
596 
597 			ubuf[(*ulen)++] = buf[i];
598 		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
599 			return 0;
600 		} else {			   /* 11xxxxxx begins UTF-8 */
601 			int following;
602 
603 			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
604 				c = buf[i] & 0x1f;
605 				following = 1;
606 			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
607 				c = buf[i] & 0x0f;
608 				following = 2;
609 			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
610 				c = buf[i] & 0x07;
611 				following = 3;
612 			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
613 				c = buf[i] & 0x03;
614 				following = 4;
615 			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
616 				c = buf[i] & 0x01;
617 				following = 5;
618 			} else
619 				return 0;
620 
621 			for (n = 0; n < following; n++) {
622 				i++;
623 				if (i >= (int)nbytes)
624 					goto done;
625 
626 				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
627 					return 0;
628 
629 				c = (c << 6) + (buf[i] & 0x3f);
630 			}
631 
632 			ubuf[(*ulen)++] = c;
633 			gotone = 1;
634 		}
635 	}
636 done:
637 	return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
638 }
639 
640 static int
641 looks_unicode(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
642     size_t *ulen)
643 {
644 	int bigend;
645 	int i;
646 
647 	if (nbytes < 2)
648 		return 0;
649 
650 	if (buf[0] == 0xff && buf[1] == 0xfe)
651 		bigend = 0;
652 	else if (buf[0] == 0xfe && buf[1] == 0xff)
653 		bigend = 1;
654 	else
655 		return 0;
656 
657 	*ulen = 0;
658 
659 	for (i = 2; i + 1 < (int)nbytes; i += 2) {
660 		/* XXX fix to properly handle chars > 65536 */
661 
662 		if (bigend)
663 			ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
664 		else
665 			ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
666 
667 		if (ubuf[*ulen - 1] == 0xfffe)
668 			return 0;
669 		if (ubuf[*ulen - 1] < 128 &&
670 		    text_chars[(size_t)ubuf[*ulen - 1]] != T)
671 			return 0;
672 	}
673 
674 	return 1 + bigend;
675 }
676 
677 #undef F
678 #undef T
679 #undef I
680 #undef X
681 
682 /*
683  * This table maps each EBCDIC character to an (8-bit extended) ASCII
684  * character, as specified in the rationale for the dd(1) command in
685  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
686  *
687  * Unfortunately it does not seem to correspond exactly to any of the
688  * five variants of EBCDIC documented in IBM's _Enterprise Systems
689  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
690  * Edition, July, 1999, pp. I-1 - I-4.
691  *
692  * Fortunately, though, all versions of EBCDIC, including this one, agree
693  * on most of the printing characters that also appear in (7-bit) ASCII.
694  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
695  *
696  * Fortunately too, there is general agreement that codes 0x00 through
697  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
698  * remainder printing characters.
699  *
700  * This is sufficient to allow us to identify EBCDIC text and to distinguish
701  * between old-style and internationalized examples of text.
702  */
703 
/* Indexed by the EBCDIC byte value (see from_ebcdic()); 16 entries per row. */
static unsigned char ebcdic_to_ascii[] = {
  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};
722 
/* Alternative table, only compiled if "notdef" is defined. */
#ifdef notdef
/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality.  It comes from
 *
 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 */

/* Indexed by the EBCDIC byte value; 16 entries per row. */
static unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif
757 
758 /*
759  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
760  */
761 static void
762 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
763 {
764 	int i;
765 
766 	for (i = 0; i < (int)nbytes; i++) {
767 		out[i] = ebcdic_to_ascii[buf[i]];
768 	}
769 }
770 
771 
772 //	#pragma mark -
773 
774 
775 /*!
776 	Determines if the data in inSource is of the STXT format.
777 
778 	\param header the STXT stream header read in by Identify() or Translate()
779 	\param inSource the stream with the STXT data
780 	\param outInfo information about the type of data from inSource is stored here
781 	\param outType the desired output type for the data in inSource
782 	\param ptxtheader if this is not NULL, the TEXT header from
783 		inSource is copied to it
784 */
785 status_t
786 identify_stxt_header(const TranslatorStyledTextStreamHeader &header,
787 	BPositionIO *inSource, translator_info *outInfo, uint32 outType,
788 	TranslatorStyledTextTextHeader *ptxtheader = NULL)
789 {
790 	const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader);
791 	const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader);
792 
793 	uint8 buffer[max(ktxtsize, kstylsize)];
794 
795 	// Check the TEXT header
796 	TranslatorStyledTextTextHeader txtheader;
797 	if (inSource->Read(buffer, ktxtsize) != ktxtsize)
798 		return B_NO_TRANSLATOR;
799 
800 	memcpy(&txtheader, buffer, ktxtsize);
801 	if (swap_data(B_UINT32_TYPE, &txtheader, ktxtsize,
802 		B_SWAP_BENDIAN_TO_HOST) != B_OK)
803 		return B_ERROR;
804 
805 	if (txtheader.header.magic != 'TEXT'
806 		|| txtheader.header.header_size != sizeof(TranslatorStyledTextTextHeader)
807 		|| txtheader.charset != B_UNICODE_UTF8)
808 		return B_NO_TRANSLATOR;
809 
810 	// skip the text data
811 	off_t seekresult, pos;
812 	pos = header.header.header_size + txtheader.header.header_size
813 		+ txtheader.header.data_size;
814 	seekresult = inSource->Seek(txtheader.header.data_size,
815 		SEEK_CUR);
816 	if (seekresult < pos)
817 		return B_NO_TRANSLATOR;
818 	if (seekresult > pos)
819 		return B_ERROR;
820 
821 	// check the STYL header (not all STXT files have this)
822 	ssize_t read = 0;
823 	TranslatorStyledTextStyleHeader stylheader;
824 	read = inSource->Read(buffer, kstylsize);
825 	if (read < 0)
826 		return read;
827 	if (read != kstylsize && read != 0)
828 		return B_NO_TRANSLATOR;
829 
830 	// If there is a STYL header
831 	if (read == kstylsize) {
832 		memcpy(&stylheader, buffer, kstylsize);
833 		if (swap_data(B_UINT32_TYPE, &stylheader, kstylsize,
834 			B_SWAP_BENDIAN_TO_HOST) != B_OK)
835 			return B_ERROR;
836 
837 		if (stylheader.header.magic != 'STYL'
838 			|| stylheader.header.header_size !=
839 				sizeof(TranslatorStyledTextStyleHeader))
840 			return B_NO_TRANSLATOR;
841 	}
842 
843 	// if output TEXT header is supplied, fill it with data
844 	if (ptxtheader) {
845 		ptxtheader->header.magic = txtheader.header.magic;
846 		ptxtheader->header.header_size = txtheader.header.header_size;
847 		ptxtheader->header.data_size = txtheader.header.data_size;
848 		ptxtheader->charset = txtheader.charset;
849 	}
850 
851 	// return information about the data in the stream
852 	outInfo->type = B_STYLED_TEXT_FORMAT;
853 	outInfo->group = B_TRANSLATOR_TEXT;
854 	outInfo->quality = STXT_IN_QUALITY;
855 	outInfo->capability = STXT_IN_CAPABILITY;
856 	strcpy(outInfo->name, "Be styled text file");
857 	strcpy(outInfo->MIME, "text/x-vnd.Be-stxt");
858 
859 	return B_OK;
860 }
861 
862 
863 /*!
864 	Determines if the data in \a inSource is of the UTF8 plain
865 
866 	\param data buffer containing data already read (must be at
867 		least DATA_BUFFER_SIZE bytes large)
868 	\param nread number of bytes that have already been read from the stream
869 	\param header the STXT stream header read in by Identify() or Translate()
870 	\param inSource the stream with the STXT data
871 	\param outInfo information about the type of data from inSource is stored here
872 	\param outType the desired output type for the data in inSource
873 */
874 status_t
875 identify_text(uint8* data, int32 bytesRead, BPositionIO* source,
876 	translator_info* outInfo, uint32 outType, const char*& encoding)
877 {
878 	ssize_t readLater = source->Read(data + bytesRead, DATA_BUFFER_SIZE - bytesRead);
879 	if (readLater < B_OK)
880 		return B_NO_TRANSLATOR;
881 
882 	bytesRead += readLater;
883 
884 	// TODO: identify encoding as possible!
885 	BMimeType type;
886 	if (!file_ascmagic((const unsigned char*)data, bytesRead, &type, encoding))
887 		return B_NO_TRANSLATOR;
888 
889 	float capability = TEXT_IN_CAPABILITY;
890 	if (bytesRead < 20)
891 		capability = .1f;
892 
893 	// return information about the data in the stream
894 	outInfo->type = B_TRANSLATOR_TEXT;
895 	outInfo->group = B_TRANSLATOR_TEXT;
896 	outInfo->quality = TEXT_IN_QUALITY;
897 	outInfo->capability = capability;
898 
899 	char description[B_MIME_TYPE_LENGTH];
900 	if (type.GetLongDescription(description) == B_OK)
901 		strlcpy(outInfo->name, description, sizeof(outInfo->name));
902 	else
903 		strlcpy(outInfo->name, "Plain text file", sizeof(outInfo->name));
904 
905 	//strlcpy(outInfo->MIME, type.Type(), sizeof(outInfo->MIME));
906 	strcpy(outInfo->MIME, "text/plain");
907 	return B_OK;
908 }
909 
910 
911 // ---------------------------------------------------------------
912 // translate_from_stxt
913 //
914 // Translates the data in inSource to the type outType and stores
915 // the translated data in outDestination.
916 //
917 // Preconditions:
918 //
919 // Parameters:	inSource,	the data to be translated
920 //
921 //				outDestination,	where the translated data is
922 //								put
923 //
924 //				outType,	the type to convert inSource to
925 //
926 //				txtheader, 	the TEXT header from inSource
927 //
928 //
929 // Postconditions:
930 //
931 // Returns: B_BAD_VALUE, if outType is invalid
932 //
933 // B_NO_TRANSLATOR, if this translator doesn't understand the data
934 //
935 // B_ERROR, if there was an error allocating memory or converting
936 //          data
937 //
938 // B_OK, if all went well
939 // ---------------------------------------------------------------
940 status_t
941 translate_from_stxt(BPositionIO *inSource, BPositionIO *outDestination,
942 		uint32 outType, const TranslatorStyledTextTextHeader &txtheader)
943 {
944 	if (inSource->Seek(0, SEEK_SET) != 0)
945 		return B_ERROR;
946 
947 	const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader);
948 	const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader);
949 
950 	bool btoplain;
951 	if (outType == B_TRANSLATOR_TEXT)
952 		btoplain = true;
953 	else if (outType == B_STYLED_TEXT_FORMAT)
954 		btoplain = false;
955 	else
956 		return B_BAD_VALUE;
957 
958 	uint8 buffer[READ_BUFFER_SIZE];
959 	ssize_t nread = 0, nwritten = 0, nreed = 0, ntotalread = 0;
960 
961 	// skip to the actual text data when outputting a
962 	// plain text file
963 	if (btoplain) {
964 		if (inSource->Seek(kstxtsize + ktxtsize, SEEK_CUR) !=
965 			kstxtsize + ktxtsize)
966 			return B_ERROR;
967 	}
968 
969 	// Read data from inSource
970 	// When outputing B_TRANSLATOR_TEXT, the loop stops when all of
971 	// the text data has been read and written.
972 	// When outputting B_STYLED_TEXT_FORMAT, the loop stops when all
973 	// of the data from inSource has been read and written.
974 	if (btoplain)
975 		nreed = min((size_t)READ_BUFFER_SIZE,
976 			txtheader.header.data_size - ntotalread);
977 	else
978 		nreed = READ_BUFFER_SIZE;
979 	nread = inSource->Read(buffer, nreed);
980 	while (nread > 0) {
981 		nwritten = outDestination->Write(buffer, nread);
982 		if (nwritten != nread)
983 			return B_ERROR;
984 
985 		if (btoplain) {
986 			ntotalread += nread;
987 			nreed = min((size_t)READ_BUFFER_SIZE,
988 				txtheader.header.data_size - ntotalread);
989 		} else
990 			nreed = READ_BUFFER_SIZE;
991 		nread = inSource->Read(buffer, nreed);
992 	}
993 
994 	if (btoplain && static_cast<ssize_t>(txtheader.header.data_size) !=
995 		ntotalread)
996 		// If not all of the text data was able to be read...
997 		return B_NO_TRANSLATOR;
998 	else
999 		return B_OK;
1000 }
1001 
1002 // ---------------------------------------------------------------
1003 // output_headers
1004 //
1005 // Outputs the Stream and Text headers from the B_STYLED_TEXT_FORMAT
1006 // to outDestination, setting the data_size member of the text header
1007 // to text_data_size
1008 //
1009 // Preconditions:
1010 //
1011 // Parameters:	outDestination,	where the translated data is
1012 //								put
1013 //
1014 //				text_data_size, number of bytes in data section
1015 //							    of the TEXT header
1016 //
1017 //
1018 // Postconditions:
1019 //
1020 // Returns:
1021 //
1022 // B_ERROR, if there was an error writing to outDestination or
1023 // 	an error with converting the byte order
1024 //
1025 // B_OK, if all went well
1026 // ---------------------------------------------------------------
1027 status_t
1028 output_headers(BPositionIO *outDestination, uint32 text_data_size)
1029 {
1030 	const int32 kHeadersSize = sizeof(TranslatorStyledTextStreamHeader) +
1031 		sizeof(TranslatorStyledTextTextHeader);
1032 	status_t result;
1033 	TranslatorStyledTextStreamHeader stxtheader;
1034 	TranslatorStyledTextTextHeader txtheader;
1035 
1036 	uint8 buffer[kHeadersSize];
1037 
1038 	stxtheader.header.magic = 'STXT';
1039 	stxtheader.header.header_size = sizeof(TranslatorStyledTextStreamHeader);
1040 	stxtheader.header.data_size = 0;
1041 	stxtheader.version = 100;
1042 	memcpy(buffer, &stxtheader, stxtheader.header.header_size);
1043 
1044 	txtheader.header.magic = 'TEXT';
1045 	txtheader.header.header_size = sizeof(TranslatorStyledTextTextHeader);
1046 	txtheader.header.data_size = text_data_size;
1047 	txtheader.charset = B_UNICODE_UTF8;
1048 	memcpy(buffer + stxtheader.header.header_size, &txtheader,
1049 		txtheader.header.header_size);
1050 
1051 	// write out headers in Big Endian byte order
1052 	result = swap_data(B_UINT32_TYPE, buffer, kHeadersSize,
1053 		B_SWAP_HOST_TO_BENDIAN);
1054 	if (result == B_OK) {
1055 		ssize_t nwritten = 0;
1056 		nwritten = outDestination->Write(buffer, kHeadersSize);
1057 		if (nwritten != kHeadersSize)
1058 			return B_ERROR;
1059 		else
1060 			return B_OK;
1061 	}
1062 
1063 	return result;
1064 }
1065 
1066 // ---------------------------------------------------------------
1067 // output_styles
1068 //
1069 // Writes out the actual style information into outDestination
1070 // using the data from pflatRunArray
1071 //
1072 // Preconditions:
1073 //
1074 // Parameters:	outDestination,	where the translated data is
1075 //								put
1076 //
1077 //				text_size,		size in bytes of the text in
1078 //								outDestination
1079 //
1080 //				data_size,		size of pflatRunArray
1081 //
1082 // Postconditions:
1083 //
1084 // Returns:
1085 //
1086 // B_ERROR, if there was an error writing to outDestination or
1087 // 	an error with converting the byte order
1088 //
1089 // B_OK, if all went well
1090 // ---------------------------------------------------------------
1091 status_t
1092 output_styles(BPositionIO *outDestination, uint32 text_size,
1093 	uint8 *pflatRunArray, ssize_t data_size)
1094 {
1095 	const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader);
1096 
1097 	uint8 buffer[kstylsize];
1098 
1099 	// output STYL header
1100 	TranslatorStyledTextStyleHeader stylheader;
1101 	stylheader.header.magic = 'STYL';
1102 	stylheader.header.header_size =
1103 		sizeof(TranslatorStyledTextStyleHeader);
1104 	stylheader.header.data_size = data_size;
1105 	stylheader.apply_offset = 0;
1106 	stylheader.apply_length = text_size;
1107 
1108 	memcpy(buffer, &stylheader, kstylsize);
1109 	if (swap_data(B_UINT32_TYPE, buffer, kstylsize,
1110 		B_SWAP_HOST_TO_BENDIAN) != B_OK)
1111 		return B_ERROR;
1112 	if (outDestination->Write(buffer, kstylsize) != kstylsize)
1113 		return B_ERROR;
1114 
1115 	// output actual style information
1116 	if (outDestination->Write(pflatRunArray,
1117 		data_size) != data_size)
1118 		return B_ERROR;
1119 
1120 	return B_OK;
1121 }
1122 
1123 
1124 /*!
1125 	Convert the plain text (UTF8) from inSource to plain or
1126 	styled text in outDestination
1127 */
1128 status_t
1129 translate_from_text(BPositionIO* source, const char* encoding, bool forceEncoding,
1130 	BPositionIO* destination, uint32 outType)
1131 {
1132 	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1133 		return B_BAD_VALUE;
1134 
1135 	// find the length of the text
1136 	off_t size = source->Seek(0, SEEK_END);
1137 	if (size < 0)
1138 		return (status_t)size;
1139 	if (size > UINT32_MAX && outType == B_STYLED_TEXT_FORMAT)
1140 		return B_NOT_SUPPORTED;
1141 
1142 	status_t status = source->Seek(0, SEEK_SET);
1143 	if (status < B_OK)
1144 		return status;
1145 
1146 	if (outType == B_STYLED_TEXT_FORMAT) {
1147 		// output styled text headers
1148 		status = output_headers(destination, (uint32)size);
1149 		if (status != B_OK)
1150 			return status;
1151 	}
1152 
1153 	class MallocBuffer {
1154 		public:
1155 			MallocBuffer() : fBuffer(NULL), fSize(0) {}
1156 			~MallocBuffer() { free(fBuffer); }
1157 
1158 			void* Buffer() { return fBuffer; }
1159 			size_t Size() const { return fSize; }
1160 
1161 			status_t
1162 			Allocate(size_t size)
1163 			{
1164 				fBuffer = malloc(size);
1165 				if (fBuffer != NULL) {
1166 					fSize = size;
1167 					return B_OK;
1168 				}
1169 				return B_NO_MEMORY;
1170 			}
1171 
1172 		private:
1173 			void*	fBuffer;
1174 			size_t	fSize;
1175 	} encodingBuffer;
1176 	BMallocIO encodingIO;
1177 	uint32 encodingID = 0;
1178 		// defaults to UTF-8 or no encoding
1179 
1180 	BNode* node = dynamic_cast<BNode*>(source);
1181 	if (node != NULL) {
1182 		// determine encoding, if available
1183 		const BCharacterSet* characterSet = NULL;
1184 		bool hasAttribute = false;
1185 		if (encoding != NULL && !forceEncoding) {
1186 			BString name;
1187 			if (node->ReadAttrString("be:encoding", &name) == B_OK) {
1188 				encoding = name.String();
1189 				hasAttribute = true;
1190 			} else {
1191 				int32 value;
1192 				ssize_t bytesRead = node->ReadAttr("be:encoding", B_INT32_TYPE, 0,
1193 					&value, sizeof(value));
1194 				if (bytesRead == (ssize_t)sizeof(value)) {
1195 					hasAttribute = true;
1196 					if (value != 65535)
1197 						characterSet = BCharacterSetRoster::GetCharacterSetByConversionID(value);
1198 				}
1199 			}
1200 		} else {
1201 			hasAttribute = true;
1202 				// we don't write the encoding in this case
1203 		}
1204 		if (characterSet == NULL && encoding != NULL)
1205 			characterSet = BCharacterSetRoster::FindCharacterSetByName(encoding);
1206 
1207 		if (characterSet != NULL) {
1208 			encodingID = characterSet->GetConversionID();
1209 			encodingBuffer.Allocate(READ_BUFFER_SIZE * 4);
1210 		}
1211 
1212 		if (!hasAttribute && encoding != NULL) {
1213 			// add encoding attribute, so that someone opening the file can
1214 			// retrieve it for persistance
1215 			node->WriteAttr("be:encoding", B_STRING_TYPE, 0, encoding,
1216 				strlen(encoding));
1217 		}
1218 	}
1219 
1220 	off_t outputSize = 0;
1221 	ssize_t bytesRead;
1222 	int32 state = 0;
1223 
1224 	// output the actual text part of the data
1225 	do {
1226 		uint8 buffer[READ_BUFFER_SIZE];
1227 		bytesRead = source->Read(buffer, READ_BUFFER_SIZE);
1228 		if (bytesRead < B_OK)
1229 			return bytesRead;
1230 		if (bytesRead == 0)
1231 			break;
1232 
1233 		if (encodingBuffer.Size() == 0) {
1234 			// default, no encoding
1235 			ssize_t bytesWritten = destination->Write(buffer, bytesRead);
1236 			if (bytesWritten != bytesRead) {
1237 				if (bytesWritten < B_OK)
1238 					return bytesWritten;
1239 
1240 				return B_ERROR;
1241 			}
1242 
1243 			outputSize += bytesRead;
1244 		} else {
1245 			// decode text file to UTF-8
1246 			char* pos = (char*)buffer;
1247 			int32 encodingLength = encodingIO.BufferLength();
1248 			int32 bytesLeft = bytesRead;
1249 			int32 bytes;
1250 			do {
1251 				encodingLength = READ_BUFFER_SIZE * 4;
1252 				bytes = bytesLeft;
1253 
1254 				status = convert_to_utf8(encodingID, pos, &bytes,
1255 					(char*)encodingBuffer.Buffer(), &encodingLength, &state);
1256 				if (status < B_OK)
1257 					return status;
1258 
1259 				ssize_t bytesWritten = destination->Write(encodingBuffer.Buffer(),
1260 					encodingLength);
1261 				if (bytesWritten < encodingLength) {
1262 					if (bytesWritten < B_OK)
1263 						return bytesWritten;
1264 
1265 					return B_ERROR;
1266 				}
1267 
1268 				pos += bytes;
1269 				bytesLeft -= bytes;
1270 				outputSize += encodingLength;
1271 			} while (encodingLength > 0 && bytesLeft > 0);
1272 		}
1273 	} while (bytesRead > 0);
1274 
1275 	if (outType != B_STYLED_TEXT_FORMAT)
1276 		return B_OK;
1277 
1278 	if (encodingBuffer.Size() != 0 && size != outputSize) {
1279 		if (outputSize > UINT32_MAX)
1280 			return B_NOT_SUPPORTED;
1281 
1282 		// we need to update the header as the decoded text size has changed
1283 		status = destination->Seek(0, SEEK_SET);
1284 		if (status == B_OK)
1285 			status = output_headers(destination, (uint32)outputSize);
1286 		if (status == B_OK)
1287 			status = destination->Seek(0, SEEK_END);
1288 
1289 		if (status < B_OK)
1290 			return status;
1291 	}
1292 
1293 	// Read file attributes if outputting styled data
1294 	// and source is a BNode object
1295 
1296 	if (node == NULL)
1297 		return B_OK;
1298 
1299 	// Try to read styles - we only propagate an error if the actual on-disk
1300 	// data is likely to be okay
1301 
1302 	const char *kAttrName = "styles";
1303 	attr_info info;
1304 	if (node->GetAttrInfo(kAttrName, &info) != B_OK)
1305 		return B_OK;
1306 
1307 	if (info.type != B_RAW_TYPE || info.size < 160) {
1308 		// styles seem to be broken, but since we got the text,
1309 		// we don't propagate the error
1310 		return B_OK;
1311 	}
1312 
1313 	uint8* flatRunArray = new (std::nothrow) uint8[info.size];
1314 	if (flatRunArray == NULL)
1315 		return B_NO_MEMORY;
1316 
1317 	bytesRead = node->ReadAttr(kAttrName, B_RAW_TYPE, 0, flatRunArray, info.size);
1318 	if (bytesRead != info.size)
1319 		return B_OK;
1320 
1321 	output_styles(destination, size, flatRunArray, info.size);
1322 
1323 	delete[] flatRunArray;
1324 	return B_OK;
1325 }
1326 
1327 
1328 //	#pragma mark -
1329 
1330 
1331 STXTTranslator::STXTTranslator()
1332 	: BaseTranslator("StyledEdit Files", "StyledEdit files translator",
1333 		STXT_TRANSLATOR_VERSION,
1334 		gInputFormats, sizeof(gInputFormats) / sizeof(translation_format),
1335 		gOutputFormats, sizeof(gOutputFormats) / sizeof(translation_format),
1336 		"STXTTranslator_Settings",
1337 		gDefaultSettings, sizeof(gDefaultSettings) / sizeof(TranSetting),
1338 		B_TRANSLATOR_TEXT, B_STYLED_TEXT_FORMAT)
1339 {
1340 }
1341 
1342 
1343 STXTTranslator::~STXTTranslator()
1344 {
1345 }
1346 
1347 
1348 status_t
1349 STXTTranslator::Identify(BPositionIO *inSource,
1350 	const translation_format *inFormat, BMessage *ioExtension,
1351 	translator_info *outInfo, uint32 outType)
1352 {
1353 	if (!outType)
1354 		outType = B_TRANSLATOR_TEXT;
1355 	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1356 		return B_NO_TRANSLATOR;
1357 
1358 	const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader);
1359 
1360 	uint8 buffer[DATA_BUFFER_SIZE];
1361 	status_t nread = 0;
1362 	// Read in the header to determine
1363 	// if the data is supported
1364 	nread = inSource->Read(buffer, kstxtsize);
1365 	if (nread < 0)
1366 		return nread;
1367 
1368 	// read in enough data to fill the stream header
1369 	if (nread == kstxtsize) {
1370 		TranslatorStyledTextStreamHeader header;
1371 		memcpy(&header, buffer, kstxtsize);
1372 		if (swap_data(B_UINT32_TYPE, &header, kstxtsize,
1373 				B_SWAP_BENDIAN_TO_HOST) != B_OK)
1374 			return B_ERROR;
1375 
1376 		if (header.header.magic == B_STYLED_TEXT_FORMAT
1377 			&& header.header.header_size == (int32)kstxtsize
1378 			&& header.header.data_size == 0
1379 			&& header.version == 100)
1380 			return identify_stxt_header(header, inSource, outInfo, outType);
1381 	}
1382 
1383 	// if the data is not styled text, check if it is plain text
1384 	const char* encoding;
1385 	return identify_text(buffer, nread, inSource, outInfo, outType, encoding);
1386 }
1387 
1388 
1389 status_t
1390 STXTTranslator::Translate(BPositionIO* source, const translator_info* info,
1391 	BMessage* ioExtension, uint32 outType, BPositionIO* outDestination)
1392 {
1393 	if (!outType)
1394 		outType = B_TRANSLATOR_TEXT;
1395 	if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1396 		return B_NO_TRANSLATOR;
1397 
1398 	const ssize_t headerSize = sizeof(TranslatorStyledTextStreamHeader);
1399 	uint8 buffer[DATA_BUFFER_SIZE];
1400 	status_t result;
1401 	translator_info outInfo;
1402 	// Read in the header to determine
1403 	// if the data is supported
1404 	ssize_t bytesRead = source->Read(buffer, headerSize);
1405 	if (bytesRead < 0)
1406 		return bytesRead;
1407 
1408 	// read in enough data to fill the stream header
1409 	if (bytesRead == headerSize) {
1410 		TranslatorStyledTextStreamHeader header;
1411 		memcpy(&header, buffer, headerSize);
1412 		if (swap_data(B_UINT32_TYPE, &header, headerSize,
1413 				B_SWAP_BENDIAN_TO_HOST) != B_OK)
1414 			return B_ERROR;
1415 
1416 		if (header.header.magic == B_STYLED_TEXT_FORMAT
1417 			&& header.header.header_size == sizeof(TranslatorStyledTextStreamHeader)
1418 			&& header.header.data_size == 0
1419 			&& header.version == 100) {
1420 			TranslatorStyledTextTextHeader textHeader;
1421 			result = identify_stxt_header(header, source, &outInfo, outType,
1422 				&textHeader);
1423 			if (result != B_OK)
1424 				return result;
1425 
1426 			return translate_from_stxt(source, outDestination, outType, textHeader);
1427 		}
1428 	}
1429 
1430 	// if the data is not styled text, check if it is ASCII text
1431 	bool forceEncoding = false;
1432 	const char* encoding = NULL;
1433 	result = identify_text(buffer, bytesRead, source, &outInfo, outType, encoding);
1434 	if (result != B_OK)
1435 		return result;
1436 
1437 	if (ioExtension != NULL) {
1438 		const char* value;
1439 		if (ioExtension->FindString("be:encoding", &value) == B_OK
1440 			&& value[0]) {
1441 			// override encoding
1442 			encoding = value;
1443 			forceEncoding = true;
1444 		}
1445 	}
1446 
1447 	return translate_from_text(source, encoding, forceEncoding, outDestination, outType);
1448 }
1449 
1450 
1451 BView *
1452 STXTTranslator::NewConfigView(TranslatorSettings *settings)
1453 {
1454 	return new STXTView(BRect(0, 0, 225, 175), "STXTTranslator Settings",
1455 		B_FOLLOW_ALL, B_WILL_DRAW, settings);
1456 }
1457 
1458