xref: /haiku/src/kits/mail/mail_util.cpp (revision b46615c55ad2c8fe6de54412055a0713da3d610a)
1 /* mail util - header parsing
2 **
3 ** Copyright 2001-2003 Dr. Zoidberg Enterprises. All rights reserved.
4 */
5 
6 
7 #include <UTF8.h>
8 #include <Message.h>
9 #include <String.h>
10 #include <Locker.h>
11 #include <DataIO.h>
12 #include <List.h>
13 
14 #include <stdlib.h>
15 #include <string.h>
16 #include <stdio.h>
17 #define __USE_GNU
18 #include <regex.h>
19 #include <ctype.h>
20 #include <errno.h>
21 #include <parsedate.h>
22 
23 #include <mail_encoding.h>
24 
25 #include <mail_util.h>
26 
27 #include <CharacterSet.h>
28 #include <CharacterSetRoster.h>
29 
30 using namespace BPrivate;
31 
32 #define CRLF   "\r\n"
33 
34 struct CharsetConversionEntry
35 {
36 	const char *charset;
37 	uint32 flavor;
38 };
39 
// Table pairing MIME character set names with conversion constants.  Several
// names may share one constant; utf8_to_rfc2047() scans from the top and uses
// the first name whose constant matches, so the preferred spelling of each
// character set has to come first.  The table ends with an entry whose name
// pointer is NULL.
extern const CharsetConversionEntry mail_charsets [] =
{
	// In order of authority, so when searching for the name for a particular
	// numbered conversion, start at the beginning of the array.
	{"iso-8859-1",  B_ISO1_CONVERSION}, // MIME STANDARD
	{"iso-8859-2",  B_ISO2_CONVERSION}, // MIME STANDARD
	{"iso-8859-3",  B_ISO3_CONVERSION}, // MIME STANDARD
	{"iso-8859-4",  B_ISO4_CONVERSION}, // MIME STANDARD
	{"iso-8859-5",  B_ISO5_CONVERSION}, // MIME STANDARD
	{"iso-8859-6",  B_ISO6_CONVERSION}, // MIME STANDARD
	{"iso-8859-7",  B_ISO7_CONVERSION}, // MIME STANDARD
	{"iso-8859-8",  B_ISO8_CONVERSION}, // MIME STANDARD
	{"iso-8859-9",  B_ISO9_CONVERSION}, // MIME STANDARD
	{"iso-8859-10", B_ISO10_CONVERSION}, // MIME STANDARD
	{"iso-8859-13", B_ISO13_CONVERSION}, // MIME STANDARD
	{"iso-8859-14", B_ISO14_CONVERSION}, // MIME STANDARD
	{"iso-8859-15", B_ISO15_CONVERSION}, // MIME STANDARD

	{"shift_jis",	B_SJIS_CONVERSION}, // MIME STANDARD
	{"shift-jis",	B_SJIS_CONVERSION},
	{"iso-2022-jp", B_JIS_CONVERSION}, // MIME STANDARD
	{"euc-jp",		B_EUC_CONVERSION}, // MIME STANDARD

	{"euc-kr",      B_EUC_KR_CONVERSION}, // Shift encoding 7 bit and KSC-5601 if bit 8 is on. // MIME STANDARD
	{"ksc5601",		B_EUC_KR_CONVERSION},    // Not sure if 7 or 8 bit. // COMPATIBLE?
	{"ks_c_5601-1987", B_EUC_KR_CONVERSION}, // Not sure if 7 or 8 bit. // COMPATIBLE with stupid MS software

	{"koi8-r",      B_KOI8R_CONVERSION},           // MIME STANDARD
	{"windows-1251",B_MS_WINDOWS_1251_CONVERSION}, // MIME STANDARD
	{"windows-1252",B_MS_WINDOWS_CONVERSION},      // MIME STANDARD

	{"dos-437",     B_MS_DOS_CONVERSION},     // WRONG NAME : MIME STANDARD NAME = NONE ( IBM437? )
	{"dos-866",     B_MS_DOS_866_CONVERSION}, // WRONG NAME : MIME STANDARD NAME = NONE ( IBM866? )
	{"x-mac-roman", B_MAC_ROMAN_CONVERSION},  // WRONG NAME : MIME STANDARD NAME = NONE ( macintosh? + x-mac-roman? )

	// NOTE(review): the next entries use raw numbers instead of named
	// constants.  Presumably 24 and 25 are the Big5 and GBK conversion
	// constants from UTF8.h - confirm and switch to the names if available.
    {"big5",        24}, // MIME STANDARD

    {"gb18030",     25}, // WRONG NAME : MIME STANDARD NAME = NONE ( GB18030? )
    {"gb2312",      25}, // COMPATIBLE
    {"gbk",         25}, // COMPATIBLE

	/* {"utf-16",		B_UNICODE_CONVERSION}, Might not work due to NULs in text, needs testing. */
	{"us-ascii",	B_MAIL_US_ASCII_CONVERSION},                                  // MIME STANDARD
	{"utf-8",		B_MAIL_UTF8_CONVERSION /* Special code for no conversion */}, // MIME STANDARD

	{NULL, (uint32) -1} /* End of list marker, NULL string pointer is the key. */
};
87 
88 
89 status_t
90 write_read_attr(BNode& node, read_flags flag)
91 {
92 	if (node.WriteAttr(B_MAIL_ATTR_READ, B_INT32_TYPE, 0, &flag, sizeof(int32))
93 		< 0)
94 		return B_ERROR;
95 
96 #if R5_COMPATIBLE
97 	// manage the status string only if it currently has a "read" status
98 	BString currentStatus;
99 	if (node.ReadAttrString(B_MAIL_ATTR_STATUS, &currentStatus) == B_OK) {
100 		if (currentStatus.ICompare("New") != 0
101 			&& currentStatus.ICompare("Read") != 0
102 			&& currentStatus.ICompare("Seen") != 0)
103 			return B_OK;
104 	}
105 
106 	const char* statusString = (flag == B_READ) ? "Read"
107 		: (flag  == B_SEEN) ? "Seen" : "New";
108 	if (node.WriteAttr(B_MAIL_ATTR_STATUS, B_STRING_TYPE, 0, statusString,
109 		strlen(statusString)) < 0)
110 		return B_ERROR;
111 #endif
112 	return B_OK;
113 }
114 
115 
116 status_t
117 read_read_attr(BNode& node, read_flags& flag)
118 {
119 	if (node.ReadAttr(B_MAIL_ATTR_READ, B_INT32_TYPE, 0, &flag, sizeof(int32))
120 		== sizeof(int32))
121 		return B_OK;
122 
123 #if R5_COMPATIBLE
124 	BString statusString;
125 	if (node.ReadAttrString(B_MAIL_ATTR_STATUS, &statusString) == B_OK) {
126 		if (statusString.ICompare("New"))
127 			flag = B_UNREAD;
128 		else
129 			flag = B_READ;
130 
131 		return B_OK;
132 	}
133 #endif
134 	return B_ERROR;
135 }
136 
137 
138 // The next couple of functions are our wrapper around convert_to_utf8 and
139 // convert_from_utf8 so that they can also convert from UTF-8 to UTF-8 by
140 // specifying the B_MAIL_UTF8_CONVERSION constant as the conversion operation.  It
141 // also lets us add new conversions, like B_MAIL_US_ASCII_CONVERSION.
142 
143 _EXPORT status_t mail_convert_to_utf8 (
144 	uint32 srcEncoding,
145 	const char *src,
146 	int32 *srcLen,
147 	char *dst,
148 	int32 *dstLen,
149 	int32 *state,
150 	char substitute)
151 {
152 	int32    copyAmount;
153 	char    *originalDst = dst;
154 	status_t returnCode = -1;
155 
156 	if (srcEncoding == B_MAIL_UTF8_CONVERSION) {
157 		copyAmount = *srcLen;
158 		if (*dstLen < copyAmount)
159 			copyAmount = *dstLen;
160 		memcpy (dst, src, copyAmount);
161 		*srcLen = copyAmount;
162 		*dstLen = copyAmount;
163 		returnCode = B_OK;
164 	} else if (srcEncoding == B_MAIL_US_ASCII_CONVERSION) {
165 		int32 i;
166 		unsigned char letter;
167 		copyAmount = *srcLen;
168 		if (*dstLen < copyAmount)
169 			copyAmount = *dstLen;
170 		for (i = 0; i < copyAmount; i++) {
171 			letter = *src++;
172 			if (letter > 0x80U)
173 				// Invalid, could also use substitute, but better to strip high bit.
174 				*dst++ = letter - 0x80U;
175 			else if (letter == 0x80U)
176 				// Can't convert to 0x00 since that's NUL, which would cause problems.
177 				*dst++ = substitute;
178 			else
179 				*dst++ = letter;
180 		}
181 		*srcLen = copyAmount;
182 		*dstLen = copyAmount;
183 		returnCode = B_OK;
184 	} else
185 		returnCode = convert_to_utf8 (srcEncoding, src, srcLen,
186 			dst, dstLen, state, substitute);
187 
188 	if (returnCode == B_OK) {
189 		// Replace spurious NUL bytes, which should normally not be in the
190 		// output of the decoding (not normal UTF-8 characters, and no NULs are
191 		// in our usual input strings).  They happen for some odd ISO-2022-JP
192 		// byte pair combinations which are improperly handled by the BeOS
193 		// routines.  Like "\e$ByD\e(B" where \e is the ESC character $1B, the
194 		// first ESC $ B switches to a Japanese character set, then the next
195 		// two bytes "yD" specify a character, then ESC ( B switches back to
196 		// the ASCII character set.  The UTF-8 conversion yields a NUL byte.
197 		int32 i;
198 		for (i = 0; i < *dstLen; i++)
199 			if (originalDst[i] == 0)
200 				originalDst[i] = substitute;
201 	}
202 	return returnCode;
203 }
204 
205 
206 _EXPORT status_t mail_convert_from_utf8 (
207 	uint32 dstEncoding,
208 	const char *src,
209 	int32 *srcLen,
210 	char *dst,
211 	int32 *dstLen,
212 	int32 *state,
213 	char substitute)
214 {
215 	int32		copyAmount;
216 	status_t	errorCode;
217 	int32		originalDstLen = *dstLen;
218 	int32		tempDstLen;
219 	int32		tempSrcLen;
220 
221 	if (dstEncoding == B_MAIL_UTF8_CONVERSION)
222 	{
223 		copyAmount = *srcLen;
224 		if (*dstLen < copyAmount)
225 			copyAmount = *dstLen;
226 		memcpy (dst, src, copyAmount);
227 		*srcLen = copyAmount;
228 		*dstLen = copyAmount;
229 		return B_OK;
230 	}
231 
232 	if (dstEncoding == B_MAIL_US_ASCII_CONVERSION)
233 	{
234 		int32			characterLength;
235 		int32			dstRemaining = *dstLen;
236 		unsigned char	letter;
237 		int32			srcRemaining = *srcLen;
238 
239 		// state contains the number of source bytes to skip, left over from a
240 		// partial UTF-8 character split over the end of the buffer from last
241 		// time.
242 		if (srcRemaining <= *state) {
243 			*state -= srcRemaining;
244 			*dstLen = 0;
245 			return B_OK;
246 		}
247 		srcRemaining -= *state;
248 		src += *state;
249 		*state = 0;
250 
251 		while (true) {
252 			if (srcRemaining <= 0 || dstRemaining <= 0)
253 				break;
254 			letter = *src;
255 			if (letter < 0x80)
256 				characterLength = 1; // Regular ASCII equivalent code.
257 			else if (letter < 0xC0)
258 				characterLength = 1; // Invalid in-between data byte 10xxxxxx.
259 			else if (letter < 0xE0)
260 				characterLength = 2;
261 			else if (letter < 0xF0)
262 				characterLength = 3;
263 			else if (letter < 0xF8)
264 				characterLength = 4;
265 			else if (letter < 0xFC)
266 				characterLength = 5;
267 			else if (letter < 0xFE)
268 				characterLength = 6;
269 			else
270 				characterLength = 1; // 0xFE and 0xFF are invalid in UTF-8.
271 			if (letter < 0x80)
272 				*dst++ = *src;
273 			else
274 				*dst++ = substitute;
275 			dstRemaining--;
276 			if (srcRemaining < characterLength) {
277 				// Character split past the end of the buffer.
278 				*state = characterLength - srcRemaining;
279 				srcRemaining = 0;
280 			} else {
281 				src += characterLength;
282 				srcRemaining -= characterLength;
283 			}
284 		}
285 		// Update with the amounts used.
286 		*srcLen = *srcLen - srcRemaining;
287 		*dstLen = *dstLen - dstRemaining;
288 		return B_OK;
289 	}
290 
291 	errorCode = convert_from_utf8 (dstEncoding, src, srcLen, dst, dstLen, state, substitute);
292 	if (errorCode != B_OK)
293 		return errorCode;
294 
295 	if (dstEncoding != B_JIS_CONVERSION)
296 		return B_OK;
297 
298 	// B_JIS_CONVERSION (ISO-2022-JP) works by shifting between different
299 	// character subsets.  For E-mail headers (and other uses), it needs to be
300 	// switched back to ASCII at the end (otherwise the last character gets
301 	// lost or other weird things happen in the headers).  Note that we can't
302 	// just append the escape code since the convert_from_utf8 "state" will be
303 	// wrong.  So we append an ASCII letter and throw it away, leaving just the
304 	// escape code.  Well, it actually switches to the Roman character set, not
305 	// ASCII, but that should be OK.
306 
307 	tempDstLen = originalDstLen - *dstLen;
308 	if (tempDstLen < 3) // Not enough space remaining in the output.
309 		return B_OK; // Sort of an error, but we did convert the rest OK.
310 	tempSrcLen = 1;
311 	errorCode = convert_from_utf8 (dstEncoding, "a", &tempSrcLen,
312 		dst + *dstLen, &tempDstLen, state, substitute);
313 	if (errorCode != B_OK)
314 		return errorCode;
315 	*dstLen += tempDstLen - 1 /* don't include the ASCII letter */;
316 	return B_OK;
317 }
318 
319 
320 
321 static int handle_non_rfc2047_encoding(char **buffer,size_t *bufferLength,size_t *sourceLength)
322 {
323 	char *string = *buffer;
324 	int32 length = *sourceLength;
325 	int32 i;
326 
327 	// check for 8-bit characters
328 	for (i = 0;i < length;i++)
329 		if (string[i] & 0x80)
330 			break;
331 	if (i == length)
332 		return false;
333 
334 	// check for groups of 8-bit characters - this code is not very smart;
335 	// it just can detect some sort of single-byte encoded stuff, the rest
336 	// is regarded as UTF-8
337 
338 	int32 singletons = 0,doubles = 0;
339 
340 	for (i = 0;i < length;i++)
341 	{
342 		if (string[i] & 0x80)
343 		{
344 			if ((string[i + 1] & 0x80) == 0)
345 				singletons++;
346 			else doubles++;
347 			i++;
348 		}
349 	}
350 
351 	if (singletons != 0)	// can't be valid UTF-8 anymore, so we assume ISO-Latin-1
352 	{
353 		int32 state = 0;
354 		// just to be sure
355 		int32 destLength = length * 4 + 1;
356 		int32 destBufferLength = destLength;
357 		char *dest = (char*)malloc(destLength);
358 		if (dest == NULL)
359 			return 0;
360 
361 		if (convert_to_utf8(B_ISO1_CONVERSION, string, &length,dest,
362 			&destLength, &state) == B_OK) {
363 			*buffer = dest;
364 			*bufferLength = destBufferLength;
365 			*sourceLength = destLength;
366 			return true;
367 		}
368 		free(dest);
369 		return false;
370 	}
371 
372 	// we assume a valid UTF-8 string here, but yes, we don't check it
373 	return true;
374 }
375 
376 
377 _EXPORT ssize_t rfc2047_to_utf8(char **bufp, size_t *bufLen, size_t strLen)
378 {
379 	char *head, *tail;
380 	char *charset, *encoding, *end;
381 	ssize_t ret = B_OK;
382 
383 	if (bufp == NULL || *bufp == NULL)
384 		return -1;
385 
386 	char *string = *bufp;
387 
388 	//---------Handle *&&^%*&^ non-RFC compliant, 8bit mail
389 	if (handle_non_rfc2047_encoding(bufp,bufLen,&strLen))
390 		return strLen;
391 
392 	// set up string length
393 	if (strLen == 0)
394 		strLen = strlen(*bufp);
395 	char lastChar = (*bufp)[strLen];
396 	(*bufp)[strLen] = '\0';
397 
398 	//---------Whew! Now for RFC compliant mail
399 	bool encodedWordFoundPreviously = false;
400 	for (head = tail = string;
401 		((charset = strstr(tail, "=?")) != NULL)
402 		&& (((encoding = strchr(charset + 2, '?')) != NULL)
403 			&& encoding[1] && (encoding[2] == '?') && encoding[3])
404 		&& (end = strstr(encoding + 3, "?=")) != NULL;
405 		// found "=?...charset...?e?...text...?=   (e == encoding)
406 		//        ^charset       ^encoding    ^end
407 		tail = end)
408 	{
409 		// Copy non-encoded text (from tail up to charset) to the output.
410 		// Ignore spaces between two encoded "words".  RFC2047 says the words
411 		// should be concatenated without the space (designed for Asian
412 		// sentences which have no spaces yet need to be broken into "words" to
413 		// keep within the line length limits).
414 		bool nonSpaceFound = false;
415 		for (int i = 0; i < charset-tail; i++) {
416 			if (!isspace (tail[i])) {
417 				nonSpaceFound = true;
418 				break;
419 			}
420 		}
421 		if (!encodedWordFoundPreviously || nonSpaceFound) {
422 			if (string != tail && tail != charset)
423 				memmove(string, tail, charset-tail);
424 			string += charset-tail;
425 		}
426 		tail = charset;
427 		encodedWordFoundPreviously = true;
428 
429 		// move things to point at what they should:
430 		//   =?...charset...?e?...text...?=   (e == encoding)
431 		//     ^charset      ^encoding     ^end
432 		charset += 2;
433 		encoding += 1;
434 		end += 2;
435 
436 		// find the charset this text is in now
437 		size_t		cLen = encoding - 1 - charset;
438 		bool		base64encoded = toupper(*encoding) == 'B';
439 
440 		uint32 convert_id = B_MAIL_NULL_CONVERSION;
441 		char charset_string[cLen+1];
442 		memcpy(charset_string, charset, cLen);
443 		charset_string[cLen] = '\0';
444 		if (strcasecmp(charset_string, "us-ascii") == 0) {
445 			convert_id = B_MAIL_US_ASCII_CONVERSION;
446 		} else if (strcasecmp(charset_string, "utf-8") == 0) {
447 			convert_id = B_MAIL_UTF8_CONVERSION;
448 		} else {
449 			const BCharacterSet * cs = BCharacterSetRoster::FindCharacterSetByName(charset_string);
450 			if (cs != NULL) {
451 				convert_id = cs->GetConversionID();
452 			}
453 		}
454 		if (convert_id == B_MAIL_NULL_CONVERSION)
455 		{
456 			// unidentified charset
457 			// what to do? doing nothing skips the encoded text;
458 			// but we should keep it: we copy it to the output.
459 			if (string != tail && tail != end)
460 				memmove(string, tail, end-tail);
461 			string += end-tail;
462 			continue;
463 		}
464 		// else we've successfully identified the charset
465 
466 		char *src = encoding+2;
467 		int32 srcLen = end - 2 - src;
468 		// encoded text: src..src+srcLen
469 
470 		// decode text, get decoded length (reducing xforms)
471 		srcLen = !base64encoded ? decode_qp(src, src, srcLen, 1)
472 				: decode_base64(src, src, srcLen);
473 
474 		// allocate space for the converted text
475 		int32 dstLen = end-string + *bufLen-strLen;
476 		char *dst = (char*)malloc(dstLen);
477 		int32 cvLen = srcLen;
478 		int32 convState = 0;
479 
480 		//
481 		// do the conversion
482 		//
483 		ret = mail_convert_to_utf8(convert_id, src, &cvLen, dst, &dstLen, &convState);
484 		if (ret != B_OK)
485 		{
486 			// what to do? doing nothing skips the encoded text
487 			// but we should keep it: we copy it to the output.
488 
489 			free(dst);
490 
491 			if (string != tail && tail != end)
492 				memmove(string, tail, end-tail);
493 			string += end-tail;
494 			continue;
495 		}
496 		/* convert_to_ is either returning something wrong or my
497 		   test data is screwed up.  Whatever it is, Not Enough
498 		   Space is not the only cause of the below, so we just
499 		   assume it succeeds if it converts anything at all.
500 		else if (cvLen < srcLen)
501 		{
502 			// not enough room to convert the data;
503 			// grow *buf and retry
504 
505 			free(dst);
506 
507 			char *temp = (char*)realloc(*bufp, 2*(*bufLen + 1));
508 			if (temp == NULL)
509 			{
510 				ret = B_NO_MEMORY;
511 				break;
512 			}
513 
514 			*bufp = temp;
515 			*bufLen = 2*(*bufLen + 1);
516 
517 			string = *bufp + (string-head);
518 			tail = *bufp + (tail-head);
519 			charset = *bufp + (charset-head);
520 			encoding = *bufp + (encoding-head);
521 			end = *bufp + (end-head);
522 			src = *bufp + (src-head);
523 			head = *bufp;
524 			continue;
525 		}
526 		*/
527 		else
528 		{
529 			if (dstLen > end-string)
530 			{
531 				// copy the string forward...
532 				memmove(string+dstLen, end, strLen - (end-head) + 1);
533 				strLen += string+dstLen - end;
534 				end = string + dstLen;
535 			}
536 
537 			memcpy(string, dst, dstLen);
538 			string += dstLen;
539 			free(dst);
540 			continue;
541 		}
542 	}
543 
544 	// copy everything that's left
545 	size_t tailLen = strLen - (tail - head);
546 	memmove(string, tail, tailLen+1);
547 	string += tailLen;
548 
549 	// replace the last char
550 	(*bufp)[strLen] = lastChar;
551 
552 	return ret < B_OK ? ret : string-head;
553 }
554 
555 
556 _EXPORT ssize_t utf8_to_rfc2047 (char **bufp, ssize_t length, uint32 charset, char encoding) {
557 	struct word {
558 		BString	originalWord;
559 		BString	convertedWord;
560 		bool	needsEncoding;
561 
562 		// Convert the word from UTF-8 to the desired character set.  The
563 		// converted version also includes the escape codes to return to ASCII
564 		// mode, if relevant.  Also note if it uses unprintable characters,
565 		// which means it will need that special encoding treatment later.
566 		void ConvertWordToCharset (uint32 charset) {
567 			int32 state = 0;
568 			int32 originalLength = originalWord.Length();
569 			int32 convertedLength = originalLength * 5 + 1;
570 			char *convertedBuffer = convertedWord.LockBuffer (convertedLength);
571 			mail_convert_from_utf8 (charset, originalWord.String(),
572 				&originalLength, convertedBuffer, &convertedLength, &state);
573 			for (int i = 0; i < convertedLength; i++) {
574 				if ((convertedBuffer[i] & (1 << 7)) ||
575 					(convertedBuffer[i] >= 0 && convertedBuffer[i] < 32)) {
576 					needsEncoding = true;
577 					break;
578 				}
579 			}
580 			convertedWord.UnlockBuffer (convertedLength);
581 		};
582 	};
583 	struct word *currentWord;
584 	BList words;
585 
586 	// Break the header into words.  White space characters (including tabs and
587 	// newlines) separate the words.  Each word includes any space before it as
588 	// part of the word.  Actually, quotes and other special characters
589 	// (",()<>@) are treated as separate words of their own so that they don't
590 	// get encoded (because MIME headers get the quotes parsed before character
591 	// set unconversion is done).  The reader is supposed to ignore all white
592 	// space between encoded words, which can be inserted so that older mail
593 	// parsers don't have overly long line length problems.
594 
595 	const char *source = *bufp;
596 	const char *bufEnd = *bufp + length;
597 	const char *specialChars = "\"()<>@,";
598 
599 	while (source < bufEnd) {
600 		currentWord = new struct word;
601 		currentWord->needsEncoding = false;
602 
603 		int wordEnd = 0;
604 
605 		// Include leading spaces as part of the word.
606 		while (source + wordEnd < bufEnd && isspace (source[wordEnd]))
607 			wordEnd++;
608 
609 		if (source + wordEnd < bufEnd &&
610 			strchr (specialChars, source[wordEnd]) != NULL) {
611 			// Got a quote mark or other special character, which is treated as
612 			// a word in itself since it shouldn't be encoded, which would hide
613 			// it from the mail system.
614 			wordEnd++;
615 		} else {
616 			// Find the end of the word.  Leave wordEnd pointing just after the
617 			// last character in the word.
618 			while (source + wordEnd < bufEnd) {
619 				if (isspace(source[wordEnd]) ||
620 					strchr (specialChars, source[wordEnd]) != NULL)
621 					break;
622 				if (wordEnd > 51 /* Makes Base64 ISO-2022-JP "word" a multiple of 4 bytes */ &&
623 					0xC0 == (0xC0 & (unsigned int) source[wordEnd])) {
624 					// No English words are that long (46 is the longest),
625 					// break up what is likely Asian text (which has no spaces)
626 					// at the start of the next non-ASCII UTF-8 character (high
627 					// two bits are both ones).  Note that two encoded words in
628 					// a row get joined together, even if there is a space
629 					// between them in the final output text, according to the
630 					// standard.  Next word will also be conveniently get
631 					// encoded due to the 0xC0 test.
632 					currentWord->needsEncoding = true;
633 					break;
634 				}
635 				wordEnd++;
636 			}
637 		}
638 		currentWord->originalWord.SetTo (source, wordEnd);
639 		currentWord->ConvertWordToCharset (charset);
640 		words.AddItem(currentWord);
641 		source += wordEnd;
642 	}
643 
644 	// Combine adjacent words which contain unprintable text so that the
645 	// overhead of switching back and forth between regular text and specially
646 	// encoded text is reduced.  However, the combined word must be shorter
647 	// than the maximum of 75 bytes, including character set specification and
648 	// all those delimiters (worst case 22 bytes of overhead).
649 
650 	struct word *run;
651 
652 	for (int32 i = 0; (currentWord = (struct word *) words.ItemAt (i)) != NULL; i++) {
653 		if (!currentWord->needsEncoding)
654 			continue; // No need to combine unencoded words.
655 		for (int32 g = i+1; (run = (struct word *) words.ItemAt (g)) != NULL; g++) {
656 			if (!run->needsEncoding)
657 				break; // Don't want to combine encoded and unencoded words.
658 			if ((currentWord->convertedWord.Length() + run->convertedWord.Length() <= 53)) {
659 				currentWord->originalWord.Append (run->originalWord);
660 				currentWord->ConvertWordToCharset (charset);
661 				words.RemoveItem(g);
662 				delete run;
663 				g--;
664 			} else // Can't merge this word, result would be too long.
665 				break;
666 		}
667 	}
668 
669 	// Combine the encoded and unencoded words into one line, doing the
670 	// quoted-printable or base64 encoding.  Insert an extra space between
671 	// words which are both encoded to make word wrapping easier, since there
672 	// is normally none, and you're allowed to insert space (the receiver
673 	// throws it away if it is between encoded words).
674 
675 	BString rfc2047;
676 	bool	previousWordNeededEncoding = false;
677 
678 	const char *charset_dec = "none-bug";
679 	for (int32 i = 0; mail_charsets[i].charset != NULL; i++) {
680 		if (mail_charsets[i].flavor == charset) {
681 			charset_dec = mail_charsets[i].charset;
682 			break;
683 		}
684 	}
685 
686 	while ((currentWord = (struct word *)words.RemoveItem(0L)) != NULL) {
687 		if ((encoding != quoted_printable && encoding != base64) ||
688 		!currentWord->needsEncoding) {
689 			rfc2047.Append (currentWord->convertedWord);
690 		} else {
691 			// This word needs encoding.  Try to insert a space between it and
692 			// the previous word.
693 			if (previousWordNeededEncoding)
694 				rfc2047 << ' '; // Can insert as many spaces as you want between encoded words.
695 			else {
696 				// Previous word is not encoded, spaces are significant.  Try
697 				// to move a space from the start of this word to be outside of
698 				// the encoded text, so that there is a bit of space between
699 				// this word and the previous one to enhance word wrapping
700 				// chances later on.
701 				if (currentWord->originalWord.Length() > 1 &&
702 					isspace (currentWord->originalWord[0])) {
703 					rfc2047 << currentWord->originalWord[0];
704 					currentWord->originalWord.Remove (0 /* offset */, 1 /* length */);
705 					currentWord->ConvertWordToCharset (charset);
706 				}
707 			}
708 
709 			char *encoded = NULL;
710 			ssize_t encoded_len = 0;
711 			int32 convertedLength = currentWord->convertedWord.Length ();
712 			const char *convertedBuffer = currentWord->convertedWord.String ();
713 
714 			switch (encoding) {
715 				case quoted_printable:
716 					encoded = (char *) malloc (convertedLength * 3);
717 					encoded_len = encode_qp (encoded, convertedBuffer, convertedLength, true /* headerMode */);
718 					break;
719 				case base64:
720 					encoded = (char *) malloc (convertedLength * 2);
721 					encoded_len = encode_base64 (encoded, convertedBuffer, convertedLength, true /* headerMode */);
722 					break;
723 				default: // Unknown encoding type, shouldn't happen.
724 					encoded = (char *) convertedBuffer;
725 					encoded_len = convertedLength;
726 					break;
727 			}
728 
729 			rfc2047 << "=?" << charset_dec << '?' << encoding << '?';
730 			rfc2047.Append (encoded, encoded_len);
731 			rfc2047 << "?=";
732 
733 			if (encoding == quoted_printable || encoding == base64)
734 				free(encoded);
735 		}
736 		previousWordNeededEncoding = currentWord->needsEncoding;
737 		delete currentWord;
738 	}
739 
740 	free(*bufp);
741 
742 	ssize_t finalLength = rfc2047.Length ();
743 	*bufp = (char *) (malloc (finalLength + 1));
744 	memcpy (*bufp, rfc2047.String(), finalLength);
745 	(*bufp)[finalLength] = 0;
746 
747 	return finalLength;
748 }
749 
750 
751 //====================================================================
752 
753 void FoldLineAtWhiteSpaceAndAddCRLF (BString &string)
754 {
755 	int			inputLength = string.Length();
756 	int			lineStartIndex;
757 	const int	maxLineLength = 78; // Doesn't include CRLF.
758 	BString		output;
759 	int			splitIndex;
760 	int			tempIndex;
761 
762 	lineStartIndex = 0;
763 	while (true) {
764 		// If we don't need to wrap the text, just output the remainder, if any.
765 
766 		if (lineStartIndex + maxLineLength >= inputLength) {
767 			if (lineStartIndex < inputLength) {
768 				output.Insert (string, lineStartIndex /* source offset */,
769 					inputLength - lineStartIndex /* count */,
770 					output.Length() /* insert at */);
771 				output.Append (CRLF);
772 			}
773 			break;
774 		}
775 
776 		// Look ahead for a convenient spot to split it, between a comma and
777 		// space, which you often see between e-mail addresses like this:
778 		// "Joe Who" joe@dot.com, "Someone Else" else@blot.com
779 
780 		tempIndex = lineStartIndex + maxLineLength;
781 		if (tempIndex > inputLength)
782 			tempIndex = inputLength;
783 		splitIndex = string.FindLast (", ", tempIndex);
784 		if (splitIndex >= lineStartIndex)
785 			splitIndex++; // Point to the space character.
786 
787 		// If none of those exist, try splitting at any white space.
788 
789 		if (splitIndex <= lineStartIndex)
790 			splitIndex = string.FindLast (" ", tempIndex);
791 		if (splitIndex <= lineStartIndex)
792 			splitIndex = string.FindLast ("\t", tempIndex);
793 
794 		// If none of those exist, allow for a longer word - split at the next
795 		// available white space.
796 
797 		if (splitIndex <= lineStartIndex)
798 			splitIndex = string.FindFirst (" ", lineStartIndex + 1);
799 		if (splitIndex <= lineStartIndex)
800 			splitIndex = string.FindFirst ("\t", lineStartIndex + 1);
801 
802 		// Give up, the whole rest of the line can't be split, just dump it
803 		// out.
804 
805 		if (splitIndex <= lineStartIndex) {
806 			if (lineStartIndex < inputLength) {
807 				output.Insert (string, lineStartIndex /* source offset */,
808 					inputLength - lineStartIndex /* count */,
809 					output.Length() /* insert at */);
810 				output.Append (CRLF);
811 			}
812 			break;
813 		}
814 
815 		// Do the split.  The current line up to but not including the space
816 		// gets output, followed by a CRLF.  The space remains to become the
817 		// start of the next line (and that tells the message reader that it is
818 		// a continuation line).
819 
820 		output.Insert (string, lineStartIndex /* source offset */,
821 			splitIndex - lineStartIndex /* count */,
822 			output.Length() /* insert at */);
823 		output.Append (CRLF);
824 		lineStartIndex = splitIndex;
825 	}
826 	string.SetTo (output);
827 }
828 
829 
830 //====================================================================
831 
// Reads one header line from a file, joining folded lines.
//
// A line ends at a line feed; a CRLF pair is turned into a single line feed.
// If the following line starts with a space or a tab it is a continuation:
// the line feed is replaced by that white space character and reading goes
// on.  An empty line is returned by itself and never joined with what
// follows.  Text at the end of the file that has no line feed gets one
// appended.
//
// file   - the file to read from.
// buffer - in/out: address of a malloc()ed buffer (or of a NULL pointer).
//          The buffer is grown with realloc() as needed and handed back
//          here.  If "buffer" itself is NULL the text is discarded.
// buflen - in/out: size of the buffer; may be NULL.
//
// Returns the number of characters in the line including the line feed (0 at
// the end of the file), or a negative error code.  The text is NUL terminated
// unless an error is returned.
_EXPORT ssize_t readfoldedline(FILE *file, char **buffer, size_t *buflen)
{
	ssize_t len = buflen && *buflen ? *buflen : 0;
	char * buf = buffer && *buffer ? *buffer : NULL;
	ssize_t cnt = 0; // Number of characters currently in the buffer.
	int c;

	while (true)
	{
		// Make sure there is space in the buffer for two more characters (one
		// for the next character, and one for the end of string NUL byte).
		if (buf == NULL || cnt + 2 >= len)
		{
			char *temp = (char *)realloc(buf, len + 64);
			if (temp == NULL) {
				// Out of memory, however existing buffer remains allocated.
				cnt = ENOMEM;
				if (cnt >= 0)
					cnt = -1; // Error codes must be negative.
				break;
			}
			len += 64;
			buf = temp;
		}

		// Read the next character, or end of file, or IO error.
		if ((c = fgetc(file)) == EOF) {
			if (ferror (file)) {
				cnt = errno;
				if (cnt >= 0)
					cnt = -1; // Error codes must be negative.
			} else if (cnt > 0) {
				// Really is end of file.  Also make it end of line if there is
				// some text already read in.  If the first thing read was EOF,
				// just return an empty string.
				buf[cnt++] = '\n';
				if (buf[cnt-2] == '\r') {
					buf[cnt-2] = '\n';
					--cnt;
				}
			}
			break;
		}

		buf[cnt++] = c;

		if (c == '\n') {
			// Convert CRLF end of line to just a LF.  Do it before folding, in
			// case we don't need to fold.
			if (cnt >= 2 && buf[cnt-2] == '\r') {
				buf[cnt-2] = '\n';
				--cnt;
			}
			// If the current line is empty then return it (so that empty lines
			// don't disappear if the next line starts with a space).
			if (cnt <= 1)
				break;
			// Fold if first character on the next line is whitespace.
			c = fgetc(file); // Note it's OK to read EOF and ungetc it too.
			if (c == ' ' || c == '\t')
				buf[cnt-1] = c; // Replace \n with the white space character.
			else {
				// Not folding, we finished reading a line; break out of the loop
				ungetc(c,file);
				break;
			}
		}
	}

	// Terminate the text; cnt is negative if an error occurred.
	if (buf != NULL && cnt >= 0)
		buf[cnt] = '\0';

	if (buffer)
		*buffer = buf;
	else if (buf)
		free(buf);

	if (buflen)
		*buflen = len;

	return cnt;
}
915 
916 
917 //====================================================================
918 
919 _EXPORT ssize_t readfoldedline(BPositionIO &in, char **buffer, size_t *buflen)
920 {
921 	ssize_t len = buflen && *buflen ? *buflen : 0;
922 	char * buf = buffer && *buffer ? *buffer : NULL;
923 	ssize_t cnt = 0; // Number of characters currently in the buffer.
924 	char c;
925 	status_t errorCode;
926 
927 	while (true)
928 	{
929 		// Make sure there is space in the buffer for two more characters (one
930 		// for the next character, and one for the end of string NUL byte).
931 		if (buf == NULL || cnt + 2 >= len)
932 		{
933 			char *temp = (char *)realloc(buf, len + 64);
934 			if (temp == NULL) {
935 				// Out of memory, however existing buffer remains allocated.
936 				cnt = ENOMEM;
937 				break;
938 			}
939 			len += 64;
940 			buf = temp;
941 		}
942 
943 		errorCode = in.Read (&c,1); // A really slow way of reading - unbuffered.
944 		if (errorCode != 1) {
945 			if (errorCode < 0) {
946 				cnt = errorCode; // IO error encountered, just return the code.
947 			} else {
948 				// Really is end of file.  Also make it end of line if there is
949 				// some text already read in.  If the first thing read was EOF,
950 				// just return an empty string.
951 				if (cnt > 0) {
952 					buf[cnt++] = '\n';
953 					if (buf[cnt-2] == '\r') {
954 						buf[cnt-2] = '\n';
955 						--cnt;
956 					}
957 				}
958 			}
959 			break;
960 		}
961 
962 		buf[cnt++] = c;
963 
964 		if (c == '\n') {
965 			// Convert CRLF end of line to just a LF.  Do it before folding, in
966 			// case we don't need to fold.
967 			if (cnt >= 2 && buf[cnt-2] == '\r') {
968 				buf[cnt-2] = '\n';
969 				--cnt;
970 			}
971 			// If the current line is empty then return it (so that empty lines
972 			// don't disappear if the next line starts with a space).
973 			if (cnt <= 1)
974 				break;
975 			// if first character on the next line is whitespace, fold lines
976 			errorCode = in.Read(&c,1);
977 			if (errorCode == 1) {
978 				if (c == ' ' || c == '\t')
979 					buf[cnt-1] = c; // Replace \n with the white space character.
980 				else {
981 					// Not folding, we finished reading a whole line.
982 					in.Seek(-1,SEEK_CUR); // Undo the look-ahead character read.
983 					break;
984 				}
985 			} else if (errorCode < 0) {
986 				cnt = errorCode;
987 				break;
988 			} else // No next line; at the end of the file.  Return the line.
989 				break;
990 		}
991 	}
992 
993 	if (buf != NULL && cnt >= 0)
994 		buf[cnt] = '\0';
995 
996 	if (buffer)
997 		*buffer = buf;
998 	else if (buf)
999 		free(buf);
1000 
1001 	if (buflen)
1002 		*buflen = len;
1003 
1004 	return cnt;
1005 }
1006 
1007 
/** Extracts the next logical header line from the NUL terminated text that
 *	\a *header points to, joining folded continuation lines (lines starting
 *	with a space or a tab) into one line, and advances \a *header past the
 *	consumed text.
 *
 *	CRLF line ends are converted to a single LF.  When a line is folded, the
 *	line break is replaced by the white space character that started the
 *	continuation line.  An empty line is returned as is and never folded.
 *
 *	The terminating NUL of the text is never consumed: once the end is
 *	reached \a *header keeps pointing at the NUL, so calling the function
 *	again safely returns 0 instead of reading past the end of the text.
 *
 *	\param header  In/out: read position within the header text.
 *	\param buffer  In/out: optional malloc()ed buffer to reuse.  It is grown
 *	               with realloc() in steps of 64 bytes and handed back
 *	               here.  If \a buffer is NULL the internal buffer is freed
 *	               before returning.
 *	\param buflen  In/out: optional size of \a *buffer; updated on return.
 *	\return The number of characters stored (including the trailing '\n'),
 *	        0 if the end of the text was reached before anything was read,
 *	        or ENOMEM if the buffer could not be grown.
 */
_EXPORT ssize_t
nextfoldedline(const char** header, char **buffer, size_t *buflen)
{
	ssize_t len = buflen && *buflen ? *buflen : 0;
	char * buf = buffer && *buffer ? *buffer : NULL;
	ssize_t cnt = 0; // Number of characters currently in the buffer.
	char c;

	while (true)
	{
		// Make sure there is space in the buffer for two more characters (one
		// for the next character, and one for the end of string NUL byte).
		if (buf == NULL || cnt + 2 >= len)
		{
			char *temp = (char *)realloc(buf, len + 64);
			if (temp == NULL) {
				// Out of memory, however existing buffer remains allocated.
				// NOTE(review): the "cnt >= 0" test at the end presumes
				// ENOMEM is negative on this platform - confirm.
				cnt = ENOMEM;
				break;
			}
			len += 64;
			buf = temp;
		}

		// Look at the next character without consuming it, so that the
		// terminating NUL is never skipped.
		c = **header;
		if (c == 0) {
			// End of text.  Also make it end of line if there is some text
			// already read in.  If the first thing read was the end, just
			// return an empty string.
			if (cnt > 0) {
				buf[cnt++] = '\n';
				if (buf[cnt-2] == '\r') {
					buf[cnt-2] = '\n';
					--cnt;
				}
			}
			break;
		}
		(*header)++;

		buf[cnt++] = c;

		if (c == '\n') {
			// Convert CRLF end of line to just a LF.  Do it before folding, in
			// case we don't need to fold.
			if (cnt >= 2 && buf[cnt-2] == '\r') {
				buf[cnt-2] = '\n';
				--cnt;
			}
			// If the current line is empty then return it (so that empty lines
			// don't disappear if the next line starts with a space).
			if (cnt <= 1)
				break;
			// If the first character on the next line is whitespace, fold the
			// lines; otherwise leave that character for the next call.
			c = **header;
			if (c != ' ' && c != '\t')
				break;
			(*header)++;
			buf[cnt-1] = c; // Replace \n with the white space character.
		}
	}


	if (buf != NULL && cnt >= 0)
		buf[cnt] = '\0';

	if (buffer)
		*buffer = buf;
	else if (buf)
		free(buf);

	if (buflen)
		*buflen = len;

	return cnt;
}
1086 
1087 
1088 _EXPORT void
1089 trim_white_space(BString &string)
1090 {
1091 	int32 i;
1092 	int32 length = string.Length();
1093 	char *buffer = string.LockBuffer(length + 1);
1094 
1095 	while (length > 0 && isspace(buffer[length - 1]))
1096 		length--;
1097 	buffer[length] = '\0';
1098 
1099 	for (i = 0; buffer[i] && isspace(buffer[i]); i++) {}
1100 	if (i != 0) {
1101 		length -= i;
1102 		memmove(buffer,buffer + i,length + 1);
1103 	}
1104 	string.UnlockBuffer(length);
1105 }
1106 
1107 
1108 /** Tries to return a human-readable name from the specified
1109  *	header parameter (should be from "To:" or "From:").
1110  *	Tries to return the name rather than the eMail address.
1111  */
1112 
1113 _EXPORT void
1114 extract_address_name(BString &header)
1115 {
1116 	BString name;
1117 	const char *start = header.String();
1118 	const char *stop = start + strlen (start);
1119 
1120 	// Find a string S in the header (email foo) that matches:
1121 	//   Old style name in brackets: foo@bar.com (S)
1122 	//   New style quotes: "S" <foo@bar.com>
1123 	//   New style no quotes if nothing else found: S <foo@bar.com>
1124 	//   If nothing else found then use the whole thing: S
1125 
1126 	for (int i = 0; i <= 3; i++) {
1127 		// Set p1 to the first letter in the name and p2 to just past the last
1128 		// letter in the name.  p2 stays NULL if a name wasn't found in this
1129 		// pass.
1130 		const char *p1 = NULL, *p2 = NULL;
1131 
1132 		switch (i) {
1133 			case 0: // foo@bar.com (S)
1134 				if ((p1 = strchr(start,'(')) != NULL) {
1135 					p1++; // Advance to first letter in the name.
1136 					size_t nest = 1; // Handle nested brackets.
1137 					for (p2 = p1; p2 < stop; ++p2)
1138 					{
1139 						if (*p2 == ')')
1140 							--nest;
1141 						else if (*p2 == '(')
1142 							++nest;
1143 						if (nest <= 0)
1144 							break;
1145 					}
1146 					if (nest != 0)
1147 						p2 = NULL; // False alarm, no terminating bracket.
1148 				}
1149 				break;
1150 			case 1: // "S" <foo@bar.com>
1151 				if ((p1 = strchr(start, '\"')) != NULL)
1152 					p2 = strchr(++p1, '\"');
1153 				break;
1154 			case 2: // S <foo@bar.com>
1155 				p1 = start;
1156 				if (name.Length() == 0)
1157 					p2 = strchr(start, '<');
1158 				break;
1159 			case 3: // S
1160 				p1 = start;
1161 				if (name.Length() == 0)
1162 					p2 = stop;
1163 				break;
1164 		}
1165 
1166 		// Remove leading and trailing space-like characters and save the
1167 		// result if it is longer than any other likely names found.
1168 		if (p2 != NULL) {
1169 			while (p1 < p2 && (isspace (*p1)))
1170 				++p1;
1171 
1172 			while (p1 < p2 && (isspace (p2[-1])))
1173 				--p2;
1174 
1175 			int newLength = p2 - p1;
1176 			if (name.Length() < newLength)
1177 				name.SetTo(p1, newLength);
1178 		}
1179 	}
1180 
1181 	int32 lessIndex = name.FindFirst('<');
1182 	int32 greaterIndex = name.FindLast('>');
1183 
1184 	if (lessIndex == 0) {
1185 		// Have an address of the form <address> and nothing else, so remove
1186 		// the greater and less than signs, if any.
1187 		if (greaterIndex > 0)
1188 			name.Remove(greaterIndex, 1);
1189 		name.Remove(lessIndex, 1);
1190 	} else if (lessIndex > 0 && lessIndex < greaterIndex) {
1191 		// Yahoo stupidly inserts the e-mail address into the name string, so
1192 		// this bit of code fixes: "Joe <joe@yahoo.com>" <joe@yahoo.com>
1193 		name.Remove(lessIndex, greaterIndex - lessIndex + 1);
1194 	}
1195 
1196 	trim_white_space(name);
1197 	header = name;
1198 }
1199 
1200 
1201 
1202 // Given a subject in a BString, remove the extraneous RE: re: and other stuff
1203 // to get down to the core subject string, which should be identical for all
1204 // messages posted about a topic.  The input string is modified in place to
1205 // become the output core subject string.
1206 
1207 static int32				gLocker = 0;
1208 static size_t				gNsub = 1;
1209 static re_pattern_buffer	gRe;
1210 static re_pattern_buffer   *gRebuf = NULL;
1211 static unsigned char					gTranslation[256];
1212 
1213 _EXPORT void SubjectToThread (BString &string)
1214 {
1215 // a regex that matches a non-ASCII UTF8 character:
1216 #define U8C \
1217 	"[\302-\337][\200-\277]" \
1218 	"|\340[\302-\337][\200-\277]" \
1219 	"|[\341-\357][\200-\277][\200-\277]" \
1220 	"|\360[\220-\277][\200-\277][\200-\277]" \
1221 	"|[\361-\367][\200-\277][\200-\277][\200-\277]" \
1222 	"|\370[\210-\277][\200-\277][\200-\277][\200-\277]" \
1223 	"|[\371-\373][\200-\277][\200-\277][\200-\277][\200-\277]" \
1224 	"|\374[\204-\277][\200-\277][\200-\277][\200-\277][\200-\277]" \
1225 	"|\375[\200-\277][\200-\277][\200-\277][\200-\277][\200-\277]"
1226 
1227 #define PATTERN \
1228 	"^ +" \
1229 	"|^(\\[[^]]*\\])(\\<|  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1230 	"|^(  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1231 	"| *\\(fwd\\) *$"
1232 
1233 	if (gRebuf == NULL && atomic_add(&gLocker,1) == 0)
1234 	{
1235 		// the idea is to compile the regexp once to speed up testing
1236 
1237 		for (int i=0; i<256; ++i) gTranslation[i]=i;
1238 		for (int i='a'; i<='z'; ++i) gTranslation[i]=toupper(i);
1239 
1240 		gRe.translate = gTranslation;
1241 		gRe.regs_allocated = REGS_FIXED;
1242 		re_syntax_options = RE_SYNTAX_POSIX_EXTENDED;
1243 
1244 		const char *pattern = PATTERN;
1245 		// count subexpressions in PATTERN
1246 		for (unsigned int i=0; pattern[i] != 0; ++i)
1247 		{
1248 			if (pattern[i] == '\\')
1249 				++i;
1250 			else if (pattern[i] == '(')
1251 				++gNsub;
1252 		}
1253 
1254 		const char *err = re_compile_pattern(pattern,strlen(pattern),&gRe);
1255 		if (err == NULL)
1256 			gRebuf = &gRe;
1257 		else
1258 			fprintf(stderr, "Failed to compile the regex: %s\n", err);
1259 	}
1260 	else
1261 	{
1262 		int32 tries = 200;
1263 		while (gRebuf == NULL && tries-- > 0)
1264 			snooze(10000);
1265 	}
1266 
1267 	if (gRebuf)
1268 	{
1269 		struct re_registers regs;
1270 		// can't be static if this function is to be thread-safe
1271 
1272 		regs.num_regs = gNsub;
1273 		regs.start = (regoff_t*)malloc(gNsub*sizeof(regoff_t));
1274 		regs.end = (regoff_t*)malloc(gNsub*sizeof(regoff_t));
1275 
1276 		for (int start=0;
1277 		    (start=re_search(gRebuf, string.String(), string.Length(),
1278 							0, string.Length(), &regs)) >= 0;
1279 			)
1280 		{
1281 			//
1282 			// we found something
1283 			//
1284 
1285 			// don't delete [bemaildaemon]...
1286 			if (start == regs.start[1])
1287 				start = regs.start[2];
1288 
1289 			string.Remove(start,regs.end[0]-start);
1290 			if (start) string.Insert(' ',1,start);
1291 		}
1292 
1293 		free(regs.start);
1294 		free(regs.end);
1295 	}
1296 
1297 	// Finally remove leading and trailing space.  Some software, like
1298 	// tm-edit 1.8, appends a space to the subject, which would break
1299 	// threading if we left it in.
1300 	trim_white_space(string);
1301 }
1302 
1303 
1304 
1305 // Converts a date to a time.  Handles numeric time zones too, unlike
1306 // parsedate.  Returns -1 if it fails.
1307 
1308 _EXPORT time_t ParseDateWithTimeZone (const char *DateString)
1309 {
1310 	time_t	currentTime;
1311 	time_t	dateAsTime;
1312 	char	tempDateString [80];
1313 	char	tempZoneString [6];
1314 	time_t	zoneDeltaTime;
1315 	int		zoneIndex;
1316 	char   *zonePntr;
1317 
1318 	// See if we can remove the time zone portion.  parsedate understands time
1319 	// zone 3 letter names, but doesn't understand the numeric +9999 time zone
1320 	// format.  To do: see if a newer parsedate exists.
1321 
1322 	strncpy (tempDateString, DateString, sizeof (tempDateString));
1323 	tempDateString[sizeof (tempDateString) - 1] = 0;
1324 
1325 	// Remove trailing spaces.
1326 	zonePntr = tempDateString + strlen (tempDateString) - 1;
1327 	while (zonePntr >= tempDateString && isspace (*zonePntr))
1328 		*zonePntr-- = 0;
1329 	if (zonePntr < tempDateString)
1330 		return -1; // Empty string.
1331 
1332 	// Remove the trailing time zone in round brackets, like in
1333 	// Fri, 22 Feb 2002 15:22:42 EST (-0500)
1334 	// Thu, 25 Apr 1996 11:44:19 -0400 (EDT)
1335 	if (tempDateString[strlen(tempDateString)-1] == ')')
1336 	{
1337 		zonePntr = strrchr (tempDateString, '(');
1338 		if (zonePntr != NULL)
1339 		{
1340 			*zonePntr-- = 0; // Zap the '(', then remove trailing spaces.
1341 			while (zonePntr >= tempDateString && isspace (*zonePntr))
1342 				*zonePntr-- = 0;
1343 			if (zonePntr < tempDateString)
1344 				return -1; // Empty string.
1345 		}
1346 	}
1347 
1348 	// Look for a numeric time zone like  Tue, 30 Dec 2003 05:01:40 +0000
1349 	for (zoneIndex = strlen (tempDateString); zoneIndex >= 0; zoneIndex--)
1350 	{
1351 		zonePntr = tempDateString + zoneIndex;
1352 		if (zonePntr[0] == '+' || zonePntr[0] == '-')
1353 		{
1354 			if (zonePntr[1] >= '0' && zonePntr[1] <= '9' &&
1355 				zonePntr[2] >= '0' && zonePntr[2] <= '9' &&
1356 				zonePntr[3] >= '0' && zonePntr[3] <= '9' &&
1357 				zonePntr[4] >= '0' && zonePntr[4] <= '9')
1358 				break;
1359 		}
1360 	}
1361 	if (zoneIndex >= 0)
1362 	{
1363 		// Remove the zone from the date string and any following time zone
1364 		// letter codes.  Also put in GMT so that the date gets parsed as GMT.
1365 		memcpy (tempZoneString, zonePntr, 5);
1366 		tempZoneString [5] = 0;
1367 		strcpy (zonePntr, "GMT");
1368 	}
1369 	else // No numeric time zone found.
1370 		strcpy (tempZoneString, "+0000");
1371 
1372 	time (&currentTime);
1373 	dateAsTime = parsedate (tempDateString, currentTime);
1374 	if (dateAsTime == (time_t) -1)
1375 		return -1; // Failure.
1376 
1377 	zoneDeltaTime = 60 * atol (tempZoneString + 3); // Get the last two digits - minutes.
1378 	tempZoneString[3] = 0;
1379 	zoneDeltaTime += atol (tempZoneString + 1) * 60 * 60; // Get the first two digits - hours.
1380 	if (tempZoneString[0] == '+')
1381 		zoneDeltaTime = 0 - zoneDeltaTime;
1382 	dateAsTime += zoneDeltaTime;
1383 
1384 	return dateAsTime;
1385 }
1386 
1387 
1388 /** Parses a mail header and fills the headers BMessage
1389  */
1390 
1391 _EXPORT status_t
1392 parse_header(BMessage &headers, BPositionIO &input)
1393 {
1394 	char *buffer = NULL;
1395 	size_t bufferSize = 0;
1396 	int32 length;
1397 
1398 	while ((length = readfoldedline(input, &buffer, &bufferSize)) >= 2) {
1399 		--length;
1400 			// Don't include the \n at the end of the buffer.
1401 
1402 		// convert to UTF-8 and null-terminate the buffer
1403 		length = rfc2047_to_utf8(&buffer, &bufferSize, length);
1404 		buffer[length] = '\0';
1405 
1406 		const char *delimiter = strstr(buffer, ":");
1407 		if (delimiter == NULL)
1408 			continue;
1409 
1410 		BString header(buffer, delimiter - buffer);
1411 		header.CapitalizeEachWord();
1412 			// unified case for later fetch
1413 
1414 		delimiter++; // Skip the colon.
1415 		while (isspace (*delimiter))
1416 			delimiter++; // Skip over leading white space and tabs.  To do: (comments in brackets).
1417 
1418 		// ToDo: implement joining of multiple header tags (i.e. multiple "Cc:"s)
1419 		headers.AddString(header.String(), delimiter);
1420 	}
1421 	free(buffer);
1422 
1423 	return B_OK;
1424 }
1425 
1426 
1427 _EXPORT status_t
1428 extract_from_header(const BString& header, const BString& field,
1429 	BString& target)
1430 {
1431 	int32 headerLength = header.Length();
1432 	int32 fieldEndPos = 0;
1433 	while (true) {
1434 		int32 pos = header.IFindFirst(field, fieldEndPos);
1435 		if (pos < 0)
1436 			return B_BAD_VALUE;
1437 		fieldEndPos = pos + field.Length();
1438 
1439 		if (pos != 0 && header.ByteAt(pos - 1) != '\n')
1440 			continue;
1441 		if (header.ByteAt(fieldEndPos) == ':')
1442 			break;
1443 	}
1444 	fieldEndPos++;
1445 
1446 	int32 crPos = fieldEndPos;
1447 	while (true) {
1448 		fieldEndPos = crPos;
1449 		crPos = header.FindFirst('\n', crPos);
1450 		if (crPos < 0)
1451 			crPos = headerLength;
1452 		BString temp;
1453 		header.CopyInto(temp, fieldEndPos, crPos - fieldEndPos);
1454 		if (header.ByteAt(crPos - 1) == '\r') {
1455 			temp.Truncate(temp.Length() - 1);
1456 			temp += " ";
1457 		}
1458 		target += temp;
1459 		crPos++;
1460 		if (crPos >= headerLength)
1461 			break;
1462 		char nextByte = header.ByteAt(crPos);
1463 		if (nextByte != ' ' && nextByte != '\t')
1464 			break;
1465 		crPos++;
1466 	}
1467 
1468 	size_t bufferSize = target.Length();
1469 	char* buffer = target.LockBuffer(bufferSize);
1470 	size_t length = rfc2047_to_utf8(&buffer, &bufferSize, bufferSize);
1471 	target.UnlockBuffer(length);
1472 
1473 	return B_OK;
1474 }
1475 
1476 
1477 _EXPORT void
1478 extract_address(BString &address)
1479 {
1480 	const char *string = address.String();
1481 	int32 first;
1482 
1483 	// first, remove all quoted text
1484 
1485 	if ((first = address.FindFirst('"')) >= 0) {
1486 		int32 last = first + 1;
1487 		while (string[last] && string[last] != '"')
1488 			last++;
1489 
1490 		if (string[last] == '"')
1491 			address.Remove(first, last + 1 - first);
1492 	}
1493 
1494 	// try to extract the address now
1495 
1496 	if ((first = address.FindFirst('<')) >= 0) {
1497 		// the world likes us and we can just get the address the easy way...
1498 		int32 last = address.FindFirst('>');
1499 		if (last >= 0) {
1500 			address.Truncate(last);
1501 			address.Remove(0, first + 1);
1502 
1503 			return;
1504 		}
1505 	}
1506 
1507 	// then, see if there is anything in parenthesis to throw away
1508 
1509 	if ((first = address.FindFirst('(')) >= 0) {
1510 		int32 last = first + 1;
1511 		while (string[last] && string[last] != ')')
1512 			last++;
1513 
1514 		if (string[last] == ')')
1515 			address.Remove(first, last + 1 - first);
1516 	}
1517 
1518 	// now, there shouldn't be much else left
1519 
1520 	trim_white_space(address);
1521 }
1522 
1523 
1524 _EXPORT void
1525 get_address_list(BList &list, const char *string, void (*cleanupFunc)(BString &))
1526 {
1527 	if (string == NULL || !string[0])
1528 		return;
1529 
1530 	const char *start = string;
1531 
1532 	while (true) {
1533 		if (string[0] == '"') {
1534 			const char *quoteEnd = ++string;
1535 
1536 			while (quoteEnd[0] && quoteEnd[0] != '"')
1537 				quoteEnd++;
1538 
1539 			if (!quoteEnd[0])	// string exceeds line!
1540 				quoteEnd = string;
1541 
1542 			string = quoteEnd + 1;
1543 		}
1544 
1545 		if (string[0] == ',' || string[0] == '\0') {
1546 			BString address(start, string - start);
1547 			trim_white_space(address);
1548 
1549 			if (cleanupFunc)
1550 				cleanupFunc(address);
1551 
1552 			list.AddItem(strdup(address.String()));
1553 
1554 			start = string + 1;
1555 		}
1556 
1557 		if (!string[0])
1558 			break;
1559 
1560 		string++;
1561 	}
1562 }
1563 
1564