xref: /haiku/src/kits/mail/mail_util.cpp (revision 04a0e9c7b68cbe3a43d38e2bca8e860fd80936fb)
1 /*
2  * Copyright 2011, Haiku, Inc. All rights reserved.
3  * Copyright 2001-2003 Dr. Zoidberg Enterprises. All rights reserved.
4  */
5 
6 
7 #include <mail_util.h>
8 
9 #include <stdlib.h>
10 #include <string.h>
11 #include <stdio.h>
12 #define __USE_GNU
13 #include <regex.h>
14 #include <ctype.h>
15 #include <errno.h>
16 
17 #include <List.h>
18 #include <Locker.h>
19 #include <parsedate.h>
20 #include <String.h>
21 #include <UTF8.h>
22 
23 #include <mail_encoding.h>
24 
25 #include <CharacterSet.h>
26 #include <CharacterSetRoster.h>
27 
28 
29 using namespace BPrivate;
30 
31 
32 #define CRLF   "\r\n"
33 
34 struct CharsetConversionEntry {
35 	const char *charset;
36 	uint32 flavor;
37 };
38 
39 extern const CharsetConversionEntry mail_charsets[] = {
40 	// In order of authority, so when searching for the name for a particular
41 	// numbered conversion, start at the beginning of the array.
42 	{"iso-8859-1",  B_ISO1_CONVERSION}, // MIME STANDARD
43 	{"iso-8859-2",  B_ISO2_CONVERSION}, // MIME STANDARD
44 	{"iso-8859-3",  B_ISO3_CONVERSION}, // MIME STANDARD
45 	{"iso-8859-4",  B_ISO4_CONVERSION}, // MIME STANDARD
46 	{"iso-8859-5",  B_ISO5_CONVERSION}, // MIME STANDARD
47 	{"iso-8859-6",  B_ISO6_CONVERSION}, // MIME STANDARD
48 	{"iso-8859-7",  B_ISO7_CONVERSION}, // MIME STANDARD
49 	{"iso-8859-8",  B_ISO8_CONVERSION}, // MIME STANDARD
50 	{"iso-8859-9",  B_ISO9_CONVERSION}, // MIME STANDARD
51 	{"iso-8859-10", B_ISO10_CONVERSION}, // MIME STANDARD
52 	{"iso-8859-13", B_ISO13_CONVERSION}, // MIME STANDARD
53 	{"iso-8859-14", B_ISO14_CONVERSION}, // MIME STANDARD
54 	{"iso-8859-15", B_ISO15_CONVERSION}, // MIME STANDARD
55 
56 	{"shift_jis",	B_SJIS_CONVERSION}, // MIME STANDARD
57 	{"shift-jis",	B_SJIS_CONVERSION},
58 	{"iso-2022-jp", B_JIS_CONVERSION}, // MIME STANDARD
59 	{"euc-jp",		B_EUC_CONVERSION}, // MIME STANDARD
60 
61 	{"euc-kr",      B_EUC_KR_CONVERSION}, // Shift encoding 7 bit and KSC-5601 if bit 8 is on. // MIME STANDARD
62 	{"ksc5601",		B_EUC_KR_CONVERSION},    // Not sure if 7 or 8 bit. // COMPATIBLE?
63 	{"ks_c_5601-1987", B_EUC_KR_CONVERSION}, // Not sure if 7 or 8 bit. // COMPATIBLE with stupid MS software
64 
65 	{"koi8-r",      B_KOI8R_CONVERSION},           // MIME STANDARD
66 	{"windows-1251",B_MS_WINDOWS_1251_CONVERSION}, // MIME STANDARD
67 	{"windows-1252",B_MS_WINDOWS_CONVERSION},      // MIME STANDARD
68 
69 	{"dos-437",     B_MS_DOS_CONVERSION},     // WRONG NAME : MIME STANDARD NAME = NONE ( IBM437? )
70 	{"dos-866",     B_MS_DOS_866_CONVERSION}, // WRONG NAME : MIME STANDARD NAME = NONE ( IBM866? )
71 	{"x-mac-roman", B_MAC_ROMAN_CONVERSION},  // WRONG NAME : MIME STANDARD NAME = NONE ( macintosh? + x-mac-roman? )
72 
73     {"big5",        24}, // MIME STANDARD
74 
75     {"gb18030",     25}, // WRONG NAME : MIME STANDARD NAME = NONE ( GB18030? )
76     {"gb2312",      25}, // COMPATIBLE
77     {"gbk",         25}, // COMPATIBLE
78 
79 	/* {"utf-16",		B_UNICODE_CONVERSION}, Might not work due to NULs in text, needs testing. */
80 	{"us-ascii",	B_MAIL_US_ASCII_CONVERSION},                                  // MIME STANDARD
81 	{"utf-8",		B_MAIL_UTF8_CONVERSION /* Special code for no conversion */}, // MIME STANDARD
82 
83 	{NULL, (uint32) -1} /* End of list marker, NULL string pointer is the key. */
84 };
85 
86 
87 static int32 gLocker = 0;
88 static size_t gNsub = 1;
89 static re_pattern_buffer gRe;
90 static re_pattern_buffer *gRebuf = NULL;
91 static unsigned char gTranslation[256];
92 
93 
94 static int
95 handle_non_rfc2047_encoding(char **buffer, size_t *bufferLength,
96 	size_t *sourceLength)
97 {
98 	char *string = *buffer;
99 	int32 length = *sourceLength;
100 	int32 i;
101 
102 	// check for 8-bit characters
103 	for (i = 0;i < length;i++)
104 		if (string[i] & 0x80)
105 			break;
106 	if (i == length)
107 		return false;
108 
109 	// check for groups of 8-bit characters - this code is not very smart;
110 	// it just can detect some sort of single-byte encoded stuff, the rest
111 	// is regarded as UTF-8
112 
113 	int32 singletons = 0,doubles = 0;
114 
115 	for (i = 0;i < length;i++)
116 	{
117 		if (string[i] & 0x80)
118 		{
119 			if ((string[i + 1] & 0x80) == 0)
120 				singletons++;
121 			else doubles++;
122 			i++;
123 		}
124 	}
125 
126 	if (singletons != 0)	// can't be valid UTF-8 anymore, so we assume ISO-Latin-1
127 	{
128 		int32 state = 0;
129 		// just to be sure
130 		int32 destLength = length * 4 + 1;
131 		int32 destBufferLength = destLength;
132 		char *dest = (char*)malloc(destLength);
133 		if (dest == NULL)
134 			return 0;
135 
136 		if (convert_to_utf8(B_ISO1_CONVERSION, string, &length,dest,
137 			&destLength, &state) == B_OK) {
138 			*buffer = dest;
139 			*bufferLength = destBufferLength;
140 			*sourceLength = destLength;
141 			return true;
142 		}
143 		free(dest);
144 		return false;
145 	}
146 
147 	// we assume a valid UTF-8 string here, but yes, we don't check it
148 	return true;
149 }
150 
151 
152 // #pragma mark -
153 
154 
155 status_t
156 write_read_attr(BNode& node, read_flags flag)
157 {
158 	if (node.WriteAttr(B_MAIL_ATTR_READ, B_INT32_TYPE, 0, &flag, sizeof(int32))
159 			< 0)
160 		return B_ERROR;
161 
162 	// manage the status string only if it currently has a "read" status
163 	BString currentStatus;
164 	if (node.ReadAttrString(B_MAIL_ATTR_STATUS, &currentStatus) == B_OK) {
165 		if (currentStatus.ICompare("New") != 0
166 			&& currentStatus.ICompare("Read") != 0
167 			&& currentStatus.ICompare("Seen") != 0)
168 			return B_OK;
169 	}
170 
171 	const char* statusString = flag == B_READ ? "Read"
172 		: flag  == B_SEEN ? "Seen" : "New";
173 	if (node.WriteAttr(B_MAIL_ATTR_STATUS, B_STRING_TYPE, 0, statusString,
174 			strlen(statusString)) < 0)
175 		return B_ERROR;
176 
177 	return B_OK;
178 }
179 
180 
181 status_t
182 read_read_attr(BNode& node, read_flags& flag)
183 {
184 	if (node.ReadAttr(B_MAIL_ATTR_READ, B_INT32_TYPE, 0, &flag, sizeof(int32))
185 			== sizeof(int32))
186 		return B_OK;
187 
188 	BString statusString;
189 	if (node.ReadAttrString(B_MAIL_ATTR_STATUS, &statusString) == B_OK) {
190 		if (statusString.ICompare("New"))
191 			flag = B_UNREAD;
192 		else
193 			flag = B_READ;
194 
195 		return B_OK;
196 	}
197 
198 	return B_ERROR;
199 }
200 
201 
202 // The next couple of functions are our wrapper around convert_to_utf8 and
203 // convert_from_utf8 so that they can also convert from UTF-8 to UTF-8 by
204 // specifying the B_MAIL_UTF8_CONVERSION constant as the conversion operation.
205 // It also lets us add new conversions, like B_MAIL_US_ASCII_CONVERSION.
206 
207 
208 status_t
209 mail_convert_to_utf8(uint32 srcEncoding, const char *src, int32 *srcLen,
210 	char *dst, int32 *dstLen, int32 *state, char substitute)
211 {
212 	int32 copyAmount;
213 	char *originalDst = dst;
214 	status_t returnCode = -1;
215 
216 	if (srcEncoding == B_MAIL_UTF8_CONVERSION) {
217 		copyAmount = *srcLen;
218 		if (*dstLen < copyAmount)
219 			copyAmount = *dstLen;
220 		memcpy (dst, src, copyAmount);
221 		*srcLen = copyAmount;
222 		*dstLen = copyAmount;
223 		returnCode = B_OK;
224 	} else if (srcEncoding == B_MAIL_US_ASCII_CONVERSION) {
225 		int32 i;
226 		unsigned char letter;
227 		copyAmount = *srcLen;
228 		if (*dstLen < copyAmount)
229 			copyAmount = *dstLen;
230 		for (i = 0; i < copyAmount; i++) {
231 			letter = *src++;
232 			if (letter > 0x80U)
233 				// Invalid, could also use substitute, but better to strip high bit.
234 				*dst++ = letter - 0x80U;
235 			else if (letter == 0x80U)
236 				// Can't convert to 0x00 since that's NUL, which would cause problems.
237 				*dst++ = substitute;
238 			else
239 				*dst++ = letter;
240 		}
241 		*srcLen = copyAmount;
242 		*dstLen = copyAmount;
243 		returnCode = B_OK;
244 	} else
245 		returnCode = convert_to_utf8 (srcEncoding, src, srcLen,
246 			dst, dstLen, state, substitute);
247 
248 	if (returnCode == B_OK) {
249 		// Replace spurious NUL bytes, which should normally not be in the
250 		// output of the decoding (not normal UTF-8 characters, and no NULs are
251 		// in our usual input strings).  They happen for some odd ISO-2022-JP
252 		// byte pair combinations which are improperly handled by the BeOS
253 		// routines.  Like "\e$ByD\e(B" where \e is the ESC character $1B, the
254 		// first ESC $ B switches to a Japanese character set, then the next
255 		// two bytes "yD" specify a character, then ESC ( B switches back to
256 		// the ASCII character set.  The UTF-8 conversion yields a NUL byte.
257 		int32 i;
258 		for (i = 0; i < *dstLen; i++)
259 			if (originalDst[i] == 0)
260 				originalDst[i] = substitute;
261 	}
262 	return returnCode;
263 }
264 
265 
266 status_t
267 mail_convert_from_utf8(uint32 dstEncoding, const char *src, int32 *srcLen,
268 	char *dst, int32 *dstLen, int32 *state, char substitute)
269 {
270 	int32 copyAmount;
271 	status_t errorCode;
272 	int32 originalDstLen = *dstLen;
273 	int32 tempDstLen;
274 	int32 tempSrcLen;
275 
276 	if (dstEncoding == B_MAIL_UTF8_CONVERSION) {
277 		copyAmount = *srcLen;
278 		if (*dstLen < copyAmount)
279 			copyAmount = *dstLen;
280 		memcpy (dst, src, copyAmount);
281 		*srcLen = copyAmount;
282 		*dstLen = copyAmount;
283 		return B_OK;
284 	}
285 
286 	if (dstEncoding == B_MAIL_US_ASCII_CONVERSION) {
287 		int32 characterLength;
288 		int32 dstRemaining = *dstLen;
289 		unsigned char letter;
290 		int32 srcRemaining = *srcLen;
291 
292 		// state contains the number of source bytes to skip, left over from a
293 		// partial UTF-8 character split over the end of the buffer from last
294 		// time.
295 		if (srcRemaining <= *state) {
296 			*state -= srcRemaining;
297 			*dstLen = 0;
298 			return B_OK;
299 		}
300 		srcRemaining -= *state;
301 		src += *state;
302 		*state = 0;
303 
304 		while (true) {
305 			if (srcRemaining <= 0 || dstRemaining <= 0)
306 				break;
307 			letter = *src;
308 			if (letter < 0x80)
309 				characterLength = 1; // Regular ASCII equivalent code.
310 			else if (letter < 0xC0)
311 				characterLength = 1; // Invalid in-between data byte 10xxxxxx.
312 			else if (letter < 0xE0)
313 				characterLength = 2;
314 			else if (letter < 0xF0)
315 				characterLength = 3;
316 			else if (letter < 0xF8)
317 				characterLength = 4;
318 			else if (letter < 0xFC)
319 				characterLength = 5;
320 			else if (letter < 0xFE)
321 				characterLength = 6;
322 			else
323 				characterLength = 1; // 0xFE and 0xFF are invalid in UTF-8.
324 			if (letter < 0x80)
325 				*dst++ = *src;
326 			else
327 				*dst++ = substitute;
328 			dstRemaining--;
329 			if (srcRemaining < characterLength) {
330 				// Character split past the end of the buffer.
331 				*state = characterLength - srcRemaining;
332 				srcRemaining = 0;
333 			} else {
334 				src += characterLength;
335 				srcRemaining -= characterLength;
336 			}
337 		}
338 		// Update with the amounts used.
339 		*srcLen = *srcLen - srcRemaining;
340 		*dstLen = *dstLen - dstRemaining;
341 		return B_OK;
342 	}
343 
344 	errorCode = convert_from_utf8(dstEncoding, src, srcLen, dst, dstLen, state,
345 		substitute);
346 	if (errorCode != B_OK)
347 		return errorCode;
348 
349 	if (dstEncoding != B_JIS_CONVERSION)
350 		return B_OK;
351 
352 	// B_JIS_CONVERSION (ISO-2022-JP) works by shifting between different
353 	// character subsets.  For E-mail headers (and other uses), it needs to be
354 	// switched back to ASCII at the end (otherwise the last character gets
355 	// lost or other weird things happen in the headers).  Note that we can't
356 	// just append the escape code since the convert_from_utf8 "state" will be
357 	// wrong.  So we append an ASCII letter and throw it away, leaving just the
358 	// escape code.  Well, it actually switches to the Roman character set, not
359 	// ASCII, but that should be OK.
360 
361 	tempDstLen = originalDstLen - *dstLen;
362 	if (tempDstLen < 3) // Not enough space remaining in the output.
363 		return B_OK; // Sort of an error, but we did convert the rest OK.
364 	tempSrcLen = 1;
365 	errorCode = convert_from_utf8(dstEncoding, "a", &tempSrcLen,
366 		dst + *dstLen, &tempDstLen, state, substitute);
367 	if (errorCode != B_OK)
368 		return errorCode;
369 	*dstLen += tempDstLen - 1 /* don't include the ASCII letter */;
370 	return B_OK;
371 }
372 
373 
374 ssize_t
375 rfc2047_to_utf8(char **bufp, size_t *bufLen, size_t strLen)
376 {
377 	char *head, *tail;
378 	char *charset, *encoding, *end;
379 	ssize_t ret = B_OK;
380 
381 	if (bufp == NULL || *bufp == NULL)
382 		return -1;
383 
384 	char *string = *bufp;
385 
386 	//---------Handle *&&^%*&^ non-RFC compliant, 8bit mail
387 	if (handle_non_rfc2047_encoding(bufp,bufLen,&strLen))
388 		return strLen;
389 
390 	// set up string length
391 	if (strLen == 0)
392 		strLen = strlen(*bufp);
393 	char lastChar = (*bufp)[strLen];
394 	(*bufp)[strLen] = '\0';
395 
396 	//---------Whew! Now for RFC compliant mail
397 	bool encodedWordFoundPreviously = false;
398 	for (head = tail = string;
399 		((charset = strstr(tail, "=?")) != NULL)
400 		&& (((encoding = strchr(charset + 2, '?')) != NULL)
401 			&& encoding[1] && (encoding[2] == '?') && encoding[3])
402 		&& (end = strstr(encoding + 3, "?=")) != NULL;
403 		// found "=?...charset...?e?...text...?=   (e == encoding)
404 		//        ^charset       ^encoding    ^end
405 		tail = end)
406 	{
407 		// Copy non-encoded text (from tail up to charset) to the output.
408 		// Ignore spaces between two encoded "words".  RFC2047 says the words
409 		// should be concatenated without the space (designed for Asian
410 		// sentences which have no spaces yet need to be broken into "words" to
411 		// keep within the line length limits).
412 		bool nonSpaceFound = false;
413 		for (int i = 0; i < charset-tail; i++) {
414 			if (!isspace (tail[i])) {
415 				nonSpaceFound = true;
416 				break;
417 			}
418 		}
419 		if (!encodedWordFoundPreviously || nonSpaceFound) {
420 			if (string != tail && tail != charset)
421 				memmove(string, tail, charset-tail);
422 			string += charset-tail;
423 		}
424 		tail = charset;
425 		encodedWordFoundPreviously = true;
426 
427 		// move things to point at what they should:
428 		//   =?...charset...?e?...text...?=   (e == encoding)
429 		//     ^charset      ^encoding     ^end
430 		charset += 2;
431 		encoding += 1;
432 		end += 2;
433 
434 		// find the charset this text is in now
435 		size_t cLen = encoding - 1 - charset;
436 		bool base64encoded = toupper(*encoding) == 'B';
437 
438 		uint32 convertID = B_MAIL_NULL_CONVERSION;
439 		char charsetName[cLen + 1];
440 		memcpy(charsetName, charset, cLen);
441 		charsetName[cLen] = '\0';
442 		if (strcasecmp(charsetName, "us-ascii") == 0) {
443 			convertID = B_MAIL_US_ASCII_CONVERSION;
444 		} else if (strcasecmp(charsetName, "utf-8") == 0) {
445 			convertID = B_MAIL_UTF8_CONVERSION;
446 		} else {
447 			const BCharacterSet* charSet
448 				= BCharacterSetRoster::FindCharacterSetByName(charsetName);
449 			if (charSet != NULL) {
450 				convertID = charSet->GetConversionID();
451 			}
452 		}
453 		if (convertID == B_MAIL_NULL_CONVERSION) {
454 			// unidentified charset
455 			// what to do? doing nothing skips the encoded text;
456 			// but we should keep it: we copy it to the output.
457 			if (string != tail && tail != end)
458 				memmove(string, tail, end-tail);
459 			string += end-tail;
460 			continue;
461 		}
462 		// else we've successfully identified the charset
463 
464 		char *src = encoding+2;
465 		int32 srcLen = end - 2 - src;
466 		// encoded text: src..src+srcLen
467 
468 		// decode text, get decoded length (reducing xforms)
469 		srcLen = !base64encoded ? decode_qp(src, src, srcLen, 1)
470 			: decode_base64(src, src, srcLen);
471 
472 		// allocate space for the converted text
473 		int32 dstLen = end-string + *bufLen-strLen;
474 		char *dst = (char*)malloc(dstLen);
475 		int32 cvLen = srcLen;
476 		int32 convState = 0;
477 
478 		//
479 		// do the conversion
480 		//
481 		ret = mail_convert_to_utf8(convertID, src, &cvLen, dst, &dstLen,
482 			&convState);
483 		if (ret != B_OK) {
484 			// what to do? doing nothing skips the encoded text
485 			// but we should keep it: we copy it to the output.
486 
487 			free(dst);
488 
489 			if (string != tail && tail != end)
490 				memmove(string, tail, end-tail);
491 			string += end-tail;
492 			continue;
493 		}
494 		/* convert_to_ is either returning something wrong or my
495 		   test data is screwed up.  Whatever it is, Not Enough
496 		   Space is not the only cause of the below, so we just
497 		   assume it succeeds if it converts anything at all.
498 		else if (cvLen < srcLen)
499 		{
500 			// not enough room to convert the data;
501 			// grow *buf and retry
502 
503 			free(dst);
504 
505 			char *temp = (char*)realloc(*bufp, 2*(*bufLen + 1));
506 			if (temp == NULL)
507 			{
508 				ret = B_NO_MEMORY;
509 				break;
510 			}
511 
512 			*bufp = temp;
513 			*bufLen = 2*(*bufLen + 1);
514 
515 			string = *bufp + (string-head);
516 			tail = *bufp + (tail-head);
517 			charset = *bufp + (charset-head);
518 			encoding = *bufp + (encoding-head);
519 			end = *bufp + (end-head);
520 			src = *bufp + (src-head);
521 			head = *bufp;
522 			continue;
523 		}
524 		*/
525 		else {
526 			if (dstLen > end-string) {
527 				// copy the string forward...
528 				memmove(string+dstLen, end, strLen - (end-head) + 1);
529 				strLen += string+dstLen - end;
530 				end = string + dstLen;
531 			}
532 
533 			memcpy(string, dst, dstLen);
534 			string += dstLen;
535 			free(dst);
536 			continue;
537 		}
538 	}
539 
540 	// copy everything that's left
541 	size_t tailLen = strLen - (tail - head);
542 	memmove(string, tail, tailLen+1);
543 	string += tailLen;
544 
545 	// replace the last char
546 	(*bufp)[strLen] = lastChar;
547 
548 	return ret < B_OK ? ret : string-head;
549 }
550 
551 
552 ssize_t
553 utf8_to_rfc2047 (char **bufp, ssize_t length, uint32 charset, char encoding)
554 {
555 	struct word {
556 		BString	originalWord;
557 		BString	convertedWord;
558 		bool	needsEncoding;
559 
560 		// Convert the word from UTF-8 to the desired character set.  The
561 		// converted version also includes the escape codes to return to ASCII
562 		// mode, if relevant.  Also note if it uses unprintable characters,
563 		// which means it will need that special encoding treatment later.
564 		void ConvertWordToCharset (uint32 charset) {
565 			int32 state = 0;
566 			int32 originalLength = originalWord.Length();
567 			int32 convertedLength = originalLength * 5 + 1;
568 			char *convertedBuffer = convertedWord.LockBuffer (convertedLength);
569 			mail_convert_from_utf8 (charset, originalWord.String(),
570 				&originalLength, convertedBuffer, &convertedLength, &state);
571 			for (int i = 0; i < convertedLength; i++) {
572 				if ((convertedBuffer[i] & (1 << 7)) ||
573 					(convertedBuffer[i] >= 0 && convertedBuffer[i] < 32)) {
574 					needsEncoding = true;
575 					break;
576 				}
577 			}
578 			convertedWord.UnlockBuffer (convertedLength);
579 		};
580 	};
581 	struct word *currentWord;
582 	BList words;
583 
584 	// Break the header into words.  White space characters (including tabs and
585 	// newlines) separate the words.  Each word includes any space before it as
586 	// part of the word.  Actually, quotes and other special characters
587 	// (",()<>@) are treated as separate words of their own so that they don't
588 	// get encoded (because MIME headers get the quotes parsed before character
589 	// set unconversion is done).  The reader is supposed to ignore all white
590 	// space between encoded words, which can be inserted so that older mail
591 	// parsers don't have overly long line length problems.
592 
593 	const char *source = *bufp;
594 	const char *bufEnd = *bufp + length;
595 	const char *specialChars = "\"()<>@,";
596 
597 	while (source < bufEnd) {
598 		currentWord = new struct word;
599 		currentWord->needsEncoding = false;
600 
601 		int wordEnd = 0;
602 
603 		// Include leading spaces as part of the word.
604 		while (source + wordEnd < bufEnd && isspace (source[wordEnd]))
605 			wordEnd++;
606 
607 		if (source + wordEnd < bufEnd &&
608 			strchr (specialChars, source[wordEnd]) != NULL) {
609 			// Got a quote mark or other special character, which is treated as
610 			// a word in itself since it shouldn't be encoded, which would hide
611 			// it from the mail system.
612 			wordEnd++;
613 		} else {
614 			// Find the end of the word.  Leave wordEnd pointing just after the
615 			// last character in the word.
616 			while (source + wordEnd < bufEnd) {
617 				if (isspace(source[wordEnd]) ||
618 					strchr (specialChars, source[wordEnd]) != NULL)
619 					break;
620 				if (wordEnd > 51 /* Makes Base64 ISO-2022-JP "word" a multiple of 4 bytes */ &&
621 					0xC0 == (0xC0 & (unsigned int) source[wordEnd])) {
622 					// No English words are that long (46 is the longest),
623 					// break up what is likely Asian text (which has no spaces)
624 					// at the start of the next non-ASCII UTF-8 character (high
625 					// two bits are both ones).  Note that two encoded words in
626 					// a row get joined together, even if there is a space
627 					// between them in the final output text, according to the
628 					// standard.  Next word will also be conveniently get
629 					// encoded due to the 0xC0 test.
630 					currentWord->needsEncoding = true;
631 					break;
632 				}
633 				wordEnd++;
634 			}
635 		}
636 		currentWord->originalWord.SetTo (source, wordEnd);
637 		currentWord->ConvertWordToCharset (charset);
638 		words.AddItem(currentWord);
639 		source += wordEnd;
640 	}
641 
642 	// Combine adjacent words which contain unprintable text so that the
643 	// overhead of switching back and forth between regular text and specially
644 	// encoded text is reduced.  However, the combined word must be shorter
645 	// than the maximum of 75 bytes, including character set specification and
646 	// all those delimiters (worst case 22 bytes of overhead).
647 
648 	struct word *run;
649 
650 	for (int32 i = 0; (currentWord = (struct word *) words.ItemAt (i)) != NULL; i++) {
651 		if (!currentWord->needsEncoding)
652 			continue; // No need to combine unencoded words.
653 		for (int32 g = i+1; (run = (struct word *) words.ItemAt (g)) != NULL; g++) {
654 			if (!run->needsEncoding)
655 				break; // Don't want to combine encoded and unencoded words.
656 			if ((currentWord->convertedWord.Length() + run->convertedWord.Length() <= 53)) {
657 				currentWord->originalWord.Append (run->originalWord);
658 				currentWord->ConvertWordToCharset (charset);
659 				words.RemoveItem(g);
660 				delete run;
661 				g--;
662 			} else // Can't merge this word, result would be too long.
663 				break;
664 		}
665 	}
666 
667 	// Combine the encoded and unencoded words into one line, doing the
668 	// quoted-printable or base64 encoding.  Insert an extra space between
669 	// words which are both encoded to make word wrapping easier, since there
670 	// is normally none, and you're allowed to insert space (the receiver
671 	// throws it away if it is between encoded words).
672 
673 	BString rfc2047;
674 	bool	previousWordNeededEncoding = false;
675 
676 	const char *charset_dec = "none-bug";
677 	for (int32 i = 0; mail_charsets[i].charset != NULL; i++) {
678 		if (mail_charsets[i].flavor == charset) {
679 			charset_dec = mail_charsets[i].charset;
680 			break;
681 		}
682 	}
683 
684 	while ((currentWord = (struct word *)words.RemoveItem((int32)0)) != NULL) {
685 		if ((encoding != quoted_printable && encoding != base64) ||
686 		!currentWord->needsEncoding) {
687 			rfc2047.Append (currentWord->convertedWord);
688 		} else {
689 			// This word needs encoding.  Try to insert a space between it and
690 			// the previous word.
691 			if (previousWordNeededEncoding)
692 				rfc2047 << ' '; // Can insert as many spaces as you want between encoded words.
693 			else {
694 				// Previous word is not encoded, spaces are significant.  Try
695 				// to move a space from the start of this word to be outside of
696 				// the encoded text, so that there is a bit of space between
697 				// this word and the previous one to enhance word wrapping
698 				// chances later on.
699 				if (currentWord->originalWord.Length() > 1 &&
700 					isspace (currentWord->originalWord[0])) {
701 					rfc2047 << currentWord->originalWord[0];
702 					currentWord->originalWord.Remove (0 /* offset */, 1 /* length */);
703 					currentWord->ConvertWordToCharset (charset);
704 				}
705 			}
706 
707 			char *encoded = NULL;
708 			ssize_t encoded_len = 0;
709 			int32 convertedLength = currentWord->convertedWord.Length ();
710 			const char *convertedBuffer = currentWord->convertedWord.String ();
711 
712 			switch (encoding) {
713 				case quoted_printable:
714 					encoded = (char *) malloc (convertedLength * 3);
715 					encoded_len = encode_qp (encoded, convertedBuffer, convertedLength, true /* headerMode */);
716 					break;
717 				case base64:
718 					encoded = (char *) malloc (convertedLength * 2);
719 					encoded_len = encode_base64 (encoded, convertedBuffer, convertedLength, true /* headerMode */);
720 					break;
721 				default: // Unknown encoding type, shouldn't happen.
722 					encoded = (char *) convertedBuffer;
723 					encoded_len = convertedLength;
724 					break;
725 			}
726 
727 			rfc2047 << "=?" << charset_dec << '?' << encoding << '?';
728 			rfc2047.Append (encoded, encoded_len);
729 			rfc2047 << "?=";
730 
731 			if (encoding == quoted_printable || encoding == base64)
732 				free(encoded);
733 		}
734 		previousWordNeededEncoding = currentWord->needsEncoding;
735 		delete currentWord;
736 	}
737 
738 	free(*bufp);
739 
740 	ssize_t finalLength = rfc2047.Length ();
741 	*bufp = (char *) (malloc (finalLength + 1));
742 	memcpy (*bufp, rfc2047.String(), finalLength);
743 	(*bufp)[finalLength] = 0;
744 
745 	return finalLength;
746 }
747 
748 
749 void
750 FoldLineAtWhiteSpaceAndAddCRLF(BString &string)
751 {
752 	int inputLength = string.Length();
753 	int lineStartIndex;
754 	const int maxLineLength = 78; // Doesn't include CRLF.
755 	BString output;
756 	int splitIndex;
757 	int tempIndex;
758 
759 	lineStartIndex = 0;
760 	while (true) {
761 		// If we don't need to wrap the text, just output the remainder, if any.
762 
763 		if (lineStartIndex + maxLineLength >= inputLength) {
764 			if (lineStartIndex < inputLength) {
765 				output.Insert (string, lineStartIndex /* source offset */,
766 					inputLength - lineStartIndex /* count */,
767 					output.Length() /* insert at */);
768 				output.Append (CRLF);
769 			}
770 			break;
771 		}
772 
773 		// Look ahead for a convenient spot to split it, between a comma and
774 		// space, which you often see between e-mail addresses like this:
775 		// "Joe Who" joe@dot.com, "Someone Else" else@blot.com
776 
777 		tempIndex = lineStartIndex + maxLineLength;
778 		if (tempIndex > inputLength)
779 			tempIndex = inputLength;
780 		splitIndex = string.FindLast (", ", tempIndex);
781 		if (splitIndex >= lineStartIndex)
782 			splitIndex++; // Point to the space character.
783 
784 		// If none of those exist, try splitting at any white space.
785 
786 		if (splitIndex <= lineStartIndex)
787 			splitIndex = string.FindLast (" ", tempIndex);
788 		if (splitIndex <= lineStartIndex)
789 			splitIndex = string.FindLast ("\t", tempIndex);
790 
791 		// If none of those exist, allow for a longer word - split at the next
792 		// available white space.
793 
794 		if (splitIndex <= lineStartIndex)
795 			splitIndex = string.FindFirst (" ", lineStartIndex + 1);
796 		if (splitIndex <= lineStartIndex)
797 			splitIndex = string.FindFirst ("\t", lineStartIndex + 1);
798 
799 		// Give up, the whole rest of the line can't be split, just dump it
800 		// out.
801 
802 		if (splitIndex <= lineStartIndex) {
803 			if (lineStartIndex < inputLength) {
804 				output.Insert (string, lineStartIndex /* source offset */,
805 					inputLength - lineStartIndex /* count */,
806 					output.Length() /* insert at */);
807 				output.Append (CRLF);
808 			}
809 			break;
810 		}
811 
812 		// Do the split.  The current line up to but not including the space
813 		// gets output, followed by a CRLF.  The space remains to become the
814 		// start of the next line (and that tells the message reader that it is
815 		// a continuation line).
816 
817 		output.Insert (string, lineStartIndex /* source offset */,
818 			splitIndex - lineStartIndex /* count */,
819 			output.Length() /* insert at */);
820 		output.Append (CRLF);
821 		lineStartIndex = splitIndex;
822 	}
823 	string.SetTo (output);
824 }
825 
826 
// Reads one header line from "file", joining folded continuation lines
// (lines starting with a space or tab) into it.
//
// The line ends up in *buffer, NUL terminated and with a single '\n' at its
// end; a CRLF is turned into just LF.  When lines are folded, the line break
// is replaced by the white space character that started the continuation.
// *buffer is grown with realloc() in steps of 64 bytes as needed; *buflen is
// its allocated size.  Both may start out as NULL and 0.  If "buffer" itself
// is NULL, the line is discarded.
//
// Returns the number of characters in the line, 0 at the end of the file,
// ENOMEM if the buffer could not be grown, or a negative error code if
// reading failed.
ssize_t
readfoldedline(FILE *file, char **buffer, size_t *buflen)
{
	ssize_t capacity = (buflen != NULL && *buflen != 0) ? *buflen : 0;
	char *line = (buffer != NULL && *buffer != NULL) ? *buffer : NULL;
	ssize_t used = 0;
		// Number of characters currently in the buffer.

	for (;;) {
		// Make sure there is space in the buffer for two more characters
		// (the next character, and the NUL byte ending the string).
		if (line == NULL || used + 2 >= capacity) {
			char *grown = (char *)realloc(line, capacity + 64);
			if (grown == NULL) {
				// Out of memory; the existing buffer remains allocated.
				used = ENOMEM;
				break;
			}
			capacity += 64;
			line = grown;
		}

		// Get the next character, or the end of the file, or an IO error.
		const int current = fgetc(file);
		if (current == EOF) {
			if (ferror(file)) {
				used = errno;
				if (used >= 0) {
					// Error codes must be negative.
					used = -1;
				}
			} else if (used > 0) {
				// It really is the end of the file.  Make it the end of the
				// line too, as some text has been read in already.  (If EOF
				// was the first thing read, an empty string is returned.)
				line[used++] = '\n';
				if (line[used - 2] == '\r') {
					line[used - 2] = '\n';
					--used;
				}
			}
			break;
		}

		line[used++] = current;
		if (current != '\n')
			continue;

		// Convert a CRLF end of line to just a LF.  Do it before folding,
		// in case we don't need to fold.
		if (used >= 2 && line[used - 2] == '\r') {
			line[used - 2] = '\n';
			--used;
		}

		// Return empty lines right away, so that they don't disappear if
		// the next line starts with a space.
		if (used <= 1)
			break;

		// Fold if the first character of the next line is white space.
		// Note that it's OK to read EOF and ungetc() it, too.
		const int lookAhead = fgetc(file);
		if (lookAhead != ' ' && lookAhead != '\t') {
			// Not folding, we have finished reading a line.
			ungetc(lookAhead, file);
			break;
		}

		// Replace the \n with the white space character.
		line[used - 1] = lookAhead;
	}

	if (line != NULL && used >= 0)
		line[used] = '\0';

	if (buffer != NULL)
		*buffer = line;
	else if (line != NULL)
		free(line);

	if (buflen != NULL)
		*buflen = capacity;

	return used;
}
908 
909 
910 ssize_t
911 readfoldedline(BPositionIO &in, char **buffer, size_t *buflen)
912 {
913 	ssize_t len = buflen && *buflen ? *buflen : 0;
914 	char * buf = buffer && *buffer ? *buffer : NULL;
915 	ssize_t cnt = 0; // Number of characters currently in the buffer.
916 	char c;
917 	status_t errorCode;
918 
919 	while (true) {
920 		// Make sure there is space in the buffer for two more characters (one
921 		// for the next character, and one for the end of string NUL byte).
922 		if (buf == NULL || cnt + 2 >= len) {
923 			char *temp = (char *)realloc(buf, len + 64);
924 			if (temp == NULL) {
925 				// Out of memory, however existing buffer remains allocated.
926 				cnt = ENOMEM;
927 				break;
928 			}
929 			len += 64;
930 			buf = temp;
931 		}
932 
933 		errorCode = in.Read (&c,1); // A really slow way of reading - unbuffered.
934 		if (errorCode != 1) {
935 			if (errorCode < 0) {
936 				cnt = errorCode; // IO error encountered, just return the code.
937 			} else {
938 				// Really is end of file.  Also make it end of line if there is
939 				// some text already read in.  If the first thing read was EOF,
940 				// just return an empty string.
941 				if (cnt > 0) {
942 					buf[cnt++] = '\n';
943 					if (buf[cnt-2] == '\r') {
944 						buf[cnt-2] = '\n';
945 						--cnt;
946 					}
947 				}
948 			}
949 			break;
950 		}
951 
952 		buf[cnt++] = c;
953 
954 		if (c == '\n') {
955 			// Convert CRLF end of line to just a LF.  Do it before folding, in
956 			// case we don't need to fold.
957 			if (cnt >= 2 && buf[cnt-2] == '\r') {
958 				buf[cnt-2] = '\n';
959 				--cnt;
960 			}
961 			// If the current line is empty then return it (so that empty lines
962 			// don't disappear if the next line starts with a space).
963 			if (cnt <= 1)
964 				break;
965 			// if first character on the next line is whitespace, fold lines
966 			errorCode = in.Read(&c,1);
967 			if (errorCode == 1) {
968 				if (c == ' ' || c == '\t')
969 					buf[cnt-1] = c; // Replace \n with the white space character.
970 				else {
971 					// Not folding, we finished reading a whole line.
972 					in.Seek(-1,SEEK_CUR); // Undo the look-ahead character read.
973 					break;
974 				}
975 			} else if (errorCode < 0) {
976 				cnt = errorCode;
977 				break;
978 			} else // No next line; at the end of the file.  Return the line.
979 				break;
980 		}
981 	}
982 
983 	if (buf != NULL && cnt >= 0)
984 		buf[cnt] = '\0';
985 
986 	if (buffer)
987 		*buffer = buf;
988 	else if (buf)
989 		free(buf);
990 
991 	if (buflen)
992 		*buflen = len;
993 
994 	return cnt;
995 }
996 
997 
/*!	Extracts the next logical (unfolded) header line from a NUL terminated
	block of text held in memory.

	A physical line whose successor starts with a space or a tab is joined
	with that successor: the line break is replaced by the white space
	character.  CRLF line ends are turned into a single LF.  An empty line is
	always returned on its own.

	\param header In/out: read position within the text.  On return it points
		to the first character that was not consumed.  It never moves past
		the terminating NUL, so the function can safely be called again after
		the last line was returned (it then yields 0).
	\param buffer In/out: malloc'ed buffer to reuse (may point to NULL).  It
		is grown with realloc() in steps of 64 bytes and handed back here.
		If \a buffer itself is NULL the working buffer is freed on return.
	\param buflen In/out: allocated size of \a buffer; updated on return.
	\return The number of characters stored (including the final '\n'), 0 if
		the read position already was at the end of the text, or ENOMEM if
		the buffer could not be grown.  The buffer is only NUL terminated
		when the result is not negative.
*/
ssize_t
nextfoldedline(const char** header, char **buffer, size_t *buflen)
{
	ssize_t len = buflen && *buflen ? *buflen : 0;
	char * buf = buffer && *buffer ? *buffer : NULL;
	ssize_t cnt = 0; // Number of characters currently in the buffer.
	char c;

	while (true) {
		// Make sure there is space in the buffer for two more characters (one
		// for the next character, and one for the end of string NUL byte).
		if (buf == NULL || cnt + 2 >= len) {
			char *temp = (char *)realloc(buf, len + 64);
			if (temp == NULL) {
				// Out of memory, however existing buffer remains allocated.
				cnt = ENOMEM;
				break;
			}
			len += 64;
			buf = temp;
		}

		// Peek at the next character.  The terminating NUL is deliberately
		// not consumed: stepping over it would make the next call read
		// memory behind the end of the text.
		c = **header;
		if (c == 0) {
			// End of text.  Also make it end of line if there is some text
			// already read in.  If the first thing read was the end, just
			// return an empty string.
			if (cnt > 0) {
				buf[cnt++] = '\n';
				if (buf[cnt-2] == '\r') {
					buf[cnt-2] = '\n';
					--cnt;
				}
			}
			break;
		}
		(*header)++;

		buf[cnt++] = c;

		if (c == '\n') {
			// Convert CRLF end of line to just a LF.  Do it before folding, in
			// case we don't need to fold.
			if (cnt >= 2 && buf[cnt-2] == '\r') {
				buf[cnt-2] = '\n';
				--cnt;
			}
			// If the current line is empty then return it (so that empty lines
			// don't disappear if the next line starts with a space).
			if (cnt <= 1)
				break;
			// If the first character on the next line is white space, fold
			// the lines; otherwise leave that character for the next call.
			c = **header;
			if (c == ' ' || c == '\t') {
				(*header)++;
				buf[cnt-1] = c; // Replace \n with the white space character.
			} else {
				// Not folding, we finished reading a line.
				break;
			}
		}
	}

	if (buf != NULL && cnt >= 0)
		buf[cnt] = '\0';

	if (buffer)
		*buffer = buf;
	else if (buf)
		free(buf);

	if (buflen)
		*buflen = len;

	return cnt;
}
1076 
1077 
1078 void
1079 trim_white_space(BString &string)
1080 {
1081 	int32 i;
1082 	int32 length = string.Length();
1083 	char *buffer = string.LockBuffer(length + 1);
1084 
1085 	while (length > 0 && isspace(buffer[length - 1]))
1086 		length--;
1087 	buffer[length] = '\0';
1088 
1089 	for (i = 0; buffer[i] && isspace(buffer[i]); i++) {}
1090 	if (i != 0) {
1091 		length -= i;
1092 		memmove(buffer,buffer + i,length + 1);
1093 	}
1094 	string.UnlockBuffer(length);
1095 }
1096 
1097 
1098 /*!	Tries to return a human-readable name from the specified
1099 	header parameter (should be from "To:" or "From:").
1100 	Tries to return the name rather than the eMail address.
1101 */
1102 void
1103 extract_address_name(BString &header)
1104 {
1105 	BString name;
1106 	const char *start = header.String();
1107 	const char *stop = start + strlen (start);
1108 
1109 	// Find a string S in the header (email foo) that matches:
1110 	//   Old style name in brackets: foo@bar.com (S)
1111 	//   New style quotes: "S" <foo@bar.com>
1112 	//   New style no quotes if nothing else found: S <foo@bar.com>
1113 	//   If nothing else found then use the whole thing: S
1114 
1115 	for (int i = 0; i <= 3; i++) {
1116 		// Set p1 to the first letter in the name and p2 to just past the last
1117 		// letter in the name.  p2 stays NULL if a name wasn't found in this
1118 		// pass.
1119 		const char *p1 = NULL, *p2 = NULL;
1120 
1121 		switch (i) {
1122 			case 0: // foo@bar.com (S)
1123 				if ((p1 = strchr(start,'(')) != NULL) {
1124 					p1++; // Advance to first letter in the name.
1125 					size_t nest = 1; // Handle nested brackets.
1126 					for (p2 = p1; p2 < stop; ++p2)
1127 					{
1128 						if (*p2 == ')')
1129 							--nest;
1130 						else if (*p2 == '(')
1131 							++nest;
1132 						if (nest <= 0)
1133 							break;
1134 					}
1135 					if (nest != 0)
1136 						p2 = NULL; // False alarm, no terminating bracket.
1137 				}
1138 				break;
1139 			case 1: // "S" <foo@bar.com>
1140 				if ((p1 = strchr(start, '\"')) != NULL)
1141 					p2 = strchr(++p1, '\"');
1142 				break;
1143 			case 2: // S <foo@bar.com>
1144 				p1 = start;
1145 				if (name.Length() == 0)
1146 					p2 = strchr(start, '<');
1147 				break;
1148 			case 3: // S
1149 				p1 = start;
1150 				if (name.Length() == 0)
1151 					p2 = stop;
1152 				break;
1153 		}
1154 
1155 		// Remove leading and trailing space-like characters and save the
1156 		// result if it is longer than any other likely names found.
1157 		if (p2 != NULL) {
1158 			while (p1 < p2 && (isspace (*p1)))
1159 				++p1;
1160 
1161 			while (p1 < p2 && (isspace (p2[-1])))
1162 				--p2;
1163 
1164 			int newLength = p2 - p1;
1165 			if (name.Length() < newLength)
1166 				name.SetTo(p1, newLength);
1167 		}
1168 	}
1169 
1170 	int32 lessIndex = name.FindFirst('<');
1171 	int32 greaterIndex = name.FindLast('>');
1172 
1173 	if (lessIndex == 0) {
1174 		// Have an address of the form <address> and nothing else, so remove
1175 		// the greater and less than signs, if any.
1176 		if (greaterIndex > 0)
1177 			name.Remove(greaterIndex, 1);
1178 		name.Remove(lessIndex, 1);
1179 	} else if (lessIndex > 0 && lessIndex < greaterIndex) {
1180 		// Yahoo stupidly inserts the e-mail address into the name string, so
1181 		// this bit of code fixes: "Joe <joe@yahoo.com>" <joe@yahoo.com>
1182 		name.Remove(lessIndex, greaterIndex - lessIndex + 1);
1183 	}
1184 
1185 	trim_white_space(name);
1186 	header = name;
1187 }
1188 
1189 
1190 /*!	Given a subject in a BString, remove the extraneous RE: re: and other stuff
1191 	to get down to the core subject string, which should be identical for all
1192 	messages posted about a topic.  The input string is modified in place to
1193 	become the output core subject string.
1194 */
1195 void
1196 SubjectToThread (BString &string)
1197 {
1198 // a regex that matches a non-ASCII UTF8 character:
1199 #define U8C \
1200 	"[\302-\337][\200-\277]" \
1201 	"|\340[\302-\337][\200-\277]" \
1202 	"|[\341-\357][\200-\277][\200-\277]" \
1203 	"|\360[\220-\277][\200-\277][\200-\277]" \
1204 	"|[\361-\367][\200-\277][\200-\277][\200-\277]" \
1205 	"|\370[\210-\277][\200-\277][\200-\277][\200-\277]" \
1206 	"|[\371-\373][\200-\277][\200-\277][\200-\277][\200-\277]" \
1207 	"|\374[\204-\277][\200-\277][\200-\277][\200-\277][\200-\277]" \
1208 	"|\375[\200-\277][\200-\277][\200-\277][\200-\277][\200-\277]"
1209 
1210 #define PATTERN \
1211 	"^ +" \
1212 	"|^(\\[[^]]*\\])(\\<|  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1213 	"|^(  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1214 	"| *\\(fwd\\) *$"
1215 
1216 	if (gRebuf == NULL && atomic_add(&gLocker, 1) == 0) {
1217 		// the idea is to compile the regexp once to speed up testing
1218 
1219 		for (int i=0; i<256; ++i) gTranslation[i]=i;
1220 		for (int i='a'; i<='z'; ++i) gTranslation[i]=toupper(i);
1221 
1222 		gRe.translate = gTranslation;
1223 		gRe.regs_allocated = REGS_FIXED;
1224 		re_syntax_options = RE_SYNTAX_POSIX_EXTENDED;
1225 
1226 		const char *pattern = PATTERN;
1227 		// count subexpressions in PATTERN
1228 		for (unsigned int i=0; pattern[i] != 0; ++i)
1229 		{
1230 			if (pattern[i] == '\\')
1231 				++i;
1232 			else if (pattern[i] == '(')
1233 				++gNsub;
1234 		}
1235 
1236 		const char *err = re_compile_pattern(pattern,strlen(pattern),&gRe);
1237 		if (err == NULL)
1238 			gRebuf = &gRe;
1239 		else
1240 			fprintf(stderr, "Failed to compile the regex: %s\n", err);
1241 	} else {
1242 		int32 tries = 200;
1243 		while (gRebuf == NULL && tries-- > 0)
1244 			snooze(10000);
1245 	}
1246 
1247 	if (gRebuf) {
1248 		struct re_registers regs;
1249 		// can't be static if this function is to be thread-safe
1250 
1251 		regs.num_regs = gNsub;
1252 		regs.start = (regoff_t*)malloc(gNsub*sizeof(regoff_t));
1253 		regs.end = (regoff_t*)malloc(gNsub*sizeof(regoff_t));
1254 
1255 		for (int start = 0; (start = re_search(gRebuf, string.String(),
1256 				string.Length(), 0, string.Length(), &regs)) >= 0;) {
1257 			//
1258 			// we found something
1259 			//
1260 
1261 			// don't delete [bemaildaemon]...
1262 			if (start == regs.start[1])
1263 				start = regs.start[2];
1264 
1265 			string.Remove(start,regs.end[0]-start);
1266 			if (start)
1267 				string.Insert(' ',1,start);
1268 
1269 			// TODO: for some subjects this results in an endless loop, check
1270 			// why this happen.
1271 			if (regs.end[0] - start <= 1)
1272 				break;
1273 		}
1274 
1275 		free(regs.start);
1276 		free(regs.end);
1277 	}
1278 
1279 	// Finally remove leading and trailing space.  Some software, like
1280 	// tm-edit 1.8, appends a space to the subject, which would break
1281 	// threading if we left it in.
1282 	trim_white_space(string);
1283 }
1284 
1285 
1286 /*!	Converts a date to a time.  Handles numeric time zones too, unlike
1287 	parsedate().  Returns -1 if it fails.
1288 */
1289 time_t
1290 ParseDateWithTimeZone(const char *DateString)
1291 {
1292 	time_t currentTime;
1293 	time_t dateAsTime;
1294 	char tempDateString[80];
1295 	char tempZoneString[6];
1296 	time_t zoneDeltaTime;
1297 	int zoneIndex;
1298 	char *zonePntr;
1299 
1300 	// See if we can remove the time zone portion.  parsedate understands time
1301 	// zone 3 letter names, but doesn't understand the numeric +9999 time zone
1302 	// format.  To do: see if a newer parsedate exists.
1303 
1304 	strncpy (tempDateString, DateString, sizeof (tempDateString));
1305 	tempDateString[sizeof (tempDateString) - 1] = 0;
1306 
1307 	// Remove trailing spaces.
1308 	zonePntr = tempDateString + strlen (tempDateString) - 1;
1309 	while (zonePntr >= tempDateString && isspace (*zonePntr))
1310 		*zonePntr-- = 0;
1311 	if (zonePntr < tempDateString)
1312 		return -1; // Empty string.
1313 
1314 	// Remove the trailing time zone in round brackets, like in
1315 	// Fri, 22 Feb 2002 15:22:42 EST (-0500)
1316 	// Thu, 25 Apr 1996 11:44:19 -0400 (EDT)
1317 	if (tempDateString[strlen(tempDateString)-1] == ')')
1318 	{
1319 		zonePntr = strrchr (tempDateString, '(');
1320 		if (zonePntr != NULL)
1321 		{
1322 			*zonePntr-- = 0; // Zap the '(', then remove trailing spaces.
1323 			while (zonePntr >= tempDateString && isspace (*zonePntr))
1324 				*zonePntr-- = 0;
1325 			if (zonePntr < tempDateString)
1326 				return -1; // Empty string.
1327 		}
1328 	}
1329 
1330 	// Look for a numeric time zone like  Tue, 30 Dec 2003 05:01:40 +0000
1331 	for (zoneIndex = strlen (tempDateString); zoneIndex >= 0; zoneIndex--)
1332 	{
1333 		zonePntr = tempDateString + zoneIndex;
1334 		if (zonePntr[0] == '+' || zonePntr[0] == '-')
1335 		{
1336 			if (zonePntr[1] >= '0' && zonePntr[1] <= '9' &&
1337 				zonePntr[2] >= '0' && zonePntr[2] <= '9' &&
1338 				zonePntr[3] >= '0' && zonePntr[3] <= '9' &&
1339 				zonePntr[4] >= '0' && zonePntr[4] <= '9')
1340 				break;
1341 		}
1342 	}
1343 	if (zoneIndex >= 0)
1344 	{
1345 		// Remove the zone from the date string and any following time zone
1346 		// letter codes.  Also put in GMT so that the date gets parsed as GMT.
1347 		memcpy (tempZoneString, zonePntr, 5);
1348 		tempZoneString [5] = 0;
1349 		strcpy (zonePntr, "GMT");
1350 	}
1351 	else // No numeric time zone found.
1352 		strcpy (tempZoneString, "+0000");
1353 
1354 	time (&currentTime);
1355 	dateAsTime = parsedate (tempDateString, currentTime);
1356 	if (dateAsTime == (time_t) -1)
1357 		return -1; // Failure.
1358 
1359 	zoneDeltaTime = 60 * atol (tempZoneString + 3); // Get the last two digits - minutes.
1360 	tempZoneString[3] = 0;
1361 	zoneDeltaTime += atol (tempZoneString + 1) * 60 * 60; // Get the first two digits - hours.
1362 	if (tempZoneString[0] == '+')
1363 		zoneDeltaTime = 0 - zoneDeltaTime;
1364 	dateAsTime += zoneDeltaTime;
1365 
1366 	return dateAsTime;
1367 }
1368 
1369 
1370 /*! Parses a mail header and fills the headers BMessage
1371 */
1372 status_t
1373 parse_header(BMessage &headers, BPositionIO &input)
1374 {
1375 	char *buffer = NULL;
1376 	size_t bufferSize = 0;
1377 	int32 length;
1378 
1379 	while ((length = readfoldedline(input, &buffer, &bufferSize)) >= 2) {
1380 		--length;
1381 			// Don't include the \n at the end of the buffer.
1382 
1383 		// convert to UTF-8 and null-terminate the buffer
1384 		length = rfc2047_to_utf8(&buffer, &bufferSize, length);
1385 		buffer[length] = '\0';
1386 
1387 		const char *delimiter = strstr(buffer, ":");
1388 		if (delimiter == NULL)
1389 			continue;
1390 
1391 		BString header(buffer, delimiter - buffer);
1392 		header.CapitalizeEachWord();
1393 			// unified case for later fetch
1394 
1395 		delimiter++; // Skip the colon.
1396 		// Skip over leading white space and tabs.
1397 		// TODO: (comments in brackets).
1398 		while (isspace(*delimiter))
1399 			delimiter++;
1400 
1401 		// TODO: implement joining of multiple header tags (i.e. multiple "Cc:"s)
1402 		headers.AddString(header.String(), delimiter);
1403 	}
1404 	free(buffer);
1405 
1406 	return B_OK;
1407 }
1408 
1409 
1410 status_t
1411 extract_from_header(const BString& header, const BString& field,
1412 	BString& target)
1413 {
1414 	int32 headerLength = header.Length();
1415 	int32 fieldEndPos = 0;
1416 	while (true) {
1417 		int32 pos = header.IFindFirst(field, fieldEndPos);
1418 		if (pos < 0)
1419 			return B_BAD_VALUE;
1420 		fieldEndPos = pos + field.Length();
1421 
1422 		if (pos != 0 && header.ByteAt(pos - 1) != '\n')
1423 			continue;
1424 		if (header.ByteAt(fieldEndPos) == ':')
1425 			break;
1426 	}
1427 	fieldEndPos++;
1428 
1429 	int32 crPos = fieldEndPos;
1430 	while (true) {
1431 		fieldEndPos = crPos;
1432 		crPos = header.FindFirst('\n', crPos);
1433 		if (crPos < 0)
1434 			crPos = headerLength;
1435 		BString temp;
1436 		header.CopyInto(temp, fieldEndPos, crPos - fieldEndPos);
1437 		if (header.ByteAt(crPos - 1) == '\r') {
1438 			temp.Truncate(temp.Length() - 1);
1439 			temp += " ";
1440 		}
1441 		target += temp;
1442 		crPos++;
1443 		if (crPos >= headerLength)
1444 			break;
1445 		char nextByte = header.ByteAt(crPos);
1446 		if (nextByte != ' ' && nextByte != '\t')
1447 			break;
1448 		crPos++;
1449 	}
1450 
1451 	size_t bufferSize = target.Length();
1452 	char* buffer = target.LockBuffer(bufferSize);
1453 	size_t length = rfc2047_to_utf8(&buffer, &bufferSize, bufferSize);
1454 	target.UnlockBuffer(length);
1455 
1456 	trim_white_space(target);
1457 
1458 	return B_OK;
1459 }
1460 
1461 
1462 void
1463 extract_address(BString &address)
1464 {
1465 	const char *string = address.String();
1466 	int32 first;
1467 
1468 	// first, remove all quoted text
1469 
1470 	if ((first = address.FindFirst('"')) >= 0) {
1471 		int32 last = first + 1;
1472 		while (string[last] && string[last] != '"')
1473 			last++;
1474 
1475 		if (string[last] == '"')
1476 			address.Remove(first, last + 1 - first);
1477 	}
1478 
1479 	// try to extract the address now
1480 
1481 	if ((first = address.FindFirst('<')) >= 0) {
1482 		// the world likes us and we can just get the address the easy way...
1483 		int32 last = address.FindFirst('>');
1484 		if (last >= 0) {
1485 			address.Truncate(last);
1486 			address.Remove(0, first + 1);
1487 
1488 			return;
1489 		}
1490 	}
1491 
1492 	// then, see if there is anything in parenthesis to throw away
1493 
1494 	if ((first = address.FindFirst('(')) >= 0) {
1495 		int32 last = first + 1;
1496 		while (string[last] && string[last] != ')')
1497 			last++;
1498 
1499 		if (string[last] == ')')
1500 			address.Remove(first, last + 1 - first);
1501 	}
1502 
1503 	// now, there shouldn't be much else left
1504 
1505 	trim_white_space(address);
1506 }
1507 
1508 
1509 void
1510 get_address_list(BList &list, const char *string,
1511 	void (*cleanupFunc)(BString &))
1512 {
1513 	if (string == NULL || !string[0])
1514 		return;
1515 
1516 	const char *start = string;
1517 
1518 	while (true) {
1519 		if (string[0] == '"') {
1520 			const char *quoteEnd = ++string;
1521 
1522 			while (quoteEnd[0] && quoteEnd[0] != '"')
1523 				quoteEnd++;
1524 
1525 			if (!quoteEnd[0])	// string exceeds line!
1526 				quoteEnd = string;
1527 
1528 			string = quoteEnd + 1;
1529 		}
1530 
1531 		if (string[0] == ',' || string[0] == '\0') {
1532 			BString address(start, string - start);
1533 			trim_white_space(address);
1534 
1535 			if (cleanupFunc)
1536 				cleanupFunc(address);
1537 
1538 			list.AddItem(strdup(address.String()));
1539 
1540 			start = string + 1;
1541 		}
1542 
1543 		if (!string[0])
1544 			break;
1545 
1546 		string++;
1547 	}
1548 }
1549 
1550