xref: /haiku/src/kits/mail/mail_util.cpp (revision b289aaf66bbf6e173aa90fa194fc256965f1b34d)
1 /* mail util - header parsing
2 **
3 ** Copyright 2001-2003 Dr. Zoidberg Enterprises. All rights reserved.
4 */
5 
6 
7 #include <UTF8.h>
8 #include <Message.h>
9 #include <String.h>
10 #include <Locker.h>
11 #include <DataIO.h>
12 #include <List.h>
13 
14 #include <stdlib.h>
15 #include <string.h>
16 #include <stdio.h>
17 #include <regex.h>
18 #include <ctype.h>
19 #include <errno.h>
20 #include <parsedate.h>
21 
22 #include <mail_encoding.h>
23 
24 #include <mail_util.h>
25 
26 #include <CharacterSet.h>
27 #include <CharacterSetRoster.h>
28 
29 using namespace BPrivate;
30 
31 #define CRLF   "\r\n"
32 
// One row of a character set lookup table: pairs a character set name with
// the numeric conversion identifier used by the UTF-8 conversion routines.
struct CharsetConversionEntry
{
	const char *charset;	// Character set name; NULL marks the end of a table.
	uint32 flavor;			// Conversion identifier associated with that name.
};
38 
// Table of the character set names known to the mail kit and the conversion
// identifier each one maps to.  Several names may share one identifier; the
// table ends with an entry whose charset pointer is NULL.
// NOTE(review): big5 and the GB family use the raw numbers 24 and 25 instead
// of named constants - presumably conversion identifiers that had no symbolic
// name when this was written; confirm against UTF8.h.
extern const CharsetConversionEntry mail_charsets [] =
{
	// In order of authority, so when searching for the name for a particular
	// numbered conversion, start at the beginning of the array.
	{"iso-8859-1",  B_ISO1_CONVERSION}, // MIME STANDARD
	{"iso-8859-2",  B_ISO2_CONVERSION}, // MIME STANDARD
	{"iso-8859-3",  B_ISO3_CONVERSION}, // MIME STANDARD
	{"iso-8859-4",  B_ISO4_CONVERSION}, // MIME STANDARD
	{"iso-8859-5",  B_ISO5_CONVERSION}, // MIME STANDARD
	{"iso-8859-6",  B_ISO6_CONVERSION}, // MIME STANDARD
	{"iso-8859-7",  B_ISO7_CONVERSION}, // MIME STANDARD
	{"iso-8859-8",  B_ISO8_CONVERSION}, // MIME STANDARD
	{"iso-8859-9",  B_ISO9_CONVERSION}, // MIME STANDARD
	{"iso-8859-10", B_ISO10_CONVERSION}, // MIME STANDARD
	{"iso-8859-13", B_ISO13_CONVERSION}, // MIME STANDARD
	{"iso-8859-14", B_ISO14_CONVERSION}, // MIME STANDARD
	{"iso-8859-15", B_ISO15_CONVERSION}, // MIME STANDARD

	{"shift_jis",	B_SJIS_CONVERSION}, // MIME STANDARD
	{"shift-jis",	B_SJIS_CONVERSION}, // Alternate spelling of the above.
	{"iso-2022-jp", B_JIS_CONVERSION}, // MIME STANDARD
	{"euc-jp",		B_EUC_CONVERSION}, // MIME STANDARD

	{"euc-kr",      B_EUC_KR_CONVERSION}, // Shift encoding 7 bit and KSC-5601 if bit 8 is on. // MIME STANDARD
	{"ksc5601",		B_EUC_KR_CONVERSION},    // Not sure if 7 or 8 bit. // COMPATIBLE?
	{"ks_c_5601-1987", B_EUC_KR_CONVERSION}, // Not sure if 7 or 8 bit. // COMPATIBLE with some Microsoft software

	{"koi8-r",      B_KOI8R_CONVERSION},           // MIME STANDARD
	{"windows-1251",B_MS_WINDOWS_1251_CONVERSION}, // MIME STANDARD
	{"windows-1252",B_MS_WINDOWS_CONVERSION},      // MIME STANDARD

	{"dos-437",     B_MS_DOS_CONVERSION},     // WRONG NAME : MIME STANDARD NAME = NONE ( IBM437? )
	{"dos-866",     B_MS_DOS_866_CONVERSION}, // WRONG NAME : MIME STANDARD NAME = NONE ( IBM866? )
	{"x-mac-roman", B_MAC_ROMAN_CONVERSION},  // WRONG NAME : MIME STANDARD NAME = NONE ( macintosh? + x-mac-roman? )

    {"big5",        24}, // MIME STANDARD

    {"gb18030",     25}, // WRONG NAME : MIME STANDARD NAME = NONE ( GB18030? )
    {"gb2312",      25}, // COMPATIBLE
    {"gbk",         25}, // COMPATIBLE

	/* {"utf-16",		B_UNICODE_CONVERSION}, Might not work due to NULs in text, needs testing. */
	{"us-ascii",	B_MAIL_US_ASCII_CONVERSION},                                  // MIME STANDARD
	{"utf-8",		B_MAIL_UTF8_CONVERSION /* Special code for no conversion */}, // MIME STANDARD

	{NULL, (uint32) -1} /* End of list marker, NULL string pointer is the key. */
};
86 
87 
88 // The next couple of functions are our wrapper around convert_to_utf8 and
89 // convert_from_utf8 so that they can also convert from UTF-8 to UTF-8 by
90 // specifying the B_MAIL_UTF8_CONVERSION constant as the conversion operation.  It
91 // also lets us add new conversions, like B_MAIL_US_ASCII_CONVERSION.
92 
93 _EXPORT status_t mail_convert_to_utf8 (
94 	uint32 srcEncoding,
95 	const char *src,
96 	int32 *srcLen,
97 	char *dst,
98 	int32 *dstLen,
99 	int32 *state,
100 	char substitute)
101 {
102 	int32    copyAmount;
103 	char    *originalDst = dst;
104 	status_t returnCode = -1;
105 
106 	if (srcEncoding == B_MAIL_UTF8_CONVERSION) {
107 		copyAmount = *srcLen;
108 		if (*dstLen < copyAmount)
109 			copyAmount = *dstLen;
110 		memcpy (dst, src, copyAmount);
111 		*srcLen = copyAmount;
112 		*dstLen = copyAmount;
113 		returnCode = B_OK;
114 	} else if (srcEncoding == B_MAIL_US_ASCII_CONVERSION) {
115 		int32 i;
116 		unsigned char letter;
117 		copyAmount = *srcLen;
118 		if (*dstLen < copyAmount)
119 			copyAmount = *dstLen;
120 		for (i = 0; i < copyAmount; i++) {
121 			letter = *src++;
122 			if (letter > 0x80U)
123 				// Invalid, could also use substitute, but better to strip high bit.
124 				*dst++ = letter - 0x80U;
125 			else if (letter == 0x80U)
126 				// Can't convert to 0x00 since that's NUL, which would cause problems.
127 				*dst++ = substitute;
128 			else
129 				*dst++ = letter;
130 		}
131 		*srcLen = copyAmount;
132 		*dstLen = copyAmount;
133 		returnCode = B_OK;
134 	} else
135 		returnCode = convert_to_utf8 (srcEncoding, src, srcLen,
136 			dst, dstLen, state, substitute);
137 
138 	if (returnCode == B_OK) {
139 		// Replace spurious NUL bytes, which should normally not be in the
140 		// output of the decoding (not normal UTF-8 characters, and no NULs are
141 		// in our usual input strings).  They happen for some odd ISO-2022-JP
142 		// byte pair combinations which are improperly handled by the BeOS
143 		// routines.  Like "\e$ByD\e(B" where \e is the ESC character $1B, the
144 		// first ESC $ B switches to a Japanese character set, then the next
145 		// two bytes "yD" specify a character, then ESC ( B switches back to
146 		// the ASCII character set.  The UTF-8 conversion yields a NUL byte.
147 		int32 i;
148 		for (i = 0; i < *dstLen; i++)
149 			if (originalDst[i] == 0)
150 				originalDst[i] = substitute;
151 	}
152 	return returnCode;
153 }
154 
155 
156 _EXPORT status_t mail_convert_from_utf8 (
157 	uint32 dstEncoding,
158 	const char *src,
159 	int32 *srcLen,
160 	char *dst,
161 	int32 *dstLen,
162 	int32 *state,
163 	char substitute)
164 {
165 	int32		copyAmount;
166 	status_t	errorCode;
167 	int32		originalDstLen = *dstLen;
168 	int32		tempDstLen;
169 	int32		tempSrcLen;
170 
171 	if (dstEncoding == B_MAIL_UTF8_CONVERSION)
172 	{
173 		copyAmount = *srcLen;
174 		if (*dstLen < copyAmount)
175 			copyAmount = *dstLen;
176 		memcpy (dst, src, copyAmount);
177 		*srcLen = copyAmount;
178 		*dstLen = copyAmount;
179 		return B_OK;
180 	}
181 
182 	if (dstEncoding == B_MAIL_US_ASCII_CONVERSION)
183 	{
184 		int32			characterLength;
185 		int32			dstRemaining = *dstLen;
186 		unsigned char	letter;
187 		int32			srcRemaining = *srcLen;
188 
189 		// state contains the number of source bytes to skip, left over from a
190 		// partial UTF-8 character split over the end of the buffer from last
191 		// time.
192 		if (srcRemaining <= *state) {
193 			*state -= srcRemaining;
194 			*dstLen = 0;
195 			return B_OK;
196 		}
197 		srcRemaining -= *state;
198 		src += *state;
199 		*state = 0;
200 
201 		while (true) {
202 			if (srcRemaining <= 0 || dstRemaining <= 0)
203 				break;
204 			letter = *src;
205 			if (letter < 0x80)
206 				characterLength = 1; // Regular ASCII equivalent code.
207 			else if (letter < 0xC0)
208 				characterLength = 1; // Invalid in-between data byte 10xxxxxx.
209 			else if (letter < 0xE0)
210 				characterLength = 2;
211 			else if (letter < 0xF0)
212 				characterLength = 3;
213 			else if (letter < 0xF8)
214 				characterLength = 4;
215 			else if (letter < 0xFC)
216 				characterLength = 5;
217 			else if (letter < 0xFE)
218 				characterLength = 6;
219 			else
220 				characterLength = 1; // 0xFE and 0xFF are invalid in UTF-8.
221 			if (letter < 0x80)
222 				*dst++ = *src;
223 			else
224 				*dst++ = substitute;
225 			dstRemaining--;
226 			if (srcRemaining < characterLength) {
227 				// Character split past the end of the buffer.
228 				*state = characterLength - srcRemaining;
229 				srcRemaining = 0;
230 			} else {
231 				src += characterLength;
232 				srcRemaining -= characterLength;
233 			}
234 		}
235 		// Update with the amounts used.
236 		*srcLen = *srcLen - srcRemaining;
237 		*dstLen = *dstLen - dstRemaining;
238 		return B_OK;
239 	}
240 
241 	errorCode = convert_from_utf8 (dstEncoding, src, srcLen, dst, dstLen, state, substitute);
242 	if (errorCode != B_OK)
243 		return errorCode;
244 
245 	if (dstEncoding != B_JIS_CONVERSION)
246 		return B_OK;
247 
248 	// B_JIS_CONVERSION (ISO-2022-JP) works by shifting between different
249 	// character subsets.  For E-mail headers (and other uses), it needs to be
250 	// switched back to ASCII at the end (otherwise the last character gets
251 	// lost or other weird things happen in the headers).  Note that we can't
252 	// just append the escape code since the convert_from_utf8 "state" will be
253 	// wrong.  So we append an ASCII letter and throw it away, leaving just the
254 	// escape code.  Well, it actually switches to the Roman character set, not
255 	// ASCII, but that should be OK.
256 
257 	tempDstLen = originalDstLen - *dstLen;
258 	if (tempDstLen < 3) // Not enough space remaining in the output.
259 		return B_OK; // Sort of an error, but we did convert the rest OK.
260 	tempSrcLen = 1;
261 	errorCode = convert_from_utf8 (dstEncoding, "a", &tempSrcLen,
262 		dst + *dstLen, &tempDstLen, state, substitute);
263 	if (errorCode != B_OK)
264 		return errorCode;
265 	*dstLen += tempDstLen - 1 /* don't include the ASCII letter */;
266 	return B_OK;
267 }
268 
269 
270 
271 static int handle_non_rfc2047_encoding(char **buffer,size_t *bufferLength,size_t *sourceLength)
272 {
273 	char *string = *buffer;
274 	int32 length = *sourceLength;
275 	int32 i;
276 
277 	// check for 8-bit characters
278 	for (i = 0;i < length;i++)
279 		if (string[i] & 0x80)
280 			break;
281 	if (i == length)
282 		return false;
283 
284 	// check for groups of 8-bit characters - this code is not very smart;
285 	// it just can detect some sort of single-byte encoded stuff, the rest
286 	// is regarded as UTF-8
287 
288 	int32 singletons = 0,doubles = 0;
289 
290 	for (i = 0;i < length;i++)
291 	{
292 		if (string[i] & 0x80)
293 		{
294 			if ((string[i + 1] & 0x80) == 0)
295 				singletons++;
296 			else doubles++;
297 			i++;
298 		}
299 	}
300 
301 	if (singletons != 0)	// can't be valid UTF-8 anymore, so we assume ISO-Latin-1
302 	{
303 		int32 state = 0;
304 		// just to be sure
305 		int32 destLength = length * 4 + 1;
306 		int32 destBufferLength = destLength;
307 		char *dest = (char *)malloc(destLength);
308 		if (dest == NULL)
309 			return 0;
310 
311 		if (convert_to_utf8(B_ISO1_CONVERSION,string,&length,dest,&destLength,&state) == B_OK)
312 		{
313 			free(*buffer);
314 			*buffer = dest;
315 			*bufferLength = destBufferLength;
316 			*sourceLength = destLength;
317 			return true;
318 		}
319 		free(dest);
320 		return false;
321 	}
322 
323 	// we assume a valid UTF-8 string here, but yes, we don't check it
324 	return true;
325 }
326 
327 
// Decodes the RFC 2047 encoded words ("=?charset?e?text?=") found in a
// header string, in place, converting their text to UTF-8.
//
// bufp   - points to the text buffer.  The buffer may be freed and replaced
//          by handle_non_rfc2047_encoding() if the text contains raw 8-bit
//          characters.
// bufLen - points to the size of the buffer.
// strLen - length of the text in the buffer; 0 means measure it with
//          strlen().
//
// Returns -1 if bufp or *bufp is NULL, a negative error code if the most
// recent character set conversion failed, and otherwise the length of the
// resulting text.
// NOTE(review): "ret" only remembers the status of the last conversion tried,
// so one failing word near the end turns the whole call into an error even
// though the word was copied through undecoded - confirm that is intended.
_EXPORT ssize_t rfc2047_to_utf8(char **bufp, size_t *bufLen, size_t strLen)
{
	// head  - start of the buffer, used to turn pointers into offsets.
	// tail  - start of the input text which has not been processed yet.
	// string (declared below) - where the next piece of output gets written;
	//         it never gets ahead of the input unless the text is moved.
	char *head, *tail;
	char *charset, *encoding, *end;
	ssize_t ret = B_OK;

	if (bufp == NULL || *bufp == NULL)
		return -1;

	char *string = *bufp;

	//---------Handle non-RFC compliant, 8bit mail
	// If that reports the text as handled there is nothing left to decode.
	if (handle_non_rfc2047_encoding(bufp,bufLen,&strLen))
		return strLen;

	// set up string length
	if (strLen == 0)
		strLen = strlen(*bufp);
	// Temporarily NUL terminate the text so the str*() functions can be
	// used; the overwritten character is put back at the end.
	char lastChar = (*bufp)[strLen];
	(*bufp)[strLen] = '\0';

	//---------Whew! Now for RFC compliant mail
	bool encodedWordFoundPreviously = false;
	for (head = tail = string;
		((charset = strstr(tail, "=?")) != NULL)
		&& (((encoding = strchr(charset + 2, '?')) != NULL)
			&& encoding[1] && (encoding[2] == '?') && encoding[3])
		&& (end = strstr(encoding + 3, "?=")) != NULL;
		// found "=?...charset...?e?...text...?=   (e == encoding)
		//        ^charset       ^encoding    ^end
		tail = end)
	{
		// Copy non-encoded text (from tail up to charset) to the output.
		// Ignore spaces between two encoded "words".  RFC2047 says the words
		// should be concatenated without the space (designed for Asian
		// sentences which have no spaces yet need to be broken into "words" to
		// keep within the line length limits).
		bool nonSpaceFound = false;
		for (int i = 0; i < charset-tail; i++) {
			if (!isspace (tail[i])) {
				nonSpaceFound = true;
				break;
			}
		}
		if (!encodedWordFoundPreviously || nonSpaceFound) {
			if (string != tail && tail != charset)
				memmove(string, tail, charset-tail);
			string += charset-tail;
		}
		tail = charset;
		encodedWordFoundPreviously = true;

		// move things to point at what they should:
		//   =?...charset...?e?...text...?=   (e == encoding)
		//     ^charset      ^encoding     ^end
		charset += 2;
		encoding += 1;
		end += 2;

		// find the charset this text is in now
		size_t		cLen = encoding - 1 - charset;
		bool		base64encoded = toupper(*encoding) == 'B';
			// Anything other than 'B' is decoded as quoted-printable below.

		uint32 convert_id = B_MAIL_NULL_CONVERSION;
		// NUL terminated copy of the character set name for the lookups.
		char charset_string[cLen+1];
		memcpy(charset_string, charset, cLen);
		charset_string[cLen] = '\0';
		if (strcasecmp(charset_string, "us-ascii") == 0) {
			convert_id = B_MAIL_US_ASCII_CONVERSION;
		} else if (strcasecmp(charset_string, "utf-8") == 0) {
			convert_id = B_MAIL_UTF8_CONVERSION;
		} else {
			const BCharacterSet * cs = BCharacterSetRoster::FindCharacterSetByName(charset_string);
			if (cs != NULL) {
				convert_id = cs->GetConversionID();
			}
		}
		if (convert_id == B_MAIL_NULL_CONVERSION)
		{
			// unidentified charset
			// what to do? doing nothing skips the encoded text;
			// but we should keep it: we copy it to the output.
			if (string != tail && tail != end)
				memmove(string, tail, end-tail);
			string += end-tail;
			continue;
		}
		// else we've successfully identified the charset

		char *src = encoding+2;
		int32 srcLen = end - 2 - src;
		// encoded text: src..src+srcLen

		// decode text, get decoded length (reducing xforms)
		// The decoding is done in place, over the encoded text.
		srcLen = !base64encoded ? decode_qp(src, src, srcLen, 1)
				: decode_base64(src, src, srcLen);

		// allocate space for the converted text
		// NOTE(review): the result of malloc() is not checked before it is
		// handed to the conversion function.
		int32 dstLen = end-string + *bufLen-strLen;
		char *dst = (char*)malloc(dstLen);
		int32 cvLen = srcLen;
		int32 convState = 0;

		//
		// do the conversion
		//
		ret = mail_convert_to_utf8(convert_id, src, &cvLen, dst, &dstLen, &convState);
		if (ret != B_OK)
		{
			// what to do? doing nothing skips the encoded text
			// but we should keep it: we copy it to the output.

			free(dst);

			if (string != tail && tail != end)
				memmove(string, tail, end-tail);
			string += end-tail;
			continue;
		}
		/* convert_to_ is either returning something wrong or my
		   test data is screwed up.  Whatever it is, Not Enough
		   Space is not the only cause of the below, so we just
		   assume it succeeds if it converts anything at all.
		else if (cvLen < srcLen)
		{
			// not enough room to convert the data;
			// grow *buf and retry

			free(dst);

			char *temp = (char*)realloc(*bufp, 2*(*bufLen + 1));
			if (temp == NULL)
			{
				ret = B_NO_MEMORY;
				break;
			}

			*bufp = temp;
			*bufLen = 2*(*bufLen + 1);

			string = *bufp + (string-head);
			tail = *bufp + (tail-head);
			charset = *bufp + (charset-head);
			encoding = *bufp + (encoding-head);
			end = *bufp + (end-head);
			src = *bufp + (src-head);
			head = *bufp;
			continue;
		}
		*/
		else
		{
			// The converted text is longer than the gap between the output
			// position and the end of the encoded word: shift the rest of
			// the input forward to make room.
			if (dstLen > end-string)
			{
				// copy the string forward...
				memmove(string+dstLen, end, strLen - (end-head) + 1);
				strLen += string+dstLen - end;
				end = string + dstLen;
			}

			memcpy(string, dst, dstLen);
			string += dstLen;
			free(dst);
			continue;
		}
	}

	// copy everything that's left
	// (the + 1 takes the terminating NUL along)
	size_t tailLen = strLen - (tail - head);
	memmove(string, tail, tailLen+1);
	string += tailLen;

	// replace the last char
	(*bufp)[strLen] = lastChar;

	return ret < B_OK ? ret : string-head;
}
505 
506 
507 _EXPORT ssize_t utf8_to_rfc2047 (char **bufp, ssize_t length, uint32 charset, char encoding) {
508 	struct word {
509 		BString	originalWord;
510 		BString	convertedWord;
511 		bool	needsEncoding;
512 
513 		// Convert the word from UTF-8 to the desired character set.  The
514 		// converted version also includes the escape codes to return to ASCII
515 		// mode, if relevant.  Also note if it uses unprintable characters,
516 		// which means it will need that special encoding treatment later.
517 		void ConvertWordToCharset (uint32 charset) {
518 			int32 state = 0;
519 			int32 originalLength = originalWord.Length();
520 			int32 convertedLength = originalLength * 5 + 1;
521 			char *convertedBuffer = convertedWord.LockBuffer (convertedLength);
522 			mail_convert_from_utf8 (charset, originalWord.String(),
523 				&originalLength, convertedBuffer, &convertedLength, &state);
524 			for (int i = 0; i < convertedLength; i++) {
525 				if ((convertedBuffer[i] & (1 << 7)) ||
526 					(convertedBuffer[i] >= 0 && convertedBuffer[i] < 32)) {
527 					needsEncoding = true;
528 					break;
529 				}
530 			}
531 			convertedWord.UnlockBuffer (convertedLength);
532 		};
533 	};
534 	struct word *currentWord;
535 	BList words;
536 
537 	// Break the header into words.  White space characters (including tabs and
538 	// newlines) separate the words.  Each word includes any space before it as
539 	// part of the word.  Actually, quotes and other special characters
540 	// (",()<>@) are treated as separate words of their own so that they don't
541 	// get encoded (because MIME headers get the quotes parsed before character
542 	// set unconversion is done).  The reader is supposed to ignore all white
543 	// space between encoded words, which can be inserted so that older mail
544 	// parsers don't have overly long line length problems.
545 
546 	const char *source = *bufp;
547 	const char *bufEnd = *bufp + length;
548 	const char *specialChars = "\"()<>@,";
549 
550 	while (source < bufEnd) {
551 		currentWord = new struct word;
552 		currentWord->needsEncoding = false;
553 
554 		int wordEnd = 0;
555 
556 		// Include leading spaces as part of the word.
557 		while (source + wordEnd < bufEnd && isspace (source[wordEnd]))
558 			wordEnd++;
559 
560 		if (source + wordEnd < bufEnd &&
561 			strchr (specialChars, source[wordEnd]) != NULL) {
562 			// Got a quote mark or other special character, which is treated as
563 			// a word in itself since it shouldn't be encoded, which would hide
564 			// it from the mail system.
565 			wordEnd++;
566 		} else {
567 			// Find the end of the word.  Leave wordEnd pointing just after the
568 			// last character in the word.
569 			while (source + wordEnd < bufEnd) {
570 				if (isspace(source[wordEnd]) ||
571 					strchr (specialChars, source[wordEnd]) != NULL)
572 					break;
573 				if (wordEnd > 51 /* Makes Base64 ISO-2022-JP "word" a multiple of 4 bytes */ &&
574 					0xC0 == (0xC0 & (unsigned int) source[wordEnd])) {
575 					// No English words are that long (46 is the longest),
576 					// break up what is likely Asian text (which has no spaces)
577 					// at the start of the next non-ASCII UTF-8 character (high
578 					// two bits are both ones).  Note that two encoded words in
579 					// a row get joined together, even if there is a space
580 					// between them in the final output text, according to the
581 					// standard.  Next word will also be conveniently get
582 					// encoded due to the 0xC0 test.
583 					currentWord->needsEncoding = true;
584 					break;
585 				}
586 				wordEnd++;
587 			}
588 		}
589 		currentWord->originalWord.SetTo (source, wordEnd);
590 		currentWord->ConvertWordToCharset (charset);
591 		words.AddItem(currentWord);
592 		source += wordEnd;
593 	}
594 
595 	// Combine adjacent words which contain unprintable text so that the
596 	// overhead of switching back and forth between regular text and specially
597 	// encoded text is reduced.  However, the combined word must be shorter
598 	// than the maximum of 75 bytes, including character set specification and
599 	// all those delimiters (worst case 22 bytes of overhead).
600 
601 	struct word *run;
602 
603 	for (int32 i = 0; (currentWord = (struct word *) words.ItemAt (i)) != NULL; i++) {
604 		if (!currentWord->needsEncoding)
605 			continue; // No need to combine unencoded words.
606 		for (int32 g = i+1; (run = (struct word *) words.ItemAt (g)) != NULL; g++) {
607 			if (!run->needsEncoding)
608 				break; // Don't want to combine encoded and unencoded words.
609 			if ((currentWord->convertedWord.Length() + run->convertedWord.Length() <= 53)) {
610 				currentWord->originalWord.Append (run->originalWord);
611 				currentWord->ConvertWordToCharset (charset);
612 				words.RemoveItem(g);
613 				delete run;
614 				g--;
615 			} else // Can't merge this word, result would be too long.
616 				break;
617 		}
618 	}
619 
620 	// Combine the encoded and unencoded words into one line, doing the
621 	// quoted-printable or base64 encoding.  Insert an extra space between
622 	// words which are both encoded to make word wrapping easier, since there
623 	// is normally none, and you're allowed to insert space (the receiver
624 	// throws it away if it is between encoded words).
625 
626 	BString rfc2047;
627 	bool	previousWordNeededEncoding = false;
628 
629 	const char *charset_dec = "none-bug";
630 	for (int32 i = 0; mail_charsets[i].charset != NULL; i++) {
631 		if (mail_charsets[i].flavor == charset) {
632 			charset_dec = mail_charsets[i].charset;
633 			break;
634 		}
635 	}
636 
637 	while ((currentWord = (struct word *)words.RemoveItem(0L)) != NULL) {
638 		if ((encoding != quoted_printable && encoding != base64) ||
639 		!currentWord->needsEncoding) {
640 			rfc2047.Append (currentWord->convertedWord);
641 		} else {
642 			// This word needs encoding.  Try to insert a space between it and
643 			// the previous word.
644 			if (previousWordNeededEncoding)
645 				rfc2047 << ' '; // Can insert as many spaces as you want between encoded words.
646 			else {
647 				// Previous word is not encoded, spaces are significant.  Try
648 				// to move a space from the start of this word to be outside of
649 				// the encoded text, so that there is a bit of space between
650 				// this word and the previous one to enhance word wrapping
651 				// chances later on.
652 				if (currentWord->originalWord.Length() > 1 &&
653 					isspace (currentWord->originalWord[0])) {
654 					rfc2047 << currentWord->originalWord[0];
655 					currentWord->originalWord.Remove (0 /* offset */, 1 /* length */);
656 					currentWord->ConvertWordToCharset (charset);
657 				}
658 			}
659 
660 			char *encoded = NULL;
661 			ssize_t encoded_len = 0;
662 			int32 convertedLength = currentWord->convertedWord.Length ();
663 			const char *convertedBuffer = currentWord->convertedWord.String ();
664 
665 			switch (encoding) {
666 				case quoted_printable:
667 					encoded = (char *) malloc (convertedLength * 3);
668 					encoded_len = encode_qp (encoded, convertedBuffer, convertedLength, true /* headerMode */);
669 					break;
670 				case base64:
671 					encoded = (char *) malloc (convertedLength * 2);
672 					encoded_len = encode_base64 (encoded, convertedBuffer, convertedLength, true /* headerMode */);
673 					break;
674 				default: // Unknown encoding type, shouldn't happen.
675 					encoded = (char *) convertedBuffer;
676 					encoded_len = convertedLength;
677 					break;
678 			}
679 
680 			rfc2047 << "=?" << charset_dec << '?' << encoding << '?';
681 			rfc2047.Append (encoded, encoded_len);
682 			rfc2047 << "?=";
683 
684 			if (encoding == quoted_printable || encoding == base64)
685 				free(encoded);
686 		}
687 		previousWordNeededEncoding = currentWord->needsEncoding;
688 		delete currentWord;
689 	}
690 
691 	free(*bufp);
692 
693 	ssize_t finalLength = rfc2047.Length ();
694 	*bufp = (char *) (malloc (finalLength + 1));
695 	memcpy (*bufp, rfc2047.String(), finalLength);
696 	(*bufp)[finalLength] = 0;
697 
698 	return finalLength;
699 }
700 
701 
702 //====================================================================
703 
704 void FoldLineAtWhiteSpaceAndAddCRLF (BString &string)
705 {
706 	int			inputLength = string.Length();
707 	int			lineStartIndex;
708 	const int	maxLineLength = 78; // Doesn't include CRLF.
709 	BString		output;
710 	int			splitIndex;
711 	int			tempIndex;
712 
713 	lineStartIndex = 0;
714 	while (true) {
715 		// If we don't need to wrap the text, just output the remainder, if any.
716 
717 		if (lineStartIndex + maxLineLength >= inputLength) {
718 			if (lineStartIndex < inputLength) {
719 				output.Insert (string, lineStartIndex /* source offset */,
720 					inputLength - lineStartIndex /* count */,
721 					output.Length() /* insert at */);
722 				output.Append (CRLF);
723 			}
724 			break;
725 		}
726 
727 		// Look ahead for a convenient spot to split it, between a comma and
728 		// space, which you often see between e-mail addresses like this:
729 		// "Joe Who" joe@dot.com, "Someone Else" else@blot.com
730 
731 		tempIndex = lineStartIndex + maxLineLength;
732 		if (tempIndex > inputLength)
733 			tempIndex = inputLength;
734 		splitIndex = string.FindLast (", ", tempIndex);
735 		if (splitIndex >= lineStartIndex)
736 			splitIndex++; // Point to the space character.
737 
738 		// If none of those exist, try splitting at any white space.
739 
740 		if (splitIndex <= lineStartIndex)
741 			splitIndex = string.FindLast (" ", tempIndex);
742 		if (splitIndex <= lineStartIndex)
743 			splitIndex = string.FindLast ("\t", tempIndex);
744 
745 		// If none of those exist, allow for a longer word - split at the next
746 		// available white space.
747 
748 		if (splitIndex <= lineStartIndex)
749 			splitIndex = string.FindFirst (" ", lineStartIndex + 1);
750 		if (splitIndex <= lineStartIndex)
751 			splitIndex = string.FindFirst ("\t", lineStartIndex + 1);
752 
753 		// Give up, the whole rest of the line can't be split, just dump it
754 		// out.
755 
756 		if (splitIndex <= lineStartIndex) {
757 			if (lineStartIndex < inputLength) {
758 				output.Insert (string, lineStartIndex /* source offset */,
759 					inputLength - lineStartIndex /* count */,
760 					output.Length() /* insert at */);
761 				output.Append (CRLF);
762 			}
763 			break;
764 		}
765 
766 		// Do the split.  The current line up to but not including the space
767 		// gets output, followed by a CRLF.  The space remains to become the
768 		// start of the next line (and that tells the message reader that it is
769 		// a continuation line).
770 
771 		output.Insert (string, lineStartIndex /* source offset */,
772 			splitIndex - lineStartIndex /* count */,
773 			output.Length() /* insert at */);
774 		output.Append (CRLF);
775 		lineStartIndex = splitIndex;
776 	}
777 	string.SetTo (output);
778 }
779 
780 
781 //====================================================================
782 
783 _EXPORT ssize_t readfoldedline(FILE *file, char **buffer, size_t *buflen)
784 {
785 	ssize_t len = buflen && *buflen ? *buflen : 0;
786 	char * buf = buffer && *buffer ? *buffer : NULL;
787 	ssize_t cnt = 0; // Number of characters currently in the buffer.
788 	int c;
789 
790 	while (true)
791 	{
792 		// Make sure there is space in the buffer for two more characters (one
793 		// for the next character, and one for the end of string NUL byte).
794 		if (buf == NULL || cnt + 2 >= len)
795 		{
796 			char *temp = (char *)realloc(buf, len + 64);
797 			if (temp == NULL) {
798 				// Out of memory, however existing buffer remains allocated.
799 				cnt = ENOMEM;
800 				break;
801 			}
802 			len += 64;
803 			buf = temp;
804 		}
805 
806 		// Read the next character, or end of file, or IO error.
807 		if ((c = fgetc(file)) == EOF) {
808 			if (ferror (file)) {
809 				cnt = errno;
810 				if (cnt >= 0)
811 					cnt = -1; // Error codes must be negative.
812 			} else {
813 				// Really is end of file.  Also make it end of line if there is
814 				// some text already read in.  If the first thing read was EOF,
815 				// just return an empty string.
816 				if (cnt > 0) {
817 					buf[cnt++] = '\n';
818 					if (buf[cnt-2] == '\r') {
819 						buf[cnt-2] = '\n';
820 						--cnt;
821 					}
822 				}
823 			}
824 			break;
825 		}
826 
827 		buf[cnt++] = c;
828 
829 		if (c == '\n') {
830 			// Convert CRLF end of line to just a LF.  Do it before folding, in
831 			// case we don't need to fold.
832 			if (cnt >= 2 && buf[cnt-2] == '\r') {
833 				buf[cnt-2] = '\n';
834 				--cnt;
835 			}
836 			// If the current line is empty then return it (so that empty lines
837 			// don't disappear if the next line starts with a space).
838 			if (cnt <= 1)
839 				break;
840 			// Fold if first character on the next line is whitespace.
841 			c = fgetc(file); // Note it's OK to read EOF and ungetc it too.
842 			if (c == ' ' || c == '\t')
843 				buf[cnt-1] = c; // Replace \n with the white space character.
844 			else {
845 				// Not folding, we finished reading a line; break out of the loop
846 				ungetc(c,file);
847 				break;
848 			}
849 		}
850 	}
851 
852 
853 	if (buf != NULL && cnt >= 0)
854 		buf[cnt] = '\0';
855 
856 	if (buffer)
857 		*buffer = buf;
858 	else if (buf)
859 		free(buf);
860 
861 	if (buflen)
862 		*buflen = len;
863 
864 	return cnt;
865 }
866 
867 
868 //====================================================================
869 
870 _EXPORT ssize_t readfoldedline(BPositionIO &in, char **buffer, size_t *buflen)
871 {
872 	ssize_t len = buflen && *buflen ? *buflen : 0;
873 	char * buf = buffer && *buffer ? *buffer : NULL;
874 	ssize_t cnt = 0; // Number of characters currently in the buffer.
875 	char c;
876 	status_t errorCode;
877 
878 	while (true)
879 	{
880 		// Make sure there is space in the buffer for two more characters (one
881 		// for the next character, and one for the end of string NUL byte).
882 		if (buf == NULL || cnt + 2 >= len)
883 		{
884 			char *temp = (char *)realloc(buf, len + 64);
885 			if (temp == NULL) {
886 				// Out of memory, however existing buffer remains allocated.
887 				cnt = ENOMEM;
888 				break;
889 			}
890 			len += 64;
891 			buf = temp;
892 		}
893 
894 		errorCode = in.Read (&c,1); // A really slow way of reading - unbuffered.
895 		if (errorCode != 1) {
896 			if (errorCode < 0) {
897 				cnt = errorCode; // IO error encountered, just return the code.
898 			} else {
899 				// Really is end of file.  Also make it end of line if there is
900 				// some text already read in.  If the first thing read was EOF,
901 				// just return an empty string.
902 				if (cnt > 0) {
903 					buf[cnt++] = '\n';
904 					if (buf[cnt-2] == '\r') {
905 						buf[cnt-2] = '\n';
906 						--cnt;
907 					}
908 				}
909 			}
910 			break;
911 		}
912 
913 		buf[cnt++] = c;
914 
915 		if (c == '\n') {
916 			// Convert CRLF end of line to just a LF.  Do it before folding, in
917 			// case we don't need to fold.
918 			if (cnt >= 2 && buf[cnt-2] == '\r') {
919 				buf[cnt-2] = '\n';
920 				--cnt;
921 			}
922 			// If the current line is empty then return it (so that empty lines
923 			// don't disappear if the next line starts with a space).
924 			if (cnt <= 1)
925 				break;
926 			// if first character on the next line is whitespace, fold lines
927 			errorCode = in.Read(&c,1);
928 			if (errorCode == 1) {
929 				if (c == ' ' || c == '\t')
930 					buf[cnt-1] = c; // Replace \n with the white space character.
931 				else {
932 					// Not folding, we finished reading a whole line.
933 					in.Seek(-1,SEEK_CUR); // Undo the look-ahead character read.
934 					break;
935 				}
936 			} else if (errorCode < 0) {
937 				cnt = errorCode;
938 				break;
939 			} else // No next line; at the end of the file.  Return the line.
940 				break;
941 		}
942 	}
943 
944 	if (buf != NULL && cnt >= 0)
945 		buf[cnt] = '\0';
946 
947 	if (buffer)
948 		*buffer = buf;
949 	else if (buf)
950 		free(buf);
951 
952 	if (buflen)
953 		*buflen = len;
954 
955 	return cnt;
956 }
957 
958 
// Extracts the next logical (unfolded) header line from a NUL terminated
// string.
//
// Works like readfoldedline(), but reads from memory: characters are taken
// from *header, CRLF line ends become a single LF, and a line end followed by
// a space or tab is folded by replacing the line end with that white space
// character.
//
// header  - in/out cursor into the text.  On return it points at the first
//           character that was not consumed.  When the end of the text is
//           reached it is left pointing AT the terminating NUL (never past
//           it), so calling the function again simply yields an empty line.
// buffer  - optional in/out pointer to a malloc'ed buffer that is reused and
//           grown with realloc() as needed.  If NULL, the internal buffer is
//           freed before returning.
// buflen  - optional in/out allocated size of *buffer.
//
// Returns the number of characters stored (including the trailing '\n', not
// including the terminating NUL), 0 when there was no more text, or ENOMEM
// if the buffer could not be grown.
// NOTE(review): callers that test for "< 0" rely on ENOMEM being a negative
// value on this platform - confirm before porting.
_EXPORT ssize_t
nextfoldedline(const char** header, char **buffer, size_t *buflen)
{
	ssize_t len = buflen && *buflen ? *buflen : 0;
	char * buf = buffer && *buffer ? *buffer : NULL;
	ssize_t cnt = 0; // Number of characters currently in the buffer.
	char c;

	while (true)
	{
		// Make sure there is space in the buffer for two more characters (one
		// for the next character, and one for the end of string NUL byte).
		if (buf == NULL || cnt + 2 >= len)
		{
			char *temp = (char *)realloc(buf, len + 64);
			if (temp == NULL) {
				// Out of memory, however existing buffer remains allocated.
				cnt = ENOMEM;
				break;
			}
			len += 64;
			buf = temp;
		}

		// Peek at the next character.  The cursor is only advanced for real
		// characters, so it can never move beyond the terminating NUL.
		c = **header;
		if (c == 0) {
			// End of text.  Also make it end of line if there is some text
			// already read in.  If the first thing read was the end of the
			// text, just return an empty string.
			if (cnt > 0) {
				buf[cnt++] = '\n';
				if (buf[cnt-2] == '\r') {
					buf[cnt-2] = '\n';
					--cnt;
				}
			}
			break;
		}
		(*header)++;

		buf[cnt++] = c;

		if (c == '\n') {
			// Convert CRLF end of line to just a LF.  Do it before folding, in
			// case we don't need to fold.
			if (cnt >= 2 && buf[cnt-2] == '\r') {
				buf[cnt-2] = '\n';
				--cnt;
			}
			// If the current line is empty then return it (so that empty lines
			// don't disappear if the next line starts with a space).
			if (cnt <= 1)
				break;
			// If the first character on the next line is whitespace, fold the
			// lines by replacing the \n with that white space character.
			c = **header;
			if (c != ' ' && c != '\t')
				break; // Not folding, we finished reading a whole line.
			(*header)++;
			buf[cnt-1] = c;
		}
	}

	if (buf != NULL && cnt >= 0)
		buf[cnt] = '\0';

	if (buffer)
		*buffer = buf;
	else if (buf)
		free(buf);

	if (buflen)
		*buflen = len;

	return cnt;
}
1037 
1038 
1039 _EXPORT void
1040 trim_white_space(BString &string)
1041 {
1042 	int32 i;
1043 	int32 length = string.Length();
1044 	char *buffer = string.LockBuffer(length + 1);
1045 
1046 	while (length > 0 && isspace(buffer[length - 1]))
1047 		length--;
1048 	buffer[length] = '\0';
1049 
1050 	for (i = 0; buffer[i] && isspace(buffer[i]); i++) {}
1051 	if (i != 0) {
1052 		length -= i;
1053 		memmove(buffer,buffer + i,length + 1);
1054 	}
1055 	string.UnlockBuffer(length);
1056 }
1057 
1058 
1059 /** Tries to return a human-readable name from the specified
1060  *	header parameter (should be from "To:" or "From:").
1061  *	Tries to return the name rather than the eMail address.
1062  */
1063 
1064 _EXPORT void
1065 extract_address_name(BString &header)
1066 {
1067 	BString name;
1068 	const char *start = header.String();
1069 	const char *stop = start + strlen (start);
1070 
1071 	// Find a string S in the header (email foo) that matches:
1072 	//   Old style name in brackets: foo@bar.com (S)
1073 	//   New style quotes: "S" <foo@bar.com>
1074 	//   New style no quotes if nothing else found: S <foo@bar.com>
1075 	//   If nothing else found then use the whole thing: S
1076 
1077 	for (int i = 0; i <= 3; i++) {
1078 		// Set p1 to the first letter in the name and p2 to just past the last
1079 		// letter in the name.  p2 stays NULL if a name wasn't found in this
1080 		// pass.
1081 		const char *p1 = NULL, *p2 = NULL;
1082 
1083 		switch (i) {
1084 			case 0: // foo@bar.com (S)
1085 				if ((p1 = strchr(start,'(')) != NULL) {
1086 					p1++; // Advance to first letter in the name.
1087 					size_t nest = 1; // Handle nested brackets.
1088 					for (p2 = p1; p2 < stop; ++p2)
1089 					{
1090 						if (*p2 == ')')
1091 							--nest;
1092 						else if (*p2 == '(')
1093 							++nest;
1094 						if (nest <= 0)
1095 							break;
1096 					}
1097 					if (nest != 0)
1098 						p2 = NULL; // False alarm, no terminating bracket.
1099 				}
1100 				break;
1101 			case 1: // "S" <foo@bar.com>
1102 				if ((p1 = strchr(start, '\"')) != NULL)
1103 					p2 = strchr(++p1, '\"');
1104 				break;
1105 			case 2: // S <foo@bar.com>
1106 				p1 = start;
1107 				if (name.Length() == 0)
1108 					p2 = strchr(start, '<');
1109 				break;
1110 			case 3: // S
1111 				p1 = start;
1112 				if (name.Length() == 0)
1113 					p2 = stop;
1114 				break;
1115 		}
1116 
1117 		// Remove leading and trailing space-like characters and save the
1118 		// result if it is longer than any other likely names found.
1119 		if (p2 != NULL) {
1120 			while (p1 < p2 && (isspace (*p1)))
1121 				++p1;
1122 
1123 			while (p1 < p2 && (isspace (p2[-1])))
1124 				--p2;
1125 
1126 			int newLength = p2 - p1;
1127 			if (name.Length() < newLength)
1128 				name.SetTo(p1, newLength);
1129 		}
1130 	}
1131 
1132 	int32 lessIndex = name.FindFirst('<');
1133 	int32 greaterIndex = name.FindLast('>');
1134 
1135 	if (lessIndex == 0) {
1136 		// Have an address of the form <address> and nothing else, so remove
1137 		// the greater and less than signs, if any.
1138 		if (greaterIndex > 0)
1139 			name.Remove(greaterIndex, 1);
1140 		name.Remove(lessIndex, 1);
1141 	} else if (lessIndex > 0 && lessIndex < greaterIndex) {
1142 		// Yahoo stupidly inserts the e-mail address into the name string, so
1143 		// this bit of code fixes: "Joe <joe@yahoo.com>" <joe@yahoo.com>
1144 		name.Remove(lessIndex, greaterIndex - lessIndex + 1);
1145 	}
1146 
1147 	trim_white_space(name);
1148 	header = name;
1149 }
1150 
1151 
1152 
1153 // Given a subject in a BString, remove the extraneous RE: re: and other stuff
1154 // to get down to the core subject string, which should be identical for all
1155 // messages posted about a topic.  The input string is modified in place to
1156 // become the output core subject string.
1157 
1158 static int32				gLocker = 0;
1159 static size_t				gNsub = 1;
1160 static re_pattern_buffer	gRe;
1161 static re_pattern_buffer   *gRebuf = NULL;
1162 static char					gTranslation[256];
1163 
1164 _EXPORT void SubjectToThread (BString &string)
1165 {
1166 // a regex that matches a non-ASCII UTF8 character:
1167 #define U8C \
1168 	"[\302-\337][\200-\277]" \
1169 	"|\340[\302-\337][\200-\277]" \
1170 	"|[\341-\357][\200-\277][\200-\277]" \
1171 	"|\360[\220-\277][\200-\277][\200-\277]" \
1172 	"|[\361-\367][\200-\277][\200-\277][\200-\277]" \
1173 	"|\370[\210-\277][\200-\277][\200-\277][\200-\277]" \
1174 	"|[\371-\373][\200-\277][\200-\277][\200-\277][\200-\277]" \
1175 	"|\374[\204-\277][\200-\277][\200-\277][\200-\277][\200-\277]" \
1176 	"|\375[\200-\277][\200-\277][\200-\277][\200-\277][\200-\277]"
1177 
1178 #define PATTERN \
1179 	"^ +" \
1180 	"|^(\\[[^]]*\\])(\\<|  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1181 	"|^(  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1182 	"| *\\(fwd\\) *$"
1183 
1184 	if (gRebuf == NULL && atomic_add(&gLocker,1) == 0)
1185 	{
1186 		// the idea is to compile the regexp once to speed up testing
1187 
1188 		for (int i=0; i<256; ++i) gTranslation[i]=i;
1189 		for (int i='a'; i<='z'; ++i) gTranslation[i]=toupper(i);
1190 
1191 		gRe.translate = gTranslation;
1192 		gRe.regs_allocated = REGS_FIXED;
1193 		re_syntax_options = RE_SYNTAX_POSIX_EXTENDED;
1194 
1195 		const char *pattern = PATTERN;
1196 		// count subexpressions in PATTERN
1197 		for (unsigned int i=0; pattern[i] != 0; ++i)
1198 		{
1199 			if (pattern[i] == '\\')
1200 				++i;
1201 			else if (pattern[i] == '(')
1202 				++gNsub;
1203 		}
1204 
1205 		const char *err = re_compile_pattern(pattern,strlen(pattern),&gRe);
1206 		if (err == NULL)
1207 			gRebuf = &gRe;
1208 		else
1209 			fprintf(stderr, "Failed to compile the regex: %s\n", err);
1210 	}
1211 	else
1212 	{
1213 		int32 tries = 200;
1214 		while (gRebuf == NULL && tries-- > 0)
1215 			snooze(10000);
1216 	}
1217 
1218 	if (gRebuf)
1219 	{
1220 		struct re_registers regs;
1221 		// can't be static if this function is to be thread-safe
1222 
1223 		regs.num_regs = gNsub;
1224 		regs.start = (regoff_t*)malloc(gNsub*sizeof(regoff_t));
1225 		regs.end = (regoff_t*)malloc(gNsub*sizeof(regoff_t));
1226 
1227 		for (int start=0;
1228 		    (start=re_search(gRebuf, string.String(), string.Length(),
1229 							0, string.Length(), &regs)) >= 0;
1230 			)
1231 		{
1232 			//
1233 			// we found something
1234 			//
1235 
1236 			// don't delete [bemaildaemon]...
1237 			if (start == regs.start[1])
1238 				start = regs.start[2];
1239 
1240 			string.Remove(start,regs.end[0]-start);
1241 			if (start) string.Insert(' ',1,start);
1242 		}
1243 
1244 		free(regs.start);
1245 		free(regs.end);
1246 	}
1247 
1248 	// Finally remove leading and trailing space.  Some software, like
1249 	// tm-edit 1.8, appends a space to the subject, which would break
1250 	// threading if we left it in.
1251 	trim_white_space(string);
1252 }
1253 
1254 
1255 
1256 // Converts a date to a time.  Handles numeric time zones too, unlike
1257 // parsedate.  Returns -1 if it fails.
1258 
1259 _EXPORT time_t ParseDateWithTimeZone (const char *DateString)
1260 {
1261 	time_t	currentTime;
1262 	time_t	dateAsTime;
1263 	char	tempDateString [80];
1264 	char	tempZoneString [6];
1265 	time_t	zoneDeltaTime;
1266 	int		zoneIndex;
1267 	char   *zonePntr;
1268 
1269 	// See if we can remove the time zone portion.  parsedate understands time
1270 	// zone 3 letter names, but doesn't understand the numeric +9999 time zone
1271 	// format.  To do: see if a newer parsedate exists.
1272 
1273 	strncpy (tempDateString, DateString, sizeof (tempDateString));
1274 	tempDateString[sizeof (tempDateString) - 1] = 0;
1275 
1276 	// Remove trailing spaces.
1277 	zonePntr = tempDateString + strlen (tempDateString) - 1;
1278 	while (zonePntr >= tempDateString && isspace (*zonePntr))
1279 		*zonePntr-- = 0;
1280 	if (zonePntr < tempDateString)
1281 		return -1; // Empty string.
1282 
1283 	// Remove the trailing time zone in round brackets, like in
1284 	// Fri, 22 Feb 2002 15:22:42 EST (-0500)
1285 	// Thu, 25 Apr 1996 11:44:19 -0400 (EDT)
1286 	if (tempDateString[strlen(tempDateString)-1] == ')')
1287 	{
1288 		zonePntr = strrchr (tempDateString, '(');
1289 		if (zonePntr != NULL)
1290 		{
1291 			*zonePntr-- = 0; // Zap the '(', then remove trailing spaces.
1292 			while (zonePntr >= tempDateString && isspace (*zonePntr))
1293 				*zonePntr-- = 0;
1294 			if (zonePntr < tempDateString)
1295 				return -1; // Empty string.
1296 		}
1297 	}
1298 
1299 	// Look for a numeric time zone like  Tue, 30 Dec 2003 05:01:40 +0000
1300 	for (zoneIndex = strlen (tempDateString); zoneIndex >= 0; zoneIndex--)
1301 	{
1302 		zonePntr = tempDateString + zoneIndex;
1303 		if (zonePntr[0] == '+' || zonePntr[0] == '-')
1304 		{
1305 			if (zonePntr[1] >= '0' && zonePntr[1] <= '9' &&
1306 				zonePntr[2] >= '0' && zonePntr[2] <= '9' &&
1307 				zonePntr[3] >= '0' && zonePntr[3] <= '9' &&
1308 				zonePntr[4] >= '0' && zonePntr[4] <= '9')
1309 				break;
1310 		}
1311 	}
1312 	if (zoneIndex >= 0)
1313 	{
1314 		// Remove the zone from the date string and any following time zone
1315 		// letter codes.  Also put in GMT so that the date gets parsed as GMT.
1316 		memcpy (tempZoneString, zonePntr, 5);
1317 		tempZoneString [5] = 0;
1318 		strcpy (zonePntr, "GMT");
1319 	}
1320 	else // No numeric time zone found.
1321 		strcpy (tempZoneString, "+0000");
1322 
1323 	time (&currentTime);
1324 	dateAsTime = parsedate (tempDateString, currentTime);
1325 	if (dateAsTime == (time_t) -1)
1326 		return -1; // Failure.
1327 
1328 	zoneDeltaTime = 60 * atol (tempZoneString + 3); // Get the last two digits - minutes.
1329 	tempZoneString[3] = 0;
1330 	zoneDeltaTime += atol (tempZoneString + 1) * 60 * 60; // Get the first two digits - hours.
1331 	if (tempZoneString[0] == '+')
1332 		zoneDeltaTime = 0 - zoneDeltaTime;
1333 	dateAsTime += zoneDeltaTime;
1334 
1335 	return dateAsTime;
1336 }
1337 
1338 
1339 /** Parses a mail header and fills the headers BMessage
1340  */
1341 
1342 _EXPORT status_t
1343 parse_header(BMessage &headers, BPositionIO &input)
1344 {
1345 	char *buffer = NULL;
1346 	size_t bufferSize = 0;
1347 	int32 length;
1348 
1349 	while ((length = readfoldedline(input, &buffer, &bufferSize)) >= 2) {
1350 		--length;
1351 			// Don't include the \n at the end of the buffer.
1352 
1353 		// convert to UTF-8 and null-terminate the buffer
1354 		length = rfc2047_to_utf8(&buffer, &bufferSize, length);
1355 		buffer[length] = '\0';
1356 
1357 		const char *delimiter = strstr(buffer, ":");
1358 		if (delimiter == NULL)
1359 			continue;
1360 
1361 		BString header(buffer, delimiter - buffer);
1362 		header.CapitalizeEachWord();
1363 			// unified case for later fetch
1364 
1365 		delimiter++; // Skip the colon.
1366 		while (isspace (*delimiter))
1367 			delimiter++; // Skip over leading white space and tabs.  To do: (comments in brackets).
1368 
1369 		// ToDo: implement joining of multiple header tags (i.e. multiple "Cc:"s)
1370 		headers.AddString(header.String(), delimiter);
1371 	}
1372 	free(buffer);
1373 
1374 	return B_OK;
1375 }
1376 
1377 
1378 _EXPORT void
1379 extract_address(BString &address)
1380 {
1381 	const char *string = address.String();
1382 	int32 first;
1383 
1384 	// first, remove all quoted text
1385 
1386 	if ((first = address.FindFirst('"')) >= 0) {
1387 		int32 last = first + 1;
1388 		while (string[last] && string[last] != '"')
1389 			last++;
1390 
1391 		if (string[last] == '"')
1392 			address.Remove(first, last + 1 - first);
1393 	}
1394 
1395 	// try to extract the address now
1396 
1397 	if ((first = address.FindFirst('<')) >= 0) {
1398 		// the world likes us and we can just get the address the easy way...
1399 		int32 last = address.FindFirst('>');
1400 		if (last >= 0) {
1401 			address.Truncate(last);
1402 			address.Remove(0, first + 1);
1403 
1404 			return;
1405 		}
1406 	}
1407 
1408 	// then, see if there is anything in parenthesis to throw away
1409 
1410 	if ((first = address.FindFirst('(')) >= 0) {
1411 		int32 last = first + 1;
1412 		while (string[last] && string[last] != ')')
1413 			last++;
1414 
1415 		if (string[last] == ')')
1416 			address.Remove(first, last + 1 - first);
1417 	}
1418 
1419 	// now, there shouldn't be much else left
1420 
1421 	trim_white_space(address);
1422 }
1423 
1424 
1425 _EXPORT void
1426 get_address_list(BList &list, const char *string, void (*cleanupFunc)(BString &))
1427 {
1428 	if (string == NULL || !string[0])
1429 		return;
1430 
1431 	const char *start = string;
1432 
1433 	while (true) {
1434 		if (string[0] == '"') {
1435 			const char *quoteEnd = ++string;
1436 
1437 			while (quoteEnd[0] && quoteEnd[0] != '"')
1438 				quoteEnd++;
1439 
1440 			if (!quoteEnd[0])	// string exceeds line!
1441 				quoteEnd = string;
1442 
1443 			string = quoteEnd + 1;
1444 		}
1445 
1446 		if (string[0] == ',' || string[0] == '\0') {
1447 			BString address(start, string - start);
1448 			trim_white_space(address);
1449 
1450 			if (cleanupFunc)
1451 				cleanupFunc(address);
1452 
1453 			list.AddItem(strdup(address.String()));
1454 
1455 			start = string + 1;
1456 		}
1457 
1458 		if (!string[0])
1459 			break;
1460 
1461 		string++;
1462 	}
1463 }
1464 
1465