xref: /haiku/src/kits/mail/mail_util.cpp (revision 1345706a9ff6ad0dc041339a02d4259998b0765d)
1 /* mail util - header parsing
2 **
3 ** Copyright 2001-2003 Dr. Zoidberg Enterprises. All rights reserved.
4 */
5 
6 
7 #include <UTF8.h>
8 #include <Message.h>
9 #include <String.h>
10 #include <Locker.h>
11 #include <DataIO.h>
12 #include <List.h>
13 
14 #include <stdlib.h>
15 #include <string.h>
16 #include <stdio.h>
17 #define __USE_GNU
18 #include <regex.h>
19 #include <ctype.h>
20 #include <errno.h>
21 #include <parsedate.h>
22 
23 #include <mail_encoding.h>
24 
25 #include <mail_util.h>
26 
27 #include <CharacterSet.h>
28 #include <CharacterSetRoster.h>
29 
30 using namespace BPrivate;
31 
32 #define CRLF   "\r\n"
33 
34 struct CharsetConversionEntry
35 {
36 	const char *charset;
37 	uint32 flavor;
38 };
39 
// Table pairing character set names with conversion identifiers.  The list
// ends with an entry whose charset pointer is NULL.  utf8_to_rfc2047() in this
// file walks it from the start and uses the name of the first entry whose
// flavor matches, so the preferred name for a conversion must come first.
// NOTE(review): "big5" and the "gb*" names use the raw numbers 24 and 25
// instead of named constants - presumably the Big5 and GBK conversions;
// confirm against UTF8.h before replacing them.
extern const CharsetConversionEntry mail_charsets [] =
{
	// In order of authority, so when searching for the name for a particular
	// numbered conversion, start at the beginning of the array.
	{"iso-8859-1",  B_ISO1_CONVERSION}, // MIME STANDARD
	{"iso-8859-2",  B_ISO2_CONVERSION}, // MIME STANDARD
	{"iso-8859-3",  B_ISO3_CONVERSION}, // MIME STANDARD
	{"iso-8859-4",  B_ISO4_CONVERSION}, // MIME STANDARD
	{"iso-8859-5",  B_ISO5_CONVERSION}, // MIME STANDARD
	{"iso-8859-6",  B_ISO6_CONVERSION}, // MIME STANDARD
	{"iso-8859-7",  B_ISO7_CONVERSION}, // MIME STANDARD
	{"iso-8859-8",  B_ISO8_CONVERSION}, // MIME STANDARD
	{"iso-8859-9",  B_ISO9_CONVERSION}, // MIME STANDARD
	{"iso-8859-10", B_ISO10_CONVERSION}, // MIME STANDARD
	{"iso-8859-13", B_ISO13_CONVERSION}, // MIME STANDARD
	{"iso-8859-14", B_ISO14_CONVERSION}, // MIME STANDARD
	{"iso-8859-15", B_ISO15_CONVERSION}, // MIME STANDARD

	{"shift_jis",	B_SJIS_CONVERSION}, // MIME STANDARD
	{"shift-jis",	B_SJIS_CONVERSION},
	{"iso-2022-jp", B_JIS_CONVERSION}, // MIME STANDARD
	{"euc-jp",		B_EUC_CONVERSION}, // MIME STANDARD

	{"euc-kr",      B_EUC_KR_CONVERSION}, // Shift encoding 7 bit and KSC-5601 if bit 8 is on. // MIME STANDARD
	{"ksc5601",		B_EUC_KR_CONVERSION},    // Not sure if 7 or 8 bit. // COMPATIBLE?
	{"ks_c_5601-1987", B_EUC_KR_CONVERSION}, // Not sure if 7 or 8 bit. // COMPATIBLE with stupid MS software

	{"koi8-r",      B_KOI8R_CONVERSION},           // MIME STANDARD
	{"windows-1251",B_MS_WINDOWS_1251_CONVERSION}, // MIME STANDARD
	{"windows-1252",B_MS_WINDOWS_CONVERSION},      // MIME STANDARD

	{"dos-437",     B_MS_DOS_CONVERSION},     // WRONG NAME : MIME STANDARD NAME = NONE ( IBM437? )
	{"dos-866",     B_MS_DOS_866_CONVERSION}, // WRONG NAME : MIME STANDARD NAME = NONE ( IBM866? )
	{"x-mac-roman", B_MAC_ROMAN_CONVERSION},  // WRONG NAME : MIME STANDARD NAME = NONE ( macintosh? + x-mac-roman? )

    {"big5",        24}, // MIME STANDARD

    {"gb18030",     25}, // WRONG NAME : MIME STANDARD NAME = NONE ( GB18030? )
    {"gb2312",      25}, // COMPATIBLE
    {"gbk",         25}, // COMPATIBLE

	/* {"utf-16",		B_UNICODE_CONVERSION}, Might not work due to NULs in text, needs testing. */
	{"us-ascii",	B_MAIL_US_ASCII_CONVERSION},                                  // MIME STANDARD
	{"utf-8",		B_MAIL_UTF8_CONVERSION /* Special code for no conversion */}, // MIME STANDARD

	{NULL, (uint32) -1} /* End of list marker, NULL string pointer is the key. */
};
87 
88 
89 // The next couple of functions are our wrapper around convert_to_utf8 and
90 // convert_from_utf8 so that they can also convert from UTF-8 to UTF-8 by
91 // specifying the B_MAIL_UTF8_CONVERSION constant as the conversion operation.  It
92 // also lets us add new conversions, like B_MAIL_US_ASCII_CONVERSION.
93 
94 _EXPORT status_t mail_convert_to_utf8 (
95 	uint32 srcEncoding,
96 	const char *src,
97 	int32 *srcLen,
98 	char *dst,
99 	int32 *dstLen,
100 	int32 *state,
101 	char substitute)
102 {
103 	int32    copyAmount;
104 	char    *originalDst = dst;
105 	status_t returnCode = -1;
106 
107 	if (srcEncoding == B_MAIL_UTF8_CONVERSION) {
108 		copyAmount = *srcLen;
109 		if (*dstLen < copyAmount)
110 			copyAmount = *dstLen;
111 		memcpy (dst, src, copyAmount);
112 		*srcLen = copyAmount;
113 		*dstLen = copyAmount;
114 		returnCode = B_OK;
115 	} else if (srcEncoding == B_MAIL_US_ASCII_CONVERSION) {
116 		int32 i;
117 		unsigned char letter;
118 		copyAmount = *srcLen;
119 		if (*dstLen < copyAmount)
120 			copyAmount = *dstLen;
121 		for (i = 0; i < copyAmount; i++) {
122 			letter = *src++;
123 			if (letter > 0x80U)
124 				// Invalid, could also use substitute, but better to strip high bit.
125 				*dst++ = letter - 0x80U;
126 			else if (letter == 0x80U)
127 				// Can't convert to 0x00 since that's NUL, which would cause problems.
128 				*dst++ = substitute;
129 			else
130 				*dst++ = letter;
131 		}
132 		*srcLen = copyAmount;
133 		*dstLen = copyAmount;
134 		returnCode = B_OK;
135 	} else
136 		returnCode = convert_to_utf8 (srcEncoding, src, srcLen,
137 			dst, dstLen, state, substitute);
138 
139 	if (returnCode == B_OK) {
140 		// Replace spurious NUL bytes, which should normally not be in the
141 		// output of the decoding (not normal UTF-8 characters, and no NULs are
142 		// in our usual input strings).  They happen for some odd ISO-2022-JP
143 		// byte pair combinations which are improperly handled by the BeOS
144 		// routines.  Like "\e$ByD\e(B" where \e is the ESC character $1B, the
145 		// first ESC $ B switches to a Japanese character set, then the next
146 		// two bytes "yD" specify a character, then ESC ( B switches back to
147 		// the ASCII character set.  The UTF-8 conversion yields a NUL byte.
148 		int32 i;
149 		for (i = 0; i < *dstLen; i++)
150 			if (originalDst[i] == 0)
151 				originalDst[i] = substitute;
152 	}
153 	return returnCode;
154 }
155 
156 
157 _EXPORT status_t mail_convert_from_utf8 (
158 	uint32 dstEncoding,
159 	const char *src,
160 	int32 *srcLen,
161 	char *dst,
162 	int32 *dstLen,
163 	int32 *state,
164 	char substitute)
165 {
166 	int32		copyAmount;
167 	status_t	errorCode;
168 	int32		originalDstLen = *dstLen;
169 	int32		tempDstLen;
170 	int32		tempSrcLen;
171 
172 	if (dstEncoding == B_MAIL_UTF8_CONVERSION)
173 	{
174 		copyAmount = *srcLen;
175 		if (*dstLen < copyAmount)
176 			copyAmount = *dstLen;
177 		memcpy (dst, src, copyAmount);
178 		*srcLen = copyAmount;
179 		*dstLen = copyAmount;
180 		return B_OK;
181 	}
182 
183 	if (dstEncoding == B_MAIL_US_ASCII_CONVERSION)
184 	{
185 		int32			characterLength;
186 		int32			dstRemaining = *dstLen;
187 		unsigned char	letter;
188 		int32			srcRemaining = *srcLen;
189 
190 		// state contains the number of source bytes to skip, left over from a
191 		// partial UTF-8 character split over the end of the buffer from last
192 		// time.
193 		if (srcRemaining <= *state) {
194 			*state -= srcRemaining;
195 			*dstLen = 0;
196 			return B_OK;
197 		}
198 		srcRemaining -= *state;
199 		src += *state;
200 		*state = 0;
201 
202 		while (true) {
203 			if (srcRemaining <= 0 || dstRemaining <= 0)
204 				break;
205 			letter = *src;
206 			if (letter < 0x80)
207 				characterLength = 1; // Regular ASCII equivalent code.
208 			else if (letter < 0xC0)
209 				characterLength = 1; // Invalid in-between data byte 10xxxxxx.
210 			else if (letter < 0xE0)
211 				characterLength = 2;
212 			else if (letter < 0xF0)
213 				characterLength = 3;
214 			else if (letter < 0xF8)
215 				characterLength = 4;
216 			else if (letter < 0xFC)
217 				characterLength = 5;
218 			else if (letter < 0xFE)
219 				characterLength = 6;
220 			else
221 				characterLength = 1; // 0xFE and 0xFF are invalid in UTF-8.
222 			if (letter < 0x80)
223 				*dst++ = *src;
224 			else
225 				*dst++ = substitute;
226 			dstRemaining--;
227 			if (srcRemaining < characterLength) {
228 				// Character split past the end of the buffer.
229 				*state = characterLength - srcRemaining;
230 				srcRemaining = 0;
231 			} else {
232 				src += characterLength;
233 				srcRemaining -= characterLength;
234 			}
235 		}
236 		// Update with the amounts used.
237 		*srcLen = *srcLen - srcRemaining;
238 		*dstLen = *dstLen - dstRemaining;
239 		return B_OK;
240 	}
241 
242 	errorCode = convert_from_utf8 (dstEncoding, src, srcLen, dst, dstLen, state, substitute);
243 	if (errorCode != B_OK)
244 		return errorCode;
245 
246 	if (dstEncoding != B_JIS_CONVERSION)
247 		return B_OK;
248 
249 	// B_JIS_CONVERSION (ISO-2022-JP) works by shifting between different
250 	// character subsets.  For E-mail headers (and other uses), it needs to be
251 	// switched back to ASCII at the end (otherwise the last character gets
252 	// lost or other weird things happen in the headers).  Note that we can't
253 	// just append the escape code since the convert_from_utf8 "state" will be
254 	// wrong.  So we append an ASCII letter and throw it away, leaving just the
255 	// escape code.  Well, it actually switches to the Roman character set, not
256 	// ASCII, but that should be OK.
257 
258 	tempDstLen = originalDstLen - *dstLen;
259 	if (tempDstLen < 3) // Not enough space remaining in the output.
260 		return B_OK; // Sort of an error, but we did convert the rest OK.
261 	tempSrcLen = 1;
262 	errorCode = convert_from_utf8 (dstEncoding, "a", &tempSrcLen,
263 		dst + *dstLen, &tempDstLen, state, substitute);
264 	if (errorCode != B_OK)
265 		return errorCode;
266 	*dstLen += tempDstLen - 1 /* don't include the ASCII letter */;
267 	return B_OK;
268 }
269 
270 
271 
272 static int handle_non_rfc2047_encoding(char **buffer,size_t *bufferLength,size_t *sourceLength)
273 {
274 	char *string = *buffer;
275 	int32 length = *sourceLength;
276 	int32 i;
277 
278 	// check for 8-bit characters
279 	for (i = 0;i < length;i++)
280 		if (string[i] & 0x80)
281 			break;
282 	if (i == length)
283 		return false;
284 
285 	// check for groups of 8-bit characters - this code is not very smart;
286 	// it just can detect some sort of single-byte encoded stuff, the rest
287 	// is regarded as UTF-8
288 
289 	int32 singletons = 0,doubles = 0;
290 
291 	for (i = 0;i < length;i++)
292 	{
293 		if (string[i] & 0x80)
294 		{
295 			if ((string[i + 1] & 0x80) == 0)
296 				singletons++;
297 			else doubles++;
298 			i++;
299 		}
300 	}
301 
302 	if (singletons != 0)	// can't be valid UTF-8 anymore, so we assume ISO-Latin-1
303 	{
304 		int32 state = 0;
305 		// just to be sure
306 		int32 destLength = length * 4 + 1;
307 		int32 destBufferLength = destLength;
308 		char *dest = (char *)malloc(destLength);
309 		if (dest == NULL)
310 			return 0;
311 
312 		if (convert_to_utf8(B_ISO1_CONVERSION,string,&length,dest,&destLength,&state) == B_OK)
313 		{
314 			free(*buffer);
315 			*buffer = dest;
316 			*bufferLength = destBufferLength;
317 			*sourceLength = destLength;
318 			return true;
319 		}
320 		free(dest);
321 		return false;
322 	}
323 
324 	// we assume a valid UTF-8 string here, but yes, we don't check it
325 	return true;
326 }
327 
328 
329 _EXPORT ssize_t rfc2047_to_utf8(char **bufp, size_t *bufLen, size_t strLen)
330 {
331 	char *head, *tail;
332 	char *charset, *encoding, *end;
333 	ssize_t ret = B_OK;
334 
335 	if (bufp == NULL || *bufp == NULL)
336 		return -1;
337 
338 	char *string = *bufp;
339 
340 	//---------Handle *&&^%*&^ non-RFC compliant, 8bit mail
341 	if (handle_non_rfc2047_encoding(bufp,bufLen,&strLen))
342 		return strLen;
343 
344 	// set up string length
345 	if (strLen == 0)
346 		strLen = strlen(*bufp);
347 	char lastChar = (*bufp)[strLen];
348 	(*bufp)[strLen] = '\0';
349 
350 	//---------Whew! Now for RFC compliant mail
351 	bool encodedWordFoundPreviously = false;
352 	for (head = tail = string;
353 		((charset = strstr(tail, "=?")) != NULL)
354 		&& (((encoding = strchr(charset + 2, '?')) != NULL)
355 			&& encoding[1] && (encoding[2] == '?') && encoding[3])
356 		&& (end = strstr(encoding + 3, "?=")) != NULL;
357 		// found "=?...charset...?e?...text...?=   (e == encoding)
358 		//        ^charset       ^encoding    ^end
359 		tail = end)
360 	{
361 		// Copy non-encoded text (from tail up to charset) to the output.
362 		// Ignore spaces between two encoded "words".  RFC2047 says the words
363 		// should be concatenated without the space (designed for Asian
364 		// sentences which have no spaces yet need to be broken into "words" to
365 		// keep within the line length limits).
366 		bool nonSpaceFound = false;
367 		for (int i = 0; i < charset-tail; i++) {
368 			if (!isspace (tail[i])) {
369 				nonSpaceFound = true;
370 				break;
371 			}
372 		}
373 		if (!encodedWordFoundPreviously || nonSpaceFound) {
374 			if (string != tail && tail != charset)
375 				memmove(string, tail, charset-tail);
376 			string += charset-tail;
377 		}
378 		tail = charset;
379 		encodedWordFoundPreviously = true;
380 
381 		// move things to point at what they should:
382 		//   =?...charset...?e?...text...?=   (e == encoding)
383 		//     ^charset      ^encoding     ^end
384 		charset += 2;
385 		encoding += 1;
386 		end += 2;
387 
388 		// find the charset this text is in now
389 		size_t		cLen = encoding - 1 - charset;
390 		bool		base64encoded = toupper(*encoding) == 'B';
391 
392 		uint32 convert_id = B_MAIL_NULL_CONVERSION;
393 		char charset_string[cLen+1];
394 		memcpy(charset_string, charset, cLen);
395 		charset_string[cLen] = '\0';
396 		if (strcasecmp(charset_string, "us-ascii") == 0) {
397 			convert_id = B_MAIL_US_ASCII_CONVERSION;
398 		} else if (strcasecmp(charset_string, "utf-8") == 0) {
399 			convert_id = B_MAIL_UTF8_CONVERSION;
400 		} else {
401 			const BCharacterSet * cs = BCharacterSetRoster::FindCharacterSetByName(charset_string);
402 			if (cs != NULL) {
403 				convert_id = cs->GetConversionID();
404 			}
405 		}
406 		if (convert_id == B_MAIL_NULL_CONVERSION)
407 		{
408 			// unidentified charset
409 			// what to do? doing nothing skips the encoded text;
410 			// but we should keep it: we copy it to the output.
411 			if (string != tail && tail != end)
412 				memmove(string, tail, end-tail);
413 			string += end-tail;
414 			continue;
415 		}
416 		// else we've successfully identified the charset
417 
418 		char *src = encoding+2;
419 		int32 srcLen = end - 2 - src;
420 		// encoded text: src..src+srcLen
421 
422 		// decode text, get decoded length (reducing xforms)
423 		srcLen = !base64encoded ? decode_qp(src, src, srcLen, 1)
424 				: decode_base64(src, src, srcLen);
425 
426 		// allocate space for the converted text
427 		int32 dstLen = end-string + *bufLen-strLen;
428 		char *dst = (char*)malloc(dstLen);
429 		int32 cvLen = srcLen;
430 		int32 convState = 0;
431 
432 		//
433 		// do the conversion
434 		//
435 		ret = mail_convert_to_utf8(convert_id, src, &cvLen, dst, &dstLen, &convState);
436 		if (ret != B_OK)
437 		{
438 			// what to do? doing nothing skips the encoded text
439 			// but we should keep it: we copy it to the output.
440 
441 			free(dst);
442 
443 			if (string != tail && tail != end)
444 				memmove(string, tail, end-tail);
445 			string += end-tail;
446 			continue;
447 		}
448 		/* convert_to_ is either returning something wrong or my
449 		   test data is screwed up.  Whatever it is, Not Enough
450 		   Space is not the only cause of the below, so we just
451 		   assume it succeeds if it converts anything at all.
452 		else if (cvLen < srcLen)
453 		{
454 			// not enough room to convert the data;
455 			// grow *buf and retry
456 
457 			free(dst);
458 
459 			char *temp = (char*)realloc(*bufp, 2*(*bufLen + 1));
460 			if (temp == NULL)
461 			{
462 				ret = B_NO_MEMORY;
463 				break;
464 			}
465 
466 			*bufp = temp;
467 			*bufLen = 2*(*bufLen + 1);
468 
469 			string = *bufp + (string-head);
470 			tail = *bufp + (tail-head);
471 			charset = *bufp + (charset-head);
472 			encoding = *bufp + (encoding-head);
473 			end = *bufp + (end-head);
474 			src = *bufp + (src-head);
475 			head = *bufp;
476 			continue;
477 		}
478 		*/
479 		else
480 		{
481 			if (dstLen > end-string)
482 			{
483 				// copy the string forward...
484 				memmove(string+dstLen, end, strLen - (end-head) + 1);
485 				strLen += string+dstLen - end;
486 				end = string + dstLen;
487 			}
488 
489 			memcpy(string, dst, dstLen);
490 			string += dstLen;
491 			free(dst);
492 			continue;
493 		}
494 	}
495 
496 	// copy everything that's left
497 	size_t tailLen = strLen - (tail - head);
498 	memmove(string, tail, tailLen+1);
499 	string += tailLen;
500 
501 	// replace the last char
502 	(*bufp)[strLen] = lastChar;
503 
504 	return ret < B_OK ? ret : string-head;
505 }
506 
507 
508 _EXPORT ssize_t utf8_to_rfc2047 (char **bufp, ssize_t length, uint32 charset, char encoding) {
509 	struct word {
510 		BString	originalWord;
511 		BString	convertedWord;
512 		bool	needsEncoding;
513 
514 		// Convert the word from UTF-8 to the desired character set.  The
515 		// converted version also includes the escape codes to return to ASCII
516 		// mode, if relevant.  Also note if it uses unprintable characters,
517 		// which means it will need that special encoding treatment later.
518 		void ConvertWordToCharset (uint32 charset) {
519 			int32 state = 0;
520 			int32 originalLength = originalWord.Length();
521 			int32 convertedLength = originalLength * 5 + 1;
522 			char *convertedBuffer = convertedWord.LockBuffer (convertedLength);
523 			mail_convert_from_utf8 (charset, originalWord.String(),
524 				&originalLength, convertedBuffer, &convertedLength, &state);
525 			for (int i = 0; i < convertedLength; i++) {
526 				if ((convertedBuffer[i] & (1 << 7)) ||
527 					(convertedBuffer[i] >= 0 && convertedBuffer[i] < 32)) {
528 					needsEncoding = true;
529 					break;
530 				}
531 			}
532 			convertedWord.UnlockBuffer (convertedLength);
533 		};
534 	};
535 	struct word *currentWord;
536 	BList words;
537 
538 	// Break the header into words.  White space characters (including tabs and
539 	// newlines) separate the words.  Each word includes any space before it as
540 	// part of the word.  Actually, quotes and other special characters
541 	// (",()<>@) are treated as separate words of their own so that they don't
542 	// get encoded (because MIME headers get the quotes parsed before character
543 	// set unconversion is done).  The reader is supposed to ignore all white
544 	// space between encoded words, which can be inserted so that older mail
545 	// parsers don't have overly long line length problems.
546 
547 	const char *source = *bufp;
548 	const char *bufEnd = *bufp + length;
549 	const char *specialChars = "\"()<>@,";
550 
551 	while (source < bufEnd) {
552 		currentWord = new struct word;
553 		currentWord->needsEncoding = false;
554 
555 		int wordEnd = 0;
556 
557 		// Include leading spaces as part of the word.
558 		while (source + wordEnd < bufEnd && isspace (source[wordEnd]))
559 			wordEnd++;
560 
561 		if (source + wordEnd < bufEnd &&
562 			strchr (specialChars, source[wordEnd]) != NULL) {
563 			// Got a quote mark or other special character, which is treated as
564 			// a word in itself since it shouldn't be encoded, which would hide
565 			// it from the mail system.
566 			wordEnd++;
567 		} else {
568 			// Find the end of the word.  Leave wordEnd pointing just after the
569 			// last character in the word.
570 			while (source + wordEnd < bufEnd) {
571 				if (isspace(source[wordEnd]) ||
572 					strchr (specialChars, source[wordEnd]) != NULL)
573 					break;
574 				if (wordEnd > 51 /* Makes Base64 ISO-2022-JP "word" a multiple of 4 bytes */ &&
575 					0xC0 == (0xC0 & (unsigned int) source[wordEnd])) {
576 					// No English words are that long (46 is the longest),
577 					// break up what is likely Asian text (which has no spaces)
578 					// at the start of the next non-ASCII UTF-8 character (high
579 					// two bits are both ones).  Note that two encoded words in
580 					// a row get joined together, even if there is a space
581 					// between them in the final output text, according to the
582 					// standard.  Next word will also be conveniently get
583 					// encoded due to the 0xC0 test.
584 					currentWord->needsEncoding = true;
585 					break;
586 				}
587 				wordEnd++;
588 			}
589 		}
590 		currentWord->originalWord.SetTo (source, wordEnd);
591 		currentWord->ConvertWordToCharset (charset);
592 		words.AddItem(currentWord);
593 		source += wordEnd;
594 	}
595 
596 	// Combine adjacent words which contain unprintable text so that the
597 	// overhead of switching back and forth between regular text and specially
598 	// encoded text is reduced.  However, the combined word must be shorter
599 	// than the maximum of 75 bytes, including character set specification and
600 	// all those delimiters (worst case 22 bytes of overhead).
601 
602 	struct word *run;
603 
604 	for (int32 i = 0; (currentWord = (struct word *) words.ItemAt (i)) != NULL; i++) {
605 		if (!currentWord->needsEncoding)
606 			continue; // No need to combine unencoded words.
607 		for (int32 g = i+1; (run = (struct word *) words.ItemAt (g)) != NULL; g++) {
608 			if (!run->needsEncoding)
609 				break; // Don't want to combine encoded and unencoded words.
610 			if ((currentWord->convertedWord.Length() + run->convertedWord.Length() <= 53)) {
611 				currentWord->originalWord.Append (run->originalWord);
612 				currentWord->ConvertWordToCharset (charset);
613 				words.RemoveItem(g);
614 				delete run;
615 				g--;
616 			} else // Can't merge this word, result would be too long.
617 				break;
618 		}
619 	}
620 
621 	// Combine the encoded and unencoded words into one line, doing the
622 	// quoted-printable or base64 encoding.  Insert an extra space between
623 	// words which are both encoded to make word wrapping easier, since there
624 	// is normally none, and you're allowed to insert space (the receiver
625 	// throws it away if it is between encoded words).
626 
627 	BString rfc2047;
628 	bool	previousWordNeededEncoding = false;
629 
630 	const char *charset_dec = "none-bug";
631 	for (int32 i = 0; mail_charsets[i].charset != NULL; i++) {
632 		if (mail_charsets[i].flavor == charset) {
633 			charset_dec = mail_charsets[i].charset;
634 			break;
635 		}
636 	}
637 
638 	while ((currentWord = (struct word *)words.RemoveItem(0L)) != NULL) {
639 		if ((encoding != quoted_printable && encoding != base64) ||
640 		!currentWord->needsEncoding) {
641 			rfc2047.Append (currentWord->convertedWord);
642 		} else {
643 			// This word needs encoding.  Try to insert a space between it and
644 			// the previous word.
645 			if (previousWordNeededEncoding)
646 				rfc2047 << ' '; // Can insert as many spaces as you want between encoded words.
647 			else {
648 				// Previous word is not encoded, spaces are significant.  Try
649 				// to move a space from the start of this word to be outside of
650 				// the encoded text, so that there is a bit of space between
651 				// this word and the previous one to enhance word wrapping
652 				// chances later on.
653 				if (currentWord->originalWord.Length() > 1 &&
654 					isspace (currentWord->originalWord[0])) {
655 					rfc2047 << currentWord->originalWord[0];
656 					currentWord->originalWord.Remove (0 /* offset */, 1 /* length */);
657 					currentWord->ConvertWordToCharset (charset);
658 				}
659 			}
660 
661 			char *encoded = NULL;
662 			ssize_t encoded_len = 0;
663 			int32 convertedLength = currentWord->convertedWord.Length ();
664 			const char *convertedBuffer = currentWord->convertedWord.String ();
665 
666 			switch (encoding) {
667 				case quoted_printable:
668 					encoded = (char *) malloc (convertedLength * 3);
669 					encoded_len = encode_qp (encoded, convertedBuffer, convertedLength, true /* headerMode */);
670 					break;
671 				case base64:
672 					encoded = (char *) malloc (convertedLength * 2);
673 					encoded_len = encode_base64 (encoded, convertedBuffer, convertedLength, true /* headerMode */);
674 					break;
675 				default: // Unknown encoding type, shouldn't happen.
676 					encoded = (char *) convertedBuffer;
677 					encoded_len = convertedLength;
678 					break;
679 			}
680 
681 			rfc2047 << "=?" << charset_dec << '?' << encoding << '?';
682 			rfc2047.Append (encoded, encoded_len);
683 			rfc2047 << "?=";
684 
685 			if (encoding == quoted_printable || encoding == base64)
686 				free(encoded);
687 		}
688 		previousWordNeededEncoding = currentWord->needsEncoding;
689 		delete currentWord;
690 	}
691 
692 	free(*bufp);
693 
694 	ssize_t finalLength = rfc2047.Length ();
695 	*bufp = (char *) (malloc (finalLength + 1));
696 	memcpy (*bufp, rfc2047.String(), finalLength);
697 	(*bufp)[finalLength] = 0;
698 
699 	return finalLength;
700 }
701 
702 
703 //====================================================================
704 
705 void FoldLineAtWhiteSpaceAndAddCRLF (BString &string)
706 {
707 	int			inputLength = string.Length();
708 	int			lineStartIndex;
709 	const int	maxLineLength = 78; // Doesn't include CRLF.
710 	BString		output;
711 	int			splitIndex;
712 	int			tempIndex;
713 
714 	lineStartIndex = 0;
715 	while (true) {
716 		// If we don't need to wrap the text, just output the remainder, if any.
717 
718 		if (lineStartIndex + maxLineLength >= inputLength) {
719 			if (lineStartIndex < inputLength) {
720 				output.Insert (string, lineStartIndex /* source offset */,
721 					inputLength - lineStartIndex /* count */,
722 					output.Length() /* insert at */);
723 				output.Append (CRLF);
724 			}
725 			break;
726 		}
727 
728 		// Look ahead for a convenient spot to split it, between a comma and
729 		// space, which you often see between e-mail addresses like this:
730 		// "Joe Who" joe@dot.com, "Someone Else" else@blot.com
731 
732 		tempIndex = lineStartIndex + maxLineLength;
733 		if (tempIndex > inputLength)
734 			tempIndex = inputLength;
735 		splitIndex = string.FindLast (", ", tempIndex);
736 		if (splitIndex >= lineStartIndex)
737 			splitIndex++; // Point to the space character.
738 
739 		// If none of those exist, try splitting at any white space.
740 
741 		if (splitIndex <= lineStartIndex)
742 			splitIndex = string.FindLast (" ", tempIndex);
743 		if (splitIndex <= lineStartIndex)
744 			splitIndex = string.FindLast ("\t", tempIndex);
745 
746 		// If none of those exist, allow for a longer word - split at the next
747 		// available white space.
748 
749 		if (splitIndex <= lineStartIndex)
750 			splitIndex = string.FindFirst (" ", lineStartIndex + 1);
751 		if (splitIndex <= lineStartIndex)
752 			splitIndex = string.FindFirst ("\t", lineStartIndex + 1);
753 
754 		// Give up, the whole rest of the line can't be split, just dump it
755 		// out.
756 
757 		if (splitIndex <= lineStartIndex) {
758 			if (lineStartIndex < inputLength) {
759 				output.Insert (string, lineStartIndex /* source offset */,
760 					inputLength - lineStartIndex /* count */,
761 					output.Length() /* insert at */);
762 				output.Append (CRLF);
763 			}
764 			break;
765 		}
766 
767 		// Do the split.  The current line up to but not including the space
768 		// gets output, followed by a CRLF.  The space remains to become the
769 		// start of the next line (and that tells the message reader that it is
770 		// a continuation line).
771 
772 		output.Insert (string, lineStartIndex /* source offset */,
773 			splitIndex - lineStartIndex /* count */,
774 			output.Length() /* insert at */);
775 		output.Append (CRLF);
776 		lineStartIndex = splitIndex;
777 	}
778 	string.SetTo (output);
779 }
780 
781 
782 //====================================================================
783 
784 _EXPORT ssize_t readfoldedline(FILE *file, char **buffer, size_t *buflen)
785 {
786 	ssize_t len = buflen && *buflen ? *buflen : 0;
787 	char * buf = buffer && *buffer ? *buffer : NULL;
788 	ssize_t cnt = 0; // Number of characters currently in the buffer.
789 	int c;
790 
791 	while (true)
792 	{
793 		// Make sure there is space in the buffer for two more characters (one
794 		// for the next character, and one for the end of string NUL byte).
795 		if (buf == NULL || cnt + 2 >= len)
796 		{
797 			char *temp = (char *)realloc(buf, len + 64);
798 			if (temp == NULL) {
799 				// Out of memory, however existing buffer remains allocated.
800 				cnt = ENOMEM;
801 				break;
802 			}
803 			len += 64;
804 			buf = temp;
805 		}
806 
807 		// Read the next character, or end of file, or IO error.
808 		if ((c = fgetc(file)) == EOF) {
809 			if (ferror (file)) {
810 				cnt = errno;
811 				if (cnt >= 0)
812 					cnt = -1; // Error codes must be negative.
813 			} else {
814 				// Really is end of file.  Also make it end of line if there is
815 				// some text already read in.  If the first thing read was EOF,
816 				// just return an empty string.
817 				if (cnt > 0) {
818 					buf[cnt++] = '\n';
819 					if (buf[cnt-2] == '\r') {
820 						buf[cnt-2] = '\n';
821 						--cnt;
822 					}
823 				}
824 			}
825 			break;
826 		}
827 
828 		buf[cnt++] = c;
829 
830 		if (c == '\n') {
831 			// Convert CRLF end of line to just a LF.  Do it before folding, in
832 			// case we don't need to fold.
833 			if (cnt >= 2 && buf[cnt-2] == '\r') {
834 				buf[cnt-2] = '\n';
835 				--cnt;
836 			}
837 			// If the current line is empty then return it (so that empty lines
838 			// don't disappear if the next line starts with a space).
839 			if (cnt <= 1)
840 				break;
841 			// Fold if first character on the next line is whitespace.
842 			c = fgetc(file); // Note it's OK to read EOF and ungetc it too.
843 			if (c == ' ' || c == '\t')
844 				buf[cnt-1] = c; // Replace \n with the white space character.
845 			else {
846 				// Not folding, we finished reading a line; break out of the loop
847 				ungetc(c,file);
848 				break;
849 			}
850 		}
851 	}
852 
853 
854 	if (buf != NULL && cnt >= 0)
855 		buf[cnt] = '\0';
856 
857 	if (buffer)
858 		*buffer = buf;
859 	else if (buf)
860 		free(buf);
861 
862 	if (buflen)
863 		*buflen = len;
864 
865 	return cnt;
866 }
867 
868 
869 //====================================================================
870 
871 _EXPORT ssize_t readfoldedline(BPositionIO &in, char **buffer, size_t *buflen)
872 {
873 	ssize_t len = buflen && *buflen ? *buflen : 0;
874 	char * buf = buffer && *buffer ? *buffer : NULL;
875 	ssize_t cnt = 0; // Number of characters currently in the buffer.
876 	char c;
877 	status_t errorCode;
878 
879 	while (true)
880 	{
881 		// Make sure there is space in the buffer for two more characters (one
882 		// for the next character, and one for the end of string NUL byte).
883 		if (buf == NULL || cnt + 2 >= len)
884 		{
885 			char *temp = (char *)realloc(buf, len + 64);
886 			if (temp == NULL) {
887 				// Out of memory, however existing buffer remains allocated.
888 				cnt = ENOMEM;
889 				break;
890 			}
891 			len += 64;
892 			buf = temp;
893 		}
894 
895 		errorCode = in.Read (&c,1); // A really slow way of reading - unbuffered.
896 		if (errorCode != 1) {
897 			if (errorCode < 0) {
898 				cnt = errorCode; // IO error encountered, just return the code.
899 			} else {
900 				// Really is end of file.  Also make it end of line if there is
901 				// some text already read in.  If the first thing read was EOF,
902 				// just return an empty string.
903 				if (cnt > 0) {
904 					buf[cnt++] = '\n';
905 					if (buf[cnt-2] == '\r') {
906 						buf[cnt-2] = '\n';
907 						--cnt;
908 					}
909 				}
910 			}
911 			break;
912 		}
913 
914 		buf[cnt++] = c;
915 
916 		if (c == '\n') {
917 			// Convert CRLF end of line to just a LF.  Do it before folding, in
918 			// case we don't need to fold.
919 			if (cnt >= 2 && buf[cnt-2] == '\r') {
920 				buf[cnt-2] = '\n';
921 				--cnt;
922 			}
923 			// If the current line is empty then return it (so that empty lines
924 			// don't disappear if the next line starts with a space).
925 			if (cnt <= 1)
926 				break;
927 			// if first character on the next line is whitespace, fold lines
928 			errorCode = in.Read(&c,1);
929 			if (errorCode == 1) {
930 				if (c == ' ' || c == '\t')
931 					buf[cnt-1] = c; // Replace \n with the white space character.
932 				else {
933 					// Not folding, we finished reading a whole line.
934 					in.Seek(-1,SEEK_CUR); // Undo the look-ahead character read.
935 					break;
936 				}
937 			} else if (errorCode < 0) {
938 				cnt = errorCode;
939 				break;
940 			} else // No next line; at the end of the file.  Return the line.
941 				break;
942 		}
943 	}
944 
945 	if (buf != NULL && cnt >= 0)
946 		buf[cnt] = '\0';
947 
948 	if (buffer)
949 		*buffer = buf;
950 	else if (buf)
951 		free(buf);
952 
953 	if (buflen)
954 		*buflen = len;
955 
956 	return cnt;
957 }
958 
959 
/**	Extracts the next logical header line from the NUL terminated text that
 *	\a *header points to, joining folded continuation lines (lines that
 *	start with a space or a tab) onto it.
 *
 *	\param header  In/out: read cursor.  On return it points to the first
 *	               character that was not consumed.  At the end of the text
 *	               it is left pointing AT the terminating NUL byte (never
 *	               past it), so calling the function again is safe and
 *	               simply yields an empty line.  A NULL \a header or
 *	               \a *header is treated like empty input.
 *	\param buffer  In/out: malloc'ed buffer to reuse (may point to NULL).  It
 *	               is grown with realloc() in steps of 64 bytes as needed.  If
 *	               \a buffer itself is NULL the working buffer is freed
 *	               before returning.
 *	\param buflen  In/out: allocated size of \a *buffer (may be NULL).
 *	\return The number of characters stored, including the terminating '\n'
 *	        but not the NUL byte; 0 if the end of the text was hit before
 *	        anything was read; ENOMEM if the buffer could not be grown.
 *
 *	CRLF line ends are converted to a single LF.  When lines are folded, the
 *	line break is replaced by the white space character that started the
 *	continuation line.
 */
_EXPORT ssize_t
nextfoldedline(const char** header, char **buffer, size_t *buflen)
{
	ssize_t len = buflen && *buflen ? *buflen : 0;
	char * buf = buffer && *buffer ? *buffer : NULL;
	ssize_t cnt = 0; // Number of characters currently in the buffer.
	const bool haveInput = header != NULL && *header != NULL;
	char c;

	while (true)
	{
		// Make sure there is space in the buffer for two more characters (one
		// for the next character, and one for the end of string NUL byte).
		if (buf == NULL || cnt + 2 >= len)
		{
			char *temp = (char *)realloc(buf, len + 64);
			if (temp == NULL) {
				// Out of memory, however existing buffer remains allocated.
				// NOTE(review): the NUL termination below is skipped only
				// for negative counts; presumably ENOMEM is negative on
				// Haiku like the other error codes - confirm.
				cnt = ENOMEM;
				break;
			}
			len += 64;
			buf = temp;
		}

		// Look at the next character without consuming it, so that the
		// cursor never moves beyond the terminating NUL byte.
		c = haveInput ? **header : '\0';
		if (c == 0) {
			// End of text.  Also make it end of line if there is some text
			// already read in.  If the first thing read was the end of the
			// text, just return an empty string.
			if (cnt > 0) {
				buf[cnt++] = '\n';
				if (buf[cnt-2] == '\r') {
					buf[cnt-2] = '\n';
					--cnt;
				}
			}
			break;
		}

		(*header)++;
		buf[cnt++] = c;

		if (c == '\n') {
			// Convert CRLF end of line to just a LF.  Do it before folding, in
			// case we don't need to fold.
			if (cnt >= 2 && buf[cnt-2] == '\r') {
				buf[cnt-2] = '\n';
				--cnt;
			}
			// If the current line is empty then return it (so that empty lines
			// don't disappear if the next line starts with a space).
			if (cnt <= 1)
				break;

			// If the first character on the next line is whitespace, fold the
			// lines.  Anything else (including the end of the text) finishes
			// the line and is left unconsumed.
			c = **header;
			if (c != ' ' && c != '\t')
				break;

			(*header)++;
			buf[cnt-1] = c; // Replace \n with the white space character.
		}
	}

	if (buf != NULL && cnt >= 0)
		buf[cnt] = '\0';

	if (buffer)
		*buffer = buf;
	else if (buf)
		free(buf);

	if (buflen)
		*buflen = len;

	return cnt;
}
1038 
1039 
1040 _EXPORT void
1041 trim_white_space(BString &string)
1042 {
1043 	int32 i;
1044 	int32 length = string.Length();
1045 	char *buffer = string.LockBuffer(length + 1);
1046 
1047 	while (length > 0 && isspace(buffer[length - 1]))
1048 		length--;
1049 	buffer[length] = '\0';
1050 
1051 	for (i = 0; buffer[i] && isspace(buffer[i]); i++) {}
1052 	if (i != 0) {
1053 		length -= i;
1054 		memmove(buffer,buffer + i,length + 1);
1055 	}
1056 	string.UnlockBuffer(length);
1057 }
1058 
1059 
1060 /** Tries to return a human-readable name from the specified
1061  *	header parameter (should be from "To:" or "From:").
1062  *	Tries to return the name rather than the eMail address.
1063  */
1064 
1065 _EXPORT void
1066 extract_address_name(BString &header)
1067 {
1068 	BString name;
1069 	const char *start = header.String();
1070 	const char *stop = start + strlen (start);
1071 
1072 	// Find a string S in the header (email foo) that matches:
1073 	//   Old style name in brackets: foo@bar.com (S)
1074 	//   New style quotes: "S" <foo@bar.com>
1075 	//   New style no quotes if nothing else found: S <foo@bar.com>
1076 	//   If nothing else found then use the whole thing: S
1077 
1078 	for (int i = 0; i <= 3; i++) {
1079 		// Set p1 to the first letter in the name and p2 to just past the last
1080 		// letter in the name.  p2 stays NULL if a name wasn't found in this
1081 		// pass.
1082 		const char *p1 = NULL, *p2 = NULL;
1083 
1084 		switch (i) {
1085 			case 0: // foo@bar.com (S)
1086 				if ((p1 = strchr(start,'(')) != NULL) {
1087 					p1++; // Advance to first letter in the name.
1088 					size_t nest = 1; // Handle nested brackets.
1089 					for (p2 = p1; p2 < stop; ++p2)
1090 					{
1091 						if (*p2 == ')')
1092 							--nest;
1093 						else if (*p2 == '(')
1094 							++nest;
1095 						if (nest <= 0)
1096 							break;
1097 					}
1098 					if (nest != 0)
1099 						p2 = NULL; // False alarm, no terminating bracket.
1100 				}
1101 				break;
1102 			case 1: // "S" <foo@bar.com>
1103 				if ((p1 = strchr(start, '\"')) != NULL)
1104 					p2 = strchr(++p1, '\"');
1105 				break;
1106 			case 2: // S <foo@bar.com>
1107 				p1 = start;
1108 				if (name.Length() == 0)
1109 					p2 = strchr(start, '<');
1110 				break;
1111 			case 3: // S
1112 				p1 = start;
1113 				if (name.Length() == 0)
1114 					p2 = stop;
1115 				break;
1116 		}
1117 
1118 		// Remove leading and trailing space-like characters and save the
1119 		// result if it is longer than any other likely names found.
1120 		if (p2 != NULL) {
1121 			while (p1 < p2 && (isspace (*p1)))
1122 				++p1;
1123 
1124 			while (p1 < p2 && (isspace (p2[-1])))
1125 				--p2;
1126 
1127 			int newLength = p2 - p1;
1128 			if (name.Length() < newLength)
1129 				name.SetTo(p1, newLength);
1130 		}
1131 	}
1132 
1133 	int32 lessIndex = name.FindFirst('<');
1134 	int32 greaterIndex = name.FindLast('>');
1135 
1136 	if (lessIndex == 0) {
1137 		// Have an address of the form <address> and nothing else, so remove
1138 		// the greater and less than signs, if any.
1139 		if (greaterIndex > 0)
1140 			name.Remove(greaterIndex, 1);
1141 		name.Remove(lessIndex, 1);
1142 	} else if (lessIndex > 0 && lessIndex < greaterIndex) {
1143 		// Yahoo stupidly inserts the e-mail address into the name string, so
1144 		// this bit of code fixes: "Joe <joe@yahoo.com>" <joe@yahoo.com>
1145 		name.Remove(lessIndex, greaterIndex - lessIndex + 1);
1146 	}
1147 
1148 	trim_white_space(name);
1149 	header = name;
1150 }
1151 
1152 
1153 
1154 // Given a subject in a BString, remove the extraneous RE: re: and other stuff
1155 // to get down to the core subject string, which should be identical for all
1156 // messages posted about a topic.  The input string is modified in place to
1157 // become the output core subject string.
1158 
1159 static int32				gLocker = 0;
1160 static size_t				gNsub = 1;
1161 static re_pattern_buffer	gRe;
1162 static re_pattern_buffer   *gRebuf = NULL;
1163 static unsigned char					gTranslation[256];
1164 
1165 _EXPORT void SubjectToThread (BString &string)
1166 {
1167 // a regex that matches a non-ASCII UTF8 character:
1168 #define U8C \
1169 	"[\302-\337][\200-\277]" \
1170 	"|\340[\302-\337][\200-\277]" \
1171 	"|[\341-\357][\200-\277][\200-\277]" \
1172 	"|\360[\220-\277][\200-\277][\200-\277]" \
1173 	"|[\361-\367][\200-\277][\200-\277][\200-\277]" \
1174 	"|\370[\210-\277][\200-\277][\200-\277][\200-\277]" \
1175 	"|[\371-\373][\200-\277][\200-\277][\200-\277][\200-\277]" \
1176 	"|\374[\204-\277][\200-\277][\200-\277][\200-\277][\200-\277]" \
1177 	"|\375[\200-\277][\200-\277][\200-\277][\200-\277][\200-\277]"
1178 
1179 #define PATTERN \
1180 	"^ +" \
1181 	"|^(\\[[^]]*\\])(\\<|  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1182 	"|^(  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1183 	"| *\\(fwd\\) *$"
1184 
1185 	if (gRebuf == NULL && atomic_add(&gLocker,1) == 0)
1186 	{
1187 		// the idea is to compile the regexp once to speed up testing
1188 
1189 		for (int i=0; i<256; ++i) gTranslation[i]=i;
1190 		for (int i='a'; i<='z'; ++i) gTranslation[i]=toupper(i);
1191 
1192 		gRe.translate = gTranslation;
1193 		gRe.regs_allocated = REGS_FIXED;
1194 		re_syntax_options = RE_SYNTAX_POSIX_EXTENDED;
1195 
1196 		const char *pattern = PATTERN;
1197 		// count subexpressions in PATTERN
1198 		for (unsigned int i=0; pattern[i] != 0; ++i)
1199 		{
1200 			if (pattern[i] == '\\')
1201 				++i;
1202 			else if (pattern[i] == '(')
1203 				++gNsub;
1204 		}
1205 
1206 		const char *err = re_compile_pattern(pattern,strlen(pattern),&gRe);
1207 		if (err == NULL)
1208 			gRebuf = &gRe;
1209 		else
1210 			fprintf(stderr, "Failed to compile the regex: %s\n", err);
1211 	}
1212 	else
1213 	{
1214 		int32 tries = 200;
1215 		while (gRebuf == NULL && tries-- > 0)
1216 			snooze(10000);
1217 	}
1218 
1219 	if (gRebuf)
1220 	{
1221 		struct re_registers regs;
1222 		// can't be static if this function is to be thread-safe
1223 
1224 		regs.num_regs = gNsub;
1225 		regs.start = (regoff_t*)malloc(gNsub*sizeof(regoff_t));
1226 		regs.end = (regoff_t*)malloc(gNsub*sizeof(regoff_t));
1227 
1228 		for (int start=0;
1229 		    (start=re_search(gRebuf, string.String(), string.Length(),
1230 							0, string.Length(), &regs)) >= 0;
1231 			)
1232 		{
1233 			//
1234 			// we found something
1235 			//
1236 
1237 			// don't delete [bemaildaemon]...
1238 			if (start == regs.start[1])
1239 				start = regs.start[2];
1240 
1241 			string.Remove(start,regs.end[0]-start);
1242 			if (start) string.Insert(' ',1,start);
1243 		}
1244 
1245 		free(regs.start);
1246 		free(regs.end);
1247 	}
1248 
1249 	// Finally remove leading and trailing space.  Some software, like
1250 	// tm-edit 1.8, appends a space to the subject, which would break
1251 	// threading if we left it in.
1252 	trim_white_space(string);
1253 }
1254 
1255 
1256 
1257 // Converts a date to a time.  Handles numeric time zones too, unlike
1258 // parsedate.  Returns -1 if it fails.
1259 
1260 _EXPORT time_t ParseDateWithTimeZone (const char *DateString)
1261 {
1262 	time_t	currentTime;
1263 	time_t	dateAsTime;
1264 	char	tempDateString [80];
1265 	char	tempZoneString [6];
1266 	time_t	zoneDeltaTime;
1267 	int		zoneIndex;
1268 	char   *zonePntr;
1269 
1270 	// See if we can remove the time zone portion.  parsedate understands time
1271 	// zone 3 letter names, but doesn't understand the numeric +9999 time zone
1272 	// format.  To do: see if a newer parsedate exists.
1273 
1274 	strncpy (tempDateString, DateString, sizeof (tempDateString));
1275 	tempDateString[sizeof (tempDateString) - 1] = 0;
1276 
1277 	// Remove trailing spaces.
1278 	zonePntr = tempDateString + strlen (tempDateString) - 1;
1279 	while (zonePntr >= tempDateString && isspace (*zonePntr))
1280 		*zonePntr-- = 0;
1281 	if (zonePntr < tempDateString)
1282 		return -1; // Empty string.
1283 
1284 	// Remove the trailing time zone in round brackets, like in
1285 	// Fri, 22 Feb 2002 15:22:42 EST (-0500)
1286 	// Thu, 25 Apr 1996 11:44:19 -0400 (EDT)
1287 	if (tempDateString[strlen(tempDateString)-1] == ')')
1288 	{
1289 		zonePntr = strrchr (tempDateString, '(');
1290 		if (zonePntr != NULL)
1291 		{
1292 			*zonePntr-- = 0; // Zap the '(', then remove trailing spaces.
1293 			while (zonePntr >= tempDateString && isspace (*zonePntr))
1294 				*zonePntr-- = 0;
1295 			if (zonePntr < tempDateString)
1296 				return -1; // Empty string.
1297 		}
1298 	}
1299 
1300 	// Look for a numeric time zone like  Tue, 30 Dec 2003 05:01:40 +0000
1301 	for (zoneIndex = strlen (tempDateString); zoneIndex >= 0; zoneIndex--)
1302 	{
1303 		zonePntr = tempDateString + zoneIndex;
1304 		if (zonePntr[0] == '+' || zonePntr[0] == '-')
1305 		{
1306 			if (zonePntr[1] >= '0' && zonePntr[1] <= '9' &&
1307 				zonePntr[2] >= '0' && zonePntr[2] <= '9' &&
1308 				zonePntr[3] >= '0' && zonePntr[3] <= '9' &&
1309 				zonePntr[4] >= '0' && zonePntr[4] <= '9')
1310 				break;
1311 		}
1312 	}
1313 	if (zoneIndex >= 0)
1314 	{
1315 		// Remove the zone from the date string and any following time zone
1316 		// letter codes.  Also put in GMT so that the date gets parsed as GMT.
1317 		memcpy (tempZoneString, zonePntr, 5);
1318 		tempZoneString [5] = 0;
1319 		strcpy (zonePntr, "GMT");
1320 	}
1321 	else // No numeric time zone found.
1322 		strcpy (tempZoneString, "+0000");
1323 
1324 	time (&currentTime);
1325 	dateAsTime = parsedate (tempDateString, currentTime);
1326 	if (dateAsTime == (time_t) -1)
1327 		return -1; // Failure.
1328 
1329 	zoneDeltaTime = 60 * atol (tempZoneString + 3); // Get the last two digits - minutes.
1330 	tempZoneString[3] = 0;
1331 	zoneDeltaTime += atol (tempZoneString + 1) * 60 * 60; // Get the first two digits - hours.
1332 	if (tempZoneString[0] == '+')
1333 		zoneDeltaTime = 0 - zoneDeltaTime;
1334 	dateAsTime += zoneDeltaTime;
1335 
1336 	return dateAsTime;
1337 }
1338 
1339 
1340 /** Parses a mail header and fills the headers BMessage
1341  */
1342 
1343 _EXPORT status_t
1344 parse_header(BMessage &headers, BPositionIO &input)
1345 {
1346 	char *buffer = NULL;
1347 	size_t bufferSize = 0;
1348 	int32 length;
1349 
1350 	while ((length = readfoldedline(input, &buffer, &bufferSize)) >= 2) {
1351 		--length;
1352 			// Don't include the \n at the end of the buffer.
1353 
1354 		// convert to UTF-8 and null-terminate the buffer
1355 		length = rfc2047_to_utf8(&buffer, &bufferSize, length);
1356 		buffer[length] = '\0';
1357 
1358 		const char *delimiter = strstr(buffer, ":");
1359 		if (delimiter == NULL)
1360 			continue;
1361 
1362 		BString header(buffer, delimiter - buffer);
1363 		header.CapitalizeEachWord();
1364 			// unified case for later fetch
1365 
1366 		delimiter++; // Skip the colon.
1367 		while (isspace (*delimiter))
1368 			delimiter++; // Skip over leading white space and tabs.  To do: (comments in brackets).
1369 
1370 		// ToDo: implement joining of multiple header tags (i.e. multiple "Cc:"s)
1371 		headers.AddString(header.String(), delimiter);
1372 	}
1373 	free(buffer);
1374 
1375 	return B_OK;
1376 }
1377 
1378 
1379 _EXPORT void
1380 extract_address(BString &address)
1381 {
1382 	const char *string = address.String();
1383 	int32 first;
1384 
1385 	// first, remove all quoted text
1386 
1387 	if ((first = address.FindFirst('"')) >= 0) {
1388 		int32 last = first + 1;
1389 		while (string[last] && string[last] != '"')
1390 			last++;
1391 
1392 		if (string[last] == '"')
1393 			address.Remove(first, last + 1 - first);
1394 	}
1395 
1396 	// try to extract the address now
1397 
1398 	if ((first = address.FindFirst('<')) >= 0) {
1399 		// the world likes us and we can just get the address the easy way...
1400 		int32 last = address.FindFirst('>');
1401 		if (last >= 0) {
1402 			address.Truncate(last);
1403 			address.Remove(0, first + 1);
1404 
1405 			return;
1406 		}
1407 	}
1408 
1409 	// then, see if there is anything in parenthesis to throw away
1410 
1411 	if ((first = address.FindFirst('(')) >= 0) {
1412 		int32 last = first + 1;
1413 		while (string[last] && string[last] != ')')
1414 			last++;
1415 
1416 		if (string[last] == ')')
1417 			address.Remove(first, last + 1 - first);
1418 	}
1419 
1420 	// now, there shouldn't be much else left
1421 
1422 	trim_white_space(address);
1423 }
1424 
1425 
1426 _EXPORT void
1427 get_address_list(BList &list, const char *string, void (*cleanupFunc)(BString &))
1428 {
1429 	if (string == NULL || !string[0])
1430 		return;
1431 
1432 	const char *start = string;
1433 
1434 	while (true) {
1435 		if (string[0] == '"') {
1436 			const char *quoteEnd = ++string;
1437 
1438 			while (quoteEnd[0] && quoteEnd[0] != '"')
1439 				quoteEnd++;
1440 
1441 			if (!quoteEnd[0])	// string exceeds line!
1442 				quoteEnd = string;
1443 
1444 			string = quoteEnd + 1;
1445 		}
1446 
1447 		if (string[0] == ',' || string[0] == '\0') {
1448 			BString address(start, string - start);
1449 			trim_white_space(address);
1450 
1451 			if (cleanupFunc)
1452 				cleanupFunc(address);
1453 
1454 			list.AddItem(strdup(address.String()));
1455 
1456 			start = string + 1;
1457 		}
1458 
1459 		if (!string[0])
1460 			break;
1461 
1462 		string++;
1463 	}
1464 }
1465 
1466