xref: /haiku/src/kits/mail/mail_util.cpp (revision 95bac3fda53a4cb21880712d7b43f8c21db32a2e)
1 /* mail util - header parsing
2 **
3 ** Copyright 2001-2003 Dr. Zoidberg Enterprises. All rights reserved.
4 */
5 
6 
7 #include <UTF8.h>
8 #include <Message.h>
9 #include <String.h>
10 #include <Locker.h>
11 #include <DataIO.h>
12 #include <List.h>
13 
14 #include <stdlib.h>
15 #include <string.h>
16 #include <stdio.h>
17 #include <regex.h>
18 #include <ctype.h>
19 #include <errno.h>
20 #include <parsedate.h>
21 
22 #include <mail_encoding.h>
23 
24 #include <mail_util.h>
25 
26 #include <CharacterSet.h>
27 #include <CharacterSetRoster.h>
28 
29 using namespace BPrivate;
30 
31 #define CRLF   "\r\n"
32 
33 struct CharsetConversionEntry
34 {
35 	const char *charset;
36 	uint32 flavor;
37 };
38 
39 extern const CharsetConversionEntry mail_charsets [] =
40 {
41 	// In order of authority, so when searching for the name for a particular
42 	// numbered conversion, start at the beginning of the array.
43 	{"iso-8859-1",  B_ISO1_CONVERSION}, // MIME STANDARD
44 	{"iso-8859-2",  B_ISO2_CONVERSION}, // MIME STANDARD
45 	{"iso-8859-3",  B_ISO3_CONVERSION}, // MIME STANDARD
46 	{"iso-8859-4",  B_ISO4_CONVERSION}, // MIME STANDARD
47 	{"iso-8859-5",  B_ISO5_CONVERSION}, // MIME STANDARD
48 	{"iso-8859-6",  B_ISO6_CONVERSION}, // MIME STANDARD
49 	{"iso-8859-7",  B_ISO7_CONVERSION}, // MIME STANDARD
50 	{"iso-8859-8",  B_ISO8_CONVERSION}, // MIME STANDARD
51 	{"iso-8859-9",  B_ISO9_CONVERSION}, // MIME STANDARD
52 	{"iso-8859-10", B_ISO10_CONVERSION}, // MIME STANDARD
53 	{"iso-8859-13", B_ISO13_CONVERSION}, // MIME STANDARD
54 	{"iso-8859-14", B_ISO14_CONVERSION}, // MIME STANDARD
55 	{"iso-8859-15", B_ISO15_CONVERSION}, // MIME STANDARD
56 
57 	{"shift_jis",	B_SJIS_CONVERSION}, // MIME STANDARD
58 	{"shift-jis",	B_SJIS_CONVERSION},
59 	{"iso-2022-jp", B_JIS_CONVERSION}, // MIME STANDARD
60 	{"euc-jp",		B_EUC_CONVERSION}, // MIME STANDARD
61 
62 	{"euc-kr",      B_EUC_KR_CONVERSION}, // Shift encoding 7 bit and KSC-5601 if bit 8 is on. // MIME STANDARD
63 	{"ksc5601",		B_EUC_KR_CONVERSION},    // Not sure if 7 or 8 bit. // COMPATIBLE?
64 	{"ks_c_5601-1987", B_EUC_KR_CONVERSION}, // Not sure if 7 or 8 bit. // COMPATIBLE with stupid MS software
65 
66 	{"koi8-r",      B_KOI8R_CONVERSION},           // MIME STANDARD
67 	{"windows-1251",B_MS_WINDOWS_1251_CONVERSION}, // MIME STANDARD
68 	{"windows-1252",B_MS_WINDOWS_CONVERSION},      // MIME STANDARD
69 
70 	{"dos-437",     B_MS_DOS_CONVERSION},     // WRONG NAME : MIME STANDARD NAME = NONE ( IBM437? )
71 	{"dos-866",     B_MS_DOS_866_CONVERSION}, // WRONG NAME : MIME STANDARD NAME = NONE ( IBM866? )
72 	{"x-mac-roman", B_MAC_ROMAN_CONVERSION},  // WRONG NAME : MIME STANDARD NAME = NONE ( macintosh? + x-mac-roman? )
73 
74     {"big5",        24}, // MIME STANDARD
75 
76     {"gb18030",     25}, // WRONG NAME : MIME STANDARD NAME = NONE ( GB18030? )
77     {"gb2312",      25}, // COMPATIBLE
78     {"gbk",         25}, // COMPATIBLE
79 
80 	/* {"utf-16",		B_UNICODE_CONVERSION}, Might not work due to NULs in text, needs testing. */
81 	{"us-ascii",	B_MAIL_US_ASCII_CONVERSION},                                  // MIME STANDARD
82 	{"utf-8",		B_MAIL_UTF8_CONVERSION /* Special code for no conversion */}, // MIME STANDARD
83 
84 	{NULL, (uint32) -1} /* End of list marker, NULL string pointer is the key. */
85 };
86 
87 
88 // The next couple of functions are our wrapper around convert_to_utf8 and
89 // convert_from_utf8 so that they can also convert from UTF-8 to UTF-8 by
90 // specifying the B_MAIL_UTF8_CONVERSION constant as the conversion operation.  It
91 // also lets us add new conversions, like B_MAIL_US_ASCII_CONVERSION.
92 
93 _EXPORT status_t mail_convert_to_utf8 (
94 	uint32 srcEncoding,
95 	const char *src,
96 	int32 *srcLen,
97 	char *dst,
98 	int32 *dstLen,
99 	int32 *state,
100 	char substitute)
101 {
102 	int32    copyAmount;
103 	char    *originalDst = dst;
104 	status_t returnCode = -1;
105 
106 	if (srcEncoding == B_MAIL_UTF8_CONVERSION) {
107 		copyAmount = *srcLen;
108 		if (*dstLen < copyAmount)
109 			copyAmount = *dstLen;
110 		memcpy (dst, src, copyAmount);
111 		*srcLen = copyAmount;
112 		*dstLen = copyAmount;
113 		returnCode = B_OK;
114 	} else if (srcEncoding == B_MAIL_US_ASCII_CONVERSION) {
115 		int32 i;
116 		unsigned char letter;
117 		copyAmount = *srcLen;
118 		if (*dstLen < copyAmount)
119 			copyAmount = *dstLen;
120 		for (i = 0; i < copyAmount; i++) {
121 			letter = *src++;
122 			if (letter > 0x80U)
123 				// Invalid, could also use substitute, but better to strip high bit.
124 				*dst++ = letter - 0x80U;
125 			else if (letter == 0x80U)
126 				// Can't convert to 0x00 since that's NUL, which would cause problems.
127 				*dst++ = substitute;
128 			else
129 				*dst++ = letter;
130 		}
131 		*srcLen = copyAmount;
132 		*dstLen = copyAmount;
133 		returnCode = B_OK;
134 	} else
135 		returnCode = convert_to_utf8 (srcEncoding, src, srcLen,
136 			dst, dstLen, state, substitute);
137 
138 	if (returnCode == B_OK) {
139 		// Replace spurious NUL bytes, which should normally not be in the
140 		// output of the decoding (not normal UTF-8 characters, and no NULs are
141 		// in our usual input strings).  They happen for some odd ISO-2022-JP
142 		// byte pair combinations which are improperly handled by the BeOS
143 		// routines.  Like "\e$ByD\e(B" where \e is the ESC character $1B, the
144 		// first ESC $ B switches to a Japanese character set, then the next
145 		// two bytes "yD" specify a character, then ESC ( B switches back to
146 		// the ASCII character set.  The UTF-8 conversion yields a NUL byte.
147 		int32 i;
148 		for (i = 0; i < *dstLen; i++)
149 			if (originalDst[i] == 0)
150 				originalDst[i] = substitute;
151 	}
152 	return returnCode;
153 }
154 
155 
156 _EXPORT status_t mail_convert_from_utf8 (
157 	uint32 dstEncoding,
158 	const char *src,
159 	int32 *srcLen,
160 	char *dst,
161 	int32 *dstLen,
162 	int32 *state,
163 	char substitute)
164 {
165 	int32		copyAmount;
166 	status_t	errorCode;
167 	int32		originalDstLen = *dstLen;
168 	int32		tempDstLen;
169 	int32		tempSrcLen;
170 
171 	if (dstEncoding == B_MAIL_UTF8_CONVERSION)
172 	{
173 		copyAmount = *srcLen;
174 		if (*dstLen < copyAmount)
175 			copyAmount = *dstLen;
176 		memcpy (dst, src, copyAmount);
177 		*srcLen = copyAmount;
178 		*dstLen = copyAmount;
179 		return B_OK;
180 	}
181 
182 	if (dstEncoding == B_MAIL_US_ASCII_CONVERSION)
183 	{
184 		int32			characterLength;
185 		int32			dstRemaining = *dstLen;
186 		unsigned char	letter;
187 		int32			srcRemaining = *srcLen;
188 
189 		// state contains the number of source bytes to skip, left over from a
190 		// partial UTF-8 character split over the end of the buffer from last
191 		// time.
192 		if (srcRemaining <= *state) {
193 			*state -= srcRemaining;
194 			*dstLen = 0;
195 			return B_OK;
196 		}
197 		srcRemaining -= *state;
198 		src += *state;
199 		*state = 0;
200 
201 		while (true) {
202 			if (srcRemaining <= 0 || dstRemaining <= 0)
203 				break;
204 			letter = *src;
205 			if (letter < 0x80)
206 				characterLength = 1; // Regular ASCII equivalent code.
207 			else if (letter < 0xC0)
208 				characterLength = 1; // Invalid in-between data byte 10xxxxxx.
209 			else if (letter < 0xE0)
210 				characterLength = 2;
211 			else if (letter < 0xF0)
212 				characterLength = 3;
213 			else if (letter < 0xF8)
214 				characterLength = 4;
215 			else if (letter < 0xFC)
216 				characterLength = 5;
217 			else if (letter < 0xFE)
218 				characterLength = 6;
219 			else
220 				characterLength = 1; // 0xFE and 0xFF are invalid in UTF-8.
221 			if (letter < 0x80)
222 				*dst++ = *src;
223 			else
224 				*dst++ = substitute;
225 			dstRemaining--;
226 			if (srcRemaining < characterLength) {
227 				// Character split past the end of the buffer.
228 				*state = characterLength - srcRemaining;
229 				srcRemaining = 0;
230 			} else {
231 				src += characterLength;
232 				srcRemaining -= characterLength;
233 			}
234 		}
235 		// Update with the amounts used.
236 		*srcLen = *srcLen - srcRemaining;
237 		*dstLen = *dstLen - dstRemaining;
238 		return B_OK;
239 	}
240 
241 	errorCode = convert_from_utf8 (dstEncoding, src, srcLen, dst, dstLen, state, substitute);
242 	if (errorCode != B_OK)
243 		return errorCode;
244 
245 	if (dstEncoding != B_JIS_CONVERSION)
246 		return B_OK;
247 
248 	// B_JIS_CONVERSION (ISO-2022-JP) works by shifting between different
249 	// character subsets.  For E-mail headers (and other uses), it needs to be
250 	// switched back to ASCII at the end (otherwise the last character gets
251 	// lost or other weird things happen in the headers).  Note that we can't
252 	// just append the escape code since the convert_from_utf8 "state" will be
253 	// wrong.  So we append an ASCII letter and throw it away, leaving just the
254 	// escape code.  Well, it actually switches to the Roman character set, not
255 	// ASCII, but that should be OK.
256 
257 	tempDstLen = originalDstLen - *dstLen;
258 	if (tempDstLen < 3) // Not enough space remaining in the output.
259 		return B_OK; // Sort of an error, but we did convert the rest OK.
260 	tempSrcLen = 1;
261 	errorCode = convert_from_utf8 (dstEncoding, "a", &tempSrcLen,
262 		dst + *dstLen, &tempDstLen, state, substitute);
263 	if (errorCode != B_OK)
264 		return errorCode;
265 	*dstLen += tempDstLen - 1 /* don't include the ASCII letter */;
266 	return B_OK;
267 }
268 
269 
270 
271 static int handle_non_rfc2047_encoding(char **buffer,size_t *bufferLength,size_t *sourceLength)
272 {
273 	char *string = *buffer;
274 	int32 length = *sourceLength;
275 	int32 i;
276 
277 	// check for 8-bit characters
278 	for (i = 0;i < length;i++)
279 		if (string[i] & 0x80)
280 			break;
281 	if (i == length)
282 		return false;
283 
284 	// check for groups of 8-bit characters - this code is not very smart;
285 	// it just can detect some sort of single-byte encoded stuff, the rest
286 	// is regarded as UTF-8
287 
288 	int32 singletons = 0,doubles = 0;
289 
290 	for (i = 0;i < length;i++)
291 	{
292 		if (string[i] & 0x80)
293 		{
294 			if ((string[i + 1] & 0x80) == 0)
295 				singletons++;
296 			else doubles++;
297 			i++;
298 		}
299 	}
300 
301 	if (singletons != 0)	// can't be valid UTF-8 anymore, so we assume ISO-Latin-1
302 	{
303 		int32 state = 0;
304 		// just to be sure
305 		int32 destLength = length * 4 + 1;
306 		int32 destBufferLength = destLength;
307 		char *dest = (char *)malloc(destLength);
308 		if (dest == NULL)
309 			return 0;
310 
311 		if (convert_to_utf8(B_ISO1_CONVERSION,string,&length,dest,&destLength,&state) == B_OK)
312 		{
313 			free(*buffer);
314 			*buffer = dest;
315 			*bufferLength = destBufferLength;
316 			*sourceLength = destLength;
317 			return true;
318 		}
319 		free(dest);
320 		return false;
321 	}
322 
323 	// we assume a valid UTF-8 string here, but yes, we don't check it
324 	return true;
325 }
326 
327 
328 _EXPORT ssize_t rfc2047_to_utf8(char **bufp, size_t *bufLen, size_t strLen)
329 {
330 	char *string = *bufp;
331 	char *head, *tail;
332 	char *charset, *encoding, *end;
333 	ssize_t ret = B_OK;
334 
335 	if (bufp == NULL || *bufp == NULL)
336 		return -1;
337 
338 	//---------Handle *&&^%*&^ non-RFC compliant, 8bit mail
339 	if (handle_non_rfc2047_encoding(bufp,bufLen,&strLen))
340 		return strLen;
341 
342 	// set up string length
343 	if (strLen == 0)
344 		strLen = strlen(*bufp);
345 	char lastChar = (*bufp)[strLen];
346 	(*bufp)[strLen] = '\0';
347 
348 	//---------Whew! Now for RFC compliant mail
349 	bool encodedWordFoundPreviously = false;
350 	for (head = tail = string;
351 		((charset = strstr(tail, "=?")) != NULL)
352 		&& (((encoding = strchr(charset + 2, '?')) != NULL)
353 			&& encoding[1] && (encoding[2] == '?') && encoding[3])
354 		&& (end = strstr(encoding + 3, "?=")) != NULL;
355 		// found "=?...charset...?e?...text...?=   (e == encoding)
356 		//        ^charset       ^encoding    ^end
357 		tail = end)
358 	{
359 		// Copy non-encoded text (from tail up to charset) to the output.
360 		// Ignore spaces between two encoded "words".  RFC2047 says the words
361 		// should be concatenated without the space (designed for Asian
362 		// sentences which have no spaces yet need to be broken into "words" to
363 		// keep within the line length limits).
364 		bool nonSpaceFound = false;
365 		for (int i = 0; i < charset-tail; i++) {
366 			if (!isspace (tail[i])) {
367 				nonSpaceFound = true;
368 				break;
369 			}
370 		}
371 		if (!encodedWordFoundPreviously || nonSpaceFound) {
372 			if (string != tail && tail != charset)
373 				memmove(string, tail, charset-tail);
374 			string += charset-tail;
375 		}
376 		tail = charset;
377 		encodedWordFoundPreviously = true;
378 
379 		// move things to point at what they should:
380 		//   =?...charset...?e?...text...?=   (e == encoding)
381 		//     ^charset      ^encoding     ^end
382 		charset += 2;
383 		encoding += 1;
384 		end += 2;
385 
386 		// find the charset this text is in now
387 		size_t		cLen = encoding - 1 - charset;
388 		bool		base64encoded = toupper(*encoding) == 'B';
389 
390 		uint32 convert_id = B_MAIL_NULL_CONVERSION;
391 		char charset_string[cLen+1];
392 		memcpy(charset_string, charset, cLen);
393 		charset_string[cLen] = '\0';
394 		if (strcasecmp(charset_string, "us-ascii") == 0) {
395 			convert_id = B_MAIL_US_ASCII_CONVERSION;
396 		} else if (strcasecmp(charset_string, "utf-8") == 0) {
397 			convert_id = B_MAIL_UTF8_CONVERSION;
398 		} else {
399 			const BCharacterSet * cs = BCharacterSetRoster::FindCharacterSetByName(charset_string);
400 			if (cs != NULL) {
401 				convert_id = cs->GetConversionID();
402 			}
403 		}
404 		if (convert_id == B_MAIL_NULL_CONVERSION)
405 		{
406 			// unidentified charset
407 			// what to do? doing nothing skips the encoded text;
408 			// but we should keep it: we copy it to the output.
409 			if (string != tail && tail != end)
410 				memmove(string, tail, end-tail);
411 			string += end-tail;
412 			continue;
413 		}
414 		// else we've successfully identified the charset
415 
416 		char *src = encoding+2;
417 		int32 srcLen = end - 2 - src;
418 		// encoded text: src..src+srcLen
419 
420 		// decode text, get decoded length (reducing xforms)
421 		srcLen = !base64encoded ? decode_qp(src, src, srcLen, 1)
422 				: decode_base64(src, src, srcLen);
423 
424 		// allocate space for the converted text
425 		int32 dstLen = end-string + *bufLen-strLen;
426 		char *dst = (char*)malloc(dstLen);
427 		int32 cvLen = srcLen;
428 		int32 convState = 0;
429 
430 		//
431 		// do the conversion
432 		//
433 		ret = mail_convert_to_utf8(convert_id, src, &cvLen, dst, &dstLen, &convState);
434 		if (ret != B_OK)
435 		{
436 			// what to do? doing nothing skips the encoded text
437 			// but we should keep it: we copy it to the output.
438 
439 			free(dst);
440 
441 			if (string != tail && tail != end)
442 				memmove(string, tail, end-tail);
443 			string += end-tail;
444 			continue;
445 		}
446 		/* convert_to_ is either returning something wrong or my
447 		   test data is screwed up.  Whatever it is, Not Enough
448 		   Space is not the only cause of the below, so we just
449 		   assume it succeeds if it converts anything at all.
450 		else if (cvLen < srcLen)
451 		{
452 			// not enough room to convert the data;
453 			// grow *buf and retry
454 
455 			free(dst);
456 
457 			char *temp = (char*)realloc(*bufp, 2*(*bufLen + 1));
458 			if (temp == NULL)
459 			{
460 				ret = B_NO_MEMORY;
461 				break;
462 			}
463 
464 			*bufp = temp;
465 			*bufLen = 2*(*bufLen + 1);
466 
467 			string = *bufp + (string-head);
468 			tail = *bufp + (tail-head);
469 			charset = *bufp + (charset-head);
470 			encoding = *bufp + (encoding-head);
471 			end = *bufp + (end-head);
472 			src = *bufp + (src-head);
473 			head = *bufp;
474 			continue;
475 		}
476 		*/
477 		else
478 		{
479 			if (dstLen > end-string)
480 			{
481 				// copy the string forward...
482 				memmove(string+dstLen, end, strLen - (end-head) + 1);
483 				strLen += string+dstLen - end;
484 				end = string + dstLen;
485 			}
486 
487 			memcpy(string, dst, dstLen);
488 			string += dstLen;
489 			free(dst);
490 			continue;
491 		}
492 	}
493 
494 	// copy everything that's left
495 	size_t tailLen = strLen - (tail - head);
496 	memmove(string, tail, tailLen+1);
497 	string += tailLen;
498 
499 	// replace the last char
500 	(*bufp)[strLen] = lastChar;
501 
502 	return ret < B_OK ? ret : string-head;
503 }
504 
505 
506 _EXPORT ssize_t utf8_to_rfc2047 (char **bufp, ssize_t length, uint32 charset, char encoding) {
507 	struct word {
508 		BString	originalWord;
509 		BString	convertedWord;
510 		bool	needsEncoding;
511 
512 		// Convert the word from UTF-8 to the desired character set.  The
513 		// converted version also includes the escape codes to return to ASCII
514 		// mode, if relevant.  Also note if it uses unprintable characters,
515 		// which means it will need that special encoding treatment later.
516 		void ConvertWordToCharset (uint32 charset) {
517 			int32 state = 0;
518 			int32 originalLength = originalWord.Length();
519 			int32 convertedLength = originalLength * 5 + 1;
520 			char *convertedBuffer = convertedWord.LockBuffer (convertedLength);
521 			mail_convert_from_utf8 (charset, originalWord.String(),
522 				&originalLength, convertedBuffer, &convertedLength, &state);
523 			for (int i = 0; i < convertedLength; i++) {
524 				if ((convertedBuffer[i] & (1 << 7)) ||
525 					(convertedBuffer[i] >= 0 && convertedBuffer[i] < 32)) {
526 					needsEncoding = true;
527 					break;
528 				}
529 			}
530 			convertedWord.UnlockBuffer (convertedLength);
531 		};
532 	};
533 	struct word *currentWord;
534 	BList words;
535 
536 	// Break the header into words.  White space characters (including tabs and
537 	// newlines) separate the words.  Each word includes any space before it as
538 	// part of the word.  Actually, quotes and other special characters
539 	// (",()<>@) are treated as separate words of their own so that they don't
540 	// get encoded (because MIME headers get the quotes parsed before character
541 	// set unconversion is done).  The reader is supposed to ignore all white
542 	// space between encoded words, which can be inserted so that older mail
543 	// parsers don't have overly long line length problems.
544 
545 	const char *source = *bufp;
546 	const char *bufEnd = *bufp + length;
547 	const char *specialChars = "\"()<>@,";
548 
549 	while (source < bufEnd) {
550 		currentWord = new struct word;
551 		currentWord->needsEncoding = false;
552 
553 		int wordEnd = 0;
554 
555 		// Include leading spaces as part of the word.
556 		while (source + wordEnd < bufEnd && isspace (source[wordEnd]))
557 			wordEnd++;
558 
559 		if (source + wordEnd < bufEnd &&
560 			strchr (specialChars, source[wordEnd]) != NULL) {
561 			// Got a quote mark or other special character, which is treated as
562 			// a word in itself since it shouldn't be encoded, which would hide
563 			// it from the mail system.
564 			wordEnd++;
565 		} else {
566 			// Find the end of the word.  Leave wordEnd pointing just after the
567 			// last character in the word.
568 			while (source + wordEnd < bufEnd) {
569 				if (isspace(source[wordEnd]) ||
570 					strchr (specialChars, source[wordEnd]) != NULL)
571 					break;
572 				if (wordEnd > 51 /* Makes Base64 ISO-2022-JP "word" a multiple of 4 bytes */ &&
573 					0xC0 == (0xC0 & (unsigned int) source[wordEnd])) {
574 					// No English words are that long (46 is the longest),
575 					// break up what is likely Asian text (which has no spaces)
576 					// at the start of the next non-ASCII UTF-8 character (high
577 					// two bits are both ones).  Note that two encoded words in
578 					// a row get joined together, even if there is a space
579 					// between them in the final output text, according to the
580 					// standard.  Next word will also be conveniently get
581 					// encoded due to the 0xC0 test.
582 					currentWord->needsEncoding = true;
583 					break;
584 				}
585 				wordEnd++;
586 			}
587 		}
588 		currentWord->originalWord.SetTo (source, wordEnd);
589 		currentWord->ConvertWordToCharset (charset);
590 		words.AddItem(currentWord);
591 		source += wordEnd;
592 	}
593 
594 	// Combine adjacent words which contain unprintable text so that the
595 	// overhead of switching back and forth between regular text and specially
596 	// encoded text is reduced.  However, the combined word must be shorter
597 	// than the maximum of 75 bytes, including character set specification and
598 	// all those delimiters (worst case 22 bytes of overhead).
599 
600 	struct word *run;
601 
602 	for (int32 i = 0; (currentWord = (struct word *) words.ItemAt (i)) != NULL; i++) {
603 		if (!currentWord->needsEncoding)
604 			continue; // No need to combine unencoded words.
605 		for (int32 g = i+1; (run = (struct word *) words.ItemAt (g)) != NULL; g++) {
606 			if (!run->needsEncoding)
607 				break; // Don't want to combine encoded and unencoded words.
608 			if ((currentWord->convertedWord.Length() + run->convertedWord.Length() <= 53)) {
609 				currentWord->originalWord.Append (run->originalWord);
610 				currentWord->ConvertWordToCharset (charset);
611 				words.RemoveItem(g);
612 				delete run;
613 				g--;
614 			} else // Can't merge this word, result would be too long.
615 				break;
616 		}
617 	}
618 
619 	// Combine the encoded and unencoded words into one line, doing the
620 	// quoted-printable or base64 encoding.  Insert an extra space between
621 	// words which are both encoded to make word wrapping easier, since there
622 	// is normally none, and you're allowed to insert space (the receiver
623 	// throws it away if it is between encoded words).
624 
625 	BString rfc2047;
626 	bool	previousWordNeededEncoding = false;
627 
628 	const char *charset_dec = "none-bug";
629 	for (int32 i = 0; mail_charsets[i].charset != NULL; i++) {
630 		if (mail_charsets[i].flavor == charset) {
631 			charset_dec = mail_charsets[i].charset;
632 			break;
633 		}
634 	}
635 
636 	while ((currentWord = (struct word *)words.RemoveItem(0L)) != NULL) {
637 		if ((encoding != quoted_printable && encoding != base64) ||
638 		!currentWord->needsEncoding) {
639 			rfc2047.Append (currentWord->convertedWord);
640 		} else {
641 			// This word needs encoding.  Try to insert a space between it and
642 			// the previous word.
643 			if (previousWordNeededEncoding)
644 				rfc2047 << ' '; // Can insert as many spaces as you want between encoded words.
645 			else {
646 				// Previous word is not encoded, spaces are significant.  Try
647 				// to move a space from the start of this word to be outside of
648 				// the encoded text, so that there is a bit of space between
649 				// this word and the previous one to enhance word wrapping
650 				// chances later on.
651 				if (currentWord->originalWord.Length() > 1 &&
652 					isspace (currentWord->originalWord[0])) {
653 					rfc2047 << currentWord->originalWord[0];
654 					currentWord->originalWord.Remove (0 /* offset */, 1 /* length */);
655 					currentWord->ConvertWordToCharset (charset);
656 				}
657 			}
658 
659 			char *encoded = NULL;
660 			ssize_t encoded_len = 0;
661 			int32 convertedLength = currentWord->convertedWord.Length ();
662 			const char *convertedBuffer = currentWord->convertedWord.String ();
663 
664 			switch (encoding) {
665 				case quoted_printable:
666 					encoded = (char *) malloc (convertedLength * 3);
667 					encoded_len = encode_qp (encoded, convertedBuffer, convertedLength, true /* headerMode */);
668 					break;
669 				case base64:
670 					encoded = (char *) malloc (convertedLength * 2);
671 					encoded_len = encode_base64 (encoded, convertedBuffer, convertedLength, true /* headerMode */);
672 					break;
673 				default: // Unknown encoding type, shouldn't happen.
674 					encoded = (char *) convertedBuffer;
675 					encoded_len = convertedLength;
676 					break;
677 			}
678 
679 			rfc2047 << "=?" << charset_dec << '?' << encoding << '?';
680 			rfc2047.Append (encoded, encoded_len);
681 			rfc2047 << "?=";
682 
683 			if (encoding == quoted_printable || encoding == base64)
684 				free(encoded);
685 		}
686 		previousWordNeededEncoding = currentWord->needsEncoding;
687 		delete currentWord;
688 	}
689 
690 	free(*bufp);
691 
692 	ssize_t finalLength = rfc2047.Length ();
693 	*bufp = (char *) (malloc (finalLength + 1));
694 	memcpy (*bufp, rfc2047.String(), finalLength);
695 	(*bufp)[finalLength] = 0;
696 
697 	return finalLength;
698 }
699 
700 
701 //====================================================================
702 
703 void FoldLineAtWhiteSpaceAndAddCRLF (BString &string)
704 {
705 	int			inputLength = string.Length();
706 	int			lineStartIndex;
707 	const int	maxLineLength = 78; // Doesn't include CRLF.
708 	BString		output;
709 	int			splitIndex;
710 	int			tempIndex;
711 
712 	lineStartIndex = 0;
713 	while (true) {
714 		// If we don't need to wrap the text, just output the remainder, if any.
715 
716 		if (lineStartIndex + maxLineLength >= inputLength) {
717 			if (lineStartIndex < inputLength) {
718 				output.Insert (string, lineStartIndex /* source offset */,
719 					inputLength - lineStartIndex /* count */,
720 					output.Length() /* insert at */);
721 				output.Append (CRLF);
722 			}
723 			break;
724 		}
725 
726 		// Look ahead for a convenient spot to split it, between a comma and
727 		// space, which you often see between e-mail addresses like this:
728 		// "Joe Who" joe@dot.com, "Someone Else" else@blot.com
729 
730 		tempIndex = lineStartIndex + maxLineLength;
731 		if (tempIndex > inputLength)
732 			tempIndex = inputLength;
733 		splitIndex = string.FindLast (", ", tempIndex);
734 		if (splitIndex >= lineStartIndex)
735 			splitIndex++; // Point to the space character.
736 
737 		// If none of those exist, try splitting at any white space.
738 
739 		if (splitIndex <= lineStartIndex)
740 			splitIndex = string.FindLast (" ", tempIndex);
741 		if (splitIndex <= lineStartIndex)
742 			splitIndex = string.FindLast ("\t", tempIndex);
743 
744 		// If none of those exist, allow for a longer word - split at the next
745 		// available white space.
746 
747 		if (splitIndex <= lineStartIndex)
748 			splitIndex = string.FindFirst (" ", lineStartIndex + 1);
749 		if (splitIndex <= lineStartIndex)
750 			splitIndex = string.FindFirst ("\t", lineStartIndex + 1);
751 
752 		// Give up, the whole rest of the line can't be split, just dump it
753 		// out.
754 
755 		if (splitIndex <= lineStartIndex) {
756 			if (lineStartIndex < inputLength) {
757 				output.Insert (string, lineStartIndex /* source offset */,
758 					inputLength - lineStartIndex /* count */,
759 					output.Length() /* insert at */);
760 				output.Append (CRLF);
761 			}
762 			break;
763 		}
764 
765 		// Do the split.  The current line up to but not including the space
766 		// gets output, followed by a CRLF.  The space remains to become the
767 		// start of the next line (and that tells the message reader that it is
768 		// a continuation line).
769 
770 		output.Insert (string, lineStartIndex /* source offset */,
771 			splitIndex - lineStartIndex /* count */,
772 			output.Length() /* insert at */);
773 		output.Append (CRLF);
774 		lineStartIndex = splitIndex;
775 	}
776 	string.SetTo (output);
777 }
778 
779 
780 //====================================================================
781 
// Reads one logical header line from a stdio stream, joining folded
// continuation lines (lines whose first character is a space or a tab).
//
// file   - stream to read from.
// buffer - address of a malloc()ed buffer pointer (the pointer may be NULL).
//          The buffer is grown with realloc() as needed and the possibly
//          moved pointer is stored back.  When buffer itself is NULL the
//          work buffer is freed before returning.
// buflen - address of the buffer size; updated when the buffer grows.
//
// Line ends are returned as a single LF: a CRLF is converted, and the line
// break in front of a continuation line is replaced by that line's leading
// white space character.  If the input ends without a line end after some
// text, a LF is supplied.
//
// Returns the number of characters stored (0 at end of file), not counting
// the terminating NUL, or a negative error code.  On error the buffer
// contents are not NUL terminated.
_EXPORT ssize_t readfoldedline(FILE *file, char **buffer, size_t *buflen)
{
	ssize_t len = buflen && *buflen ? *buflen : 0;
	char * buf = buffer && *buffer ? *buffer : NULL;
	ssize_t cnt = 0; // Number of characters currently in the buffer.
	int c;

	while (true)
	{
		// Make sure there is space in the buffer for two more characters (one
		// for the next character, and one for the end of string NUL byte).
		if (buf == NULL || cnt + 2 >= len)
		{
			char *temp = (char *)realloc(buf, len + 64);
			if (temp == NULL) {
				// Out of memory, however existing buffer remains allocated.
				cnt = ENOMEM;
				if (cnt >= 0) {
					// Error codes must be negative; a positive value would
					// be taken for a length further down.
					cnt = -1;
				}
				break;
			}
			len += 64;
			buf = temp;
		}

		// Read the next character, or end of file, or IO error.
		if ((c = fgetc(file)) == EOF) {
			if (ferror (file)) {
				cnt = errno;
				if (cnt >= 0)
					cnt = -1; // Error codes must be negative.
			} else {
				// Really is end of file.  Also make it end of line if there is
				// some text already read in.  If the first thing read was EOF,
				// just return an empty string.
				if (cnt > 0) {
					buf[cnt++] = '\n';
					if (buf[cnt-2] == '\r') {
						buf[cnt-2] = '\n';
						--cnt;
					}
				}
			}
			break;
		}

		buf[cnt++] = c;

		if (c == '\n') {
			// Convert CRLF end of line to just a LF.  Do it before folding, in
			// case we don't need to fold.
			if (cnt >= 2 && buf[cnt-2] == '\r') {
				buf[cnt-2] = '\n';
				--cnt;
			}
			// If the current line is empty then return it (so that empty lines
			// don't disappear if the next line starts with a space).
			if (cnt <= 1)
				break;
			// Fold if first character on the next line is whitespace.
			c = fgetc(file);
			if (c == ' ' || c == '\t')
				buf[cnt-1] = c; // Replace \n with the white space character.
			else {
				// Not folding, we finished reading a line.  Give the
				// look-ahead character back, unless there was none.
				if (c != EOF)
					ungetc(c,file);
				break;
			}
		}
	}


	if (buf != NULL && cnt >= 0)
		buf[cnt] = '\0';

	if (buffer)
		*buffer = buf;
	else if (buf)
		free(buf);

	if (buflen)
		*buflen = len;

	return cnt;
}
865 
866 
867 //====================================================================
868 
869 _EXPORT ssize_t readfoldedline(BPositionIO &in, char **buffer, size_t *buflen)
870 {
871 	ssize_t len = buflen && *buflen ? *buflen : 0;
872 	char * buf = buffer && *buffer ? *buffer : NULL;
873 	ssize_t cnt = 0; // Number of characters currently in the buffer.
874 	char c;
875 	status_t errorCode;
876 
877 	while (true)
878 	{
879 		// Make sure there is space in the buffer for two more characters (one
880 		// for the next character, and one for the end of string NUL byte).
881 		if (buf == NULL || cnt + 2 >= len)
882 		{
883 			char *temp = (char *)realloc(buf, len + 64);
884 			if (temp == NULL) {
885 				// Out of memory, however existing buffer remains allocated.
886 				cnt = ENOMEM;
887 				break;
888 			}
889 			len += 64;
890 			buf = temp;
891 		}
892 
893 		errorCode = in.Read (&c,1); // A really slow way of reading - unbuffered.
894 		if (errorCode != 1) {
895 			if (errorCode < 0) {
896 				cnt = errorCode; // IO error encountered, just return the code.
897 			} else {
898 				// Really is end of file.  Also make it end of line if there is
899 				// some text already read in.  If the first thing read was EOF,
900 				// just return an empty string.
901 				if (cnt > 0) {
902 					buf[cnt++] = '\n';
903 					if (buf[cnt-2] == '\r') {
904 						buf[cnt-2] = '\n';
905 						--cnt;
906 					}
907 				}
908 			}
909 			break;
910 		}
911 
912 		buf[cnt++] = c;
913 
914 		if (c == '\n') {
915 			// Convert CRLF end of line to just a LF.  Do it before folding, in
916 			// case we don't need to fold.
917 			if (cnt >= 2 && buf[cnt-2] == '\r') {
918 				buf[cnt-2] = '\n';
919 				--cnt;
920 			}
921 			// If the current line is empty then return it (so that empty lines
922 			// don't disappear if the next line starts with a space).
923 			if (cnt <= 1)
924 				break;
925 			// if first character on the next line is whitespace, fold lines
926 			errorCode = in.Read(&c,1);
927 			if (errorCode == 1) {
928 				if (c == ' ' || c == '\t')
929 					buf[cnt-1] = c; // Replace \n with the white space character.
930 				else {
931 					// Not folding, we finished reading a whole line.
932 					in.Seek(-1,SEEK_CUR); // Undo the look-ahead character read.
933 					break;
934 				}
935 			} else if (errorCode < 0) {
936 				cnt = errorCode;
937 				break;
938 			} else // No next line; at the end of the file.  Return the line.
939 				break;
940 		}
941 	}
942 
943 	if (buf != NULL && cnt >= 0)
944 		buf[cnt] = '\0';
945 
946 	if (buffer)
947 		*buffer = buf;
948 	else if (buf)
949 		free(buf);
950 
951 	if (buflen)
952 		*buflen = len;
953 
954 	return cnt;
955 }
956 
957 
/** Copies the next logical header line from a NUL terminated text into a
 *	malloc'ed buffer, joining folded (continuation) lines.
 *
 *	CRLF line ends are converted to a single LF.  A line end that is followed
 *	by a space or tab is replaced by that white space character and copying
 *	continues.  An empty line is returned as is.
 *
 *	\param header in/out: read position in the text.  It is advanced past the
 *		characters consumed, but never past the terminating NUL, so calling
 *		again at the end of the text keeps returning 0.  A NULL \a header or
 *		NULL \a *header is treated as empty text.
 *	\param buffer in/out: malloc'ed line buffer, grown with realloc() in
 *		steps of 64 bytes.  If NULL, the internal buffer is freed on return.
 *	\param buflen in/out: allocated size of the buffer, may be NULL.
 *	\return number of characters stored (including the final LF, excluding
 *		the terminating NUL), 0 at the end of the text, or ENOMEM if the
 *		buffer could not be grown.
 */
_EXPORT ssize_t
nextfoldedline(const char** header, char **buffer, size_t *buflen)
{
	ssize_t len = buflen && *buflen ? *buflen : 0;
	char * buf = buffer && *buffer ? *buffer : NULL;
	ssize_t cnt = 0; // Number of characters currently in the buffer.
	const bool haveInput = header != NULL && *header != NULL;
	const char *in = haveInput ? *header : "";
	char c;

	while (true)
	{
		// Make sure there is space in the buffer for two more characters (one
		// for the next character, and one for the end of string NUL byte).
		if (buf == NULL || cnt + 2 >= len)
		{
			char *temp = (char *)realloc(buf, len + 64);
			if (temp == NULL) {
				// Out of memory, however existing buffer remains allocated.
				cnt = ENOMEM;
				break;
			}
			len += 64;
			buf = temp;
		}

		// Peek at the next character.  The terminating NUL is never consumed,
		// otherwise a further call would read beyond the end of the text.
		c = *in;
		if (c == 0) {
			// End of text.  Also make it end of line if there is some text
			// already read in.  If the first thing read was the end, just
			// return an empty string.
			if (cnt > 0) {
				buf[cnt++] = '\n';
				if (buf[cnt-2] == '\r') {
					buf[cnt-2] = '\n';
					--cnt;
				}
			}
			break;
		}
		in++;

		buf[cnt++] = c;

		if (c == '\n') {
			// Convert CRLF end of line to just a LF.  Do it before folding, in
			// case we don't need to fold.
			if (cnt >= 2 && buf[cnt-2] == '\r') {
				buf[cnt-2] = '\n';
				--cnt;
			}
			// If the current line is empty then return it (so that empty lines
			// don't disappear if the next line starts with a space).
			if (cnt <= 1)
				break;
			// If the first character on the next line is not white space, we
			// finished reading a line.  The character is left unconsumed.
			if (*in != ' ' && *in != '\t')
				break;
			// Fold: replace the \n with the white space character.
			buf[cnt-1] = *in++;
		}
	}

	if (haveInput)
		*header = in;

	if (buf != NULL && cnt >= 0)
		buf[cnt] = '\0';

	if (buffer)
		*buffer = buf;
	else if (buf)
		free(buf);

	if (buflen)
		*buflen = len;

	return cnt;
}
1036 
1037 
1038 _EXPORT void
1039 trim_white_space(BString &string)
1040 {
1041 	int32 i;
1042 	int32 length = string.Length();
1043 	char *buffer = string.LockBuffer(length + 1);
1044 
1045 	while (length > 0 && isspace(buffer[length - 1]))
1046 		length--;
1047 	buffer[length] = '\0';
1048 
1049 	for (i = 0; buffer[i] && isspace(buffer[i]); i++) {}
1050 	if (i != 0) {
1051 		length -= i;
1052 		memmove(buffer,buffer + i,length + 1);
1053 	}
1054 	string.UnlockBuffer(length);
1055 }
1056 
1057 
1058 /** Tries to return a human-readable name from the specified
1059  *	header parameter (should be from "To:" or "From:").
1060  *	Tries to return the name rather than the eMail address.
1061  */
1062 
1063 _EXPORT void
1064 extract_address_name(BString &header)
1065 {
1066 	BString name;
1067 	const char *start = header.String();
1068 	const char *stop = start + strlen (start);
1069 
1070 	// Find a string S in the header (email foo) that matches:
1071 	//   Old style name in brackets: foo@bar.com (S)
1072 	//   New style quotes: "S" <foo@bar.com>
1073 	//   New style no quotes if nothing else found: S <foo@bar.com>
1074 	//   If nothing else found then use the whole thing: S
1075 
1076 	for (int i = 0; i <= 3; i++) {
1077 		// Set p1 to the first letter in the name and p2 to just past the last
1078 		// letter in the name.  p2 stays NULL if a name wasn't found in this
1079 		// pass.
1080 		const char *p1 = NULL, *p2 = NULL;
1081 
1082 		switch (i) {
1083 			case 0: // foo@bar.com (S)
1084 				if ((p1 = strchr(start,'(')) != NULL) {
1085 					p1++; // Advance to first letter in the name.
1086 					size_t nest = 1; // Handle nested brackets.
1087 					for (p2 = p1; p2 < stop; ++p2)
1088 					{
1089 						if (*p2 == ')')
1090 							--nest;
1091 						else if (*p2 == '(')
1092 							++nest;
1093 						if (nest <= 0)
1094 							break;
1095 					}
1096 					if (nest != 0)
1097 						p2 = NULL; // False alarm, no terminating bracket.
1098 				}
1099 				break;
1100 			case 1: // "S" <foo@bar.com>
1101 				if ((p1 = strchr(start, '\"')) != NULL)
1102 					p2 = strchr(++p1, '\"');
1103 				break;
1104 			case 2: // S <foo@bar.com>
1105 				p1 = start;
1106 				if (name.Length() == 0)
1107 					p2 = strchr(start, '<');
1108 				break;
1109 			case 3: // S
1110 				p1 = start;
1111 				if (name.Length() == 0)
1112 					p2 = stop;
1113 				break;
1114 		}
1115 
1116 		// Remove leading and trailing space-like characters and save the
1117 		// result if it is longer than any other likely names found.
1118 		if (p2 != NULL) {
1119 			while (p1 < p2 && (isspace (*p1)))
1120 				++p1;
1121 
1122 			while (p1 < p2 && (isspace (p2[-1])))
1123 				--p2;
1124 
1125 			int newLength = p2 - p1;
1126 			if (name.Length() < newLength)
1127 				name.SetTo(p1, newLength);
1128 		}
1129 	}
1130 
1131 	int32 lessIndex = name.FindFirst('<');
1132 	int32 greaterIndex = name.FindLast('>');
1133 
1134 	if (lessIndex == 0) {
1135 		// Have an address of the form <address> and nothing else, so remove
1136 		// the greater and less than signs, if any.
1137 		if (greaterIndex > 0)
1138 			name.Remove(greaterIndex, 1);
1139 		name.Remove(lessIndex, 1);
1140 	} else if (lessIndex > 0 && lessIndex < greaterIndex) {
1141 		// Yahoo stupidly inserts the e-mail address into the name string, so
1142 		// this bit of code fixes: "Joe <joe@yahoo.com>" <joe@yahoo.com>
1143 		name.Remove(lessIndex, greaterIndex - lessIndex + 1);
1144 	}
1145 
1146 	trim_white_space(name);
1147 	header = name;
1148 }
1149 
1150 
1151 
1152 // Given a subject in a BString, remove the extraneous RE: re: and other stuff
1153 // to get down to the core subject string, which should be identical for all
1154 // messages posted about a topic.  The input string is modified in place to
1155 // become the output core subject string.
1156 
1157 static int32				gLocker = 0;
1158 static size_t				gNsub = 1;
1159 static re_pattern_buffer	gRe;
1160 static re_pattern_buffer   *gRebuf = NULL;
1161 static char					gTranslation[256];
1162 
1163 _EXPORT void SubjectToThread (BString &string)
1164 {
1165 // a regex that matches a non-ASCII UTF8 character:
1166 #define U8C \
1167 	"[\302-\337][\200-\277]" \
1168 	"|\340[\302-\337][\200-\277]" \
1169 	"|[\341-\357][\200-\277][\200-\277]" \
1170 	"|\360[\220-\277][\200-\277][\200-\277]" \
1171 	"|[\361-\367][\200-\277][\200-\277][\200-\277]" \
1172 	"|\370[\210-\277][\200-\277][\200-\277][\200-\277]" \
1173 	"|[\371-\373][\200-\277][\200-\277][\200-\277][\200-\277]" \
1174 	"|\374[\204-\277][\200-\277][\200-\277][\200-\277][\200-\277]" \
1175 	"|\375[\200-\277][\200-\277][\200-\277][\200-\277][\200-\277]"
1176 
1177 #define PATTERN \
1178 	"^ +" \
1179 	"|^(\\[[^]]*\\])(\\<|  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1180 	"|^(  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1181 	"| *\\(fwd\\) *$"
1182 
1183 	if (gRebuf == NULL && atomic_add(&gLocker,1) == 0)
1184 	{
1185 		// the idea is to compile the regexp once to speed up testing
1186 
1187 		for (int i=0; i<256; ++i) gTranslation[i]=i;
1188 		for (int i='a'; i<='z'; ++i) gTranslation[i]=toupper(i);
1189 
1190 		gRe.translate = gTranslation;
1191 		gRe.regs_allocated = REGS_FIXED;
1192 		re_syntax_options = RE_SYNTAX_POSIX_EXTENDED;
1193 
1194 		const char *pattern = PATTERN;
1195 		// count subexpressions in PATTERN
1196 		for (unsigned int i=0; pattern[i] != 0; ++i)
1197 		{
1198 			if (pattern[i] == '\\')
1199 				++i;
1200 			else if (pattern[i] == '(')
1201 				++gNsub;
1202 		}
1203 
1204 		const char *err = re_compile_pattern(pattern,strlen(pattern),&gRe);
1205 		if (err == NULL)
1206 			gRebuf = &gRe;
1207 		else
1208 			fprintf(stderr, "Failed to compile the regex: %s\n", err);
1209 	}
1210 	else
1211 	{
1212 		int32 tries = 200;
1213 		while (gRebuf == NULL && tries-- > 0)
1214 			snooze(10000);
1215 	}
1216 
1217 	if (gRebuf)
1218 	{
1219 		struct re_registers regs;
1220 		// can't be static if this function is to be thread-safe
1221 
1222 		regs.num_regs = gNsub;
1223 		regs.start = (regoff_t*)malloc(gNsub*sizeof(regoff_t));
1224 		regs.end = (regoff_t*)malloc(gNsub*sizeof(regoff_t));
1225 
1226 		for (int start=0;
1227 		    (start=re_search(gRebuf, string.String(), string.Length(),
1228 							0, string.Length(), &regs)) >= 0;
1229 			)
1230 		{
1231 			//
1232 			// we found something
1233 			//
1234 
1235 			// don't delete [bemaildaemon]...
1236 			if (start == regs.start[1])
1237 				start = regs.start[2];
1238 
1239 			string.Remove(start,regs.end[0]-start);
1240 			if (start) string.Insert(' ',1,start);
1241 		}
1242 
1243 		free(regs.start);
1244 		free(regs.end);
1245 	}
1246 
1247 	// Finally remove leading and trailing space.  Some software, like
1248 	// tm-edit 1.8, appends a space to the subject, which would break
1249 	// threading if we left it in.
1250 	trim_white_space(string);
1251 }
1252 
1253 
1254 
1255 // Converts a date to a time.  Handles numeric time zones too, unlike
1256 // parsedate.  Returns -1 if it fails.
1257 
1258 _EXPORT time_t ParseDateWithTimeZone (const char *DateString)
1259 {
1260 	time_t	currentTime;
1261 	time_t	dateAsTime;
1262 	char	tempDateString [80];
1263 	char	tempZoneString [6];
1264 	time_t	zoneDeltaTime;
1265 	int		zoneIndex;
1266 	char   *zonePntr;
1267 
1268 	// See if we can remove the time zone portion.  parsedate understands time
1269 	// zone 3 letter names, but doesn't understand the numeric +9999 time zone
1270 	// format.  To do: see if a newer parsedate exists.
1271 
1272 	strncpy (tempDateString, DateString, sizeof (tempDateString));
1273 	tempDateString[sizeof (tempDateString) - 1] = 0;
1274 
1275 	// Remove trailing spaces.
1276 	zonePntr = tempDateString + strlen (tempDateString) - 1;
1277 	while (zonePntr >= tempDateString && isspace (*zonePntr))
1278 		*zonePntr-- = 0;
1279 	if (zonePntr < tempDateString)
1280 		return -1; // Empty string.
1281 
1282 	// Remove the trailing time zone in round brackets, like in
1283 	// Fri, 22 Feb 2002 15:22:42 EST (-0500)
1284 	// Thu, 25 Apr 1996 11:44:19 -0400 (EDT)
1285 	if (tempDateString[strlen(tempDateString)-1] == ')')
1286 	{
1287 		zonePntr = strrchr (tempDateString, '(');
1288 		if (zonePntr != NULL)
1289 		{
1290 			*zonePntr-- = 0; // Zap the '(', then remove trailing spaces.
1291 			while (zonePntr >= tempDateString && isspace (*zonePntr))
1292 				*zonePntr-- = 0;
1293 			if (zonePntr < tempDateString)
1294 				return -1; // Empty string.
1295 		}
1296 	}
1297 
1298 	// Look for a numeric time zone like  Tue, 30 Dec 2003 05:01:40 +0000
1299 	for (zoneIndex = strlen (tempDateString); zoneIndex >= 0; zoneIndex--)
1300 	{
1301 		zonePntr = tempDateString + zoneIndex;
1302 		if (zonePntr[0] == '+' || zonePntr[0] == '-')
1303 		{
1304 			if (zonePntr[1] >= '0' && zonePntr[1] <= '9' &&
1305 				zonePntr[2] >= '0' && zonePntr[2] <= '9' &&
1306 				zonePntr[3] >= '0' && zonePntr[3] <= '9' &&
1307 				zonePntr[4] >= '0' && zonePntr[4] <= '9')
1308 				break;
1309 		}
1310 	}
1311 	if (zoneIndex >= 0)
1312 	{
1313 		// Remove the zone from the date string and any following time zone
1314 		// letter codes.  Also put in GMT so that the date gets parsed as GMT.
1315 		memcpy (tempZoneString, zonePntr, 5);
1316 		tempZoneString [5] = 0;
1317 		strcpy (zonePntr, "GMT");
1318 	}
1319 	else // No numeric time zone found.
1320 		strcpy (tempZoneString, "+0000");
1321 
1322 	time (&currentTime);
1323 	dateAsTime = parsedate (tempDateString, currentTime);
1324 	if (dateAsTime == (time_t) -1)
1325 		return -1; // Failure.
1326 
1327 	zoneDeltaTime = 60 * atol (tempZoneString + 3); // Get the last two digits - minutes.
1328 	tempZoneString[3] = 0;
1329 	zoneDeltaTime += atol (tempZoneString + 1) * 60 * 60; // Get the first two digits - hours.
1330 	if (tempZoneString[0] == '+')
1331 		zoneDeltaTime = 0 - zoneDeltaTime;
1332 	dateAsTime += zoneDeltaTime;
1333 
1334 	return dateAsTime;
1335 }
1336 
1337 
1338 /** Parses a mail header and fills the headers BMessage
1339  */
1340 
1341 _EXPORT status_t
1342 parse_header(BMessage &headers, BPositionIO &input)
1343 {
1344 	char *buffer = NULL;
1345 	size_t bufferSize = 0;
1346 	int32 length;
1347 
1348 	while ((length = readfoldedline(input, &buffer, &bufferSize)) >= 2) {
1349 		--length;
1350 			// Don't include the \n at the end of the buffer.
1351 
1352 		// convert to UTF-8 and null-terminate the buffer
1353 		length = rfc2047_to_utf8(&buffer, &bufferSize, length);
1354 		buffer[length] = '\0';
1355 
1356 		const char *delimiter = strstr(buffer, ":");
1357 		if (delimiter == NULL)
1358 			continue;
1359 
1360 		BString header(buffer, delimiter - buffer);
1361 		header.CapitalizeEachWord();
1362 			// unified case for later fetch
1363 
1364 		delimiter++; // Skip the colon.
1365 		while (isspace (*delimiter))
1366 			delimiter++; // Skip over leading white space and tabs.  To do: (comments in brackets).
1367 
1368 		// ToDo: implement joining of multiple header tags (i.e. multiple "Cc:"s)
1369 		headers.AddString(header.String(), delimiter);
1370 	}
1371 	free(buffer);
1372 
1373 	return B_OK;
1374 }
1375 
1376 
1377 _EXPORT void
1378 extract_address(BString &address)
1379 {
1380 	const char *string = address.String();
1381 	int32 first;
1382 
1383 	// first, remove all quoted text
1384 
1385 	if ((first = address.FindFirst('"')) >= 0) {
1386 		int32 last = first + 1;
1387 		while (string[last] && string[last] != '"')
1388 			last++;
1389 
1390 		if (string[last] == '"')
1391 			address.Remove(first, last + 1 - first);
1392 	}
1393 
1394 	// try to extract the address now
1395 
1396 	if ((first = address.FindFirst('<')) >= 0) {
1397 		// the world likes us and we can just get the address the easy way...
1398 		int32 last = address.FindFirst('>');
1399 		if (last >= 0) {
1400 			address.Truncate(last);
1401 			address.Remove(0, first + 1);
1402 
1403 			return;
1404 		}
1405 	}
1406 
1407 	// then, see if there is anything in parenthesis to throw away
1408 
1409 	if ((first = address.FindFirst('(')) >= 0) {
1410 		int32 last = first + 1;
1411 		while (string[last] && string[last] != ')')
1412 			last++;
1413 
1414 		if (string[last] == ')')
1415 			address.Remove(first, last + 1 - first);
1416 	}
1417 
1418 	// now, there shouldn't be much else left
1419 
1420 	trim_white_space(address);
1421 }
1422 
1423 
1424 _EXPORT void
1425 get_address_list(BList &list, const char *string, void (*cleanupFunc)(BString &))
1426 {
1427 	if (string == NULL || !string[0])
1428 		return;
1429 
1430 	const char *start = string;
1431 
1432 	while (true) {
1433 		if (string[0] == '"') {
1434 			const char *quoteEnd = ++string;
1435 
1436 			while (quoteEnd[0] && quoteEnd[0] != '"')
1437 				quoteEnd++;
1438 
1439 			if (!quoteEnd[0])	// string exceeds line!
1440 				quoteEnd = string;
1441 
1442 			string = quoteEnd + 1;
1443 		}
1444 
1445 		if (string[0] == ',' || string[0] == '\0') {
1446 			BString address(start, string - start);
1447 			trim_white_space(address);
1448 
1449 			if (cleanupFunc)
1450 				cleanupFunc(address);
1451 
1452 			list.AddItem(strdup(address.String()));
1453 
1454 			start = string + 1;
1455 		}
1456 
1457 		if (!string[0])
1458 			break;
1459 
1460 		string++;
1461 	}
1462 }
1463 
1464