xref: /haiku/src/kits/mail/mail_util.cpp (revision 4015b5877f668e870c12f4214af98078eed3b4e3)
1 /*
2  * Copyright 2011-2016, Haiku, Inc. All rights reserved.
3  * Copyright 2001-2003 Dr. Zoidberg Enterprises. All rights reserved.
4  */
5 
6 
7 #include <mail_util.h>
8 
9 #include <stdlib.h>
10 #include <strings.h>
11 #include <stdio.h>
12 #define __USE_GNU
13 #include <regex.h>
14 #include <ctype.h>
15 #include <errno.h>
16 
17 #include <FindDirectory.h>
18 #include <List.h>
19 #include <Locker.h>
20 #include <parsedate.h>
21 #include <Path.h>
22 #include <String.h>
23 #include <UTF8.h>
24 
25 #include <mail_encoding.h>
26 
27 #include <AttributeUtilities.h>
28 #include <CharacterSet.h>
29 #include <CharacterSetRoster.h>
30 
31 
32 using namespace BPrivate;
33 
34 
35 #define CRLF   "\r\n"
36 
37 struct CharsetConversionEntry {
38 	const char *charset;
39 	uint32 flavor;
40 };
41 
42 extern const CharsetConversionEntry mail_charsets[] = {
43 	// In order of authority, so when searching for the name for a particular
44 	// numbered conversion, start at the beginning of the array.
45 	{"iso-8859-1",  B_ISO1_CONVERSION}, // MIME STANDARD
46 	{"iso-8859-2",  B_ISO2_CONVERSION}, // MIME STANDARD
47 	{"iso-8859-3",  B_ISO3_CONVERSION}, // MIME STANDARD
48 	{"iso-8859-4",  B_ISO4_CONVERSION}, // MIME STANDARD
49 	{"iso-8859-5",  B_ISO5_CONVERSION}, // MIME STANDARD
50 	{"iso-8859-6",  B_ISO6_CONVERSION}, // MIME STANDARD
51 	{"iso-8859-7",  B_ISO7_CONVERSION}, // MIME STANDARD
52 	{"iso-8859-8",  B_ISO8_CONVERSION}, // MIME STANDARD
53 	{"iso-8859-9",  B_ISO9_CONVERSION}, // MIME STANDARD
54 	{"iso-8859-10", B_ISO10_CONVERSION}, // MIME STANDARD
55 	{"iso-8859-13", B_ISO13_CONVERSION}, // MIME STANDARD
56 	{"iso-8859-14", B_ISO14_CONVERSION}, // MIME STANDARD
57 	{"iso-8859-15", B_ISO15_CONVERSION}, // MIME STANDARD
58 
59 	{"shift_jis",	B_SJIS_CONVERSION}, // MIME STANDARD
60 	{"shift-jis",	B_SJIS_CONVERSION},
61 	{"iso-2022-jp", B_JIS_CONVERSION}, // MIME STANDARD
62 	{"euc-jp",		B_EUC_CONVERSION}, // MIME STANDARD
63 
64 	{"euc-kr",      B_EUC_KR_CONVERSION}, // Shift encoding 7 bit and KSC-5601 if bit 8 is on. // MIME STANDARD
65 	{"ksc5601",		B_EUC_KR_CONVERSION},    // Not sure if 7 or 8 bit. // COMPATIBLE?
66 	{"ks_c_5601-1987", B_EUC_KR_CONVERSION}, // Not sure if 7 or 8 bit. // COMPATIBLE with stupid MS software
67 
68 	{"koi8-r",      B_KOI8R_CONVERSION},           // MIME STANDARD
69 	{"windows-1251",B_MS_WINDOWS_1251_CONVERSION}, // MIME STANDARD
70 	{"windows-1252",B_MS_WINDOWS_CONVERSION},      // MIME STANDARD
71 
72 	{"dos-437",     B_MS_DOS_CONVERSION},     // WRONG NAME : MIME STANDARD NAME = NONE ( IBM437? )
73 	{"dos-866",     B_MS_DOS_866_CONVERSION}, // WRONG NAME : MIME STANDARD NAME = NONE ( IBM866? )
74 	{"x-mac-roman", B_MAC_ROMAN_CONVERSION},  // WRONG NAME : MIME STANDARD NAME = NONE ( macintosh? + x-mac-roman? )
75 
76     {"big5",        24}, // MIME STANDARD
77 
78     {"gb18030",     25}, // WRONG NAME : MIME STANDARD NAME = NONE ( GB18030? )
79     {"gb2312",      25}, // COMPATIBLE
80     {"gbk",         25}, // COMPATIBLE
81 
82 	/* {"utf-16",		B_UNICODE_CONVERSION}, Might not work due to NULs in text, needs testing. */
83 	{"us-ascii",	B_MAIL_US_ASCII_CONVERSION},                                  // MIME STANDARD
84 	{"utf-8",		B_MAIL_UTF8_CONVERSION /* Special code for no conversion */}, // MIME STANDARD
85 
86 	{NULL, (uint32) -1} /* End of list marker, NULL string pointer is the key. */
87 };
88 
89 
// File-scope state.  None of these variables is referenced by the code in the
// visible part of this file, so their exact roles cannot be stated from here.
// NOTE(review): gRe/gRebuf look like a compiled regular expression plus a
// pointer that tells whether it has been set up yet, gTranslation like its
// 256 entry translate table, gNsub like a sub-expression count and gLocker
// like a guard for that setup - confirm against the code that uses them.
static int32 gLocker = 0;
static size_t gNsub = 1;
static re_pattern_buffer gRe;
static re_pattern_buffer *gRebuf = NULL;
static unsigned char gTranslation[256];
95 
96 
97 static int
handle_non_rfc2047_encoding(char ** buffer,size_t * bufferLength,size_t * sourceLength)98 handle_non_rfc2047_encoding(char **buffer, size_t *bufferLength,
99 	size_t *sourceLength)
100 {
101 	char *string = *buffer;
102 	int32 length = *sourceLength;
103 	int32 i;
104 
105 	// check for 8-bit characters
106 	for (i = 0;i < length;i++)
107 		if (string[i] & 0x80)
108 			break;
109 	if (i == length)
110 		return false;
111 
112 	// check for groups of 8-bit characters - this code is not very smart;
113 	// it just can detect some sort of single-byte encoded stuff, the rest
114 	// is regarded as UTF-8
115 
116 	int32 singletons = 0,doubles = 0;
117 
118 	for (i = 0;i < length;i++)
119 	{
120 		if (string[i] & 0x80)
121 		{
122 			if ((string[i + 1] & 0x80) == 0)
123 				singletons++;
124 			else doubles++;
125 			i++;
126 		}
127 	}
128 
129 	if (singletons != 0)	// can't be valid UTF-8 anymore, so we assume ISO-Latin-1
130 	{
131 		int32 state = 0;
132 		// just to be sure
133 		int32 destLength = length * 4 + 1;
134 		int32 destBufferLength = destLength;
135 		char *dest = (char*)malloc(destLength);
136 		if (dest == NULL)
137 			return 0;
138 
139 		if (convert_to_utf8(B_ISO1_CONVERSION, string, &length,dest,
140 			&destLength, &state) == B_OK) {
141 			*buffer = dest;
142 			*bufferLength = destBufferLength;
143 			*sourceLength = destLength;
144 			return true;
145 		}
146 		free(dest);
147 		return false;
148 	}
149 
150 	// we assume a valid UTF-8 string here, but yes, we don't check it
151 	return true;
152 }
153 
154 
155 // #pragma mark -
156 
157 
158 status_t
write_read_attr(BNode & node,read_flags flag)159 write_read_attr(BNode& node, read_flags flag)
160 {
161 	if (node.WriteAttr(B_MAIL_ATTR_READ, B_INT32_TYPE, 0, &flag, sizeof(int32))
162 			< 0)
163 		return B_ERROR;
164 
165 	// Manage the status string only if it currently has a known state
166 	BString currentStatus;
167 	if (node.ReadAttrString(B_MAIL_ATTR_STATUS, &currentStatus) == B_OK
168 		&& currentStatus.ICompare("New") != 0
169 		&& currentStatus.ICompare("Read") != 0
170 		&& currentStatus.ICompare("Seen") != 0) {
171 		return B_OK;
172 	}
173 
174 	BString statusString = flag == B_READ ? "Read"
175 		: flag == B_SEEN ? "Seen" : "New";
176 	if (node.WriteAttrString(B_MAIL_ATTR_STATUS, &statusString) < 0)
177 		return B_ERROR;
178 
179 	return B_OK;
180 }
181 
182 
183 status_t
read_read_attr(BNode & node,read_flags & flag)184 read_read_attr(BNode& node, read_flags& flag)
185 {
186 	if (node.ReadAttr(B_MAIL_ATTR_READ, B_INT32_TYPE, 0, &flag, sizeof(int32))
187 			== sizeof(int32))
188 		return B_OK;
189 
190 	BString statusString;
191 	if (node.ReadAttrString(B_MAIL_ATTR_STATUS, &statusString) == B_OK) {
192 		if (statusString.ICompare("New") == 0)
193 			flag = B_UNREAD;
194 		else
195 			flag = B_READ;
196 
197 		return B_OK;
198 	}
199 
200 	return B_ERROR;
201 }
202 
203 
204 // The next couple of functions are our wrapper around convert_to_utf8 and
205 // convert_from_utf8 so that they can also convert from UTF-8 to UTF-8 by
206 // specifying the B_MAIL_UTF8_CONVERSION constant as the conversion operation.
207 // It also lets us add new conversions, like B_MAIL_US_ASCII_CONVERSION.
208 
209 
210 status_t
mail_convert_to_utf8(uint32 srcEncoding,const char * src,int32 * srcLen,char * dst,int32 * dstLen,int32 * state,char substitute)211 mail_convert_to_utf8(uint32 srcEncoding, const char *src, int32 *srcLen,
212 	char *dst, int32 *dstLen, int32 *state, char substitute)
213 {
214 	int32 copyAmount;
215 	char *originalDst = dst;
216 	status_t returnCode = -1;
217 
218 	if (srcEncoding == B_MAIL_UTF8_CONVERSION) {
219 		copyAmount = *srcLen;
220 		if (*dstLen < copyAmount)
221 			copyAmount = *dstLen;
222 		memcpy (dst, src, copyAmount);
223 		*srcLen = copyAmount;
224 		*dstLen = copyAmount;
225 		returnCode = B_OK;
226 	} else if (srcEncoding == B_MAIL_US_ASCII_CONVERSION) {
227 		int32 i;
228 		unsigned char letter;
229 		copyAmount = *srcLen;
230 		if (*dstLen < copyAmount)
231 			copyAmount = *dstLen;
232 		for (i = 0; i < copyAmount; i++) {
233 			letter = *src++;
234 			if (letter > 0x80U)
235 				// Invalid, could also use substitute, but better to strip high bit.
236 				*dst++ = letter - 0x80U;
237 			else if (letter == 0x80U)
238 				// Can't convert to 0x00 since that's NUL, which would cause problems.
239 				*dst++ = substitute;
240 			else
241 				*dst++ = letter;
242 		}
243 		*srcLen = copyAmount;
244 		*dstLen = copyAmount;
245 		returnCode = B_OK;
246 	} else
247 		returnCode = convert_to_utf8 (srcEncoding, src, srcLen,
248 			dst, dstLen, state, substitute);
249 
250 	if (returnCode == B_OK) {
251 		// Replace spurious NUL bytes, which should normally not be in the
252 		// output of the decoding (not normal UTF-8 characters, and no NULs are
253 		// in our usual input strings).  They happen for some odd ISO-2022-JP
254 		// byte pair combinations which are improperly handled by the BeOS
255 		// routines.  Like "\e$ByD\e(B" where \e is the ESC character $1B, the
256 		// first ESC $ B switches to a Japanese character set, then the next
257 		// two bytes "yD" specify a character, then ESC ( B switches back to
258 		// the ASCII character set.  The UTF-8 conversion yields a NUL byte.
259 		int32 i;
260 		for (i = 0; i < *dstLen; i++)
261 			if (originalDst[i] == 0)
262 				originalDst[i] = substitute;
263 	}
264 	return returnCode;
265 }
266 
267 
268 status_t
mail_convert_from_utf8(uint32 dstEncoding,const char * src,int32 * srcLen,char * dst,int32 * dstLen,int32 * state,char substitute)269 mail_convert_from_utf8(uint32 dstEncoding, const char *src, int32 *srcLen,
270 	char *dst, int32 *dstLen, int32 *state, char substitute)
271 {
272 	int32 copyAmount;
273 	status_t errorCode;
274 	int32 originalDstLen = *dstLen;
275 	int32 tempDstLen;
276 	int32 tempSrcLen;
277 
278 	if (dstEncoding == B_MAIL_UTF8_CONVERSION) {
279 		copyAmount = *srcLen;
280 		if (*dstLen < copyAmount)
281 			copyAmount = *dstLen;
282 		memcpy (dst, src, copyAmount);
283 		*srcLen = copyAmount;
284 		*dstLen = copyAmount;
285 		return B_OK;
286 	}
287 
288 	if (dstEncoding == B_MAIL_US_ASCII_CONVERSION) {
289 		int32 characterLength;
290 		int32 dstRemaining = *dstLen;
291 		unsigned char letter;
292 		int32 srcRemaining = *srcLen;
293 
294 		// state contains the number of source bytes to skip, left over from a
295 		// partial UTF-8 character split over the end of the buffer from last
296 		// time.
297 		if (srcRemaining <= *state) {
298 			*state -= srcRemaining;
299 			*dstLen = 0;
300 			return B_OK;
301 		}
302 		srcRemaining -= *state;
303 		src += *state;
304 		*state = 0;
305 
306 		while (true) {
307 			if (srcRemaining <= 0 || dstRemaining <= 0)
308 				break;
309 			letter = *src;
310 			if (letter < 0x80)
311 				characterLength = 1; // Regular ASCII equivalent code.
312 			else if (letter < 0xC0)
313 				characterLength = 1; // Invalid in-between data byte 10xxxxxx.
314 			else if (letter < 0xE0)
315 				characterLength = 2;
316 			else if (letter < 0xF0)
317 				characterLength = 3;
318 			else if (letter < 0xF8)
319 				characterLength = 4;
320 			else if (letter < 0xFC)
321 				characterLength = 5;
322 			else if (letter < 0xFE)
323 				characterLength = 6;
324 			else
325 				characterLength = 1; // 0xFE and 0xFF are invalid in UTF-8.
326 			if (letter < 0x80)
327 				*dst++ = *src;
328 			else
329 				*dst++ = substitute;
330 			dstRemaining--;
331 			if (srcRemaining < characterLength) {
332 				// Character split past the end of the buffer.
333 				*state = characterLength - srcRemaining;
334 				srcRemaining = 0;
335 			} else {
336 				src += characterLength;
337 				srcRemaining -= characterLength;
338 			}
339 		}
340 		// Update with the amounts used.
341 		*srcLen = *srcLen - srcRemaining;
342 		*dstLen = *dstLen - dstRemaining;
343 		return B_OK;
344 	}
345 
346 	errorCode = convert_from_utf8(dstEncoding, src, srcLen, dst, dstLen, state,
347 		substitute);
348 	if (errorCode != B_OK)
349 		return errorCode;
350 
351 	if (dstEncoding != B_JIS_CONVERSION)
352 		return B_OK;
353 
354 	// B_JIS_CONVERSION (ISO-2022-JP) works by shifting between different
355 	// character subsets.  For E-mail headers (and other uses), it needs to be
356 	// switched back to ASCII at the end (otherwise the last character gets
357 	// lost or other weird things happen in the headers).  Note that we can't
358 	// just append the escape code since the convert_from_utf8 "state" will be
359 	// wrong.  So we append an ASCII letter and throw it away, leaving just the
360 	// escape code.  Well, it actually switches to the Roman character set, not
361 	// ASCII, but that should be OK.
362 
363 	tempDstLen = originalDstLen - *dstLen;
364 	if (tempDstLen < 3) // Not enough space remaining in the output.
365 		return B_OK; // Sort of an error, but we did convert the rest OK.
366 	tempSrcLen = 1;
367 	errorCode = convert_from_utf8(dstEncoding, "a", &tempSrcLen,
368 		dst + *dstLen, &tempDstLen, state, substitute);
369 	if (errorCode != B_OK)
370 		return errorCode;
371 	*dstLen += tempDstLen - 1 /* don't include the ASCII letter */;
372 	return B_OK;
373 }
374 
375 
376 ssize_t
rfc2047_to_utf8(char ** bufp,size_t * bufLen,size_t strLen)377 rfc2047_to_utf8(char **bufp, size_t *bufLen, size_t strLen)
378 {
379 	char *head, *tail;
380 	char *charset, *encoding, *end;
381 	ssize_t ret = B_OK;
382 
383 	if (bufp == NULL || *bufp == NULL)
384 		return -1;
385 
386 	char *string = *bufp;
387 
388 	//---------Handle *&&^%*&^ non-RFC compliant, 8bit mail
389 	if (handle_non_rfc2047_encoding(bufp,bufLen,&strLen))
390 		return strLen;
391 
392 	// set up string length
393 	if (strLen == 0)
394 		strLen = strlen(*bufp);
395 	char lastChar = (*bufp)[strLen];
396 	(*bufp)[strLen] = '\0';
397 
398 	//---------Whew! Now for RFC compliant mail
399 	bool encodedWordFoundPreviously = false;
400 	for (head = tail = string;
401 		((charset = strstr(tail, "=?")) != NULL)
402 		&& (((encoding = strchr(charset + 2, '?')) != NULL)
403 			&& encoding[1] && (encoding[2] == '?') && encoding[3])
404 		&& (end = strstr(encoding + 3, "?=")) != NULL;
405 		// found "=?...charset...?e?...text...?=   (e == encoding)
406 		//        ^charset       ^encoding    ^end
407 		tail = end)
408 	{
409 		// Copy non-encoded text (from tail up to charset) to the output.
410 		// Ignore spaces between two encoded "words".  RFC2047 says the words
411 		// should be concatenated without the space (designed for Asian
412 		// sentences which have no spaces yet need to be broken into "words" to
413 		// keep within the line length limits).
414 		bool nonSpaceFound = false;
415 		for (int i = 0; i < charset-tail; i++) {
416 			if (!isspace (tail[i])) {
417 				nonSpaceFound = true;
418 				break;
419 			}
420 		}
421 		if (!encodedWordFoundPreviously || nonSpaceFound) {
422 			if (string != tail && tail != charset)
423 				memmove(string, tail, charset-tail);
424 			string += charset-tail;
425 		}
426 		tail = charset;
427 		encodedWordFoundPreviously = true;
428 
429 		// move things to point at what they should:
430 		//   =?...charset...?e?...text...?=   (e == encoding)
431 		//     ^charset      ^encoding     ^end
432 		charset += 2;
433 		encoding += 1;
434 		end += 2;
435 
436 		// find the charset this text is in now
437 		size_t cLen = encoding - 1 - charset;
438 		bool base64encoded = toupper(*encoding) == 'B';
439 
440 		uint32 convertID = B_MAIL_NULL_CONVERSION;
441 		char charsetName[cLen + 1];
442 		memcpy(charsetName, charset, cLen);
443 		charsetName[cLen] = '\0';
444 		if (strcasecmp(charsetName, "us-ascii") == 0) {
445 			convertID = B_MAIL_US_ASCII_CONVERSION;
446 		} else if (strcasecmp(charsetName, "utf-8") == 0) {
447 			convertID = B_MAIL_UTF8_CONVERSION;
448 		} else {
449 			const BCharacterSet* charSet
450 				= BCharacterSetRoster::FindCharacterSetByName(charsetName);
451 			if (charSet != NULL) {
452 				convertID = charSet->GetConversionID();
453 			}
454 		}
455 		if (convertID == B_MAIL_NULL_CONVERSION) {
456 			// unidentified charset
457 			// what to do? doing nothing skips the encoded text;
458 			// but we should keep it: we copy it to the output.
459 			if (string != tail && tail != end)
460 				memmove(string, tail, end-tail);
461 			string += end-tail;
462 			continue;
463 		}
464 		// else we've successfully identified the charset
465 
466 		char *src = encoding+2;
467 		int32 srcLen = end - 2 - src;
468 		// encoded text: src..src+srcLen
469 
470 		// decode text, get decoded length (reducing xforms)
471 		srcLen = !base64encoded ? decode_qp(src, src, srcLen, 1)
472 			: decode_base64(src, src, srcLen);
473 
474 		// allocate space for the converted text
475 		int32 dstLen = end-string + *bufLen-strLen;
476 		char *dst = (char*)malloc(dstLen);
477 		int32 cvLen = srcLen;
478 		int32 convState = 0;
479 
480 		//
481 		// do the conversion
482 		//
483 		ret = mail_convert_to_utf8(convertID, src, &cvLen, dst, &dstLen,
484 			&convState);
485 		if (ret != B_OK) {
486 			// what to do? doing nothing skips the encoded text
487 			// but we should keep it: we copy it to the output.
488 
489 			free(dst);
490 
491 			if (string != tail && tail != end)
492 				memmove(string, tail, end-tail);
493 			string += end-tail;
494 			continue;
495 		}
496 		/* convert_to_ is either returning something wrong or my
497 		   test data is screwed up.  Whatever it is, Not Enough
498 		   Space is not the only cause of the below, so we just
499 		   assume it succeeds if it converts anything at all.
500 		else if (cvLen < srcLen)
501 		{
502 			// not enough room to convert the data;
503 			// grow *buf and retry
504 
505 			free(dst);
506 
507 			char *temp = (char*)realloc(*bufp, 2*(*bufLen + 1));
508 			if (temp == NULL)
509 			{
510 				ret = B_NO_MEMORY;
511 				break;
512 			}
513 
514 			*bufp = temp;
515 			*bufLen = 2*(*bufLen + 1);
516 
517 			string = *bufp + (string-head);
518 			tail = *bufp + (tail-head);
519 			charset = *bufp + (charset-head);
520 			encoding = *bufp + (encoding-head);
521 			end = *bufp + (end-head);
522 			src = *bufp + (src-head);
523 			head = *bufp;
524 			continue;
525 		}
526 		*/
527 		else {
528 			if (dstLen > end-string) {
529 				// copy the string forward...
530 				memmove(string+dstLen, end, strLen - (end-head) + 1);
531 				strLen += string+dstLen - end;
532 				end = string + dstLen;
533 			}
534 
535 			memcpy(string, dst, dstLen);
536 			string += dstLen;
537 			free(dst);
538 			continue;
539 		}
540 	}
541 
542 	// copy everything that's left
543 	size_t tailLen = strLen - (tail - head);
544 	memmove(string, tail, tailLen+1);
545 	string += tailLen;
546 
547 	// replace the last char
548 	(*bufp)[strLen] = lastChar;
549 
550 	return ret < B_OK ? ret : string-head;
551 }
552 
553 
554 ssize_t
utf8_to_rfc2047(char ** bufp,ssize_t length,uint32 charset,char encoding)555 utf8_to_rfc2047 (char **bufp, ssize_t length, uint32 charset, char encoding)
556 {
557 	struct word {
558 		BString	originalWord;
559 		BString	convertedWord;
560 		bool	needsEncoding;
561 
562 		// Convert the word from UTF-8 to the desired character set.  The
563 		// converted version also includes the escape codes to return to ASCII
564 		// mode, if relevant.  Also note if it uses unprintable characters,
565 		// which means it will need that special encoding treatment later.
566 		void ConvertWordToCharset (uint32 charset) {
567 			int32 state = 0;
568 			int32 originalLength = originalWord.Length();
569 			int32 convertedLength = originalLength * 5 + 1;
570 			char *convertedBuffer = convertedWord.LockBuffer (convertedLength);
571 			mail_convert_from_utf8 (charset, originalWord.String(),
572 				&originalLength, convertedBuffer, &convertedLength, &state);
573 			for (int i = 0; i < convertedLength; i++) {
574 				if ((convertedBuffer[i] & (1 << 7)) ||
575 					(convertedBuffer[i] >= 0 && convertedBuffer[i] < 32)) {
576 					needsEncoding = true;
577 					break;
578 				}
579 			}
580 			convertedWord.UnlockBuffer (convertedLength);
581 		};
582 	};
583 	struct word *currentWord;
584 	BList words;
585 
586 	// Break the header into words.  White space characters (including tabs and
587 	// newlines) separate the words.  Each word includes any space before it as
588 	// part of the word.  Actually, quotes and other special characters
589 	// (",()<>@) are treated as separate words of their own so that they don't
590 	// get encoded (because MIME headers get the quotes parsed before character
591 	// set unconversion is done).  The reader is supposed to ignore all white
592 	// space between encoded words, which can be inserted so that older mail
593 	// parsers don't have overly long line length problems.
594 
595 	const char *source = *bufp;
596 	const char *bufEnd = *bufp + length;
597 	const char *specialChars = "\"()<>@,";
598 
599 	while (source < bufEnd) {
600 		currentWord = new struct word;
601 		currentWord->needsEncoding = false;
602 
603 		int wordEnd = 0;
604 
605 		// Include leading spaces as part of the word.
606 		while (source + wordEnd < bufEnd && isspace (source[wordEnd]))
607 			wordEnd++;
608 
609 		if (source + wordEnd < bufEnd &&
610 			strchr (specialChars, source[wordEnd]) != NULL) {
611 			// Got a quote mark or other special character, which is treated as
612 			// a word in itself since it shouldn't be encoded, which would hide
613 			// it from the mail system.
614 			wordEnd++;
615 		} else {
616 			// Find the end of the word.  Leave wordEnd pointing just after the
617 			// last character in the word.
618 			while (source + wordEnd < bufEnd) {
619 				if (isspace(source[wordEnd]) ||
620 					strchr (specialChars, source[wordEnd]) != NULL)
621 					break;
622 				if (wordEnd > 51 /* Makes Base64 ISO-2022-JP "word" a multiple of 4 bytes */ &&
623 					0xC0 == (0xC0 & (unsigned int) source[wordEnd])) {
624 					// No English words are that long (46 is the longest),
625 					// break up what is likely Asian text (which has no spaces)
626 					// at the start of the next non-ASCII UTF-8 character (high
627 					// two bits are both ones).  Note that two encoded words in
628 					// a row get joined together, even if there is a space
629 					// between them in the final output text, according to the
630 					// standard.  Next word will also be conveniently get
631 					// encoded due to the 0xC0 test.
632 					currentWord->needsEncoding = true;
633 					break;
634 				}
635 				wordEnd++;
636 			}
637 		}
638 		currentWord->originalWord.SetTo (source, wordEnd);
639 		currentWord->ConvertWordToCharset (charset);
640 		words.AddItem(currentWord);
641 		source += wordEnd;
642 	}
643 
644 	// Combine adjacent words which contain unprintable text so that the
645 	// overhead of switching back and forth between regular text and specially
646 	// encoded text is reduced.  However, the combined word must be shorter
647 	// than the maximum of 75 bytes, including character set specification and
648 	// all those delimiters (worst case 22 bytes of overhead).
649 
650 	struct word *run;
651 
652 	for (int32 i = 0; (currentWord = (struct word *) words.ItemAt (i)) != NULL; i++) {
653 		if (!currentWord->needsEncoding)
654 			continue; // No need to combine unencoded words.
655 		for (int32 g = i+1; (run = (struct word *) words.ItemAt (g)) != NULL; g++) {
656 			if (!run->needsEncoding)
657 				break; // Don't want to combine encoded and unencoded words.
658 			if ((currentWord->convertedWord.Length() + run->convertedWord.Length() <= 53)) {
659 				currentWord->originalWord.Append (run->originalWord);
660 				currentWord->ConvertWordToCharset (charset);
661 				words.RemoveItem(g);
662 				delete run;
663 				g--;
664 			} else // Can't merge this word, result would be too long.
665 				break;
666 		}
667 	}
668 
669 	// Combine the encoded and unencoded words into one line, doing the
670 	// quoted-printable or base64 encoding.  Insert an extra space between
671 	// words which are both encoded to make word wrapping easier, since there
672 	// is normally none, and you're allowed to insert space (the receiver
673 	// throws it away if it is between encoded words).
674 
675 	BString rfc2047;
676 	bool	previousWordNeededEncoding = false;
677 
678 	const char *charset_dec = "none-bug";
679 	for (int32 i = 0; mail_charsets[i].charset != NULL; i++) {
680 		if (mail_charsets[i].flavor == charset) {
681 			charset_dec = mail_charsets[i].charset;
682 			break;
683 		}
684 	}
685 
686 	while ((currentWord = (struct word *)words.RemoveItem((int32)0)) != NULL) {
687 		if ((encoding != quoted_printable && encoding != base64) ||
688 		!currentWord->needsEncoding) {
689 			rfc2047.Append (currentWord->convertedWord);
690 		} else {
691 			// This word needs encoding.  Try to insert a space between it and
692 			// the previous word.
693 			if (previousWordNeededEncoding)
694 				rfc2047 << ' '; // Can insert as many spaces as you want between encoded words.
695 			else {
696 				// Previous word is not encoded, spaces are significant.  Try
697 				// to move a space from the start of this word to be outside of
698 				// the encoded text, so that there is a bit of space between
699 				// this word and the previous one to enhance word wrapping
700 				// chances later on.
701 				if (currentWord->originalWord.Length() > 1 &&
702 					isspace (currentWord->originalWord[0])) {
703 					rfc2047 << currentWord->originalWord[0];
704 					currentWord->originalWord.Remove (0 /* offset */, 1 /* length */);
705 					currentWord->ConvertWordToCharset (charset);
706 				}
707 			}
708 
709 			char *encoded = NULL;
710 			ssize_t encoded_len = 0;
711 			int32 convertedLength = currentWord->convertedWord.Length ();
712 			const char *convertedBuffer = currentWord->convertedWord.String ();
713 
714 			switch (encoding) {
715 				case quoted_printable:
716 					encoded = (char *) malloc (convertedLength * 3);
717 					encoded_len = encode_qp (encoded, convertedBuffer, convertedLength, true /* headerMode */);
718 					break;
719 				case base64:
720 					encoded = (char *) malloc (convertedLength * 2);
721 					encoded_len = encode_base64 (encoded, convertedBuffer, convertedLength, true /* headerMode */);
722 					break;
723 				default: // Unknown encoding type, shouldn't happen.
724 					encoded = (char *) convertedBuffer;
725 					encoded_len = convertedLength;
726 					break;
727 			}
728 
729 			rfc2047 << "=?" << charset_dec << '?' << encoding << '?';
730 			rfc2047.Append (encoded, encoded_len);
731 			rfc2047 << "?=";
732 
733 			if (encoding == quoted_printable || encoding == base64)
734 				free(encoded);
735 		}
736 		previousWordNeededEncoding = currentWord->needsEncoding;
737 		delete currentWord;
738 	}
739 
740 	free(*bufp);
741 
742 	ssize_t finalLength = rfc2047.Length ();
743 	*bufp = (char *) (malloc (finalLength + 1));
744 	memcpy (*bufp, rfc2047.String(), finalLength);
745 	(*bufp)[finalLength] = 0;
746 
747 	return finalLength;
748 }
749 
750 
751 void
FoldLineAtWhiteSpaceAndAddCRLF(BString & string)752 FoldLineAtWhiteSpaceAndAddCRLF(BString &string)
753 {
754 	int inputLength = string.Length();
755 	int lineStartIndex;
756 	const int maxLineLength = 78; // Doesn't include CRLF.
757 	BString output;
758 	int splitIndex;
759 	int tempIndex;
760 
761 	lineStartIndex = 0;
762 	while (true) {
763 		// If we don't need to wrap the text, just output the remainder, if any.
764 
765 		if (lineStartIndex + maxLineLength >= inputLength) {
766 			if (lineStartIndex < inputLength) {
767 				output.Insert (string, lineStartIndex /* source offset */,
768 					inputLength - lineStartIndex /* count */,
769 					output.Length() /* insert at */);
770 				output.Append (CRLF);
771 			}
772 			break;
773 		}
774 
775 		// Look ahead for a convenient spot to split it, between a comma and
776 		// space, which you often see between e-mail addresses like this:
777 		// "Joe Who" joe@dot.com, "Someone Else" else@blot.com
778 
779 		tempIndex = lineStartIndex + maxLineLength;
780 		if (tempIndex > inputLength)
781 			tempIndex = inputLength;
782 		splitIndex = string.FindLast (", ", tempIndex);
783 		if (splitIndex >= lineStartIndex)
784 			splitIndex++; // Point to the space character.
785 
786 		// If none of those exist, try splitting at any white space.
787 
788 		if (splitIndex <= lineStartIndex)
789 			splitIndex = string.FindLast (" ", tempIndex);
790 		if (splitIndex <= lineStartIndex)
791 			splitIndex = string.FindLast ("\t", tempIndex);
792 
793 		// If none of those exist, allow for a longer word - split at the next
794 		// available white space.
795 
796 		if (splitIndex <= lineStartIndex)
797 			splitIndex = string.FindFirst (" ", lineStartIndex + 1);
798 		if (splitIndex <= lineStartIndex)
799 			splitIndex = string.FindFirst ("\t", lineStartIndex + 1);
800 
801 		// Give up, the whole rest of the line can't be split, just dump it
802 		// out.
803 
804 		if (splitIndex <= lineStartIndex) {
805 			if (lineStartIndex < inputLength) {
806 				output.Insert (string, lineStartIndex /* source offset */,
807 					inputLength - lineStartIndex /* count */,
808 					output.Length() /* insert at */);
809 				output.Append (CRLF);
810 			}
811 			break;
812 		}
813 
814 		// Do the split.  The current line up to but not including the space
815 		// gets output, followed by a CRLF.  The space remains to become the
816 		// start of the next line (and that tells the message reader that it is
817 		// a continuation line).
818 
819 		output.Insert (string, lineStartIndex /* source offset */,
820 			splitIndex - lineStartIndex /* count */,
821 			output.Length() /* insert at */);
822 		output.Append (CRLF);
823 		lineStartIndex = splitIndex;
824 	}
825 	string.SetTo (output);
826 }
827 
828 
// Reads one header line from a file, joining folded continuation lines.
//
// A line end (LF or CRLF) that is followed by a space or a tab is a fold: the
// line end is replaced by that white space character and reading goes on.
// The line returned ends with a single LF (a CRLF is turned into LF).  An
// empty line is returned as it is, without looking at what follows.  At the
// end of the file, text that has no line end gets one appended.
//
// file   - the file to read from.
// buffer - in/out: buffer to use, or a pointer to NULL to have one allocated.
//          The buffer is enlarged with realloc() in steps of 64 bytes, and the
//          (possibly moved) buffer is stored here.  If "buffer" itself is
//          NULL, the buffer used internally is freed again.
// buflen - in/out: size of the buffer, may be NULL.
//
// Returns the number of characters in the line, 0 if the end of the file was
// reached before anything could be read, ENOMEM if the buffer could not be
// enlarged (the existing buffer stays allocated), or a negative error code
// after a read error.  The line is NUL terminated unless the value returned
// is negative.
ssize_t
readfoldedline(FILE *file, char **buffer, size_t *buflen)
{
	ssize_t capacity = buflen != NULL ? *buflen : 0;
	char *line = buffer != NULL ? *buffer : NULL;
	ssize_t used = 0;
		// number of characters in the buffer so far

	for (;;) {
		// There has to be room for two more characters: the next one, and
		// the terminating NUL.
		if (line == NULL || used + 2 >= capacity) {
			char *grown = (char *)realloc(line, capacity + 64);
			if (grown == NULL) {
				// Out of memory, the existing buffer remains allocated.
				used = ENOMEM;
				break;
			}
			line = grown;
			capacity += 64;
		}

		int c = fgetc(file);
		if (c == EOF) {
			if (ferror(file)) {
				used = errno;
				if (used >= 0)
					used = -1;
						// error codes must be negative
			} else if (used > 0) {
				// Really the end of the file; it also ends the line if some
				// text has been read.  If nothing has been read, an empty
				// string is returned.
				line[used++] = '\n';
				if (line[used - 2] == '\r') {
					line[used - 2] = '\n';
					--used;
				}
			}
			break;
		}

		line[used++] = c;
		if (c != '\n')
			continue;

		// Turn a CRLF line end into a plain LF.  This is done before folding,
		// in case there is nothing to fold.
		if (used >= 2 && line[used - 2] == '\r') {
			line[used - 2] = '\n';
			--used;
		}

		// An empty line is returned right away, so that it doesn't vanish
		// when the next line starts with white space.
		if (used <= 1)
			break;

		// Peek at the first character of the next line.  It's OK to read EOF
		// here and to push it back, too.
		c = fgetc(file);
		if (c != ' ' && c != '\t') {
			// Not a continuation, the line is complete.
			ungetc(c, file);
			break;
		}

		// Fold: the white space character takes the place of the line end.
		line[used - 1] = c;
	}

	if (line != NULL && used >= 0)
		line[used] = '\0';

	if (buffer != NULL)
		*buffer = line;
	else if (line != NULL)
		free(line);

	if (buflen != NULL)
		*buflen = capacity;

	return used;
}
910 
911 
912 ssize_t
readfoldedline(BPositionIO & in,char ** buffer,size_t * buflen)913 readfoldedline(BPositionIO &in, char **buffer, size_t *buflen)
914 {
915 	ssize_t len = buflen && *buflen ? *buflen : 0;
916 	char * buf = buffer && *buffer ? *buffer : NULL;
917 	ssize_t cnt = 0; // Number of characters currently in the buffer.
918 	char c;
919 	status_t errorCode;
920 
921 	while (true) {
922 		// Make sure there is space in the buffer for two more characters (one
923 		// for the next character, and one for the end of string NUL byte).
924 		if (buf == NULL || cnt + 2 >= len) {
925 			char *temp = (char *)realloc(buf, len + 64);
926 			if (temp == NULL) {
927 				// Out of memory, however existing buffer remains allocated.
928 				cnt = ENOMEM;
929 				break;
930 			}
931 			len += 64;
932 			buf = temp;
933 		}
934 
935 		errorCode = in.Read (&c,1); // A really slow way of reading - unbuffered.
936 		if (errorCode != 1) {
937 			if (errorCode < 0) {
938 				cnt = errorCode; // IO error encountered, just return the code.
939 			} else {
940 				// Really is end of file.  Also make it end of line if there is
941 				// some text already read in.  If the first thing read was EOF,
942 				// just return an empty string.
943 				if (cnt > 0) {
944 					buf[cnt++] = '\n';
945 					if (buf[cnt-2] == '\r') {
946 						buf[cnt-2] = '\n';
947 						--cnt;
948 					}
949 				}
950 			}
951 			break;
952 		}
953 
954 		buf[cnt++] = c;
955 
956 		if (c == '\n') {
957 			// Convert CRLF end of line to just a LF.  Do it before folding, in
958 			// case we don't need to fold.
959 			if (cnt >= 2 && buf[cnt-2] == '\r') {
960 				buf[cnt-2] = '\n';
961 				--cnt;
962 			}
963 			// If the current line is empty then return it (so that empty lines
964 			// don't disappear if the next line starts with a space).
965 			if (cnt <= 1)
966 				break;
967 			// if first character on the next line is whitespace, fold lines
968 			errorCode = in.Read(&c,1);
969 			if (errorCode == 1) {
970 				if (c == ' ' || c == '\t')
971 					buf[cnt-1] = c; // Replace \n with the white space character.
972 				else {
973 					// Not folding, we finished reading a whole line.
974 					in.Seek(-1,SEEK_CUR); // Undo the look-ahead character read.
975 					break;
976 				}
977 			} else if (errorCode < 0) {
978 				cnt = errorCode;
979 				break;
980 			} else // No next line; at the end of the file.  Return the line.
981 				break;
982 		}
983 	}
984 
985 	if (buf != NULL && cnt >= 0)
986 		buf[cnt] = '\0';
987 
988 	if (buffer)
989 		*buffer = buf;
990 	else if (buf)
991 		free(buf);
992 
993 	if (buflen)
994 		*buflen = len;
995 
996 	return cnt;
997 }
998 
999 
/*!	Extracts the next logical (unfolded) header line from a string in memory.

	Works like readfoldedline(), but takes its input from the NUL terminated
	string \a *header and advances that pointer past the consumed characters.
	A CRLF line end is stored as a single LF.  When the character following a
	line end is a space or a tab, the LF is replaced by that white space
	character and reading continues.  An empty line is returned on its own.
	If the string ends in the middle of a line, a LF is appended.

	The pointer is never moved beyond the terminating NUL, so calling the
	function again after the end was reached simply returns 0.

	\param header In/out read position within the NUL terminated input.
	\param buffer Optional in/out pointer to a buffer that is grown with
		realloc() as needed.  If \c NULL, the internal buffer is freed before
		returning.
	\param buflen Optional in/out allocated size of \a buffer.
	\return The number of characters stored (the NUL terminator not counted),
		0 at the end of the input, or a negative value if the buffer could
		not be grown.
*/
ssize_t
nextfoldedline(const char** header, char **buffer, size_t *buflen)
{
	ssize_t len = buflen && *buflen ? *buflen : 0;
	char *buf = buffer && *buffer ? *buffer : NULL;
	ssize_t cnt = 0; // Number of characters currently in the buffer.
	char c;

	while (true) {
		// Make sure there is space in the buffer for two more characters (one
		// for the next character, and one for the end of string NUL byte).
		if (buf == NULL || cnt + 2 >= len) {
			char *temp = (char *)realloc(buf, len + 64);
			if (temp == NULL) {
				// Out of memory, however existing buffer remains allocated.
				cnt = ENOMEM;
				if (cnt >= 0)
					cnt = -1; // Error codes must be negative.
				break;
			}
			len += 64;
			buf = temp;
		}

		// Look at the next character without consuming it, so that the read
		// position never ends up behind the terminating NUL.
		c = **header;
		if (c == 0) {
			// End of input.  Also make it end of line if there is some text
			// already read in.  If the first thing read was the end, just
			// return an empty string.
			if (cnt > 0) {
				buf[cnt++] = '\n';
				if (buf[cnt - 2] == '\r') {
					buf[cnt - 2] = '\n';
					--cnt;
				}
			}
			break;
		}
		(*header)++;

		buf[cnt++] = c;

		if (c == '\n') {
			// Convert CRLF end of line to just a LF.  Do it before folding, in
			// case we don't need to fold.
			if (cnt >= 2 && buf[cnt - 2] == '\r') {
				buf[cnt - 2] = '\n';
				--cnt;
			}
			// If the current line is empty then return it (so that empty lines
			// don't disappear if the next line starts with a space).
			if (cnt <= 1)
				break;
			// If the first character on the next line is white space, fold
			// the lines: the white space replaces the LF and is consumed.
			c = **header;
			if (c == ' ' || c == '\t') {
				buf[cnt - 1] = c;
				(*header)++;
			} else {
				// Not folding, we finished reading a line.
				break;
			}
		}
	}

	if (buf != NULL && cnt >= 0)
		buf[cnt] = '\0';

	if (buffer)
		*buffer = buf;
	else if (buf)
		free(buf);

	if (buflen)
		*buflen = len;

	return cnt;
}
1078 
1079 
1080 void
trim_white_space(BString & string)1081 trim_white_space(BString &string)
1082 {
1083 	int32 i;
1084 	int32 length = string.Length();
1085 	char *buffer = string.LockBuffer(length + 1);
1086 
1087 	while (length > 0 && isspace(buffer[length - 1]))
1088 		length--;
1089 	buffer[length] = '\0';
1090 
1091 	for (i = 0; buffer[i] && isspace(buffer[i]); i++) {}
1092 	if (i != 0) {
1093 		length -= i;
1094 		memmove(buffer,buffer + i,length + 1);
1095 	}
1096 	string.UnlockBuffer(length);
1097 }
1098 
1099 
1100 /*!	Tries to return a human-readable name from the specified
1101 	header parameter (should be from "To:" or "From:").
1102 	Tries to return the name rather than the eMail address.
1103 */
1104 void
extract_address_name(BString & header)1105 extract_address_name(BString &header)
1106 {
1107 	BString name;
1108 	const char *start = header.String();
1109 	const char *stop = start + strlen (start);
1110 
1111 	// Find a string S in the header (email foo) that matches:
1112 	//   Old style name in brackets: foo@bar.com (S)
1113 	//   New style quotes: "S" <foo@bar.com>
1114 	//   New style no quotes if nothing else found: S <foo@bar.com>
1115 	//   If nothing else found then use the whole thing: S
1116 
1117 	for (int i = 0; i <= 3; i++) {
1118 		// Set p1 to the first letter in the name and p2 to just past the last
1119 		// letter in the name.  p2 stays NULL if a name wasn't found in this
1120 		// pass.
1121 		const char *p1 = NULL, *p2 = NULL;
1122 
1123 		switch (i) {
1124 			case 0: // foo@bar.com (S)
1125 				if ((p1 = strchr(start,'(')) != NULL) {
1126 					p1++; // Advance to first letter in the name.
1127 					size_t nest = 1; // Handle nested brackets.
1128 					for (p2 = p1; p2 < stop; ++p2)
1129 					{
1130 						if (*p2 == ')')
1131 							--nest;
1132 						else if (*p2 == '(')
1133 							++nest;
1134 						if (nest <= 0)
1135 							break;
1136 					}
1137 					if (nest != 0)
1138 						p2 = NULL; // False alarm, no terminating bracket.
1139 				}
1140 				break;
1141 			case 1: // "S" <foo@bar.com>
1142 				if ((p1 = strchr(start, '\"')) != NULL)
1143 					p2 = strchr(++p1, '\"');
1144 				break;
1145 			case 2: // S <foo@bar.com>
1146 				p1 = start;
1147 				if (name.Length() == 0)
1148 					p2 = strchr(start, '<');
1149 				break;
1150 			case 3: // S
1151 				p1 = start;
1152 				if (name.Length() == 0)
1153 					p2 = stop;
1154 				break;
1155 		}
1156 
1157 		// Remove leading and trailing space-like characters and save the
1158 		// result if it is longer than any other likely names found.
1159 		if (p2 != NULL) {
1160 			while (p1 < p2 && (isspace (*p1)))
1161 				++p1;
1162 
1163 			while (p1 < p2 && (isspace (p2[-1])))
1164 				--p2;
1165 
1166 			int newLength = p2 - p1;
1167 			if (name.Length() < newLength)
1168 				name.SetTo(p1, newLength);
1169 		}
1170 	}
1171 
1172 	int32 lessIndex = name.FindFirst('<');
1173 	int32 greaterIndex = name.FindLast('>');
1174 
1175 	if (lessIndex == 0) {
1176 		// Have an address of the form <address> and nothing else, so remove
1177 		// the greater and less than signs, if any.
1178 		if (greaterIndex > 0)
1179 			name.Remove(greaterIndex, 1);
1180 		name.Remove(lessIndex, 1);
1181 	} else if (lessIndex > 0 && lessIndex < greaterIndex) {
1182 		// Yahoo stupidly inserts the e-mail address into the name string, so
1183 		// this bit of code fixes: "Joe <joe@yahoo.com>" <joe@yahoo.com>
1184 		name.Remove(lessIndex, greaterIndex - lessIndex + 1);
1185 	}
1186 
1187 	trim_white_space(name);
1188 	header = name;
1189 }
1190 
1191 
1192 /*!	Given a subject in a BString, remove the extraneous RE: re: and other stuff
1193 	to get down to the core subject string, which should be identical for all
1194 	messages posted about a topic.  The input string is modified in place to
1195 	become the output core subject string.
1196 */
1197 void
SubjectToThread(BString & string)1198 SubjectToThread (BString &string)
1199 {
1200 // a regex that matches a non-ASCII UTF8 character:
1201 #define U8C \
1202 	"[\302-\337][\200-\277]" \
1203 	"|\340[\302-\337][\200-\277]" \
1204 	"|[\341-\357][\200-\277][\200-\277]" \
1205 	"|\360[\220-\277][\200-\277][\200-\277]" \
1206 	"|[\361-\367][\200-\277][\200-\277][\200-\277]" \
1207 	"|\370[\210-\277][\200-\277][\200-\277][\200-\277]" \
1208 	"|[\371-\373][\200-\277][\200-\277][\200-\277][\200-\277]" \
1209 	"|\374[\204-\277][\200-\277][\200-\277][\200-\277][\200-\277]" \
1210 	"|\375[\200-\277][\200-\277][\200-\277][\200-\277][\200-\277]"
1211 
1212 #define PATTERN \
1213 	"^ +" \
1214 	"|^(\\[[^]]*\\])(\\<|  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1215 	"|^(  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1216 	"| *\\(fwd\\) *$"
1217 
1218 	if (gRebuf == NULL && atomic_add(&gLocker, 1) == 0) {
1219 		// the idea is to compile the regexp once to speed up testing
1220 
1221 		for (int i=0; i<256; ++i) gTranslation[i]=i;
1222 		for (int i='a'; i<='z'; ++i) gTranslation[i]=toupper(i);
1223 
1224 		gRe.translate = gTranslation;
1225 		gRe.regs_allocated = REGS_FIXED;
1226 		re_syntax_options = RE_SYNTAX_POSIX_EXTENDED;
1227 
1228 		const char *pattern = PATTERN;
1229 		// count subexpressions in PATTERN
1230 		for (unsigned int i=0; pattern[i] != 0; ++i)
1231 		{
1232 			if (pattern[i] == '\\')
1233 				++i;
1234 			else if (pattern[i] == '(')
1235 				++gNsub;
1236 		}
1237 
1238 		const char *err = re_compile_pattern(pattern,strlen(pattern),&gRe);
1239 		if (err == NULL)
1240 			gRebuf = &gRe;
1241 		else
1242 			fprintf(stderr, "Failed to compile the regex: %s\n", err);
1243 	} else {
1244 		int32 tries = 200;
1245 		while (gRebuf == NULL && tries-- > 0)
1246 			snooze(10000);
1247 	}
1248 
1249 	if (gRebuf) {
1250 		struct re_registers regs;
1251 		// can't be static if this function is to be thread-safe
1252 
1253 		regs.num_regs = gNsub;
1254 		regs.start = (regoff_t*)malloc(gNsub*sizeof(regoff_t));
1255 		regs.end = (regoff_t*)malloc(gNsub*sizeof(regoff_t));
1256 
1257 		for (int start = 0; (start = re_search(gRebuf, string.String(),
1258 				string.Length(), 0, string.Length(), &regs)) >= 0;) {
1259 			//
1260 			// we found something
1261 			//
1262 
1263 			// don't delete [bemaildaemon]...
1264 			if (start == regs.start[1])
1265 				start = regs.start[2];
1266 
1267 			string.Remove(start,regs.end[0]-start);
1268 			if (start)
1269 				string.Insert(' ',1,start);
1270 
1271 			// TODO: for some subjects this results in an endless loop, check
1272 			// why this happen.
1273 			if (regs.end[0] - start <= 1)
1274 				break;
1275 		}
1276 
1277 		free(regs.start);
1278 		free(regs.end);
1279 	}
1280 
1281 	// Finally remove leading and trailing space.  Some software, like
1282 	// tm-edit 1.8, appends a space to the subject, which would break
1283 	// threading if we left it in.
1284 	trim_white_space(string);
1285 }
1286 
1287 
1288 /*!	Converts a date to a time.  Handles numeric time zones too, unlike
1289 	parsedate().  Returns -1 if it fails.
1290 */
1291 time_t
ParseDateWithTimeZone(const char * DateString)1292 ParseDateWithTimeZone(const char *DateString)
1293 {
1294 	time_t currentTime;
1295 	time_t dateAsTime;
1296 	char tempDateString[80];
1297 	char tempZoneString[6];
1298 	time_t zoneDeltaTime;
1299 	int zoneIndex;
1300 	char *zonePntr;
1301 
1302 	// See if we can remove the time zone portion.  parsedate understands time
1303 	// zone 3 letter names, but doesn't understand the numeric +9999 time zone
1304 	// format.  To do: see if a newer parsedate exists.
1305 
1306 	strncpy (tempDateString, DateString, sizeof (tempDateString));
1307 	tempDateString[sizeof (tempDateString) - 1] = 0;
1308 
1309 	// Remove trailing spaces.
1310 	zonePntr = tempDateString + strlen (tempDateString) - 1;
1311 	while (zonePntr >= tempDateString && isspace (*zonePntr))
1312 		*zonePntr-- = 0;
1313 	if (zonePntr < tempDateString)
1314 		return -1; // Empty string.
1315 
1316 	// Remove the trailing time zone in round brackets, like in
1317 	// Fri, 22 Feb 2002 15:22:42 EST (-0500)
1318 	// Thu, 25 Apr 1996 11:44:19 -0400 (EDT)
1319 	if (tempDateString[strlen(tempDateString)-1] == ')')
1320 	{
1321 		zonePntr = strrchr (tempDateString, '(');
1322 		if (zonePntr != NULL)
1323 		{
1324 			*zonePntr-- = 0; // Zap the '(', then remove trailing spaces.
1325 			while (zonePntr >= tempDateString && isspace (*zonePntr))
1326 				*zonePntr-- = 0;
1327 			if (zonePntr < tempDateString)
1328 				return -1; // Empty string.
1329 		}
1330 	}
1331 
1332 	// Look for a numeric time zone like  Tue, 30 Dec 2003 05:01:40 +0000
1333 	for (zoneIndex = strlen (tempDateString); zoneIndex >= 0; zoneIndex--)
1334 	{
1335 		zonePntr = tempDateString + zoneIndex;
1336 		if (zonePntr[0] == '+' || zonePntr[0] == '-')
1337 		{
1338 			if (zonePntr[1] >= '0' && zonePntr[1] <= '9' &&
1339 				zonePntr[2] >= '0' && zonePntr[2] <= '9' &&
1340 				zonePntr[3] >= '0' && zonePntr[3] <= '9' &&
1341 				zonePntr[4] >= '0' && zonePntr[4] <= '9')
1342 				break;
1343 		}
1344 	}
1345 	if (zoneIndex >= 0)
1346 	{
1347 		// Remove the zone from the date string and any following time zone
1348 		// letter codes.  Also put in GMT so that the date gets parsed as GMT.
1349 		memcpy (tempZoneString, zonePntr, 5);
1350 		tempZoneString [5] = 0;
1351 		strcpy (zonePntr, "GMT");
1352 	}
1353 	else // No numeric time zone found.
1354 		strcpy (tempZoneString, "+0000");
1355 
1356 	time (&currentTime);
1357 	dateAsTime = parsedate (tempDateString, currentTime);
1358 	if (dateAsTime == (time_t) -1)
1359 		return -1; // Failure.
1360 
1361 	zoneDeltaTime = 60 * atol (tempZoneString + 3); // Get the last two digits - minutes.
1362 	tempZoneString[3] = 0;
1363 	zoneDeltaTime += atol (tempZoneString + 1) * 60 * 60; // Get the first two digits - hours.
1364 	if (tempZoneString[0] == '+')
1365 		zoneDeltaTime = 0 - zoneDeltaTime;
1366 	dateAsTime += zoneDeltaTime;
1367 
1368 	return dateAsTime;
1369 }
1370 
1371 
1372 /*! Parses a mail header and fills the headers BMessage
1373 */
1374 status_t
parse_header(BMessage & headers,BPositionIO & input)1375 parse_header(BMessage &headers, BPositionIO &input)
1376 {
1377 	char *buffer = NULL;
1378 	size_t bufferSize = 0;
1379 	int32 length;
1380 
1381 	while ((length = readfoldedline(input, &buffer, &bufferSize)) >= 2) {
1382 		--length;
1383 			// Don't include the \n at the end of the buffer.
1384 
1385 		// convert to UTF-8 and null-terminate the buffer
1386 		length = rfc2047_to_utf8(&buffer, &bufferSize, length);
1387 		buffer[length] = '\0';
1388 
1389 		const char *delimiter = strstr(buffer, ":");
1390 		if (delimiter == NULL)
1391 			continue;
1392 
1393 		BString header(buffer, delimiter - buffer);
1394 		header.CapitalizeEachWord();
1395 			// unified case for later fetch
1396 
1397 		delimiter++; // Skip the colon.
1398 		// Skip over leading white space and tabs.
1399 		// TODO: (comments in brackets).
1400 		while (isspace(*delimiter))
1401 			delimiter++;
1402 
1403 		// TODO: implement joining of multiple header tags (i.e. multiple "Cc:"s)
1404 		headers.AddString(header.String(), delimiter);
1405 	}
1406 	free(buffer);
1407 
1408 	return B_OK;
1409 }
1410 
1411 
1412 status_t
extract_from_header(const BString & header,const BString & field,BString & target)1413 extract_from_header(const BString& header, const BString& field,
1414 	BString& target)
1415 {
1416 	int32 headerLength = header.Length();
1417 	int32 fieldEndPos = 0;
1418 	while (true) {
1419 		int32 pos = header.IFindFirst(field, fieldEndPos);
1420 		if (pos < 0)
1421 			return B_BAD_VALUE;
1422 		fieldEndPos = pos + field.Length();
1423 
1424 		if (pos != 0 && header.ByteAt(pos - 1) != '\n')
1425 			continue;
1426 		if (header.ByteAt(fieldEndPos) == ':')
1427 			break;
1428 	}
1429 	fieldEndPos++;
1430 
1431 	int32 crPos = fieldEndPos;
1432 	while (true) {
1433 		fieldEndPos = crPos;
1434 		crPos = header.FindFirst('\n', crPos);
1435 		if (crPos < 0)
1436 			crPos = headerLength;
1437 		BString temp;
1438 		header.CopyInto(temp, fieldEndPos, crPos - fieldEndPos);
1439 		if (header.ByteAt(crPos - 1) == '\r') {
1440 			temp.Truncate(temp.Length() - 1);
1441 			temp += " ";
1442 		}
1443 		target += temp;
1444 		crPos++;
1445 		if (crPos >= headerLength)
1446 			break;
1447 		char nextByte = header.ByteAt(crPos);
1448 		if (nextByte != ' ' && nextByte != '\t')
1449 			break;
1450 		crPos++;
1451 	}
1452 
1453 	size_t bufferSize = target.Length();
1454 	char* buffer = target.LockBuffer(bufferSize);
1455 	size_t length = rfc2047_to_utf8(&buffer, &bufferSize, bufferSize);
1456 	target.UnlockBuffer(length);
1457 
1458 	trim_white_space(target);
1459 
1460 	return B_OK;
1461 }
1462 
1463 
1464 void
extract_address(BString & address)1465 extract_address(BString &address)
1466 {
1467 	const char *string = address.String();
1468 	int32 first;
1469 
1470 	// first, remove all quoted text
1471 
1472 	if ((first = address.FindFirst('"')) >= 0) {
1473 		int32 last = first + 1;
1474 		while (string[last] && string[last] != '"')
1475 			last++;
1476 
1477 		if (string[last] == '"')
1478 			address.Remove(first, last + 1 - first);
1479 	}
1480 
1481 	// try to extract the address now
1482 
1483 	if ((first = address.FindFirst('<')) >= 0) {
1484 		// the world likes us and we can just get the address the easy way...
1485 		int32 last = address.FindFirst('>');
1486 		if (last >= 0) {
1487 			address.Truncate(last);
1488 			address.Remove(0, first + 1);
1489 
1490 			return;
1491 		}
1492 	}
1493 
1494 	// then, see if there is anything in parenthesis to throw away
1495 
1496 	if ((first = address.FindFirst('(')) >= 0) {
1497 		int32 last = first + 1;
1498 		while (string[last] && string[last] != ')')
1499 			last++;
1500 
1501 		if (string[last] == ')')
1502 			address.Remove(first, last + 1 - first);
1503 	}
1504 
1505 	// now, there shouldn't be much else left
1506 
1507 	trim_white_space(address);
1508 }
1509 
1510 
1511 void
get_address_list(BList & list,const char * string,void (* cleanupFunc)(BString &))1512 get_address_list(BList &list, const char *string,
1513 	void (*cleanupFunc)(BString &))
1514 {
1515 	if (string == NULL || !string[0])
1516 		return;
1517 
1518 	const char *start = string;
1519 
1520 	while (true) {
1521 		if (string[0] == '"') {
1522 			const char *quoteEnd = ++string;
1523 
1524 			while (quoteEnd[0] && quoteEnd[0] != '"')
1525 				quoteEnd++;
1526 
1527 			if (!quoteEnd[0])	// string exceeds line!
1528 				quoteEnd = string;
1529 
1530 			string = quoteEnd + 1;
1531 		}
1532 
1533 		if (string[0] == ',' || string[0] == '\0') {
1534 			BString address(start, string - start);
1535 			trim_white_space(address);
1536 
1537 			if (cleanupFunc)
1538 				cleanupFunc(address);
1539 
1540 			list.AddItem(strdup(address.String()));
1541 
1542 			start = string + 1;
1543 		}
1544 
1545 		if (!string[0])
1546 			break;
1547 
1548 		string++;
1549 	}
1550 }
1551 
1552 
1553 status_t
CopyMailFolderAttributes(const char * targetPath)1554 CopyMailFolderAttributes(const char* targetPath)
1555 {
1556 	BPath path;
1557 	status_t status = find_directory(B_USER_SETTINGS_DIRECTORY, &path);
1558 	if (status != B_OK)
1559 		return status;
1560 
1561 	path.Append("Tracker");
1562 	path.Append("DefaultQueryTemplates");
1563 	path.Append("text_x-email");
1564 
1565 	BNode source(path.Path());
1566 	BNode target(targetPath);
1567 	return BPrivate::CopyAttributes(source, target);
1568 }
1569