xref: /haiku/src/add-ons/mail_daemon/inbound_filters/spam_filter/SpamFilter.cpp (revision 05f730b0f8ea4b779af0ef62c26a5f29ded24a31)
1 /*
2  * Copyright 2002-2013, Haiku, Inc. All rights reserved.
3  * Copyright 2002 Alexander G. M. Smith.
4  * Copyright 2011, Clemens Zeidler <haiku@clemens-zeidler.de>
5  * Distributed under the terms of the MIT License.
6  */
7 
8 /*!	Uses Bayesian statistics to evaluate the spaminess of a message.
9 	The evaluation is done by a separate server, this add-on just gets
10 	the text and uses scripting commands to get an evaluation from the server.
11 	If the server isn't running, it will be found and started up.  Once the
12 	evaluation has been received, it is added to the message as an attribute and
13 	optionally as an addition to the subject.  Some other add-on later in the
14 	pipeline will use the attribute to delete the message or move it to some
15 	other folder.
16 */
17 
18 
19 #include "SpamFilter.h"
20 
21 #include <stdlib.h>
22 #include <stdio.h>
23 
24 #include <Beep.h>
25 #include <Catalog.h>
26 #include <fs_attr.h>
27 #include <Messenger.h>
28 #include <Node.h>
29 #include <Path.h>
30 #include <Roster.h>
31 #include <String.h>
32 #include <FindDirectory.h>
33 #include <Entry.h>
34 
35 
36 #undef B_TRANSLATION_CONTEXT
37 #define B_TRANSLATION_CONTEXT "SpamFilter"
38 
39 
// Names of the sound events played after a message has been classified.
// They match the ones set up by spamdbm for sound effects.
static const char* const kAGMSBayesBeepGenuineName = "SpamFilter-Genuine";
static const char* const kAGMSBayesBeepSpamName = "SpamFilter-Spam";
static const char* const kAGMSBayesBeepUncertainName = "SpamFilter-Uncertain";

// Application signature used to find, launch and talk to the spam server.
static const char* const kServerSignature
	= "application/x-vnd.agmsmith.spamdbm";
46 
47 
SpamFilter(BMailProtocol & protocol,const BMailAddOnSettings & settings)48 SpamFilter::SpamFilter(BMailProtocol& protocol,
49 	const BMailAddOnSettings& settings)
50 	:
51 	BMailFilter(protocol, &settings)
52 {
53 	fAddSpamToSubject = settings.GetBool("AddMarkerToSubject", false);
54 	fAutoTraining = settings.GetBool("AutoTraining", true);
55 	fGenuineCutoffRatio = settings.GetFloat("GenuineCutoffRatio", 0.01f);
56 	fNoWordsMeansSpam = settings.GetBool("NoWordsMeansSpam", true);
57 	fQuitServerWhenFinished = settings.GetBool("QuitServerWhenFinished", false);
58 	fSpamCutoffRatio = settings.GetFloat("SpamCutoffRatio", 0.99f);
59 }
60 
61 
~SpamFilter()62 SpamFilter::~SpamFilter()
63 {
64 	if (fQuitServerWhenFinished)
65 		fMessengerToServer.SendMessage(B_QUIT_REQUESTED);
66 }
67 
68 
69 BMailFilterAction
HeaderFetched(entry_ref & ref,BFile & file,BMessage & attributes)70 SpamFilter::HeaderFetched(entry_ref& ref, BFile& file, BMessage& attributes)
71 {
72 	_CheckForSpam(file);
73 	return B_NO_MAIL_ACTION;
74 }
75 
76 
77 void
BodyFetched(const entry_ref & ref,BFile & file,BMessage & attributes)78 SpamFilter::BodyFetched(const entry_ref& ref, BFile& file, BMessage& attributes)
79 {
80 	if (fHeaderOnly)
81 		return;
82 
83 	// See if the message has already been classified.  Happens for messages
84 	// which are partially downloaded when you have auto-training on.  Could
85 	// untrain the partial part before training on the complete message, but we
86 	// don't know how big it was, so instead just ignore the message.
87 	attr_info attributeInfo;
88 	if (file.GetAttrInfo("MAIL:classification", &attributeInfo) == B_OK)
89 		return;
90 
91 	_CheckForSpam(file);
92 }
93 
94 
95 status_t
_CheckForSpam(BFile & file)96 SpamFilter::_CheckForSpam(BFile& file)
97 {
98 	// Get a connection to the spam database server.  Launch if needed, should
99 	// only need it once, unless another e-mail thread shuts down the server
100 	// inbetween messages.  This code used to be in InitCheck, but apparently
101 	// that isn't called.
102 	printf("Checking for Spam Server.\n");
103 	if (fLaunchAttemptCount == 0 || !fMessengerToServer.IsValid()) {
104 		if (_GetTokenizeMode() != B_OK)
105 			return B_ERROR;
106 	}
107 
108 	off_t dataSize;
109 	file.GetSize(&dataSize);
110 	char* stringBuffer = new char[dataSize + 1];
111 	file.Read(stringBuffer, dataSize);
112 	stringBuffer[dataSize] = 0; // Add an end of string NUL, just in case.
113 
114 	float spamRatio;
115 	if (_GetSpamRatio(stringBuffer, dataSize, spamRatio) != B_OK)
116 		return B_ERROR;
117 
118 	// If we are auto-training, feed back the message to the server as a
119 	// training example (don't train if it is uncertain).
120 	if (fAutoTraining && (spamRatio >= fSpamCutoffRatio
121 		|| spamRatio < fGenuineCutoffRatio)) {
122 		_TrainServer(stringBuffer, dataSize, spamRatio);
123 	}
124 
125 	delete[] stringBuffer;
126 
127 	// write attributes
128 	BString classificationString = spamRatio >= fSpamCutoffRatio ? "Spam"
129 		: spamRatio < fGenuineCutoffRatio ? "Genuine" : "Uncertain";
130 	file.WriteAttrString("MAIL:classification", &classificationString);
131 
132 	// Store the spam ratio in an attribute called MAIL:ratio_spam,
133 	// attached to the eventual output file.
134 	file.WriteAttr("MAIL:ratio_spam", B_FLOAT_TYPE, 0 /* offset */, &spamRatio,
135 		sizeof(spamRatio));
136 
137 	// Also add it to the subject, if requested.
138 	if (fAddSpamToSubject && spamRatio >= fSpamCutoffRatio)
139 		_AddSpamToSubject(file, spamRatio);
140 
141 	// Beep using different sounds for spam and genuine, as Jeremy Friesner
142 	// nudged me to get around to implementing.  And add uncertain to that, as
143 	// "BiPolar" suggested.  If the user doesn't want to hear the sound, they
144 	// can turn it off in the system sound preferences.
145 
146 	if (spamRatio >= fSpamCutoffRatio)
147 		system_beep(kAGMSBayesBeepSpamName);
148 	else if (spamRatio < fGenuineCutoffRatio)
149 		system_beep(kAGMSBayesBeepGenuineName);
150 	else
151 		system_beep(kAGMSBayesBeepUncertainName);
152 
153 	return B_OK;
154 }
155 
156 
157 status_t
_CheckForSpamServer()158 SpamFilter::_CheckForSpamServer()
159 {
160 	// Make sure the server is running.
161 	if (be_roster->IsRunning (kServerSignature))
162 		return B_OK;
163 
164 	status_t status = be_roster->Launch (kServerSignature);
165 	if (status == B_OK)
166 		return status;
167 
168 	BPath path;
169 	entry_ref ref;
170 	const directory_which kPlaces[] = {
171 		B_SYSTEM_NONPACKAGED_BIN_DIRECTORY,
172 		B_SYSTEM_BIN_DIRECTORY};
173 	for (size_t i = 0; i < sizeof(kPlaces) / sizeof(kPlaces[0]); i++) {
174 		find_directory(kPlaces[i], &path);
175 		path.Append("spamdbm");
176 		if (!BEntry(path.Path()).Exists())
177 			continue;
178 		get_ref_for_path(path.Path(), &ref);
179 		if ((status = be_roster->Launch(&ref)) == B_OK)
180 			break;
181 	}
182 
183 	return status;
184 }
185 
186 
187 status_t
_GetTokenizeMode()188 SpamFilter::_GetTokenizeMode()
189 {
190 	if (fLaunchAttemptCount > 3)
191 		return B_ERROR; // Don't try to start the server too many times.
192 	fLaunchAttemptCount++;
193 
194 	// Make sure the server is running.
195 	status_t status = _CheckForSpamServer();
196 	if (status != B_OK)
197 		return status;
198 
199 	// Set up the messenger to the database server.
200 	fMessengerToServer = BMessenger(kServerSignature);
201 	if (!fMessengerToServer.IsValid())
202 		return B_ERROR;
203 
204 	// Check if the server is running in headers only mode.  If so, we only
205 	// need to download the header rather than the entire message.
206 	BMessage scriptingMessage(B_GET_PROPERTY);
207 	scriptingMessage.AddSpecifier("TokenizeMode");
208 	BMessage replyMessage;
209 	if ((status = fMessengerToServer.SendMessage(&scriptingMessage,
210 			&replyMessage)) != B_OK)
211 		return status;
212 	status_t errorCode;
213 	if ((status = replyMessage.FindInt32("error", &errorCode)) != B_OK)
214 		return status;
215 	if (errorCode != B_OK)
216 		return errorCode;
217 
218 	const char* tokenizeMode;
219 	if ((status = replyMessage.FindString("result", &tokenizeMode)) != B_OK)
220 		return status;
221 
222 	fHeaderOnly = tokenizeMode != NULL && !strcmp(tokenizeMode, "JustHeader");
223 	return B_OK;
224 }
225 
226 
227 status_t
_GetSpamRatio(const char * stringBuffer,off_t dataSize,float & ratio)228 SpamFilter::_GetSpamRatio(const char* stringBuffer, off_t dataSize,
229 	float& ratio)
230 {
231 	// Send off a scripting command to the database server, asking it to
232 	// evaluate the string for spaminess.  Note that it can return ENOMSG
233 	// when there are no words (a good indicator of spam which is pure HTML
234 	// if you are using plain text only tokenization), so we could use that
235 	// as a spam marker too.  Code copied for the reevaluate stuff below.
236 
237 	BMessage scriptingMessage(B_SET_PROPERTY);
238 	scriptingMessage.AddSpecifier("EvaluateString");
239 	status_t errorCode = scriptingMessage.AddData("data", B_STRING_TYPE,
240 		stringBuffer, dataSize + 1, false /* fixed size */);
241 	if (errorCode != B_OK)
242 		return errorCode;
243 	BMessage replyMessage;
244 	errorCode = fMessengerToServer.SendMessage(&scriptingMessage,
245 		&replyMessage);
246 	if (errorCode != B_OK
247 		|| replyMessage.FindInt32("error", &errorCode) != B_OK)
248 		return errorCode; // Unable to read the return code.
249 	if (errorCode == ENOMSG && fNoWordsMeansSpam)
250 		ratio = fSpamCutoffRatio; // Yes, no words and that means spam.
251 	else if (errorCode != B_OK
252 		|| replyMessage.FindFloat("result", &ratio) != B_OK)
253 		return errorCode; // Classification failed in one of many ways.
254 
255 	return errorCode;
256 }
257 
258 
259 status_t
_TrainServer(const char * stringBuffer,off_t dataSize,float spamRatio)260 SpamFilter::_TrainServer(const char* stringBuffer, off_t dataSize,
261 	float spamRatio)
262 {
263 	BMessage scriptingMessage(B_SET_PROPERTY);
264 	scriptingMessage.AddSpecifier((spamRatio >= fSpamCutoffRatio)
265 		? "SpamString" : "GenuineString");
266 	status_t errorCode = scriptingMessage.AddData ("data", B_STRING_TYPE,
267 		stringBuffer, dataSize + 1, false /* fixed size */);
268 	if (errorCode != B_OK)
269 		return errorCode;
270 	BMessage replyMessage;
271 	errorCode = fMessengerToServer.SendMessage (&scriptingMessage,
272 		&replyMessage);
273 	if (errorCode != B_OK)
274 		return errorCode;
275 	errorCode = replyMessage.FindInt32("error", &errorCode);
276 
277 	return errorCode;
278 }
279 
280 
281 status_t
_AddSpamToSubject(BNode & file,float spamRatio)282 SpamFilter::_AddSpamToSubject(BNode& file, float spamRatio)
283 {
284 	attr_info info;
285 	if (file.GetAttrInfo("Subject", &info) != B_OK)
286 		return B_ERROR;
287 	if (info.type != B_STRING_TYPE)
288 		return B_ERROR;
289 
290 	char* buffer = new char[info.size];
291 	if (file.ReadAttr("Subject", B_STRING_TYPE, 0, buffer, info.size) < 0) {
292 		delete[] buffer;
293 		return B_ERROR;
294 	}
295 
296 	BString newSubjectString;
297 	newSubjectString.SetTo("[Spam ");
298 	char percentageString[30];
299 	sprintf(percentageString, "%05.2f", spamRatio * 100.0);
300 	newSubjectString << percentageString << "%] ";
301 	newSubjectString << buffer;
302 	delete[] buffer;
303 
304 	if (file.WriteAttrString("Subject", &newSubjectString) < 0)
305 		return B_ERROR;
306 
307 	return B_OK;
308 }
309 
310 
311 // #pragma mark -
312 
313 
314 BString
filter_name(const BMailAccountSettings & accountSettings,const BMailAddOnSettings * addOnSettings)315 filter_name(const BMailAccountSettings& accountSettings,
316 	const BMailAddOnSettings* addOnSettings)
317 {
318 	return B_TRANSLATE("Bayesian Spam Filter");
319 }
320 
321 
322 BMailFilter*
instantiate_filter(BMailProtocol & protocol,const BMailAddOnSettings & settings)323 instantiate_filter(BMailProtocol& protocol, const BMailAddOnSettings& settings)
324 {
325 	return new SpamFilter(protocol, settings);
326 }
327