1 /*
2 * Copyright 2002-2013, Haiku, Inc. All rights reserved.
3 * Copyright 2002 Alexander G. M. Smith.
4 * Copyright 2011, Clemens Zeidler <haiku@clemens-zeidler.de>
5 * Distributed under the terms of the MIT License.
6 */
7
/*!	Uses Bayesian statistics to evaluate the spaminess of a message.
	The evaluation is done by a separate server; this add-on just gets
	the text and uses scripting commands to get an evaluation from the server.
	If the server isn't running, it will be found and started up. Once the
	evaluation has been received, it is added to the message as an attribute
	and optionally as an addition to the subject. Some other add-on later in
	the pipeline will use the attribute to delete the message or move it to
	some other folder.
*/
17
18
19 #include "SpamFilter.h"
20
21 #include <stdlib.h>
22 #include <stdio.h>
23
24 #include <Beep.h>
25 #include <Catalog.h>
26 #include <fs_attr.h>
27 #include <Messenger.h>
28 #include <Node.h>
29 #include <Path.h>
30 #include <Roster.h>
31 #include <String.h>
32 #include <FindDirectory.h>
33 #include <Entry.h>
34
35
36 #undef B_TRANSLATION_CONTEXT
37 #define B_TRANSLATION_CONTEXT "SpamFilter"
38
39
// Sound event names passed to system_beep(). The names match the ones set up
// by spamdbm for sound effects.
static const char* const kAGMSBayesBeepGenuineName = "SpamFilter-Genuine";
static const char* const kAGMSBayesBeepSpamName = "SpamFilter-Spam";
static const char* const kAGMSBayesBeepUncertainName = "SpamFilter-Uncertain";

// Application signature used to find, launch and message the spam server.
static const char* const kServerSignature
	= "application/x-vnd.agmsmith.spamdbm";
46
47
SpamFilter(BMailProtocol & protocol,const BMailAddOnSettings & settings)48 SpamFilter::SpamFilter(BMailProtocol& protocol,
49 const BMailAddOnSettings& settings)
50 :
51 BMailFilter(protocol, &settings)
52 {
53 fAddSpamToSubject = settings.GetBool("AddMarkerToSubject", false);
54 fAutoTraining = settings.GetBool("AutoTraining", true);
55 fGenuineCutoffRatio = settings.GetFloat("GenuineCutoffRatio", 0.01f);
56 fNoWordsMeansSpam = settings.GetBool("NoWordsMeansSpam", true);
57 fQuitServerWhenFinished = settings.GetBool("QuitServerWhenFinished", false);
58 fSpamCutoffRatio = settings.GetFloat("SpamCutoffRatio", 0.99f);
59 }
60
61
~SpamFilter()62 SpamFilter::~SpamFilter()
63 {
64 if (fQuitServerWhenFinished)
65 fMessengerToServer.SendMessage(B_QUIT_REQUESTED);
66 }
67
68
69 BMailFilterAction
HeaderFetched(entry_ref & ref,BFile & file,BMessage & attributes)70 SpamFilter::HeaderFetched(entry_ref& ref, BFile& file, BMessage& attributes)
71 {
72 _CheckForSpam(file);
73 return B_NO_MAIL_ACTION;
74 }
75
76
77 void
BodyFetched(const entry_ref & ref,BFile & file,BMessage & attributes)78 SpamFilter::BodyFetched(const entry_ref& ref, BFile& file, BMessage& attributes)
79 {
80 if (fHeaderOnly)
81 return;
82
83 // See if the message has already been classified. Happens for messages
84 // which are partially downloaded when you have auto-training on. Could
85 // untrain the partial part before training on the complete message, but we
86 // don't know how big it was, so instead just ignore the message.
87 attr_info attributeInfo;
88 if (file.GetAttrInfo("MAIL:classification", &attributeInfo) == B_OK)
89 return;
90
91 _CheckForSpam(file);
92 }
93
94
95 status_t
_CheckForSpam(BFile & file)96 SpamFilter::_CheckForSpam(BFile& file)
97 {
98 // Get a connection to the spam database server. Launch if needed, should
99 // only need it once, unless another e-mail thread shuts down the server
100 // inbetween messages. This code used to be in InitCheck, but apparently
101 // that isn't called.
102 printf("Checking for Spam Server.\n");
103 if (fLaunchAttemptCount == 0 || !fMessengerToServer.IsValid()) {
104 if (_GetTokenizeMode() != B_OK)
105 return B_ERROR;
106 }
107
108 off_t dataSize;
109 file.GetSize(&dataSize);
110 char* stringBuffer = new char[dataSize + 1];
111 file.Read(stringBuffer, dataSize);
112 stringBuffer[dataSize] = 0; // Add an end of string NUL, just in case.
113
114 float spamRatio;
115 if (_GetSpamRatio(stringBuffer, dataSize, spamRatio) != B_OK)
116 return B_ERROR;
117
118 // If we are auto-training, feed back the message to the server as a
119 // training example (don't train if it is uncertain).
120 if (fAutoTraining && (spamRatio >= fSpamCutoffRatio
121 || spamRatio < fGenuineCutoffRatio)) {
122 _TrainServer(stringBuffer, dataSize, spamRatio);
123 }
124
125 delete[] stringBuffer;
126
127 // write attributes
128 BString classificationString = spamRatio >= fSpamCutoffRatio ? "Spam"
129 : spamRatio < fGenuineCutoffRatio ? "Genuine" : "Uncertain";
130 file.WriteAttrString("MAIL:classification", &classificationString);
131
132 // Store the spam ratio in an attribute called MAIL:ratio_spam,
133 // attached to the eventual output file.
134 file.WriteAttr("MAIL:ratio_spam", B_FLOAT_TYPE, 0 /* offset */, &spamRatio,
135 sizeof(spamRatio));
136
137 // Also add it to the subject, if requested.
138 if (fAddSpamToSubject && spamRatio >= fSpamCutoffRatio)
139 _AddSpamToSubject(file, spamRatio);
140
141 // Beep using different sounds for spam and genuine, as Jeremy Friesner
142 // nudged me to get around to implementing. And add uncertain to that, as
143 // "BiPolar" suggested. If the user doesn't want to hear the sound, they
144 // can turn it off in the system sound preferences.
145
146 if (spamRatio >= fSpamCutoffRatio)
147 system_beep(kAGMSBayesBeepSpamName);
148 else if (spamRatio < fGenuineCutoffRatio)
149 system_beep(kAGMSBayesBeepGenuineName);
150 else
151 system_beep(kAGMSBayesBeepUncertainName);
152
153 return B_OK;
154 }
155
156
157 status_t
_CheckForSpamServer()158 SpamFilter::_CheckForSpamServer()
159 {
160 // Make sure the server is running.
161 if (be_roster->IsRunning (kServerSignature))
162 return B_OK;
163
164 status_t status = be_roster->Launch (kServerSignature);
165 if (status == B_OK)
166 return status;
167
168 BPath path;
169 entry_ref ref;
170 const directory_which kPlaces[] = {
171 B_SYSTEM_NONPACKAGED_BIN_DIRECTORY,
172 B_SYSTEM_BIN_DIRECTORY};
173 for (size_t i = 0; i < sizeof(kPlaces) / sizeof(kPlaces[0]); i++) {
174 find_directory(kPlaces[i], &path);
175 path.Append("spamdbm");
176 if (!BEntry(path.Path()).Exists())
177 continue;
178 get_ref_for_path(path.Path(), &ref);
179 if ((status = be_roster->Launch(&ref)) == B_OK)
180 break;
181 }
182
183 return status;
184 }
185
186
187 status_t
_GetTokenizeMode()188 SpamFilter::_GetTokenizeMode()
189 {
190 if (fLaunchAttemptCount > 3)
191 return B_ERROR; // Don't try to start the server too many times.
192 fLaunchAttemptCount++;
193
194 // Make sure the server is running.
195 status_t status = _CheckForSpamServer();
196 if (status != B_OK)
197 return status;
198
199 // Set up the messenger to the database server.
200 fMessengerToServer = BMessenger(kServerSignature);
201 if (!fMessengerToServer.IsValid())
202 return B_ERROR;
203
204 // Check if the server is running in headers only mode. If so, we only
205 // need to download the header rather than the entire message.
206 BMessage scriptingMessage(B_GET_PROPERTY);
207 scriptingMessage.AddSpecifier("TokenizeMode");
208 BMessage replyMessage;
209 if ((status = fMessengerToServer.SendMessage(&scriptingMessage,
210 &replyMessage)) != B_OK)
211 return status;
212 status_t errorCode;
213 if ((status = replyMessage.FindInt32("error", &errorCode)) != B_OK)
214 return status;
215 if (errorCode != B_OK)
216 return errorCode;
217
218 const char* tokenizeMode;
219 if ((status = replyMessage.FindString("result", &tokenizeMode)) != B_OK)
220 return status;
221
222 fHeaderOnly = tokenizeMode != NULL && !strcmp(tokenizeMode, "JustHeader");
223 return B_OK;
224 }
225
226
227 status_t
_GetSpamRatio(const char * stringBuffer,off_t dataSize,float & ratio)228 SpamFilter::_GetSpamRatio(const char* stringBuffer, off_t dataSize,
229 float& ratio)
230 {
231 // Send off a scripting command to the database server, asking it to
232 // evaluate the string for spaminess. Note that it can return ENOMSG
233 // when there are no words (a good indicator of spam which is pure HTML
234 // if you are using plain text only tokenization), so we could use that
235 // as a spam marker too. Code copied for the reevaluate stuff below.
236
237 BMessage scriptingMessage(B_SET_PROPERTY);
238 scriptingMessage.AddSpecifier("EvaluateString");
239 status_t errorCode = scriptingMessage.AddData("data", B_STRING_TYPE,
240 stringBuffer, dataSize + 1, false /* fixed size */);
241 if (errorCode != B_OK)
242 return errorCode;
243 BMessage replyMessage;
244 errorCode = fMessengerToServer.SendMessage(&scriptingMessage,
245 &replyMessage);
246 if (errorCode != B_OK
247 || replyMessage.FindInt32("error", &errorCode) != B_OK)
248 return errorCode; // Unable to read the return code.
249 if (errorCode == ENOMSG && fNoWordsMeansSpam)
250 ratio = fSpamCutoffRatio; // Yes, no words and that means spam.
251 else if (errorCode != B_OK
252 || replyMessage.FindFloat("result", &ratio) != B_OK)
253 return errorCode; // Classification failed in one of many ways.
254
255 return errorCode;
256 }
257
258
259 status_t
_TrainServer(const char * stringBuffer,off_t dataSize,float spamRatio)260 SpamFilter::_TrainServer(const char* stringBuffer, off_t dataSize,
261 float spamRatio)
262 {
263 BMessage scriptingMessage(B_SET_PROPERTY);
264 scriptingMessage.AddSpecifier((spamRatio >= fSpamCutoffRatio)
265 ? "SpamString" : "GenuineString");
266 status_t errorCode = scriptingMessage.AddData ("data", B_STRING_TYPE,
267 stringBuffer, dataSize + 1, false /* fixed size */);
268 if (errorCode != B_OK)
269 return errorCode;
270 BMessage replyMessage;
271 errorCode = fMessengerToServer.SendMessage (&scriptingMessage,
272 &replyMessage);
273 if (errorCode != B_OK)
274 return errorCode;
275 errorCode = replyMessage.FindInt32("error", &errorCode);
276
277 return errorCode;
278 }
279
280
281 status_t
_AddSpamToSubject(BNode & file,float spamRatio)282 SpamFilter::_AddSpamToSubject(BNode& file, float spamRatio)
283 {
284 attr_info info;
285 if (file.GetAttrInfo("Subject", &info) != B_OK)
286 return B_ERROR;
287 if (info.type != B_STRING_TYPE)
288 return B_ERROR;
289
290 char* buffer = new char[info.size];
291 if (file.ReadAttr("Subject", B_STRING_TYPE, 0, buffer, info.size) < 0) {
292 delete[] buffer;
293 return B_ERROR;
294 }
295
296 BString newSubjectString;
297 newSubjectString.SetTo("[Spam ");
298 char percentageString[30];
299 sprintf(percentageString, "%05.2f", spamRatio * 100.0);
300 newSubjectString << percentageString << "%] ";
301 newSubjectString << buffer;
302 delete[] buffer;
303
304 if (file.WriteAttrString("Subject", &newSubjectString) < 0)
305 return B_ERROR;
306
307 return B_OK;
308 }
309
310
311 // #pragma mark -
312
313
314 BString
filter_name(const BMailAccountSettings & accountSettings,const BMailAddOnSettings * addOnSettings)315 filter_name(const BMailAccountSettings& accountSettings,
316 const BMailAddOnSettings* addOnSettings)
317 {
318 return B_TRANSLATE("Bayesian Spam Filter");
319 }
320
321
322 BMailFilter*
instantiate_filter(BMailProtocol & protocol,const BMailAddOnSettings & settings)323 instantiate_filter(BMailProtocol& protocol, const BMailAddOnSettings& settings)
324 {
325 return new SpamFilter(protocol, settings);
326 }
327