xref: /haiku/src/bin/mail_utils/spamdbm.cpp (revision 220d04022750f40f8bac8f01fa551211e28d04f2)
1 /******************************************************************************
2  * $Id: spamdbm.cpp 30630 2009-05-05 01:31:01Z bga $
3  *
4  * This is a BeOS program for classifying e-mail messages as spam (unwanted
5  * junk mail) or as genuine mail using a Bayesian statistical approach.  There
6  * is also a Mail Daemon Replacement add-on to filter mail using the
7  * classification statistics collected earlier.
8  *
9  * See also http://www.paulgraham.com/spam.html for a good writeup and
10  * http://www.tuxedo.org/~esr/bogofilter/ for another implementation.
11  * And more recently, Gary Robinson's write up of his improved algorithm
12  * at http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
13  * which gives a better spread in spam ratios and slightly fewer
14  * misclassifications.
15  *
16  * Note that this uses the AGMS vacation coding style, not the OpenTracker one.
17  * That means no tabs, indents are two spaces, m_ is the prefix for member
18  * variables, g_ is the prefix for global names, C style comments, constants
19  * are in all capital letters and most other things are mixed case, it's word
20  * wrapped to fit in 79 characters per line to make proofreading on paper
21  * easier, and functions are listed in reverse dependency order so that forward
22  * declarations (function prototypes with no code) aren't needed.
23  *
24  * The Original Design:
25  * There is a spam database (just a file listing words and number of times they
26  * were used in spam and non-spam messages) that a BeMailDaemon input filter
27  * will use when scanning email.  It will mark the mail with the spam
28  * probability (an attribute, optionally a mail header field) and optionally do
29  * something if the probability exceeds a user defined level (delete message,
30  * change subject, file in a different folder).  Or should that be a different
31  * filter?  Outside the mail system, the probability can be used in queries to
32  * find spam.
33  *
34  * A second user application will be used to update the database.  Besides
35  * showing you the current list of words, you can drag and drop files to mark
36  * them as spam or non-spam (a balanced binary tree is used internally to make
37  * word storage fast).  It will add a second attribute to the files to show how
38  * they have been classified by the user (and won't update the database if you
39  * accidentally try to classify a file again).  Besides drag and drop, there
40  * will be a command line interface and a message passing interface.  BeMail
41  * (or other programs) will then communicate via messages to tell it when the
42  * user marks a message as spam or not (via having separate delete spam /
43  * delete genuine mail buttons and a menu item or two).
44  *
45  * Plus lots of details, like the rename swap method to update the database
46  * file (so programs with the old file open aren't affected).  A nice tab text
47  * format so you can open the database in a spreadsheet.  Startup and shutdown
48  * control of the updater from BeMail.  Automatic creation of the indices
49  * needed by the filter.  MIME types for the database file.  Icons for the app.
50  * System settings to enable tracker to display the new attributes when viewing
51  * e-mail (and maybe news articles if someone ever gets around to an NNTP as
52  * files reader).  Documentation.  Recursive directory traversal for the
53  * command line or directory drag and drop.  Options for the updater to warn or
54  * ignore non-email files.  Etc.
55  *
56  * The Actual Implementation:
57  * The spam database updates and the test for spam have been combined into one
58  * program which runs as a server.  That way there won't be as long a delay
59  * when the e-mail system wants to check for spam, because the database is
60  * already loaded by the server and in memory.  The MDR mail filter add-on
61  * simply sends scripting commands to the server (and starts it up if it isn't
62  * already running).  The filter takes care of marking the messages when it
63  * gets the rating back from the server, and then the rest of the mail system
64  * rule chain can delete the message or otherwise manipulate it.
65  *
66  * Revision History (now manually updated due to SVN's philosophy)
67  * $Log: spamdbm.cpp,v $
68  * ------------------------------------------------------------------------
69  * r15195 | agmsmith | 2005-11-27 21:07:55 -0500 (Sun, 27 Nov 2005) | 4 lines
70  * Just a few minutes after checking in, I mentioned it to Japanese expert Koki
71  * and he suggested also including the Japanese comma.  So before I forget to
72  * do it...
73  *
74  * ------------------------------------------------------------------------
75  * r15194 | agmsmith | 2005-11-27 20:37:13 -0500 (Sun, 27 Nov 2005) | 5 lines
76  * Truncate overly long URLs to the maximum word length.  Convert Japanese
77  * periods to spaces so that more "words" are found.  Fix UTF-8 comparison
78  * problems with tolower() incorrectly converting characters with the high bit
79  * set.
80  *
81  * r15098 | agmsmith | 2005-11-23 23:17:00 -0500 (Wed, 23 Nov 2005) | 5 lines
82  * Added better tokenization so that HTML is parsed and things like tags
83  * between letters of a word no longer hide that word.  After testing, the
84  * result seems to be a tighter spread of ratings when done in full text plus
85  * header mode.
86  *
87  * Revision 1.10  2005/11/24 02:08:39  agmsmith
88  * Fixed up prefix codes, Z for things that are inside other things.
89  *
90  * Revision 1.9  2005/11/21 03:28:03  agmsmith
91  * Added a function for extracting URLs.
92  *
93  * Revision 1.8  2005/11/09 03:36:18  agmsmith
94  * Removed noframes detection (doesn't show up in e-mails).  Now use
95  * just H for headers and Z for HTML tag junk.
96  *
97  * Revision 1.7  2005/10/24 00:00:08  agmsmith
98  * Adding HTML tag removal, which also affected the search function so it
99  * could search for single part things like &nbsp;.
100  *
101  * Revision 1.6  2005/10/17 01:55:08  agmsmith
102  * Remove HTML comments and a few other similar things.
103  *
104  * Revision 1.5  2005/10/16 18:35:36  agmsmith
105  * Under construction - looking into HTML not being in UTF-8.
106  *
107  * Revision 1.4  2005/10/11 01:51:21  agmsmith
108  * Starting on the tokenising passes.  Still need to test Asian truncation.
109  *
110  * Revision 1.3  2005/10/06 11:54:07  agmsmith
111  * Not much.
112  *
113  * Revision 1.2  2005/09/12 01:49:37  agmsmith
114  * Enable case folding for the whole file tokenizer.
115  *
116  * r13961 | agmsmith | 2005-08-13 22:25:28 -0400 (Sat, 13 Aug 2005) | 2 lines
117  * Source code changes so that mboxtobemail now compiles and is in the build
118  * system.
119  *
120  * r13959 | agmsmith | 2005-08-13 22:05:27 -0400 (Sat, 13 Aug 2005) | 2 lines
121  * Rename the directory before doing anything else, otherwise svn dies badly.
122  *
123  * r13952 | agmsmith | 2005-08-13 15:31:42 -0400 (Sat, 13 Aug 2005) | 3 lines
124  * Added the resources and file type associations, changed the application
125  * signature and otherwise made the spam detection system work properly again.
126  *
127  * r13951 | agmsmith | 2005-08-13 11:40:01 -0400 (Sat, 13 Aug 2005) | 2 lines
128  * Had to do the file rename as a separate operation due to SVN limitations.
129  *
130  * r13950 | agmsmith | 2005-08-13 11:38:44 -0400 (Sat, 13 Aug 2005) | 3 lines
131  * Oops, "spamdb" is already used for a Unix package.  And spamdatabase is
132  * already reserved by a domain name squatter.  Use "spamdbm" instead.
133  *
134  * r13949 | agmsmith | 2005-08-13 11:17:52 -0400 (Sat, 13 Aug 2005) | 3 lines
135  * Renamed spamfilter to be the more meaningful spamdb (spam database) and
136  * moved it into its own source directory in preparation for adding resources.
137  *
138  * r13628 | agmsmith | 2005-07-10 20:11:29 -0400 (Sun, 10 Jul 2005) | 3 lines
139  * Updated keyword expansion to use SVN keywords.  Also seeing if svn is
140  * working well enough for me to update files from BeOS R5.
141  *
142  * r11909 | axeld | 2005-03-18 19:09:19 -0500 (Fri, 18 Mar 2005) | 2 lines
143  * Moved bin/ directory out of apps/.
144  *
145  * r11769 | bonefish | 2005-03-17 03:30:54 -0500 (Thu, 17 Mar 2005) | 1 line
146  * Move trunk into respective module.
147  *
148  * r10362 | nwhitehorn | 2004-12-06 20:14:05 -0500 (Mon, 06 Dec 2004) | 2 lines
149  * Fixed the spam filter so it works correctly now.
150  *
151  * r9934 | nwhitehorn | 2004-11-11 21:55:05 -0500 (Thu, 11 Nov 2004) | 2 lines
152  * Added AGMS's excellent spam detection software.  Still some weirdness with
153  * the configuration interface from E-mail prefs.
154  *
155  * Revision 1.2  2004/12/07 01:14:05  nwhitehorn
156  * Fixed the spam filter so it works correctly now.
157  *
158  * Revision 1.87  2004/09/20 15:57:26  nwhitehorn
159  * Mostly updated the tree to Be/Haiku style identifier naming conventions.  I
160  * have a few more things to work out, mostly in mail_util.h, and then I'm
161  * proceeding to jamify the build system.  Then we go into Haiku CVS.
162  *
163  * Revision 1.86  2003/07/26 16:47:46  agmsmith
164  * Bug - wasn't allowing double classification if the user had turned on
165  * the option to ignore the previous classification.
166  *
167  * Revision 1.85  2003/07/08 14:52:57  agmsmith
168  * Fix bug with classification choices dialog box coming up with weird
169  * sizes due to RefsReceived message coming in before ReadyToRun had
170  * finished setting up the default sizes of the controls.
171  *
172  * Revision 1.84  2003/07/04 19:59:29  agmsmith
173  * Now with a GUI option to let you declassify messages (set them back
174  * to uncertain, rather than spam or genuine).  Required a BAlert
175  * replacement since BAlerts can't do four buttons.
176  *
177  * Revision 1.83  2003/07/03 20:40:36  agmsmith
178  * Added Uncertain option for declassifying messages.
179  *
180  * Revision 1.82  2003/06/16 14:57:13  agmsmith
181  * Detect spam which uses mislabeled text attachments, going by the file name
182  * extension.
183  *
184  * Revision 1.81  2003/04/08 20:27:04  agmsmith
185  * AGMSBayesianSpamServer now shuts down immediately and returns true if
186  * it is asked to quit by the registrar.
187  *
188  * Revision 1.80  2003/04/07 19:20:27  agmsmith
189  * Ooops, int64 doesn't exist, use long long instead.
190  *
191  * Revision 1.79  2003/04/07 19:05:22  agmsmith
192  * Now with Allen Brunson's atoll for PPC (you need the %Ld, but that
193  * becomes %lld on other systems).
194  *
195  * Revision 1.78  2003/04/04 22:43:53  agmsmith
196  * Fixed up atoll PPC processor hack so it would actually work, was just
197  * returning zero which meant that it wouldn't load in the database file
198  * (read the size as zero).
199  *
200  * Revision 1.77  2003/01/22 03:19:48  agmsmith
201  * Don't convert words to lower case, the case is important for spam.
202  * Particularly sentences which start with exciting words, which you
203  * normally won't use at the start of a sentence (and thus capitalize).
204  *
205  * Revision 1.76  2002/12/18 02:29:22  agmsmith
206  * Add space for the Uncertain display in Tracker.
207  *
208  * Revision 1.75  2002/12/18 01:54:37  agmsmith
209  * Added uncertain sound effect.
210  *
211  * Revision 1.74  2002/12/13 23:53:12  agmsmith
212  * Minimize the window before opening it so that it doesn't flash on the
213  * screen in server mode.  Also load the database when the window is
214  * displayed so that the user can see the words.
215  *
216  * Revision 1.73  2002/12/13 20:55:57  agmsmith
217  * Documentation.
218  *
219  * Revision 1.72  2002/12/13 20:26:11  agmsmith
220  * Fixed bug with adding messages in strings to database (was limited to
221  * messages at most 1K long).  Also changed default server mode to true
222  * since that's what people use most.
223  *
224  * Revision 1.71  2002/12/11 22:37:30  agmsmith
225  * Added commands to train on spam and genuine e-mail messages passed
226  * in string arguments rather than via external files.
227  *
228  * Revision 1.70  2002/12/10 22:12:41  agmsmith
229  * Adding a message to the database now uses a BPositionIO rather than a
230  * file and file name (for future string rather than file additions).  Also
231  * now re-evaluate a file after reclassifying it so that the user can see
232  * the new ratio.  Also remove the [Spam 99.9%] subject prefix when doing
233  * a re-evaluation or classification (the number would be wrong).
234  *
235  * Revision 1.69  2002/12/10 01:46:04  agmsmith
236  * Added the Chi-Squared scoring method.
237  *
238  * Revision 1.68  2002/11/29 22:08:25  agmsmith
239  * Change default purge age to 2000 so that hitting the purge button
240  * doesn't erase stuff from the new sample database.
241  *
242  * Revision 1.67  2002/11/25 20:39:39  agmsmith
243  * Don't need to massage the MIME type since the mail library now does
244  * the lower case conversion and converts TEXT to text/plain too.
245  *
246  * Revision 1.66  2002/11/20 22:57:12  nwhitehorn
247  * PPC Compatibility Fixes
248  *
249  * Revision 1.65  2002/11/10 18:43:55  agmsmith
250  * Added a time delay to some quitting operations so that scripting commands
251  * from a second client (like a second e-mail account) will make the program
252  * abort the quit operation.
253  *
254  * Revision 1.64  2002/11/05 18:05:16  agmsmith
255  * Looked at Nathan's PPC changes (thanks!), modified style a bit.
256  *
257  * Revision 1.63  2002/11/04 03:30:22  nwhitehorn
258  * Now works (or compiles at least) on PowerPC.  I'll get around to testing it
259  * later.
260  *
261  * Revision 1.62  2002/11/04 01:03:33  agmsmith
262  * Fixed warnings so it compiles under the bemaildaemon system.
263  *
264  * Revision 1.61  2002/11/03 23:00:37  agmsmith
265  * Added to the bemaildaemon project on SourceForge.  Hmmmm, seems to switch to
266  * a new version if I commit and specify a message, but doesn't accept the
267  * message and puts up the text editor.  Must be a bug where cvs eats the first
268  * option after "commit".
269  *
270  * Revision 1.60.1.1  2002/10/22 14:29:27  agmsmith
271  * Needed to recompile with the original Libmail.so from Beta/1 since
272  * the current library uses a different constructor, and thus wouldn't
273  * run when used with the old library.
274  *
275  * Revision 1.60  2002/10/21 16:41:27  agmsmith
276  * Return a special error code when no words are found in a message,
277  * so that messages without text/plain parts can be recognized as
278  * spam by the mail filter.
279  *
280  * Revision 1.59  2002/10/20 21:29:47  agmsmith
281  * Watch out for MIME types of "text", treat as text/plain.
282  *
283  * Revision 1.58  2002/10/20 18:29:07  agmsmith
284  * *** empty log message ***
285  *
286  * Revision 1.57  2002/10/20 18:25:02  agmsmith
287  * Fix case sensitivity in MIME type tests, and fix text/any test.
288  *
289  * Revision 1.56  2002/10/19 17:00:10  agmsmith
290  * Added the pop-up menu for the tokenize modes.
291  *
292  * Revision 1.55  2002/10/19 14:54:06  agmsmith
293  * Fudge MIME type of body text components so that they get
294  * treated as text.
295  *
296  * Revision 1.54  2002/10/19 00:56:37  agmsmith
297  * The parsing of e-mail messages seems to be working now, just need
298  * to add some user interface stuff for the tokenizing mode.
299  *
300  * Revision 1.53  2002/10/18 23:37:56  agmsmith
301  * More mail kit usage, can now decode headers, but more to do.
302  *
303  * Revision 1.52  2002/10/16 23:52:33  agmsmith
304  * Getting ready to add more tokenizing modes, exploring Mail Kit to break
305  * apart messages into components (and decode BASE64 and other encodings).
306  *
307  * Revision 1.51  2002/10/11 20:05:31  agmsmith
308  * Added installation of sound effect names, which the filter will use.
309  *
310  * Revision 1.50  2002/10/02 16:50:02  agmsmith
311  * Forgot to add credits to the algorithm inventors.
312  *
313  * Revision 1.49  2002/10/01 00:39:29  agmsmith
314  * Added drag and drop to evaluate files or to add them to the list.
315  *
316  * Revision 1.48  2002/09/30 19:44:17  agmsmith
317  * Switched to Gary Robinson's method, removed max spam/genuine word.
318  *
319  * Revision 1.47  2002/09/23 17:08:55  agmsmith
320  * Add an attribute with the spam ratio to files which have been evaluated.
321  *
322  * Revision 1.46  2002/09/23 02:50:32  agmsmith
323  * Fiddling with display width of e-mail attributes.
324  *
325  * Revision 1.45  2002/09/23 01:13:56  agmsmith
326  * Oops, bug in string evaluation scripting.
327  *
328  * Revision 1.44  2002/09/22 21:00:55  agmsmith
329  * Added EvaluateString so that the BeMail add-on can pass the info without
330  * having to create a temporary file.
331  *
332  * Revision 1.43  2002/09/20 19:56:02  agmsmith
333  * Added about box and button for estimating the spam ratio of a file.
334  *
335  * Revision 1.42  2002/09/20 01:22:26  agmsmith
336  * More testing, decide that an extreme ratio bias point of 0.5 is good.
337  *
338  * Revision 1.41  2002/09/19 21:17:12  agmsmith
339  * Changed a few names and proofread the program.
340  *
341  * Revision 1.40  2002/09/19 14:27:17  agmsmith
342  * Rearranged execution of commands, moving them to a separate looper
343  * rather than the BApplication, so that thousands of files could be
344  * processed without worrying about the message queue filling up.
345  *
346  * Revision 1.39  2002/09/18 18:47:16  agmsmith
347  * Stop flickering when the view is partially obscured, update cached
348  * values in all situations except when app is busy.
349  *
350  * Revision 1.38  2002/09/18 18:08:11  agmsmith
351  * Add a function for evaluating the spam ratio of a message.
352  *
353  * Revision 1.37  2002/09/16 01:30:16  agmsmith
354  * Added Get Oldest command.
355  *
356  * Revision 1.36  2002/09/16 00:47:52  agmsmith
357  * Change the display to counter-weigh the spam ratio by the number of
358  * messages.
359  *
360  * Revision 1.35  2002/09/15 20:49:35  agmsmith
361  * Scrolling improved, buttons, keys and mouse wheel added.
362  *
363  * Revision 1.34  2002/09/15 03:46:10  agmsmith
364  * Up and down buttons under construction.
365  *
366  * Revision 1.33  2002/09/15 02:09:21  agmsmith
367  * Took out scroll bar.
368  *
369  * Revision 1.32  2002/09/15 02:05:30  agmsmith
370  * Trying to add a scroll bar, but it isn't very useful.
371  *
372  * Revision 1.31  2002/09/14 23:06:28  agmsmith
373  * Now has live updates of the list of words.
374  *
375  * Revision 1.30  2002/09/14 19:53:11  agmsmith
376  * Now with a better display of the words.
377  *
378  * Revision 1.29  2002/09/13 21:33:54  agmsmith
379  * Now draws the words in the word display view, but still primitive.
380  *
381  * Revision 1.28  2002/09/13 19:28:02  agmsmith
382  * Added display of most genuine and most spamiest, fixed up cursor.
383  *
384  * Revision 1.27  2002/09/13 03:08:42  agmsmith
385  * Show current word and message counts, and a busy cursor.
386  *
387  * Revision 1.26  2002/09/13 00:00:08  agmsmith
388  * Fixed up some deadlock problems, now using asynchronous message replies.
389  *
390  * Revision 1.25  2002/09/12 17:56:58  agmsmith
391  * Keep track of words which are spamiest and genuinest.
392  *
393  * Revision 1.24  2002/09/12 01:57:10  agmsmith
394  * Added server mode.
395  *
396  * Revision 1.23  2002/09/11 23:30:45  agmsmith
397  * Added Purge button and ignore classification checkbox.
398  *
399  * Revision 1.22  2002/09/11 21:23:13  agmsmith
400  * Added bulk update choice, purge button, moved to a BView container
401  * for all the controls (so background colour could be set, and Pulse
402  * works normally for it too).
403  *
404  * Revision 1.21  2002/09/10 22:52:49  agmsmith
405  * You can now change the database name in the GUI.
406  *
407  * Revision 1.20  2002/09/09 14:20:42  agmsmith
408  * Now can have multiple backups, and implemented refs received.
409  *
410  * Revision 1.19  2002/09/07 19:14:56  agmsmith
411  * Added standard GUI measurement code.
412  *
413  * Revision 1.18  2002/09/06 21:03:03  agmsmith
414  * Rearranging code to avoid forward references when adding a window class.
415  *
416  * Revision 1.17  2002/09/06 02:54:00  agmsmith
417  * Added the ability to purge old words from the database.
418  *
419  * Revision 1.16  2002/09/05 00:46:03  agmsmith
420  * Now adds spam to the database!
421  *
422  * Revision 1.15  2002/09/04 20:32:15  agmsmith
423  * Read ahead a couple of letters to decode quoted-printable better.
424  *
425  * Revision 1.14  2002/09/04 03:10:03  agmsmith
426  * Can now tokenize (break into words) a text file.
427  *
428  * Revision 1.13  2002/09/03 21:50:54  agmsmith
429  * Count database command, set up MIME type for the database file.
430  *
431  * Revision 1.12  2002/09/03 19:55:54  agmsmith
432  * Added loading and saving the database.
433  *
434  * Revision 1.11  2002/09/02 03:35:33  agmsmith
435  * Create indices and set up attribute associations with the e-mail MIME type.
436  *
437  * Revision 1.10  2002/09/01 15:52:49  agmsmith
438  * Can now delete the database.
439  *
440  * Revision 1.9  2002/08/31 21:55:32  agmsmith
441  * Yet more scripting.
442  *
443  * Revision 1.8  2002/08/31 21:41:37  agmsmith
444  * Under construction, with example code to decode a B_REPLY.
445  *
446  * Revision 1.7  2002/08/30 19:29:06  agmsmith
447  * Combined loading and saving settings into one function.
448  *
449  * Revision 1.6  2002/08/30 02:01:10  agmsmith
450  * Working on loading and saving settings.
451  *
452  * Revision 1.5  2002/08/29 23:17:42  agmsmith
453  * More scripting.
454  *
455  * Revision 1.4  2002/08/28 00:40:52  agmsmith
456  * Scripting now seems to work, at least the messages flow properly.
457  *
458  * Revision 1.3  2002/08/25 21:51:44  agmsmith
459  * Getting the about text formatting right.
460  *
461  * Revision 1.2  2002/08/25 21:28:20  agmsmith
462  * Trying out the BeOS scripting system as a way of implementing the program.
463  *
464  * Revision 1.1  2002/08/24 02:27:51  agmsmith
465  * Initial revision
466  */
467 
468 /* Standard C Library. */
469 
470 #include <stdio.h>
471 #include <stdlib.h>
472 #include <errno.h>
473 
474 /* Standard C++ library. */
475 
476 #include <iostream>
477 
478 /* STL (Standard Template Library) headers. */
479 
480 #include <map>
481 #include <queue>
482 #include <set>
483 #include <string>
484 #include <vector>
485 
486 using namespace std;
487 
488 /* BeOS (Be Operating System) headers. */
489 
490 #include <Alert.h>
491 #include <Application.h>
492 #include <Beep.h>
493 #include <Button.h>
494 #include <CheckBox.h>
495 #include <Cursor.h>
496 #include <Directory.h>
497 #include <Entry.h>
498 #include <File.h>
499 #include <FilePanel.h>
500 #include <FindDirectory.h>
501 #include <fs_index.h>
502 #include <fs_info.h>
503 #include <MenuBar.h>
504 #include <MenuItem.h>
505 #include <Message.h>
506 #include <MessageQueue.h>
507 #include <MessageRunner.h>
508 #include <Mime.h>
509 #include <NodeInfo.h>
510 #include <Path.h>
511 #include <Picture.h>
512 #include <PictureButton.h>
513 #include <Point.h>
514 #include <Polygon.h>
515 #include <PopUpMenu.h>
516 #include <PropertyInfo.h>
517 #include <RadioButton.h>
518 #include <Resources.h>
519 #include <Screen.h>
520 #include <ScrollBar.h>
521 #include <String.h>
522 #include <StringView.h>
523 #include <TextControl.h>
524 #include <View.h>
525 
526 /* Included from the Mail Daemon Replacement project (MDR) include/public
527 directory, available from http://sourceforge.net/projects/bemaildaemon/ */
528 
529 #include <MailMessage.h>
530 #include <MailAttachment.h>
531 
532 
533 /******************************************************************************
534  * Global variables, and not-so-variable things too.  Grouped by functionality.
535  */
536 
537 static float g_MarginBetweenControls; /* Space of a letter "M" between them. */
538 static float g_LineOfTextHeight;      /* Height of text in the current font. */
539 static float g_StringViewHeight;      /* Height of a string view text box. */
540 static float g_ButtonHeight;          /* How many pixels tall buttons are. */
541 static float g_CheckBoxHeight;        /* Same for check boxes. */
542 static float g_RadioButtonHeight;     /* Also for radio buttons. */
543 static float g_PopUpMenuHeight;       /* Again for pop-up menus. */
544 static float g_TextBoxHeight;         /* Ditto for editable text controls. */
  /* Shared GUI layout measurements.  They have no initialisers here, so as
  file-scope statics they start out at zero.  NOTE(review): presumably they
  get measured during application startup (the g_AppReadyToRunCompleted
  remarks further down mention button heights not being determined early
  on) - confirm against the startup code. */
545 
546 static const char *g_ABSAppSignature =
547   "application/x-vnd.agmsmith.spamdbm";
  /* MIME-style application signature string for this program. */
548 
549 static const char *g_ABSDatabaseFileMIMEType =
550   "text/x-vnd.agmsmith.spam_probability_database";
  /* MIME type string for the spam database file. */
551 
552 static const char *g_DefaultDatabaseFileName =
553   "SpamDBM Database";
  /* Leaf name of the default database file.  The DatabaseFile scripting
  property's usage text describes the default as being something like
  B_USER_SETTINGS_DIRECTORY / Mail / SpamDBM Database. */
554 
555 static const char *g_DatabaseRecognitionString =
556   "Spam Database File";
  /* NOTE(review): presumably marker text used to recognise a database file
  when it is loaded - confirm against the load and save code. */
557 
558 static const char *g_AttributeNameClassification = "MAIL:classification";
559 static const char *g_AttributeNameSpamRatio = "MAIL:ratio_spam";
560 static const char *g_BeepGenuine = "SpamFilter-Genuine";
561 static const char *g_BeepSpam = "SpamFilter-Spam";
562 static const char *g_BeepUncertain = "SpamFilter-Uncertain";
563 static const char *g_ClassifiedSpam = "Spam";
564 static const char *g_ClassifiedGenuine = "Genuine";
565 static const char *g_DataName = "data";
566 static const char *g_ResultName = "result";
  /* Attribute names and classification values.  The Spam scripting property's
  usage text says that processed files get a MAIL:classification attribute
  with a value of "Spam" or "Genuine", and have their spam ratio attribute
  updated too.  NOTE(review): the g_Beep strings look like sound effect names
  and g_DataName / g_ResultName look like message field names, but their uses
  aren't visible in this part of the file - confirm. */
567 
568 static const char *g_SettingsDirectoryName = "Mail";
569 static const char *g_SettingsFileName = "SpamDBM Settings";
570 static const uint32 g_SettingsWhatCode = 'SDBM';
  /* A multi-character constant.  NOTE(review): presumably the "what" code of
  the message the settings are stored in - confirm. */
571 static const char *g_BackupSuffix = ".backup %d";
  /* A printf style format with one integer conversion.  NOTE(review):
  presumably the integer is the backup number, 0 to g_MaxBackups - 1, and the
  result is appended to the database file name - confirm. */
572 static const int g_MaxBackups = 10; /* Numbered from 0 to g_MaxBackups - 1. */
573 static const size_t g_MaxWordLength = 50; /* Words longer than this aren't. */
574 static const int g_MaxInterestingWords = 150; /* Top N words are examined. */
575 static const double g_RobinsonS = 0.45; /* Default weight for no data. */
576 static const double g_RobinsonX = 0.5; /* Halfway point for no data. */
  /* NOTE(review): the S and X names presumably follow the parameters in Gary
  Robinson's write up, which is mentioned at the top of this file - confirm. */
577 
578 static bool g_CommandLineMode;
579   /* TRUE if the program was started from the command line (and thus should
580   exit after processing the command), FALSE if it is running with a graphical
581   user interface. */
582 
583 static bool g_ServerMode;
584   /* When TRUE the program runs in server mode - error messages don't result in
585   pop-up dialog boxes, but you can still see them in stderr.  Also the window
586   is minimized, if it exists. */
  /* Neither g_CommandLineMode nor g_ServerMode has an initialiser here, so as
  file-scope statics they start out false.  NOTE(review): the revision 1.72
  log entry says the default server mode was changed to true, so presumably
  the default is assigned elsewhere - confirm. */
587 
588 static int g_QuitCountdown = -1;
589   /* Set to the number of pulse timing events (about one every half second) to
590   count down before the program quits.  Negative means stop counting.  Zero
591   means quit at the next pulse event.  This is used to keep the program alive
592   for a short while after someone requests that it quit, in case more scripting
593   commands come in, which will stop the countdown.  Needed to handle the case
594   where there are multiple e-mail accounts all requesting spam identification,
595   and one finishes first and tells the server to quit.  It also checks to see
596   that there is no more work to do before trying to quit. */
597 
598 static volatile bool g_AppReadyToRunCompleted = false;
599   /* The BApplication starts processing messages before ReadyToRun finishes,
600   which can lead to initialisation problems (button heights not determined).
601   So wait for this to turn TRUE in code that might run early, like
602   RefsReceived. */
  /* NOTE(review): volatile by itself does not order or synchronise accesses
  between threads in standard C++.  If this flag is written and polled from
  different threads, confirm that is adequate on the target compilers. */
603 
604 static class CommanderLooper *g_CommanderLooperPntr = NULL;
605 static BMessenger *g_CommanderMessenger = NULL;
606   /* Some globals for use with the looper which processes external commands
607   (arguments received, file references received), needed for avoiding deadlocks
608   which would happen if the BApplication sent a scripting message to itself. */
  /* The "class CommanderLooper" form above is an elaborated type specifier, so
  the pointer can be declared at this point without the class definition
  having appeared in the lines before it. */
609 
610 static BCursor *g_BusyCursor = NULL;
611   /* The busy cursor, will be loaded from the resource file during application
612   startup. */
613 
/* Identifies each scripting property.  The values run consecutively from
zero, so they can be used as indices into g_PropertyNames, and PN_MAX (which
isn't a property itself) ends up being the number of properties.  The
g_ScriptingPropertyList entries also store these values in their extra_data
field. */
614 typedef enum PropertyNumbersEnum
615 {
616   PN_DATABASE_FILE = 0,
617   PN_SPAM,
618   PN_SPAM_STRING,
619   PN_GENUINE,
620   PN_GENUINE_STRING,
621   PN_UNCERTAIN,
622   PN_IGNORE_PREVIOUS_CLASSIFICATION,
623   PN_SERVER_MODE,
624   PN_FLUSH,
625   PN_PURGE_AGE,
626   PN_PURGE_POPULARITY,
627   PN_PURGE,
628   PN_OLDEST,
629   PN_EVALUATE,
630   PN_EVALUATE_STRING,
631   PN_RESET_TO_DEFAULTS,
632   PN_INSTALL_THINGS,
633   PN_TOKENIZE_MODE,
634   PN_SCORING_MODE,
635   PN_MAX /* Count of the properties above, keep it last. */
636 } PropertyNumbers;
637 
/* The scripting name of each property, indexed by PropertyNumbers value
(PN_DATABASE_FILE to PN_MAX - 1).  The strings have to stay in the same order
as the enum entries, since g_ScriptingPropertyList looks its names up here
with expressions like g_PropertyNames[PN_DATABASE_FILE]. */
638 static const char * g_PropertyNames [PN_MAX] =
639 {
640   "DatabaseFile",
641   "Spam",
642   "SpamString",
643   "Genuine",
644   "GenuineString",
645   "Uncertain",
646   "IgnorePreviousClassification",
647   "ServerMode",
648   "Flush",
649   "PurgeAge",
650   "PurgePopularity",
651   "Purge",
652   "Oldest",
653   "Evaluate",
654   "EvaluateString",
655   "ResetToDefaults",
656   "InstallThings",
657   "TokenizeMode",
658   "ScoringMode"
659 };
660 
661 /* This array lists the scripting commands we can handle, in a format that the
662 scripting system can understand too. */
663 
664 static struct property_info g_ScriptingPropertyList [] =
665 {
666   /* *name; commands[10]; specifiers[10]; *usage; extra_data; ... */
667   {g_PropertyNames[PN_DATABASE_FILE], {B_GET_PROPERTY, 0},
668     {B_DIRECT_SPECIFIER, 0}, "Get the pathname of the current database file.  "
669     "The default name is something like B_USER_SETTINGS_DIRECTORY / "
670     "Mail / SpamDBM Database", PN_DATABASE_FILE,
671     {}, {}, {}},
672   {g_PropertyNames[PN_DATABASE_FILE], {B_SET_PROPERTY, 0},
673     {B_DIRECT_SPECIFIER, 0}, "Change the pathname of the database file to "
674     "use.  It will automatically be converted to an absolute path name, "
675     "so make sure the parent directories exist before setting it.  If it "
676     "doesn't exist, you'll have to use the create command next.",
677     PN_DATABASE_FILE, {}, {}, {}},
678   {g_PropertyNames[PN_DATABASE_FILE], {B_CREATE_PROPERTY, 0},
679     {B_DIRECT_SPECIFIER, 0}, "Creates a new empty database, will replace "
680     "the existing database file too.", PN_DATABASE_FILE, {}, {}, {}},
681   {g_PropertyNames[PN_DATABASE_FILE], {B_DELETE_PROPERTY, 0},
682     {B_DIRECT_SPECIFIER, 0}, "Deletes the database file and all backup copies "
683     "of that file too.  Really only of use for uninstallers.",
684     PN_DATABASE_FILE, {}, {}, {}},
685   {g_PropertyNames[PN_DATABASE_FILE], {B_COUNT_PROPERTIES, 0},
686     {B_DIRECT_SPECIFIER, 0}, "Returns the number of words in the database.",
687     PN_DATABASE_FILE, {}, {}, {}},
688   {g_PropertyNames[PN_SPAM], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
689     "Adds the spam in the given file (specify full pathname to be safe) to "
690     "the database.  The words in the files will be added to the list of words "
691     "in the database that identify spam messages.  The files processed will "
692     "also have the attribute MAIL:classification added with a value of "
693     "\"Spam\" or \"Genuine\" as specified.  They also have their spam ratio "
694     "attribute updated, as if you had also used the Evaluate command on "
695     "them.  If they already have the MAIL:classification "
696     "attribute and it matches the new classification then they won't get "
697     "processed (and if it is different, they will get removed from the "
698     "statistics for the old class and added to the statistics for the new "
699     "one).  You can turn off that behaviour with the "
700     "IgnorePreviousClassification property.  The command line version lets "
701     "you specify more than one pathname.", PN_SPAM, {}, {}, {}},
702   {g_PropertyNames[PN_SPAM], {B_COUNT_PROPERTIES, 0}, {B_DIRECT_SPECIFIER, 0},
703     "Returns the number of spam messages in the database.", PN_SPAM,
704     {}, {}, {}},
705   {g_PropertyNames[PN_SPAM_STRING], {B_SET_PROPERTY, 0},
706     {B_DIRECT_SPECIFIER, 0}, "Adds the spam in the given string (assumed to "
707     "be the text of a whole e-mail message, not just a file name) to the "
708     "database.", PN_SPAM_STRING, {}, {}, {}},
709   {g_PropertyNames[PN_GENUINE], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
710     "Similar to adding spam except that the message file is added to the "
711     "genuine statistics.", PN_GENUINE, {}, {}, {}},
712   {g_PropertyNames[PN_GENUINE], {B_COUNT_PROPERTIES, 0},
713     {B_DIRECT_SPECIFIER, 0}, "Returns the number of genuine messages in the "
714     "database.", PN_GENUINE, {}, {}, {}},
715   {g_PropertyNames[PN_GENUINE_STRING], {B_SET_PROPERTY, 0},
716     {B_DIRECT_SPECIFIER, 0}, "Adds the genuine message in the given string "
717     "(assumed to be the text of a whole e-mail message, not just a file name) "
718     "to the database.", PN_GENUINE_STRING, {}, {}, {}},
719   {g_PropertyNames[PN_UNCERTAIN], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
720     "Similar to adding spam except that the message file is removed from the "
721     "database, undoing the previous classification.  Obviously, it needs to "
722     "have been classified previously (using the file attributes) so it can "
723     "tell if it is removing spam or genuine words.", PN_UNCERTAIN, {}, {}, {}},
724   {g_PropertyNames[PN_IGNORE_PREVIOUS_CLASSIFICATION], {B_SET_PROPERTY, 0},
725     {B_DIRECT_SPECIFIER, 0}, "If set to true then the previous classification "
726     "(which was saved as an attribute of the e-mail message file) will be "
727     "ignored, so that you can add the message to the database again.  If set "
728     "to false (the normal case), the attribute will be examined, and if the "
729     "message has already been classified as what you claim it is, nothing "
730     "will be done.  If it was misclassified, then the message will be removed "
731     "from the statistics for the old class and added to the stats for the "
732     "new classification you have requested.",
733     PN_IGNORE_PREVIOUS_CLASSIFICATION, {}, {}, {}},
734   {g_PropertyNames[PN_IGNORE_PREVIOUS_CLASSIFICATION], {B_GET_PROPERTY, 0},
735     {B_DIRECT_SPECIFIER, 0}, "Find out the current setting of the flag for "
736     "ignoring the previously recorded classification.",
737     PN_IGNORE_PREVIOUS_CLASSIFICATION, {}, {}, {}},
738   {g_PropertyNames[PN_SERVER_MODE], {B_SET_PROPERTY, 0},
739     {B_DIRECT_SPECIFIER, 0}, "If set to true then error messages get printed "
740     "to the standard error stream rather than showing up in an alert box.  "
741     "It also starts up with the window minimized.", PN_SERVER_MODE,
742     {}, {}, {}},
743   {g_PropertyNames[PN_SERVER_MODE], {B_GET_PROPERTY, 0},
744     {B_DIRECT_SPECIFIER, 0}, "Find out the setting of the server mode flag.",
745     PN_SERVER_MODE, {}, {}, {}},
746   {g_PropertyNames[PN_FLUSH], {B_EXECUTE_PROPERTY, 0},
747     {B_DIRECT_SPECIFIER, 0}, "Writes out the database file to disk, if it has "
748     "been updated in memory but hasn't been saved to disk.  It will "
749     "automatically get written when the program exits, so this command is "
750     "mostly useful for server mode.", PN_FLUSH, {}, {}, {}},
751   {g_PropertyNames[PN_PURGE_AGE], {B_SET_PROPERTY, 0},
752     {B_DIRECT_SPECIFIER, 0}, "Sets the old age limit.  Words which haven't "
753       "been updated since this many message additions to the database may be "
754       "deleted when you do a purge.  A good value is 1000, meaning that if a "
755       "word hasn't appeared in the last 1000 spam/genuine messages, it will "
756       "be forgotten.  Zero will purge all words, 1 will purge words not in "
757       "the last message added to the database, 2 will purge words not in the "
758       "last two messages added, and so on.  This is mostly useful for "
759       "removing those one time words which are often hunks of binary garbage, "
760       "not real words.  This acts in combination with the popularity limit; "
761       "both conditions have to be valid before the word gets deleted.",
762       PN_PURGE_AGE, {}, {}, {}},
763   {g_PropertyNames[PN_PURGE_AGE], {B_GET_PROPERTY, 0},
764     {B_DIRECT_SPECIFIER, 0}, "Gets the old age limit.", PN_PURGE_AGE,
765     {}, {}, {}},
766   {g_PropertyNames[PN_PURGE_POPULARITY], {B_SET_PROPERTY, 0},
767     {B_DIRECT_SPECIFIER, 0}, "Sets the popularity limit.  Words which aren't "
768     "this popular may be deleted when you do a purge.  A good value is 5, "
769     "which means that the word is safe from purging if it has been seen in 6 "
770     "or more e-mail messages.  If it's only in 5 or less, then it may get "
771     "purged.  The extreme is zero, where only words that haven't been seen "
772     "in any message are deleted (usually means no words).  This acts in "
773     "combination with the old age limit; both conditions have to be valid "
774     "before the word gets deleted.", PN_PURGE_POPULARITY, {}, {}, {}},
775   {g_PropertyNames[PN_PURGE_POPULARITY], {B_GET_PROPERTY, 0},
776     {B_DIRECT_SPECIFIER, 0}, "Gets the purge popularity limit.",
777     PN_PURGE_POPULARITY, {}, {}, {}},
778   {g_PropertyNames[PN_PURGE], {B_EXECUTE_PROPERTY, 0},
779     {B_DIRECT_SPECIFIER, 0}, "Purges the old obsolete words from the "
780     "database, if they are old enough according to the age limit and also "
781     "unpopular enough according to the popularity limit.", PN_PURGE,
782     {}, {}, {}},
783   {g_PropertyNames[PN_OLDEST], {B_GET_PROPERTY, 0},
784     {B_DIRECT_SPECIFIER, 0}, "Gets the age of the oldest message in the "
785     "database.  It's relative to the beginning of time, so you need to do "
786     "(total messages - age - 1) to see how many messages ago it was added.",
787     PN_OLDEST, {}, {}, {}},
788   {g_PropertyNames[PN_EVALUATE], {B_SET_PROPERTY, 0},
789     {B_DIRECT_SPECIFIER, 0}, "Evaluates a given file (by path name) to see "
790     "if it is spam or not.  Returns the ratio of spam probability vs genuine "
791     "probability, 0.0 meaning completely genuine, 1.0 for completely spam.  "
792     "Normally you should safely be able to consider it as spam if it is over "
793     "0.56 for the Robinson scoring method.  For the ChiSquared method, the "
794     "numbers are near 0 for genuine, near 1 for spam, and anywhere in the "
795     "middle means it can't decide.  The program attaches a MAIL:ratio_spam "
796     "attribute with the ratio as its "
797     "float32 value to the file.  Also returns the top few interesting words "
798     "in \"words\" and the associated per-word probability ratios in "
799     "\"ratios\".", PN_EVALUATE, {}, {}, {}},
800   {g_PropertyNames[PN_EVALUATE_STRING], {B_SET_PROPERTY, 0},
801     {B_DIRECT_SPECIFIER, 0}, "Like Evaluate, but rather than a file name, "
802     "the string argument contains the entire text of the message to be "
803     "evaluated.", PN_EVALUATE_STRING, {}, {}, {}},
804   {g_PropertyNames[PN_RESET_TO_DEFAULTS], {B_EXECUTE_PROPERTY, 0},
805     {B_DIRECT_SPECIFIER, 0}, "Resets all the configuration options to the "
806     "default values, including the database name.", PN_RESET_TO_DEFAULTS,
807     {}, {}, {}},
808   {g_PropertyNames[PN_INSTALL_THINGS], {B_EXECUTE_PROPERTY, 0},
809     {B_DIRECT_SPECIFIER, 0}, "Creates indices for the MAIL:classification and "
810     "MAIL:ratio_spam attributes on all volumes which support BeOS queries, "
811     "identifies them to the system as e-mail related attributes (modifies "
812     "the text/x-email MIME type), and sets up the new MIME type "
813     "(text/x-vnd.agmsmith.spam_probability_database) for the database file.  "
814     "Also registers names for the sound effects used by the separate filter "
815     "program (use the installsound BeOS program or the Sounds preferences "
816     "program to associate sound files with the names).", PN_INSTALL_THINGS,
817     {}, {}, {}},
818   {g_PropertyNames[PN_TOKENIZE_MODE], {B_SET_PROPERTY, 0},
819     {B_DIRECT_SPECIFIER, 0}, "Sets the method used for breaking up the "
820     "message into words.  Use \"Whole\" for the whole file (also use it for "
821     "non-email files).  The file isn't broken into parts; the whole thing is "
822     "converted into words, headers and attachments are just more raw data.  "
823     "Well, not quite raw data since it converts quoted-printable codes "
824     "(equals sign followed by hex digits or end of line) to the equivalent "
825     "single characters.  \"PlainText\" breaks the file into MIME components "
826     "and only looks at the ones which are of MIME type text/plain.  "
827     "\"AnyText\" will look for words in all text/* things, including "
828     "text/html attachments.  \"AllParts\" will decode all message components "
829     "and look for words in them, including binary attachments.  "
830     "\"JustHeader\" will only look for words in the message header.  "
831     "\"AllPartsAndHeader\", \"PlainTextAndHeader\" and \"AnyTextAndHeader\" "
832     "will also include the words from the message headers.", PN_TOKENIZE_MODE,
833     {}, {}, {}},
834   {g_PropertyNames[PN_TOKENIZE_MODE], {B_GET_PROPERTY, 0},
835     {B_DIRECT_SPECIFIER, 0}, "Gets the method used for breaking up the "
836     "message into words.", PN_TOKENIZE_MODE, {}, {}, {}},
837   {g_PropertyNames[PN_SCORING_MODE], {B_SET_PROPERTY, 0},
838     {B_DIRECT_SPECIFIER, 0}, "Sets the method used for combining the "
839     "probabilities of individual words into an overall score.  "
840     "\"Robinson\" mode will use Gary Robinson's nth root of the product "
841     "method.  It gives a nice range of values between 0 and 1 so you can "
842     "see shades of spaminess.  The cutoff point between spam and genuine "
843     "varies depending on your database of words (0.56 was one point in "
844     "some experiments).  \"ChiSquared\" mode will use chi-squared "
845     "statistics to evaluate the difference in probabilities that the lists "
846     "of word ratios are random.  The result is very close to 0 for genuine "
847     "and very close to 1 for spam, and near the middle if it is uncertain.",
848     PN_SCORING_MODE, {}, {}, {}},
849   {g_PropertyNames[PN_SCORING_MODE], {B_GET_PROPERTY, 0},
850     {B_DIRECT_SPECIFIER, 0}, "Gets the method used for combining the "
851     "individual word ratios into an overall score.", PN_SCORING_MODE,
852     {}, {}, {}},
853   {0, {0}, {0}, 0, 0, {}, {}, {}} /* End of list of property commands. */
854 };
855 
856 
/* Ways of combining the individual word probabilities into one overall score,
both as enum values and as text.  See the help text of PN_SCORING_MODE for what
each mode does.  The name table is sized by SM_MAX and lists the names in the
same order as the enum, so keep the two in step when adding a mode. */

typedef enum ScoringModeEnum
{
  SM_ROBINSON = 0, /* Gary Robinson's nth root of the product method. */
  SM_CHISQUARED,   /* Chi-squared statistics on the lists of word ratios. */
  SM_MAX           /* Count of the modes above, not a mode itself. */
} ScoringModes;

static const char * g_ScoringModeNames [SM_MAX] =
{
  "Robinson",  /* SM_ROBINSON */
  "ChiSquared" /* SM_CHISQUARED */
};
871 
872 
/* The various tokenizing modes (ways of breaking a message up into words) as
text and enums.  See PN_TOKENIZE_MODE.  The name table is sized by TM_MAX and
lists the names in the same order as the enum.

The text names have to be exactly the ones which the help text of the
TokenizeMode property tells the user to use ("Whole", "PlainText", "AnyText",
"AllParts", "JustHeader" and the "...AndHeader" variations), single words
without spaces.  If you rename a mode here, update that help text too. */

typedef enum TokenizeModeEnum
{
  TM_WHOLE = 0,         /* Whole file treated as words, no MIME splitting. */
  TM_PLAIN_TEXT,        /* Only the text/plain MIME components. */
  TM_PLAIN_TEXT_HEADER, /* The text/plain components plus message header. */
  TM_ANY_TEXT,          /* All text/ components, including text/html. */
  TM_ANY_TEXT_HEADER,   /* All text/ components plus message header. */
  TM_ALL_PARTS,         /* All components, even binary attachments. */
  TM_ALL_PARTS_HEADER,  /* All components plus message header. */
  TM_JUST_HEADER,       /* Only the words in the message header. */
  TM_MAX                /* Count of the modes above, not a mode itself. */
} TokenizeModes;

static const char * g_TokenizeModeNames [TM_MAX] =
{
  "Whole",              /* TM_WHOLE */
  "PlainText",          /* TM_PLAIN_TEXT */
  "PlainTextAndHeader", /* TM_PLAIN_TEXT_HEADER */
  "AnyText",            /* TM_ANY_TEXT */
  "AnyTextAndHeader",   /* TM_ANY_TEXT_HEADER */
  "AllParts",           /* TM_ALL_PARTS */
  "AllPartsAndHeader",  /* TM_ALL_PARTS_HEADER */
  "JustHeader"          /* TM_JUST_HEADER */
};
899 
900 
901 /* Possible message classifications. */
902 
903 typedef enum ClassificationTypesEnum
904 {
905   CL_GENUINE = 0,
906   CL_SPAM,
907   CL_UNCERTAIN,
908   CL_MAX
909 } ClassificationTypes;
910 
911 static const char * g_ClassificationTypeNames [CL_MAX] =
912 {
913   g_ClassifiedGenuine,
914   g_ClassifiedSpam,
915   "Uncertain"
916 };
917 
918 
919 /* Some polygon graphics for the scroll arrows. */
920 
921 static BPoint g_UpLinePoints [] =
922 {
923   BPoint (8, 2 * (1)),
924   BPoint (14, 2 * (6)),
925   BPoint (10, 2 * (6)),
926   BPoint (10, 2 * (13)),
927   BPoint (6, 2 * (13)),
928   BPoint (6, 2 * (6)),
929   BPoint (2, 2 * (6))
930 };
931 
932 static BPoint g_DownLinePoints [] =
933 {
934   BPoint (8, 2 * (14-1)),
935   BPoint (14, 2 * (14-6)),
936   BPoint (10, 2 * (14-6)),
937   BPoint (10, 2 * (14-13)),
938   BPoint (6, 2 * (14-13)),
939   BPoint (6, 2 * (14-6)),
940   BPoint (2, 2 * (14-6))
941 };
942 
943 static BPoint g_UpPagePoints [] =
944 {
945   BPoint (8, 2 * (1)),
946   BPoint (13, 2 * (6)),
947   BPoint (10, 2 * (6)),
948   BPoint (14, 2 * (10)),
949   BPoint (10, 2 * (10)),
950   BPoint (10, 2 * (13)),
951   BPoint (6, 2 * (13)),
952   BPoint (6, 2 * (10)),
953   BPoint (2, 2 * (10)),
954   BPoint (6, 2 * (6)),
955   BPoint (3, 2 * (6))
956 };
957 
958 static BPoint g_DownPagePoints [] =
959 {
960   BPoint (8, 2 * (14-1)),
961   BPoint (13, 2 * (14-6)),
962   BPoint (10, 2 * (14-6)),
963   BPoint (14, 2 * (14-10)),
964   BPoint (10, 2 * (14-10)),
965   BPoint (10, 2 * (14-13)),
966   BPoint (6, 2 * (14-13)),
967   BPoint (6, 2 * (14-10)),
968   BPoint (2, 2 * (14-10)),
969   BPoint (6, 2 * (14-6)),
970   BPoint (3, 2 * (14-6))
971 };
972 
973 
/* Lookup table of flags saying which characters count as spaces: character
code X is space-like when g_SpaceCharacters[X] is true.  Only codes 0 to 127
are covered; codes 128 and above are always non-space since they are UTF-8
characters.  Everything starts out false, the real values are filled in by the
ABSApp constructor. */

static bool g_SpaceCharacters [128] = {false};
980 
981 
982 
983 /******************************************************************************
984  * Each word in the spam database gets one of these structures.  The database
985  * has a string (the word) as the key and this structure as the value
986  * (statistics for that word).
987  */
988 
989 typedef struct StatisticsStruct
990 {
991   uint32 age;
992     /* Sequence number for the time when this word was last updated in the
993     database, so that we can remove old words (haven't been seen in recent
994     spam).  It's zero for the first file ever added (spam or genuine) to the
995     database, 1 for all words added or updated by the second file, etc.  If a
996     later file updates an existing word, it gets the age of the later file. */
997 
998   uint32 genuineCount;
999     /* Number of genuine messages that have this word. */
1000 
1001   uint32 spamCount;
1002     /* A count of the number of spam e-mail messages which contain the word. */
1003 
1004 } StatisticsRecord, *StatisticsPointer;
1005 
1006 typedef map<string, StatisticsRecord> StatisticsMap;
1007   /* Define this type which will be used for our main data storage facility, so
1008   we can more conveniently specify things that are derived from it, like
1009   iterators. */
1010 
1011 
1012 
1013 /******************************************************************************
1014  * An alert box asking how the user wants to mark messages.  There are buttons
1015  * for each classification category, and a checkbox to mark all remaining N
1016  * messages the same way.  And a cancel button.  To use it, first create the
1017  * ClassificationChoicesWindow, specifying the input arguments.  Then call the
1018  * Go method which will show the window, stuff the user's answer into your
1019  * output arguments (class set to CL_MAX if the user cancels), and destroy the
1020  * window.  Implemented because BAlert only allows 3 buttons, max!
1021  */
1022 
1023 class ClassificationChoicesWindow : public BWindow
1024 {
1025 public:
1026   /* Constructor and destructor. */
1027   ClassificationChoicesWindow (BRect FrameRect,
1028     const char *FileName, int NumberOfFiles);
1029 
1030   /* BeOS virtual functions. */
1031   virtual void MessageReceived (BMessage *MessagePntr);
1032 
1033   /* Our methods. */
1034   void Go (bool *BulkModeSelectedPntr,
1035     ClassificationTypes *ChoosenClassificationPntr);
1036 
1037   /* Various message codes for various buttons etc. */
1038   static const uint32 MSG_CLASS_BUTTONS = 'ClB0';
1039   static const uint32 MSG_CANCEL_BUTTON = 'Cncl';
1040   static const uint32 MSG_BULK_CHECKBOX = 'BlkK';
1041 
1042 private:
1043   /* Member variables. */
1044   bool *m_BulkModeSelectedPntr;
1045   ClassificationTypes *m_ChoosenClassificationPntr;
1046 };
1047 
1048 class ClassificationChoicesView : public BView
1049 {
1050 public:
1051   /* Constructor and destructor. */
1052   ClassificationChoicesView (BRect FrameRect,
1053     const char *FileName, int NumberOfFiles);
1054 
1055   /* BeOS virtual functions. */
1056   virtual void AttachedToWindow ();
1057   virtual void GetPreferredSize (float *width, float *height);
1058 
1059 private:
1060   /* Member variables. */
1061   const char *m_FileName;
1062   int         m_NumberOfFiles;
1063   float       m_PreferredBottomY;
1064 };
1065 
1066 
1067 
1068 /******************************************************************************
1069  * Due to deadlock problems with the BApplication posting scripting messages to
1070  * itself, we need to add a second Looper.  Its job is to just to convert
1071  * command line arguments and arguments from the Tracker (refs received) into a
1072  * series of scripting commands sent to the main BApplication.  It also prints
1073  * out the replies received (to stdout for command line replies).  An instance
1074  * of this class will be created and run by the main() function, and shut down
1075  * by it too.
1076  */
1077 
1078 class CommanderLooper : public BLooper
1079 {
1080 public:
1081   CommanderLooper ();
1082   ~CommanderLooper ();
1083   virtual void MessageReceived (BMessage *MessagePntr);
1084 
1085   void CommandArguments (int argc, char **argv);
1086   void CommandReferences (BMessage *MessagePntr,
1087     bool BulkMode = false,
1088     ClassificationTypes BulkClassification = CL_GENUINE);
1089   bool IsBusy ();
1090 
1091 private:
1092   void ProcessArgs (BMessage *MessagePntr);
1093   void ProcessRefs (BMessage *MessagePntr);
1094 
1095   static const uint32 MSG_COMMAND_ARGUMENTS = 'CArg';
1096   static const uint32 MSG_COMMAND_FILE_REFS = 'CRef';
1097 
1098   bool m_IsBusy;
1099 };
1100 
1101 
1102 
1103 /******************************************************************************
1104  * This view contains the various buttons and other controls for setting
1105  * configuration options and displaying the state of the database (but not the
1106  * actual list of words).  It will appear in the top half of the
1107  * DatabaseWindow.
1108  */
1109 
1110 class ControlsView : public BView
1111 {
1112 public:
1113   /* Constructor and destructor. */
1114   ControlsView (BRect NewBounds);
1115   ~ControlsView ();
1116 
1117   /* BeOS virtual functions. */
1118   virtual void AttachedToWindow ();
1119   virtual void FrameResized (float Width, float Height);
1120   virtual void MessageReceived (BMessage *MessagePntr);
1121   virtual void Pulse ();
1122 
1123 private:
1124   /* Various message codes for various buttons etc. */
1125   static const uint32 MSG_BROWSE_BUTTON = 'Brws';
1126   static const uint32 MSG_DATABASE_NAME = 'DbNm';
1127   static const uint32 MSG_ESTIMATE_BUTTON = 'Estm';
1128   static const uint32 MSG_ESTIMATE_FILE_REFS = 'ERef';
1129   static const uint32 MSG_IGNORE_CLASSIFICATION = 'IPCl';
1130   static const uint32 MSG_PURGE_AGE = 'PuAg';
1131   static const uint32 MSG_PURGE_BUTTON = 'Purg';
1132   static const uint32 MSG_PURGE_POPULARITY = 'PuPo';
1133   static const uint32 MSG_SERVER_MODE = 'SrvM';
1134 
1135   /* Our member functions. */
1136   void BrowseForDatabaseFile ();
1137   void BrowseForFileToEstimate ();
1138   void PollServerForChanges ();
1139 
1140   /* Member variables. */
1141   BButton        *m_AboutButtonPntr;
1142   BButton        *m_AddExampleButtonPntr;
1143   BButton        *m_BrowseButtonPntr;
1144   BFilePanel     *m_BrowseFilePanelPntr;
1145   BButton        *m_CreateDatabaseButtonPntr;
1146   char            m_DatabaseFileNameCachedValue [PATH_MAX];
1147   BTextControl   *m_DatabaseFileNameTextboxPntr;
1148   bool            m_DatabaseLoadDone;
1149   BButton        *m_EstimateSpamButtonPntr;
1150   BFilePanel     *m_EstimateSpamFilePanelPntr;
1151   uint32          m_GenuineCountCachedValue;
1152   BTextControl   *m_GenuineCountTextboxPntr;
1153   bool            m_IgnorePreviousClassCachedValue;
1154   BCheckBox      *m_IgnorePreviousClassCheckboxPntr;
1155   BButton        *m_InstallThingsButtonPntr;
1156   uint32          m_PurgeAgeCachedValue;
1157   BTextControl   *m_PurgeAgeTextboxPntr;
1158   BButton        *m_PurgeButtonPntr;
1159   uint32          m_PurgePopularityCachedValue;
1160   BTextControl   *m_PurgePopularityTextboxPntr;
1161   BButton        *m_ResetToDefaultsButtonPntr;
1162   ScoringModes    m_ScoringModeCachedValue;
1163   BMenuBar       *m_ScoringModeMenuBarPntr;
1164   BPopUpMenu     *m_ScoringModePopUpMenuPntr;
1165   bool            m_ServerModeCachedValue;
1166   BCheckBox      *m_ServerModeCheckboxPntr;
1167   uint32          m_SpamCountCachedValue;
1168   BTextControl   *m_SpamCountTextboxPntr;
1169   bigtime_t       m_TimeOfLastPoll;
1170   TokenizeModes   m_TokenizeModeCachedValue;
1171   BMenuBar       *m_TokenizeModeMenuBarPntr;
1172   BPopUpMenu     *m_TokenizeModePopUpMenuPntr;
1173   uint32          m_WordCountCachedValue;
1174   BTextControl   *m_WordCountTextboxPntr;
1175 };
1176 
1177 
1178 /* Various message codes for various buttons etc. */
1179 static const uint32 MSG_LINE_DOWN = 'LnDn';
1180 static const uint32 MSG_LINE_UP = 'LnUp';
1181 static const uint32 MSG_PAGE_DOWN = 'PgDn';
1182 static const uint32 MSG_PAGE_UP = 'PgUp';
1183 
1184 /******************************************************************************
1185  * This view contains the list of words.  It displays as many as can fit in the
1186  * view rectangle, starting at a specified word (so it can simulate scrolling).
1187  * Usually it will appear in the bottom half of the DatabaseWindow.
1188  */
1189 
1190 class WordsView : public BView
1191 {
1192 public:
1193   /* Constructor and destructor. */
1194   WordsView (BRect NewBounds);
1195 
1196   /* BeOS virtual functions. */
1197   virtual void AttachedToWindow ();
1198   virtual void Draw (BRect UpdateRect);
1199   virtual void KeyDown (const char *BufferPntr, int32 NumBytes);
1200   virtual void MakeFocus (bool Focused);
1201   virtual void MessageReceived (BMessage *MessagePntr);
1202   virtual void MouseDown (BPoint point);
1203   virtual void Pulse ();
1204 
1205 private:
1206   /* Our member functions. */
1207   void MoveTextUpOrDown (uint32 MovementType);
1208   void RefsDroppedHere (BMessage *MessagePntr);
1209 
1210   /* Member variables. */
1211   BPictureButton *m_ArrowLineDownPntr;
1212   BPictureButton *m_ArrowLineUpPntr;
1213   BPictureButton *m_ArrowPageDownPntr;
1214   BPictureButton *m_ArrowPageUpPntr;
1215     /* Various buttons for controlling scrolling, since we can't use a scroll
1216     bar.  To make them less obvious, their background view colour needs to be
1217     changed whenever the main view's colour changes. */
1218 
1219   float m_AscentHeight;
1220     /* The ascent height for the font used to draw words.  Height from the top
1221     of the highest letter to the base line (which is near the middle bottom of
1222     the letters, the line where you would align your writing of the text by
1223     hand, all letters have part above, some also have descenders below this
1224     line). */
1225 
1226   rgb_color m_BackgroundColour;
1227     /* The current background colour.  Changes when the focus changes. */
1228 
1229   uint32 m_CachedTotalGenuineMessages;
1230   uint32 m_CachedTotalSpamMessages;
1231   uint32 m_CachedWordCount;
1232     /* These are cached copies of the similar values in the BApplication.  They
1233     reflect what's currently displayed.  If they are different than the values
1234     from the BApplication then the polling loop will try to redraw the display.
1235     They get set to the values actually used during drawing when drawing is
1236     successful. */
1237 
1238   char m_FirstDisplayedWord [g_MaxWordLength + 1];
1239     /* The scrolling display starts at this word.  Since we can't use index
1240     numbers (word[12345] for example), we use the word itself.  The scroll
1241     buttons set this to the next or previous word in the database.  Typing by
1242     the user when the view has the focus will also change this starting word.
1243     */
1244 
1245   rgb_color m_FocusedColour;
1246     /* The colour to use for focused mode (typing by the user is received by
1247     our view). */
1248 
1249   bigtime_t m_LastTimeAKeyWasPressed;
1250     /* Records the time when a key was last pressed.  Used for determining when
1251     the user has stopped typing a batch of letters. */
1252 
1253   float m_LineHeight;
1254     /* Height of a line of text in the font used for the word display.
1255     Includes the height of the letters plus a bit of extra space for between
1256     the lines (called leading). */
1257 
1258   BFont m_TextFont;
1259     /* The font used to draw the text in the window. */
1260 
1261   float m_TextHeight;
1262     /* Maximum total height of the letters in the text, includes the part above
1263     the baseline and the part below.  Doesn't include the sliver of space
1264     between lines. */
1265 
1266   rgb_color m_UnfocusedColour;
1267     /* The colour to use for unfocused mode, when user typing isn't active. */
1268 };
1269 
1270 
1271 
1272 /******************************************************************************
1273  * The BWindow class for this program.  It displays the database in real time,
1274  * and has various buttons and gadgets in the top half for changing settings
1275  * (live changes, no OK button, and they reflect changes done by other programs
1276  * using the server too).  The bottom half is a scrolling view listing all the
1277  * words in the database.  A simple graphic blotch behind each word shows
1278  * whether the word is strongly or weakly related to spam or genuine messages.
1279  * Most operations go through the scripting message system, but it also peeks
1280  * at the BApplication data for examining simple things and when redrawing the
1281  * list of words.
1282  */
1283 
1284 class DatabaseWindow : public BWindow
1285 {
1286 public:
1287   /* Constructor and destructor. */
1288   DatabaseWindow ();
1289 
1290   /* BeOS virtual functions. */
1291   virtual void MessageReceived (BMessage *MessagePntr);
1292   virtual bool QuitRequested ();
1293 
1294 private:
1295   /* Member variables. */
1296   ControlsView *m_ControlsViewPntr;
1297   WordsView    *m_WordsViewPntr;
1298 };
1299 
1300 
1301 
1302 /******************************************************************************
1303  * ABSApp is the BApplication class for this program.  This handles messages
1304  * from the outside world (requests to load a database, or to add files to the
1305  * collection).  It responds to command line arguments (if you start up the
1306  * program a second time, the system will just send the arguments to the
1307  * existing running program).  It responds to scripting messages.  And it
1308  * responds to messages from the window.  Its thread does the main work of
1309  * updating the database and reading / writing files.
1310  */
1311 
1312 class ABSApp : public BApplication
1313 {
1314 public:
1315   /* Constructor and destructor. */
1316   ABSApp ();
1317   ~ABSApp ();
1318 
1319   /* BeOS virtual functions. */
1320   virtual void AboutRequested ();
1321   virtual void ArgvReceived (int32 argc, char **argv);
1322   virtual status_t GetSupportedSuites (BMessage *MessagePntr);
1323   virtual void MessageReceived (BMessage *MessagePntr);
1324   virtual void Pulse ();
1325   virtual bool QuitRequested ();
1326   virtual void ReadyToRun ();
1327   virtual void RefsReceived (BMessage *MessagePntr);
1328   virtual BHandler *ResolveSpecifier (BMessage *MessagePntr, int32 Index,
1329     BMessage *SpecifierMsgPntr, int32 SpecificationKind, const char *Property);
1330 
1331 private:
1332   /* Our member functions. */
1333   status_t AddFileToDatabase (ClassificationTypes IsSpamOrWhat,
1334     const char *FileName, char *ErrorMessage);
1335   status_t AddPositionIOToDatabase (ClassificationTypes IsSpamOrWhat,
1336     BPositionIO *MessageIOPntr, const char *OptionalFileName,
1337     char *ErrorMessage);
1338   status_t AddStringToDatabase (ClassificationTypes IsSpamOrWhat,
1339     const char *String, char *ErrorMessage);
1340   void AddWordsToSet (const char *InputString, size_t NumberOfBytes,
1341     char PrefixCharacter, set<string> &WordSet);
1342   status_t CreateDatabaseFile (char *ErrorMessage);
1343   void DefaultSettings ();
1344   status_t DeleteDatabaseFile (char *ErrorMessage);
1345   status_t EvaluateFile (const char *PathName, BMessage *ReplyMessagePntr,
1346     char *ErrorMessage);
1347   status_t EvaluatePositionIO (BPositionIO *PositionIOPntr,
1348     const char *OptionalFileName, BMessage *ReplyMessagePntr,
1349     char *ErrorMessage);
1350   status_t EvaluateString (const char *BufferPntr, ssize_t BufferSize,
1351     BMessage *ReplyMessagePntr, char *ErrorMessage);
1352   status_t GetWordsFromPositionIO (BPositionIO *PositionIOPntr,
1353     const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
1354   status_t InstallThings (char *ErrorMessage);
1355   status_t LoadDatabaseIfNeeded (char *ErrorMessage);
1356   status_t LoadSaveDatabase (bool DoLoad, char *ErrorMessage);
1357 public:
1358   status_t LoadSaveSettings (bool DoLoad);
1359 private:
1360   status_t MakeBackup (char *ErrorMessage);
1361   void MakeDatabaseEmpty ();
1362   void ProcessScriptingMessage (BMessage *MessagePntr,
1363     struct property_info *PropInfoPntr);
1364   status_t PurgeOldWords (char *ErrorMessage);
1365   status_t RecursivelyTokenizeMailComponent (
1366     BMailComponent *ComponentPntr, const char *OptionalFileName,
1367     set<string> &WordSet, char *ErrorMessage,
1368     int RecursionLevel, int MaxRecursionLevel);
1369   status_t SaveDatabaseIfNeeded (char *ErrorMessage);
1370   status_t TokenizeParts (BPositionIO *PositionIOPntr,
1371     const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
1372   status_t TokenizeWhole (BPositionIO *PositionIOPntr,
1373     const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
1374 
1375 public:
1376   /* Member variables.  Many are read by the window thread to see if it needs
1377   updating, and to draw the words.  However, the other threads will lock the
1378   BApplication or using scripting commands if they want to make changes. */
1379 
1380   bool m_DatabaseHasChanged;
1381     /* Set to TRUE when the in-memory database (stored in m_WordMap) has
1382     changed and is different from the on-disk database file.  When the
1383     application exits, the database will be written out if it has changed. */
1384 
1385   BString m_DatabaseFileName;
1386     /* The absolute path name to use for the database file on disk. */
1387 
1388   bool m_IgnorePreviousClassification;
1389     /* If TRUE then the previous classification of a message (stored in an
1390     attribute on the message file) will be ignored, and the message will be
1391     added to the requested spam/genuine list.  If this is FALSE then the spam
1392     won't be added to the list if it has already been classified as specified,
1393     but if it was mis-classified, it will be removed from the old list and
1394     added to the new list. */
1395 
1396   uint32 m_OldestAge;
1397     /* The age of the oldest word.  This will be the smallest age number in the
1398     database.  Mostly useful for scaling graphics representing age in the word
1399     display.  If the oldest word is no longer the oldest, this variable won't
1400     get immediately updated since it would take a lot of effort to find the
1401     next older age.  Since it's only used for display, we'll let it be slightly
1402     incorrect.  The next database load or purge will fix it. */
1403 
1404   uint32 m_PurgeAge;
1405     /* When purging old words, they have to be at least this old to be eligible
1406     for deletion.  Age is measured as the number of e-mails added to the
1407     database since the word was last updated in the database.  Zero means all
1408     words are old. */
1409 
1410   uint32 m_PurgePopularity;
1411     /* When purging old words, they have to be less than or equal to this
1412     popularity limit to be eligible for deletion.  Popularity is measured as
1413     the number of messages (spam and genuine) which have the word.  Zero means
1414     no words. */
1415 
1416   ScoringModes m_ScoringMode;
1417     /* Controls how to combine the word probabilities into an overall score.
1418     See the PN_SCORING_MODE comments for details. */
1419 
1420   BPath m_SettingsDirectoryPath;
1421     /* The constructor initialises this to the settings directory path.  It
1422     never changes after that. */
1423 
1424   bool m_SettingsHaveChanged;
1425     /* Set to TRUE when the settings are changed (different than the ones which
1426     were loaded).  When the application exits, the settings will be written out
1427     if they have changed. */
1428 
1429   double m_SmallestUseableDouble;
1430     /* When multiplying fractional numbers together, avoid using numbers
1431     smaller than this because the double exponent range is close to being
1432     exhausted.  The IEEE STANDARD 754 floating-point arithmetic (used on the
1433     Intel i8087 and later math processors) has 64 bit numbers with 53 bits of
1434     mantissa, giving it an underflow starting at 0.5**1022 = 2.2e-308 where it
1435     rounds off to the nearest multiple of 0.5**1074 = 4.9e-324. */
1436 
1437   TokenizeModes m_TokenizeMode;
1438     /* Controls how to convert the raw message text into words.  See the
1439     PN_TOKENIZE_MODE comments for details. */
1440 
1441   uint32 m_TotalGenuineMessages;
1442     /* Number of genuine messages which are in the database. */
1443 
1444   uint32 m_TotalSpamMessages;
1445     /* Number of spam messages which are in the database. */
1446 
1447   uint32 m_WordCount;
1448     /* The number of words currently in the database.  Stored separately as a
1449     member variable to avoid having to call m_WordMap.size() all the time,
1450     which other threads can't do while the database is being updated (but they
1451     can look at the word count variable). */
1452 
1453   StatisticsMap m_WordMap;
1454     /* The in-memory data structure holding the set of words and their
1455     associated statistics.  When the database isn't in use, it is an empty
1456     collection.  You should lock the BApplication if you are using the word
1457     collection (reading or writing) from another thread. */
1458 };
1459 
1460 
1461 
1462 /******************************************************************************
1463  * Global utility function to display an error message and return.  The message
1464  * part describes the error, and if ErrorNumber is non-zero, gets the string
1465  * ", error code $X (standard description)." appended to it.  If the message
1466  * is NULL then it gets defaulted to "Something went wrong".  The title part
1467  * doesn't get displayed (no title bar in the dialog box, but you can see it in
1468  * the debugger as the window thread name), and defaults to "Error Message" if
1469  * you didn't specify one.  If running in command line mode, the error gets
1470  * printed to stderr rather than showing up in a dialog box.
1471  */
1472 
1473 static void
1474 DisplayErrorMessage (
1475   const char *MessageString = NULL,
1476   int ErrorNumber = 0,
1477   const char *TitleString = NULL)
1478 {
1479   BAlert *AlertPntr;
1480   char ErrorBuffer [PATH_MAX + 1500];
1481 
1482   if (TitleString == NULL)
1483     TitleString = "SpamDBM Error Message";
1484 
1485   if (MessageString == NULL)
1486   {
1487     if (ErrorNumber == 0)
1488       MessageString = "No error, no message, why bother?";
1489     else
1490       MessageString = "Something went wrong";
1491   }
1492 
1493   if (ErrorNumber != 0)
1494   {
1495     sprintf (ErrorBuffer, "%s, error code $%X/%d (%s) has occured.",
1496       MessageString, ErrorNumber, ErrorNumber, strerror (ErrorNumber));
1497     MessageString = ErrorBuffer;
1498   }
1499 
1500   if (g_CommandLineMode || g_ServerMode)
1501     cerr << TitleString << ": " << MessageString << endl;
1502   else
1503   {
1504     AlertPntr = new BAlert (TitleString, MessageString,
1505       "Acknowledge", NULL, NULL, B_WIDTH_AS_USUAL, B_STOP_ALERT);
1506     if (AlertPntr != NULL) {
1507       AlertPntr->SetFlags(AlertPntr->Flags() | B_CLOSE_ON_ESCAPE);
1508       AlertPntr->Go ();
1509     }
1510   }
1511 }
1512 
1513 
1514 
/******************************************************************************
 * Word wrap a long line of text into shorter lines of at most 79 columns and
 * print the result on the given output stream, with an endl after each line.
 * Lines are broken at white space, and the white space at the break is
 * dropped.  A run of text with no white space in it which is longer than a
 * line gets chopped into 79 character pieces.  Text which fits exactly into 79
 * columns is printed as one line.  NULL, empty or all white space text prints
 * nothing.
 */

static void
WrapTextToStream (ostream& OutputStream, const char *TextPntr)
{
  const size_t LineLength = 79;
  size_t       BreakIndex;
  size_t       ChunkLength;

  if (TextPntr == NULL)
    return;

  while (*TextPntr != 0)
  {
    /* Skip leading spaces.  The unsigned char casts stop isspace from being
    given a negative number when the text contains bytes with the high bit
    set, such as UTF-8 characters. */

    while (isspace ((unsigned char) *TextPntr))
      TextPntr++;
    if (*TextPntr == 0)
      break; /* It was all spaces, don't print any more. */

    /* Find out how much text is left, up to a maximum of one full line. */

    ChunkLength = 0;
    while (ChunkLength < LineLength && TextPntr [ChunkLength] != 0)
      ChunkLength++;

    if (TextPntr [ChunkLength] != 0)
    {
      /* There is more text after this line's worth.  Back up to the last
      white space, starting with the character just past the end of the line
      (if that one is white space then the whole line fits as is).  Index zero
      doesn't need testing, it is never white space since the leading spaces
      were skipped. */

      BreakIndex = ChunkLength;
      while (BreakIndex > 0 &&
      !isspace ((unsigned char) TextPntr [BreakIndex]))
        BreakIndex--;

      /* Remove more trailing spaces at the end of the line, in case there
      were several spaces in a row. */

      while (BreakIndex > 0 &&
      isspace ((unsigned char) TextPntr [BreakIndex - 1]))
        BreakIndex--;

      /* If no white space was found, don't wrap, just split off the full
      length chunk. */

      if (BreakIndex > 0)
        ChunkLength = BreakIndex;
    }

    /* Print the line of text and advance the text pointer too. */

    OutputStream.write (TextPntr, ChunkLength);
    OutputStream << endl;
    TextPntr += ChunkLength;
  }
}
1584 
1585 
1586 
1587 /******************************************************************************
1588  * Print the usage info to the stream.  Includes a list of all commands.
1589  */
1590 ostream& PrintUsage (ostream& OutputStream);
1591 
1592 ostream& PrintUsage (ostream& OutputStream)
1593 {
1594   struct property_info *PropInfoPntr;
1595 
1596   OutputStream << "\nSpamDBM - A Spam Database Manager\n";
1597   OutputStream << "Copyright © 2002 by Alexander G. M. Smith.  ";
1598   OutputStream << "Released to the public domain.\n\n";
1599   WrapTextToStream (OutputStream, "Compiled on " __DATE__ " at " __TIME__
1600 ".  $Id: spamdbm.cpp 30630 2009-05-05 01:31:01Z bga $  $HeadURL: http://svn.haiku-os.org/haiku/haiku/trunk/src/bin/mail_utils/spamdbm.cpp $");
1601   OutputStream << "\n"
1602 "This is a program for classifying e-mail messages as spam (junk mail which\n"
1603 "you don't want to read) and regular genuine messages.  It can learn what's\n"
1604 "spam and what's genuine.  You just give it a bunch of spam messages and a\n"
1605 "bunch of non-spam ones.  It uses them to make a list of the words from the\n"
1606 "messages with the probability that each word is from a spam message or from\n"
1607 "a genuine message.  Later on, it can use those probabilities to classify\n"
1608 "new messages as spam or not spam.  If the classifier stops working well\n"
1609 "(because the spammers have changed their writing style and vocabulary, or\n"
1610 "your regular correspondants are writing like spammers), you can use this\n"
1611 "program to update the list of words to identify the new messages\n"
1612 "correctly.\n"
1613 "\n"
1614 "The original idea was from Paul Graham's algorithm, which has an excellent\n"
1615 "writeup at: http://www.paulgraham.com/spam.html\n"
1616 "\n"
1617 "Gary Robinson came up with the improved algorithm, which you can read about at:\n"
1618 "http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html\n"
1619 "\n"
1620 "Then he, Tim Peters and the SpamBayes mailing list developed the Chi-Squared\n"
1621 "test, see http://mail.python.org/pipermail/spambayes/2002-October/001036.html\n"
1622 "for one of the earlier messages leading from the central limit theorem to\n"
1623 "the current chi-squared scoring method.\n"
1624 "\n"
1625 "Thanks go to Isaac Yonemoto for providing a better icon, which we can\n"
1626 "unfortunately no longer use, since the Hormel company wants people to\n"
1627 "avoid associating their meat product with junk e-mail.\n"
1628 "\n"
1629 "Tokenising code updated in 2005 to use some of the tricks that SpamBayes\n"
1630 "uses to extract words from messages.  In particular, HTML is now handled.\n"
1631 "\n"
1632 "Usage: Specify the operation as the first argument followed by more\n"
1633 "information as appropriate.  The program's configuration will affect the\n"
1634 "actual operation (things like the name of the database file to use, or\n"
1635 "whether it should allow non-email messages to be added).  In command line\n"
1636 "mode it will do the operation and exit.  In GUI/server mode a command line\n"
1637 "invocation will just send the command to the running server.  You can also\n"
1638 "use BeOS scripting (see the \"Hey\" command which you can get from\n"
1639 "http://www.bebits.com/app/2042 ) to control the Spam server.  And finally,\n"
1640 "there's also a GUI interface which shows up if you start it without any\n"
1641 "command line arguments.\n"
1642 "\n"
1643 "Commands:\n"
1644 "\n"
1645 "Quit\n"
1646 "Stop the program.  Useful if it's running as a server.\n"
1647 "\n";
1648 
1649   /* Go through all our scripting commands and add a description of each one to
1650   the usage text. */
1651 
1652   for (PropInfoPntr = g_ScriptingPropertyList + 0;
1653   PropInfoPntr->name != 0;
1654   PropInfoPntr++)
1655   {
1656     switch (PropInfoPntr->commands[0])
1657     {
1658       case B_GET_PROPERTY:
1659         OutputStream << "Get " << PropInfoPntr->name << endl;
1660         break;
1661 
1662       case B_SET_PROPERTY:
1663         OutputStream << "Set " << PropInfoPntr->name << " NewValue" << endl;
1664         break;
1665 
1666       case B_COUNT_PROPERTIES:
1667         OutputStream << "Count " << PropInfoPntr->name << endl;
1668         break;
1669 
1670       case B_CREATE_PROPERTY:
1671         OutputStream << "Create " << PropInfoPntr->name << endl;
1672         break;
1673 
1674       case B_DELETE_PROPERTY:
1675         OutputStream << "Delete " << PropInfoPntr->name << endl;
1676         break;
1677 
1678       case B_EXECUTE_PROPERTY:
1679         OutputStream << PropInfoPntr->name << endl;
1680         break;
1681 
1682       default:
1683         OutputStream << "Buggy Command: " << PropInfoPntr->name << endl;
1684         break;
1685     }
1686     WrapTextToStream (OutputStream, (char *)PropInfoPntr->usage);
1687     OutputStream << endl;
1688   }
1689 
1690   return OutputStream;
1691 }
1692 
1693 
1694 
1695 /******************************************************************************
1696  * A utility function to send a command to the application, will return after a
1697  * short delay if the application is busy (doesn't wait for it to be executed).
1698  * The reply from the application is also thrown away.  It used to be an
1699  * overloaded function, but the system couldn't distinguish between bool and
1700  * int, so now it has slightly different names depending on the arguments.
1701  */
1702 
1703 static void
1704 SubmitCommand (BMessage& CommandMessage)
1705 {
1706   status_t ErrorCode;
1707 
1708   ErrorCode = be_app_messenger.SendMessage (&CommandMessage,
1709     be_app_messenger /* reply messenger, throw away the reply */,
1710     1000000 /* delivery timeout */);
1711 
1712   if (ErrorCode != B_OK)
1713     cerr << "SubmitCommand failed to send a command, code " <<
1714     ErrorCode << " (" << strerror (ErrorCode) << ")." << endl;
1715 }
1716 
1717 
1718 static void
1719 SubmitCommandString (
1720   PropertyNumbers Property,
1721   uint32 CommandCode,
1722   const char *StringArgument = NULL)
1723 {
1724   BMessage CommandMessage (CommandCode);
1725 
1726   if (Property < 0 || Property >= PN_MAX)
1727   {
1728     DisplayErrorMessage ("SubmitCommandString bug.");
1729     return;
1730   }
1731   CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1732   if (StringArgument != NULL)
1733     CommandMessage.AddString (g_DataName, StringArgument);
1734   SubmitCommand (CommandMessage);
1735 }
1736 
1737 
1738 static void
1739 SubmitCommandInt32 (
1740   PropertyNumbers Property,
1741   uint32 CommandCode,
1742   int32 Int32Argument)
1743 {
1744   BMessage CommandMessage (CommandCode);
1745 
1746   if (Property < 0 || Property >= PN_MAX)
1747   {
1748     DisplayErrorMessage ("SubmitCommandInt32 bug.");
1749     return;
1750   }
1751   CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1752   CommandMessage.AddInt32 (g_DataName, Int32Argument);
1753   SubmitCommand (CommandMessage);
1754 }
1755 
1756 
1757 static void
1758 SubmitCommandBool (
1759   PropertyNumbers Property,
1760   uint32 CommandCode,
1761   bool BoolArgument)
1762 {
1763   BMessage CommandMessage (CommandCode);
1764 
1765   if (Property < 0 || Property >= PN_MAX)
1766   {
1767     DisplayErrorMessage ("SubmitCommandBool bug.");
1768     return;
1769   }
1770   CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1771   CommandMessage.AddBool (g_DataName, BoolArgument);
1772   SubmitCommand (CommandMessage);
1773 }
1774 
1775 
1776 
1777 /******************************************************************************
1778  * A utility function which will estimate the spaminess of file(s), not
1779  * callable from the application thread since it sends a scripting command to
1780  * the application and waits for results.  For each file there will be an entry
1781  * reference in the message.  For each of those, run it through the spam
1782  * estimator and display a box with the results.  This function is used both by
1783  * the file requestor and by dragging and dropping into the middle of the words
1784  * view.
1785  */
1786 
1787 static void
1788 EstimateRefFilesAndDisplay (BMessage *MessagePntr)
1789 {
1790   BAlert     *AlertPntr;
1791   BEntry      Entry;
1792   entry_ref   EntryRef;
1793   status_t    ErrorCode;
1794   int         i, j;
1795   BPath       Path;
1796   BMessage    ReplyMessage;
1797   BMessage    ScriptingMessage;
1798   const char *StringPntr;
1799   float       TempFloat;
1800   int32       TempInt32;
1801   char        TempString [PATH_MAX + 1024 +
1802                 g_MaxInterestingWords * (g_MaxWordLength + 16)];
1803 
1804   for (i = 0; MessagePntr->FindRef ("refs", i, &EntryRef) == B_OK; i++)
1805   {
1806     /* See if the entry is a valid file or directory or other thing. */
1807 
1808     ErrorCode = Entry.SetTo (&EntryRef, true /* traverse symbolic links */);
1809     if (ErrorCode != B_OK || !Entry.Exists () || Entry.GetPath (&Path) != B_OK)
1810       continue;
1811 
1812     /* Evaluate the spaminess of the file. */
1813 
1814     ScriptingMessage.MakeEmpty ();
1815     ScriptingMessage.what = B_SET_PROPERTY;
1816     ScriptingMessage.AddSpecifier (g_PropertyNames[PN_EVALUATE]);
1817     ScriptingMessage.AddString (g_DataName, Path.Path ());
1818 
1819     if (be_app_messenger.SendMessage (&ScriptingMessage,&ReplyMessage) != B_OK)
1820       break; /* App has died or something is wrong. */
1821 
1822     if (ReplyMessage.FindInt32 ("error", &TempInt32) != B_OK ||
1823     TempInt32 != B_OK)
1824       break; /* Error messages will be displayed elsewhere. */
1825 
1826     ReplyMessage.FindFloat (g_ResultName, &TempFloat);
1827     sprintf (TempString, "%f spam ratio for \"%s\".\nThe top words are:",
1828       (double) TempFloat, Path.Path ());
1829 
1830     for (j = 0; j < 20 /* Don't print too many! */; j++)
1831     {
1832       if (ReplyMessage.FindString ("words", j, &StringPntr) != B_OK ||
1833       ReplyMessage.FindFloat ("ratios", j, &TempFloat) != B_OK)
1834         break;
1835 
1836       sprintf (TempString + strlen (TempString), "\n%s / %f",
1837         StringPntr, TempFloat);
1838     }
1839     if (j >= 20 && j < g_MaxInterestingWords)
1840       sprintf (TempString + strlen (TempString), "\nAnd up to %d more words.",
1841         g_MaxInterestingWords - j);
1842 
1843     AlertPntr = new BAlert ("Estimate", TempString, "OK");
1844     if (AlertPntr != NULL) {
1845       AlertPntr->SetFlags(AlertPntr->Flags() | B_CLOSE_ON_ESCAPE);
1846       AlertPntr->Go ();
1847     }
1848   }
1849 }
1850 
1851 
1852 
/******************************************************************************
 * A utility function from the http://sourceforge.net/projects/spambayes
 * SpamBayes project.  Return prob(chisq >= x2, with v degrees of freedom).
 * That is the probability that a chi-squared value (a kind of normalized error
 * measurement) with v degrees of freedom would come out larger than the given
 * number x2 (chi is the Greek letter X thus x2).  A result near zero means
 * that your measured number is unusually large - chi-squared rarely gets that
 * big merely due to random effects - while a larger result means that random
 * effects push chi-squared over your number fairly often.  v must be even for
 * this calculation to work; if it is odd then -1.0 is returned.  Otherwise the
 * result is never more than 1.0.
 */

static double ChiSquaredProbability (double x2, int v)
{
  if ((v & 1) != 0)
    return -1.0; /* Out of range return value as a hint v is odd. */

  const int    NumberOfTerms = v / 2;
  const double HalfX2 = x2 / 2.0;

  /* The result is the sum of the first v/2 terms of the series
  exp(-m) * m**i / i!, where m is x2/2 and i starts at zero.  Each term is made
  from the previous one.  If x2 is very large, exp(-m) will underflow to 0. */

  double Term = exp (-HalfX2);
  double Sum = Term;
  int    TermIndex = 1;

  while (TermIndex < NumberOfTerms)
  {
    Term *= HalfX2 / TermIndex;
    Sum += Term;
    TermIndex++;
  }

  /* With small x2 and large v, accumulated roundoff error, plus error in the
  platform exp(), can cause the sum to spill a few ULP above 1.0.  Returning a
  value even a teensy bit over 1.0 is no good, so clamp it. */

  return (Sum > 1.0) ? 1.0 : Sum;
}
1896 
1897 
1898 
1899 /******************************************************************************
1900  * A utility function to remove the "[Spam 99.9%] " from in front of the
1901  * MAIL:subject attribute of a file.
1902  */
1903 
1904 static status_t RemoveSpamPrefixFromSubjectAttribute (BNode *BNodePntr)
1905 {
1906   status_t    ErrorCode;
1907   const char *MailSubjectName = "MAIL:subject";
1908   char       *StringPntr;
1909   char        SubjectString [2000];
1910 
1911   ErrorCode = BNodePntr->ReadAttr (MailSubjectName,
1912     B_STRING_TYPE, 0 /* offset */, SubjectString,
1913     sizeof (SubjectString) - 1);
1914   if (ErrorCode <= 0)
1915     return 0; /* The attribute isn't there so we don't care. */
1916   if (ErrorCode >= (int) sizeof (SubjectString) - 1)
1917     return 0; /* Can't handle subjects which are too long. */
1918 
1919   SubjectString [ErrorCode] = 0;
1920   ErrorCode = 0; /* So do-nothing exit returns zero. */
1921   if (strncmp (SubjectString, "[Spam ", 6) == 0)
1922   {
1923     for (StringPntr = SubjectString;
1924     *StringPntr != 0 && *StringPntr != ']'; StringPntr++)
1925       ; /* No body in this for loop. */
1926     if (StringPntr[0] == ']' && StringPntr[1] == ' ')
1927     {
1928       ErrorCode = BNodePntr->RemoveAttr (MailSubjectName);
1929       ErrorCode = BNodePntr->WriteAttr (MailSubjectName,
1930         B_STRING_TYPE, 0 /* offset */,
1931         StringPntr + 2, strlen (StringPntr + 2) + 1);
1932       if (ErrorCode > 0)
1933         ErrorCode = 0;
1934     }
1935   }
1936 
1937   return ErrorCode;
1938 }
1939 
1940 
1941 
1942 /******************************************************************************
1943  * The tokenizing functions.  To make tokenization of the text easier to
1944  * understand, it is broken up into several passes.  Each pass goes over the
1945  * text (can include NUL bytes) and extracts all the words it can recognise
1946  * (can be none).  The extracted words are added to the WordSet, with the
1947  * PrefixCharacter prepended (zero if none) so we can distinguish between words
1948  * found in headers and in the text body.  It also modifies the input text
1949  * buffer in-place to change the text that the next pass will see (blanking out
1950  * words that it wants to delete, but not inserting much new text since the
1951  * buffer can't be enlarged).  They all return the number of bytes remaining in
 * the buffer after it has been modified to be input for the next pass.
1953  * Returns zero if it has exhausted the possibility of getting more words, or
1954  * if something goes wrong.
1955  */
1956 
/* Convert the ASCII capital letters A to Z in the buffer to lower case, in
place.  All other bytes (including NUL bytes and bytes with the high bit set)
are left as they are.  Returns NumberOfBytes, since the amount of text doesn't
change. */

static size_t TokenizerPassLowerCase (
  char *BufferPntr,
  size_t NumberOfBytes)
{
  const int CaseOffset = 'a' - 'A';
  size_t    Index;
  char      Letter;

  for (Index = 0; Index < NumberOfBytes; Index++)
  {
    /* Do our own lower case conversion; tolower () has problems with UTF-8
    characters that have the high bit set. */

    Letter = BufferPntr [Index];
    if ('A' <= Letter && Letter <= 'Z')
      BufferPntr [Index] = Letter + CaseOffset;
  }
  return NumberOfBytes;
}
1976 
1977 
1978 /* A utility function for some commonly repeated code.  If this was Modula-2,
1979 we could use a nested procedure.  But it's not.  Adds the given word to the set
1980 of words, checking for maximum word length and prepending the prefix to the
1981 word, which gets modified by this function to reflect the word actually added
1982 to the set. */
1983 
1984 static void
1985 AddWordAndPrefixToSet (
1986   string &Word,
1987   const char *PrefixString,
1988   set<string> &WordSet)
1989 {
1990   if (Word.empty ())
1991     return;
1992 
1993   if (Word.size () > g_MaxWordLength)
1994     Word.resize (g_MaxWordLength);
1995   Word.insert (0, PrefixString);
1996   WordSet.insert (Word);
1997 }
1998 
1999 
2000 /* Hunt through the text for various URLs and extract the components as
2001 separate words.  Doesn't affect the text in the buffer.  Looks for
2002 protocol://user:password@computer:port/path?query=key#anchor strings.  Also
2003 www.blah strings are detected and broken down.  Doesn't do HREF="" strings
2004 where the string has a relative path (no host computer name).  Assumes the
2005 input buffer is already in lower case. */
2006 
2007 static size_t TokenizerPassExtractURLs (
2008   char *BufferPntr,
2009   size_t NumberOfBytes,
2010   char PrefixCharacter,
2011   set<string> &WordSet)
2012 {
2013   char   *AtSignStringPntr;
2014   char   *HostStringPntr;
2015   char   *InputStringEndPntr;
2016   char   *InputStringPntr;
2017   char   *OptionsStringPntr;
2018   char   *PathStringPntr;
2019   char    PrefixString [2];
2020   char   *ProtocolStringPntr;
2021   string  Word;
2022 
2023   InputStringPntr = BufferPntr;
2024   InputStringEndPntr = BufferPntr + NumberOfBytes;
2025   PrefixString [0] = PrefixCharacter;
2026   PrefixString [1] = 0;
2027 
2028   while (InputStringPntr < InputStringEndPntr - 4)
2029   {
2030     HostStringPntr = NULL;
2031     if (memcmp (InputStringPntr, "www.", 4) == 0)
2032       HostStringPntr = InputStringPntr;
2033     else if (memcmp (InputStringPntr, "://", 3) == 0)
2034     {
2035       /* Find the protocol name, and add it as a word such as "ftp:" "http:" */
2036       ProtocolStringPntr = InputStringPntr;
2037       while (ProtocolStringPntr > BufferPntr &&
2038       isalpha (ProtocolStringPntr[-1]))
2039         ProtocolStringPntr--;
2040       Word.assign (ProtocolStringPntr,
2041         (InputStringPntr - ProtocolStringPntr) + 1 /* for the colon */);
2042       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2043       HostStringPntr = InputStringPntr + 3; /* Skip past the "://" */
2044     }
2045     if (HostStringPntr == NULL)
2046     {
2047       InputStringPntr++;
2048       continue;
2049     }
2050 
2051     /* Got a host name string starting at HostStringPntr.  It's everything
2052     until the next slash or space, like "user:password@computer:port". */
2053 
2054     InputStringPntr = HostStringPntr;
2055     AtSignStringPntr = NULL;
2056     while (InputStringPntr < InputStringEndPntr &&
2057     (*InputStringPntr != '/' && !isspace (*InputStringPntr)))
2058     {
2059       if (*InputStringPntr == '@')
2060         AtSignStringPntr = InputStringPntr;
2061       InputStringPntr++;
2062     }
2063     if (AtSignStringPntr != NULL)
2064     {
2065       /* Add a word with the user and password, unseparated. */
2066       Word.assign (HostStringPntr,
2067         AtSignStringPntr - HostStringPntr + 1 /* for the @ sign */);
2068       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2069       HostStringPntr = AtSignStringPntr + 1;
2070     }
2071 
2072     /* Add a word with the computer and port, unseparated. */
2073 
2074     Word.assign (HostStringPntr, InputStringPntr - HostStringPntr);
2075     AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2076 
2077     /* Now get the path name, not including the extra junk after ?  and #
2078     separators (they're stored as separate options).  Stops at white space or a
2079     double quote mark. */
2080 
2081     PathStringPntr = InputStringPntr;
2082     OptionsStringPntr = NULL;
2083     while (InputStringPntr < InputStringEndPntr &&
2084     (*InputStringPntr != '"' && !isspace (*InputStringPntr)))
2085     {
2086       if (OptionsStringPntr == NULL &&
2087       (*InputStringPntr == '?' || *InputStringPntr == '#'))
2088         OptionsStringPntr = InputStringPntr;
2089       InputStringPntr++;
2090     }
2091 
2092     if (OptionsStringPntr == NULL)
2093     {
2094       /* No options, all path. */
2095       Word.assign (PathStringPntr, InputStringPntr - PathStringPntr);
2096       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2097     }
2098     else
2099     {
2100       /* Insert the path before the options. */
2101       Word.assign (PathStringPntr, OptionsStringPntr - PathStringPntr);
2102       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2103 
2104       /* Insert all the options as a word. */
2105       Word.assign (OptionsStringPntr, InputStringPntr - OptionsStringPntr);
2106       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2107     }
2108   }
2109   return NumberOfBytes;
2110 }
2111 
2112 
2113 /* Replace long Asian words (likely to actually be sentences) with the first
2114 character in the word. */
2115 
2116 static size_t TokenizerPassTruncateLongAsianWords (
2117   char *BufferPntr,
2118   size_t NumberOfBytes)
2119 {
2120   char *EndOfStringPntr;
2121   char *InputStringPntr;
2122   int   Letter;
2123   char *OutputStringPntr;
2124   char *StartOfInputLongUnicodeWord;
2125   char *StartOfOutputLongUnicodeWord;
2126 
2127   InputStringPntr = BufferPntr;
2128   EndOfStringPntr = InputStringPntr + NumberOfBytes;
2129   OutputStringPntr = InputStringPntr;
2130   StartOfInputLongUnicodeWord = NULL; /* Non-NULL flags it as started. */
2131   StartOfOutputLongUnicodeWord = NULL;
2132 
2133   /* Copy the text from the input to the output (same buffer), but when we find
2134   a sequence of UTF-8 characters that is too long then truncate it down to one
2135   character and reset the output pointer to be after that character, thus
2136   deleting the word.  Replacing the deleted characters after it with spaces
2137   won't work since we need to preserve the lack of space to handle those sneaky
2138   HTML artificial word breakers.  So that Thelongword<blah>ing becomes
2139   "T<blah>ing" rather than "T <blah>ing", so the next step joins them up into
2140   "Ting" rather than "T" and "ing".  The first code in a UTF-8 character is
2141   11xxxxxx and subsequent ones are 10xxxxxx. */
2142 
2143   while (InputStringPntr < EndOfStringPntr)
2144   {
2145     Letter = (unsigned char) *InputStringPntr;
2146     if (Letter < 128) // Got a regular ASCII letter?
2147     {
2148       if (StartOfInputLongUnicodeWord != NULL)
2149       {
2150         if (InputStringPntr - StartOfInputLongUnicodeWord >
2151         (int) g_MaxWordLength * 2)
2152         {
2153           /* Need to truncate the long word (100 bytes or about 50 characters)
2154           back down to the first UTF-8 character, so find out where the first
2155           character ends (skip past the 10xxxxxx bytes), and rewind the output
2156           pointer to be just after that (ignoring the rest of the long word in
2157           effect). */
2158 
2159           OutputStringPntr = StartOfOutputLongUnicodeWord + 1;
2160           while (OutputStringPntr < InputStringPntr)
2161           {
2162             Letter = (unsigned char) *OutputStringPntr;
2163             if (Letter < 128 || Letter >= 192)
2164               break;
2165             ++OutputStringPntr; // Still a UTF-8 middle of the character code.
2166           }
2167         }
2168         StartOfInputLongUnicodeWord = NULL;
2169       }
2170     }
2171     else if (Letter >= 192 && StartOfInputLongUnicodeWord == NULL)
2172     {
2173       /* Got the start of a UTF-8 character.  Remember the spot so we can see
2174       if this is a too long UTF-8 word, which is often a whole sentence in
2175       asian languages, since they sort of use a single character per word. */
2176 
2177       StartOfInputLongUnicodeWord = InputStringPntr;
2178       StartOfOutputLongUnicodeWord = OutputStringPntr;
2179     }
2180     *OutputStringPntr++ = *InputStringPntr++;
2181   }
2182   return OutputStringPntr - BufferPntr;
2183 }
2184 
2185 
2186 /* Find all the words in the string and add them to our local set of words.
2187 The characters considered white space are defined by g_SpaceCharacters.  This
2188 function is also used as a subroutine by other tokenizer functions when they
2189 have a bunch of presumably plain text they want broken into words and added. */
2190 
2191 static size_t TokenizerPassGetPlainWords (
2192   char *BufferPntr,
2193   size_t NumberOfBytes,
2194   char PrefixCharacter,
2195   set<string> &WordSet)
2196 {
2197   string  AccumulatedWord;
2198   char   *EndOfStringPntr;
2199   size_t  Length;
2200   int     Letter;
2201 
2202   if (NumberOfBytes <= 0)
2203     return 0; /* Nothing to process. */
2204 
2205   if (PrefixCharacter != 0)
2206     AccumulatedWord = PrefixCharacter;
2207   EndOfStringPntr = BufferPntr + NumberOfBytes;
2208   while (true)
2209   {
2210     if (BufferPntr >= EndOfStringPntr)
2211       Letter = EOF; // Usually a negative number.
2212     else
2213       Letter = (unsigned char) *BufferPntr++;
2214 
2215     /* See if it is a letter we treat as white space.  Some word separators
2216     like dashes and periods aren't considered as space.  Note that codes above
2217     127 are UTF-8 characters, which we consider non-space. */
2218 
2219     if (Letter < 0 /* EOF is -1 */ ||
2220     (Letter < 128 && g_SpaceCharacters[Letter]))
2221     {
2222       /* That space finished off a word.  Remove trailing periods... */
2223 
2224       while ((Length = AccumulatedWord.size()) > 0 &&
2225       AccumulatedWord [Length-1] == '.')
2226         AccumulatedWord.resize (Length - 1);
2227 
2228       /* If there's anything left in the word, add it to the set.  Also ignore
2229       words which are too big (it's probably some binary encoded data).  But
2230       leave room for supercalifragilisticexpialidoceous.  According to one web
2231       site, pneumonoultramicroscopicsilicovolcanoconiosis is the longest word
2232       currently in English.  Note that some uuencoded data was seen with a 60
2233       character line length. */
2234 
2235       if (PrefixCharacter != 0)
2236         Length--; // Don't count prefix when judging size or emptiness.
2237       if (Length > 0 && Length <= g_MaxWordLength)
2238         WordSet.insert (AccumulatedWord);
2239 
2240       /* Empty out the string to get ready for the next word.  Not quite empty,
2241       start it off with the prefix character if any. */
2242 
2243       if (PrefixCharacter != 0)
2244         AccumulatedWord = PrefixCharacter;
2245       else
2246         AccumulatedWord.resize (0);
2247     }
2248     else /* Not a space-like character, add it to the word. */
2249       AccumulatedWord.append (1 /* one copy of the char */, (char) Letter);
2250 
2251     if (Letter < 0)
2252       break; /* End of data.  Exit here so that last word got processed. */
2253   }
2254   return NumberOfBytes;
2255 }
2256 
2257 
2258 /* Delete Things from the text.  The Thing is marked by a start string and an
2259 end string, such as "<!--" and "--> for HTML comment things.  All the text
2260 between the markers will be added to the word list before it gets deleted from
2261 the buffer.  The markers must be prepared in lower case and the buffer is
2262 assumed to have already been converted to lower case.  You can specify an empty
2263 string for the end marker if you're just matching a string constant like
2264 "&nbsp;", which you would put in the starting marker.  This is a utility
2265 function used by other tokenizer functions. */
2266 
2267 static size_t TokenizerUtilRemoveStartEndThing (
2268   char *BufferPntr,
2269   size_t NumberOfBytes,
2270   char PrefixCharacter,
2271   set<string> &WordSet,
2272   const char *ThingStartCode,
2273   const char *ThingEndCode,
2274   bool ReplaceWithSpace)
2275 {
2276   char *EndOfStringPntr;
2277   bool  FoundAndDeletedThing;
2278   char *InputStringPntr;
2279   char *OutputStringPntr;
2280   int   ThingEndLength;
2281   char *ThingEndPntr;
2282   int   ThingStartLength;
2283 
2284   InputStringPntr = BufferPntr;
2285   EndOfStringPntr = InputStringPntr + NumberOfBytes;
2286   OutputStringPntr = InputStringPntr;
2287   ThingStartLength = strlen (ThingStartCode);
2288   ThingEndLength = strlen (ThingEndCode);
2289 
2290   if (ThingStartLength <= 0)
2291     return NumberOfBytes; /* Need some things to look for first! */
2292 
2293   while (InputStringPntr < EndOfStringPntr)
2294   {
2295     /* Search for the starting marker. */
2296 
2297     FoundAndDeletedThing = false;
2298     if (EndOfStringPntr - InputStringPntr >=
2299     ThingStartLength + ThingEndLength /* space remains for start + end */ &&
2300     *InputStringPntr == *ThingStartCode &&
2301     memcmp (InputStringPntr, ThingStartCode, ThingStartLength) == 0)
2302     {
2303       /* Found the start marker.  Look for the terminating string.  If it is an
2304       empty string, then we've found it right now! */
2305 
2306       ThingEndPntr = InputStringPntr + ThingStartLength;
2307       while (EndOfStringPntr - ThingEndPntr >= ThingEndLength)
2308       {
2309         if (ThingEndLength == 0 ||
2310         (*ThingEndPntr == *ThingEndCode &&
2311         memcmp (ThingEndPntr, ThingEndCode, ThingEndLength) == 0))
2312         {
2313           /* Got the end of the Thing.  First dump the text inbetween the start
2314           and end markers into the words list. */
2315 
2316           TokenizerPassGetPlainWords (InputStringPntr + ThingStartLength,
2317             ThingEndPntr - (InputStringPntr + ThingStartLength),
2318             PrefixCharacter, WordSet);
2319 
2320           /* Delete by not updating the output pointer while moving the input
2321           pointer to just after the ending tag. */
2322 
2323           InputStringPntr = ThingEndPntr + ThingEndLength;
2324           if (ReplaceWithSpace)
2325             *OutputStringPntr++ = ' ';
2326           FoundAndDeletedThing = true;
2327           break;
2328         }
2329         ThingEndPntr++;
2330       } /* End while ThingEndPntr */
2331     }
2332     if (!FoundAndDeletedThing)
2333       *OutputStringPntr++ = *InputStringPntr++;
2334   } /* End while InputStringPntr */
2335 
2336   return OutputStringPntr - BufferPntr;
2337 }
2338 
2339 
2340 static size_t TokenizerPassRemoveHTMLComments (
2341   char *BufferPntr,
2342   size_t NumberOfBytes,
2343   char PrefixCharacter,
2344   set<string> &WordSet)
2345 {
2346   return TokenizerUtilRemoveStartEndThing (BufferPntr, NumberOfBytes,
2347     PrefixCharacter, WordSet, "<!--", "-->", false);
2348 }
2349 
2350 
2351 static size_t TokenizerPassRemoveHTMLStyle (
2352   char *BufferPntr,
2353   size_t NumberOfBytes,
2354   char PrefixCharacter,
2355   set<string> &WordSet)
2356 {
2357   return TokenizerUtilRemoveStartEndThing (BufferPntr, NumberOfBytes,
2358     PrefixCharacter, WordSet,
2359     "<style", "/style>", false /* replace with space if true */);
2360 }
2361 
2362 
2363 /* Convert Japanese periods (a round hollow dot symbol) to spaces so that the
2364 start of the next sentence is recognised at least as the start of a very long
2365 word.  The Japanese comma also does the same job. */
2366 
2367 static size_t TokenizerPassJapanesePeriodsToSpaces (
2368   char *BufferPntr,
2369   size_t NumberOfBytes,
2370   char PrefixCharacter,
2371   set<string> &WordSet)
2372 {
2373   size_t BytesRemaining = NumberOfBytes;
2374 
2375   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2376     BytesRemaining, PrefixCharacter, WordSet, "。" /* period */, "", true);
2377   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2378     BytesRemaining, PrefixCharacter, WordSet, "、" /* comma */, "", true);
2379   return BytesRemaining;
2380 }
2381 
2382 
2383 /* Delete HTML tags from the text.  The contents of the tag are added as words
2384 before being deleted.  <P>, <BR> and &nbsp; are replaced by spaces at this
2385 stage while other HTML things get replaced by nothing. */
2386 
2387 static size_t TokenizerPassRemoveHTMLTags (
2388   char *BufferPntr,
2389   size_t NumberOfBytes,
2390   char PrefixCharacter,
2391   set<string> &WordSet)
2392 {
2393   size_t BytesRemaining = NumberOfBytes;
2394 
2395   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2396     BytesRemaining, PrefixCharacter, WordSet, "&nbsp;", "", true);
2397   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2398     BytesRemaining, PrefixCharacter, WordSet, "<p", ">", true);
2399   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2400     BytesRemaining, PrefixCharacter, WordSet, "<br", ">", true);
2401   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2402     BytesRemaining, PrefixCharacter, WordSet, "<", ">", false);
2403   return BytesRemaining;
2404 }
2405 
2406 
2407 
2408 /******************************************************************************
2409  * Implementation of the ABSApp class, constructor, destructor and the rest of
2410  * the member functions in mostly alphabetical order.
2411  */
2412 
2413 ABSApp::ABSApp ()
2414 : BApplication (g_ABSAppSignature),
2415   m_DatabaseHasChanged (false),
2416   m_SettingsHaveChanged (false)
2417 {
2418   status_t    ErrorCode;
2419   int         HalvingCount;
2420   int         i;
2421   const void *ResourceData;
2422   size_t      ResourceSize;
2423   BResources *ResourcesPntr;
2424 
2425   MakeDatabaseEmpty ();
2426 
2427   /* Set up the pathname which identifies our settings directory.  Note that
2428   the actual settings are loaded later on (or set to defaults) by the main()
2429   function, before this BApplication starts running.  So we don't bother
2430   initialising the other setting related variables here. */
2431 
2432   ErrorCode =
2433     find_directory (B_USER_SETTINGS_DIRECTORY, &m_SettingsDirectoryPath);
2434   if (ErrorCode == B_OK)
2435     ErrorCode = m_SettingsDirectoryPath.Append (g_SettingsDirectoryName);
2436   if (ErrorCode != B_OK)
2437     m_SettingsDirectoryPath.SetTo (".");
2438 
2439   /* Set up the table which identifies which characters are spaces and which
2440   are not.  Spaces are all control characters and all punctuation except for:
2441   apostrophe (so "it's" and possessive versions of words get stored), dash (for
2442   hyphenated words), dollar sign (for cash amounts), period (for IP addresses,
2443   we later remove trailing periods). */
2444 
2445   memset (g_SpaceCharacters, 1, sizeof (g_SpaceCharacters));
2446   g_SpaceCharacters['\''] = false;
2447   g_SpaceCharacters['-'] = false;
2448   g_SpaceCharacters['$'] = false;
2449   g_SpaceCharacters['.'] = false;
2450   for (i = '0'; i <= '9'; i++)
2451     g_SpaceCharacters[i] = false;
2452   for (i = 'A'; i <= 'Z'; i++)
2453     g_SpaceCharacters[i] = false;
2454   for (i = 'a'; i <= 'z'; i++)
2455     g_SpaceCharacters[i] = false;
2456 
2457   /* Initialise the busy cursor from data in the application's resources. */
2458 
2459   if ((ResourcesPntr = AppResources ()) != NULL && (ResourceData =
2460   ResourcesPntr->LoadResource ('CURS', "Busy Cursor", &ResourceSize)) != NULL
2461   && ResourceSize >= 68 /* Size of a raw 2x16x16x8+4 cursor is 68 bytes */)
2462     g_BusyCursor = new BCursor (ResourceData);
2463 
2464   /* Find out the smallest usable double by seeing how small we can make it. */
2465 
2466   m_SmallestUseableDouble = 1.0;
2467   HalvingCount = 0;
2468   while (HalvingCount < 10000 && m_SmallestUseableDouble > 0.0)
2469   {
2470     HalvingCount++;
2471     m_SmallestUseableDouble /= 2;
2472   }
2473 
2474   /* Recreate the number.  But don't make quite as small, we want to allow some
2475   precision bits and a bit of extra margin for intermediate results in future
2476   calculations. */
2477 
2478   HalvingCount -= 50 + sizeof (double) * 8;
2479 
2480   m_SmallestUseableDouble = 1.0;
2481   while (HalvingCount > 0)
2482   {
2483     HalvingCount--;
2484     m_SmallestUseableDouble /= 2;
2485   }
2486 }
2487 
2488 
2489 ABSApp::~ABSApp ()
2490 {
2491   status_t ErrorCode;
2492   char     ErrorMessage [PATH_MAX + 1024];
2493 
2494   if (m_SettingsHaveChanged)
2495     LoadSaveSettings (false /* DoLoad */);
2496   if ((ErrorCode = SaveDatabaseIfNeeded (ErrorMessage)) != B_OK)
2497     DisplayErrorMessage (ErrorMessage, ErrorCode, "Exiting Error");
2498   delete g_BusyCursor;
2499   g_BusyCursor = NULL;
2500 }
2501 
2502 
2503 /* Display a box showing information about this program. */
2504 
2505 void
2506 ABSApp::AboutRequested ()
2507 {
2508   BAlert *AboutAlertPntr;
2509 
2510   AboutAlertPntr = new BAlert ("About",
2511 "SpamDBM - Spam Database Manager\n\n"
2512 
2513 "This is a BeOS program for classifying e-mail messages as spam (unwanted \
2514 junk mail) or as genuine mail using a Bayesian statistical approach.  There \
2515 is also a Mail Daemon Replacement add-on to filter mail using the \
2516 classification statistics collected earlier.\n\n"
2517 
2518 "Written by Alexander G. M. Smith, fall 2002.\n\n"
2519 
2520 "The original idea was from Paul Graham's algorithm, which has an excellent \
2521 writeup at: http://www.paulgraham.com/spam.html\n\n"
2522 
2523 "Gary Robinson came up with the improved algorithm, which you can read about \
2524 at: http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html\n\n"
2525 
2526 "Mr. Robinson, Tim Peters and the SpamBayes mailing list people then \
2527 developed the even better chi-squared scoring method.\n\n"
2528 
2529 "Icon courtesy of Isaac Yonemoto, though it is no longer used since Hormel \
2530 doesn't want their meat product associated with junk e-mail.\n\n"
2531 
2532 "Tokenising code updated in 2005 to use some of the tricks that SpamBayes \
2533 uses to extract words from messages.  In particular, HTML is now handled.\n\n"
2534 
2535 "Released to the public domain, with no warranty.\n"
2536 "$Revision: 30630 $\n"
2537 "Compiled on " __DATE__ " at " __TIME__ ".", "Done");
2538   if (AboutAlertPntr != NULL)
2539   {
2540     AboutAlertPntr->SetFlags(AboutAlertPntr->Flags() | B_CLOSE_ON_ESCAPE);
2541     AboutAlertPntr->Go ();
2542   }
2543 }
2544 
2545 
2546 /* Add the text in the given file to the database as an example of a spam or
2547 genuine message, or removes it from the database if you claim it is
2548 CL_UNCERTAIN.  Also resets the spam ratio attribute to show the effect of the
2549 database change. */
2550 
2551 status_t ABSApp::AddFileToDatabase (
2552   ClassificationTypes IsSpamOrWhat,
2553   const char *FileName,
2554   char *ErrorMessage)
2555 {
2556   status_t ErrorCode;
2557   BFile    MessageFile;
2558   BMessage TempBMessage;
2559 
2560   ErrorCode = MessageFile.SetTo (FileName, B_READ_ONLY);
2561   if (ErrorCode != B_OK)
2562   {
2563     sprintf (ErrorMessage, "Unable to open file \"%s\" for reading", FileName);
2564     return ErrorCode;
2565   }
2566 
2567   ErrorCode = AddPositionIOToDatabase (IsSpamOrWhat,
2568     &MessageFile, FileName, ErrorMessage);
2569   MessageFile.Unset ();
2570   if (ErrorCode != B_OK)
2571     return ErrorCode;
2572 
2573   /* Re-evaluate the file so that the user sees the new ratio attribute. */
2574   return EvaluateFile (FileName, &TempBMessage, ErrorMessage);
2575 }
2576 
2577 
2578 /* Add the given text to the database.  The unique words found in MessageIOPntr
2579 will be added to the database (incrementing the count for the number of
2580 messages using each word, either the spam or genuine count depending on
2581 IsSpamOrWhat).  It will remove the message (decrement the word counts) if you
2582 specify CL_UNCERTAIN as the new classification.  And if it switches from spam
2583 to genuine or vice versa, it will do both - decrement the counts for the old
2584 class and increment the counts for the new one.  An attribute will be added to
2585 MessageIOPntr (if it is a file) to record that it has been marked as Spam or
2586 Genuine (so that it doesn't get added to the database a second time).  If it is
2587 being removed from the database, the classification attribute gets removed too.
2588 If things go wrong, a non-zero error code will be returned and an explanation
2589 written to ErrorMessage (assumed to be at least PATH_MAX + 1024 bytes long).
2590 OptionalFileName is just used in the error message to identify the file to the
2591 user. */
2592 
2593 status_t ABSApp::AddPositionIOToDatabase (
2594   ClassificationTypes IsSpamOrWhat,
2595   BPositionIO *MessageIOPntr,
2596   const char *OptionalFileName,
2597   char *ErrorMessage)
2598 {
2599   BNode                             *BNodePntr;
2600   char                               ClassificationString [NAME_MAX];
2601   StatisticsMap::iterator            DataIter;
2602   status_t                           ErrorCode = 0;
2603   pair<StatisticsMap::iterator,bool> InsertResult;
2604   uint32                             NewAge;
2605   StatisticsRecord                   NewStatistics;
2606   ClassificationTypes                PreviousClassification;
2607   StatisticsPointer                  StatisticsPntr;
2608   set<string>::iterator              WordEndIter;
2609   set<string>::iterator              WordIter;
2610   set<string>                        WordSet;
2611 
2612   NewAge = m_TotalGenuineMessages + m_TotalSpamMessages;
2613   if (NewAge >= 0xFFFFFFF0UL)
2614   {
2615     sprintf (ErrorMessage, "The database is full!  There are %lu messages in "
2616       "it and we can't add any more without overflowing the maximum integer "
2617       "representation in 32 bits", NewAge);
2618     return B_NO_MEMORY;
2619   }
2620 
2621   /* Check that this file hasn't already been added to the database. */
2622 
2623   PreviousClassification = CL_UNCERTAIN;
2624   BNodePntr = dynamic_cast<BNode *> (MessageIOPntr);
2625   if (BNodePntr != NULL) /* If this thing might have attributes. */
2626   {
2627     ErrorCode = BNodePntr->ReadAttr (g_AttributeNameClassification,
2628       B_STRING_TYPE, 0 /* offset */, ClassificationString,
2629       sizeof (ClassificationString) - 1);
2630     if (ErrorCode <= 0) /* Positive values for the number of bytes read */
2631       strcpy (ClassificationString, "none");
2632     else /* Just in case it needs a NUL at the end. */
2633       ClassificationString [ErrorCode] = 0;
2634 
2635     if (strcasecmp (ClassificationString, g_ClassifiedSpam) == 0)
2636       PreviousClassification = CL_SPAM;
2637     else if (strcasecmp (ClassificationString, g_ClassifiedGenuine) == 0)
2638       PreviousClassification = CL_GENUINE;
2639   }
2640 
2641   if (!m_IgnorePreviousClassification &&
2642   PreviousClassification != CL_UNCERTAIN)
2643   {
2644     if (IsSpamOrWhat == PreviousClassification)
2645     {
2646       sprintf (ErrorMessage, "Ignoring file \"%s\" since it seems to have "
2647         "already been classified as %s.", OptionalFileName,
2648         g_ClassificationTypeNames [IsSpamOrWhat]);
2649     }
2650     else
2651     {
2652       sprintf (ErrorMessage, "Changing existing classification of file \"%s\" "
2653         "from %s to %s.", OptionalFileName,
2654         g_ClassificationTypeNames [PreviousClassification],
2655         g_ClassificationTypeNames [IsSpamOrWhat]);
2656     }
2657     DisplayErrorMessage (ErrorMessage, 0, "Note");
2658   }
2659 
2660   if (!m_IgnorePreviousClassification &&
2661   IsSpamOrWhat == PreviousClassification)
2662     /* Nothing to do if it is already classified correctly and the user doesn't
2663     want double classification. */
2664     return B_OK;
2665 
2666   /* Get the list of unique words in the file. */
2667 
2668   ErrorCode = GetWordsFromPositionIO (MessageIOPntr, OptionalFileName,
2669     WordSet, ErrorMessage);
2670   if (ErrorCode != B_OK)
2671     return ErrorCode;
2672 
2673   /* Update the count of the number of messages processed, with corrections if
2674   reclassifying a message. */
2675 
2676   m_DatabaseHasChanged = true;
2677 
2678   if (!m_IgnorePreviousClassification &&
2679   PreviousClassification == CL_SPAM && m_TotalSpamMessages > 0)
2680     m_TotalSpamMessages--;
2681 
2682   if (IsSpamOrWhat == CL_SPAM)
2683     m_TotalSpamMessages++;
2684 
2685   if (!m_IgnorePreviousClassification &&
2686   PreviousClassification == CL_GENUINE && m_TotalGenuineMessages > 0)
2687       m_TotalGenuineMessages--;
2688 
2689   if (IsSpamOrWhat == CL_GENUINE)
2690     m_TotalGenuineMessages++;
2691 
2692   /* Mark the file's attributes with the new classification.  Don't care if it
2693   fails. */
2694 
2695   if (BNodePntr != NULL) /* If this thing might have attributes. */
2696   {
2697     ErrorCode = BNodePntr->RemoveAttr (g_AttributeNameClassification);
2698     if (IsSpamOrWhat != CL_UNCERTAIN)
2699     {
2700       strcpy (ClassificationString, g_ClassificationTypeNames [IsSpamOrWhat]);
2701       ErrorCode = BNodePntr->WriteAttr (g_AttributeNameClassification,
2702         B_STRING_TYPE, 0 /* offset */,
2703         ClassificationString, strlen (ClassificationString) + 1);
2704     }
2705   }
2706 
2707   /* Add the words to the database by incrementing or decrementing the counts
2708   for each word as appropriate. */
2709 
2710   WordEndIter = WordSet.end ();
2711   for (WordIter = WordSet.begin (); WordIter != WordEndIter; WordIter++)
2712   {
2713     if ((DataIter = m_WordMap.find (*WordIter)) == m_WordMap.end ())
2714     {
2715       /* No record in the database for the word. */
2716 
2717       if (IsSpamOrWhat == CL_UNCERTAIN)
2718         continue; /* Not adding words, don't have to subtract from nothing. */
2719 
2720       /* Create a new one record in the database for the new word. */
2721 
2722       memset (&NewStatistics, 0, sizeof (NewStatistics));
2723       InsertResult = m_WordMap.insert (
2724         StatisticsMap::value_type (*WordIter, NewStatistics));
2725       if (!InsertResult.second)
2726       {
2727         sprintf (ErrorMessage, "Failed to insert new database entry for "
2728           "word \"%s\", while processing file \"%s\"",
2729           WordIter->c_str (), OptionalFileName);
2730         return B_NO_MEMORY;
2731       }
2732       DataIter = InsertResult.first;
2733       m_WordCount++;
2734     }
2735 
2736     /* Got the database record for the word, update the statistics. */
2737 
2738     StatisticsPntr = &DataIter->second;
2739 
2740     StatisticsPntr->age = NewAge;
2741 
2742     /* Can't update m_OldestAge here, since it would take a lot of effort to
2743     find the next older age.  Since it's only used for display, we'll let it be
2744     slightly incorrect.  The next database load or purge will fix it. */
2745 
2746     if (IsSpamOrWhat == CL_SPAM)
2747       StatisticsPntr->spamCount++;
2748 
2749     if (IsSpamOrWhat == CL_GENUINE)
2750       StatisticsPntr->genuineCount++;
2751 
2752     if (!m_IgnorePreviousClassification &&
2753     PreviousClassification == CL_SPAM && StatisticsPntr->spamCount > 0)
2754       StatisticsPntr->spamCount--;
2755 
2756     if (!m_IgnorePreviousClassification &&
2757     PreviousClassification == CL_GENUINE && StatisticsPntr->genuineCount > 0)
2758       StatisticsPntr->genuineCount--;
2759   }
2760 
2761   return B_OK;
2762 }
2763 
2764 
2765 /* Add the text in the string to the database as an example of a spam or
2766 genuine message. */
2767 
2768 status_t ABSApp::AddStringToDatabase (
2769   ClassificationTypes IsSpamOrWhat,
2770   const char *String,
2771   char *ErrorMessage)
2772 {
2773   BMemoryIO MemoryIO (String, strlen (String));
2774 
2775   return AddPositionIOToDatabase (IsSpamOrWhat, &MemoryIO,
2776    "Memory Buffer" /* OptionalFileName */, ErrorMessage);
2777 }
2778 
2779 
2780 /* Given a bunch of text, find the words within it (doing special tricks to
2781 extract words from HTML), and add them to the set.  Allow NULs in the text.  If
2782 the PrefixCharacter isn't zero then it is prepended to all words found (so you
2783 can distinguish words as being from a header or from the body text).  See also
2784 TokenizeWhole which does something similar. */
2785 
2786 void
2787 ABSApp::AddWordsToSet (
2788   const char *InputString,
2789   size_t NumberOfBytes,
2790   char PrefixCharacter,
2791   set<string> &WordSet)
2792 {
2793   char   *BufferPntr;
2794   size_t  CurrentSize;
2795   int     PassNumber;
2796 
2797   /* Copy the input buffer.  The code will be modifying it in-place as HTML
2798   fragments and other junk are deleted. */
2799 
2800   BufferPntr = new char [NumberOfBytes];
2801   if (BufferPntr == NULL)
2802     return;
2803   memcpy (BufferPntr, InputString, NumberOfBytes);
2804 
2805   /* Do the tokenization.  Each pass does something to the text in the buffer,
2806   and may add words to the word set. */
2807 
2808   CurrentSize = NumberOfBytes;
2809   for (PassNumber = 1; PassNumber <= 8 && CurrentSize > 0 ; PassNumber++)
2810   {
2811     switch (PassNumber)
2812     {
2813       case 1: /* Lowercase first, rest of them assume lower case inputs. */
2814         CurrentSize = TokenizerPassLowerCase (BufferPntr, CurrentSize);
2815         break;
2816       case 2: CurrentSize = TokenizerPassJapanesePeriodsToSpaces (
2817         BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
2818       case 3: CurrentSize = TokenizerPassTruncateLongAsianWords (
2819         BufferPntr, CurrentSize); break;
2820       case 4: CurrentSize = TokenizerPassRemoveHTMLComments (
2821         BufferPntr, CurrentSize, 'Z', WordSet); break;
2822       case 5: CurrentSize = TokenizerPassRemoveHTMLStyle (
2823         BufferPntr, CurrentSize, 'Z', WordSet); break;
2824       case 6: CurrentSize = TokenizerPassExtractURLs (
2825         BufferPntr, CurrentSize, 'Z', WordSet); break;
2826       case 7: CurrentSize = TokenizerPassRemoveHTMLTags (
2827         BufferPntr, CurrentSize, 'Z', WordSet); break;
2828       case 8: CurrentSize = TokenizerPassGetPlainWords (
2829         BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
2830       default: break;
2831     }
2832   }
2833 
2834   delete [] BufferPntr;
2835 }
2836 
2837 
2838 /* The user has provided a command line.  This could actually be from a
2839 separate attempt to invoke the program (this application's resource/attributes
2840 have the launch flags set to "single launch", so the shell doesn't start the
2841 program but instead sends the arguments to the already running instance).  In
2842 either case, the command is sent to an intermediary thread where it is
2843 asynchronously converted into a scripting message(s) that are sent back to this
2844 BApplication.  The intermediary is needed since we can't recursively execute
2845 scripting messages while processing a message (this ArgsReceived one). */
2846 
2847 void
2848 ABSApp::ArgvReceived (int32 argc, char **argv)
2849 {
2850   if (g_CommanderLooperPntr != NULL)
2851     g_CommanderLooperPntr->CommandArguments (argc, argv);
2852 }
2853 
2854 
2855 /* Create a new empty database.  Note that we have to write out the new file
2856 immediately, otherwise other operations will see the empty database and then
2857 try to load the file, and complain that it doesn't exist.  Now they will see
2858 the empty database and redundantly load the empty file. */
2859 
2860 status_t ABSApp::CreateDatabaseFile (char *ErrorMessage)
2861 {
2862   MakeDatabaseEmpty ();
2863   m_DatabaseHasChanged = true;
2864   return SaveDatabaseIfNeeded (ErrorMessage); /* Make it now. */
2865 }
2866 
2867 
2868 /* Set the settings to the defaults.  Needed in case there isn't a settings
2869 file or it is obsolete. */
2870 
2871 void
2872 ABSApp::DefaultSettings ()
2873 {
2874   status_t ErrorCode;
2875   BPath    DatabasePath (m_SettingsDirectoryPath);
2876   char     TempString [PATH_MAX];
2877 
2878   /* The default database file is in the settings directory. */
2879 
2880   ErrorCode = DatabasePath.Append (g_DefaultDatabaseFileName);
2881   if (ErrorCode != B_OK)
2882     strcpy (TempString, g_DefaultDatabaseFileName); /* Unlikely to happen. */
2883   else
2884     strcpy (TempString, DatabasePath.Path ());
2885   m_DatabaseFileName.SetTo (TempString);
2886 
2887   // Users need to be allowed to undo their mistakes...
2888   m_IgnorePreviousClassification = true;
2889   g_ServerMode = true;
2890   m_PurgeAge = 2000;
2891   m_PurgePopularity = 2;
2892   m_ScoringMode = SM_CHISQUARED;
2893   m_TokenizeMode = TM_ANY_TEXT_HEADER;
2894 
2895   m_SettingsHaveChanged = true;
2896 }
2897 
2898 
2899 /* Deletes the database file, and the backup file, and clears the database but
2900 marks it as not changed so that it doesn't get written out when the program
2901 exits. */
2902 
2903 status_t ABSApp::DeleteDatabaseFile (char *ErrorMessage)
2904 {
2905   BEntry   FileEntry;
2906   status_t ErrorCode;
2907   int      i;
2908   char     TempString [PATH_MAX+20];
2909 
2910   /* Clear the in-memory database. */
2911 
2912   MakeDatabaseEmpty ();
2913   m_DatabaseHasChanged = false;
2914 
2915   /* Delete the backup files first.  Don't care if it fails. */
2916 
2917   for (i = 0; i < g_MaxBackups; i++)
2918   {
2919     strcpy (TempString, m_DatabaseFileName.String ());
2920     sprintf (TempString + strlen (TempString), g_BackupSuffix, i);
2921     ErrorCode = FileEntry.SetTo (TempString);
2922     if (ErrorCode == B_OK)
2923       FileEntry.Remove ();
2924   }
2925 
2926   /* Delete the main database file. */
2927 
2928   strcpy (TempString, m_DatabaseFileName.String ());
2929   ErrorCode = FileEntry.SetTo (TempString);
2930   if (ErrorCode != B_OK)
2931   {
2932     sprintf (ErrorMessage, "While deleting, failed to make BEntry for "
2933       "\"%s\" (does the directory exist?)", TempString);
2934     return ErrorCode;
2935   }
2936 
2937   ErrorCode = FileEntry.Remove ();
2938   if (ErrorCode != B_OK)
2939     sprintf (ErrorMessage, "While deleting, failed to remove file "
2940       "\"%s\"", TempString);
2941 
2942   return ErrorCode;
2943 }
2944 
2945 
2946 /* Evaluate the given file as being a spam message, and tag it with the
2947 resulting spam probability ratio.  If it also has an e-mail subject attribute,
2948 remove the [Spam 99.9%] prefix since the number usually changes. */
2949 
2950 status_t ABSApp::EvaluateFile (
2951   const char *PathName,
2952   BMessage *ReplyMessagePntr,
2953   char *ErrorMessage)
2954 {
2955   status_t ErrorCode;
2956   float    TempFloat;
2957   BFile    TextFile;
2958 
2959   /* Open the specified file. */
2960 
2961   ErrorCode = TextFile.SetTo (PathName, B_READ_ONLY);
2962   if (ErrorCode != B_OK)
2963   {
2964     sprintf (ErrorMessage, "Problems opening file \"%s\" for evaluating",
2965       PathName);
2966     return ErrorCode;
2967   }
2968 
2969   ErrorCode =
2970     EvaluatePositionIO (&TextFile, PathName, ReplyMessagePntr, ErrorMessage);
2971 
2972   if (ErrorCode == B_OK &&
2973   ReplyMessagePntr->FindFloat (g_ResultName, &TempFloat) == B_OK)
2974   {
2975     TextFile.WriteAttr (g_AttributeNameSpamRatio, B_FLOAT_TYPE,
2976       0 /* offset */, &TempFloat, sizeof (TempFloat));
2977     /* Don't know the spam cutoff ratio, that's in the e-mail filter, so just
2978     blindly remove the prefix, which would have the wrong percentage. */
2979     RemoveSpamPrefixFromSubjectAttribute (&TextFile);
2980   }
2981 
2982   return ErrorCode;
2983 }
2984 
2985 
2986 /* Evaluate a given file or memory buffer (a BPositionIO handles both cases)
2987 for spaminess.  The output is added to the ReplyMessagePntr message, with the
2988 probability ratio stored in "result" (0.0 means genuine and 1.0 means spam).
2989 It also adds the most significant words (used in the ratio calculation) to the
2990 array "words" and the associated per-word probability ratios in "ratios".  If
2991 it fails, an error code is returned and an error message written to the
ErrorMessage string (which is at least PATH_MAX + 1024 bytes long).
2993 OptionalFileName is only used in the error message.
2994 
2995 The math used for combining the individual word probabilities in my method is
2996 based on Gary Robinson's method (formerly it was a variation of Paul Graham's
method) or the Chi-Squared method.  Its input is the database of words that
2998 has a count of the number of spam and number of genuine messages each word
2999 appears in (doesn't matter if it appears more than once in a message, it still
3000 counts as 1).
3001 
The spam word count is divided by the total number of spam e-mail messages
3003 in the database to get the probability of spam and probability of genuineness
3004 is similarly computed for a particular word.  The spam probability is divided
3005 by the sum of the spam and genuine probabilities to get the Raw Spam Ratio for
3006 the word.  It's nearer to 0.0 for genuine and nearer to 1.0 for spam, and can
3007 be exactly zero or one too.
3008 
3009 To avoid multiplying later results by zero, and to compensate for a lack of
3010 data points, the Raw Spam Ratio is adjusted towards the 0.5 halfway point.  The
3011 0.5 is combined with the raw spam ratio, with a weight of 0.45 (determined to
3012 be a good value by the "spambayes" mailing list tests) messages applied to the
3013 half way point and a weight of the number of spam + genuine messages applied to
3014 the raw spam ratio.  This gives you the compensated spam ratio for the word.
3015 
3016 The top N (150 was good in the spambayes tests) extreme words are selected by
3017 the distance of each word's compensated spam ratio from 0.5.  Then the ratios
3018 of the words are combined.
3019 
3020 The Gary Robinson combining (scoring) method gets one value from the Nth root
3021 of the product of all the word ratios.  The other is the Nth root of the
3022 product of (1 - ratio) for all the words.  The final result is the first value
3023 divided by the sum of the two values.  The Nth root helps spread the resulting
3024 range of values more evenly between 0.0 and 1.0, otherwise the values all clump
3025 together at 0 or 1.  Also you can think of the Nth root as a kind of average
3026 for products; it's like a generic word probability which when multiplied by
3027 itself N times gives you the same result as the N separate actual word
3028 probabilities multiplied together.
3029 
3030 The Chi-Squared combining (scoring) method assumes that the spam word
3031 probabilities are uniformly distributed and computes an error measurement
3032 (called chi squared - see http://bmj.com/collections/statsbk/8.shtml for a good
3033 tutorial) and then sees how likely that error value would be observed in
3034 practice.  If it's rare to observe, then the words are likely not just randomly
3035 occurring and it's spammy.  The same is done for genuine words.  The two
3036 resulting unlikelihoods are compared to see which is more unlikely, if neither
3037 is, then the method says it can't decide.  The SpamBayes notes (see the
3038 classifier.py file in CVS in http://sourceforge.net/projects/spambayes) say:
3039 
3040 "Across vectors of length n, containing random uniformly-distributed
3041 probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution with 2*n
3042 degrees of freedom.  This has been proven (in some appropriate sense) to be the
3043 most sensitive possible test for rejecting the hypothesis that a vector of
3044 probabilities is uniformly distributed.  Gary Robinson's original scheme was
3045 monotonic *with* this test, but skipped the details.  Turns out that getting
3046 closer to the theoretical roots gives a much sharper classification, with a
3047 very small (in # of msgs), but also very broad (in range of scores), "middle
3048 ground", where most of the mistakes live.  In particular, this scheme seems
3049 immune to all forms of "cancellation disease": if there are many strong ham
3050 *and* spam clues, this reliably scores close to 0.5.  Most other schemes are
3051 extremely certain then -- and often wrong."
3052 
3053 I did a test with 448 example genuine messages including personal mail (some
3054 with HTML attachments) and mailing lists, and 267 spam messages for 27471 words
3055 total.  Test messages were more recent messages in the same groups.  Out of 100
3056 test genuine messages, with Gary Robinson (0.56 cutoff limit), 1 (1%) was
3057 falsely identified as spam and 8 of 73 (11%) spam messages were incorrectly
3058 classified as genuine.  With my variation of Paul Graham's scheme (0.90 cutoff)
3059 I got 6 of 100 (6%) genuine messages incorrectly marked as spam and 2 of 73
3060 (3%) spam messages were incorrectly classified as genuine.  Pretty close, but
3061 Robinson's values are more evenly spread out so you can tell just how spammy it
3062 is by looking at the number. */
3063 
/* A word paired with its compensated spam ratio.  The word is referred to by
pointer only; this structure does not copy or free the string.  The structure
is also its own comparison functor: it orders items by how far their ratio lies
from the neutral 0.5 point, so the item with the most extreme ratio compares as
the greatest. */

struct WordAndRatioStruct
{
  double        probabilityRatio; /* Actually the compensated ratio. */
  const string *wordPntr;         /* The word the ratio belongs to. */

  /* Less-than test used for sorting: true when ItemA's ratio is strictly
  closer to 0.5 than ItemB's ratio is. */

  bool operator() (
    const WordAndRatioStruct &ItemA,
    const WordAndRatioStruct &ItemB) const
  {
    const double DistanceA = fabs (ItemA.probabilityRatio - 0.5);
    const double DistanceB = fabs (ItemB.probabilityRatio - 0.5);

    return DistanceA < DistanceB;
  }
};
3078 
3079 status_t ABSApp::EvaluatePositionIO (
3080   BPositionIO *PositionIOPntr,
3081   const char *OptionalFileName,
3082   BMessage *ReplyMessagePntr,
3083   char *ErrorMessage)
3084 {
3085   StatisticsMap::iterator            DataEndIter;
3086   StatisticsMap::iterator            DataIter;
3087   status_t                           ErrorCode;
3088   double                             GenuineProbability;
3089   uint32                             GenuineSpamSum;
3090   int                                i;
3091   priority_queue<
3092     WordAndRatioStruct /* Data type stored in the queue */,
3093     vector<WordAndRatioStruct> /* Underlying container */,
3094     WordAndRatioStruct /* Function for comparing elements */>
3095                                      PriorityQueue;
3096   double                             ProductGenuine;
3097   double                             ProductLogGenuine;
3098   double                             ProductLogSpam;
3099   double                             ProductSpam;
3100   double                             RawProbabilityRatio;
3101   float                              ResultRatio;
3102   double                             SpamProbability;
3103   StatisticsPointer                  StatisticsPntr;
3104   double                             TempDouble;
3105   double                             TotalGenuine;
3106   double                             TotalSpam;
3107   WordAndRatioStruct                 WordAndRatio;
3108   set<string>::iterator              WordEndIter;
3109   set<string>::iterator              WordIter;
3110   const WordAndRatioStruct          *WordRatioPntr;
3111   set<string>                        WordSet;
3112 
3113   /* Get the list of unique words in the file / memory buffer. */
3114 
3115   ErrorCode = GetWordsFromPositionIO (PositionIOPntr, OptionalFileName,
3116     WordSet, ErrorMessage);
3117   if (ErrorCode != B_OK)
3118     return ErrorCode;
3119 
3120   /* Prepare a few variables.  Mostly these are stored double values of some of
3121   the numbers involved (to avoid the overhead of multiple conversions from
3122   integer to double), with extra precautions to avoid divide by zero. */
3123 
3124   if (m_TotalGenuineMessages <= 0)
3125     TotalGenuine = 1.0;
3126   else
3127     TotalGenuine = m_TotalGenuineMessages;
3128 
3129   if (m_TotalSpamMessages <= 0)
3130     TotalSpam = 1.0;
3131   else
3132     TotalSpam = m_TotalSpamMessages;
3133 
3134   /* Look up the words in the database and calculate their compensated spam
3135   ratio.  The results are stored in a priority queue so that we can later find
3136   the top g_MaxInterestingWords for doing the actual determination. */
3137 
3138   WordEndIter = WordSet.end ();
3139   DataEndIter = m_WordMap.end ();
3140   for (WordIter = WordSet.begin (); WordIter != WordEndIter; WordIter++)
3141   {
3142     WordAndRatio.wordPntr = &(*WordIter);
3143 
3144     if ((DataIter = m_WordMap.find (*WordIter)) != DataEndIter)
3145     {
3146       StatisticsPntr = &DataIter->second;
3147 
3148       /* Calculate the probability the word is spam and the probability it is
3149       genuine.  Then the raw probability ratio. */
3150 
3151       SpamProbability = StatisticsPntr->spamCount / TotalSpam;
3152       GenuineProbability = StatisticsPntr->genuineCount / TotalGenuine;
3153 
3154       if (SpamProbability + GenuineProbability > 0)
3155         RawProbabilityRatio =
3156         SpamProbability / (SpamProbability + GenuineProbability);
3157       else /* Word with zero statistics, perhaps due to reclassification. */
3158         RawProbabilityRatio = 0.5;
3159 
3160       /* The compensated ratio leans towards 0.5 (g_RobinsonX) more for fewer
3161       data points, with a weight of 0.45 (g_RobinsonS). */
3162 
3163       GenuineSpamSum =
3164         StatisticsPntr->spamCount + StatisticsPntr->genuineCount;
3165 
3166       WordAndRatio.probabilityRatio =
3167         (g_RobinsonS * g_RobinsonX + GenuineSpamSum * RawProbabilityRatio) /
3168         (g_RobinsonS + GenuineSpamSum);
3169     }
3170     else /* Unknown word. With N=0, compensated ratio equation is RobinsonX. */
3171       WordAndRatio.probabilityRatio = g_RobinsonX;
3172 
3173      PriorityQueue.push (WordAndRatio);
3174   }
3175 
3176   /* Compute the combined probability (multiply them together) of the top few
3177   words.  To avoid numeric underflow (doubles can only get as small as 1E-300),
3178   logarithms are also used.  But avoid the logarithms (sum of logs of numbers
3179   is the same as the product of numbers) as much as possible due to reduced
3180   accuracy and slowness. */
3181 
3182   ProductGenuine = 1.0;
3183   ProductLogGenuine = 0.0;
3184   ProductSpam = 1.0;
3185   ProductLogSpam = 0.0;
3186   for (i = 0;
3187   i < g_MaxInterestingWords && !PriorityQueue.empty();
3188   i++, PriorityQueue.pop())
3189   {
3190     WordRatioPntr = &PriorityQueue.top();
3191     ProductSpam *= WordRatioPntr->probabilityRatio;
3192     ProductGenuine *= 1.0 - WordRatioPntr->probabilityRatio;
3193 
3194     /* Check for the numbers getting dangerously small, close to underflowing.
3195     If they are, move the value into the logarithm storage part. */
3196 
3197     if (ProductSpam < m_SmallestUseableDouble)
3198     {
3199       ProductLogSpam += log (ProductSpam);
3200       ProductSpam = 1.0;
3201     }
3202 
3203     if (ProductGenuine < m_SmallestUseableDouble)
3204     {
3205       ProductLogGenuine += log (ProductGenuine);
3206       ProductGenuine = 1.0;
3207     }
3208 
3209     ReplyMessagePntr->AddString ("words", WordRatioPntr->wordPntr->c_str ());
3210     ReplyMessagePntr->AddFloat ("ratios", WordRatioPntr->probabilityRatio);
3211   }
3212 
3213   /* Get the resulting log of the complete products. */
3214 
3215   if (i > 0)
3216   {
3217     ProductLogSpam += log (ProductSpam);
3218     ProductLogGenuine += log (ProductGenuine);
3219   }
3220 
3221   if (m_ScoringMode == SM_ROBINSON)
3222   {
3223     /* Apply Gary Robinson's scoring method where we take the Nth root of the
3224     products.  This is easiest in logarithm form. */
3225 
3226     if (i > 0)
3227     {
3228       ProductSpam = exp (ProductLogSpam / i);
3229       ProductGenuine = exp (ProductLogGenuine / i);
3230       ResultRatio = ProductSpam / (ProductGenuine + ProductSpam);
3231     }
3232     else /* Somehow got no words! */
3233       ResultRatio = g_RobinsonX;
3234   }
3235   else if (m_ScoringMode == SM_CHISQUARED)
3236   {
3237     /* From the SpamBayes notes: "We compute two chi-squared statistics, one
3238     for ham and one for spam.  The sum-of-the-logs business is more sensitive
3239     to probs near 0 than to probs near 1, so the spam measure uses 1-p (so that
3240     high-spamprob words have greatest effect), and the ham measure uses p
3241     directly (so that lo-spamprob words have greatest effect)."  That means we
3242     just reversed the meaning of the previously calculated spam and genuine
3243     products!  Oh well. */
3244 
3245     TempDouble = ProductLogSpam;
3246     ProductLogSpam = ProductLogGenuine;
3247     ProductLogGenuine = TempDouble;
3248 
3249     if (i > 0)
3250     {
3251       ProductSpam =
3252         1.0 - ChiSquaredProbability (-2.0 * ProductLogSpam, 2 * i);
3253       ProductGenuine =
3254         1.0 - ChiSquaredProbability (-2.0 * ProductLogGenuine, 2 * i);
3255 
3256       /* The SpamBayes notes say: "How to combine these into a single spam
3257       score?  We originally used (S-H)/(S+H) scaled into [0., 1.], which equals
3258       S/(S+H).  A systematic problem is that we could end up being near-certain
3259       a thing was (for example) spam, even if S was small, provided that H was
3260       much smaller.  Rob Hooft stared at these problems and invented the
3261       measure we use now, the simpler S-H, scaled into [0., 1.]." */
3262 
3263       ResultRatio = (ProductSpam - ProductGenuine + 1.0) / 2.0;
3264     }
3265     else /* No words to analyse. */
3266       ResultRatio = 0.5;
3267   }
3268   else /* Unknown scoring mode. */
3269   {
3270     strcpy (ErrorMessage, "Unknown scoring mode specified in settings");
3271     return B_BAD_VALUE;
3272   }
3273 
3274   ReplyMessagePntr->AddFloat (g_ResultName, ResultRatio);
3275   return B_OK;
3276 }
3277 
3278 
3279 /* Just evaluate the given string as being spam text. */
3280 
3281 status_t ABSApp::EvaluateString (
3282   const char *BufferPntr,
3283   ssize_t BufferSize,
3284   BMessage *ReplyMessagePntr,
3285   char *ErrorMessage)
3286 {
3287   BMemoryIO MemoryIO (BufferPntr, BufferSize);
3288 
3289   return EvaluatePositionIO (&MemoryIO, "Memory Buffer",
3290     ReplyMessagePntr, ErrorMessage);
3291 }
3292 
3293 
3294 /* Tell other programs about the scripting commands we support.  Try this
3295 command: "hey application/x-vnd.agmsmith.spamdbm getsuites" to
3296 see it in action (this program has to be already running for it to work). */
3297 
3298 status_t ABSApp::GetSupportedSuites (BMessage *MessagePntr)
3299 {
3300   BPropertyInfo TempPropInfo (g_ScriptingPropertyList);
3301 
3302   MessagePntr->AddString ("suites", "suite/x-vnd.agmsmith.spamdbm");
3303   MessagePntr->AddFlat ("messages", &TempPropInfo);
3304   return BApplication::GetSupportedSuites (MessagePntr);
3305 }
3306 
3307 
3308 /* Add all the words in the given file or memory buffer to the supplied set.
3309 The file name is only there for error messages, it assumes you have already
3310 opened the PositionIO to the right file.  If things go wrong, a non-zero error
3311 code will be returned and an explanation written to ErrorMessage (assumed to be
3312 at least PATH_MAX + 1024 bytes long). */
3313 
3314 status_t ABSApp::GetWordsFromPositionIO (
3315   BPositionIO *PositionIOPntr,
3316   const char *OptionalFileName,
3317   set<string> &WordSet,
3318   char *ErrorMessage)
3319 {
3320   status_t ErrorCode;
3321 
3322   if (m_TokenizeMode == TM_WHOLE)
3323     ErrorCode = TokenizeWhole (PositionIOPntr, OptionalFileName,
3324       WordSet, ErrorMessage);
3325   else
3326     ErrorCode = TokenizeParts (PositionIOPntr, OptionalFileName,
3327       WordSet, ErrorMessage);
3328 
3329   if (ErrorCode == B_OK && WordSet.empty ())
3330   {
3331     /* ENOMSG usually means no message found in queue, but I'm using it to show
3332     no words, a good indicator of spam which is pure HTML. */
3333 
3334     sprintf (ErrorMessage, "No words were found in \"%s\"", OptionalFileName);
3335     ErrorCode = ENOMSG;
3336   }
3337 
3338   return ErrorCode;
3339 }
3340 
3341 
3342 /* Set up indices for attributes MAIL:classification (string) and
3343 MAIL:ratio_spam (float) on all mounted disk volumes that support queries.  Also
3344 tell the system to make those attributes visible to the user (so they can see
3345 them in Tracker) and associate them with e-mail messages.  Also set up the
3346 database file MIME type (provide a description and associate it with this
3347 program so that it picks up the right icon).  And register the names for our
3348 sound effects. */
3349 
3350 status_t ABSApp::InstallThings (char *ErrorMessage)
3351 {
3352   int32       Cookie;
3353   dev_t       DeviceID;
3354   status_t    ErrorCode = B_OK;
3355   fs_info     FSInfo;
3356   int32       i;
3357   int32       iClassification;
3358   int32       iProbability;
3359   int32       j;
3360   index_info  IndexInfo;
3361   BMimeType   MimeType;
3362   BMessage    Parameters;
3363   const char *StringPntr;
3364   bool        TempBool;
3365   int32       TempInt32;
3366 
3367   /* Iterate through all mounted devices and try to make the indices on each
3368   one.  Don't bother if the index exists or the device doesn't support indices
3369   (actually queries). */
3370 
3371   Cookie = 0;
3372   while ((DeviceID = next_dev (&Cookie)) >= 0)
3373   {
3374     if (!fs_stat_dev (DeviceID, &FSInfo) && (FSInfo.flags & B_FS_HAS_QUERY))
3375     {
3376       if (fs_stat_index (DeviceID, g_AttributeNameClassification, &IndexInfo)
3377       && errno == B_ENTRY_NOT_FOUND)
3378       {
3379         if (fs_create_index (DeviceID, g_AttributeNameClassification,
3380         B_STRING_TYPE, 0 /* flags */))
3381         {
3382           ErrorCode = errno;
3383           sprintf (ErrorMessage, "Unable to make string index %s on "
3384             "volume #%d, volume name \"%s\", file system type \"%s\", "
3385             "on device \"%s\"", g_AttributeNameClassification,
3386             (int) DeviceID, FSInfo.volume_name, FSInfo.fsh_name,
3387             FSInfo.device_name);
3388         }
3389       }
3390 
3391       if (fs_stat_index (DeviceID, g_AttributeNameSpamRatio,
3392       &IndexInfo) && errno == B_ENTRY_NOT_FOUND)
3393       {
3394         if (fs_create_index (DeviceID, g_AttributeNameSpamRatio,
3395         B_FLOAT_TYPE, 0 /* flags */))
3396         {
3397           ErrorCode = errno;
3398           sprintf (ErrorMessage, "Unable to make float index %s on "
3399             "volume #%d, volume name \"%s\", file system type \"%s\", "
3400             "on device \"%s\"", g_AttributeNameSpamRatio,
3401             (int) DeviceID, FSInfo.volume_name, FSInfo.fsh_name,
3402             FSInfo.device_name);
3403         }
3404       }
3405     }
3406   }
3407   if (ErrorCode != B_OK)
3408     return ErrorCode;
3409 
3410   /* Set up the MIME types for the classification attributes, associate them
3411   with e-mail and make them visible to the user (but not editable).  First need
3412   to get the existing MIME settings, then add ours to them (otherwise the
3413   existing ones get wiped out). */
3414 
3415   ErrorCode = MimeType.SetTo ("text/x-email");
3416   if (ErrorCode != B_OK || !MimeType.IsInstalled ())
3417   {
3418     sprintf (ErrorMessage, "No e-mail MIME type (%s) in the system, can't "
3419       "update it to add our special attributes, and without e-mail this "
3420       "program is useless!", MimeType.Type ());
3421     if (ErrorCode == B_OK)
3422       ErrorCode = -1;
3423     return ErrorCode;
3424   }
3425 
3426   ErrorCode = MimeType.GetAttrInfo (&Parameters);
3427   if (ErrorCode != B_OK)
3428   {
3429     sprintf (ErrorMessage, "Unable to retrieve list of attributes "
3430       "associated with e-mail messages in the MIME database");
3431     return ErrorCode;
3432   }
3433 
3434   for (i = 0, iClassification = -1, iProbability = -1;
3435   i < 1000 && (iClassification < 0 || iProbability < 0);
3436   i++)
3437   {
3438     ErrorCode = Parameters.FindString ("attr:name", i, &StringPntr);
3439     if (ErrorCode != B_OK)
3440       break; /* Reached the end of the attributes. */
3441     if (strcmp (StringPntr, g_AttributeNameClassification) == 0)
3442       iClassification = i;
3443     else if (strcmp (StringPntr, g_AttributeNameSpamRatio) == 0)
3444       iProbability = i;
3445   }
3446 
3447   /* Add extra default settings for those programs which previously didn't
3448   update the MIME database with all the attributes that exist (so our new
3449   additions don't show up at the wrong index). */
3450 
3451   i--; /* Set i to index of last valid attribute. */
3452 
3453   for (j = 0; j <= i; j++)
3454   {
3455     if (Parameters.FindString ("attr:public_name", j, &StringPntr) ==
3456     B_BAD_INDEX)
3457     {
3458       if (Parameters.FindString ("attr:name", j, &StringPntr) != B_OK)
3459         StringPntr = "None!";
3460       Parameters.AddString ("attr:public_name", StringPntr);
3461     }
3462   }
3463 
3464   while (Parameters.FindInt32 ("attr:type", i, &TempInt32) == B_BAD_INDEX)
3465     Parameters.AddInt32 ("attr:type", B_STRING_TYPE);
3466 
3467   while (Parameters.FindBool ("attr:viewable", i, &TempBool) == B_BAD_INDEX)
3468     Parameters.AddBool ("attr:viewable", true);
3469 
3470   while (Parameters.FindBool ("attr:editable", i, &TempBool) == B_BAD_INDEX)
3471     Parameters.AddBool ("attr:editable", false);
3472 
3473   while (Parameters.FindInt32 ("attr:width", i, &TempInt32) == B_BAD_INDEX)
3474     Parameters.AddInt32 ("attr:width", 60);
3475 
3476   while (Parameters.FindInt32 ("attr:alignment", i, &TempInt32) == B_BAD_INDEX)
3477     Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3478 
3479   while (Parameters.FindBool ("attr:extra", i, &TempBool) == B_BAD_INDEX)
3480     Parameters.AddBool ("attr:extra", false);
3481 
3482   /* Add our new attributes to e-mail related things, if not already there. */
3483 
3484   if (iClassification < 0)
3485   {
3486     Parameters.AddString ("attr:name", g_AttributeNameClassification);
3487     Parameters.AddString ("attr:public_name", "Classification Group");
3488     Parameters.AddInt32 ("attr:type", B_STRING_TYPE);
3489     Parameters.AddBool ("attr:viewable", true);
3490     Parameters.AddBool ("attr:editable", false);
3491     Parameters.AddInt32 ("attr:width", 45);
3492     Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3493     Parameters.AddBool ("attr:extra", false);
3494   }
3495 
3496   if (iProbability < 0)
3497   {
3498     Parameters.AddString ("attr:name", g_AttributeNameSpamRatio);
3499     Parameters.AddString ("attr:public_name", "Spam/Genuine Estimate");
3500     Parameters.AddInt32 ("attr:type", B_FLOAT_TYPE);
3501     Parameters.AddBool ("attr:viewable", true);
3502     Parameters.AddBool ("attr:editable", false);
3503     Parameters.AddInt32 ("attr:width", 50);
3504     Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3505     Parameters.AddBool ("attr:extra", false);
3506   }
3507 
3508   if (iClassification < 0 || iProbability < 0)
3509   {
3510     ErrorCode = MimeType.SetAttrInfo (&Parameters);
3511     if (ErrorCode != B_OK)
3512     {
3513       sprintf (ErrorMessage, "Unable to associate the classification "
3514         "attributes with e-mail messages in the MIME database");
3515       return ErrorCode;
3516     }
3517   }
3518 
3519   /* Set up the MIME type for the database file. */
3520 
3521   sprintf (ErrorMessage, "Problems with setting up MIME type (%s) for "
3522     "the database files", g_ABSDatabaseFileMIMEType); /* A generic message. */
3523 
3524   ErrorCode = MimeType.SetTo (g_ABSDatabaseFileMIMEType);
3525   if (ErrorCode != B_OK)
3526     return ErrorCode;
3527 
3528   MimeType.Delete ();
3529   ErrorCode = MimeType.Install ();
3530   if (ErrorCode != B_OK)
3531   {
3532     sprintf (ErrorMessage, "Failed to install MIME type (%s) in the system",
3533       MimeType.Type ());
3534     return ErrorCode;
3535   }
3536 
3537   MimeType.SetShortDescription ("Spam Database");
3538   MimeType.SetLongDescription ("Bayesian Statistical Database for "
3539     "Classifying Junk E-Mail");
3540   sprintf (ErrorMessage, "1.0 ('%s')", g_DatabaseRecognitionString);
3541   MimeType.SetSnifferRule (ErrorMessage);
3542   MimeType.SetPreferredApp (g_ABSAppSignature);
3543 
3544   /* Set up the names of the sound effects.  Later on the user can associate
3545   sound files with the names by using the Sounds preferences panel or the
3546   installsound command.  The MDR add-on filter will trigger these sounds. */
3547 
3548   add_system_beep_event (g_BeepGenuine);
3549   add_system_beep_event (g_BeepSpam);
3550   add_system_beep_event (g_BeepUncertain);
3551 
3552   return B_OK;
3553 }
3554 
3555 
3556 /* Load the database if it hasn't been loaded yet.  Otherwise do nothing. */
3557 
3558 status_t ABSApp::LoadDatabaseIfNeeded (char *ErrorMessage)
3559 {
3560   if (m_WordMap.empty ())
3561     return LoadSaveDatabase (true /* DoLoad */, ErrorMessage);
3562 
3563   return B_OK;
3564 }
3565 
3566 
3567 /* Either load the database of spam words (DoLoad is TRUE) from the file
3568 specified in the settings, or write (DoLoad is FALSE) the database to it.  If
3569 it doesn't exist (and its parent directories do exist) then it will be created
3570 when saving.  If it doesn't exist when loading, the in-memory database will be
3571 set to an empty one and an error will be returned with an explanation put into
3572 ErrorMessage (should be big enough for a path name and a couple of lines of
3573 text).
3574 
3575 The database file format is a UTF-8 text file (well, there could be some
3576 latin-1 characters and other junk in there - it just copies the bytes from the
3577 e-mail messages directly), with tab characters to separate fields (so that you
3578 can also load it into a spreadsheet).  The first line identifies the overall
3579 file type.  The second lists pairs of classifications plus the number of
3580 messages in each class.  Currently it is just Genuine and Spam, but for future
3581 compatibility, that could be followed by more classification pairs.  The
3582 remaining lines each contain a word, the date it was last updated (actually
3583 it's the number of messages in the database when the word was added, smaller
3584 numbers mean it was updated longer ago), the genuine count and the spam count.
3585 */
3586 
3587 status_t ABSApp::LoadSaveDatabase (bool DoLoad, char *ErrorMessage)
3588 {
3589   time_t                             CurrentTime;
3590   FILE                              *DatabaseFile = NULL;
3591   BNode                              DatabaseNode;
3592   BNodeInfo                          DatabaseNodeInfo;
3593   StatisticsMap::iterator            DataIter;
3594   StatisticsMap::iterator            EndIter;
3595   status_t                           ErrorCode;
3596   int                                i;
3597   pair<StatisticsMap::iterator,bool> InsertResult;
3598   char                               LineString [10240];
3599   StatisticsRecord                   Statistics;
3600   const char                        *StringPntr;
3601   char                              *TabPntr;
3602   const char                        *WordPntr;
3603 
3604   if (DoLoad)
3605   {
3606     MakeDatabaseEmpty ();
3607     m_DatabaseHasChanged = false; /* In case of early error exit. */
3608   }
3609   else /* Saving the database, backup the old version on disk. */
3610   {
3611     ErrorCode = MakeBackup (ErrorMessage);
3612     if (ErrorCode != B_OK) /* Usually because the directory isn't there. */
3613       return ErrorCode;
3614   }
3615 
3616   DatabaseFile = fopen (m_DatabaseFileName.String (), DoLoad ? "rb" : "wb");
3617   if (DatabaseFile == NULL)
3618   {
3619     ErrorCode = errno;
3620     sprintf (ErrorMessage, "Can't open database file \"%s\" for %s",
3621       m_DatabaseFileName.String (), DoLoad ? "reading" : "writing");
3622     goto ErrorExit;
3623   }
3624 
3625   /* Process the first line, which identifies the file. */
3626 
3627   if (DoLoad)
3628   {
3629     sprintf (ErrorMessage, "Can't read first line of database file \"%s\", "
3630       "expected it to start with \"%s\"",
3631       m_DatabaseFileName.String (), g_DatabaseRecognitionString);
3632     ErrorCode = -1;
3633 
3634     if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3635       goto ErrorExit;
3636     if (strncmp (LineString, g_DatabaseRecognitionString,
3637     strlen (g_DatabaseRecognitionString)) != 0)
3638       goto ErrorExit;
3639   }
3640   else /* Saving */
3641   {
3642     CurrentTime = time (NULL);
3643     if (fprintf (DatabaseFile, "%s V1 (word, age, genuine count, spam count)\t"
3644     "Written by SpamDBM $Revision: 30630 $\t"
3645     "Compiled on " __DATE__ " at " __TIME__ "\tThis file saved on %s",
3646     g_DatabaseRecognitionString, ctime (&CurrentTime)) <= 0)
3647     {
3648       ErrorCode = errno;
3649       sprintf (ErrorMessage, "Problems when writing to database file \"%s\"",
3650         m_DatabaseFileName.String ());
3651       goto ErrorExit;
3652     }
3653   }
3654 
3655   /* The second line lists the different classifications.  We just check to see
3656   that the first two are Genuine and Spam.  If there are others, they'll be
3657   ignored and lost when the database is saved. */
3658 
3659   if (DoLoad)
3660   {
3661     sprintf (ErrorMessage, "Can't read second line of database file \"%s\", "
3662       "expected it to list classifications %s and %s along with their totals",
3663       m_DatabaseFileName.String (), g_ClassifiedGenuine, g_ClassifiedSpam);
3664     ErrorCode = B_BAD_VALUE;
3665 
3666     if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3667       goto ErrorExit;
3668     i = strlen (LineString);
3669     if (i > 0 && LineString[i-1] == '\n')
3670       LineString[i-1] = 0; /* Remove trailing line feed character. */
3671 
3672     /* Look for the title word at the start of the line. */
3673 
3674     TabPntr = LineString;
3675     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3676       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3677 
3678     if (strncmp (StringPntr, "Classifications", 15) != 0)
3679       goto ErrorExit;
3680 
3681     /* Look for the Genuine class and count. */
3682 
3683     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3684       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3685 
3686     if (strcmp (StringPntr, g_ClassifiedGenuine) != 0)
3687       goto ErrorExit;
3688 
3689     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3690       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3691 
3692     m_TotalGenuineMessages = atoll (StringPntr);
3693 
3694     /* Look for the Spam class and count. */
3695 
3696     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3697       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3698 
3699     if (strcmp (StringPntr, g_ClassifiedSpam) != 0)
3700       goto ErrorExit;
3701 
3702     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3703       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3704 
3705     m_TotalSpamMessages = atoll (StringPntr);
3706   }
3707   else /* Saving */
3708   {
3709     fprintf (DatabaseFile,
3710       "Classifications and total messages:\t%s\t%lu\t%s\t%lu\n",
3711       g_ClassifiedGenuine, m_TotalGenuineMessages,
3712       g_ClassifiedSpam, m_TotalSpamMessages);
3713   }
3714 
3715   /* The remainder of the file is the list of words and statistics.  Each line
3716   has a word, a tab, the time when the word was last changed in the database
3717   (sequence number of message addition, starts at 0 and goes up by one for each
3718   message added to the database), a tab then the number of messages in the
3719   first class (genuine) that had that word, then a tab, then the number of
3720   messages in the second class (spam) with that word, and so on. */
3721 
3722   if (DoLoad)
3723   {
3724     while (!feof (DatabaseFile))
3725     {
3726       if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3727       {
3728         ErrorCode = errno;
3729         if (feof (DatabaseFile))
3730           break;
3731         if (ErrorCode == B_OK)
3732           ErrorCode = -1;
3733         sprintf (ErrorMessage, "Error while reading words and statistics "
3734           "from database file \"%s\"", m_DatabaseFileName.String ());
3735         goto ErrorExit;
3736       }
3737 
3738       i = strlen (LineString);
3739       if (i > 0 && LineString[i-1] == '\n')
3740         LineString[i-1] = 0; /* Remove trailing line feed character. */
3741 
3742       /* Get the word at the start of the line, save in WordPntr. */
3743 
3744       TabPntr = LineString;
3745       for (WordPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3746         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3747 
3748       /* Get the date stamp.  Actually a sequence number, not a date. */
3749 
3750       for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3751         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3752 
3753       Statistics.age = atoll (StringPntr);
3754 
3755       /* Get the Genuine count. */
3756 
3757       for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3758         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3759 
3760       Statistics.genuineCount = atoll (StringPntr);
3761 
3762       /* Get the Spam count. */
3763 
3764       for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3765         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3766 
3767       Statistics.spamCount = atoll (StringPntr);
3768 
3769       /* Ignore empty words, totally unused words and ones which are too long
3770       (avoids lots of length checking everywhere). */
3771 
3772       if (WordPntr[0] == 0 || strlen (WordPntr) > g_MaxWordLength ||
3773       (Statistics.genuineCount <= 0 && Statistics.spamCount <= 0))
3774         continue; /* Ignore this line of text, start on next one. */
3775 
3776       /* Add the combination to the database. */
3777 
3778       InsertResult = m_WordMap.insert (
3779         StatisticsMap::value_type (WordPntr, Statistics));
3780       if (InsertResult.second == false)
3781       {
3782         ErrorCode = B_BAD_VALUE;
3783         sprintf (ErrorMessage, "Error while inserting word \"%s\" from "
3784           "database \"%s\", perhaps it is a duplicate",
3785           WordPntr, m_DatabaseFileName.String ());
3786         goto ErrorExit;
3787       }
3788       m_WordCount++;
3789 
3790       /* And the hunt for the oldest word. */
3791 
3792       if (Statistics.age < m_OldestAge)
3793         m_OldestAge = Statistics.age;
3794     }
3795   }
3796   else /* Saving, dump all words and statistics to the file. */
3797   {
3798     EndIter = m_WordMap.end ();
3799     for (DataIter = m_WordMap.begin (); DataIter != EndIter; DataIter++)
3800     {
3801       if (fprintf (DatabaseFile, "%s\t%lu\t%lu\t%lu\n",
3802       DataIter->first.c_str (), DataIter->second.age,
3803       DataIter->second.genuineCount, DataIter->second.spamCount) <= 0)
3804       {
3805         ErrorCode = errno;
3806         sprintf (ErrorMessage, "Error while writing word \"%s\" to "
3807           "database \"%s\"",
3808           DataIter->first.c_str(), m_DatabaseFileName.String ());
3809         goto ErrorExit;
3810       }
3811     }
3812   }
3813 
3814   /* Set the file type so that the new file gets associated with this program,
3815   and picks up the right icon. */
3816 
3817   if (!DoLoad)
3818   {
3819     sprintf (ErrorMessage, "Unable to set attributes (file type) of database "
3820       "file \"%s\"", m_DatabaseFileName.String ());
3821     ErrorCode = DatabaseNode.SetTo (m_DatabaseFileName.String ());
3822     if (ErrorCode != B_OK)
3823       goto ErrorExit;
3824     DatabaseNodeInfo.SetTo (&DatabaseNode);
3825     ErrorCode = DatabaseNodeInfo.SetType (g_ABSDatabaseFileMIMEType);
3826     if (ErrorCode != B_OK)
3827       goto ErrorExit;
3828   }
3829 
3830   /* Success! */
3831   m_DatabaseHasChanged = false;
3832   ErrorCode = B_OK;
3833 
3834 ErrorExit:
3835   if (DatabaseFile != NULL)
3836     fclose (DatabaseFile);
3837   return ErrorCode;
3838 }
3839 
3840 
3841 /* Either load the settings (DoLoad is TRUE) from the configuration file or
3842 write them (DoLoad is FALSE) to it.  The configuration file is a flattened
3843 BMessage containing the various program settings.  If it doesn't exist (and its
3844 parent directories don't exist) then it will be created when saving.  If it
3845 doesn't exist when loading, the settings will be set to default values. */
3846 
3847 status_t ABSApp::LoadSaveSettings (bool DoLoad)
3848 {
3849   status_t    ErrorCode;
3850   const char *NamePntr;
3851   BMessage    Settings;
3852   BDirectory  SettingsDirectory;
3853   BFile       SettingsFile;
3854   const char *StringPntr;
3855   bool        TempBool;
3856   int32       TempInt32;
3857   char        TempString [PATH_MAX + 100];
3858 
3859   /* Preset things to default values if loading, in case of an error or it's an
3860   older version of the settings file which doesn't have every field defined. */
3861 
3862   if (DoLoad)
3863     DefaultSettings ();
3864 
3865   /* Look for our settings directory.  When saving we can try to create it. */
3866 
3867   ErrorCode = SettingsDirectory.SetTo (m_SettingsDirectoryPath.Path ());
3868   if (ErrorCode != B_OK)
3869   {
3870     if (DoLoad || ErrorCode != B_ENTRY_NOT_FOUND)
3871     {
3872       sprintf (TempString, "Can't find settings directory \"%s\"",
3873         m_SettingsDirectoryPath.Path ());
3874       goto ErrorExit;
3875     }
3876     ErrorCode = create_directory (m_SettingsDirectoryPath.Path (), 0755);
3877     if (ErrorCode == B_OK)
3878       ErrorCode = SettingsDirectory.SetTo (m_SettingsDirectoryPath.Path ());
3879     if (ErrorCode != B_OK)
3880     {
3881       sprintf (TempString, "Can't create settings directory \"%s\"",
3882         m_SettingsDirectoryPath.Path ());
3883       goto ErrorExit;
3884     }
3885   }
3886 
3887   ErrorCode = SettingsFile.SetTo (&SettingsDirectory, g_SettingsFileName,
3888     DoLoad ? B_READ_ONLY : B_READ_WRITE | B_CREATE_FILE | B_ERASE_FILE);
3889   if (ErrorCode != B_OK)
3890   {
3891     sprintf (TempString, "Can't open settings file \"%s\" in directory \"%s\" "
3892       "for %s", g_SettingsFileName, m_SettingsDirectoryPath.Path(),
3893       DoLoad ? "reading" : "writing");
3894     goto ErrorExit;
3895   }
3896 
3897   if (DoLoad)
3898   {
3899     ErrorCode = Settings.Unflatten (&SettingsFile);
3900     if (ErrorCode != 0 || Settings.what != g_SettingsWhatCode)
3901     {
3902       sprintf (TempString, "Corrupt data detected while reading settings "
3903         "file \"%s\" in directory \"%s\", will revert to defaults",
3904         g_SettingsFileName, m_SettingsDirectoryPath.Path());
3905       goto ErrorExit;
3906     }
3907   }
3908 
3909   /* Transfer the settings between the BMessage and our various global
3910   variables.  For loading, if the setting isn't present, leave it at the
3911   default value.  Note that loading and saving are intermingled here to make
3912   code maintenance easier (less chance of forgetting to update it if load and
3913   save were separate functions). */
3914 
3915   ErrorCode = B_OK; /* So that saving settings can record an error. */
3916 
3917   NamePntr = "DatabaseFileName";
3918   if (DoLoad)
3919   {
3920     if (Settings.FindString (NamePntr, &StringPntr) == B_OK)
3921       m_DatabaseFileName.SetTo (StringPntr);
3922   }
3923   else if (ErrorCode == B_OK)
3924     ErrorCode = Settings.AddString (NamePntr, m_DatabaseFileName);
3925 
3926   NamePntr = "ServerMode";
3927   if (DoLoad)
3928   {
3929     if (Settings.FindBool (NamePntr, &TempBool) == B_OK)
3930       g_ServerMode = TempBool;
3931   }
3932   else if (ErrorCode == B_OK)
3933     ErrorCode = Settings.AddBool (NamePntr, g_ServerMode);
3934 
3935   NamePntr = "IgnorePreviousClassification";
3936   if (DoLoad)
3937   {
3938     if (Settings.FindBool (NamePntr, &TempBool) == B_OK)
3939       m_IgnorePreviousClassification = TempBool;
3940   }
3941   else if (ErrorCode == B_OK)
3942     ErrorCode = Settings.AddBool (NamePntr, m_IgnorePreviousClassification);
3943 
3944   NamePntr = "PurgeAge";
3945   if (DoLoad)
3946   {
3947     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3948       m_PurgeAge = TempInt32;
3949   }
3950   else if (ErrorCode == B_OK)
3951     ErrorCode = Settings.AddInt32 (NamePntr, m_PurgeAge);
3952 
3953   NamePntr = "PurgePopularity";
3954   if (DoLoad)
3955   {
3956     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3957       m_PurgePopularity = TempInt32;
3958   }
3959   else if (ErrorCode == B_OK)
3960     ErrorCode = Settings.AddInt32 (NamePntr, m_PurgePopularity);
3961 
3962   NamePntr = "ScoringMode";
3963   if (DoLoad)
3964   {
3965     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3966       m_ScoringMode = (ScoringModes) TempInt32;
3967     if (m_ScoringMode < 0 || m_ScoringMode >= SM_MAX)
3968       m_ScoringMode = (ScoringModes) 0;
3969   }
3970   else if (ErrorCode == B_OK)
3971     ErrorCode = Settings.AddInt32 (NamePntr, m_ScoringMode);
3972 
3973   NamePntr = "TokenizeMode";
3974   if (DoLoad)
3975   {
3976     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3977       m_TokenizeMode = (TokenizeModes) TempInt32;
3978     if (m_TokenizeMode < 0 || m_TokenizeMode >= TM_MAX)
3979       m_TokenizeMode = (TokenizeModes) 0;
3980   }
3981   else if (ErrorCode == B_OK)
3982     ErrorCode = Settings.AddInt32 (NamePntr, m_TokenizeMode);
3983 
3984   if (ErrorCode != B_OK)
3985   {
3986     strcpy (TempString, "Unable to stuff the program settings into a "
3987       "temporary BMessage, settings not saved");
3988     goto ErrorExit;
3989   }
3990 
3991   /* Save the settings BMessage to the settings file. */
3992 
3993   if (!DoLoad)
3994   {
3995     Settings.what = g_SettingsWhatCode;
3996     ErrorCode = Settings.Flatten (&SettingsFile);
3997     if (ErrorCode != 0)
3998     {
3999       sprintf (TempString, "Problems while writing settings file \"%s\" in "
4000         "directory \"%s\"", g_SettingsFileName,
4001         m_SettingsDirectoryPath.Path ());
4002       goto ErrorExit;
4003     }
4004   }
4005 
4006   m_SettingsHaveChanged = false;
4007   return B_OK;
4008 
4009 ErrorExit: /* Error message in TempString, code in ErrorCode. */
4010   DisplayErrorMessage (TempString, ErrorCode, DoLoad ?
4011     "Loading Settings Error" : "Saving Settings Error");
4012   return ErrorCode;
4013 }
4014 
4015 
4016 void
4017 ABSApp::MessageReceived (BMessage *MessagePntr)
4018 {
4019   const char           *PropertyName;
4020   struct property_info *PropInfoPntr;
4021   int32                 SpecifierIndex;
4022   int32                 SpecifierKind;
4023   BMessage              SpecifierMessage;
4024 
4025   /* See if it is a scripting message that applies to the database or one of
4026   the other operations this program supports.  Pass on other scripting messages
4027   to the inherited parent MessageReceived function (they're usually scripting
4028   messages for the BApplication). */
4029 
4030   switch (MessagePntr->what)
4031   {
4032     case B_GET_PROPERTY:
4033     case B_SET_PROPERTY:
4034     case B_COUNT_PROPERTIES:
4035     case B_CREATE_PROPERTY:
4036     case B_DELETE_PROPERTY:
4037     case B_EXECUTE_PROPERTY:
4038       if (MessagePntr->GetCurrentSpecifier (&SpecifierIndex, &SpecifierMessage,
4039       &SpecifierKind, &PropertyName) == B_OK &&
4040       SpecifierKind == B_DIRECT_SPECIFIER)
4041       {
4042         for (PropInfoPntr = g_ScriptingPropertyList + 0; true; PropInfoPntr++)
4043         {
4044           if (PropInfoPntr->name == 0)
4045             break; /* Ran out of commands. */
4046 
4047           if (PropInfoPntr->commands[0] == MessagePntr->what &&
4048           strcasecmp (PropInfoPntr->name, PropertyName) == 0)
4049           {
4050             ProcessScriptingMessage (MessagePntr, PropInfoPntr);
4051             return;
4052           }
4053         }
4054       }
4055       break;
4056   }
4057 
4058   /* Pass the unprocessed message to the inherited function, maybe it knows
4059   what to do.  This includes replies to messages we sent ourselves. */
4060 
4061   BApplication::MessageReceived (MessagePntr);
4062 }
4063 
4064 
4065 /* Rename the existing database file to a backup file name, potentially
4066 replacing an older backup.  If something goes wrong, returns an error code and
4067 puts an explanation in ErrorMessage. */
4068 
4069 status_t ABSApp::MakeBackup (char *ErrorMessage)
4070 {
4071   BEntry   Entry;
4072   status_t ErrorCode;
4073   int      i;
4074   char     LeafName [NAME_MAX];
4075   char     NewName [PATH_MAX+20];
4076   char     OldName [PATH_MAX+20];
4077 
4078   ErrorCode = Entry.SetTo (m_DatabaseFileName.String ());
4079   if (ErrorCode != B_OK)
4080   {
4081     sprintf (ErrorMessage, "While making backup, failed to make a BEntry for "
4082       "\"%s\" (maybe the directory doesn't exist?)",
4083       m_DatabaseFileName.String ());
4084     return ErrorCode;
4085   }
4086   if (!Entry.Exists ())
4087     return B_OK; /* No existing file to worry about overwriting. */
4088   Entry.GetName (LeafName);
4089 
4090   /* Find the first hole (no file) where we will stop the renaming chain. */
4091 
4092   for (i = 0; i < g_MaxBackups - 1; i++)
4093   {
4094     strcpy (OldName, m_DatabaseFileName.String ());
4095     sprintf (OldName + strlen (OldName), g_BackupSuffix, i);
4096     Entry.SetTo (OldName);
4097     if (!Entry.Exists ())
4098       break;
4099   }
4100 
4101   /* Move the files down by one to fill in the hole in the name series. */
4102 
4103   for (i--; i >= 0; i--)
4104   {
4105     strcpy (OldName, m_DatabaseFileName.String ());
4106     sprintf (OldName + strlen (OldName), g_BackupSuffix, i);
4107     Entry.SetTo (OldName);
4108     strcpy (NewName, LeafName);
4109     sprintf (NewName + strlen (NewName), g_BackupSuffix, i + 1);
4110     ErrorCode = Entry.Rename (NewName, true /* clobber */);
4111   }
4112 
4113   Entry.SetTo (m_DatabaseFileName.String ());
4114   strcpy (NewName, LeafName);
4115   sprintf (NewName + strlen (NewName), g_BackupSuffix, 0);
4116   ErrorCode = Entry.Rename (NewName, true /* clobber */);
4117   if (ErrorCode != B_OK)
4118     sprintf (ErrorMessage, "While making backup, failed to rename "
4119       "\"%s\" to \"%s\"", m_DatabaseFileName.String (), NewName);
4120 
4121   return ErrorCode;
4122 }
4123 
4124 
4125 void
4126 ABSApp::MakeDatabaseEmpty ()
4127 {
4128   m_WordMap.clear (); /* Sets the map to empty, deallocating any old data. */
4129   m_WordCount = 0;
4130   m_TotalGenuineMessages = 0;
4131   m_TotalSpamMessages = 0;
4132   m_OldestAge = (uint32) -1 /* makes largest number possible */;
4133 }
4134 
4135 
4136 /* Do what the scripting command says.  A reply message will be sent back with
4137 several fields: "error" containing the numerical error code (0 for success),
4138 "CommandText" with a text representation of the command, "result" with the
4139 resulting data for a get or count command.  If it isn't understood, then rather
4140 than a B_REPLY kind of message, it will be a B_MESSAGE_NOT_UNDERSTOOD message
4141 with an "error" number and a "message" string with a description. */
4142 
4143 void
4144 ABSApp::ProcessScriptingMessage (
4145   BMessage *MessagePntr,
4146   struct property_info *PropInfoPntr)
4147 {
4148   bool        ArgumentBool = false;
4149   bool        ArgumentGotBool = false;
4150   bool        ArgumentGotInt32 = false;
4151   bool        ArgumentGotString = false;
4152   int32       ArgumentInt32 = 0;
4153   const char *ArgumentString = NULL;
4154   BString     CommandText;
4155   status_t    ErrorCode;
4156   int         i;
4157   BMessage    ReplyMessage (B_MESSAGE_NOT_UNDERSTOOD);
4158   ssize_t     StringBufferSize;
4159   BMessage    TempBMessage;
4160   BPath       TempPath;
4161   char        TempString [PATH_MAX + 1024];
4162 
4163   if (g_QuitCountdown >= 0 && !g_CommandLineMode)
4164   {
4165     g_QuitCountdown = -1;
4166     cerr << "Quit countdown aborted due to a scripting command arriving.\n";
4167   }
4168 
4169   if (g_BusyCursor != NULL)
4170     SetCursor (g_BusyCursor);
4171 
4172   ErrorCode = MessagePntr->FindData (g_DataName, B_STRING_TYPE,
4173     (const void **) &ArgumentString, &StringBufferSize);
4174   if (ErrorCode == B_OK)
4175   {
4176     if (PropInfoPntr->extra_data != PN_EVALUATE_STRING &&
4177     PropInfoPntr->extra_data != PN_SPAM_STRING &&
4178     PropInfoPntr->extra_data != PN_GENUINE_STRING &&
4179     strlen (ArgumentString) >= PATH_MAX)
4180     {
4181       sprintf (TempString, "\"data\" string of a scripting message is too "
4182         "long, for SET %s action", PropInfoPntr->name);
4183       ErrorCode = B_NAME_TOO_LONG;
4184       goto ErrorExit;
4185     }
4186     ArgumentGotString = true;
4187   }
4188   else if (MessagePntr->FindBool (g_DataName, &ArgumentBool) == B_OK)
4189     ArgumentGotBool = true;
4190   else if (MessagePntr->FindInt32 (g_DataName, &ArgumentInt32) == B_OK)
4191     ArgumentGotInt32 = true;
4192 
4193   /* Prepare a Human readable description of the scripting command. */
4194 
4195   switch (PropInfoPntr->commands[0])
4196   {
4197     case B_SET_PROPERTY:
4198       CommandText.SetTo ("Set ");
4199       break;
4200 
4201     case B_GET_PROPERTY:
4202       CommandText.SetTo ("Get ");
4203       break;
4204 
4205     case B_COUNT_PROPERTIES:
4206       CommandText.SetTo ("Count ");
4207       break;
4208 
4209     case B_CREATE_PROPERTY:
4210       CommandText.SetTo ("Create ");
4211       break;
4212 
4213     case B_DELETE_PROPERTY:
4214       CommandText.SetTo ("Delete ");
4215       break;
4216 
4217     case B_EXECUTE_PROPERTY:
4218       CommandText.SetTo ("Execute ");
4219       break;
4220 
4221     default:
4222       sprintf (TempString, "Bug: scripting command for \"%s\" has an unknown "
4223         "action code %d", PropInfoPntr->name,
4224         (int) PropInfoPntr->commands[0]);
4225       ErrorCode = -1;
4226       goto ErrorExit;
4227   }
4228   CommandText.Append (PropInfoPntr->name);
4229 
4230   /* Add on the argument value to our readable command, if there is one. */
4231 
4232   if (ArgumentGotString)
4233   {
4234     CommandText.Append (" \"");
4235     CommandText.Append (ArgumentString);
4236     CommandText.Append ("\"");
4237   }
4238   if (ArgumentGotBool)
4239     CommandText.Append (ArgumentBool ? " true" : " false");
4240   if (ArgumentGotInt32)
4241   {
4242     sprintf (TempString, " %ld", ArgumentInt32);
4243     CommandText.Append (TempString);
4244   }
4245 
4246   /* From now on the scripting command has been recognized and is in the
4247   correct format, so it always returns a B_REPLY message.  A readable version
4248   of the command is also added to make debugging easier. */
4249 
4250   ReplyMessage.what = B_REPLY;
4251   ReplyMessage.AddString ("CommandText", CommandText);
4252 
4253   /* Now actually do the command.  First prepare a default error message. */
4254 
4255   sprintf (TempString, "Operation code %d (get, set, count, etc) "
4256     "unsupported for property %s",
4257     (int) PropInfoPntr->commands[0], PropInfoPntr->name);
4258   ErrorCode = B_BAD_INDEX;
4259 
4260   switch (PropInfoPntr->extra_data)
4261   {
4262     case PN_DATABASE_FILE:
4263       switch (PropInfoPntr->commands[0])
4264       {
4265         case B_GET_PROPERTY: /* Get the database file name. */
4266           ReplyMessage.AddString (g_ResultName, m_DatabaseFileName);
4267           break;
4268 
4269         case B_SET_PROPERTY: /* Set the database file name to a new one. */
4270           if (!ArgumentGotString)
4271           {
4272             ErrorCode = B_BAD_TYPE;
4273             sprintf (TempString, "You need to specify a string for the "
4274               "SET %s command", PropInfoPntr->name);
4275             goto ErrorExit;
4276           }
4277           ErrorCode = TempPath.SetTo (ArgumentString, NULL /* leaf */,
4278             true /* normalize - verifies parent directories exist */);
4279           if (ErrorCode != B_OK)
4280           {
4281             sprintf (TempString, "New database path name of \"%s\" is invalid "
4282               "(parent directories must exist)", ArgumentString);
4283             goto ErrorExit;
4284           }
4285           if ((ErrorCode = SaveDatabaseIfNeeded (TempString)) != B_OK)
4286             goto ErrorExit;
4287           MakeDatabaseEmpty (); /* So that the new one gets loaded if used. */
4288 
4289           if (strlen (TempPath.Leaf ()) > NAME_MAX-strlen(g_BackupSuffix)-1)
4290           {
4291             /* Truncate the name so that there is enough space for the backup
4292             extension.  Approximately. */
4293             strcpy (TempString, TempPath.Leaf ());
4294             TempString [NAME_MAX - strlen (g_BackupSuffix) - 1] = 0;
4295             TempPath.GetParent (&TempPath);
4296             TempPath.Append (TempString);
4297           }
4298           m_DatabaseFileName.SetTo (TempPath.Path ());
4299           m_SettingsHaveChanged = true;
4300           break;
4301 
4302         case B_CREATE_PROPERTY: /* Make a new database file plus more. */
4303           if ((ErrorCode = CreateDatabaseFile (TempString)) != B_OK)
4304             goto ErrorExit;
4305           break;
4306 
4307         case B_DELETE_PROPERTY: /* Delete the file and its backups too. */
4308           if ((ErrorCode = DeleteDatabaseFile (TempString)) != B_OK)
4309             goto ErrorExit;
4310           break;
4311 
4312         case B_COUNT_PROPERTIES:
4313           if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
4314             goto ErrorExit;
4315           ReplyMessage.AddInt32 (g_ResultName, m_WordCount);
4316           break;
4317 
4318         default: /* Unknown operation code, error message already set. */
4319           goto ErrorExit;
4320       }
4321       break;
4322 
4323     case PN_SPAM:
4324     case PN_SPAM_STRING:
4325     case PN_GENUINE:
4326     case PN_GENUINE_STRING:
4327     case PN_UNCERTAIN:
4328       switch (PropInfoPntr->commands[0])
4329       {
4330         case B_COUNT_PROPERTIES: /* Get the number of spam/genuine messages. */
4331           if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
4332             goto ErrorExit;
4333           if (PropInfoPntr->extra_data == PN_SPAM ||
4334           PropInfoPntr->extra_data == PN_SPAM_STRING)
4335             ReplyMessage.AddInt32 (g_ResultName, m_TotalSpamMessages);
4336           else
4337             ReplyMessage.AddInt32 (g_ResultName, m_TotalGenuineMessages);
4338           break;
4339 
4340         case B_SET_PROPERTY: /* Add spam/genuine/uncertain to database. */
4341           if (!ArgumentGotString)
4342           {
4343             ErrorCode = B_BAD_TYPE;
4344             sprintf (TempString, "You need to specify a string (%s) "
4345               "for the SET %s command",
4346               (PropInfoPntr->extra_data == PN_GENUINE_STRING ||
4347               PropInfoPntr->extra_data == PN_SPAM_STRING)
4348               ? "text of the message to be added"
4349               : "pathname of the file containing the text to be added",
4350               PropInfoPntr->name);
4351             goto ErrorExit;
4352           }
4353           if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
4354             goto ErrorExit;
4355           if (PropInfoPntr->extra_data == PN_GENUINE ||
4356           PropInfoPntr->extra_data == PN_SPAM ||
4357           PropInfoPntr->extra_data == PN_UNCERTAIN)
4358             ErrorCode = AddFileToDatabase (
4359               (PropInfoPntr->extra_data == PN_SPAM) ? CL_SPAM :
4360               ((PropInfoPntr->extra_data == PN_GENUINE) ? CL_GENUINE :
4361               CL_UNCERTAIN),
4362               ArgumentString, TempString /* ErrorMessage */);
4363           else
4364             ErrorCode = AddStringToDatabase (
4365               (PropInfoPntr->extra_data == PN_SPAM_STRING) ?
4366               CL_SPAM : CL_GENUINE,
4367               ArgumentString, TempString /* ErrorMessage */);
4368           if (ErrorCode != B_OK)
4369             goto ErrorExit;
4370           break;
4371 
4372         default: /* Unknown operation code, error message already set. */
4373           goto ErrorExit;
4374       }
4375       break;
4376 
4377     case PN_IGNORE_PREVIOUS_CLASSIFICATION:
4378       switch (PropInfoPntr->commands[0])
4379       {
4380         case B_GET_PROPERTY:
4381           ReplyMessage.AddBool (g_ResultName, m_IgnorePreviousClassification);
4382           break;
4383 
4384         case B_SET_PROPERTY:
4385           if (!ArgumentGotBool)
4386           {
4387             ErrorCode = B_BAD_TYPE;
4388             sprintf (TempString, "You need to specify a boolean (true/yes, "
4389               "false/no) for the SET %s command", PropInfoPntr->name);
4390             goto ErrorExit;
4391           }
4392           m_IgnorePreviousClassification = ArgumentBool;
4393           m_SettingsHaveChanged = true;
4394           break;
4395 
4396         default: /* Unknown operation code, error message already set. */
4397           goto ErrorExit;
4398       }
4399       break;
4400 
4401     case PN_SERVER_MODE:
4402       switch (PropInfoPntr->commands[0])
4403       {
4404         case B_GET_PROPERTY:
4405           ReplyMessage.AddBool (g_ResultName, g_ServerMode);
4406           break;
4407 
4408         case B_SET_PROPERTY:
4409           if (!ArgumentGotBool)
4410           {
4411             ErrorCode = B_BAD_TYPE;
4412             sprintf (TempString, "You need to specify a boolean (true/yes, "
4413               "false/no) for the SET %s command", PropInfoPntr->name);
4414             goto ErrorExit;
4415           }
4416           g_ServerMode = ArgumentBool;
4417           m_SettingsHaveChanged = true;
4418           break;
4419 
4420         default: /* Unknown operation code, error message already set. */
4421           goto ErrorExit;
4422       }
4423       break;
4424 
4425     case PN_FLUSH:
4426       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
4427       (ErrorCode = SaveDatabaseIfNeeded (TempString)) == B_OK)
4428         break;
4429       goto ErrorExit;
4430 
4431     case PN_PURGE_AGE:
4432       switch (PropInfoPntr->commands[0])
4433       {
4434         case B_GET_PROPERTY:
4435           ReplyMessage.AddInt32 (g_ResultName, m_PurgeAge);
4436           break;
4437 
4438         case B_SET_PROPERTY:
4439           if (!ArgumentGotInt32)
4440           {
4441             ErrorCode = B_BAD_TYPE;
4442             sprintf (TempString, "You need to specify a 32 bit integer "
4443               "for the SET %s command", PropInfoPntr->name);
4444             goto ErrorExit;
4445           }
4446           m_PurgeAge = ArgumentInt32;
4447           m_SettingsHaveChanged = true;
4448           break;
4449 
4450         default: /* Unknown operation code, error message already set. */
4451           goto ErrorExit;
4452       }
4453       break;
4454 
4455     case PN_PURGE_POPULARITY:
4456       switch (PropInfoPntr->commands[0])
4457       {
4458         case B_GET_PROPERTY:
4459           ReplyMessage.AddInt32 (g_ResultName, m_PurgePopularity);
4460           break;
4461 
4462         case B_SET_PROPERTY:
4463           if (!ArgumentGotInt32)
4464           {
4465             ErrorCode = B_BAD_TYPE;
4466             sprintf (TempString, "You need to specify a 32 bit integer "
4467               "for the SET %s command", PropInfoPntr->name);
4468             goto ErrorExit;
4469           }
4470           m_PurgePopularity = ArgumentInt32;
4471           m_SettingsHaveChanged = true;
4472           break;
4473 
4474         default: /* Unknown operation code, error message already set. */
4475           goto ErrorExit;
4476       }
4477       break;
4478 
4479     case PN_PURGE:
4480       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
4481       (ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK &&
4482       (ErrorCode = PurgeOldWords (TempString)) == B_OK)
4483         break;
4484       goto ErrorExit;
4485 
4486     case PN_OLDEST:
4487       if (PropInfoPntr->commands[0] == B_GET_PROPERTY &&
4488       (ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK)
4489       {
4490         ReplyMessage.AddInt32 (g_ResultName, m_OldestAge);
4491         break;
4492       }
4493       goto ErrorExit;
4494 
4495     case PN_EVALUATE:
4496     case PN_EVALUATE_STRING:
4497       if (PropInfoPntr->commands[0] == B_SET_PROPERTY)
4498       {
4499         if (!ArgumentGotString)
4500         {
4501           ErrorCode = B_BAD_TYPE;
4502           sprintf (TempString, "You need to specify a string for the "
4503             "SET %s command", PropInfoPntr->name);
4504           goto ErrorExit;
4505         }
4506         if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK)
4507         {
4508           if (PropInfoPntr->extra_data == PN_EVALUATE)
4509           {
4510             if ((ErrorCode = EvaluateFile (ArgumentString, &ReplyMessage,
4511             TempString)) == B_OK)
4512               break;
4513           }
4514           else /* PN_EVALUATE_STRING */
4515           {
4516             if ((ErrorCode = EvaluateString (ArgumentString, StringBufferSize,
4517             &ReplyMessage, TempString)) == B_OK)
4518               break;
4519           }
4520         }
4521       }
4522       goto ErrorExit;
4523 
4524     case PN_RESET_TO_DEFAULTS:
4525       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY)
4526       {
4527         DefaultSettings ();
4528         break;
4529       }
4530       goto ErrorExit;
4531 
4532     case PN_INSTALL_THINGS:
4533       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
4534       (ErrorCode = InstallThings (TempString)) == B_OK)
4535         break;
4536       goto ErrorExit;
4537 
4538     case PN_SCORING_MODE:
4539       switch (PropInfoPntr->commands[0])
4540       {
4541         case B_GET_PROPERTY:
4542           ReplyMessage.AddString (g_ResultName,
4543             g_ScoringModeNames[m_ScoringMode]);
4544           break;
4545 
4546         case B_SET_PROPERTY:
4547           i = SM_MAX;
4548           if (ArgumentGotString)
4549             for (i = 0; i < SM_MAX; i++)
4550             {
4551               if (strcasecmp (ArgumentString, g_ScoringModeNames [i]) == 0)
4552               {
4553                 m_ScoringMode = (ScoringModes) i;
4554                 m_SettingsHaveChanged = true;
4555                 break;
4556               }
4557             }
4558           if (i >= SM_MAX) /* Didn't find a valid scoring mode word. */
4559           {
4560             ErrorCode = B_BAD_TYPE;
4561             sprintf (TempString, "You used the unrecognized \"%s\" as "
4562               "a scoring mode for the SET %s command.  Should be one of: ",
4563               ArgumentGotString ? ArgumentString : "not specified",
4564               PropInfoPntr->name);
4565             for (i = 0; i < SM_MAX; i++)
4566             {
4567               strcat (TempString, g_ScoringModeNames [i]);
4568               if (i < SM_MAX - 1)
4569                 strcat (TempString, ", ");
4570             }
4571             goto ErrorExit;
4572           }
4573           break;
4574 
4575         default: /* Unknown operation code, error message already set. */
4576           goto ErrorExit;
4577       }
4578       break;
4579 
4580     case PN_TOKENIZE_MODE:
4581       switch (PropInfoPntr->commands[0])
4582       {
4583         case B_GET_PROPERTY:
4584           ReplyMessage.AddString (g_ResultName,
4585             g_TokenizeModeNames[m_TokenizeMode]);
4586           break;
4587 
4588         case B_SET_PROPERTY:
4589           i = TM_MAX;
4590           if (ArgumentGotString)
4591             for (i = 0; i < TM_MAX; i++)
4592             {
4593               if (strcasecmp (ArgumentString, g_TokenizeModeNames [i]) == 0)
4594               {
4595                 m_TokenizeMode = (TokenizeModes) i;
4596                 m_SettingsHaveChanged = true;
4597                 break;
4598               }
4599             }
4600           if (i >= TM_MAX) /* Didn't find a valid tokenize mode word. */
4601           {
4602             ErrorCode = B_BAD_TYPE;
4603             sprintf (TempString, "You used the unrecognized \"%s\" as "
4604               "a tokenize mode for the SET %s command.  Should be one of: ",
4605               ArgumentGotString ? ArgumentString : "not specified",
4606               PropInfoPntr->name);
4607             for (i = 0; i < TM_MAX; i++)
4608             {
4609               strcat (TempString, g_TokenizeModeNames [i]);
4610               if (i < TM_MAX - 1)
4611                 strcat (TempString, ", ");
4612             }
4613             goto ErrorExit;
4614           }
4615           break;
4616 
4617         default: /* Unknown operation code, error message already set. */
4618           goto ErrorExit;
4619       }
4620       break;
4621 
4622     default:
4623       sprintf (TempString, "Bug!  Unrecognized property identification "
4624         "number %d (should be between 0 and %d).  Fix the entry in "
4625         "the g_ScriptingPropertyList array!",
4626         (int) PropInfoPntr->extra_data, PN_MAX - 1);
4627       goto ErrorExit;
4628   }
4629 
4630   /* Success. */
4631 
4632   ReplyMessage.AddInt32 ("error", B_OK);
4633   ErrorCode = MessagePntr->SendReply (&ReplyMessage,
4634     this /* Reply's reply handler */, 500000 /* send timeout */);
4635   if (ErrorCode != B_OK)
4636     cerr << "ProcessScriptingMessage failed to send a reply message, code " <<
4637     ErrorCode << " (" << strerror (ErrorCode) << ")" << " for " <<
4638     CommandText.String () << endl;
4639   SetCursor (B_CURSOR_SYSTEM_DEFAULT);
4640   return;
4641 
4642 ErrorExit: /* Error message in TempString, return code in ErrorCode. */
4643   ReplyMessage.AddInt32 ("error", ErrorCode);
4644   ReplyMessage.AddString ("message", TempString);
4645   DisplayErrorMessage (TempString, ErrorCode);
4646   ErrorCode = MessagePntr->SendReply (&ReplyMessage,
4647     this /* Reply's reply handler */, 500000 /* send timeout */);
4648   if (ErrorCode != B_OK)
4649     cerr << "ProcessScriptingMessage failed to send an error message, code " <<
4650     ErrorCode << " (" << strerror (ErrorCode) << ")" << " for " <<
4651     CommandText.String () << endl;
4652   SetCursor (B_CURSOR_SYSTEM_DEFAULT);
4653 }
4654 
4655 
4656 /* Since quitting stops the program before the results of a script command are
4657 received, we use a time delay to do the quit and make sure there are no pending
4658 commands being processed by the auxiliary looper which is sending us commands.
4659 Also, we have a countdown which can be interrupted by an incoming scripting
4660 message in case one client tells us to quit while another one is still using us
4661 (happens when you have two or more e-mail accounts).  But if the system is
4662 shutting down, quit immediately! */
4663 
4664 void
4665 ABSApp::Pulse ()
4666 {
4667   if (g_QuitCountdown == 0)
4668   {
4669     if (g_CommanderLooperPntr == NULL ||
4670     !g_CommanderLooperPntr->IsBusy ())
4671       PostMessage (B_QUIT_REQUESTED);
4672   }
4673   else if (g_QuitCountdown > 0)
4674   {
4675     cerr << "SpamDBM quitting in " << g_QuitCountdown << ".\n";
4676     g_QuitCountdown--;
4677   }
4678 }
4679 
4680 
4681 /* A quit request message has come in.  If the quit countdown has reached zero,
4682 allow the request, otherwise reject it (and start the countdown if it hasn't
4683 been started). */
4684 
4685 bool
4686 ABSApp::QuitRequested ()
4687 {
4688   BMessage  *QuitMessage;
4689   team_info  RemoteInfo;
4690   BMessenger RemoteMessenger;
4691   team_id    RemoteTeam;
4692 
4693   /* See if the quit is from the system shutdown command (which goes through
4694   the registrar server), if so, quit immediately. */
4695 
4696   QuitMessage = CurrentMessage ();
4697   if (QuitMessage != NULL && QuitMessage->IsSourceRemote ())
4698   {
4699     RemoteMessenger = QuitMessage->ReturnAddress ();
4700     RemoteTeam = RemoteMessenger.Team ();
4701     if (get_team_info (RemoteTeam, &RemoteInfo) == B_OK &&
4702     strstr (RemoteInfo.args, "registrar") != NULL)
4703       g_QuitCountdown = 0;
4704   }
4705 
4706   if (g_QuitCountdown == 0)
4707     return BApplication::QuitRequested ();
4708 
4709   if (g_QuitCountdown < 0)
4710 //    g_QuitCountdown = 10; /* Start the countdown. */
4711     g_QuitCountdown = 5; /* Quit more quickly */
4712 
4713   return false;
4714 }
4715 
4716 
4717 /* Go through the current database and delete words which are too old (time is
4718 equivalent to the number of messages added to the database) and too unpopular
4719 (words not used by many messages).  Hopefully this will get rid of words which
4720 are just hunks of binary or other garbage.  The database has been loaded
4721 elsewhere. */
4722 
4723 status_t
4724 ABSApp::PurgeOldWords (char *ErrorMessage)
4725 {
4726   uint32                  CurrentTime;
4727   StatisticsMap::iterator CurrentIter;
4728   StatisticsMap::iterator EndIter;
4729   StatisticsMap::iterator NextIter;
4730   char                    TempString [80];
4731 
4732   strcpy (ErrorMessage, "Purge can't fail"); /* So argument gets used. */
4733   CurrentTime = m_TotalGenuineMessages + m_TotalSpamMessages - 1;
4734   m_OldestAge = (uint32) -1 /* makes largest number possible */;
4735 
4736   EndIter = m_WordMap.end ();
4737   NextIter = m_WordMap.begin ();
4738   while (NextIter != EndIter) {
4739     CurrentIter = NextIter++;
4740 
4741     if (CurrentTime - CurrentIter->second.age >= m_PurgeAge &&
4742     CurrentIter->second.genuineCount + CurrentIter->second.spamCount <=
4743     m_PurgePopularity) {
4744       /* Delete this word, it is unpopular and old.  Sob. */
4745 
4746       m_WordMap.erase (CurrentIter);
4747       if (m_WordCount > 0)
4748         m_WordCount--;
4749 
4750       m_DatabaseHasChanged = true;
4751     }
4752     else /* This word is still in the database.  Update oldest age. */
4753     {
4754       if (CurrentIter->second.age < m_OldestAge)
4755         m_OldestAge = CurrentIter->second.age;
4756     }
4757   }
4758 
4759   /* Just a little bug check here.  Just in case. */
4760 
4761   if (m_WordCount != m_WordMap.size ()) {
4762     sprintf (TempString, "Our word count of %lu doesn't match the "
4763       "size of the database, %lu", m_WordCount, m_WordMap.size());
4764     DisplayErrorMessage (TempString, -1, "Bug!");
4765     m_WordCount = m_WordMap.size ();
4766   }
4767 
4768   return B_OK;
4769 }
4770 
4771 
4772 void
4773 ABSApp::ReadyToRun ()
4774 {
4775   DatabaseWindow *DatabaseWindowPntr;
4776   float           JunkFloat;
4777   BButton        *TempButtonPntr;
4778   BCheckBox      *TempCheckBoxPntr;
4779   font_height     TempFontHeight;
4780   BMenuBar       *TempMenuBarPntr;
4781   BMenuItem      *TempMenuItemPntr;
4782   BPopUpMenu     *TempPopUpMenuPntr;
4783   BRadioButton   *TempRadioButtonPntr;
4784   BRect           TempRect;
4785   const char     *TempString = "Testing My Things";
4786   BStringView    *TempStringViewPntr;
4787   BTextControl   *TempTextPntr;
4788   BWindow        *TempWindowPntr;
4789 
4790   /* This batch of code gets some measurements which will be used for laying
4791   out controls and other GUI elements.  Set the spacing between buttons and
4792   other controls to the width of the letter "M" in the user's desired font. */
4793 
4794  g_MarginBetweenControls = (int) be_plain_font->StringWidth ("M");
4795 
4796   /* Also find out how much space a line of text uses. */
4797 
4798   be_plain_font->GetHeight (&TempFontHeight);
4799   g_LineOfTextHeight = ceilf (
4800     TempFontHeight.ascent + TempFontHeight.descent + TempFontHeight.leading);
4801 
4802   /* Start finding out the height of various user interface gadgets, which can
4803   vary based on the current font size.  Make a temporary gadget, which is
4804   attached to our window, then resize it to its prefered size so that it
4805   accomodates the font size and other frills it needs. */
4806 
4807   TempWindowPntr = new (std::nothrow) BWindow (BRect (10, 20, 200, 200),
4808 	"Temporary Window", B_DOCUMENT_WINDOW,
4809 	B_NO_WORKSPACE_ACTIVATION | B_ASYNCHRONOUS_CONTROLS);
4810   if (TempWindowPntr == NULL) {
4811     DisplayErrorMessage ("Unable to create temporary window for finding "
4812       "sizes of controls.");
4813     g_QuitCountdown = 0;
4814     return;
4815   }
4816 
4817   TempRect = TempWindowPntr->Bounds ();
4818 
4819   /* Find the height of a single line of text in a BStringView. */
4820 
4821   TempStringViewPntr = new (std::nothrow) BStringView (TempRect, TempString, TempString);
4822   if (TempStringViewPntr != NULL) {
4823     TempWindowPntr->Lock();
4824     TempWindowPntr->AddChild (TempStringViewPntr);
4825     TempStringViewPntr->GetPreferredSize (&JunkFloat, &g_StringViewHeight);
4826     TempWindowPntr->RemoveChild (TempStringViewPntr);
4827     TempWindowPntr->Unlock();
4828     delete TempStringViewPntr;
4829   }
4830 
4831   /* Find the height of a button, which seems to be larger than a text
4832   control and can make life difficult.  Make a temporary button, which
4833   is attached to our window so that it resizes to accomodate the font size. */
4834 
4835   TempButtonPntr = new (std::nothrow) BButton (TempRect, TempString, TempString, NULL);
4836   if (TempButtonPntr != NULL) {
4837     TempWindowPntr->Lock();
4838     TempWindowPntr->AddChild (TempButtonPntr);
4839     TempButtonPntr->GetPreferredSize (&JunkFloat, &g_ButtonHeight);
4840     TempWindowPntr->RemoveChild (TempButtonPntr);
4841     TempWindowPntr->Unlock();
4842     delete TempButtonPntr;
4843   }
4844 
4845   /* Find the height of a text box. */
4846 
4847   TempTextPntr = new (std::nothrow) BTextControl (TempRect, TempString, NULL /* label */,
4848     TempString, NULL);
4849   if (TempTextPntr != NULL) {
4850     TempWindowPntr->Lock ();
4851     TempWindowPntr->AddChild (TempTextPntr);
4852     TempTextPntr->GetPreferredSize (&JunkFloat, &g_TextBoxHeight);
4853     TempWindowPntr->RemoveChild (TempTextPntr);
4854     TempWindowPntr->Unlock ();
4855     delete TempTextPntr;
4856   }
4857 
4858   /* Find the height of a checkbox control. */
4859 
4860   TempCheckBoxPntr = new (std::nothrow) BCheckBox (TempRect, TempString, TempString, NULL);
4861   if (TempCheckBoxPntr != NULL) {
4862     TempWindowPntr->Lock ();
4863     TempWindowPntr->AddChild (TempCheckBoxPntr);
4864     TempCheckBoxPntr->GetPreferredSize (&JunkFloat, &g_CheckBoxHeight);
4865     TempWindowPntr->RemoveChild (TempCheckBoxPntr);
4866     TempWindowPntr->Unlock ();
4867     delete TempCheckBoxPntr;
4868   }
4869 
4870   /* Find the height of a radio button control. */
4871 
4872   TempRadioButtonPntr =
4873     new (std::nothrow) BRadioButton (TempRect, TempString, TempString, NULL);
4874   if (TempRadioButtonPntr != NULL) {
4875     TempWindowPntr->Lock ();
4876     TempWindowPntr->AddChild (TempRadioButtonPntr);
4877     TempRadioButtonPntr->GetPreferredSize (&JunkFloat, &g_RadioButtonHeight);
4878     TempWindowPntr->RemoveChild (TempRadioButtonPntr);
4879     TempWindowPntr->Unlock ();
4880     delete TempRadioButtonPntr;
4881   }
4882 
4883   /* Find the height of a pop-up menu. */
4884 
4885   TempMenuBarPntr = new (std::nothrow) BMenuBar (TempRect, TempString,
4886     B_FOLLOW_LEFT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
4887     true /* resize to fit items */);
4888   TempPopUpMenuPntr = new (std::nothrow) BPopUpMenu (TempString);
4889   TempMenuItemPntr = new (std::nothrow) BMenuItem (TempString, new BMessage (12345), 'g');
4890 
4891   if (TempMenuBarPntr != NULL && TempPopUpMenuPntr != NULL &&
4892   TempMenuItemPntr != NULL)
4893   {
4894     TempPopUpMenuPntr->AddItem (TempMenuItemPntr);
4895     TempMenuBarPntr->AddItem (TempPopUpMenuPntr);
4896 
4897     TempWindowPntr->Lock ();
4898     TempWindowPntr->AddChild (TempMenuBarPntr);
4899     TempMenuBarPntr->GetPreferredSize (&JunkFloat, &g_PopUpMenuHeight);
4900     TempWindowPntr->RemoveChild (TempMenuBarPntr);
4901     TempWindowPntr->Unlock ();
4902     delete TempMenuBarPntr; // It will delete contents too.
4903   }
4904 
4905   TempWindowPntr->Lock ();
4906   TempWindowPntr->Quit ();
4907 
4908   SetPulseRate (500000);
4909 
4910   if (g_CommandLineMode)
4911     g_QuitCountdown = 0; /* Quit as soon as queued up commands done. */
4912   else /* GUI mode, make a window. */
4913   {
4914     DatabaseWindowPntr = new (std::nothrow) DatabaseWindow ();
4915     if (DatabaseWindowPntr == NULL) {
4916       DisplayErrorMessage ("Unable to create window.");
4917       g_QuitCountdown = 0;
4918     } else {
4919       DatabaseWindowPntr->Show (); /* Starts the window's message loop. */
4920     }
4921   }
4922 
4923   g_AppReadyToRunCompleted = true;
4924 }
4925 
4926 
4927 /* Given a mail component (body text, attachment, whatever), look for words in
4928 it.  If the tokenize mode specifies that it isn't one of the ones we are
4929 looking for, just skip it.  For container type components, recursively examine
4930 their contents, up to the maximum depth specified. */
4931 
4932 status_t
4933 ABSApp::RecursivelyTokenizeMailComponent (
4934   BMailComponent *ComponentPntr,
4935   const char *OptionalFileName,
4936   set<string> &WordSet,
4937   char *ErrorMessage,
4938   int RecursionLevel,
4939   int MaxRecursionLevel)
4940 {
4941   char                        AttachmentName [B_FILE_NAME_LENGTH];
4942   BMailAttachment            *AttachmentPntr;
4943   BMimeType                   ComponentMIMEType;
4944   BMailContainer             *ContainerPntr;
4945   BMallocIO                   ContentsIO;
4946   const char                 *ContentsBufferPntr;
4947   size_t                      ContentsBufferSize;
4948   status_t                    ErrorCode;
4949   bool                        ExamineComponent;
4950   const char                 *HeaderKeyPntr;
4951   const char                 *HeaderValuePntr;
4952   int                         i;
4953   int                         j;
4954   const char                 *NameExtension;
4955   int                         NumComponents;
4956   BMimeType                   TextAnyMIMEType ("text");
4957   BMimeType                   TextPlainMIMEType ("text/plain");
4958 
4959   if (ComponentPntr == NULL)
4960     return B_OK;
4961 
4962   /* Add things in the sub-headers that might be useful.  Things like the file
4963   name of attachments, the encoding type, etc. */
4964 
4965   if (m_TokenizeMode == TM_PLAIN_TEXT_HEADER ||
4966   m_TokenizeMode == TM_ANY_TEXT_HEADER ||
4967   m_TokenizeMode == TM_ALL_PARTS_HEADER ||
4968   m_TokenizeMode == TM_JUST_HEADER)
4969   {
4970     for (i = 0; i < 1000; i++)
4971     {
4972       HeaderKeyPntr = ComponentPntr->HeaderAt (i);
4973       if (HeaderKeyPntr == NULL)
4974         break;
4975       AddWordsToSet (HeaderKeyPntr, strlen (HeaderKeyPntr),
4976         'H' /* Prefix for Headers, uppercase unlike normal words. */, WordSet);
4977       for (j = 0; j < 1000; j++)
4978       {
4979         HeaderValuePntr = ComponentPntr->HeaderField (HeaderKeyPntr, j);
4980         if (HeaderValuePntr == NULL)
4981           break;
4982         AddWordsToSet (HeaderValuePntr, strlen (HeaderValuePntr),
4983           'H', WordSet);
4984       }
4985     }
4986   }
4987 
4988   /* Check the MIME type of the thing.  It's used to decide if the contents are
4989   worth examining for words. */
4990 
4991   ErrorCode = ComponentPntr->MIMEType (&ComponentMIMEType);
4992   if (ErrorCode != B_OK)
4993   {
4994     sprintf (ErrorMessage, "ABSApp::RecursivelyTokenizeMailComponent: "
4995       "Unable to get MIME type at level %d in \"%s\"",
4996       RecursionLevel, OptionalFileName);
4997     return ErrorCode;
4998   }
4999   if (ComponentMIMEType.Type() == NULL)
5000   {
5001     /* Have to make up a MIME type for things which don't have them, such as
5002     the main body text, otherwise it would get ignored. */
5003 
5004     if (NULL != dynamic_cast<BTextMailComponent *>(ComponentPntr))
5005       ComponentMIMEType.SetType ("text/plain");
5006   }
5007   if (!TextAnyMIMEType.Contains (&ComponentMIMEType) &&
5008   NULL != (AttachmentPntr = dynamic_cast<BMailAttachment *>(ComponentPntr)))
5009   {
5010     /* Sometimes spam doesn't give a text MIME type for text when they do an
5011     attachment (which is often base64 encoded).  Use the file name extension to
5012     see if it really is text. */
5013     NameExtension = NULL;
5014     if (AttachmentPntr->FileName (AttachmentName) >= 0)
5015       NameExtension = strrchr (AttachmentName, '.');
5016     if (NameExtension != NULL)
5017     {
5018       if (strcasecmp (NameExtension, ".txt") == 0)
5019         ComponentMIMEType.SetType ("text/plain");
5020       else if (strcasecmp (NameExtension, ".htm") == 0 ||
5021       strcasecmp (NameExtension, ".html") == 0)
5022         ComponentMIMEType.SetType ("text/html");
5023     }
5024   }
5025 
5026   switch (m_TokenizeMode)
5027   {
5028     case TM_PLAIN_TEXT:
5029     case TM_PLAIN_TEXT_HEADER:
5030       ExamineComponent = TextPlainMIMEType.Contains (&ComponentMIMEType);
5031       break;
5032 
5033     case TM_ANY_TEXT:
5034     case TM_ANY_TEXT_HEADER:
5035       ExamineComponent = TextAnyMIMEType.Contains (&ComponentMIMEType);
5036       break;
5037 
5038     case TM_ALL_PARTS:
5039     case TM_ALL_PARTS_HEADER:
5040       ExamineComponent = true;
5041       break;
5042 
5043     default:
5044       ExamineComponent = false;
5045       break;
5046   }
5047 
5048   if (ExamineComponent)
5049   {
5050     /* Get the contents of the component.  This will be UTF-8 text (converted
5051     from whatever encoding was used) for text attachments.  For other ones,
5052     it's just the raw data, or perhaps decoded from base64 encoding. */
5053 
5054     ContentsIO.SetBlockSize (16 * 1024);
5055     ErrorCode = ComponentPntr->GetDecodedData (&ContentsIO);
5056     if (ErrorCode == B_OK) /* Can fail for container components: no data. */
5057     {
5058       /* Look for words in the decoded data. */
5059 
5060       ContentsBufferPntr = (const char *) ContentsIO.Buffer ();
5061       ContentsBufferSize = ContentsIO.BufferLength ();
5062       if (ContentsBufferPntr != NULL /* can be empty */)
5063         AddWordsToSet (ContentsBufferPntr, ContentsBufferSize,
5064           0 /* no prefix character, this is body text */, WordSet);
5065     }
5066   }
5067 
5068   /* Examine any sub-components in the message. */
5069 
5070   if (RecursionLevel + 1 <= MaxRecursionLevel &&
5071   NULL != (ContainerPntr = dynamic_cast<BMailContainer *>(ComponentPntr)))
5072   {
5073     NumComponents = ContainerPntr->CountComponents ();
5074 
5075     for (i = 0; i < NumComponents; i++)
5076     {
5077       ComponentPntr = ContainerPntr->GetComponent (i);
5078 
5079       ErrorCode = RecursivelyTokenizeMailComponent (ComponentPntr,
5080         OptionalFileName, WordSet, ErrorMessage, RecursionLevel + 1,
5081         MaxRecursionLevel);
5082       if (ErrorCode != B_OK)
5083         break;
5084     }
5085   }
5086 
5087   return ErrorCode;
5088 }
5089 
5090 
5091 /* The user has tried to open a file or several files with this application,
5092 via Tracker's open-with menu item.  If it is a database type file, then change
5093 the database file name to it.  Otherwise, ask the user whether they want to
5094 classify it as spam or non-spam.  There will be at most around 100 files, BeOS
5095 R5.0.3's Tracker crashes if it tries to pass on more than that many using Open
5096 With... etc.  The command is sent to an intermediary thread where it is
5097 asynchronously converted into a scripting message(s) that are sent back to this
5098 BApplication.  The intermediary is needed since we can't recursively execute
5099 scripting messages while processing a message (this RefsReceived one). */
5100 
5101 void
5102 ABSApp::RefsReceived (BMessage *MessagePntr)
5103 {
5104   if (g_CommanderLooperPntr != NULL)
5105     g_CommanderLooperPntr->CommandReferences (MessagePntr);
5106 }
5107 
5108 
5109 /* A scripting command is looking for something to execute it.  See if it is
5110 targetted at our database. */
5111 
5112 BHandler * ABSApp::ResolveSpecifier (
5113   BMessage *MessagePntr,
5114   int32 Index,
5115   BMessage *SpecifierMsgPntr,
5116   int32 SpecificationKind,
5117   const char *PropertyPntr)
5118 {
5119   int i;
5120 
5121   /* See if it is one of our commands. */
5122 
5123   if (SpecificationKind == B_DIRECT_SPECIFIER)
5124   {
5125     for (i = PN_MAX - 1; i >= 0; i--)
5126     {
5127       if (strcasecmp (PropertyPntr, g_PropertyNames [i]) == 0)
5128         return this; /* Found it!  Return the Handler (which is us). */
5129     }
5130   }
5131 
5132   /* Handle an unrecognized scripting command, let the parent figure it out. */
5133 
5134   return BApplication::ResolveSpecifier (
5135     MessagePntr, Index, SpecifierMsgPntr, SpecificationKind, PropertyPntr);
5136 }
5137 
5138 
5139 /* Save the database if it hasn't been saved yet.  Otherwise do nothing. */
5140 
5141 status_t ABSApp::SaveDatabaseIfNeeded (char *ErrorMessage)
5142 {
5143   if (m_DatabaseHasChanged)
5144     return LoadSaveDatabase (false /* DoLoad */, ErrorMessage);
5145 
5146   return B_OK;
5147 }
5148 
5149 
5150 /* Presumably the file is an e-mail message (or at least the header portion of
5151 one).  Break it into parts: header, body and MIME components.  Then add the
5152 words in the portions that match the current tokenization settings to the set
5153 of words. */
5154 
5155 status_t ABSApp::TokenizeParts (
5156   BPositionIO *PositionIOPntr,
5157   const char *OptionalFileName,
5158   set<string> &WordSet,
5159   char *ErrorMessage)
5160 {
5161   status_t        ErrorCode = B_OK;
5162   BEmailMessage   WholeEMail;
5163 
5164   sprintf (ErrorMessage, "ABSApp::TokenizeParts: While getting e-mail "
5165     "headers, had problems with \"%s\"", OptionalFileName);
5166 
5167   ErrorCode = WholeEMail.SetToRFC822 (
5168     PositionIOPntr /* it does its own seeking to the start */,
5169     -1 /* length */, true /* parse_now */);
5170   if (ErrorCode < 0) goto ErrorExit;
5171 
5172   ErrorCode = RecursivelyTokenizeMailComponent (&WholeEMail,
5173     OptionalFileName, WordSet, ErrorMessage, 0 /* Initial recursion level */,
5174     (m_TokenizeMode == TM_JUST_HEADER) ? 0 : 500 /* Max recursion level */);
5175 
5176 ErrorExit:
5177   return ErrorCode;
5178 }
5179 
5180 
5181 /* Add all the words in the whole file or memory buffer to the supplied set.
5182 The file doesn't have to be an e-mail message since it isn't parsed for e-mail
5183 headers or MIME headers or anything.  It blindly adds everything that looks
5184 like a word, though it does convert quoted printable codes to the characters
5185 they represent.  See also AddWordsToSet which does something more advanced. */
5186 
5187 status_t ABSApp::TokenizeWhole (
5188   BPositionIO *PositionIOPntr,
5189   const char *OptionalFileName,
5190   set<string> &WordSet,
5191   char *ErrorMessage)
5192 {
5193   string                AccumulatedWord;
5194   uint8                 Buffer [16 * 1024];
5195   uint8                *BufferCurrentPntr = Buffer + 0;
5196   uint8                *BufferEndPntr = Buffer + 0;
5197   const char           *IOErrorString =
5198                           "TokenizeWhole: Error %ld while reading \"%s\"";
5199   size_t                Length;
5200   int                   Letter = ' ';
5201   char                  HexString [4];
5202   int                   NextLetter = ' ';
5203   int                   NextNextLetter = ' ';
5204 
5205   /* Use a buffer since reading single characters from a BFile is so slow.
5206   BufferCurrentPntr is the position of the next character to be read.  When it
5207   reaches BufferEndPntr, it is time to fill the buffer again. */
5208 
5209 #define ReadChar(CharVar) \
5210   { \
5211     if (BufferCurrentPntr < BufferEndPntr) \
5212       CharVar = *BufferCurrentPntr++; \
5213     else /* Try to fill the buffer. */ \
5214     { \
5215       ssize_t AmountRead; \
5216       AmountRead = PositionIOPntr->Read (Buffer, sizeof (Buffer)); \
5217       if (AmountRead < 0) \
5218       { \
5219         sprintf (ErrorMessage, IOErrorString, AmountRead, OptionalFileName); \
5220         return AmountRead; \
5221       } \
5222       else if (AmountRead == 0) \
5223         CharVar = EOF; \
5224       else \
5225       { \
5226         BufferEndPntr = Buffer + AmountRead; \
5227         BufferCurrentPntr = Buffer + 0; \
5228         CharVar = *BufferCurrentPntr++; \
5229       } \
5230     } \
5231   }
5232 
5233   /* Read all the words in the file and add them to our local set of words.  A
5234   set is used since we don't care how many times a word occurs. */
5235 
5236   while (true)
5237   {
5238     /* We read two letters ahead so that we can decode quoted printable
5239     characters (an equals sign followed by two hex digits or a new line).  Note
5240     that Letter can become EOF (-1) when end of file is reached. */
5241 
5242     Letter = NextLetter;
5243     NextLetter = NextNextLetter;
5244     ReadChar (NextNextLetter);
5245 
5246     /* Decode quoted printable codes first, so that the rest of the code just
5247     sees an ordinary character.  Or even nothing, if it is the hidden line
5248     break combination.  This may falsely corrupt stuff following an equals
5249     sign, but usually won't. */
5250 
5251     if (Letter == '=')
5252     {
5253       if ((NextLetter == '\r' && NextNextLetter == '\n') ||
5254       (NextLetter == '\n' && NextNextLetter == '\r'))
5255       {
5256         /* Make the "=\r\n" pair disappear.  It's not even white space. */
5257         ReadChar (NextLetter);
5258         ReadChar (NextNextLetter);
5259         continue;
5260       }
5261       if (NextLetter == '\n' || NextLetter == '\r')
5262       {
5263         /* Make the "=\n" pair disappear.  It's not even white space. */
5264         NextLetter = NextNextLetter;
5265         ReadChar (NextNextLetter);
5266         continue;
5267       }
5268       if (NextNextLetter != EOF &&
5269       isxdigit (NextLetter) && isxdigit (NextNextLetter))
5270       {
5271         /* Convert the hex code to a letter. */
5272         HexString[0] = NextLetter;
5273         HexString[1] = NextNextLetter;
5274         HexString[2] = 0;
5275         Letter = strtoul (HexString, NULL, 16 /* number system base */);
5276         ReadChar (NextLetter);
5277         ReadChar (NextNextLetter);
5278       }
5279     }
5280 
5281     /* Convert to lower case to improve word matches.  Of course this loses a
5282     bit of information, such as MONEY vs Money, an indicator of spam.  Well,
5283     apparently that isn't all that useful a distinction, so do it. */
5284 
5285     if (Letter >= 'A' && Letter < 'Z')
5286       Letter = Letter + ('a' - 'A');
5287 
5288     /* See if it is a letter we treat as white space - all control characters
5289     and all punctuation except for: apostrophe (so "it's" and possessive
5290     versions of words get stored), dash (for hyphenated words), dollar sign
5291     (for cash amounts), period (for IP addresses, we later remove trailing
5292     (periods).  Note that codes above 127 are UTF-8 characters, which we
5293     consider non-space. */
5294 
5295     if (Letter < 0 /* EOF */ || (Letter < 128 && g_SpaceCharacters[Letter]))
5296     {
5297       /* That space finished off a word.  Remove trailing periods... */
5298 
5299       while ((Length = AccumulatedWord.size()) > 0 &&
5300       AccumulatedWord [Length-1] == '.')
5301         AccumulatedWord.resize (Length - 1);
5302 
5303       /* If there's anything left in the word, add it to the set.  Also ignore
5304       words which are too big (it's probably some binary encoded data).  But
5305       leave room for supercalifragilisticexpialidoceous.  According to one web
5306       site, pneumonoultramicroscopicsilicovolcanoconiosis is the longest word
5307       currently in English.  Note that some uuencoded data was seen with a 60
5308       character line length. */
5309 
5310       if (Length > 0 && Length <= g_MaxWordLength)
5311         WordSet.insert (AccumulatedWord);
5312 
5313       /* Empty out the string to get ready for the next word. */
5314 
5315       AccumulatedWord.resize (0);
5316     }
5317     else /* Not a space-like character, add it to the word. */
5318       AccumulatedWord.append (1 /* one copy of the char */, (char) Letter);
5319 
5320     /* Stop at end of file or error.  Don't care which.  Exit here so that last
5321     word got processed. */
5322 
5323     if (Letter == EOF)
5324       break;
5325   }
5326 
5327   return B_OK;
5328 }
5329 
5330 
5331 
/******************************************************************************
 * Implementation of the ClassificationChoicesWindow class, constructor,
 * destructor and the rest of the member functions in mostly alphabetical
 * order.
 */
5337 
5338 ClassificationChoicesWindow::ClassificationChoicesWindow (
5339   BRect FrameRect,
5340   const char *FileName,
5341   int NumberOfFiles)
5342 : BWindow (FrameRect, "Classification Choices", B_TITLED_WINDOW,
5343     B_NOT_ZOOMABLE | B_NOT_RESIZABLE | B_ASYNCHRONOUS_CONTROLS),
5344   m_BulkModeSelectedPntr (NULL),
5345   m_ChoosenClassificationPntr (NULL)
5346 {
5347   ClassificationChoicesView *SubViewPntr;
5348 
5349   SubViewPntr = new ClassificationChoicesView (Bounds(),
5350     FileName, NumberOfFiles);
5351   AddChild (SubViewPntr);
5352   SubViewPntr->ResizeToPreferred ();
5353   ResizeTo (SubViewPntr->Frame().Width(), SubViewPntr->Frame().Height());
5354 }
5355 
5356 
5357 void
5358 ClassificationChoicesWindow::MessageReceived (BMessage *MessagePntr)
5359 {
5360   BControl *ControlPntr;
5361 
5362   if (MessagePntr->what >= MSG_CLASS_BUTTONS &&
5363   MessagePntr->what < MSG_CLASS_BUTTONS + CL_MAX)
5364   {
5365     if (m_ChoosenClassificationPntr != NULL)
5366       *m_ChoosenClassificationPntr =
5367         (ClassificationTypes) (MessagePntr->what - MSG_CLASS_BUTTONS);
5368     PostMessage (B_QUIT_REQUESTED); // Close and destroy the window.
5369     return;
5370   }
5371 
5372   if (MessagePntr->what == MSG_BULK_CHECKBOX)
5373   {
5374     if (m_BulkModeSelectedPntr != NULL &&
5375     MessagePntr->FindPointer ("source", (void **) &ControlPntr) == B_OK)
5376       *m_BulkModeSelectedPntr = (ControlPntr->Value() == B_CONTROL_ON);
5377     return;
5378   }
5379 
5380   if (MessagePntr->what == MSG_CANCEL_BUTTON)
5381   {
5382     PostMessage (B_QUIT_REQUESTED); // Close and destroy the window.
5383     return;
5384   }
5385 
5386   BWindow::MessageReceived (MessagePntr);
5387 }
5388 
5389 
5390 void
5391 ClassificationChoicesWindow::Go (
5392   bool *BulkModeSelectedPntr,
5393   ClassificationTypes *ChoosenClassificationPntr)
5394 {
5395   status_t  ErrorCode = 0;
5396   BView    *MainViewPntr;
5397   thread_id WindowThreadID;
5398 
5399   m_BulkModeSelectedPntr = BulkModeSelectedPntr;
5400   m_ChoosenClassificationPntr = ChoosenClassificationPntr;
5401   if (m_ChoosenClassificationPntr != NULL)
5402     *m_ChoosenClassificationPntr = CL_MAX;
5403 
5404   Show (); // Starts the window thread running.
5405 
5406   /* Move the window to the center of the screen it is now being displayed on
5407   (have to wait for it to be showing). */
5408 
5409   Lock ();
5410   MainViewPntr = FindView ("ClassificationChoicesView");
5411   if (MainViewPntr != NULL)
5412   {
5413     BRect   TempRect;
5414     BScreen TempScreen (this);
5415     float   X;
5416     float   Y;
5417 
5418     TempRect = TempScreen.Frame ();
5419     X = TempRect.Width() / 2;
5420     Y = TempRect.Height() / 2;
5421     TempRect = MainViewPntr->Frame();
5422     X -= TempRect.Width() / 2;
5423     Y -= TempRect.Height() / 2;
5424     MoveTo (ceilf (X), ceilf (Y));
5425   }
5426   Unlock ();
5427 
5428   /* Wait for the window to go away. */
5429 
5430   WindowThreadID = Thread ();
5431   if (WindowThreadID >= 0)
5432     // Delay until the window thread has died, presumably window deleted now.
5433     wait_for_thread (WindowThreadID, &ErrorCode);
5434 }
5435 
5436 
5437 
5438 /******************************************************************************
5439  * Implementation of the ClassificationChoicesView class, constructor,
5440  * destructor and the rest of the member functions in mostly alphabetical
5441  * order.
5442  */
5443 
5444 ClassificationChoicesView::ClassificationChoicesView (
5445   BRect FrameRect,
5446   const char *FileName,
5447   int NumberOfFiles)
5448 : BView (FrameRect, "ClassificationChoicesView",
5449     B_FOLLOW_TOP | B_FOLLOW_LEFT, B_WILL_DRAW | B_NAVIGABLE_JUMP),
5450   m_FileName (FileName),
5451   m_NumberOfFiles (NumberOfFiles),
5452   m_PreferredBottomY (ceilf (g_ButtonHeight * 10))
5453 {
5454 }
5455 
5456 
5457 void
5458 ClassificationChoicesView::AttachedToWindow ()
5459 {
5460   BButton            *ButtonPntr;
5461   BCheckBox          *CheckBoxPntr;
5462   ClassificationTypes Classification;
5463   float               Margin;
5464   float               RowHeight;
5465   float               RowTop;
5466   BTextView          *TextViewPntr;
5467   BRect               TempRect;
5468   char                TempString [2048];
5469   BRect               TextRect;
5470   float               X;
5471 
5472   SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
5473 
5474   RowHeight = g_ButtonHeight;
5475   if (g_CheckBoxHeight > RowHeight)
5476     RowHeight = g_CheckBoxHeight;
5477   RowHeight = ceilf (RowHeight * 1.1);
5478 
5479   TempRect = Bounds ();
5480   RowTop = TempRect.top;
5481 
5482   /* Show the file name text. */
5483 
5484   Margin = ceilf ((RowHeight - g_StringViewHeight) / 2);
5485   TempRect = Bounds ();
5486   TempRect.top = RowTop + Margin;
5487   TextRect = TempRect;
5488   TextRect.OffsetTo (0, 0);
5489   TextRect.InsetBy (g_MarginBetweenControls, 2);
5490   sprintf (TempString, "How do you want to classify the file named \"%s\"?",
5491     m_FileName);
5492   TextViewPntr = new BTextView (TempRect, "FileText", TextRect,
5493     B_FOLLOW_TOP | B_FOLLOW_LEFT, B_WILL_DRAW | B_FULL_UPDATE_ON_RESIZE);
5494   AddChild (TextViewPntr);
5495   TextViewPntr->SetText (TempString);
5496   TextViewPntr->MakeEditable (false);
5497   TextViewPntr->SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
5498   TextViewPntr->ResizeTo (TempRect.Width (),
5499     3 + TextViewPntr->TextHeight (0, sizeof (TempString)));
5500   RowTop = TextViewPntr->Frame().bottom + Margin;
5501 
5502   /* Make the classification buttons. */
5503 
5504   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
5505   TempRect = Bounds ();
5506   TempRect.top = RowTop + Margin;
5507   X = Bounds().left + g_MarginBetweenControls;
5508   for (Classification = (ClassificationTypes) 0; Classification < CL_MAX;
5509   Classification = (ClassificationTypes) ((int) Classification + 1))
5510   {
5511     TempRect = Bounds ();
5512     TempRect.top = RowTop + Margin;
5513     TempRect.left = X;
5514     sprintf (TempString, "%s Button",
5515       g_ClassificationTypeNames [Classification]);
5516     ButtonPntr = new BButton (TempRect, TempString,
5517       g_ClassificationTypeNames [Classification], new BMessage (
5518       ClassificationChoicesWindow::MSG_CLASS_BUTTONS + Classification));
5519     AddChild (ButtonPntr);
5520     ButtonPntr->ResizeToPreferred ();
5521     X = ButtonPntr->Frame().right + 3 * g_MarginBetweenControls;
5522   }
5523   RowTop += ceilf (RowHeight * 1.2);
5524 
5525   /* Make the Cancel button. */
5526 
5527   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
5528   TempRect = Bounds ();
5529   TempRect.top = RowTop + Margin;
5530   TempRect.left += g_MarginBetweenControls;
5531   ButtonPntr = new BButton (TempRect, "Cancel Button",
5532     "Cancel", new BMessage (ClassificationChoicesWindow::MSG_CANCEL_BUTTON));
5533   AddChild (ButtonPntr);
5534   ButtonPntr->ResizeToPreferred ();
5535   X = ButtonPntr->Frame().right + g_MarginBetweenControls;
5536 
5537   /* Make the checkbox for bulk operations. */
5538 
5539   if (m_NumberOfFiles > 1)
5540   {
5541     Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
5542     TempRect = Bounds ();
5543     TempRect.top = RowTop + Margin;
5544     TempRect.left = X;
5545     sprintf (TempString, "Mark all %d remaining messages the same way.",
5546       m_NumberOfFiles - 1);
5547     CheckBoxPntr = new BCheckBox (TempRect, "BulkBox", TempString,
5548       new BMessage (ClassificationChoicesWindow::MSG_BULK_CHECKBOX));
5549     AddChild (CheckBoxPntr);
5550     CheckBoxPntr->ResizeToPreferred ();
5551   }
5552   RowTop += RowHeight;
5553 
5554   m_PreferredBottomY = RowTop;
5555 }
5556 
5557 
5558 void
5559 ClassificationChoicesView::GetPreferredSize (float *width, float *height)
5560 {
5561   if (width != NULL)
5562     *width = Bounds().Width();
5563   if (height != NULL)
5564     *height = m_PreferredBottomY;
5565 }
5566 
5567 
5568 
5569 /******************************************************************************
5570  * Implementation of the CommanderLooper class, constructor, destructor and the
5571  * rest of the member functions in mostly alphabetical order.
5572  */
5573 
5574 CommanderLooper::CommanderLooper ()
5575 : BLooper ("CommanderLooper", B_NORMAL_PRIORITY),
5576   m_IsBusy (false)
5577 {
5578 }
5579 
5580 
5581 CommanderLooper::~CommanderLooper ()
5582 {
5583   g_CommanderLooperPntr = NULL;
5584   delete g_CommanderMessenger;
5585   g_CommanderMessenger = NULL;
5586 }
5587 
5588 
5589 /* Process some command line arguments.  Basically just send a message to this
5590 looper itself to do the work later.  That way the caller can continue doing
5591 whatever they're doing, particularly if it's the BApplication. */
5592 
5593 void
5594 CommanderLooper::CommandArguments (int argc, char **argv)
5595 {
5596   int      i;
5597   BMessage InternalMessage;
5598 
5599   InternalMessage.what = MSG_COMMAND_ARGUMENTS;
5600   for (i = 0; i < argc; i++)
5601     InternalMessage.AddString ("arg", argv[i]);
5602 
5603   PostMessage (&InternalMessage);
5604 }
5605 
5606 
5607 /* Copy the refs out of the given message and stuff them into an internal
5608 message to ourself (so that the original message can be returned to the caller,
5609 and if it is Tracker, it can close the file handles it has open).  Optionally
5610 allow preset classification rather than asking the user (set BulkMode to TRUE
5611 and specify the class with BulkClassification). */
5612 
5613 void
5614 CommanderLooper::CommandReferences (
5615   BMessage *MessagePntr,
5616   bool BulkMode,
5617   ClassificationTypes BulkClassification)
5618 {
5619   entry_ref EntryRef;
5620   int       i;
5621   BMessage  InternalMessage;
5622 
5623   InternalMessage.what = MSG_COMMAND_FILE_REFS;
5624   for (i = 0; MessagePntr->FindRef ("refs", i, &EntryRef) == B_OK; i++)
5625     InternalMessage.AddRef ("refs", &EntryRef);
5626   InternalMessage.AddBool ("BulkMode", BulkMode);
5627   InternalMessage.AddInt32 ("BulkClassification", BulkClassification);
5628 
5629   PostMessage (&InternalMessage);
5630 }
5631 
5632 
5633 /* This function is called by other threads to see if the CommanderLooper is
5634 busy working on something. */
5635 
5636 bool
5637 CommanderLooper::IsBusy ()
5638 {
5639   if (m_IsBusy)
5640     return true;
5641 
5642   if (IsLocked () || !MessageQueue()->IsEmpty ())
5643     return true;
5644 
5645   return false;
5646 }
5647 
5648 
5649 void
5650 
5651 CommanderLooper::MessageReceived (BMessage *MessagePntr)
5652 {
5653   m_IsBusy = true;
5654 
5655   if (MessagePntr->what == MSG_COMMAND_ARGUMENTS)
5656     ProcessArgs (MessagePntr);
5657   else if (MessagePntr->what == MSG_COMMAND_FILE_REFS)
5658     ProcessRefs (MessagePntr);
5659   else
5660     BLooper::MessageReceived (MessagePntr);
5661 
5662   m_IsBusy = false;
5663 }
5664 
5665 
5666 /* Process the command line by converting it into a series of scripting
5667 messages (possibly thousands) and sent them to the BApplication synchronously
5668 (so we can print the result). */
5669 
5670 void
5671 CommanderLooper::ProcessArgs (BMessage *MessagePntr)
5672 {
5673   int32                 argc = 0;
5674   const char          **argv = NULL;
5675   int                   ArgumentIndex;
5676   uint32                CommandCode;
5677   const char           *CommandWord;
5678   status_t              ErrorCode;
5679   const char           *ErrorTitle = "ProcessArgs";
5680   char                 *EndPntr;
5681   int32                 i;
5682   BMessage              ReplyMessage;
5683   BMessage              ScriptMessage;
5684   struct property_info *PropInfoPntr;
5685   const char           *PropertyName;
5686   bool                  TempBool;
5687   float                 TempFloat;
5688   int32                 TempInt32;
5689   const char           *TempStringPntr;
5690   type_code             TypeCode;
5691   const char           *ValuePntr;
5692 
5693   /* Get the argument count and pointers to arguments out of the message and
5694   into our argc and argv. */
5695 
5696   ErrorCode = MessagePntr->GetInfo ("arg", &TypeCode, &argc);
5697   if (ErrorCode != B_OK || TypeCode != B_STRING_TYPE)
5698   {
5699     DisplayErrorMessage ("Unable to find argument strings in message",
5700       ErrorCode, ErrorTitle);
5701     goto ErrorExit;
5702   }
5703 
5704   if (argc < 2)
5705   {
5706     cerr << PrintUsage;
5707     DisplayErrorMessage ("You need to specify a command word, like GET, SET "
5708       "and so on followed by a property, like DatabaseFile, and maybe "
5709       "followed by a value of some sort", -1, ErrorTitle);
5710     goto ErrorExit;
5711   }
5712 
5713   argv = (const char **) malloc (sizeof (char *) * argc);
5714   if (argv == NULL)
5715   {
5716     DisplayErrorMessage ("Out of memory when allocating argv array",
5717       ENOMEM, ErrorTitle);
5718     goto ErrorExit;
5719   }
5720 
5721   for (i = 0; i < argc; i++)
5722   {
5723     if ((ErrorCode = MessagePntr->FindString ("arg", i, &argv[i])) != B_OK)
5724     {
5725       DisplayErrorMessage ("Unable to find argument in the BMessage",
5726         ErrorCode, ErrorTitle);
5727       goto ErrorExit;
5728     }
5729   }
5730 
5731   CommandWord = argv[1];
5732 
5733   /* Special case for the Quit command since it isn't a scripting command. */
5734 
5735   if (strcasecmp (CommandWord, "quit") == 0)
5736   {
5737     g_QuitCountdown = 10;
5738     goto ErrorExit;
5739   }
5740 
5741   /* Find the corresponding scripting command. */
5742 
5743   if (strcasecmp (CommandWord, "set") == 0)
5744     CommandCode = B_SET_PROPERTY;
5745   else if (strcasecmp (CommandWord, "get") == 0)
5746     CommandCode = B_GET_PROPERTY;
5747   else if (strcasecmp (CommandWord, "count") == 0)
5748     CommandCode = B_COUNT_PROPERTIES;
5749   else if (strcasecmp (CommandWord, "create") == 0)
5750     CommandCode = B_CREATE_PROPERTY;
5751   else if (strcasecmp (CommandWord, "delete") == 0)
5752     CommandCode = B_DELETE_PROPERTY;
5753   else
5754     CommandCode = B_EXECUTE_PROPERTY;
5755 
5756   if (CommandCode == B_EXECUTE_PROPERTY)
5757   {
5758     PropertyName = CommandWord;
5759     ArgumentIndex = 2; /* Arguments to the command start at this index. */
5760   }
5761   else
5762   {
5763     if (CommandCode == B_SET_PROPERTY)
5764     {
5765       /* SET commands require at least one argument value. */
5766       if (argc < 4)
5767       {
5768         cerr << PrintUsage;
5769         DisplayErrorMessage ("SET commands require at least one "
5770           "argument value after the property name", -1, ErrorTitle);
5771         goto ErrorExit;
5772       }
5773     }
5774     else
5775       if (argc < 3)
5776       {
5777         cerr << PrintUsage;
5778         DisplayErrorMessage ("You need to specify a property to act on",
5779           -1, ErrorTitle);
5780         goto ErrorExit;
5781       }
5782     PropertyName = argv[2];
5783     ArgumentIndex = 3;
5784   }
5785 
5786   /* See if it is one of our commands. */
5787 
5788   for (PropInfoPntr = g_ScriptingPropertyList + 0; true; PropInfoPntr++)
5789   {
5790     if (PropInfoPntr->name == 0)
5791     {
5792       cerr << PrintUsage;
5793       DisplayErrorMessage ("The property specified isn't known or "
5794         "doesn't support the requested action (usually means it is an "
5795         "unknown command)", -1, ErrorTitle);
5796       goto ErrorExit; /* Unrecognized command. */
5797     }
5798 
5799     if (PropInfoPntr->commands[0] == CommandCode &&
5800     strcasecmp (PropertyName, PropInfoPntr->name) == 0)
5801       break;
5802   }
5803 
5804   /* Make the equivalent command message.  For commands with multiple
5805   arguments, repeat the message for each single argument and just change the
5806   data portion for each extra argument.  Send the command and wait for a reply,
5807   which we'll print out. */
5808 
5809   ScriptMessage.MakeEmpty ();
5810   ScriptMessage.what = CommandCode;
5811   ScriptMessage.AddSpecifier (PropertyName);
5812   while (true)
5813   {
5814     if (ArgumentIndex < argc) /* If there are arguments to be added. */
5815     {
5816       ValuePntr = argv[ArgumentIndex];
5817 
5818       /* Convert the value into the likely kind of data. */
5819 
5820       if (strcasecmp (ValuePntr, "yes") == 0 ||
5821       strcasecmp (ValuePntr, "true") == 0)
5822         ScriptMessage.AddBool (g_DataName, true);
5823       else if (strcasecmp (ValuePntr, "no") == 0 ||
5824       strcasecmp (ValuePntr, "false") == 0)
5825         ScriptMessage.AddBool (g_DataName, false);
5826       else
5827       {
5828         /* See if it is a number. */
5829         i = strtol (ValuePntr, &EndPntr, 0);
5830         if (*EndPntr == 0)
5831           ScriptMessage.AddInt32 (g_DataName, i);
5832         else /* Nope, it's just a string. */
5833           ScriptMessage.AddString (g_DataName, ValuePntr);
5834       }
5835     }
5836 
5837     ErrorCode = be_app_messenger.SendMessage (&ScriptMessage, &ReplyMessage);
5838     if (ErrorCode != B_OK)
5839     {
5840       DisplayErrorMessage ("Unable to send scripting command",
5841         ErrorCode, ErrorTitle);
5842       goto ErrorExit;
5843     }
5844 
5845     /* Print the reply to the scripting command.  Even in server mode.  To
5846     standard output. */
5847 
5848     if (ReplyMessage.FindString ("CommandText", &TempStringPntr) == B_OK)
5849     {
5850       TempInt32 = -1;
5851       if (ReplyMessage.FindInt32 ("error", &TempInt32) == B_OK &&
5852       TempInt32 == B_OK)
5853       {
5854         /* It's a successful reply to one of our scripting messages.  Print out
5855         the returned values code for command line users to see. */
5856 
5857         cout << "Result of command to " << TempStringPntr << " is:\t";
5858         if (ReplyMessage.FindString (g_ResultName, &TempStringPntr) == B_OK)
5859           cout << "\"" << TempStringPntr << "\"";
5860         else if (ReplyMessage.FindInt32 (g_ResultName, &TempInt32) == B_OK)
5861           cout << TempInt32;
5862         else if (ReplyMessage.FindFloat (g_ResultName, &TempFloat) == B_OK)
5863           cout << TempFloat;
5864         else if (ReplyMessage.FindBool (g_ResultName, &TempBool) == B_OK)
5865           cout << (TempBool ? "true" : "false");
5866         else
5867           cout << "just plain success";
5868         if (ReplyMessage.FindInt32 ("count", &TempInt32) == B_OK)
5869           cout << "\t(count " << TempInt32 << ")";
5870         for (i = 0; (i < 50) &&
5871         ReplyMessage.FindString ("words", i, &TempStringPntr) == B_OK &&
5872         ReplyMessage.FindFloat ("ratios", i, &TempFloat) == B_OK;
5873         i++)
5874         {
5875           if (i == 0)
5876             cout << "\twith top words:\t";
5877           else
5878             cout << "\t";
5879           cout << TempStringPntr << "/" << TempFloat;
5880         }
5881         cout << endl;
5882       }
5883       else /* An error reply, print out the error, even in server mode. */
5884       {
5885         cout << "Failure of command " << TempStringPntr << ", error ";
5886         cout << TempInt32 << " (" << strerror (TempInt32) << ")";
5887         if (ReplyMessage.FindString ("message", &TempStringPntr) == B_OK)
5888           cout << ", message: " << TempStringPntr;
5889         cout << "." << endl;
5890       }
5891     }
5892 
5893     /* Advance to the next argument and its scripting message. */
5894 
5895     ScriptMessage.RemoveName (g_DataName);
5896     if (++ArgumentIndex >= argc)
5897       break;
5898   }
5899 
5900 ErrorExit:
5901   free (argv);
5902 }
5903 
5904 
5905 /* Given a bunch of references to files, open the files.  If it's a database
5906 file, switch to using it as a database.  Otherwise, treat them as text files
5907 and add them to the database.  Prompt the user for the spam or genuine or
5908 uncertain (declassification) choice, with the option to bulk mark many files at
5909 once. */
5910 
5911 void
5912 CommanderLooper::ProcessRefs (BMessage *MessagePntr)
5913 {
5914   bool                         BulkMode = false;
5915   ClassificationTypes          BulkClassification = CL_GENUINE;
5916   ClassificationChoicesWindow *ChoiceWindowPntr;
5917   BEntry                       Entry;
5918   entry_ref                    EntryRef;
5919   status_t                     ErrorCode;
5920   const char                  *ErrorTitle = "CommanderLooper::ProcessRefs";
5921   int32                        NumberOfRefs = 0;
5922   BPath                        Path;
5923   int                          RefIndex;
5924   BMessage                     ReplyMessage;
5925   BMessage                     ScriptingMessage;
5926   bool                         TempBool;
5927   BFile                        TempFile;
5928   int32                        TempInt32;
5929   char                         TempString [PATH_MAX + 1024];
5930   type_code                    TypeCode;
5931 
5932   // Wait for ReadyToRun to finish initializing the globals with the sizes of
5933   // the controls, since they are needed when we show the custom alert box for
5934   // choosing the message type.
5935 
5936   TempInt32 = 0;
5937   while (!g_AppReadyToRunCompleted && TempInt32++ < 10)
5938     snooze (200000);
5939 
5940   ErrorCode = MessagePntr->GetInfo ("refs", &TypeCode, &NumberOfRefs);
5941   if (ErrorCode != B_OK || TypeCode != B_REF_TYPE || NumberOfRefs <= 0)
5942   {
5943     DisplayErrorMessage ("Unable to get refs from the message",
5944       ErrorCode, ErrorTitle);
5945     return;
5946   }
5947 
5948   if (MessagePntr->FindBool ("BulkMode", &TempBool) == B_OK)
5949     BulkMode = TempBool;
5950   if (MessagePntr->FindInt32 ("BulkClassification", &TempInt32) == B_OK &&
5951   TempInt32 >= 0 && TempInt32 < CL_MAX)
5952     BulkClassification = (ClassificationTypes) TempInt32;
5953 
5954   for (RefIndex = 0;
5955   MessagePntr->FindRef ("refs", RefIndex, &EntryRef) == B_OK;
5956   RefIndex++)
5957   {
5958     ScriptingMessage.MakeEmpty ();
5959     ScriptingMessage.what = 0; /* Haven't figured out what to do yet. */
5960 
5961     /* See if the entry is a valid file or directory or other thing. */
5962 
5963     ErrorCode = Entry.SetTo (&EntryRef, true /* traverse symbolic links */);
5964     if (ErrorCode != B_OK ||
5965     ((ErrorCode = /* assignment */ B_ENTRY_NOT_FOUND) != 0 /* this pacifies
5966     mwcc -nwhitehorn */ && !Entry.Exists ()) ||
5967     ((ErrorCode = Entry.GetPath (&Path)) != B_OK))
5968     {
5969       DisplayErrorMessage ("Bad entry reference encountered, will skip it",
5970         ErrorCode, ErrorTitle);
5971       BulkMode = false;
5972       continue; /* Bad file reference, try the next one. */
5973     }
5974 
5975     /* If it's a file, check if it is a spam database file.  Go by the magic
5976     text at the start of the file, in case someone has edited the file with a
5977     spreadsheet or other tool and lost the MIME type. */
5978 
5979     if (Entry.IsFile ())
5980     {
5981       ErrorCode = TempFile.SetTo (&Entry, B_READ_ONLY);
5982       if (ErrorCode != B_OK)
5983       {
5984         sprintf (TempString, "Unable to open file \"%s\" for reading, will "
5985           "skip it", Path.Path ());
5986         DisplayErrorMessage (TempString, ErrorCode, ErrorTitle);
5987         BulkMode = false;
5988         continue;
5989       }
5990       if (TempFile.Read (TempString, strlen (g_DatabaseRecognitionString)) ==
5991       (int) strlen (g_DatabaseRecognitionString) && strncmp (TempString,
5992       g_DatabaseRecognitionString, strlen (g_DatabaseRecognitionString)) == 0)
5993       {
5994         ScriptingMessage.what = B_SET_PROPERTY;
5995         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
5996         ScriptingMessage.AddString (g_DataName, Path.Path ());
5997       }
5998       TempFile.Unset ();
5999     }
6000 
6001     /* Not a database file.  Could be a directory or a file.  Submit it as
6002     something to be marked spam or genuine. */
6003 
6004     if (ScriptingMessage.what == 0)
6005     {
6006       if (!Entry.IsFile ())
6007       {
6008         sprintf (TempString, "\"%s\" is not a file, can't do anything with it",
6009           Path.Path ());
6010         DisplayErrorMessage (TempString, -1, ErrorTitle);
6011         BulkMode = false;
6012         continue;
6013       }
6014 
6015       if (!BulkMode) /* Have to ask the user. */
6016       {
6017         ChoiceWindowPntr = new ClassificationChoicesWindow (
6018           BRect (40, 40, 40 + 50 * g_MarginBetweenControls,
6019           40 + g_ButtonHeight * 5), Path.Path (), NumberOfRefs - RefIndex);
6020         ChoiceWindowPntr->Go (&BulkMode, &BulkClassification);
6021         if (BulkClassification == CL_MAX)
6022           break; /* Cancel was picked. */
6023       }
6024 
6025       /* Format the command for classifying the file. */
6026 
6027       ScriptingMessage.what = B_SET_PROPERTY;
6028 
6029       if (BulkClassification == CL_GENUINE)
6030         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_GENUINE]);
6031       else if (BulkClassification == CL_SPAM)
6032         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_SPAM]);
6033       else if (BulkClassification == CL_UNCERTAIN)
6034         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_UNCERTAIN]);
6035       else /* Broken code */
6036         break;
6037       ScriptingMessage.AddString (g_DataName, Path.Path ());
6038     }
6039 
6040     /* Tell the BApplication to do the work, and wait for it to finish.  The
6041     BApplication will display any error messages for us. */
6042 
6043     ErrorCode =
6044       be_app_messenger.SendMessage (&ScriptingMessage, &ReplyMessage);
6045     if (ErrorCode != B_OK)
6046     {
6047       DisplayErrorMessage ("Unable to send scripting command",
6048         ErrorCode, ErrorTitle);
6049       return;
6050     }
6051 
6052     /* If there was an error, allow the user to stop by switching off bulk
6053     mode.  The message will already have been displayed in an alert box, if
6054     server mode is off. */
6055 
6056     if (ReplyMessage.FindInt32 ("error", &TempInt32) != B_OK ||
6057     TempInt32 != B_OK)
6058       BulkMode = false;
6059   }
6060 }
6061 
6062 
6063 
6064 /******************************************************************************
6065  * Implementation of the ControlsView class, constructor, destructor and the
6066  * rest of the member functions in mostly alphabetical order.
6067  */
6068 
6069 ControlsView::ControlsView (BRect NewBounds)
6070 : BView (NewBounds, "ControlsView", B_FOLLOW_TOP | B_FOLLOW_LEFT_RIGHT,
6071     B_WILL_DRAW | B_PULSE_NEEDED | B_NAVIGABLE_JUMP | B_FRAME_EVENTS),
6072   m_AboutButtonPntr (NULL),
6073   m_AddExampleButtonPntr (NULL),
6074   m_BrowseButtonPntr (NULL),
6075   m_BrowseFilePanelPntr (NULL),
6076   m_CreateDatabaseButtonPntr (NULL),
6077   m_DatabaseFileNameTextboxPntr (NULL),
6078   m_DatabaseLoadDone (false),
6079   m_EstimateSpamButtonPntr (NULL),
6080   m_EstimateSpamFilePanelPntr (NULL),
6081   m_GenuineCountTextboxPntr (NULL),
6082   m_IgnorePreviousClassCheckboxPntr (NULL),
6083   m_InstallThingsButtonPntr (NULL),
6084   m_PurgeAgeTextboxPntr (NULL),
6085   m_PurgeButtonPntr (NULL),
6086   m_PurgePopularityTextboxPntr (NULL),
6087   m_ResetToDefaultsButtonPntr (NULL),
6088   m_ScoringModeMenuBarPntr (NULL),
6089   m_ScoringModePopUpMenuPntr (NULL),
6090   m_ServerModeCheckboxPntr (NULL),
6091   m_SpamCountTextboxPntr (NULL),
6092   m_TimeOfLastPoll (0),
6093   m_TokenizeModeMenuBarPntr (NULL),
6094   m_TokenizeModePopUpMenuPntr (NULL),
6095   m_WordCountTextboxPntr (NULL)
6096 {
6097 }
6098 
6099 
6100 ControlsView::~ControlsView ()
6101 {
6102   if (m_BrowseFilePanelPntr != NULL)
6103   {
6104     delete m_BrowseFilePanelPntr;
6105     m_BrowseFilePanelPntr = NULL;
6106   }
6107 
6108   if (m_EstimateSpamFilePanelPntr != NULL)
6109   {
6110     delete m_EstimateSpamFilePanelPntr;
6111     m_EstimateSpamFilePanelPntr = NULL;
6112   }
6113 }
6114 
6115 
6116 void
6117 ControlsView::AttachedToWindow ()
6118 {
6119   float         BigPurgeButtonTop;
6120   BMessage      CommandMessage;
6121   const char   *EightDigitsString = " 12345678 ";
6122   float         Height;
6123   float         Margin;
6124   float         RowHeight;
6125   float         RowTop;
6126   ScoringModes  ScoringMode;
6127   const char   *StringPntr;
6128   BMenuItem    *TempMenuItemPntr;
6129   BRect         TempRect;
6130   char          TempString [PATH_MAX];
6131   TokenizeModes TokenizeMode;
6132   float         Width;
6133   float         X;
6134 
6135   SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
6136 
6137   TempRect = Bounds ();
6138   X = TempRect.right;
6139   RowTop = TempRect.top;
6140   RowHeight = g_ButtonHeight;
6141   if (g_TextBoxHeight > RowHeight)
6142     RowHeight = g_TextBoxHeight;
6143   RowHeight = ceilf (RowHeight * 1.1);
6144 
6145   /* Make the Create button at the far right of the first row of controls,
6146   which are all database file related. */
6147 
6148   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6149   TempRect = Bounds ();
6150   TempRect.top = RowTop + Margin;
6151   TempRect.bottom = TempRect.top + g_ButtonHeight;
6152 
6153   CommandMessage.MakeEmpty ();
6154   CommandMessage.what = B_CREATE_PROPERTY;
6155   CommandMessage.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
6156   m_CreateDatabaseButtonPntr = new BButton (TempRect, "Create Button",
6157     "Create", new BMessage (CommandMessage), B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6158   if (m_CreateDatabaseButtonPntr == NULL) goto ErrorExit;
6159   AddChild (m_CreateDatabaseButtonPntr);
6160   m_CreateDatabaseButtonPntr->SetTarget (be_app);
6161   m_CreateDatabaseButtonPntr->ResizeToPreferred ();
6162   m_CreateDatabaseButtonPntr->GetPreferredSize (&Width, &Height);
6163   m_CreateDatabaseButtonPntr->MoveTo (X - Width, TempRect.top);
6164   X -= Width + g_MarginBetweenControls;
6165 
6166   /* Make the Browse button, middle of the first row. */
6167 
6168   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6169   TempRect = Bounds ();
6170   TempRect.top = RowTop + Margin;
6171   TempRect.bottom = TempRect.top + g_ButtonHeight;
6172 
6173   m_BrowseButtonPntr = new BButton (TempRect, "Browse Button",
6174     "Browse…", new BMessage (MSG_BROWSE_BUTTON), B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6175   if (m_BrowseButtonPntr == NULL) goto ErrorExit;
6176   AddChild (m_BrowseButtonPntr);
6177   m_BrowseButtonPntr->SetTarget (this);
6178   m_BrowseButtonPntr->ResizeToPreferred ();
6179   m_BrowseButtonPntr->GetPreferredSize (&Width, &Height);
6180   m_BrowseButtonPntr->MoveTo (X - Width, TempRect.top);
6181   X -= Width + g_MarginBetweenControls;
6182 
6183   /* Fill the rest of the space on the first row with the file name box. */
6184 
6185   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6186   TempRect = Bounds ();
6187   TempRect.top = RowTop + Margin;
6188   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6189   TempRect.right = X;
6190 
6191   StringPntr = "Word Database:";
6192   strcpy (m_DatabaseFileNameCachedValue, "Unknown...");
6193   m_DatabaseFileNameTextboxPntr = new BTextControl (TempRect,
6194     "File Name",
6195     StringPntr /* label */,
6196     m_DatabaseFileNameCachedValue /* text */,
6197     new BMessage (MSG_DATABASE_NAME),
6198     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP,
6199     B_WILL_DRAW | B_NAVIGABLE | B_NAVIGABLE_JUMP);
6200   AddChild (m_DatabaseFileNameTextboxPntr);
6201   m_DatabaseFileNameTextboxPntr->SetTarget (this);
6202   m_DatabaseFileNameTextboxPntr->SetDivider (
6203     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6204 
6205   /* Second row contains the purge age, and a long line explaining it.  There
6206   is space to the right where the top half of the big purge button will go. */
6207 
6208   RowTop += RowHeight /* previous row's RowHeight */;
6209   BigPurgeButtonTop = RowTop;
6210   TempRect = Bounds ();
6211   X = TempRect.left;
6212   RowHeight = g_TextBoxHeight;
6213   RowHeight = ceilf (RowHeight * 1.1);
6214 
6215   StringPntr = "Number of occurrences needed to store a word:";
6216   m_PurgeAgeCachedValue = 12345678;
6217 
6218   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6219   TempRect.top = RowTop + Margin;
6220   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6221   TempRect.left = X;
6222   TempRect.right = TempRect.left +
6223     be_plain_font->StringWidth (StringPntr) +
6224     be_plain_font->StringWidth (EightDigitsString) +
6225     3 * g_MarginBetweenControls;
6226 
6227   sprintf (TempString, "%d", (int) m_PurgeAgeCachedValue);
6228   m_PurgeAgeTextboxPntr = new BTextControl (TempRect,
6229     "Purge Age",
6230     StringPntr /* label */,
6231     TempString /* text */,
6232     new BMessage (MSG_PURGE_AGE),
6233     B_FOLLOW_LEFT | B_FOLLOW_TOP,
6234     B_WILL_DRAW | B_NAVIGABLE);
6235   AddChild (m_PurgeAgeTextboxPntr);
6236   m_PurgeAgeTextboxPntr->SetTarget (this);
6237   m_PurgeAgeTextboxPntr->SetDivider (
6238     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6239 
6240   /* Third row contains the purge popularity and bottom half of the purge
6241   button. */
6242 
6243   RowTop += RowHeight /* previous row's RowHeight */;
6244   TempRect = Bounds ();
6245   X = TempRect.left;
6246   RowHeight = g_TextBoxHeight;
6247   RowHeight = ceilf (RowHeight * 1.1);
6248 
6249   StringPntr = "Number of messages to store words from:";
6250   m_PurgePopularityCachedValue = 87654321;
6251   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6252   TempRect.top = RowTop + Margin;
6253   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6254   TempRect.left = X;
6255   TempRect.right = TempRect.left +
6256     be_plain_font->StringWidth (StringPntr) +
6257     be_plain_font->StringWidth (EightDigitsString) +
6258     3 * g_MarginBetweenControls;
6259   X = TempRect.right + g_MarginBetweenControls;
6260 
6261   sprintf (TempString, "%d", (int) m_PurgePopularityCachedValue);
6262   m_PurgePopularityTextboxPntr = new BTextControl (TempRect,
6263     "Purge Popularity",
6264     StringPntr /* label */,
6265     TempString /* text */,
6266     new BMessage (MSG_PURGE_POPULARITY),
6267     B_FOLLOW_LEFT | B_FOLLOW_TOP,
6268     B_WILL_DRAW | B_NAVIGABLE);
6269   AddChild (m_PurgePopularityTextboxPntr);
6270   m_PurgePopularityTextboxPntr->SetTarget (this);
6271   m_PurgePopularityTextboxPntr->SetDivider (
6272     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6273 
6274   /* Make the purge button, which will take up space in the 2nd and 3rd rows,
6275   on the right side.  Twice as tall as a regular button too. */
6276 
6277   StringPntr = "Remove Old Words";
6278   Margin = ceilf ((((RowTop + RowHeight) - BigPurgeButtonTop) -
6279     2 * g_TextBoxHeight) / 2);
6280   TempRect.top = BigPurgeButtonTop + Margin;
6281   TempRect.bottom = TempRect.top + 2 * g_TextBoxHeight;
6282   TempRect.left = X;
6283   TempRect.right = X + ceilf (2 * be_plain_font->StringWidth (StringPntr));
6284 
6285   CommandMessage.MakeEmpty ();
6286   CommandMessage.what = B_EXECUTE_PROPERTY;
6287   CommandMessage.AddSpecifier (g_PropertyNames[PN_PURGE]);
6288   m_PurgeButtonPntr = new BButton (TempRect, "Purge Button",
6289     StringPntr, new BMessage (CommandMessage), B_FOLLOW_LEFT | B_FOLLOW_TOP);
6290   if (m_PurgeButtonPntr == NULL) goto ErrorExit;
6291   m_PurgeButtonPntr->ResizeToPreferred();
6292   AddChild (m_PurgeButtonPntr);
6293   m_PurgeButtonPntr->SetTarget (be_app);
6294 
6295   /* The fourth row contains the ignore previous classification checkbox. */
6296 
6297   RowTop += RowHeight /* previous row's RowHeight */;
6298   TempRect = Bounds ();
6299   X = TempRect.left;
6300   RowHeight = g_CheckBoxHeight;
6301   RowHeight = ceilf (RowHeight * 1.1);
6302 
6303   StringPntr = "Allow Retraining on a Message";
6304   m_IgnorePreviousClassCachedValue = false;
6305 
6306   Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
6307   TempRect.top = RowTop + Margin;
6308   TempRect.bottom = TempRect.top + g_CheckBoxHeight;
6309   TempRect.left = X;
6310   m_IgnorePreviousClassCheckboxPntr = new BCheckBox (TempRect,
6311     "Ignore Check",
6312     StringPntr,
6313     new BMessage (MSG_IGNORE_CLASSIFICATION),
6314     B_FOLLOW_TOP | B_FOLLOW_LEFT);
6315   if (m_IgnorePreviousClassCheckboxPntr == NULL) goto ErrorExit;
6316   AddChild (m_IgnorePreviousClassCheckboxPntr);
6317   m_IgnorePreviousClassCheckboxPntr->SetTarget (this);
6318   m_IgnorePreviousClassCheckboxPntr->ResizeToPreferred ();
6319   m_IgnorePreviousClassCheckboxPntr->GetPreferredSize (&Width, &Height);
6320   X += Width + g_MarginBetweenControls;
6321 
6322   /* The fifth row contains the server mode checkbox. */
6323 
6324   RowTop += RowHeight /* previous row's RowHeight */;
6325   TempRect = Bounds ();
6326   RowHeight = g_CheckBoxHeight;
6327   RowHeight = ceilf (RowHeight * 1.1);
6328 
6329   StringPntr = "Print errors to Terminal";
6330   m_ServerModeCachedValue = false;
6331 
6332   Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
6333   TempRect.top = RowTop + Margin;
6334   TempRect.bottom = TempRect.top + g_CheckBoxHeight;
6335   m_ServerModeCheckboxPntr = new BCheckBox (TempRect,
6336     "ServerMode Check",
6337     StringPntr,
6338     new BMessage (MSG_SERVER_MODE),
6339     B_FOLLOW_TOP | B_FOLLOW_LEFT);
6340   if (m_ServerModeCheckboxPntr == NULL) goto ErrorExit;
6341   AddChild (m_ServerModeCheckboxPntr);
6342   m_ServerModeCheckboxPntr->SetTarget (this);
6343   m_ServerModeCheckboxPntr->ResizeToPreferred ();
6344   m_ServerModeCheckboxPntr->GetPreferredSize (&Width, &Height);
6345 
6346   /* This row just contains a huge pop-up menu which shows the tokenize mode
6347   and an explanation of what each mode does. */
6348 
6349   RowTop += RowHeight /* previous row's RowHeight */;
6350   TempRect = Bounds ();
6351   RowHeight = g_PopUpMenuHeight;
6352   RowHeight = ceilf (RowHeight * 1.1);
6353 
6354   Margin = ceilf ((RowHeight - g_PopUpMenuHeight) / 2);
6355   TempRect.top = RowTop + Margin;
6356   TempRect.bottom = TempRect.top + g_PopUpMenuHeight;
6357 
6358   m_TokenizeModeCachedValue = TM_MAX; /* Illegal value will force redraw. */
6359   m_TokenizeModeMenuBarPntr = new BMenuBar (TempRect, "TokenizeModeMenuBar",
6360     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
6361     false /* resize to fit items */);
6362   if (m_TokenizeModeMenuBarPntr == NULL) goto ErrorExit;
6363   m_TokenizeModePopUpMenuPntr = new BPopUpMenu ("TokenizeModePopUpMenu");
6364   if (m_TokenizeModePopUpMenuPntr == NULL) goto ErrorExit;
6365 
6366   for (TokenizeMode = (TokenizeModes) 0;
6367   TokenizeMode < TM_MAX;
6368   TokenizeMode = (TokenizeModes) ((int) TokenizeMode + 1))
6369   {
6370     /* Each different tokenize mode gets its own menu item.  Selecting the item
6371     will send a canned command to the application to switch to the appropriate
6372     tokenize mode.  An optional explanation of each mode is added to the mode
6373     name string. */
6374 
6375     CommandMessage.MakeEmpty ();
6376     CommandMessage.what = B_SET_PROPERTY;
6377     CommandMessage.AddSpecifier (g_PropertyNames[PN_TOKENIZE_MODE]);
6378     CommandMessage.AddString (g_DataName, g_TokenizeModeNames[TokenizeMode]);
6379     strcpy (TempString, g_TokenizeModeNames[TokenizeMode]);
6380     switch (TokenizeMode)
6381     {
6382       case TM_WHOLE:
6383         strcat (TempString, " - Scan everything");
6384         break;
6385 
6386       case TM_PLAIN_TEXT:
6387         strcat (TempString, " - Scan e-mail body text except rich text");
6388         break;
6389 
6390       case TM_PLAIN_TEXT_HEADER:
6391         strcat (TempString, " - Scan entire e-mail text except rich text");
6392         break;
6393 
6394       case TM_ANY_TEXT:
6395         strcat (TempString, " - Scan e-mail body text and text attachments");
6396         break;
6397 
6398       case TM_ANY_TEXT_HEADER:
6399        strcat (TempString, " - Scan entire e-mail text and text attachments (recommended)");
6400         break;
6401 
6402       case TM_ALL_PARTS:
6403         strcat (TempString, " - Scan e-mail body and all attachments");
6404         break;
6405 
6406       case TM_ALL_PARTS_HEADER:
6407         strcat (TempString, " - Scan all parts of the e-mail");
6408         break;
6409 
6410       case TM_JUST_HEADER:
6411         strcat (TempString, " - Scan just the header (mail routing information)");
6412         break;
6413 
6414       default:
6415         break;
6416     }
6417     TempMenuItemPntr =
6418       new BMenuItem (TempString, new BMessage (CommandMessage));
6419     if (TempMenuItemPntr == NULL) goto ErrorExit;
6420     TempMenuItemPntr->SetTarget (be_app);
6421     m_TokenizeModePopUpMenuPntr->AddItem (TempMenuItemPntr);
6422   }
6423   m_TokenizeModeMenuBarPntr->AddItem (m_TokenizeModePopUpMenuPntr);
6424   AddChild (m_TokenizeModeMenuBarPntr);
6425 
6426   /* This row just contains a huge pop-up menu which shows the scoring mode
6427   and an explanation of what each mode does. */
6428 
6429   RowTop += RowHeight /* previous row's RowHeight */;
6430   TempRect = Bounds ();
6431   RowHeight = g_PopUpMenuHeight;
6432   RowHeight = ceilf (RowHeight * 1.1);
6433 
6434   Margin = ceilf ((RowHeight - g_PopUpMenuHeight) / 2);
6435   TempRect.top = RowTop + Margin;
6436   TempRect.bottom = TempRect.top + g_PopUpMenuHeight;
6437 
6438   m_ScoringModeCachedValue = SM_MAX; /* Illegal value will force redraw. */
6439   m_ScoringModeMenuBarPntr = new BMenuBar (TempRect, "ScoringModeMenuBar",
6440     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
6441     false /* resize to fit items */);
6442   if (m_ScoringModeMenuBarPntr == NULL) goto ErrorExit;
6443   m_ScoringModePopUpMenuPntr = new BPopUpMenu ("ScoringModePopUpMenu");
6444   if (m_ScoringModePopUpMenuPntr == NULL) goto ErrorExit;
6445 
6446   for (ScoringMode = (ScoringModes) 0;
6447   ScoringMode < SM_MAX;
6448   ScoringMode = (ScoringModes) ((int) ScoringMode + 1))
6449   {
6450     /* Each different scoring mode gets its own menu item.  Selecting the item
6451     will send a canned command to the application to switch to the appropriate
6452     scoring mode.  An optional explanation of each mode is added to the mode
6453     name string. */
6454 
6455     CommandMessage.MakeEmpty ();
6456     CommandMessage.what = B_SET_PROPERTY;
6457     CommandMessage.AddSpecifier (g_PropertyNames[PN_SCORING_MODE]);
6458     CommandMessage.AddString (g_DataName, g_ScoringModeNames[ScoringMode]);
6459 /*
6460     strcpy (TempString, g_ScoringModeNames[ScoringMode]);
6461     switch (ScoringMode)
6462     {
6463       case SM_ROBINSON:
6464         strcat (TempString, " - Learning Method 1: Naive Bayesian");
6465         break;
6466 
6467       case SM_CHISQUARED:
6468         strcat (TempString, " - Learning Method 2: Chi-Squared");
6469         break;
6470 
6471       default:
6472         break;
6473     }
6474 */
6475     switch (ScoringMode)
6476     {
6477       case SM_ROBINSON:
6478         strcpy (TempString, "Learning method 1: Naive Bayesian");
6479         break;
6480 
6481       case SM_CHISQUARED:
6482         strcpy (TempString, "Learning method 2: Chi-Squared");
6483         break;
6484 
6485       default:
6486         break;
6487     }
6488     TempMenuItemPntr =
6489       new BMenuItem (TempString, new BMessage (CommandMessage));
6490     if (TempMenuItemPntr == NULL) goto ErrorExit;
6491     TempMenuItemPntr->SetTarget (be_app);
6492     m_ScoringModePopUpMenuPntr->AddItem (TempMenuItemPntr);
6493   }
6494   m_ScoringModeMenuBarPntr->AddItem (m_ScoringModePopUpMenuPntr);
6495   AddChild (m_ScoringModeMenuBarPntr);
6496 
6497   /* The next row has the install MIME types button and the reset to defaults
6498   button, one on the left and the other on the right. */
6499 
6500   RowTop += RowHeight /* previous row's RowHeight */;
6501   TempRect = Bounds ();
6502   RowHeight = g_ButtonHeight;
6503   RowHeight = ceilf (RowHeight * 1.1);
6504 
6505   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6506   TempRect.top = RowTop + Margin;
6507   TempRect.bottom = TempRect.top + g_ButtonHeight;
6508 
6509   CommandMessage.MakeEmpty ();
6510   CommandMessage.what = B_EXECUTE_PROPERTY;
6511   CommandMessage.AddSpecifier (g_PropertyNames[PN_INSTALL_THINGS]);
6512   m_InstallThingsButtonPntr = new BButton (TempRect, "Install Button",
6513     "Install spam types",
6514     new BMessage (CommandMessage),
6515     B_FOLLOW_LEFT | B_FOLLOW_TOP);
6516   if (m_InstallThingsButtonPntr == NULL) goto ErrorExit;
6517   AddChild (m_InstallThingsButtonPntr);
6518   m_InstallThingsButtonPntr->SetTarget (be_app);
6519   m_InstallThingsButtonPntr->ResizeToPreferred ();
6520 
6521   /* The Reset to Defaults button.  On the right side of the row. */
6522 
6523   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6524   TempRect = Bounds ();
6525   TempRect.top = RowTop + Margin;
6526   TempRect.bottom = TempRect.top + g_ButtonHeight;
6527 
6528   CommandMessage.MakeEmpty ();
6529   CommandMessage.what = B_EXECUTE_PROPERTY;
6530   CommandMessage.AddSpecifier (g_PropertyNames[PN_RESET_TO_DEFAULTS]);
6531   m_ResetToDefaultsButtonPntr = new BButton (TempRect, "Reset Button",
6532     "Default settings", new BMessage (CommandMessage),
6533     B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6534   if (m_ResetToDefaultsButtonPntr == NULL) goto ErrorExit;
6535   AddChild (m_ResetToDefaultsButtonPntr);
6536   m_ResetToDefaultsButtonPntr->SetTarget (be_app);
6537   m_ResetToDefaultsButtonPntr->ResizeToPreferred ();
6538   m_ResetToDefaultsButtonPntr->GetPreferredSize (&Width, &Height);
6539   m_ResetToDefaultsButtonPntr->MoveTo (TempRect.right - Width, TempRect.top);
6540 
6541   /* The next row contains the Estimate, Add Examples and About buttons. */
6542 
6543   RowTop += RowHeight /* previous row's RowHeight */;
6544   TempRect = Bounds ();
6545   X = TempRect.left;
6546   RowHeight = g_ButtonHeight;
6547   RowHeight = ceilf (RowHeight * 1.1);
6548 
6549   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6550   TempRect.top = RowTop + Margin;
6551   TempRect.bottom = TempRect.top + g_ButtonHeight;
6552   TempRect.left = X;
6553 
6554   m_EstimateSpamButtonPntr = new BButton (TempRect, "Estimate Button",
6555     "Scan a message",
6556     new BMessage (MSG_ESTIMATE_BUTTON),
6557     B_FOLLOW_LEFT | B_FOLLOW_TOP);
6558   if (m_EstimateSpamButtonPntr == NULL) goto ErrorExit;
6559   AddChild (m_EstimateSpamButtonPntr);
6560   m_EstimateSpamButtonPntr->SetTarget (this);
6561   m_EstimateSpamButtonPntr->ResizeToPreferred ();
6562   X = m_EstimateSpamButtonPntr->Frame().right + g_MarginBetweenControls;
6563 
6564   /* The Add Example button in the middle.  Does the same as the browse button,
6565   but don't tell anyone that! */
6566 
6567   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6568   TempRect.top = RowTop + Margin;
6569   TempRect.bottom = TempRect.top + g_ButtonHeight;
6570   TempRect.left = X;
6571 
6572   m_AddExampleButtonPntr = new BButton (TempRect, "Example Button",
6573     "Train spam filter on a message",
6574     new BMessage (MSG_BROWSE_BUTTON),
6575     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP,
6576     B_WILL_DRAW | B_NAVIGABLE | B_FULL_UPDATE_ON_RESIZE);
6577   if (m_AddExampleButtonPntr == NULL) goto ErrorExit;
6578   AddChild (m_AddExampleButtonPntr);
6579   m_AddExampleButtonPntr->SetTarget (this);
6580   m_AddExampleButtonPntr->ResizeToPreferred ();
6581   X = m_AddExampleButtonPntr->Frame().right + g_MarginBetweenControls;
6582 
6583   /* Add the About button on the right. */
6584 
6585   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6586   TempRect = Bounds ();
6587   TempRect.top = RowTop + Margin;
6588   TempRect.bottom = TempRect.top + g_ButtonHeight;
6589   TempRect.left = X;
6590 
6591   m_AboutButtonPntr = new BButton (TempRect, "About Button",
6592     "About…",
6593     new BMessage (B_ABOUT_REQUESTED),
6594     B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6595   if (m_AboutButtonPntr == NULL) goto ErrorExit;
6596   AddChild (m_AboutButtonPntr);
6597   m_AboutButtonPntr->SetTarget (be_app);
6598 
6599   /* This row displays various counters.  Starting with the genuine messages
6600   count on the left. */
6601 
6602   RowTop += RowHeight /* previous row's RowHeight */;
6603   TempRect = Bounds ();
6604   RowHeight = g_TextBoxHeight;
6605   RowHeight = ceilf (RowHeight * 1.1);
6606 
6607   StringPntr = "Genuine messages:";
6608   m_GenuineCountCachedValue = 87654321;
6609   sprintf (TempString, "%d", (int) m_GenuineCountCachedValue);
6610 
6611   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6612   TempRect = Bounds ();
6613   TempRect.top = RowTop + Margin;
6614   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6615   TempRect.right = TempRect.left +
6616     be_plain_font->StringWidth (StringPntr) +
6617     be_plain_font->StringWidth (TempString) +
6618     3 * g_MarginBetweenControls;
6619 
6620   m_GenuineCountTextboxPntr = new BTextControl (TempRect,
6621     "Genuine count",
6622     StringPntr /* label */,
6623     TempString /* text */,
6624     NULL /* no message */,
6625     B_FOLLOW_LEFT | B_FOLLOW_TOP,
6626     B_WILL_DRAW /* not B_NAVIGABLE */);
6627   AddChild (m_GenuineCountTextboxPntr);
6628   m_GenuineCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6629   m_GenuineCountTextboxPntr->SetDivider (
6630     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6631   m_GenuineCountTextboxPntr->SetEnabled (false); /* For display only. */
6632 
6633   /* The word count in the center. */
6634 
6635   StringPntr = "Word count:";
6636   m_WordCountCachedValue = 87654321;
6637   sprintf (TempString, "%d", (int) m_WordCountCachedValue);
6638 
6639   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6640   TempRect = Bounds ();
6641   TempRect.top = RowTop + Margin;
6642   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6643   Width = be_plain_font->StringWidth (StringPntr) +
6644     be_plain_font->StringWidth (TempString) +
6645     3 * g_MarginBetweenControls;
6646   TempRect.left = ceilf ((TempRect.right - TempRect.left) / 2 - Width / 2);
6647   TempRect.right = TempRect.left + Width;
6648 
6649   m_WordCountTextboxPntr = new BTextControl (TempRect,
6650     "Word count",
6651     StringPntr /* label */,
6652     TempString /* text */,
6653     NULL /* no message */,
6654     B_FOLLOW_H_CENTER | B_FOLLOW_TOP,
6655     B_WILL_DRAW /* not B_NAVIGABLE */);
6656   AddChild (m_WordCountTextboxPntr);
6657   m_WordCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6658   m_WordCountTextboxPntr->SetDivider (
6659     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6660   m_WordCountTextboxPntr->SetEnabled (false); /* For display only. */
6661 
6662   /* The spam count on the far right. */
6663 
6664   StringPntr = "Spam messages:";
6665   m_SpamCountCachedValue = 87654321;
6666   sprintf (TempString, "%d", (int) m_SpamCountCachedValue);
6667 
6668   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6669   TempRect = Bounds ();
6670   TempRect.top = RowTop + Margin;
6671   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6672   TempRect.left = TempRect.right -
6673     be_plain_font->StringWidth (StringPntr) -
6674     be_plain_font->StringWidth (TempString) -
6675     3 * g_MarginBetweenControls;
6676 
6677   m_SpamCountTextboxPntr = new BTextControl (TempRect,
6678     "Spam count",
6679     StringPntr /* label */,
6680     TempString /* text */,
6681     NULL /* no message */,
6682     B_FOLLOW_RIGHT | B_FOLLOW_TOP,
6683     B_WILL_DRAW /* not B_NAVIGABLE */);
6684   AddChild (m_SpamCountTextboxPntr);
6685   m_SpamCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6686   m_SpamCountTextboxPntr->SetDivider (
6687     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6688   m_SpamCountTextboxPntr->SetEnabled (false); /* For display only. */
6689 
6690   /* Change the size of our view so it only takes up the space needed by the
6691   buttons. */
6692 
6693   RowTop += RowHeight /* previous row's RowHeight */;
6694   ResizeTo (Bounds().Width(), RowTop - Bounds().top + 1);
6695 
6696   return; /* Successful. */
6697 
6698 ErrorExit:
6699   DisplayErrorMessage ("Unable to initialise the controls view.");
6700 }
6701 
6702 
6703 void
6704 ControlsView::BrowseForDatabaseFile ()
6705 {
6706   if (m_BrowseFilePanelPntr == NULL)
6707   {
6708     BEntry      DirectoryEntry;
6709     entry_ref   DirectoryEntryRef;
6710     BMessage    GetDatabasePathCommand;
6711     BMessage    GetDatabasePathResult;
6712     const char *StringPntr = NULL;
6713 
6714     /* Create a new file panel.  First set up the entry ref stuff so that the
6715     file panel can open to show the initial directory (the one where the
6716     database file currently is).  Note that we have to create it after the
6717     window and view are up and running, otherwise the BMessenger won't point to
6718     a valid looper/handler.  First find out the current database file name to
6719     use as a starting point. */
6720 
6721     GetDatabasePathCommand.what = B_GET_PROPERTY;
6722     GetDatabasePathCommand.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
6723     be_app_messenger.SendMessage (&GetDatabasePathCommand,
6724       &GetDatabasePathResult, 5000000 /* delivery timeout */,
6725       5000000 /* reply timeout */);
6726     if (GetDatabasePathResult.FindString (g_ResultName, &StringPntr) != B_OK ||
6727     DirectoryEntry.SetTo (StringPntr) != B_OK ||
6728     DirectoryEntry.GetParent (&DirectoryEntry) != B_OK)
6729       DirectoryEntry.SetTo ("."); /* Default directory if we can't find it. */
6730     if (DirectoryEntry.GetRef (&DirectoryEntryRef) != B_OK)
6731     {
6732       DisplayErrorMessage (
6733         "Unable to set up the file requestor starting directory.  Sorry.");
6734       return;
6735     }
6736 
6737     m_BrowseFilePanelPntr = new BFilePanel (
6738       B_OPEN_PANEL /* mode */,
6739       &be_app_messenger /* target for event messages */,
6740       &DirectoryEntryRef /* starting directory */,
6741       B_FILE_NODE,
6742       true /* true for multiple selections */,
6743       NULL /* canned message */,
6744       NULL /* ref filter */,
6745       false /* true for modal */,
6746       true /* true to hide when done */);
6747   }
6748 
6749   if (m_BrowseFilePanelPntr != NULL)
6750     m_BrowseFilePanelPntr->Show (); /* Answer returned later in RefsReceived. */
6751 }
6752 
6753 
6754 void
6755 ControlsView::BrowseForFileToEstimate ()
6756 {
6757   if (m_EstimateSpamFilePanelPntr == NULL)
6758   {
6759     BEntry      DirectoryEntry;
6760     entry_ref   DirectoryEntryRef;
6761     status_t    ErrorCode;
6762     BMessenger  MessengerToSelf (this);
6763     BPath       PathToMailDirectory;
6764 
6765     /* Create a new file panel.  First set up the entry ref stuff so that the
6766     file panel can open to show the initial directory (the user's mail
6767     directory).  Note that we have to create the panel after the window and
6768     view are up and running, otherwise the BMessenger won't point to a valid
6769     looper/handler. */
6770 
6771     ErrorCode = find_directory (B_USER_DIRECTORY, &PathToMailDirectory);
6772     if (ErrorCode == B_OK)
6773     {
6774       PathToMailDirectory.Append ("mail");
6775       ErrorCode = DirectoryEntry.SetTo (PathToMailDirectory.Path(),
6776         true /* traverse symbolic links*/);
6777       if (ErrorCode != B_OK || !DirectoryEntry.Exists ())
6778       {
6779         /* If no mail directory, try home directory. */
6780         find_directory (B_USER_DIRECTORY, &PathToMailDirectory);
6781         ErrorCode = DirectoryEntry.SetTo (PathToMailDirectory.Path(), true);
6782       }
6783     }
6784     if (ErrorCode != B_OK)
6785       PathToMailDirectory.SetTo (".");
6786 
6787     DirectoryEntry.SetTo (PathToMailDirectory.Path(), true);
6788     if (DirectoryEntry.GetRef (&DirectoryEntryRef) != B_OK)
6789     {
6790       DisplayErrorMessage (
6791         "Unable to set up the file requestor starting directory.  Sorry.");
6792       return;
6793     }
6794 
6795     m_EstimateSpamFilePanelPntr = new BFilePanel (
6796       B_OPEN_PANEL /* mode */,
6797       &MessengerToSelf /* target for event messages */,
6798       &DirectoryEntryRef /* starting directory */,
6799       B_FILE_NODE,
6800       true /* true for multiple selections */,
6801       new BMessage (MSG_ESTIMATE_FILE_REFS) /* canned message */,
6802       NULL /* ref filter */,
6803       false /* true for modal */,
6804       true /* true to hide when done */);
6805   }
6806 
6807   if (m_EstimateSpamFilePanelPntr != NULL)
6808     m_EstimateSpamFilePanelPntr->Show (); /* Answer sent via a message. */
6809 }
6810 
6811 
6812 /* The display has been resized.  Have to manually adjust the popup menu bar to
6813 show the new size (the sub-items need to be resized too).  Then make it redraw.
6814 Well, actually just resetting the mark on the current item will resize it
6815 properly. */
6816 
6817 void
6818 ControlsView::FrameResized (float, float)
6819 {
6820   m_ScoringModeCachedValue = SM_MAX; /* Force it to reset the mark. */
6821   m_TokenizeModeCachedValue = TM_MAX; /* Force it to reset the mark. */
6822 }
6823 
6824 
6825 void
6826 ControlsView::MessageReceived (BMessage *MessagePntr)
6827 {
6828   BMessage CommandMessage;
6829   bool     TempBool;
6830   uint32   TempUint32;
6831 
6832   switch (MessagePntr->what)
6833   {
6834     case MSG_BROWSE_BUTTON:
6835       BrowseForDatabaseFile ();
6836       break;
6837 
6838     case MSG_DATABASE_NAME:
6839       if (strcmp (m_DatabaseFileNameCachedValue,
6840       m_DatabaseFileNameTextboxPntr->Text ()) != 0)
6841         SubmitCommandString (PN_DATABASE_FILE, B_SET_PROPERTY,
6842         m_DatabaseFileNameTextboxPntr->Text ());
6843       break;
6844 
6845     case MSG_ESTIMATE_BUTTON:
6846       BrowseForFileToEstimate ();
6847       break;
6848 
6849     case MSG_ESTIMATE_FILE_REFS:
6850       EstimateRefFilesAndDisplay (MessagePntr);
6851       break;
6852 
6853     case MSG_IGNORE_CLASSIFICATION:
6854       TempBool = (m_IgnorePreviousClassCheckboxPntr->Value() == B_CONTROL_ON);
6855       if (m_IgnorePreviousClassCachedValue != TempBool)
6856         SubmitCommandBool (PN_IGNORE_PREVIOUS_CLASSIFICATION,
6857         B_SET_PROPERTY, TempBool);
6858       break;
6859 
6860     case MSG_PURGE_AGE:
6861       TempUint32 = strtoul (m_PurgeAgeTextboxPntr->Text (), NULL, 10);
6862       if (m_PurgeAgeCachedValue != TempUint32)
6863         SubmitCommandInt32 (PN_PURGE_AGE, B_SET_PROPERTY, TempUint32);
6864       break;
6865 
6866     case MSG_PURGE_POPULARITY:
6867       TempUint32 = strtoul (m_PurgePopularityTextboxPntr->Text (), NULL, 10);
6868       if (m_PurgePopularityCachedValue != TempUint32)
6869         SubmitCommandInt32 (PN_PURGE_POPULARITY, B_SET_PROPERTY, TempUint32);
6870       break;
6871 
6872     case MSG_SERVER_MODE:
6873       TempBool = (m_ServerModeCheckboxPntr->Value() == B_CONTROL_ON);
6874       if (m_ServerModeCachedValue != TempBool)
6875         SubmitCommandBool (PN_SERVER_MODE, B_SET_PROPERTY, TempBool);
6876       break;
6877 
6878     default:
6879       BView::MessageReceived (MessagePntr);
6880   }
6881 }
6882 
6883 
6884 /* Check the server for changes in the state of the database, and if there are
6885 any changes, update the displayed values.  Since this is a read only
6886 examination of the server, we go directly to the application rather than
6887 sending it messages.  Also, when sending messages, we can't find out what it is
6888 doing while it is busy with a batch of spam additions (all the spam add
6889 commands will be in the queue ahead of our requests for info).  Instead, we
6890 lock the BApplication (so it isn't changing things while we're looking) and
6891 retrieve our values. */
6892 
6893 void
6894 ControlsView::PollServerForChanges ()
6895 {
6896   ABSApp     *MyAppPntr;
6897   BMenuItem  *TempMenuItemPntr;
6898   char        TempString [PATH_MAX];
6899   BWindow    *WindowPntr;
6900 
6901   /* We need a pointer to our window, for changing the title etc. */
6902 
6903   WindowPntr = Window ();
6904   if (WindowPntr == NULL)
6905     return; /* No window, no point in updating the display! */
6906 
6907   /* Check the server mode flag.  If the mode is off, then the window has to be
6908   minimized.  Similarly, if it gets turned on, maximize the window.  Note that
6909   the user can maximize the window manually, even while still in server mode.
6910   */
6911 
6912   if (g_ServerMode != m_ServerModeCachedValue &&
6913   m_ServerModeCheckboxPntr != NULL)
6914   {
6915     m_ServerModeCachedValue = g_ServerMode;
6916     m_ServerModeCheckboxPntr->SetValue (
6917       m_ServerModeCachedValue ? B_CONTROL_ON : B_CONTROL_OFF);
6918     WindowPntr->Minimize (m_ServerModeCachedValue);
6919   }
6920 
6921   if (WindowPntr->IsMinimized ())
6922     return; /* Window isn't visible, don't waste time updating it. */
6923 
6924   /* So that people don't stare at a blank screen, request a database load if
6925   nothing is there.  But only do it once, so the user doesn't get a lot of
6926   invalid database messages if one doesn't exist yet.  In server mode, we never
6927   get this far so it is only loaded when the user wants to see something. */
6928 
6929   if (!m_DatabaseLoadDone)
6930   {
6931     m_DatabaseLoadDone = true;
6932     /* Counting the number of words will load the database. */
6933     SubmitCommandString (PN_DATABASE_FILE, B_COUNT_PROPERTIES, "");
6934   }
6935 
6936   /* Check various read only values, which can be read from the BApplication
6937   without having to lock it.  This is useful for displaying the number of words
6938   as it is changing.  First up is the purge age setting. */
6939 
6940   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
6941   if (MyAppPntr == NULL)
6942     return; /* Doesn't exist or is the wrong class.  Not likely! */
6943 
6944   if (MyAppPntr->m_PurgeAge != m_PurgeAgeCachedValue &&
6945   m_PurgeAgeTextboxPntr != NULL)
6946   {
6947     m_PurgeAgeCachedValue = MyAppPntr->m_PurgeAge;
6948     sprintf (TempString, "%lu", m_PurgeAgeCachedValue);
6949     m_PurgeAgeTextboxPntr->SetText (TempString);
6950   }
6951 
6952   /* Check the purge popularity. */
6953 
6954   if (MyAppPntr->m_PurgePopularity != m_PurgePopularityCachedValue &&
6955   m_PurgePopularityTextboxPntr != NULL)
6956   {
6957     m_PurgePopularityCachedValue = MyAppPntr->m_PurgePopularity;
6958     sprintf (TempString, "%lu", m_PurgePopularityCachedValue);
6959     m_PurgePopularityTextboxPntr->SetText (TempString);
6960   }
6961 
6962   /* Check the Ignore Previous Classification flag. */
6963 
6964   if (MyAppPntr->m_IgnorePreviousClassification !=
6965   m_IgnorePreviousClassCachedValue &&
6966   m_IgnorePreviousClassCheckboxPntr != NULL)
6967   {
6968     m_IgnorePreviousClassCachedValue =
6969       MyAppPntr->m_IgnorePreviousClassification;
6970     m_IgnorePreviousClassCheckboxPntr->SetValue (
6971       m_IgnorePreviousClassCachedValue ? B_CONTROL_ON : B_CONTROL_OFF);
6972   }
6973 
6974   /* Update the genuine count. */
6975 
6976   if (MyAppPntr->m_TotalGenuineMessages != m_GenuineCountCachedValue &&
6977   m_GenuineCountTextboxPntr != NULL)
6978   {
6979     m_GenuineCountCachedValue = MyAppPntr->m_TotalGenuineMessages;
6980     sprintf (TempString, "%lu", m_GenuineCountCachedValue);
6981     m_GenuineCountTextboxPntr->SetText (TempString);
6982   }
6983 
6984   /* Update the spam count. */
6985 
6986   if (MyAppPntr->m_TotalSpamMessages != m_SpamCountCachedValue &&
6987   m_SpamCountTextboxPntr != NULL)
6988   {
6989     m_SpamCountCachedValue = MyAppPntr->m_TotalSpamMessages;
6990     sprintf (TempString, "%lu", m_SpamCountCachedValue);
6991     m_SpamCountTextboxPntr->SetText (TempString);
6992   }
6993 
6994   /* Update the word count. */
6995 
6996   if (MyAppPntr->m_WordCount != m_WordCountCachedValue &&
6997   m_WordCountTextboxPntr != NULL)
6998   {
6999     m_WordCountCachedValue = MyAppPntr->m_WordCount;
7000     sprintf (TempString, "%lu", m_WordCountCachedValue);
7001     m_WordCountTextboxPntr->SetText (TempString);
7002   }
7003 
7004   /* Update the tokenize mode pop-up menu. */
7005 
7006   if (MyAppPntr->m_TokenizeMode != m_TokenizeModeCachedValue &&
7007   m_TokenizeModePopUpMenuPntr != NULL)
7008   {
7009     m_TokenizeModeCachedValue = MyAppPntr->m_TokenizeMode;
7010     TempMenuItemPntr =
7011       m_TokenizeModePopUpMenuPntr->ItemAt ((int) m_TokenizeModeCachedValue);
7012     if (TempMenuItemPntr != NULL)
7013       TempMenuItemPntr->SetMarked (true);
7014   }
7015 
7016   /* Update the scoring mode pop-up menu. */
7017 
7018   if (MyAppPntr->m_ScoringMode != m_ScoringModeCachedValue &&
7019   m_ScoringModePopUpMenuPntr != NULL)
7020   {
7021     m_ScoringModeCachedValue = MyAppPntr->m_ScoringMode;
7022     TempMenuItemPntr =
7023       m_ScoringModePopUpMenuPntr->ItemAt ((int) m_ScoringModeCachedValue);
7024     if (TempMenuItemPntr != NULL)
7025       TempMenuItemPntr->SetMarked (true);
7026   }
7027 
7028   /* Lock the application.  This will stop it from processing any further
7029   messages until we are done.  Or if it is busy, the lock will fail. */
7030 
7031   if (MyAppPntr->LockWithTimeout (100000) != B_OK)
7032     return; /* It's probably busy doing something. */
7033 
7034   /* See if the database file name has changed. */
7035 
7036   if (strcmp (MyAppPntr->m_DatabaseFileName.String (),
7037   m_DatabaseFileNameCachedValue) != 0 &&
7038   m_DatabaseFileNameTextboxPntr != NULL)
7039   {
7040     strcpy (m_DatabaseFileNameCachedValue,
7041       MyAppPntr->m_DatabaseFileName.String ());
7042     m_DatabaseFileNameTextboxPntr->SetText (m_DatabaseFileNameCachedValue);
7043     WindowPntr->SetTitle (m_DatabaseFileNameCachedValue);
7044   }
7045 
7046   /* Done.  Let the BApplication continue processing messages. */
7047 
7048   MyAppPntr->Unlock ();
7049 }
7050 
7051 
7052 void
7053 ControlsView::Pulse ()
7054 {
7055   if (system_time () > m_TimeOfLastPoll + 200000)
7056   {
7057     PollServerForChanges ();
7058     m_TimeOfLastPoll = system_time ();
7059   }
7060 }
7061 
7062 
7063 
7064 /******************************************************************************
7065  * Implementation of the DatabaseWindow class, constructor, destructor and the
7066  * rest of the member functions in mostly alphabetical order.
7067  */
7068 
7069 DatabaseWindow::DatabaseWindow ()
7070 : BWindow (BRect (30, 30, 620, 400),
7071     "Haiku spam filter server",
7072     B_DOCUMENT_WINDOW, B_ASYNCHRONOUS_CONTROLS)
7073 {
7074   BRect TempRect;
7075 
7076   /* Add the controls view. */
7077 
7078   m_ControlsViewPntr = new ControlsView (Bounds ());
7079   if (m_ControlsViewPntr == NULL)
7080     goto ErrorExit;
7081   AddChild (m_ControlsViewPntr);
7082 
7083   /* Add the word view in the remaining space under the controls view. */
7084 
7085 
7086   TempRect = Bounds ();
7087   TempRect.top = m_ControlsViewPntr->Frame().bottom + 1;
7088   m_WordsViewPntr = new WordsView (TempRect);
7089   if (m_WordsViewPntr == NULL)
7090     goto ErrorExit;
7091   AddChild (m_WordsViewPntr);
7092 
7093  /* Minimize the window if we are starting up in server mode.  This is done
7094 	before the window is open so it doesn't flash onto the screen, and possibly
7095 	steal a keystroke or two.  The ControlsView will further update the minimize
7096 	mode when it detects changes in the server mode. */
7097   Minimize (g_ServerMode);
7098 
7099   return;
7100 
7101 ErrorExit:
7102   DisplayErrorMessage ("Unable to initialise the window contents.");
7103 }
7104 
7105 
7106 void
7107 DatabaseWindow::MessageReceived (BMessage *MessagePntr)
7108 {
7109   if (MessagePntr->what == B_MOUSE_WHEEL_CHANGED)
7110   {
7111     /* Pass the mouse wheel stuff down to the words view, since that's the only
7112     one which does scrolling so we don't need to worry about whether it has
7113     focus or not. */
7114 
7115     if (m_WordsViewPntr != NULL)
7116       m_WordsViewPntr->MessageReceived (MessagePntr);
7117   }
7118   else
7119     BWindow::MessageReceived (MessagePntr);
7120 }
7121 
7122 
7123 bool
7124 DatabaseWindow::QuitRequested ()
7125 {
7126   be_app->PostMessage (B_QUIT_REQUESTED);
7127   return true;
7128 }
7129 
7130 
7131 
7132 /******************************************************************************
7133  * Implementation of the word display view.
7134  */
7135 
7136 WordsView::WordsView (BRect NewBounds)
7137 : BView (NewBounds, "WordsView", B_FOLLOW_ALL_SIDES,
7138     B_WILL_DRAW | B_FULL_UPDATE_ON_RESIZE | B_NAVIGABLE | B_PULSE_NEEDED),
7139   m_ArrowLineDownPntr (NULL),
7140   m_ArrowLineUpPntr (NULL),
7141   m_ArrowPageDownPntr (NULL),
7142   m_ArrowPageUpPntr (NULL),
7143   m_LastTimeAKeyWasPressed (0)
7144 {
7145   font_height TempFontHeight;
7146 
7147   GetFont (&m_TextFont); /* Modify the default font to be our own. */
7148   m_TextFont.SetSize (ceilf (m_TextFont.Size() * 1.1));
7149   m_TextFont.GetHeight (&TempFontHeight);
7150   SetFont (&m_TextFont);
7151 
7152   m_LineHeight = ceilf (TempFontHeight.ascent +
7153     TempFontHeight.descent + TempFontHeight.leading);
7154   m_AscentHeight = ceilf (TempFontHeight.ascent);
7155   m_TextHeight = ceilf (TempFontHeight.ascent +
7156     TempFontHeight.descent);
7157 
7158   m_FocusedColour.red = 255;
7159   m_FocusedColour.green = 255;
7160   m_FocusedColour.blue = 255;
7161   m_FocusedColour.alpha = 255;
7162 
7163   m_UnfocusedColour.red = 245;
7164   m_UnfocusedColour.green = 245;
7165   m_UnfocusedColour.blue = 255;
7166   m_UnfocusedColour.alpha = 255;
7167 
7168   m_BackgroundColour = m_UnfocusedColour;
7169   SetViewColor (m_BackgroundColour);
7170   SetLowColor (m_BackgroundColour);
7171   SetHighColor (0, 0, 0);
7172 
7173   strcpy (m_FirstDisplayedWord, "a");
7174 }
7175 
7176 
7177 void
7178 WordsView::AttachedToWindow ()
7179 {
7180   BPolygon        DownLinePolygon (g_DownLinePoints,
7181                     sizeof (g_DownLinePoints) /
7182                     sizeof (g_DownLinePoints[0]));
7183 
7184   BPolygon        DownPagePolygon (g_DownPagePoints,
7185                     sizeof (g_DownPagePoints) /
7186                     sizeof (g_DownPagePoints[0]));
7187 
7188   BPolygon        UpLinePolygon (g_UpLinePoints,
7189                     sizeof (g_UpLinePoints) /
7190                     sizeof (g_UpLinePoints[0]));
7191 
7192   BPolygon        UpPagePolygon (g_UpPagePoints,
7193                     sizeof (g_UpPagePoints) /
7194                     sizeof (g_UpPagePoints[0]));
7195 
7196   BPicture        TempOffPicture;
7197   BPicture        TempOnPicture;
7198   BRect           TempRect;
7199 
7200   /* Make the buttons and associated polygon images for the forward and
7201   backwards a word or a page of words buttons.  They're the width of the scroll
7202   bar area on the right, but twice as tall as usual, since there is no scroll
7203   bar and that will make it easier to use them.  First the up a line button. */
7204 
7205   SetHighColor (0, 0, 0);
7206   BeginPicture (&TempOffPicture);
7207   FillPolygon (&UpLinePolygon);
7208   SetHighColor (180, 180, 180);
7209   StrokePolygon (&UpLinePolygon);
7210   EndPicture ();
7211 
7212   SetHighColor (128, 128, 128);
7213   BeginPicture (&TempOnPicture);
7214   FillPolygon (&UpLinePolygon);
7215   EndPicture ();
7216 
7217   TempRect = Bounds ();
7218   TempRect.bottom = TempRect.top + 2 * B_H_SCROLL_BAR_HEIGHT;
7219   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7220   m_ArrowLineUpPntr = new BPictureButton (TempRect, "Up Line",
7221     &TempOffPicture, &TempOnPicture,
7222     new BMessage (MSG_LINE_UP), B_ONE_STATE_BUTTON,
7223     B_FOLLOW_RIGHT | B_FOLLOW_TOP, B_WILL_DRAW | B_NAVIGABLE);
7224   if (m_ArrowLineUpPntr == NULL) goto ErrorExit;
7225   AddChild (m_ArrowLineUpPntr);
7226   m_ArrowLineUpPntr->SetTarget (this);
7227 
7228   /* Up a page button. */
7229 
7230   SetHighColor (0, 0, 0);
7231   BeginPicture (&TempOffPicture);
7232   FillPolygon (&UpPagePolygon);
7233   SetHighColor (180, 180, 180);
7234   StrokePolygon (&UpPagePolygon);
7235   EndPicture ();
7236 
7237   SetHighColor (128, 128, 128);
7238   BeginPicture (&TempOnPicture);
7239   FillPolygon (&UpPagePolygon);
7240   EndPicture ();
7241 
7242   TempRect = Bounds ();
7243   TempRect.top += 2 * B_H_SCROLL_BAR_HEIGHT + 1;
7244   TempRect.bottom = TempRect.top + 2 * B_H_SCROLL_BAR_HEIGHT;
7245   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7246   m_ArrowPageUpPntr = new BPictureButton (TempRect, "Up Page",
7247     &TempOffPicture, &TempOnPicture,
7248     new BMessage (MSG_PAGE_UP), B_ONE_STATE_BUTTON,
7249     B_FOLLOW_RIGHT | B_FOLLOW_TOP, B_WILL_DRAW | B_NAVIGABLE);
7250   if (m_ArrowPageUpPntr == NULL) goto ErrorExit;
7251   AddChild (m_ArrowPageUpPntr);
7252   m_ArrowPageUpPntr->SetTarget (this);
7253 
7254   /* Down a page button. */
7255 
7256   SetHighColor (0, 0, 0);
7257   BeginPicture (&TempOffPicture);
7258   FillPolygon (&DownPagePolygon);
7259   SetHighColor (180, 180, 180);
7260   StrokePolygon (&DownPagePolygon);
7261   EndPicture ();
7262 
7263   SetHighColor (128, 128, 128);
7264   BeginPicture (&TempOnPicture);
7265   FillPolygon (&DownPagePolygon);
7266   EndPicture ();
7267 
7268   TempRect = Bounds ();
7269   TempRect.bottom -= 3 * B_H_SCROLL_BAR_HEIGHT + 1;
7270   TempRect.top = TempRect.bottom - 2 * B_H_SCROLL_BAR_HEIGHT;
7271   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7272   m_ArrowPageDownPntr = new BPictureButton (TempRect, "Down Page",
7273     &TempOffPicture, &TempOnPicture,
7274     new BMessage (MSG_PAGE_DOWN), B_ONE_STATE_BUTTON,
7275     B_FOLLOW_RIGHT | B_FOLLOW_BOTTOM, B_WILL_DRAW | B_NAVIGABLE);
7276   if (m_ArrowPageDownPntr == NULL) goto ErrorExit;
7277   AddChild (m_ArrowPageDownPntr);
7278   m_ArrowPageDownPntr->SetTarget (this);
7279 
7280   /* Down a line button. */
7281 
7282   SetHighColor (0, 0, 0);
7283   BeginPicture (&TempOffPicture);
7284   FillPolygon (&DownLinePolygon);
7285   SetHighColor (180, 180, 180);
7286   StrokePolygon (&DownLinePolygon);
7287   EndPicture ();
7288 
7289   SetHighColor (128, 128, 128);
7290   BeginPicture (&TempOnPicture);
7291   FillPolygon (&DownLinePolygon);
7292   EndPicture ();
7293 
7294   TempRect = Bounds ();
7295   TempRect.bottom -= B_H_SCROLL_BAR_HEIGHT;
7296   TempRect.top = TempRect.bottom - 2 * B_H_SCROLL_BAR_HEIGHT;
7297   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7298   m_ArrowLineDownPntr = new BPictureButton (TempRect, "Down Line",
7299     &TempOffPicture, &TempOnPicture,
7300     new BMessage (MSG_LINE_DOWN), B_ONE_STATE_BUTTON,
7301     B_FOLLOW_RIGHT | B_FOLLOW_BOTTOM, B_WILL_DRAW | B_NAVIGABLE);
7302   if (m_ArrowLineDownPntr == NULL) goto ErrorExit;
7303   AddChild (m_ArrowLineDownPntr);
7304   m_ArrowLineDownPntr->SetTarget (this);
7305 
7306   return;
7307 
7308 ErrorExit:
7309   DisplayErrorMessage ("Problems while making view displaying the words.");
7310 }
7311 
7312 
7313 /* Draw the words starting with the one at or after m_FirstDisplayedWord.  This
7314 requires looking at the database in the BApplication, which may or may not be
7315 available (if it isn't, don't draw, a redraw will usually be requested by the
7316 Pulse member function when it keeps on noticing that the stuff on the display
7317 doesn't match the database). */
7318 
7319 void
7320 WordsView::Draw (BRect UpdateRect)
7321 {
7322   float                   AgeDifference;
7323   float                   AgeProportion;
7324   float                   CenterX;
7325   float                   ColumnLeftCenterX;
7326   float                   ColumnMiddleCenterX;
7327   float                   ColumnRightCenterX;
7328   float                   CompensatedRatio;
7329   StatisticsMap::iterator DataIter;
7330   StatisticsMap::iterator EndIter;
7331   rgb_color               FillColour;
7332   float                   GenuineProportion;
7333   uint32                  GenuineSpamSum;
7334   float                   HeightPixels;
7335   float                   HeightProportion;
7336   float                   LeftBounds;
7337   ABSApp                 *MyAppPntr;
7338   uint32                  NewestAge;
7339   uint32                  OldestAge;
7340   float                   OneFifthTotalGenuine;
7341   float                   OneFifthTotalSpam;
7342   double                  RawProbabilityRatio;
7343   float                   RightBounds;
7344   float                   SpamProportion;
7345   StatisticsPointer       StatisticsPntr;
7346   BRect                   TempRect;
7347   char                    TempString [PATH_MAX];
7348   float                   TotalGenuineMessages = 1.0; /* Avoid divide by 0. */
7349   float                   TotalSpamMessages = 1.0;
7350   float                   Width;
7351   float                   Y;
7352 
7353   /* Lock the application.  This will stop it from processing any further
7354   messages until we are done.  Or if it is busy, the lock will fail. */
7355 
7356   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7357   if (MyAppPntr == NULL || MyAppPntr->LockWithTimeout (100000) != B_OK)
7358     return; /* It's probably busy doing something. */
7359 
7360   /* Set up various loop invariant variables. */
7361 
7362   if (MyAppPntr->m_TotalGenuineMessages > 0)
7363     TotalGenuineMessages = MyAppPntr->m_TotalGenuineMessages;
7364   OneFifthTotalGenuine = TotalGenuineMessages / 5;
7365 
7366   if (MyAppPntr->m_TotalSpamMessages > 0)
7367     TotalSpamMessages = MyAppPntr->m_TotalSpamMessages;
7368   OneFifthTotalSpam = TotalSpamMessages / 5;
7369 
7370   EndIter = MyAppPntr->m_WordMap.end ();
7371 
7372   OldestAge = MyAppPntr->m_OldestAge;
7373   NewestAge = /* actually newest age plus one */
7374     MyAppPntr->m_TotalGenuineMessages + MyAppPntr->m_TotalSpamMessages;
7375 
7376   if (NewestAge == 0)
7377     goto NormalExit; /* No words to display, or something is badly wrong. */
7378 
7379   NewestAge--; /* The newest message has age NewestAge. */
7380   AgeDifference = NewestAge - OldestAge; /* Can be zero if just one message. */
7381 
7382   LeftBounds = Bounds().left;
7383   RightBounds = Bounds().right - B_V_SCROLL_BAR_WIDTH;
7384   Width = RightBounds - LeftBounds;
7385   FillColour.alpha = 255;
7386 
7387   CenterX = ceilf (LeftBounds + Width * 0.5);
7388   ColumnLeftCenterX = ceilf (LeftBounds + Width * 0.05);
7389   ColumnMiddleCenterX = CenterX;
7390   ColumnRightCenterX = ceilf (LeftBounds + Width * 0.95);
7391 
7392   for (DataIter = MyAppPntr->m_WordMap.lower_bound (m_FirstDisplayedWord),
7393   Y = Bounds().top;
7394   DataIter != EndIter && Y < UpdateRect.bottom;
7395   DataIter++, Y += m_LineHeight)
7396   {
7397     if (Y + m_LineHeight < UpdateRect.top)
7398       continue; /* Not in the visible area yet, don't actually draw. */
7399 
7400     /* Draw the colour bar behind the word.  It reflects the spamness or
7401     genuineness of that particular word, plus the importance of the word and
7402     the age of the word.
7403 
7404     First calculate the compensated spam ratio (described elsewhere).  It is
7405     close to 0.0 for genuine words and close to 1.0 for pure spam.  It is drawn
7406     as a blue bar to the left of center if it is less than 0.5, and a red bar
7407     on the right of center if it is greater than 0.5.  At exactly 0.5 nothing
7408     is drawn; the word is worthless as an indicator.
7409 
7410     The height of the bar corresponds to the number of messages the word was
7411     found in.  Make the height proportional to the total of spam and genuine
7412     messages for the word divided by the sum of the most extreme spam and
7413     genuine counts in the database.
7414 
7415     The staturation of the colour corresponds to the age of the word, with old
7416     words being almost white rather than solid blue or red. */
7417 
7418     StatisticsPntr = &DataIter->second;
7419 
7420     SpamProportion = StatisticsPntr->spamCount / TotalSpamMessages;
7421     GenuineProportion = StatisticsPntr->genuineCount / TotalGenuineMessages;
7422     if (SpamProportion + GenuineProportion > 0.0f)
7423       RawProbabilityRatio =
7424       SpamProportion / (SpamProportion + GenuineProportion);
7425     else
7426       RawProbabilityRatio = g_RobinsonX;
7427 
7428     /* The compensated ratio leans towards 0.5 (RobinsonX) more for fewer
7429     data points, with a weight of 0.45 (RobinsonS). */
7430 
7431     GenuineSpamSum =
7432       StatisticsPntr->spamCount + StatisticsPntr->genuineCount;
7433     CompensatedRatio =
7434       (g_RobinsonS * g_RobinsonX + GenuineSpamSum * RawProbabilityRatio) /
7435       (g_RobinsonS + GenuineSpamSum);
7436 
7437     /* Used to use the height based on the most frequent word, but some words,
7438     like "From", show up in all messages which made most other words just
7439     appear as a thin line.  I did a histogram plot of the sizes in my test
7440     database, and figured that you get better coverage of 90% of the messages
7441     if you use 1/5 of the total number as the count which gives you 100%
7442     height.  The other 10% get a full height bar, but most people wouldn't care
7443     that they're super frequently used. */
7444 
7445     HeightProportion = 0.5f * (StatisticsPntr->genuineCount /
7446       OneFifthTotalGenuine + StatisticsPntr->spamCount / OneFifthTotalSpam);
7447 
7448     if (HeightProportion > 1.0f)
7449       HeightProportion = 1.0f;
7450     HeightPixels = ceilf (HeightProportion * m_TextHeight);
7451 
7452     if (AgeDifference <= 0.0f)
7453       AgeProportion = 1.0; /* New is 1.0, old is 0.0 */
7454     else
7455       AgeProportion = (StatisticsPntr->age - OldestAge) / AgeDifference;
7456 
7457     TempRect.top = ceilf (Y + m_TextHeight / 2 - HeightPixels / 2);
7458     TempRect.bottom = TempRect.top + HeightPixels;
7459 
7460     if (CompensatedRatio < 0.5f)
7461     {
7462       TempRect.left = ceilf (
7463         CenterX - 1.6f * (0.5f - CompensatedRatio) * (CenterX - LeftBounds));
7464       TempRect.right = CenterX;
7465       FillColour.red = 230 - (int) (AgeProportion * 230.0f);
7466       FillColour.green = FillColour.red;
7467       FillColour.blue = 255;
7468     }
7469     else /* Ratio >= 0.5, red spam block. */
7470     {
7471       TempRect.left = CenterX;
7472       TempRect.right = ceilf (
7473         CenterX + 1.6f * (CompensatedRatio - 0.5f) * (RightBounds - CenterX));
7474       FillColour.blue = 230 - (int) (AgeProportion * 230.0f);
7475       FillColour.green = FillColour.blue;
7476       FillColour.red = 255;
7477     }
7478     SetHighColor (FillColour);
7479     SetDrawingMode (B_OP_COPY);
7480     FillRect (TempRect);
7481 
7482     /* Print the text centered in columns of various widths.  The number of
7483     genuine messages in the left 10% of the width, the word in the middle 80%,
7484     and the number of spam messages using the word in the right 10%. */
7485 
7486     SetHighColor (0, 0, 0);
7487     SetDrawingMode (B_OP_OVER); /* So that antialiased text mixes better. */
7488 
7489     sprintf (TempString, "%lu", StatisticsPntr->genuineCount);
7490     Width = m_TextFont.StringWidth (TempString);
7491     MovePenTo (ceilf (ColumnLeftCenterX - Width / 2), Y + m_AscentHeight);
7492     DrawString (TempString);
7493 
7494     strcpy (TempString, DataIter->first.c_str ());
7495     Width = m_TextFont.StringWidth (TempString);
7496     MovePenTo (ceilf (ColumnMiddleCenterX - Width / 2), Y + m_AscentHeight);
7497     DrawString (TempString);
7498 
7499     sprintf (TempString, "%lu", StatisticsPntr->spamCount);
7500     Width = m_TextFont.StringWidth (TempString);
7501     MovePenTo (ceilf (ColumnRightCenterX - Width / 2), Y + m_AscentHeight);
7502     DrawString (TempString);
7503   }
7504 
7505   /* Draw the first word (the one which the user types in to select the first
7506   displayed word) on the right, in the scroll bar margin, rotated 90 degrees to
7507   fit between the page up and page down buttons. */
7508 
7509   Width = m_TextFont.StringWidth (m_FirstDisplayedWord);
7510   if (Width > 0)
7511   {
7512     TempRect = Bounds ();
7513     TempRect.top += 4 * B_H_SCROLL_BAR_HEIGHT + 1;
7514     TempRect.bottom -= 5 * B_H_SCROLL_BAR_HEIGHT + 1;
7515 
7516     MovePenTo (TempRect.right - m_TextHeight + m_AscentHeight - 1,
7517       ceilf ((TempRect.bottom + TempRect.top) / 2 + Width / 2));
7518     m_TextFont.SetRotation (90);
7519     SetFont (&m_TextFont, B_FONT_ROTATION);
7520     DrawString (m_FirstDisplayedWord);
7521     m_TextFont.SetRotation (0);
7522     SetFont (&m_TextFont, B_FONT_ROTATION);
7523   }
7524 
7525 NormalExit:
7526 
7527   /* Successfully finished drawing.  Update the cached values to match what we
7528   have drawn. */
7529   m_CachedTotalGenuineMessages = MyAppPntr->m_TotalGenuineMessages;
7530   m_CachedTotalSpamMessages = MyAppPntr->m_TotalSpamMessages;
7531   m_CachedWordCount = MyAppPntr->m_WordCount;
7532 
7533   /* Done.  Let the BApplication continue processing messages. */
7534   MyAppPntr->Unlock ();
7535 }
7536 
7537 
7538 /* When the user presses keys, they select the first word to be displayed in
7539 the view (it's the word at or lexicographically after the word typed in).  The
7540 keys are appended to the starting word, until the user stops typing for a
7541 while, then the next key will be the first letter of a new starting word. */
7542 
7543 void
7544 WordsView::KeyDown (const char *BufferPntr, int32 NumBytes)
7545 {
7546   int32          CharLength;
7547   bigtime_t      CurrentTime;
7548   char           TempString [40];
7549 
7550   CurrentTime = system_time ();
7551 
7552   if (NumBytes < (int32) sizeof (TempString))
7553   {
7554     memcpy (TempString, BufferPntr, NumBytes);
7555     TempString [NumBytes] = 0;
7556     CharLength = strlen (TempString); /* So NUL bytes don't get through. */
7557 
7558     /* Check for arrow keys, which move the view up and down. */
7559 
7560     if (CharLength == 1 &&
7561     (TempString[0] == B_UP_ARROW ||
7562     TempString[0] == B_DOWN_ARROW ||
7563     TempString[0] == B_PAGE_UP ||
7564     TempString[0] == B_PAGE_DOWN))
7565     {
7566       MoveTextUpOrDown ((TempString[0] == B_UP_ARROW) ? MSG_LINE_UP :
7567         ((TempString[0] == B_DOWN_ARROW) ? MSG_LINE_DOWN :
7568         ((TempString[0] == B_PAGE_UP) ? MSG_PAGE_UP : MSG_PAGE_DOWN)));
7569     }
7570     else if (CharLength > 1 ||
7571     (CharLength == 1 && 32 <= (uint8) TempString[0]))
7572     {
7573       /* Have a non-control character, or some sort of multibyte char.  Add it
7574       to the word and mark things for redisplay starting at the resulting word.
7575       */
7576 
7577       if (CurrentTime - m_LastTimeAKeyWasPressed >= 1000000 /* microseconds */)
7578         strcpy (m_FirstDisplayedWord, TempString); /* Starting a new word. */
7579       else if (strlen (m_FirstDisplayedWord) + CharLength <= g_MaxWordLength)
7580         strcat (m_FirstDisplayedWord, TempString); /* Append to existing. */
7581 
7582       Invalidate ();
7583     }
7584   }
7585 
7586   m_LastTimeAKeyWasPressed = CurrentTime;
7587   BView::KeyDown (BufferPntr, NumBytes);
7588 }
7589 
7590 
7591 /* Change the background colour to show that we have the focus.  When we have
7592 it, keystrokes will select the word to be displayed at the top of the list. */
7593 
7594 void
7595 WordsView::MakeFocus (bool Focused)
7596 {
7597   if (Focused)
7598     m_BackgroundColour = m_FocusedColour;
7599   else
7600     m_BackgroundColour = m_UnfocusedColour;
7601   SetViewColor (m_BackgroundColour);
7602   SetLowColor (m_BackgroundColour);
7603 
7604   /* Also need to set the background colour for the scroll buttons, since they
7605   can't be made transparent. */
7606 
7607   if (m_ArrowLineDownPntr != NULL)
7608   {
7609     m_ArrowLineDownPntr->SetViewColor (m_BackgroundColour);
7610     m_ArrowLineDownPntr->Invalidate ();
7611   }
7612 
7613   if (m_ArrowLineUpPntr != NULL)
7614   {
7615     m_ArrowLineUpPntr->SetViewColor (m_BackgroundColour);
7616     m_ArrowLineUpPntr->Invalidate ();
7617   }
7618 
7619   if (m_ArrowPageDownPntr != NULL)
7620   {
7621     m_ArrowPageDownPntr->SetViewColor (m_BackgroundColour);
7622     m_ArrowPageDownPntr->Invalidate ();
7623   }
7624 
7625   if (m_ArrowPageUpPntr != NULL)
7626   {
7627     m_ArrowPageUpPntr->SetViewColor (m_BackgroundColour);
7628     m_ArrowPageUpPntr->Invalidate ();
7629   }
7630 
7631   Invalidate ();
7632 
7633   BView::MakeFocus (Focused);
7634 }
7635 
7636 
7637 void
7638 WordsView::MessageReceived (BMessage *MessagePntr)
7639 {
7640   int32     CountFound;
7641   float     DeltaY; /* Usually -1.0, 0.0 or +1.0. */
7642   type_code TypeFound;
7643 
7644   switch (MessagePntr->what)
7645   {
7646     case B_MOUSE_WHEEL_CHANGED:
7647       if (MessagePntr->FindFloat ("be:wheel_delta_y", &DeltaY) != 0) break;
7648       if (DeltaY < 0)
7649         MoveTextUpOrDown (MSG_LINE_UP);
7650       else if (DeltaY > 0)
7651         MoveTextUpOrDown (MSG_LINE_DOWN);
7652       break;
7653 
7654     case MSG_LINE_DOWN:
7655     case MSG_LINE_UP:
7656     case MSG_PAGE_DOWN:
7657     case MSG_PAGE_UP:
7658       MoveTextUpOrDown (MessagePntr->what);
7659       break;
7660 
7661     case B_SIMPLE_DATA: /* Something has been dropped in our view. */
7662       if (MessagePntr->GetInfo ("refs", &TypeFound, &CountFound) == B_OK &&
7663       CountFound > 0 && TypeFound == B_REF_TYPE)
7664       {
7665         RefsDroppedHere (MessagePntr);
7666         break;
7667       }
7668       /* Else fall through to the default case, in case it is something else
7669       dropped that the system knows about. */
7670 
7671     default:
7672       BView::MessageReceived (MessagePntr);
7673   }
7674 }
7675 
7676 
7677 /* If the user clicks on our view, take over the focus. */
7678 
7679 void
7680 WordsView::MouseDown (BPoint)
7681 {
7682   if (!IsFocus ())
7683     MakeFocus (true);
7684 }
7685 
7686 
7687 void
7688 WordsView::MoveTextUpOrDown (uint32 MovementType)
7689 {
7690   StatisticsMap::iterator  DataIter;
7691   int                      i;
7692   ABSApp                  *MyAppPntr;
7693   int                      PageSize;
7694 
7695   /* Lock the application.  This will stop it from processing any further
7696   messages until we are done (we need to look at the word list directly).  Or
7697   if it is busy, the lock will fail. */
7698 
7699   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7700   if (MyAppPntr == NULL || MyAppPntr->LockWithTimeout (2000000) != B_OK)
7701     return; /* It's probably busy doing something. */
7702 
7703   PageSize = (int) (Bounds().Height() / m_LineHeight - 1);
7704   if (PageSize < 1)
7705     PageSize = 1;
7706 
7707   DataIter = MyAppPntr->m_WordMap.lower_bound (m_FirstDisplayedWord);
7708 
7709   switch (MovementType)
7710   {
7711     case MSG_LINE_UP:
7712       if (DataIter != MyAppPntr->m_WordMap.begin ())
7713         DataIter--;
7714       break;
7715 
7716     case MSG_LINE_DOWN:
7717       if (DataIter != MyAppPntr->m_WordMap.end ())
7718         DataIter++;
7719       break;
7720 
7721     case MSG_PAGE_UP:
7722       for (i = 0; i < PageSize; i++)
7723       {
7724         if (DataIter == MyAppPntr->m_WordMap.begin ())
7725           break;
7726         DataIter--;
7727       }
7728       break;
7729 
7730     case MSG_PAGE_DOWN:
7731       for (i = 0; i < PageSize; i++)
7732       {
7733         if (DataIter == MyAppPntr->m_WordMap.end ())
7734           break;
7735         DataIter++;
7736       }
7737       break;
7738   }
7739 
7740   if (DataIter != MyAppPntr->m_WordMap.end ())
7741     strcpy (m_FirstDisplayedWord, DataIter->first.c_str ());
7742 
7743   Invalidate ();
7744 
7745   MyAppPntr->Unlock ();
7746 }
7747 
7748 
7749 /* This function periodically polls the BApplication to see if anything has
7750 changed.  If the word list is different or the display has changed in some
7751 other way, it will then try to refresh the display, repeating the attempt until
7752 it gets successfully drawn. */
7753 
7754 void
7755 WordsView::Pulse ()
7756 {
7757   ABSApp *MyAppPntr;
7758 
7759   /* Probe the BApplication to see if it has changed. */
7760 
7761   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7762   if (MyAppPntr == NULL)
7763     return; /* Something is wrong, give up. */
7764 
7765   if (MyAppPntr->m_TotalGenuineMessages != m_CachedTotalGenuineMessages ||
7766   MyAppPntr->m_TotalSpamMessages != m_CachedTotalSpamMessages ||
7767   MyAppPntr->m_WordCount != m_CachedWordCount)
7768     Invalidate ();
7769 }
7770 
7771 
7772 /* The user has dragged and dropped some file references on the words view.  If
7773 it is in the left third, add the file(s) as examples of genuine messages, right
7774 third for spam messages and if it is in the middle third then evaluate the
7775 file(s) for spaminess. */
7776 
7777 void
7778 WordsView::RefsDroppedHere (BMessage *MessagePntr)
7779 {
7780   float  Left;
7781   bool   SpamExample = true; /* TRUE if example is of spam, FALSE genuine. */
7782   float  Third;
7783   BPoint WhereDropped;
7784 
7785   /* Find out which third of the view it was dropped into. */
7786 
7787   if (MessagePntr->FindPoint ("_drop_point_", &WhereDropped) != B_OK)
7788     return;  /* Need to know where it was dropped. */
7789   ConvertFromScreen (&WhereDropped);
7790   Third = Bounds().Width() / 3;
7791   Left = Bounds().left;
7792   if (WhereDropped.x < Left + Third)
7793     SpamExample = false;
7794   else if (WhereDropped.x < Left + 2 * Third)
7795   {
7796     /* In the middle third, evaluate all files for spaminess. */
7797     EstimateRefFilesAndDisplay (MessagePntr);
7798     return;
7799   }
7800 
7801   if (g_CommanderLooperPntr != NULL)
7802     g_CommanderLooperPntr->CommandReferences (
7803     MessagePntr, true /* BulkMode */, SpamExample ? CL_SPAM : CL_GENUINE);
7804 }
7805 
7806 
7807 
7808 /******************************************************************************
7809  * Finally, the main program which drives it all.
7810  */
7811 
7812 int main (int argc, char**)
7813 {
7814   g_CommandLineMode = (argc > 1);
7815   if (!g_CommandLineMode)
7816     cout << PrintUsage; /* In case no arguments specified. */
7817 
7818   g_CommanderLooperPntr = new CommanderLooper;
7819   if (g_CommanderLooperPntr != NULL)
7820   {
7821     g_CommanderMessenger = new BMessenger (NULL, g_CommanderLooperPntr);
7822     g_CommanderLooperPntr->Run ();
7823   }
7824 
7825   ABSApp MyApp;
7826 
7827   if (MyApp.InitCheck () == 0)
7828   {
7829     MyApp.LoadSaveSettings (true /* DoLoad */);
7830     MyApp.Run ();
7831   }
7832 
7833   if (g_CommanderLooperPntr != NULL)
7834   {
7835     g_CommanderLooperPntr->PostMessage (B_QUIT_REQUESTED);
7836     snooze (100000); /* Let the CommanderLooper thread run so it quits. */
7837   }
7838 
7839   cerr << "SpamDBM shutting down..." << endl;
7840   return 0; /* And implicitly destroys MyApp, which writes out the database. */
7841 }
7842