xref: /haiku/src/bin/mail_utils/spamdbm.cpp (revision 894526b51a3d931c423878fc0eb8da610fa1fb2a)
1 /******************************************************************************
2  * $Id: spamdbm.cpp 30630 2009-05-05 01:31:01Z bga $
3  *
4  * This is a BeOS program for classifying e-mail messages as spam (unwanted
5  * junk mail) or as genuine mail using a Bayesian statistical approach.  There
6  * is also a Mail Daemon Replacement add-on to filter mail using the
7  * classification statistics collected earlier.
8  *
9  * See also http://www.paulgraham.com/spam.html for a good writeup and
10  * http://www.tuxedo.org/~esr/bogofilter/ for another implementation.
11  * And more recently, Gary Robinson's write up of his improved algorithm
12  * at http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
13  * which gives a better spread in spam ratios and slightly fewer
14  * misclassifications.
15  *
16  * Note that this uses the AGMS vacation coding style, not the OpenTracker one.
17  * That means no tabs, indents are two spaces, m_ is the prefix for member
18  * variables, g_ is the prefix for global names, C style comments, constants
19  * are in all capital letters and most other things are mixed case, it's word
20  * wrapped to fit in 79 characters per line to make proofreading on paper
21  * easier, and functions are listed in reverse dependency order so that forward
22  * declarations (function prototypes with no code) aren't needed.
23  *
24  * The Original Design:
25  * There is a spam database (just a file listing words and number of times they
26  * were used in spam and non-spam messages) that a BeMailDaemon input filter
27  * will use when scanning email.  It will mark the mail with the spam
28  * probability (an attribute, optionally a mail header field) and optionally do
29  * something if the probability exceeds a user defined level (delete message,
30  * change subject, file in a different folder).  Or should that be a different
31  * filter?  Outside the mail system, the probability can be used in queries to
32  * find spam.
33  *
34  * A second user application will be used to update the database.  Besides
35  * showing you the current list of words, you can drag and drop files to mark
36  * them as spam or non-spam (a balanced binary tree is used internally to make
37  * word storage fast).  It will add a second attribute to the files to show how
38  * they have been classified by the user (and won't update the database if you
39  * accidentally try to classify a file again).  Besides drag and drop, there
40  * will be a command line interface and a message passing interface.  BeMail
41  * (or other programs) will then communicate via messages to tell it when the
42  * user marks a message as spam or not (via having separate delete spam /
43  * delete genuine mail buttons and a menu item or two).
44  *
45  * Plus lots of details, like the rename swap method to update the database
46  * file (so programs with the old file open aren't affected).  A nice tab text
47  * format so you can open the database in a spreadsheet.  Startup and shutdown
48  * control of the updater from BeMail.  Automatic creation of the indices
49  * needed by the filter.  MIME types for the database file.  Icons for the app.
50  * System settings to enable tracker to display the new attributes when viewing
51  * e-mail (and maybe news articles if someone ever gets around to an NNTP as
52  * files reader).  Documentation.  Recursive directory traversal for the
53  * command line or directory drag and drop.  Options for the updater to warn or
54  * ignore non-email files.  Etc.
55  *
56  * The Actual Implementation:
57  * The spam database updates and the test for spam have been combined into one
58  * program which runs as a server.  That way there won't be as long a delay
59  * when the e-mail system wants to check for spam, because the database is
60  * already loaded by the server and in memory.  The MDR mail filter add-on
61  * simply sends scripting commands to the server (and starts it up if it isn't
62  * already running).  The filter takes care of marking the messages when it
63  * gets the rating back from the server, and then the rest of the mail system
64  * rule chain can delete the message or otherwise manipulate it.
65  *
66  * Revision History (now manually updated due to SVN's philosophy)
67  * $Log: spamdbm.cpp,v $
68  * ------------------------------------------------------------------------
69  * r15195 | agmsmith | 2005-11-27 21:07:55 -0500 (Sun, 27 Nov 2005) | 4 lines
70  * Just a few minutes after checking in, I mentioned it to Japanese expert Koki
71  * and he suggested also including the Japanese comma.  So before I forget to
72  * do it...
73  *
74  * ------------------------------------------------------------------------
75  * r15194 | agmsmith | 2005-11-27 20:37:13 -0500 (Sun, 27 Nov 2005) | 5 lines
76  * Truncate overly long URLs to the maximum word length.  Convert Japanese
77  * periods to spaces so that more "words" are found.  Fix UTF-8 comparison
78  * problems with tolower() incorrectly converting characters with the high bit
79  * set.
80  *
81  * r15098 | agmsmith | 2005-11-23 23:17:00 -0500 (Wed, 23 Nov 2005) | 5 lines
82  * Added better tokenization so that HTML is parsed and things like tags
83  * between letters of a word no longer hide that word.  After testing, the
84  * result seems to be a tighter spread of ratings when done in full text plus
85  * header mode.
86  *
87  * Revision 1.10  2005/11/24 02:08:39  agmsmith
88  * Fixed up prefix codes, Z for things that are inside other things.
89  *
90  * Revision 1.9  2005/11/21 03:28:03  agmsmith
91  * Added a function for extracting URLs.
92  *
93  * Revision 1.8  2005/11/09 03:36:18  agmsmith
94  * Removed noframes detection (doesn't show up in e-mails).  Now use
95  * just H for headers and Z for HTML tag junk.
96  *
97  * Revision 1.7  2005/10/24 00:00:08  agmsmith
98  * Adding HTML tag removal, which also affected the search function so it
99  * could search for single part things like &nbsp;.
100  *
101  * Revision 1.6  2005/10/17 01:55:08  agmsmith
102  * Remove HTML comments and a few other similar things.
103  *
104  * Revision 1.5  2005/10/16 18:35:36  agmsmith
105  * Under construction - looking into HTML not being in UTF-8.
106  *
107  * Revision 1.4  2005/10/11 01:51:21  agmsmith
108  * Starting on the tokenising passes.  Still need to test Asian truncation.
109  *
110  * Revision 1.3  2005/10/06 11:54:07  agmsmith
111  * Not much.
112  *
113  * Revision 1.2  2005/09/12 01:49:37  agmsmith
114  * Enable case folding for the whole file tokenizer.
115  *
116  * r13961 | agmsmith | 2005-08-13 22:25:28 -0400 (Sat, 13 Aug 2005) | 2 lines
117  * Source code changes so that mboxtobemail now compiles and is in the build
118  * system.
119  *
120  * r13959 | agmsmith | 2005-08-13 22:05:27 -0400 (Sat, 13 Aug 2005) | 2 lines
121  * Rename the directory before doing anything else, otherwise svn dies badly.
122  *
123  * r13952 | agmsmith | 2005-08-13 15:31:42 -0400 (Sat, 13 Aug 2005) | 3 lines
124  * Added the resources and file type associations, changed the application
125  * signature and otherwise made the spam detection system work properly again.
126  *
127  * r13951 | agmsmith | 2005-08-13 11:40:01 -0400 (Sat, 13 Aug 2005) | 2 lines
128  * Had to do the file rename as a separate operation due to SVN limitations.
129  *
130  * r13950 | agmsmith | 2005-08-13 11:38:44 -0400 (Sat, 13 Aug 2005) | 3 lines
131  * Oops, "spamdb" is already used for a Unix package.  And spamdatabase is
132  * already reserved by a domain name squatter.  Use "spamdbm" instead.
133  *
134  * r13949 | agmsmith | 2005-08-13 11:17:52 -0400 (Sat, 13 Aug 2005) | 3 lines
135  * Renamed spamfilter to be the more meaningful spamdb (spam database) and
136  * moved it into its own source directory in preparation for adding resources.
137  *
138  * r13628 | agmsmith | 2005-07-10 20:11:29 -0400 (Sun, 10 Jul 2005) | 3 lines
139  * Updated keyword expansion to use SVN keywords.  Also seeing if svn is
140  * working well enough for me to update files from BeOS R5.
141  *
142  * r11909 | axeld | 2005-03-18 19:09:19 -0500 (Fri, 18 Mar 2005) | 2 lines
143  * Moved bin/ directory out of apps/.
144  *
145  * r11769 | bonefish | 2005-03-17 03:30:54 -0500 (Thu, 17 Mar 2005) | 1 line
146  * Move trunk into respective module.
147  *
148  * r10362 | nwhitehorn | 2004-12-06 20:14:05 -0500 (Mon, 06 Dec 2004) | 2 lines
149  * Fixed the spam filter so it works correctly now.
150  *
151  * r9934 | nwhitehorn | 2004-11-11 21:55:05 -0500 (Thu, 11 Nov 2004) | 2 lines
152  * Added AGMS's excellent spam detection software.  Still some weirdness with
153  * the configuration interface from E-mail prefs.
154  *
155  * Revision 1.2  2004/12/07 01:14:05  nwhitehorn
156  * Fixed the spam filter so it works correctly now.
157  *
158  * Revision 1.87  2004/09/20 15:57:26  nwhitehorn
159  * Mostly updated the tree to Be/Haiku style identifier naming conventions.  I
160  * have a few more things to work out, mostly in mail_util.h, and then I'm
161  * proceeding to jamify the build system.  Then we go into Haiku CVS.
162  *
163  * Revision 1.86  2003/07/26 16:47:46  agmsmith
164  * Bug - wasn't allowing double classification if the user had turned on
165  * the option to ignore the previous classification.
166  *
167  * Revision 1.85  2003/07/08 14:52:57  agmsmith
168  * Fix bug with classification choices dialog box coming up with weird
169  * sizes due to RefsReceived message coming in before ReadyToRun had
170  * finished setting up the default sizes of the controls.
171  *
172  * Revision 1.84  2003/07/04 19:59:29  agmsmith
173  * Now with a GUI option to let you declassify messages (set them back
174  * to uncertain, rather than spam or genuine).  Required a BAlert
175  * replacement since BAlerts can't do four buttons.
176  *
177  * Revision 1.83  2003/07/03 20:40:36  agmsmith
178  * Added Uncertain option for declassifying messages.
179  *
180  * Revision 1.82  2003/06/16 14:57:13  agmsmith
181  * Detect spam which uses mislabeled text attachments, going by the file name
182  * extension.
183  *
184  * Revision 1.81  2003/04/08 20:27:04  agmsmith
185  * AGMSBayesianSpamServer now shuts down immediately and returns true if
186  * it is asked to quit by the registrar.
187  *
188  * Revision 1.80  2003/04/07 19:20:27  agmsmith
189  * Ooops, int64 doesn't exist, use long long instead.
190  *
191  * Revision 1.79  2003/04/07 19:05:22  agmsmith
192  * Now with Allen Brunson's atoll for PPC (you need the %Ld, but that
193  * becomes %lld on other systems).
194  *
195  * Revision 1.78  2003/04/04 22:43:53  agmsmith
196  * Fixed up atoll PPC processor hack so it would actually work, was just
197  * returning zero which meant that it wouldn't load in the database file
198  * (read the size as zero).
199  *
200  * Revision 1.77  2003/01/22 03:19:48  agmsmith
201  * Don't convert words to lower case, the case is important for spam.
202  * Particularly sentences which start with exciting words, which you
203  * normally won't use at the start of a sentence (and thus capitalize).
204  *
205  * Revision 1.76  2002/12/18 02:29:22  agmsmith
206  * Add space for the Uncertain display in Tracker.
207  *
208  * Revision 1.75  2002/12/18 01:54:37  agmsmith
209  * Added uncertain sound effect.
210  *
211  * Revision 1.74  2002/12/13 23:53:12  agmsmith
212  * Minimize the window before opening it so that it doesn't flash on the
213  * screen in server mode.  Also load the database when the window is
214  * displayed so that the user can see the words.
215  *
216  * Revision 1.73  2002/12/13 20:55:57  agmsmith
217  * Documentation.
218  *
219  * Revision 1.72  2002/12/13 20:26:11  agmsmith
220  * Fixed bug with adding messages in strings to database (was limited to
221  * messages at most 1K long).  Also changed default server mode to true
222  * since that's what people use most.
223  *
224  * Revision 1.71  2002/12/11 22:37:30  agmsmith
225  * Added commands to train on spam and genuine e-mail messages passed
226  * in string arguments rather than via external files.
227  *
228  * Revision 1.70  2002/12/10 22:12:41  agmsmith
229  * Adding a message to the database now uses a BPositionIO rather than a
230  * file and file name (for future string rather than file additions).  Also
231  * now re-evaluate a file after reclassifying it so that the user can see
232  * the new ratio.  Also remove the [Spam 99.9%] subject prefix when doing
233  * a re-evaluation or classification (the number would be wrong).
234  *
235  * Revision 1.69  2002/12/10 01:46:04  agmsmith
236  * Added the Chi-Squared scoring method.
237  *
238  * Revision 1.68  2002/11/29 22:08:25  agmsmith
239  * Change default purge age to 2000 so that hitting the purge button
240  * doesn't erase stuff from the new sample database.
241  *
242  * Revision 1.67  2002/11/25 20:39:39  agmsmith
243  * Don't need to massage the MIME type since the mail library now does
244  * the lower case conversion and converts TEXT to text/plain too.
245  *
246  * Revision 1.66  2002/11/20 22:57:12  nwhitehorn
247  * PPC Compatibility Fixes
248  *
249  * Revision 1.65  2002/11/10 18:43:55  agmsmith
250  * Added a time delay to some quitting operations so that scripting commands
251  * from a second client (like a second e-mail account) will make the program
252  * abort the quit operation.
253  *
254  * Revision 1.64  2002/11/05 18:05:16  agmsmith
255  * Looked at Nathan's PPC changes (thanks!), modified style a bit.
256  *
257  * Revision 1.63  2002/11/04 03:30:22  nwhitehorn
258  * Now works (or compiles at least) on PowerPC.  I'll get around to testing it
259  * later.
260  *
261  * Revision 1.62  2002/11/04 01:03:33  agmsmith
262  * Fixed warnings so it compiles under the bemaildaemon system.
263  *
264  * Revision 1.61  2002/11/03 23:00:37  agmsmith
265  * Added to the bemaildaemon project on SourceForge.  Hmmmm, seems to switch to
266  * a new version if I commit and specify a message, but doesn't accept the
267  * message and puts up the text editor.  Must be a bug where cvs eats the first
268  * option after "commit".
269  *
270  * Revision 1.60.1.1  2002/10/22 14:29:27  agmsmith
271  * Needed to recompile with the original Libmail.so from Beta/1 since
272  * the current library uses a different constructor, and thus wouldn't
273  * run when used with the old library.
274  *
275  * Revision 1.60  2002/10/21 16:41:27  agmsmith
276  * Return a special error code when no words are found in a message,
277  * so that messages without text/plain parts can be recognized as
278  * spam by the mail filter.
279  *
280  * Revision 1.59  2002/10/20 21:29:47  agmsmith
281  * Watch out for MIME types of "text", treat as text/plain.
282  *
283  * Revision 1.58  2002/10/20 18:29:07  agmsmith
284  * *** empty log message ***
285  *
286  * Revision 1.57  2002/10/20 18:25:02  agmsmith
287  * Fix case sensitivity in MIME type tests, and fix text/any test.
288  *
289  * Revision 1.56  2002/10/19 17:00:10  agmsmith
290  * Added the pop-up menu for the tokenize modes.
291  *
292  * Revision 1.55  2002/10/19 14:54:06  agmsmith
293  * Fudge MIME type of body text components so that they get
294  * treated as text.
295  *
296  * Revision 1.54  2002/10/19 00:56:37  agmsmith
297  * The parsing of e-mail messages seems to be working now, just need
298  * to add some user interface stuff for the tokenizing mode.
299  *
300  * Revision 1.53  2002/10/18 23:37:56  agmsmith
301  * More mail kit usage, can now decode headers, but more to do.
302  *
303  * Revision 1.52  2002/10/16 23:52:33  agmsmith
304  * Getting ready to add more tokenizing modes, exploring Mail Kit to break
305  * apart messages into components (and decode BASE64 and other encodings).
306  *
307  * Revision 1.51  2002/10/11 20:05:31  agmsmith
308  * Added installation of sound effect names, which the filter will use.
309  *
310  * Revision 1.50  2002/10/02 16:50:02  agmsmith
311  * Forgot to add credits to the algorithm inventors.
312  *
313  * Revision 1.49  2002/10/01 00:39:29  agmsmith
314  * Added drag and drop to evaluate files or to add them to the list.
315  *
316  * Revision 1.48  2002/09/30 19:44:17  agmsmith
317  * Switched to Gary Robinson's method, removed max spam/genuine word.
318  *
319  * Revision 1.47  2002/09/23 17:08:55  agmsmith
320  * Add an attribute with the spam ratio to files which have been evaluated.
321  *
322  * Revision 1.46  2002/09/23 02:50:32  agmsmith
323  * Fiddling with display width of e-mail attributes.
324  *
325  * Revision 1.45  2002/09/23 01:13:56  agmsmith
326  * Oops, bug in string evaluation scripting.
327  *
328  * Revision 1.44  2002/09/22 21:00:55  agmsmith
329  * Added EvaluateString so that the BeMail add-on can pass the info without
330  * having to create a temporary file.
331  *
332  * Revision 1.43  2002/09/20 19:56:02  agmsmith
333  * Added about box and button for estimating the spam ratio of a file.
334  *
335  * Revision 1.42  2002/09/20 01:22:26  agmsmith
336  * More testing, decide that an extreme ratio bias point of 0.5 is good.
337  *
338  * Revision 1.41  2002/09/19 21:17:12  agmsmith
339  * Changed a few names and proofread the program.
340  *
341  * Revision 1.40  2002/09/19 14:27:17  agmsmith
342  * Rearranged execution of commands, moving them to a separate looper
343  * rather than the BApplication, so that thousands of files could be
344  * processed without worrying about the message queue filling up.
345  *
346  * Revision 1.39  2002/09/18 18:47:16  agmsmith
347  * Stop flickering when the view is partially obscured, update cached
348  * values in all situations except when app is busy.
349  *
350  * Revision 1.38  2002/09/18 18:08:11  agmsmith
351  * Add a function for evaluating the spam ratio of a message.
352  *
353  * Revision 1.37  2002/09/16 01:30:16  agmsmith
354  * Added Get Oldest command.
355  *
356  * Revision 1.36  2002/09/16 00:47:52  agmsmith
357  * Change the display to counter-weigh the spam ratio by the number of
358  * messages.
359  *
360  * Revision 1.35  2002/09/15 20:49:35  agmsmith
361  * Scrolling improved, buttons, keys and mouse wheel added.
362  *
363  * Revision 1.34  2002/09/15 03:46:10  agmsmith
364  * Up and down buttons under construction.
365  *
366  * Revision 1.33  2002/09/15 02:09:21  agmsmith
367  * Took out scroll bar.
368  *
369  * Revision 1.32  2002/09/15 02:05:30  agmsmith
370  * Trying to add a scroll bar, but it isn't very useful.
371  *
372  * Revision 1.31  2002/09/14 23:06:28  agmsmith
373  * Now has live updates of the list of words.
374  *
375  * Revision 1.30  2002/09/14 19:53:11  agmsmith
376  * Now with a better display of the words.
377  *
378  * Revision 1.29  2002/09/13 21:33:54  agmsmith
379  * Now draws the words in the word display view, but still primitive.
380  *
381  * Revision 1.28  2002/09/13 19:28:02  agmsmith
382  * Added display of most genuine and most spamiest, fixed up cursor.
383  *
384  * Revision 1.27  2002/09/13 03:08:42  agmsmith
385  * Show current word and message counts, and a busy cursor.
386  *
387  * Revision 1.26  2002/09/13 00:00:08  agmsmith
388  * Fixed up some deadlock problems, now using asynchronous message replies.
389  *
390  * Revision 1.25  2002/09/12 17:56:58  agmsmith
391  * Keep track of words which are spamiest and genuinest.
392  *
393  * Revision 1.24  2002/09/12 01:57:10  agmsmith
394  * Added server mode.
395  *
396  * Revision 1.23  2002/09/11 23:30:45  agmsmith
397  * Added Purge button and ignore classification checkbox.
398  *
399  * Revision 1.22  2002/09/11 21:23:13  agmsmith
400  * Added bulk update choice, purge button, moved to a BView container
401  * for all the controls (so background colour could be set, and Pulse
402  * works normally for it too).
403  *
404  * Revision 1.21  2002/09/10 22:52:49  agmsmith
405  * You can now change the database name in the GUI.
406  *
407  * Revision 1.20  2002/09/09 14:20:42  agmsmith
408  * Now can have multiple backups, and implemented refs received.
409  *
410  * Revision 1.19  2002/09/07 19:14:56  agmsmith
411  * Added standard GUI measurement code.
412  *
413  * Revision 1.18  2002/09/06 21:03:03  agmsmith
414  * Rearranging code to avoid forward references when adding a window class.
415  *
416  * Revision 1.17  2002/09/06 02:54:00  agmsmith
417  * Added the ability to purge old words from the database.
418  *
419  * Revision 1.16  2002/09/05 00:46:03  agmsmith
420  * Now adds spam to the database!
421  *
422  * Revision 1.15  2002/09/04 20:32:15  agmsmith
423  * Read ahead a couple of letters to decode quoted-printable better.
424  *
425  * Revision 1.14  2002/09/04 03:10:03  agmsmith
426  * Can now tokenize (break into words) a text file.
427  *
428  * Revision 1.13  2002/09/03 21:50:54  agmsmith
429  * Count database command, set up MIME type for the database file.
430  *
431  * Revision 1.12  2002/09/03 19:55:54  agmsmith
432  * Added loading and saving the database.
433  *
434  * Revision 1.11  2002/09/02 03:35:33  agmsmith
435  * Create indices and set up attribute associations with the e-mail MIME type.
436  *
437  * Revision 1.10  2002/09/01 15:52:49  agmsmith
438  * Can now delete the database.
439  *
440  * Revision 1.9  2002/08/31 21:55:32  agmsmith
441  * Yet more scripting.
442  *
443  * Revision 1.8  2002/08/31 21:41:37  agmsmith
444  * Under construction, with example code to decode a B_REPLY.
445  *
446  * Revision 1.7  2002/08/30 19:29:06  agmsmith
447  * Combined loading and saving settings into one function.
448  *
449  * Revision 1.6  2002/08/30 02:01:10  agmsmith
450  * Working on loading and saving settings.
451  *
452  * Revision 1.5  2002/08/29 23:17:42  agmsmith
453  * More scripting.
454  *
455  * Revision 1.4  2002/08/28 00:40:52  agmsmith
456  * Scripting now seems to work, at least the messages flow properly.
457  *
458  * Revision 1.3  2002/08/25 21:51:44  agmsmith
459  * Getting the about text formatting right.
460  *
461  * Revision 1.2  2002/08/25 21:28:20  agmsmith
462  * Trying out the BeOS scripting system as a way of implementing the program.
463  *
464  * Revision 1.1  2002/08/24 02:27:51  agmsmith
465  * Initial revision
466  */
467 
468 /* Standard C Library. */
469 
470 #include <errno.h>
471 #include <stdio.h>
472 #include <stdlib.h>
473 #include <strings.h>
474 
475 /* Standard C++ library. */
476 
477 #include <iostream>
478 
479 /* STL (Standard Template Library) headers. */
480 
481 #include <map>
482 #include <queue>
483 #include <set>
484 #include <string>
485 #include <vector>
486 
487 using namespace std;
488 
489 /* BeOS (Be Operating System) headers. */
490 
491 #include <Alert.h>
492 #include <Application.h>
493 #include <Beep.h>
494 #include <Button.h>
495 #include <CheckBox.h>
496 #include <Cursor.h>
497 #include <Directory.h>
498 #include <Entry.h>
499 #include <File.h>
500 #include <FilePanel.h>
501 #include <FindDirectory.h>
502 #include <fs_index.h>
503 #include <fs_info.h>
504 #include <MenuBar.h>
505 #include <MenuItem.h>
506 #include <Message.h>
507 #include <MessageQueue.h>
508 #include <MessageRunner.h>
509 #include <Mime.h>
510 #include <NodeInfo.h>
511 #include <Path.h>
512 #include <Picture.h>
513 #include <PictureButton.h>
514 #include <Point.h>
515 #include <Polygon.h>
516 #include <PopUpMenu.h>
517 #include <PropertyInfo.h>
518 #include <RadioButton.h>
519 #include <Resources.h>
520 #include <Screen.h>
521 #include <ScrollBar.h>
522 #include <String.h>
523 #include <StringView.h>
524 #include <TextControl.h>
525 #include <View.h>
526 
527 /* Included from the Mail Daemon Replacement project (MDR) include/public
528 directory, available from http://sourceforge.net/projects/bemaildaemon/ */
529 
530 #include <MailMessage.h>
531 #include <MailAttachment.h>
532 
533 
534 /******************************************************************************
535  * Global variables, and not-so-variable things too.  Grouped by functionality.
536  */
537 
/* Standard GUI measurements.  Nothing here gives them a value, so they start
out as zero like every other static variable. */

/* Space of a letter "M", left between neighbouring controls. */
static float g_MarginBetweenControls;

/* Height of a line of text in the current font. */
static float g_LineOfTextHeight;

/* Height of a string view text box. */
static float g_StringViewHeight;

/* How many pixels tall buttons are. */
static float g_ButtonHeight;

/* Same measurement for check boxes. */
static float g_CheckBoxHeight;

/* Same measurement for radio buttons. */
static float g_RadioButtonHeight;

/* Same measurement for pop-up menus. */
static float g_PopUpMenuHeight;

/* Same measurement for editable text controls. */
static float g_TextBoxHeight;
546 
/* MIME types: the application signature and the type of the database file. */

static const char *g_ABSAppSignature = "application/x-vnd.agmsmith.spamdbm";
static const char *g_ABSDatabaseFileMIMEType =
  "text/x-vnd.agmsmith.spam_probability_database";

/* Database file related strings.  NOTE(review): the recognition string
presumably identifies a file as one of our databases - confirm where the
database is loaded and saved. */

static const char *g_DefaultDatabaseFileName = "SpamDBM Database";
static const char *g_DatabaseRecognitionString = "Spam Database File";

/* Names of the file attributes for the classification and the spam ratio,
and the two classification values ("Spam" or "Genuine"). */

static const char *g_AttributeNameClassification = "MAIL:classification";
static const char *g_AttributeNameSpamRatio = "MAIL:ratio_spam";
static const char *g_ClassifiedSpam = "Spam";
static const char *g_ClassifiedGenuine = "Genuine";

/* Sound effect names, one for each possible verdict. */

static const char *g_BeepGenuine = "SpamFilter-Genuine";
static const char *g_BeepSpam = "SpamFilter-Spam";
static const char *g_BeepUncertain = "SpamFilter-Uncertain";

/* NOTE(review): presumably the BMessage field names used for scripting
command arguments and replies - confirm against the scripting code. */

static const char *g_DataName = "data";
static const char *g_ResultName = "result";
568 
569 static const char *g_SettingsDirectoryName = "Mail";
570 static const char *g_SettingsFileName = "SpamDBM Settings";
571 static const uint32 g_SettingsWhatCode = 'SDBM';
572 static const char *g_BackupSuffix = ".backup %d";
573 static const int g_MaxBackups = 10; /* Numbered from 0 to g_MaxBackups - 1. */
574 static const size_t g_MaxWordLength = 50; /* Words longer than this aren't. */
575 static const int g_MaxInterestingWords = 150; /* Top N words are examined. */
576 static const double g_RobinsonS = 0.45; /* Default weight for no data. */
577 static const double g_RobinsonX = 0.5; /* Halfway point for no data. */
578 
/* TRUE if the program was started from the command line (and thus should
exit after processing the command), FALSE if it is running with a graphical
user interface. */

static bool g_CommandLineMode;

/* TRUE while running in server mode, where error messages don't pop up
dialog boxes (they still show up in stderr) and the window, if there is one,
is minimized. */

static bool g_ServerMode;

/* Number of pulse timing events (about one every half second) to count down
before the program quits.  Negative means stop counting, zero means quit at
the next pulse event.  This keeps the program alive for a short while after
someone requests that it quit, in case more scripting commands come in, which
will stop the countdown.  Needed for the case where multiple e-mail accounts
are all requesting spam identification and the one which finishes first tells
the server to quit.  It also checks that there is no more work to do before
trying to quit. */

static int g_QuitCountdown = -1;

/* The BApplication starts processing messages before ReadyToRun finishes,
which can lead to initialisation problems (button heights not determined).
Code which might run early, like RefsReceived, should wait for this to turn
TRUE. */

static volatile bool g_AppReadyToRunCompleted = false;
604 
605 static class CommanderLooper *g_CommanderLooperPntr = NULL;
606 static BMessenger *g_CommanderMessenger = NULL;
607   /* Some globals for use with the looper which processes external commands
608   (arguments received, file references received), needed for avoiding deadlocks
609   which would happen if the BApplication sent a scripting message to itself. */
610 
611 static BCursor *g_BusyCursor = NULL;
612   /* The busy cursor, will be loaded from the resource file during application
613   startup. */
614 
/* Index numbers for the scripting properties.  Each value is also the index
of the matching name in g_PropertyNames, and PN_MAX (always the last one) is
the number of properties. */

enum PropertyNumbersEnum
{
  PN_DATABASE_FILE = 0,
  PN_SPAM,
  PN_SPAM_STRING,
  PN_GENUINE,
  PN_GENUINE_STRING,
  PN_UNCERTAIN,
  PN_IGNORE_PREVIOUS_CLASSIFICATION,
  PN_SERVER_MODE,
  PN_FLUSH,
  PN_PURGE_AGE,
  PN_PURGE_POPULARITY,
  PN_PURGE,
  PN_OLDEST,
  PN_EVALUATE,
  PN_EVALUATE_STRING,
  PN_RESET_TO_DEFAULTS,
  PN_INSTALL_THINGS,
  PN_TOKENIZE_MODE,
  PN_SCORING_MODE,
  PN_MAX
};

typedef enum PropertyNumbersEnum PropertyNumbers;
638 
639 static const char * g_PropertyNames [PN_MAX] =
640 {
641   "DatabaseFile",
642   "Spam",
643   "SpamString",
644   "Genuine",
645   "GenuineString",
646   "Uncertain",
647   "IgnorePreviousClassification",
648   "ServerMode",
649   "Flush",
650   "PurgeAge",
651   "PurgePopularity",
652   "Purge",
653   "Oldest",
654   "Evaluate",
655   "EvaluateString",
656   "ResetToDefaults",
657   "InstallThings",
658   "TokenizeMode",
659   "ScoringMode"
660 };
661 
662 /* This array lists the scripting commands we can handle, in a format that the
663 scripting system can understand too. */
664 
665 static struct property_info g_ScriptingPropertyList [] =
666 {
667   /* *name; commands[10]; specifiers[10]; *usage; extra_data; ... */
668   {g_PropertyNames[PN_DATABASE_FILE], {B_GET_PROPERTY, 0},
669     {B_DIRECT_SPECIFIER, 0}, "Get the pathname of the current database file.  "
670     "The default name is something like B_USER_SETTINGS_DIRECTORY / "
671     "Mail / SpamDBM Database", PN_DATABASE_FILE,
672     {}, {}, {}},
673   {g_PropertyNames[PN_DATABASE_FILE], {B_SET_PROPERTY, 0},
674     {B_DIRECT_SPECIFIER, 0}, "Change the pathname of the database file to "
675     "use.  It will automatically be converted to an absolute path name, "
676     "so make sure the parent directories exist before setting it.  If it "
677     "doesn't exist, you'll have to use the create command next.",
678     PN_DATABASE_FILE, {}, {}, {}},
679   {g_PropertyNames[PN_DATABASE_FILE], {B_CREATE_PROPERTY, 0},
680     {B_DIRECT_SPECIFIER, 0}, "Creates a new empty database, will replace "
681     "the existing database file too.", PN_DATABASE_FILE, {}, {}, {}},
682   {g_PropertyNames[PN_DATABASE_FILE], {B_DELETE_PROPERTY, 0},
683     {B_DIRECT_SPECIFIER, 0}, "Deletes the database file and all backup copies "
684     "of that file too.  Really only of use for uninstallers.",
685     PN_DATABASE_FILE, {}, {}, {}},
686   {g_PropertyNames[PN_DATABASE_FILE], {B_COUNT_PROPERTIES, 0},
687     {B_DIRECT_SPECIFIER, 0}, "Returns the number of words in the database.",
688     PN_DATABASE_FILE, {}, {}, {}},
689   {g_PropertyNames[PN_SPAM], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
690     "Adds the spam in the given file (specify full pathname to be safe) to "
691     "the database.  The words in the files will be added to the list of words "
692     "in the database that identify spam messages.  The files processed will "
693     "also have the attribute MAIL:classification added with a value of "
694     "\"Spam\" or \"Genuine\" as specified.  They also have their spam ratio "
695     "attribute updated, as if you had also used the Evaluate command on "
696     "them.  If they already have the MAIL:classification "
697     "attribute and it matches the new classification then they won't get "
698     "processed (and if it is different, they will get removed from the "
699     "statistics for the old class and added to the statistics for the new "
700     "one).  You can turn off that behaviour with the "
701     "IgnorePreviousClassification property.  The command line version lets "
702     "you specify more than one pathname.", PN_SPAM, {}, {}, {}},
703   {g_PropertyNames[PN_SPAM], {B_COUNT_PROPERTIES, 0}, {B_DIRECT_SPECIFIER, 0},
704     "Returns the number of spam messages in the database.", PN_SPAM,
705     {}, {}, {}},
706   {g_PropertyNames[PN_SPAM_STRING], {B_SET_PROPERTY, 0},
707     {B_DIRECT_SPECIFIER, 0}, "Adds the spam in the given string (assumed to "
708     "be the text of a whole e-mail message, not just a file name) to the "
709     "database.", PN_SPAM_STRING, {}, {}, {}},
710   {g_PropertyNames[PN_GENUINE], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
711     "Similar to adding spam except that the message file is added to the "
712     "genuine statistics.", PN_GENUINE, {}, {}, {}},
713   {g_PropertyNames[PN_GENUINE], {B_COUNT_PROPERTIES, 0},
714     {B_DIRECT_SPECIFIER, 0}, "Returns the number of genuine messages in the "
715     "database.", PN_GENUINE, {}, {}, {}},
716   {g_PropertyNames[PN_GENUINE_STRING], {B_SET_PROPERTY, 0},
717     {B_DIRECT_SPECIFIER, 0}, "Adds the genuine message in the given string "
718     "(assumed to be the text of a whole e-mail message, not just a file name) "
719     "to the database.", PN_GENUINE_STRING, {}, {}, {}},
720   {g_PropertyNames[PN_UNCERTAIN], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
721     "Similar to adding spam except that the message file is removed from the "
722     "database, undoing the previous classification.  Obviously, it needs to "
723     "have been classified previously (using the file attributes) so it can "
724     "tell if it is removing spam or genuine words.", PN_UNCERTAIN, {}, {}, {}},
725   {g_PropertyNames[PN_IGNORE_PREVIOUS_CLASSIFICATION], {B_SET_PROPERTY, 0},
726     {B_DIRECT_SPECIFIER, 0}, "If set to true then the previous classification "
727     "(which was saved as an attribute of the e-mail message file) will be "
728     "ignored, so that you can add the message to the database again.  If set "
729     "to false (the normal case), the attribute will be examined, and if the "
730     "message has already been classified as what you claim it is, nothing "
731     "will be done.  If it was misclassified, then the message will be removed "
732     "from the statistics for the old class and added to the stats for the "
733     "new classification you have requested.",
734     PN_IGNORE_PREVIOUS_CLASSIFICATION, {}, {}, {}},
735   {g_PropertyNames[PN_IGNORE_PREVIOUS_CLASSIFICATION], {B_GET_PROPERTY, 0},
736     {B_DIRECT_SPECIFIER, 0}, "Find out the current setting of the flag for "
737     "ignoring the previously recorded classification.",
738     PN_IGNORE_PREVIOUS_CLASSIFICATION, {}, {}, {}},
739   {g_PropertyNames[PN_SERVER_MODE], {B_SET_PROPERTY, 0},
740     {B_DIRECT_SPECIFIER, 0}, "If set to true then error messages get printed "
741     "to the standard error stream rather than showing up in an alert box.  "
742     "It also starts up with the window minimized.", PN_SERVER_MODE,
743     {}, {}, {}},
744   {g_PropertyNames[PN_SERVER_MODE], {B_GET_PROPERTY, 0},
745     {B_DIRECT_SPECIFIER, 0}, "Find out the setting of the server mode flag.",
746     PN_SERVER_MODE, {}, {}, {}},
747   {g_PropertyNames[PN_FLUSH], {B_EXECUTE_PROPERTY, 0},
748     {B_DIRECT_SPECIFIER, 0}, "Writes out the database file to disk, if it has "
749     "been updated in memory but hasn't been saved to disk.  It will "
750     "automatically get written when the program exits, so this command is "
751     "mostly useful for server mode.", PN_FLUSH, {}, {}, {}},
752   {g_PropertyNames[PN_PURGE_AGE], {B_SET_PROPERTY, 0},
753     {B_DIRECT_SPECIFIER, 0}, "Sets the old age limit.  Words which haven't "
754       "been updated since this many message additions to the database may be "
755       "deleted when you do a purge.  A good value is 1000, meaning that if a "
756       "word hasn't appeared in the last 1000 spam/genuine messages, it will "
757       "be forgotten.  Zero will purge all words, 1 will purge words not in "
758       "the last message added to the database, 2 will purge words not in the "
759       "last two messages added, and so on.  This is mostly useful for "
760       "removing those one time words which are often hunks of binary garbage, "
761       "not real words.  This acts in combination with the popularity limit; "
762       "both conditions have to be valid before the word gets deleted.",
763       PN_PURGE_AGE, {}, {}, {}},
764   {g_PropertyNames[PN_PURGE_AGE], {B_GET_PROPERTY, 0},
765     {B_DIRECT_SPECIFIER, 0}, "Gets the old age limit.", PN_PURGE_AGE,
766     {}, {}, {}},
767   {g_PropertyNames[PN_PURGE_POPULARITY], {B_SET_PROPERTY, 0},
768     {B_DIRECT_SPECIFIER, 0}, "Sets the popularity limit.  Words which aren't "
769     "this popular may be deleted when you do a purge.  A good value is 5, "
770     "which means that the word is safe from purging if it has been seen in 6 "
771     "or more e-mail messages.  If it's only in 5 or less, then it may get "
772     "purged.  The extreme is zero, where only words that haven't been seen "
773     "in any message are deleted (usually means no words).  This acts in "
774     "combination with the old age limit; both conditions have to be valid "
775     "before the word gets deleted.", PN_PURGE_POPULARITY, {}, {}, {}},
776   {g_PropertyNames[PN_PURGE_POPULARITY], {B_GET_PROPERTY, 0},
777     {B_DIRECT_SPECIFIER, 0}, "Gets the purge popularity limit.",
778     PN_PURGE_POPULARITY, {}, {}, {}},
779   {g_PropertyNames[PN_PURGE], {B_EXECUTE_PROPERTY, 0},
780     {B_DIRECT_SPECIFIER, 0}, "Purges the old obsolete words from the "
781     "database, if they are old enough according to the age limit and also "
782     "unpopular enough according to the popularity limit.", PN_PURGE,
783     {}, {}, {}},
784   {g_PropertyNames[PN_OLDEST], {B_GET_PROPERTY, 0},
785     {B_DIRECT_SPECIFIER, 0}, "Gets the age of the oldest message in the "
786     "database.  It's relative to the beginning of time, so you need to do "
787     "(total messages - age - 1) to see how many messages ago it was added.",
788     PN_OLDEST, {}, {}, {}},
789   {g_PropertyNames[PN_EVALUATE], {B_SET_PROPERTY, 0},
790     {B_DIRECT_SPECIFIER, 0}, "Evaluates a given file (by path name) to see "
791     "if it is spam or not.  Returns the ratio of spam probability vs genuine "
792     "probability, 0.0 meaning completely genuine, 1.0 for completely spam.  "
793     "Normally you should safely be able to consider it as spam if it is over "
794     "0.56 for the Robinson scoring method.  For the ChiSquared method, the "
795     "numbers are near 0 for genuine, near 1 for spam, and anywhere in the "
796     "middle means it can't decide.  The program attaches a MAIL:ratio_spam "
797     "attribute with the ratio as its "
798     "float32 value to the file.  Also returns the top few interesting words "
799     "in \"words\" and the associated per-word probability ratios in "
800     "\"ratios\".", PN_EVALUATE, {}, {}, {}},
801   {g_PropertyNames[PN_EVALUATE_STRING], {B_SET_PROPERTY, 0},
802     {B_DIRECT_SPECIFIER, 0}, "Like Evaluate, but rather than a file name, "
803     "the string argument contains the entire text of the message to be "
804     "evaluated.", PN_EVALUATE_STRING, {}, {}, {}},
805   {g_PropertyNames[PN_RESET_TO_DEFAULTS], {B_EXECUTE_PROPERTY, 0},
806     {B_DIRECT_SPECIFIER, 0}, "Resets all the configuration options to the "
807     "default values, including the database name.", PN_RESET_TO_DEFAULTS,
808     {}, {}, {}},
809   {g_PropertyNames[PN_INSTALL_THINGS], {B_EXECUTE_PROPERTY, 0},
810     {B_DIRECT_SPECIFIER, 0}, "Creates indices for the MAIL:classification and "
811     "MAIL:ratio_spam attributes on all volumes which support BeOS queries, "
812     "identifies them to the system as e-mail related attributes (modifies "
813     "the text/x-email MIME type), and sets up the new MIME type "
814     "(text/x-vnd.agmsmith.spam_probability_database) for the database file.  "
815     "Also registers names for the sound effects used by the separate filter "
816     "program (use the installsound BeOS program or the Sounds preferences "
817     "program to associate sound files with the names).", PN_INSTALL_THINGS,
818     {}, {}, {}},
819   {g_PropertyNames[PN_TOKENIZE_MODE], {B_SET_PROPERTY, 0},
820     {B_DIRECT_SPECIFIER, 0}, "Sets the method used for breaking up the "
821     "message into words.  Use \"Whole\" for the whole file (also use it for "
822     "non-email files).  The file isn't broken into parts; the whole thing is "
823     "converted into words, headers and attachments are just more raw data.  "
824     "Well, not quite raw data since it converts quoted-printable codes "
825     "(equals sign followed by hex digits or end of line) to the equivalent "
826     "single characters.  \"PlainText\" breaks the file into MIME components "
827     "and only looks at the ones which are of MIME type text/plain.  "
828     "\"AnyText\" will look for words in all text/* things, including "
829     "text/html attachments.  \"AllParts\" will decode all message components "
830     "and look for words in them, including binary attachments.  "
831     "\"JustHeader\" will only look for words in the message header.  "
832     "\"AllPartsAndHeader\", \"PlainTextAndHeader\" and \"AnyTextAndHeader\" "
833     "will also include the words from the message headers.", PN_TOKENIZE_MODE,
834     {}, {}, {}},
835   {g_PropertyNames[PN_TOKENIZE_MODE], {B_GET_PROPERTY, 0},
836     {B_DIRECT_SPECIFIER, 0}, "Gets the method used for breaking up the "
837     "message into words.", PN_TOKENIZE_MODE, {}, {}, {}},
838   {g_PropertyNames[PN_SCORING_MODE], {B_SET_PROPERTY, 0},
839     {B_DIRECT_SPECIFIER, 0}, "Sets the method used for combining the "
840     "probabilities of individual words into an overall score.  "
841     "\"Robinson\" mode will use Gary Robinson's nth root of the product "
842     "method.  It gives a nice range of values between 0 and 1 so you can "
843     "see shades of spaminess.  The cutoff point between spam and genuine "
844     "varies depending on your database of words (0.56 was one point in "
845     "some experiments).  \"ChiSquared\" mode will use chi-squared "
846     "statistics to evaluate the difference in probabilities that the lists "
847     "of word ratios are random.  The result is very close to 0 for genuine "
848     "and very close to 1 for spam, and near the middle if it is uncertain.",
849     PN_SCORING_MODE, {}, {}, {}},
850   {g_PropertyNames[PN_SCORING_MODE], {B_GET_PROPERTY, 0},
851     {B_DIRECT_SPECIFIER, 0}, "Gets the method used for combining the "
852     "individual word ratios into an overall score.", PN_SCORING_MODE,
853     {}, {}, {}},
854   {0, {0}, {0}, 0, 0, {}, {}, {}} /* End of list of property commands. */
855 };
856 
857 
858 /* The various scoring modes as text and enums.  See PN_SCORING_MODE. */
859 
enum ScoringModeEnum
{
  SM_ROBINSON   = 0, /* Robinson's nth root of the product method. */
  SM_CHISQUARED = 1, /* Chi-squared statistics on the word ratios. */
  SM_MAX        = 2  /* Count of real modes; also sizes the name table. */
};

typedef enum ScoringModeEnum ScoringModes;

/* Text name of each scoring mode, listed in the same order as the enum so
that a ScoringModes value can be used directly as the index. */

static const char * g_ScoringModeNames [SM_MAX] =
{
  /* SM_ROBINSON   */ "Robinson",
  /* SM_CHISQUARED */ "ChiSquared"
};
872 
873 
874 /* The various tokenizing modes as text and enums.  See PN_TOKENIZE_MODE. */
875 
/* The per-mode notes paraphrase the PN_TOKENIZE_MODE help text. */

enum TokenizeModeEnum
{
  TM_WHOLE             = 0, /* Whole file treated as one piece of text. */
  TM_PLAIN_TEXT        = 1, /* Only the text/plain MIME components. */
  TM_PLAIN_TEXT_HEADER = 2, /* As TM_PLAIN_TEXT, plus header words. */
  TM_ANY_TEXT          = 3, /* All text components, text/html included. */
  TM_ANY_TEXT_HEADER   = 4, /* As TM_ANY_TEXT, plus header words. */
  TM_ALL_PARTS         = 5, /* Every component, even binary attachments. */
  TM_ALL_PARTS_HEADER  = 6, /* As TM_ALL_PARTS, plus header words. */
  TM_JUST_HEADER       = 7, /* Only the words in the message header. */
  TM_MAX               = 8  /* Count of real modes; sizes the name table. */
};

typedef enum TokenizeModeEnum TokenizeModes;

/* Text name of each tokenizing mode, listed in the same order as the enum so
that a TokenizeModes value can be used directly as the index.

NOTE(review): the PN_TOKENIZE_MODE help text tells the user to specify
"Whole", "PlainText", "AnyTextAndHeader" and so on, which is not how the
strings below are spelled.  Confirm which spelling a scripting request is
actually compared against before changing either side. */

static const char * g_TokenizeModeNames [TM_MAX] =
{
  /* TM_WHOLE             */ "All",
  /* TM_PLAIN_TEXT        */ "Plain text",
  /* TM_PLAIN_TEXT_HEADER */ "Plain text and header",
  /* TM_ANY_TEXT          */ "Any text",
  /* TM_ANY_TEXT_HEADER   */ "Any text and header",
  /* TM_ALL_PARTS         */ "All parts",
  /* TM_ALL_PARTS_HEADER  */ "All parts and header",
  /* TM_JUST_HEADER       */ "Just header"
};
900 
901 
902 /* Possible message classifications. */
903 
enum ClassificationTypesEnum
{
  CL_GENUINE   = 0, /* A wanted, non-spam message. */
  CL_SPAM      = 1, /* An unwanted junk message. */
  CL_UNCERTAIN = 2, /* Not classified, or the classification was undone. */
  CL_MAX       = 3  /* Count of real classes; sizes the name table.  The
                    ClassificationChoicesWindow notes say it is also the
                    answer reported when the user cancels. */
};

typedef enum ClassificationTypesEnum ClassificationTypes;
911 
/* Text name of each classification, in the same order as the
ClassificationTypes enum so that an enum value can be used as the index.  The
first two entries reuse the g_ClassifiedGenuine and g_ClassifiedSpam strings,
which are defined outside this table (presumably the same text that gets
written to the MAIL:classification attribute - confirm at their definition). */

static const char * g_ClassificationTypeNames [CL_MAX] =
{
  g_ClassifiedGenuine, /* CL_GENUINE */
  g_ClassifiedSpam, /* CL_SPAM */
  "Uncertain" /* CL_UNCERTAIN */
};
918 
919 
920 /* Some polygon graphics for the scroll arrows. */
921 
922 static BPoint g_UpLinePoints [] =
923 {
924   BPoint (8, 2 * (1)),
925   BPoint (14, 2 * (6)),
926   BPoint (10, 2 * (6)),
927   BPoint (10, 2 * (13)),
928   BPoint (6, 2 * (13)),
929   BPoint (6, 2 * (6)),
930   BPoint (2, 2 * (6))
931 };
932 
933 static BPoint g_DownLinePoints [] =
934 {
935   BPoint (8, 2 * (14-1)),
936   BPoint (14, 2 * (14-6)),
937   BPoint (10, 2 * (14-6)),
938   BPoint (10, 2 * (14-13)),
939   BPoint (6, 2 * (14-13)),
940   BPoint (6, 2 * (14-6)),
941   BPoint (2, 2 * (14-6))
942 };
943 
944 static BPoint g_UpPagePoints [] =
945 {
946   BPoint (8, 2 * (1)),
947   BPoint (13, 2 * (6)),
948   BPoint (10, 2 * (6)),
949   BPoint (14, 2 * (10)),
950   BPoint (10, 2 * (10)),
951   BPoint (10, 2 * (13)),
952   BPoint (6, 2 * (13)),
953   BPoint (6, 2 * (10)),
954   BPoint (2, 2 * (10)),
955   BPoint (6, 2 * (6)),
956   BPoint (3, 2 * (6))
957 };
958 
959 static BPoint g_DownPagePoints [] =
960 {
961   BPoint (8, 2 * (14-1)),
962   BPoint (13, 2 * (14-6)),
963   BPoint (10, 2 * (14-6)),
964   BPoint (14, 2 * (14-10)),
965   BPoint (10, 2 * (14-10)),
966   BPoint (10, 2 * (14-13)),
967   BPoint (6, 2 * (14-13)),
968   BPoint (6, 2 * (14-10)),
969   BPoint (2, 2 * (14-10)),
970   BPoint (6, 2 * (14-6)),
971   BPoint (3, 2 * (14-6))
972 };
973 
974 
/* An array of flags to identify characters which are considered to be spaces.
If character code X has g_SpaceCharacters[X] set to true then it is a
space-like character.  The table only has entries for codes 0 to 127, so a
character code must be checked to be below 128 before it is used as an index.
Character codes 128 and above are always non-space since they are bytes of
multi-byte UTF-8 characters.  Initialised in the ABSApp constructor. */

static bool g_SpaceCharacters [128];
981 
982 
983 
984 /******************************************************************************
985  * Each word in the spam database gets one of these structures.  The database
986  * has a string (the word) as the key and this structure as the value
987  * (statistics for that word).
988  */
989 
990 typedef struct StatisticsStruct
991 {
992   uint32 age;
993     /* Sequence number for the time when this word was last updated in the
994     database, so that we can remove old words (haven't been seen in recent
995     spam).  It's zero for the first file ever added (spam or genuine) to the
996     database, 1 for all words added or updated by the second file, etc.  If a
997     later file updates an existing word, it gets the age of the later file. */
998 
999   uint32 genuineCount;
1000     /* Number of genuine messages that have this word. */
1001 
1002   uint32 spamCount;
1003     /* A count of the number of spam e-mail messages which contain the word. */
1004 
1005 } StatisticsRecord, *StatisticsPointer;
1006 
1007 typedef map<string, StatisticsRecord> StatisticsMap;
1008   /* Define this type which will be used for our main data storage facility, so
1009   we can more conveniently specify things that are derived from it, like
1010   iterators. */
1011 
1012 
1013 
1014 /******************************************************************************
1015  * An alert box asking how the user wants to mark messages.  There are buttons
1016  * for each classification category, and a checkbox to mark all remaining N
1017  * messages the same way.  And a cancel button.  To use it, first create the
1018  * ClassificationChoicesWindow, specifying the input arguments.  Then call the
1019  * Go method which will show the window, stuff the user's answer into your
1020  * output arguments (class set to CL_MAX if the user cancels), and destroy the
1021  * window.  Implemented because BAlert only allows 3 buttons, max!
1022  */
1023 
1024 class ClassificationChoicesWindow : public BWindow
1025 {
1026 public:
1027   /* Constructor and destructor. */
1028   ClassificationChoicesWindow (BRect FrameRect,
1029     const char *FileName, int NumberOfFiles);
1030 
1031   /* BeOS virtual functions. */
1032   virtual void MessageReceived (BMessage *MessagePntr);
1033 
1034   /* Our methods. */
1035   void Go (bool *BulkModeSelectedPntr,
1036     ClassificationTypes *ChoosenClassificationPntr);
1037 
1038   /* Various message codes for various buttons etc. */
1039   static const uint32 MSG_CLASS_BUTTONS = 'ClB0';
1040   static const uint32 MSG_CANCEL_BUTTON = 'Cncl';
1041   static const uint32 MSG_BULK_CHECKBOX = 'BlkK';
1042 
1043 private:
1044   /* Member variables. */
1045   bool *m_BulkModeSelectedPntr;
1046   ClassificationTypes *m_ChoosenClassificationPntr;
1047 };
1048 
1049 class ClassificationChoicesView : public BView
1050 {
1051 public:
1052   /* Constructor and destructor. */
1053   ClassificationChoicesView (BRect FrameRect,
1054     const char *FileName, int NumberOfFiles);
1055 
1056   /* BeOS virtual functions. */
1057   virtual void AttachedToWindow ();
1058   virtual void GetPreferredSize (float *width, float *height);
1059 
1060 private:
1061   /* Member variables. */
1062   const char *m_FileName;
1063   int         m_NumberOfFiles;
1064   float       m_PreferredBottomY;
1065 };
1066 
1067 
1068 
1069 /******************************************************************************
1070  * Due to deadlock problems with the BApplication posting scripting messages to
 * itself, we need to add a second Looper.  Its job is just to convert
1072  * command line arguments and arguments from the Tracker (refs received) into a
1073  * series of scripting commands sent to the main BApplication.  It also prints
1074  * out the replies received (to stdout for command line replies).  An instance
1075  * of this class will be created and run by the main() function, and shut down
1076  * by it too.
1077  */
1078 
1079 class CommanderLooper : public BLooper
1080 {
1081 public:
1082   CommanderLooper ();
1083   ~CommanderLooper ();
1084   virtual void MessageReceived (BMessage *MessagePntr);
1085 
1086   void CommandArguments (int argc, char **argv);
1087   void CommandReferences (BMessage *MessagePntr,
1088     bool BulkMode = false,
1089     ClassificationTypes BulkClassification = CL_GENUINE);
1090   bool IsBusy ();
1091 
1092 private:
1093   void ProcessArgs (BMessage *MessagePntr);
1094   void ProcessRefs (BMessage *MessagePntr);
1095 
1096   static const uint32 MSG_COMMAND_ARGUMENTS = 'CArg';
1097   static const uint32 MSG_COMMAND_FILE_REFS = 'CRef';
1098 
1099   bool m_IsBusy;
1100 };
1101 
1102 
1103 
1104 /******************************************************************************
1105  * This view contains the various buttons and other controls for setting
1106  * configuration options and displaying the state of the database (but not the
1107  * actual list of words).  It will appear in the top half of the
1108  * DatabaseWindow.
1109  */
1110 
1111 class ControlsView : public BView
1112 {
1113 public:
1114   /* Constructor and destructor. */
1115   ControlsView (BRect NewBounds);
1116   ~ControlsView ();
1117 
1118   /* BeOS virtual functions. */
1119   virtual void AttachedToWindow ();
1120   virtual void FrameResized (float Width, float Height);
1121   virtual void MessageReceived (BMessage *MessagePntr);
1122   virtual void Pulse ();
1123 
1124 private:
1125   /* Various message codes for various buttons etc. */
1126   static const uint32 MSG_BROWSE_BUTTON = 'Brws';
1127   static const uint32 MSG_DATABASE_NAME = 'DbNm';
1128   static const uint32 MSG_ESTIMATE_BUTTON = 'Estm';
1129   static const uint32 MSG_ESTIMATE_FILE_REFS = 'ERef';
1130   static const uint32 MSG_IGNORE_CLASSIFICATION = 'IPCl';
1131   static const uint32 MSG_PURGE_AGE = 'PuAg';
1132   static const uint32 MSG_PURGE_BUTTON = 'Purg';
1133   static const uint32 MSG_PURGE_POPULARITY = 'PuPo';
1134   static const uint32 MSG_SERVER_MODE = 'SrvM';
1135 
1136   /* Our member functions. */
1137   void BrowseForDatabaseFile ();
1138   void BrowseForFileToEstimate ();
1139   void PollServerForChanges ();
1140 
1141   /* Member variables. */
1142   BButton        *m_AboutButtonPntr;
1143   BButton        *m_AddExampleButtonPntr;
1144   BButton        *m_BrowseButtonPntr;
1145   BFilePanel     *m_BrowseFilePanelPntr;
1146   BButton        *m_CreateDatabaseButtonPntr;
1147   char            m_DatabaseFileNameCachedValue [PATH_MAX];
1148   BTextControl   *m_DatabaseFileNameTextboxPntr;
1149   bool            m_DatabaseLoadDone;
1150   BButton        *m_EstimateSpamButtonPntr;
1151   BFilePanel     *m_EstimateSpamFilePanelPntr;
1152   uint32          m_GenuineCountCachedValue;
1153   BTextControl   *m_GenuineCountTextboxPntr;
1154   bool            m_IgnorePreviousClassCachedValue;
1155   BCheckBox      *m_IgnorePreviousClassCheckboxPntr;
1156   BButton        *m_InstallThingsButtonPntr;
1157   uint32          m_PurgeAgeCachedValue;
1158   BTextControl   *m_PurgeAgeTextboxPntr;
1159   BButton        *m_PurgeButtonPntr;
1160   uint32          m_PurgePopularityCachedValue;
1161   BTextControl   *m_PurgePopularityTextboxPntr;
1162   BButton        *m_ResetToDefaultsButtonPntr;
1163   ScoringModes    m_ScoringModeCachedValue;
1164   BMenuBar       *m_ScoringModeMenuBarPntr;
1165   BPopUpMenu     *m_ScoringModePopUpMenuPntr;
1166   bool            m_ServerModeCachedValue;
1167   BCheckBox      *m_ServerModeCheckboxPntr;
1168   uint32          m_SpamCountCachedValue;
1169   BTextControl   *m_SpamCountTextboxPntr;
1170   bigtime_t       m_TimeOfLastPoll;
1171   TokenizeModes   m_TokenizeModeCachedValue;
1172   BMenuBar       *m_TokenizeModeMenuBarPntr;
1173   BPopUpMenu     *m_TokenizeModePopUpMenuPntr;
1174   uint32          m_WordCountCachedValue;
1175   BTextControl   *m_WordCountTextboxPntr;
1176 };
1177 
1178 
/* Various message codes for various buttons etc.  These four are for
scrolling by a line or by a page, up or down.  Unlike the other MSG_ codes
nearby they are declared at file level rather than inside a class.
NOTE(review): presumably so that more than one class can use them - confirm
against the users of these names. */
static const uint32 MSG_LINE_DOWN = 'LnDn';
static const uint32 MSG_LINE_UP = 'LnUp';
static const uint32 MSG_PAGE_DOWN = 'PgDn';
static const uint32 MSG_PAGE_UP = 'PgUp';
1184 
1185 /******************************************************************************
1186  * This view contains the list of words.  It displays as many as can fit in the
1187  * view rectangle, starting at a specified word (so it can simulate scrolling).
1188  * Usually it will appear in the bottom half of the DatabaseWindow.
1189  */
1190 
1191 class WordsView : public BView
1192 {
1193 public:
1194   /* Constructor and destructor. */
1195   WordsView (BRect NewBounds);
1196 
1197   /* BeOS virtual functions. */
1198   virtual void AttachedToWindow ();
1199   virtual void Draw (BRect UpdateRect);
1200   virtual void KeyDown (const char *BufferPntr, int32 NumBytes);
1201   virtual void MakeFocus (bool Focused);
1202   virtual void MessageReceived (BMessage *MessagePntr);
1203   virtual void MouseDown (BPoint point);
1204   virtual void Pulse ();
1205 
1206 private:
1207   /* Our member functions. */
1208   void MoveTextUpOrDown (uint32 MovementType);
1209   void RefsDroppedHere (BMessage *MessagePntr);
1210 
1211   /* Member variables. */
1212   BPictureButton *m_ArrowLineDownPntr;
1213   BPictureButton *m_ArrowLineUpPntr;
1214   BPictureButton *m_ArrowPageDownPntr;
1215   BPictureButton *m_ArrowPageUpPntr;
1216     /* Various buttons for controlling scrolling, since we can't use a scroll
1217     bar.  To make them less obvious, their background view colour needs to be
1218     changed whenever the main view's colour changes. */
1219 
1220   float m_AscentHeight;
1221     /* The ascent height for the font used to draw words.  Height from the top
1222     of the highest letter to the base line (which is near the middle bottom of
1223     the letters, the line where you would align your writing of the text by
1224     hand, all letters have part above, some also have descenders below this
1225     line). */
1226 
1227   rgb_color m_BackgroundColour;
1228     /* The current background colour.  Changes when the focus changes. */
1229 
1230   uint32 m_CachedTotalGenuineMessages;
1231   uint32 m_CachedTotalSpamMessages;
1232   uint32 m_CachedWordCount;
1233     /* These are cached copies of the similar values in the BApplication.  They
1234     reflect what's currently displayed.  If they are different than the values
1235     from the BApplication then the polling loop will try to redraw the display.
1236     They get set to the values actually used during drawing when drawing is
1237     successful. */
1238 
1239   char m_FirstDisplayedWord [g_MaxWordLength + 1];
1240     /* The scrolling display starts at this word.  Since we can't use index
1241     numbers (word[12345] for example), we use the word itself.  The scroll
1242     buttons set this to the next or previous word in the database.  Typing by
1243     the user when the view has the focus will also change this starting word.
1244     */
1245 
1246   rgb_color m_FocusedColour;
1247     /* The colour to use for focused mode (typing by the user is received by
1248     our view). */
1249 
1250   bigtime_t m_LastTimeAKeyWasPressed;
1251     /* Records the time when a key was last pressed.  Used for determining when
1252     the user has stopped typing a batch of letters. */
1253 
1254   float m_LineHeight;
1255     /* Height of a line of text in the font used for the word display.
1256     Includes the height of the letters plus a bit of extra space for between
1257     the lines (called leading). */
1258 
1259   BFont m_TextFont;
1260     /* The font used to draw the text in the window. */
1261 
1262   float m_TextHeight;
1263     /* Maximum total height of the letters in the text, includes the part above
1264     the baseline and the part below.  Doesn't include the sliver of space
1265     between lines. */
1266 
1267   rgb_color m_UnfocusedColour;
1268     /* The colour to use for unfocused mode, when user typing isn't active. */
1269 };
1270 
1271 
1272 
1273 /******************************************************************************
1274  * The BWindow class for this program.  It displays the database in real time,
1275  * and has various buttons and gadgets in the top half for changing settings
1276  * (live changes, no OK button, and they reflect changes done by other programs
1277  * using the server too).  The bottom half is a scrolling view listing all the
1278  * words in the database.  A simple graphic blotch behind each word shows
1279  * whether the word is strongly or weakly related to spam or genuine messages.
1280  * Most operations go through the scripting message system, but it also peeks
1281  * at the BApplication data for examining simple things and when redrawing the
1282  * list of words.
1283  */
1284 
1285 class DatabaseWindow : public BWindow
1286 {
1287 public:
1288   /* Constructor and destructor. */
1289   DatabaseWindow ();
1290 
1291   /* BeOS virtual functions. */
1292   virtual void MessageReceived (BMessage *MessagePntr);
1293   virtual bool QuitRequested ();
1294 
1295 private:
1296   /* Member variables. */
1297   ControlsView *m_ControlsViewPntr;
1298   WordsView    *m_WordsViewPntr;
1299 };
1300 
1301 
1302 
1303 /******************************************************************************
1304  * ABSApp is the BApplication class for this program.  This handles messages
1305  * from the outside world (requests to load a database, or to add files to the
1306  * collection).  It responds to command line arguments (if you start up the
1307  * program a second time, the system will just send the arguments to the
1308  * existing running program).  It responds to scripting messages.  And it
1309  * responds to messages from the window.  Its thread does the main work of
1310  * updating the database and reading / writing files.
1311  */
1312 
1313 class ABSApp : public BApplication
1314 {
1315 public:
1316   /* Constructor and destructor. */
1317   ABSApp ();
1318   ~ABSApp ();
1319 
1320   /* BeOS virtual functions. */
1321   virtual void AboutRequested ();
1322   virtual void ArgvReceived (int32 argc, char **argv);
1323   virtual status_t GetSupportedSuites (BMessage *MessagePntr);
1324   virtual void MessageReceived (BMessage *MessagePntr);
1325   virtual void Pulse ();
1326   virtual bool QuitRequested ();
1327   virtual void ReadyToRun ();
1328   virtual void RefsReceived (BMessage *MessagePntr);
1329   virtual BHandler *ResolveSpecifier (BMessage *MessagePntr, int32 Index,
1330     BMessage *SpecifierMsgPntr, int32 SpecificationKind, const char *Property);
1331 
1332 private:
1333   /* Our member functions. */
1334   status_t AddFileToDatabase (ClassificationTypes IsSpamOrWhat,
1335     const char *FileName, char *ErrorMessage);
1336   status_t AddPositionIOToDatabase (ClassificationTypes IsSpamOrWhat,
1337     BPositionIO *MessageIOPntr, const char *OptionalFileName,
1338     char *ErrorMessage);
1339   status_t AddStringToDatabase (ClassificationTypes IsSpamOrWhat,
1340     const char *String, char *ErrorMessage);
1341   void AddWordsToSet (const char *InputString, size_t NumberOfBytes,
1342     char PrefixCharacter, set<string> &WordSet);
1343   status_t CreateDatabaseFile (char *ErrorMessage);
1344   void DefaultSettings ();
1345   status_t DeleteDatabaseFile (char *ErrorMessage);
1346   status_t EvaluateFile (const char *PathName, BMessage *ReplyMessagePntr,
1347     char *ErrorMessage);
1348   status_t EvaluatePositionIO (BPositionIO *PositionIOPntr,
1349     const char *OptionalFileName, BMessage *ReplyMessagePntr,
1350     char *ErrorMessage);
1351   status_t EvaluateString (const char *BufferPntr, ssize_t BufferSize,
1352     BMessage *ReplyMessagePntr, char *ErrorMessage);
1353   status_t GetWordsFromPositionIO (BPositionIO *PositionIOPntr,
1354     const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
1355   status_t InstallThings (char *ErrorMessage);
1356   status_t LoadDatabaseIfNeeded (char *ErrorMessage);
1357   status_t LoadSaveDatabase (bool DoLoad, char *ErrorMessage);
1358 public:
1359   status_t LoadSaveSettings (bool DoLoad);
1360 private:
1361   status_t MakeBackup (char *ErrorMessage);
1362   void MakeDatabaseEmpty ();
1363   void ProcessScriptingMessage (BMessage *MessagePntr,
1364     struct property_info *PropInfoPntr);
1365   status_t PurgeOldWords (char *ErrorMessage);
1366   status_t RecursivelyTokenizeMailComponent (
1367     BMailComponent *ComponentPntr, const char *OptionalFileName,
1368     set<string> &WordSet, char *ErrorMessage,
1369     int RecursionLevel, int MaxRecursionLevel);
1370   status_t SaveDatabaseIfNeeded (char *ErrorMessage);
1371   status_t TokenizeParts (BPositionIO *PositionIOPntr,
1372     const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
1373   status_t TokenizeWhole (BPositionIO *PositionIOPntr,
1374     const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
1375 
1376 public:
1377   /* Member variables.  Many are read by the window thread to see if it needs
1378   updating, and to draw the words.  However, the other threads will lock the
  BApplication or use scripting commands if they want to make changes. */
1380 
1381   bool m_DatabaseHasChanged;
1382     /* Set to TRUE when the in-memory database (stored in m_WordMap) has
1383     changed and is different from the on-disk database file.  When the
1384     application exits, the database will be written out if it has changed. */
1385 
1386   BString m_DatabaseFileName;
1387     /* The absolute path name to use for the database file on disk. */
1388 
1389   bool m_IgnorePreviousClassification;
1390     /* If TRUE then the previous classification of a message (stored in an
1391     attribute on the message file) will be ignored, and the message will be
1392     added to the requested spam/genuine list.  If this is FALSE then the spam
1393     won't be added to the list if it has already been classified as specified,
1394     but if it was mis-classified, it will be removed from the old list and
1395     added to the new list. */
1396 
1397   uint32 m_OldestAge;
1398     /* The age of the oldest word.  This will be the smallest age number in the
1399     database.  Mostly useful for scaling graphics representing age in the word
1400     display.  If the oldest word is no longer the oldest, this variable won't
1401     get immediately updated since it would take a lot of effort to find the
1402     next older age.  Since it's only used for display, we'll let it be slightly
1403     incorrect.  The next database load or purge will fix it. */
1404 
1405   uint32 m_PurgeAge;
1406     /* When purging old words, they have to be at least this old to be eligible
1407     for deletion.  Age is measured as the number of e-mails added to the
1408     database since the word was last updated in the database.  Zero means all
1409     words are old. */
1410 
1411   uint32 m_PurgePopularity;
1412     /* When purging old words, they have to be less than or equal to this
1413     popularity limit to be eligible for deletion.  Popularity is measured as
1414     the number of messages (spam and genuine) which have the word.  Zero means
1415     no words. */
1416 
1417   ScoringModes m_ScoringMode;
1418     /* Controls how to combine the word probabilities into an overall score.
1419     See the PN_SCORING_MODE comments for details. */
1420 
1421   BPath m_SettingsDirectoryPath;
1422     /* The constructor initialises this to the settings directory path.  It
1423     never changes after that. */
1424 
1425   bool m_SettingsHaveChanged;
1426     /* Set to TRUE when the settings are changed (different than the ones which
1427     were loaded).  When the application exits, the settings will be written out
1428     if they have changed. */
1429 
1430   double m_SmallestUseableDouble;
1431     /* When multiplying fractional numbers together, avoid using numbers
1432     smaller than this because the double exponent range is close to being
1433     exhausted.  The IEEE STANDARD 754 floating-point arithmetic (used on the
1434     Intel i8087 and later math processors) has 64 bit numbers with 53 bits of
1435     mantissa, giving it an underflow starting at 0.5**1022 = 2.2e-308 where it
1436     rounds off to the nearest multiple of 0.5**1074 = 4.9e-324. */
1437 
1438   TokenizeModes m_TokenizeMode;
1439     /* Controls how to convert the raw message text into words.  See the
1440     PN_TOKENIZE_MODE comments for details. */
1441 
1442   uint32 m_TotalGenuineMessages;
1443     /* Number of genuine messages which are in the database. */
1444 
1445   uint32 m_TotalSpamMessages;
1446     /* Number of spam messages which are in the database. */
1447 
1448   uint32 m_WordCount;
1449     /* The number of words currently in the database.  Stored separately as a
1450     member variable to avoid having to call m_WordMap.size() all the time,
1451     which other threads can't do while the database is being updated (but they
1452     can look at the word count variable). */
1453 
1454   StatisticsMap m_WordMap;
1455     /* The in-memory data structure holding the set of words and their
1456     associated statistics.  When the database isn't in use, it is an empty
1457     collection.  You should lock the BApplication if you are using the word
1458     collection (reading or writing) from another thread. */
1459 };
1460 
1461 
1462 
1463 /******************************************************************************
1464  * Global utility function to display an error message and return.  The message
1465  * part describes the error, and if ErrorNumber is non-zero, gets the string
1466  * ", error code $X (standard description)." appended to it.  If the message
1467  * is NULL then it gets defaulted to "Something went wrong".  The title part
1468  * doesn't get displayed (no title bar in the dialog box, but you can see it in
1469  * the debugger as the window thread name), and defaults to "Error Message" if
1470  * you didn't specify one.  If running in command line mode, the error gets
1471  * printed to stderr rather than showing up in a dialog box.
1472  */
1473 
1474 static void
1475 DisplayErrorMessage (
1476   const char *MessageString = NULL,
1477   int ErrorNumber = 0,
1478   const char *TitleString = NULL)
1479 {
1480   BAlert *AlertPntr;
1481   char ErrorBuffer [PATH_MAX + 1500];
1482 
1483   if (TitleString == NULL)
1484     TitleString = "SpamDBM Error Message";
1485 
1486   if (MessageString == NULL)
1487   {
1488     if (ErrorNumber == 0)
1489       MessageString = "No error, no message, why bother?";
1490     else
1491       MessageString = "Something went wrong";
1492   }
1493 
1494   if (ErrorNumber != 0)
1495   {
1496     sprintf (ErrorBuffer, "%s, error code $%X/%d (%s) has occured.",
1497       MessageString, ErrorNumber, ErrorNumber, strerror (ErrorNumber));
1498     MessageString = ErrorBuffer;
1499   }
1500 
1501   if (g_CommandLineMode || g_ServerMode)
1502     cerr << TitleString << ": " << MessageString << endl;
1503   else
1504   {
1505     AlertPntr = new BAlert (TitleString, MessageString,
1506       "Acknowledge", NULL, NULL, B_WIDTH_AS_USUAL, B_STOP_ALERT);
1507     if (AlertPntr != NULL) {
1508       AlertPntr->SetFlags(AlertPntr->Flags() | B_CLOSE_ON_ESCAPE);
1509       AlertPntr->Go ();
1510     }
1511   }
1512 }
1513 
1514 
1515 
/******************************************************************************
 * Word wrap a long line of text into shorter lines of at most 79 columns and
 * print the result on the given output stream, each line followed by an end of
 * line.  Lines are broken at white space; the white space at the break is
 * dropped.  A word longer than a whole line is split into 79 character chunks.
 * The last line is printed as it is, including any trailing white space.
 * Nothing is printed if the text is NULL, empty or all white space.
 */

static void
WrapTextToStream (std::ostream& OutputStream, const char *TextPntr)
{
  const size_t LineLength = 79;
  size_t       BreakIndex;
  size_t       ChunkLength;

  if (TextPntr == NULL)
    return; /* No text at all, nothing to print. */

  while (true)
  {
    /* Skip leading spaces.  The characters are converted to unsigned char
    first since the ctype functions are undefined for negative values, which
    is what bytes with the high bit set are when char is signed. */

    while (isspace ((unsigned char) *TextPntr))
      TextPntr++;
    if (*TextPntr == 0)
      break; /* It was all spaces, don't print any more. */

    /* See how much text is left, counting up to one full line of it. */

    ChunkLength = 0;
    while (ChunkLength < LineLength && TextPntr[ChunkLength] != 0)
      ChunkLength++;

    if (TextPntr[ChunkLength] == 0)
    {
      /* The rest of the text fits completely, even if it is exactly one full
      line long. */
      OutputStream << TextPntr << std::endl;
      break;
    }

    /* More text follows.  If the chunk stops in the middle of a word then back
    up to just after the last white space in the chunk.  If the character after
    the chunk is white space then the chunk ends with a whole word and can be
    used as it is. */

    BreakIndex = ChunkLength;
    if (!isspace ((unsigned char) TextPntr[ChunkLength]))
    {
      while (BreakIndex > 0 &&
      !isspace ((unsigned char) TextPntr[BreakIndex - 1]))
        BreakIndex--;
    }

    /* Remove trailing spaces at the end of the line, in case there were
    several spaces in a row. */

    while (BreakIndex > 0 &&
    isspace ((unsigned char) TextPntr[BreakIndex - 1]))
      BreakIndex--;

    /* This line has no spaces, don't wrap it, just split off a chunk. */

    if (BreakIndex == 0)
      BreakIndex = ChunkLength;

    /* Print the line of text and advance the text pointer too. */

    OutputStream << std::string (TextPntr, BreakIndex) << std::endl;
    TextPntr += BreakIndex;
  }
}
1585 
1586 
1587 
1588 /******************************************************************************
1589  * Print the usage info to the stream.  Includes a list of all commands.
1590  */
1591 ostream& PrintUsage (ostream& OutputStream);
1592 
1593 ostream& PrintUsage (ostream& OutputStream)
1594 {
1595   struct property_info *PropInfoPntr;
1596 
1597   OutputStream << "\nSpamDBM - A Spam Database Manager\n";
1598   OutputStream << "Copyright © 2002 by Alexander G. M. Smith.  ";
1599   OutputStream << "Released to the public domain.\n\n";
1600   WrapTextToStream (OutputStream, "Compiled on " __DATE__ " at " __TIME__
1601 ".  $Id: spamdbm.cpp 30630 2009-05-05 01:31:01Z bga $  $HeadURL: http://svn.haiku-os.org/haiku/haiku/trunk/src/bin/mail_utils/spamdbm.cpp $");
1602   OutputStream << "\n"
1603 "This is a program for classifying e-mail messages as spam (junk mail which\n"
1604 "you don't want to read) and regular genuine messages.  It can learn what's\n"
1605 "spam and what's genuine.  You just give it a bunch of spam messages and a\n"
1606 "bunch of non-spam ones.  It uses them to make a list of the words from the\n"
1607 "messages with the probability that each word is from a spam message or from\n"
1608 "a genuine message.  Later on, it can use those probabilities to classify\n"
1609 "new messages as spam or not spam.  If the classifier stops working well\n"
1610 "(because the spammers have changed their writing style and vocabulary, or\n"
1611 "your regular correspondants are writing like spammers), you can use this\n"
1612 "program to update the list of words to identify the new messages\n"
1613 "correctly.\n"
1614 "\n"
1615 "The original idea was from Paul Graham's algorithm, which has an excellent\n"
1616 "writeup at: http://www.paulgraham.com/spam.html\n"
1617 "\n"
1618 "Gary Robinson came up with the improved algorithm, which you can read about at:\n"
1619 "http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html\n"
1620 "\n"
1621 "Then he, Tim Peters and the SpamBayes mailing list developed the Chi-Squared\n"
1622 "test, see http://mail.python.org/pipermail/spambayes/2002-October/001036.html\n"
1623 "for one of the earlier messages leading from the central limit theorem to\n"
1624 "the current chi-squared scoring method.\n"
1625 "\n"
1626 "Thanks go to Isaac Yonemoto for providing a better icon, which we can\n"
1627 "unfortunately no longer use, since the Hormel company wants people to\n"
1628 "avoid associating their meat product with junk e-mail.\n"
1629 "\n"
1630 "Tokenising code updated in 2005 to use some of the tricks that SpamBayes\n"
1631 "uses to extract words from messages.  In particular, HTML is now handled.\n"
1632 "\n"
1633 "Usage: Specify the operation as the first argument followed by more\n"
1634 "information as appropriate.  The program's configuration will affect the\n"
1635 "actual operation (things like the name of the database file to use, or\n"
1636 "whether it should allow non-email messages to be added).  In command line\n"
1637 "mode it will do the operation and exit.  In GUI/server mode a command line\n"
1638 "invocation will just send the command to the running server.  You can also\n"
1639 "use BeOS scripting (see the \"Hey\" command which you can get from\n"
1640 "http://www.bebits.com/app/2042 ) to control the Spam server.  And finally,\n"
1641 "there's also a GUI interface which shows up if you start it without any\n"
1642 "command line arguments.\n"
1643 "\n"
1644 "Commands:\n"
1645 "\n"
1646 "Quit\n"
1647 "Stop the program.  Useful if it's running as a server.\n"
1648 "\n";
1649 
1650   /* Go through all our scripting commands and add a description of each one to
1651   the usage text. */
1652 
1653   for (PropInfoPntr = g_ScriptingPropertyList + 0;
1654   PropInfoPntr->name != 0;
1655   PropInfoPntr++)
1656   {
1657     switch (PropInfoPntr->commands[0])
1658     {
1659       case B_GET_PROPERTY:
1660         OutputStream << "Get " << PropInfoPntr->name << endl;
1661         break;
1662 
1663       case B_SET_PROPERTY:
1664         OutputStream << "Set " << PropInfoPntr->name << " NewValue" << endl;
1665         break;
1666 
1667       case B_COUNT_PROPERTIES:
1668         OutputStream << "Count " << PropInfoPntr->name << endl;
1669         break;
1670 
1671       case B_CREATE_PROPERTY:
1672         OutputStream << "Create " << PropInfoPntr->name << endl;
1673         break;
1674 
1675       case B_DELETE_PROPERTY:
1676         OutputStream << "Delete " << PropInfoPntr->name << endl;
1677         break;
1678 
1679       case B_EXECUTE_PROPERTY:
1680         OutputStream << PropInfoPntr->name << endl;
1681         break;
1682 
1683       default:
1684         OutputStream << "Buggy Command: " << PropInfoPntr->name << endl;
1685         break;
1686     }
1687     WrapTextToStream (OutputStream, (char *)PropInfoPntr->usage);
1688     OutputStream << endl;
1689   }
1690 
1691   return OutputStream;
1692 }
1693 
1694 
1695 
1696 /******************************************************************************
1697  * A utility function to send a command to the application, will return after a
1698  * short delay if the application is busy (doesn't wait for it to be executed).
1699  * The reply from the application is also thrown away.  It used to be an
1700  * overloaded function, but the system couldn't distinguish between bool and
1701  * int, so now it has slightly different names depending on the arguments.
1702  */
1703 
1704 static void
1705 SubmitCommand (BMessage& CommandMessage)
1706 {
1707   status_t ErrorCode;
1708 
1709   ErrorCode = be_app_messenger.SendMessage (&CommandMessage,
1710     be_app_messenger /* reply messenger, throw away the reply */,
1711     1000000 /* delivery timeout */);
1712 
1713   if (ErrorCode != B_OK)
1714     cerr << "SubmitCommand failed to send a command, code " <<
1715     ErrorCode << " (" << strerror (ErrorCode) << ")." << endl;
1716 }
1717 
1718 
1719 static void
1720 SubmitCommandString (
1721   PropertyNumbers Property,
1722   uint32 CommandCode,
1723   const char *StringArgument = NULL)
1724 {
1725   BMessage CommandMessage (CommandCode);
1726 
1727   if (Property < 0 || Property >= PN_MAX)
1728   {
1729     DisplayErrorMessage ("SubmitCommandString bug.");
1730     return;
1731   }
1732   CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1733   if (StringArgument != NULL)
1734     CommandMessage.AddString (g_DataName, StringArgument);
1735   SubmitCommand (CommandMessage);
1736 }
1737 
1738 
1739 static void
1740 SubmitCommandInt32 (
1741   PropertyNumbers Property,
1742   uint32 CommandCode,
1743   int32 Int32Argument)
1744 {
1745   BMessage CommandMessage (CommandCode);
1746 
1747   if (Property < 0 || Property >= PN_MAX)
1748   {
1749     DisplayErrorMessage ("SubmitCommandInt32 bug.");
1750     return;
1751   }
1752   CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1753   CommandMessage.AddInt32 (g_DataName, Int32Argument);
1754   SubmitCommand (CommandMessage);
1755 }
1756 
1757 
1758 static void
1759 SubmitCommandBool (
1760   PropertyNumbers Property,
1761   uint32 CommandCode,
1762   bool BoolArgument)
1763 {
1764   BMessage CommandMessage (CommandCode);
1765 
1766   if (Property < 0 || Property >= PN_MAX)
1767   {
1768     DisplayErrorMessage ("SubmitCommandBool bug.");
1769     return;
1770   }
1771   CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1772   CommandMessage.AddBool (g_DataName, BoolArgument);
1773   SubmitCommand (CommandMessage);
1774 }
1775 
1776 
1777 
1778 /******************************************************************************
1779  * A utility function which will estimate the spaminess of file(s), not
1780  * callable from the application thread since it sends a scripting command to
1781  * the application and waits for results.  For each file there will be an entry
1782  * reference in the message.  For each of those, run it through the spam
1783  * estimator and display a box with the results.  This function is used both by
1784  * the file requestor and by dragging and dropping into the middle of the words
1785  * view.
1786  */
1787 
1788 static void
1789 EstimateRefFilesAndDisplay (BMessage *MessagePntr)
1790 {
1791   BAlert     *AlertPntr;
1792   BEntry      Entry;
1793   entry_ref   EntryRef;
1794   status_t    ErrorCode;
1795   int         i, j;
1796   BPath       Path;
1797   BMessage    ReplyMessage;
1798   BMessage    ScriptingMessage;
1799   const char *StringPntr;
1800   float       TempFloat;
1801   int32       TempInt32;
1802   char        TempString [PATH_MAX + 1024 +
1803                 g_MaxInterestingWords * (g_MaxWordLength + 16)];
1804 
1805   for (i = 0; MessagePntr->FindRef ("refs", i, &EntryRef) == B_OK; i++)
1806   {
1807     /* See if the entry is a valid file or directory or other thing. */
1808 
1809     ErrorCode = Entry.SetTo (&EntryRef, true /* traverse symbolic links */);
1810     if (ErrorCode != B_OK || !Entry.Exists () || Entry.GetPath (&Path) != B_OK)
1811       continue;
1812 
1813     /* Evaluate the spaminess of the file. */
1814 
1815     ScriptingMessage.MakeEmpty ();
1816     ScriptingMessage.what = B_SET_PROPERTY;
1817     ScriptingMessage.AddSpecifier (g_PropertyNames[PN_EVALUATE]);
1818     ScriptingMessage.AddString (g_DataName, Path.Path ());
1819 
1820     if (be_app_messenger.SendMessage (&ScriptingMessage,&ReplyMessage) != B_OK)
1821       break; /* App has died or something is wrong. */
1822 
1823     if (ReplyMessage.FindInt32 ("error", &TempInt32) != B_OK ||
1824     TempInt32 != B_OK)
1825       break; /* Error messages will be displayed elsewhere. */
1826 
1827     ReplyMessage.FindFloat (g_ResultName, &TempFloat);
1828     sprintf (TempString, "%f spam ratio for \"%s\".\nThe top words are:",
1829       (double) TempFloat, Path.Path ());
1830 
1831     for (j = 0; j < 20 /* Don't print too many! */; j++)
1832     {
1833       if (ReplyMessage.FindString ("words", j, &StringPntr) != B_OK ||
1834       ReplyMessage.FindFloat ("ratios", j, &TempFloat) != B_OK)
1835         break;
1836 
1837       sprintf (TempString + strlen (TempString), "\n%s / %f",
1838         StringPntr, TempFloat);
1839     }
1840     if (j >= 20 && j < g_MaxInterestingWords)
1841       sprintf (TempString + strlen (TempString), "\nAnd up to %d more words.",
1842         g_MaxInterestingWords - j);
1843 
1844     AlertPntr = new BAlert ("Estimate", TempString, "OK");
1845     if (AlertPntr != NULL) {
1846       AlertPntr->SetFlags(AlertPntr->Flags() | B_CLOSE_ON_ESCAPE);
1847       AlertPntr->Go ();
1848     }
1849   }
1850 }
1851 
1852 
1853 
/******************************************************************************
 * A utility function from the http://sourceforge.net/projects/spambayes
 * SpamBayes project.  Returns prob(chisq >= x2, with v degrees of freedom),
 * the probability that a chi-squared value (a kind of normalized error
 * measurement) with v degrees of freedom would be larger than the given number
 * x2 (chi is the Greek letter X thus x2).  A result near zero means that your
 * measured error number is unusually large - chi-squared is rarely above it
 * merely due to random effects.  A larger result (usually over 5%) means that
 * chi-squared goes over your number fairly often just by chance.  The result
 * is never more than 1.0.  v must be even for this calculation to work; for an
 * odd v the out of range value -1.0 is returned as a hint.
 */

static double ChiSquaredProbability (double x2, int v)
{
  if ((v % 2) != 0)
    return -1.0; /* Out of range return value as a hint v is odd. */

  const int    NumberOfTerms = v / 2;
  const double HalfX2 = x2 / 2.0;

  /* The answer is exp(-m) * (1 + m + m^2/2! + ... + m^(v/2-1)/(v/2-1)!) where
  m is x2/2.  Each term is made from the previous one.  If x2 is very large,
  exp(-m) will underflow to 0 and so will everything built on it. */

  double Term = exp (-HalfX2);
  double Total = Term;

  for (int TermIndex = 1; TermIndex < NumberOfTerms; ++TermIndex)
  {
    Term *= HalfX2 / TermIndex;
    Total += Term;
  }

  /* With small x2 and large v, accumulated roundoff error, plus error in the
  platform exp(), can cause the total to spill a few ULP above 1.0 (seen with
  x2 = 100 and v = 300).  Returning a value even a teensy bit over 1.0 is no
  good, so clamp it. */

  return (Total > 1.0) ? 1.0 : Total;
}
1897 
1898 
1899 
1900 /******************************************************************************
1901  * A utility function to remove the "[Spam 99.9%] " from in front of the
1902  * MAIL:subject attribute of a file.
1903  */
1904 
1905 static status_t RemoveSpamPrefixFromSubjectAttribute (BNode *BNodePntr)
1906 {
1907   status_t    ErrorCode;
1908   const char *MailSubjectName = "MAIL:subject";
1909   char       *StringPntr;
1910   char        SubjectString [2000];
1911 
1912   ErrorCode = BNodePntr->ReadAttr (MailSubjectName,
1913     B_STRING_TYPE, 0 /* offset */, SubjectString,
1914     sizeof (SubjectString) - 1);
1915   if (ErrorCode <= 0)
1916     return 0; /* The attribute isn't there so we don't care. */
1917   if (ErrorCode >= (int) sizeof (SubjectString) - 1)
1918     return 0; /* Can't handle subjects which are too long. */
1919 
1920   SubjectString [ErrorCode] = 0;
1921   ErrorCode = 0; /* So do-nothing exit returns zero. */
1922   if (strncmp (SubjectString, "[Spam ", 6) == 0)
1923   {
1924     for (StringPntr = SubjectString;
1925     *StringPntr != 0 && *StringPntr != ']'; StringPntr++)
1926       ; /* No body in this for loop. */
1927     if (StringPntr[0] == ']' && StringPntr[1] == ' ')
1928     {
1929       ErrorCode = BNodePntr->RemoveAttr (MailSubjectName);
1930       ErrorCode = BNodePntr->WriteAttr (MailSubjectName,
1931         B_STRING_TYPE, 0 /* offset */,
1932         StringPntr + 2, strlen (StringPntr + 2) + 1);
1933       if (ErrorCode > 0)
1934         ErrorCode = 0;
1935     }
1936   }
1937 
1938   return ErrorCode;
1939 }
1940 
1941 
1942 
1943 /******************************************************************************
1944  * The tokenizing functions.  To make tokenization of the text easier to
1945  * understand, it is broken up into several passes.  Each pass goes over the
1946  * text (can include NUL bytes) and extracts all the words it can recognise
1947  * (can be none).  The extracted words are added to the WordSet, with the
1948  * PrefixCharacter prepended (zero if none) so we can distinguish between words
1949  * found in headers and in the text body.  It also modifies the input text
1950  * buffer in-place to change the text that the next pass will see (blanking out
1951  * words that it wants to delete, but not inserting much new text since the
1952  * buffer can't be enlarged).  They all return the number of bytes remaining in
1953  * InputString after it has been modified to be input for the next pass.
1954  * Returns zero if it has exhausted the possibility of getting more words, or
1955  * if something goes wrong.
1956  */
1957 
/* Convert the ASCII capital letters A to Z in the buffer to lower case, in
place.  All other bytes, including NUL bytes and bytes with the high bit set,
are left alone.  Returns the number of bytes in the buffer, which is the same
as the number it was given. */

static size_t TokenizerPassLowerCase (
  char *BufferPntr,
  size_t NumberOfBytes)
{
  size_t Index;
  char   Letter;

  for (Index = 0; Index < NumberOfBytes; Index++)
  {
    /* Do our own lower case conversion; tolower () has problems with UTF-8
    characters that have the high bit set. */

    Letter = BufferPntr[Index];
    if (Letter >= 'A' && Letter <= 'Z')
      BufferPntr[Index] = Letter + ('a' - 'A');
  }
  return NumberOfBytes;
}
1977 
1978 
1979 /* A utility function for some commonly repeated code.  If this was Modula-2,
1980 we could use a nested procedure.  But it's not.  Adds the given word to the set
1981 of words, checking for maximum word length and prepending the prefix to the
1982 word, which gets modified by this function to reflect the word actually added
1983 to the set. */
1984 
1985 static void
1986 AddWordAndPrefixToSet (
1987   string &Word,
1988   const char *PrefixString,
1989   set<string> &WordSet)
1990 {
1991   if (Word.empty ())
1992     return;
1993 
1994   if (Word.size () > g_MaxWordLength)
1995     Word.resize (g_MaxWordLength);
1996   Word.insert (0, PrefixString);
1997   WordSet.insert (Word);
1998 }
1999 
2000 
2001 /* Hunt through the text for various URLs and extract the components as
2002 separate words.  Doesn't affect the text in the buffer.  Looks for
2003 protocol://user:password@computer:port/path?query=key#anchor strings.  Also
2004 www.blah strings are detected and broken down.  Doesn't do HREF="" strings
2005 where the string has a relative path (no host computer name).  Assumes the
2006 input buffer is already in lower case. */
2007 
2008 static size_t TokenizerPassExtractURLs (
2009   char *BufferPntr,
2010   size_t NumberOfBytes,
2011   char PrefixCharacter,
2012   set<string> &WordSet)
2013 {
2014   char   *AtSignStringPntr;
2015   char   *HostStringPntr;
2016   char   *InputStringEndPntr;
2017   char   *InputStringPntr;
2018   char   *OptionsStringPntr;
2019   char   *PathStringPntr;
2020   char    PrefixString [2];
2021   char   *ProtocolStringPntr;
2022   string  Word;
2023 
2024   InputStringPntr = BufferPntr;
2025   InputStringEndPntr = BufferPntr + NumberOfBytes;
2026   PrefixString [0] = PrefixCharacter;
2027   PrefixString [1] = 0;
2028 
2029   while (InputStringPntr < InputStringEndPntr - 4)
2030   {
2031     HostStringPntr = NULL;
2032     if (memcmp (InputStringPntr, "www.", 4) == 0)
2033       HostStringPntr = InputStringPntr;
2034     else if (memcmp (InputStringPntr, "://", 3) == 0)
2035     {
2036       /* Find the protocol name, and add it as a word such as "ftp:" "http:" */
2037       ProtocolStringPntr = InputStringPntr;
2038       while (ProtocolStringPntr > BufferPntr &&
2039       isalpha (ProtocolStringPntr[-1]))
2040         ProtocolStringPntr--;
2041       Word.assign (ProtocolStringPntr,
2042         (InputStringPntr - ProtocolStringPntr) + 1 /* for the colon */);
2043       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2044       HostStringPntr = InputStringPntr + 3; /* Skip past the "://" */
2045     }
2046     if (HostStringPntr == NULL)
2047     {
2048       InputStringPntr++;
2049       continue;
2050     }
2051 
2052     /* Got a host name string starting at HostStringPntr.  It's everything
2053     until the next slash or space, like "user:password@computer:port". */
2054 
2055     InputStringPntr = HostStringPntr;
2056     AtSignStringPntr = NULL;
2057     while (InputStringPntr < InputStringEndPntr &&
2058     (*InputStringPntr != '/' && !isspace (*InputStringPntr)))
2059     {
2060       if (*InputStringPntr == '@')
2061         AtSignStringPntr = InputStringPntr;
2062       InputStringPntr++;
2063     }
2064     if (AtSignStringPntr != NULL)
2065     {
2066       /* Add a word with the user and password, unseparated. */
2067       Word.assign (HostStringPntr,
2068         AtSignStringPntr - HostStringPntr + 1 /* for the @ sign */);
2069       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2070       HostStringPntr = AtSignStringPntr + 1;
2071     }
2072 
2073     /* Add a word with the computer and port, unseparated. */
2074 
2075     Word.assign (HostStringPntr, InputStringPntr - HostStringPntr);
2076     AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2077 
2078     /* Now get the path name, not including the extra junk after ?  and #
2079     separators (they're stored as separate options).  Stops at white space or a
2080     double quote mark. */
2081 
2082     PathStringPntr = InputStringPntr;
2083     OptionsStringPntr = NULL;
2084     while (InputStringPntr < InputStringEndPntr &&
2085     (*InputStringPntr != '"' && !isspace (*InputStringPntr)))
2086     {
2087       if (OptionsStringPntr == NULL &&
2088       (*InputStringPntr == '?' || *InputStringPntr == '#'))
2089         OptionsStringPntr = InputStringPntr;
2090       InputStringPntr++;
2091     }
2092 
2093     if (OptionsStringPntr == NULL)
2094     {
2095       /* No options, all path. */
2096       Word.assign (PathStringPntr, InputStringPntr - PathStringPntr);
2097       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2098     }
2099     else
2100     {
2101       /* Insert the path before the options. */
2102       Word.assign (PathStringPntr, OptionsStringPntr - PathStringPntr);
2103       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2104 
2105       /* Insert all the options as a word. */
2106       Word.assign (OptionsStringPntr, InputStringPntr - OptionsStringPntr);
2107       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2108     }
2109   }
2110   return NumberOfBytes;
2111 }
2112 
2113 
2114 /* Replace long Asian words (likely to actually be sentences) with the first
2115 character in the word. */
2116 
2117 static size_t TokenizerPassTruncateLongAsianWords (
2118   char *BufferPntr,
2119   size_t NumberOfBytes)
2120 {
2121   char *EndOfStringPntr;
2122   char *InputStringPntr;
2123   int   Letter;
2124   char *OutputStringPntr;
2125   char *StartOfInputLongUnicodeWord;
2126   char *StartOfOutputLongUnicodeWord;
2127 
2128   InputStringPntr = BufferPntr;
2129   EndOfStringPntr = InputStringPntr + NumberOfBytes;
2130   OutputStringPntr = InputStringPntr;
2131   StartOfInputLongUnicodeWord = NULL; /* Non-NULL flags it as started. */
2132   StartOfOutputLongUnicodeWord = NULL;
2133 
2134   /* Copy the text from the input to the output (same buffer), but when we find
2135   a sequence of UTF-8 characters that is too long then truncate it down to one
2136   character and reset the output pointer to be after that character, thus
2137   deleting the word.  Replacing the deleted characters after it with spaces
2138   won't work since we need to preserve the lack of space to handle those sneaky
2139   HTML artificial word breakers.  So that Thelongword<blah>ing becomes
2140   "T<blah>ing" rather than "T <blah>ing", so the next step joins them up into
2141   "Ting" rather than "T" and "ing".  The first code in a UTF-8 character is
2142   11xxxxxx and subsequent ones are 10xxxxxx. */
2143 
2144   while (InputStringPntr < EndOfStringPntr)
2145   {
2146     Letter = (unsigned char) *InputStringPntr;
2147     if (Letter < 128) // Got a regular ASCII letter?
2148     {
2149       if (StartOfInputLongUnicodeWord != NULL)
2150       {
2151         if (InputStringPntr - StartOfInputLongUnicodeWord >
2152         (int) g_MaxWordLength * 2)
2153         {
2154           /* Need to truncate the long word (100 bytes or about 50 characters)
2155           back down to the first UTF-8 character, so find out where the first
2156           character ends (skip past the 10xxxxxx bytes), and rewind the output
2157           pointer to be just after that (ignoring the rest of the long word in
2158           effect). */
2159 
2160           OutputStringPntr = StartOfOutputLongUnicodeWord + 1;
2161           while (OutputStringPntr < InputStringPntr)
2162           {
2163             Letter = (unsigned char) *OutputStringPntr;
2164             if (Letter < 128 || Letter >= 192)
2165               break;
2166             ++OutputStringPntr; // Still a UTF-8 middle of the character code.
2167           }
2168         }
2169         StartOfInputLongUnicodeWord = NULL;
2170       }
2171     }
2172     else if (Letter >= 192 && StartOfInputLongUnicodeWord == NULL)
2173     {
2174       /* Got the start of a UTF-8 character.  Remember the spot so we can see
2175       if this is a too long UTF-8 word, which is often a whole sentence in
2176       asian languages, since they sort of use a single character per word. */
2177 
2178       StartOfInputLongUnicodeWord = InputStringPntr;
2179       StartOfOutputLongUnicodeWord = OutputStringPntr;
2180     }
2181     *OutputStringPntr++ = *InputStringPntr++;
2182   }
2183   return OutputStringPntr - BufferPntr;
2184 }
2185 
2186 
2187 /* Find all the words in the string and add them to our local set of words.
2188 The characters considered white space are defined by g_SpaceCharacters.  This
2189 function is also used as a subroutine by other tokenizer functions when they
2190 have a bunch of presumably plain text they want broken into words and added. */
2191 
2192 static size_t TokenizerPassGetPlainWords (
2193   char *BufferPntr,
2194   size_t NumberOfBytes,
2195   char PrefixCharacter,
2196   set<string> &WordSet)
2197 {
2198   string  AccumulatedWord;
2199   char   *EndOfStringPntr;
2200   size_t  Length;
2201   int     Letter;
2202 
2203   if (NumberOfBytes <= 0)
2204     return 0; /* Nothing to process. */
2205 
2206   if (PrefixCharacter != 0)
2207     AccumulatedWord = PrefixCharacter;
2208   EndOfStringPntr = BufferPntr + NumberOfBytes;
2209   while (true)
2210   {
2211     if (BufferPntr >= EndOfStringPntr)
2212       Letter = EOF; // Usually a negative number.
2213     else
2214       Letter = (unsigned char) *BufferPntr++;
2215 
2216     /* See if it is a letter we treat as white space.  Some word separators
2217     like dashes and periods aren't considered as space.  Note that codes above
2218     127 are UTF-8 characters, which we consider non-space. */
2219 
2220     if (Letter < 0 /* EOF is -1 */ ||
2221     (Letter < 128 && g_SpaceCharacters[Letter]))
2222     {
2223       /* That space finished off a word.  Remove trailing periods... */
2224 
2225       while ((Length = AccumulatedWord.size()) > 0 &&
2226       AccumulatedWord [Length-1] == '.')
2227         AccumulatedWord.resize (Length - 1);
2228 
2229       /* If there's anything left in the word, add it to the set.  Also ignore
2230       words which are too big (it's probably some binary encoded data).  But
2231       leave room for supercalifragilisticexpialidoceous.  According to one web
2232       site, pneumonoultramicroscopicsilicovolcanoconiosis is the longest word
2233       currently in English.  Note that some uuencoded data was seen with a 60
2234       character line length. */
2235 
2236       if (PrefixCharacter != 0)
2237         Length--; // Don't count prefix when judging size or emptiness.
2238       if (Length > 0 && Length <= g_MaxWordLength)
2239         WordSet.insert (AccumulatedWord);
2240 
2241       /* Empty out the string to get ready for the next word.  Not quite empty,
2242       start it off with the prefix character if any. */
2243 
2244       if (PrefixCharacter != 0)
2245         AccumulatedWord = PrefixCharacter;
2246       else
2247         AccumulatedWord.resize (0);
2248     }
2249     else /* Not a space-like character, add it to the word. */
2250       AccumulatedWord.append (1 /* one copy of the char */, (char) Letter);
2251 
2252     if (Letter < 0)
2253       break; /* End of data.  Exit here so that last word got processed. */
2254   }
2255   return NumberOfBytes;
2256 }
2257 
2258 
2259 /* Delete Things from the text.  The Thing is marked by a start string and an
2260 end string, such as "<!--" and "--> for HTML comment things.  All the text
2261 between the markers will be added to the word list before it gets deleted from
2262 the buffer.  The markers must be prepared in lower case and the buffer is
2263 assumed to have already been converted to lower case.  You can specify an empty
2264 string for the end marker if you're just matching a string constant like
2265 "&nbsp;", which you would put in the starting marker.  This is a utility
2266 function used by other tokenizer functions. */
2267 
2268 static size_t TokenizerUtilRemoveStartEndThing (
2269   char *BufferPntr,
2270   size_t NumberOfBytes,
2271   char PrefixCharacter,
2272   set<string> &WordSet,
2273   const char *ThingStartCode,
2274   const char *ThingEndCode,
2275   bool ReplaceWithSpace)
2276 {
2277   char *EndOfStringPntr;
2278   bool  FoundAndDeletedThing;
2279   char *InputStringPntr;
2280   char *OutputStringPntr;
2281   int   ThingEndLength;
2282   char *ThingEndPntr;
2283   int   ThingStartLength;
2284 
2285   InputStringPntr = BufferPntr;
2286   EndOfStringPntr = InputStringPntr + NumberOfBytes;
2287   OutputStringPntr = InputStringPntr;
2288   ThingStartLength = strlen (ThingStartCode);
2289   ThingEndLength = strlen (ThingEndCode);
2290 
2291   if (ThingStartLength <= 0)
2292     return NumberOfBytes; /* Need some things to look for first! */
2293 
2294   while (InputStringPntr < EndOfStringPntr)
2295   {
2296     /* Search for the starting marker. */
2297 
2298     FoundAndDeletedThing = false;
2299     if (EndOfStringPntr - InputStringPntr >=
2300     ThingStartLength + ThingEndLength /* space remains for start + end */ &&
2301     *InputStringPntr == *ThingStartCode &&
2302     memcmp (InputStringPntr, ThingStartCode, ThingStartLength) == 0)
2303     {
2304       /* Found the start marker.  Look for the terminating string.  If it is an
2305       empty string, then we've found it right now! */
2306 
2307       ThingEndPntr = InputStringPntr + ThingStartLength;
2308       while (EndOfStringPntr - ThingEndPntr >= ThingEndLength)
2309       {
2310         if (ThingEndLength == 0 ||
2311         (*ThingEndPntr == *ThingEndCode &&
2312         memcmp (ThingEndPntr, ThingEndCode, ThingEndLength) == 0))
2313         {
2314           /* Got the end of the Thing.  First dump the text inbetween the start
2315           and end markers into the words list. */
2316 
2317           TokenizerPassGetPlainWords (InputStringPntr + ThingStartLength,
2318             ThingEndPntr - (InputStringPntr + ThingStartLength),
2319             PrefixCharacter, WordSet);
2320 
2321           /* Delete by not updating the output pointer while moving the input
2322           pointer to just after the ending tag. */
2323 
2324           InputStringPntr = ThingEndPntr + ThingEndLength;
2325           if (ReplaceWithSpace)
2326             *OutputStringPntr++ = ' ';
2327           FoundAndDeletedThing = true;
2328           break;
2329         }
2330         ThingEndPntr++;
2331       } /* End while ThingEndPntr */
2332     }
2333     if (!FoundAndDeletedThing)
2334       *OutputStringPntr++ = *InputStringPntr++;
2335   } /* End while InputStringPntr */
2336 
2337   return OutputStringPntr - BufferPntr;
2338 }
2339 
2340 
2341 static size_t TokenizerPassRemoveHTMLComments (
2342   char *BufferPntr,
2343   size_t NumberOfBytes,
2344   char PrefixCharacter,
2345   set<string> &WordSet)
2346 {
2347   return TokenizerUtilRemoveStartEndThing (BufferPntr, NumberOfBytes,
2348     PrefixCharacter, WordSet, "<!--", "-->", false);
2349 }
2350 
2351 
2352 static size_t TokenizerPassRemoveHTMLStyle (
2353   char *BufferPntr,
2354   size_t NumberOfBytes,
2355   char PrefixCharacter,
2356   set<string> &WordSet)
2357 {
2358   return TokenizerUtilRemoveStartEndThing (BufferPntr, NumberOfBytes,
2359     PrefixCharacter, WordSet,
2360     "<style", "/style>", false /* replace with space if true */);
2361 }
2362 
2363 
2364 /* Convert Japanese periods (a round hollow dot symbol) to spaces so that the
2365 start of the next sentence is recognised at least as the start of a very long
2366 word.  The Japanese comma also does the same job. */
2367 
2368 static size_t TokenizerPassJapanesePeriodsToSpaces (
2369   char *BufferPntr,
2370   size_t NumberOfBytes,
2371   char PrefixCharacter,
2372   set<string> &WordSet)
2373 {
2374   size_t BytesRemaining = NumberOfBytes;
2375 
2376   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2377     BytesRemaining, PrefixCharacter, WordSet, "。" /* period */, "", true);
2378   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2379     BytesRemaining, PrefixCharacter, WordSet, "、" /* comma */, "", true);
2380   return BytesRemaining;
2381 }
2382 
2383 
2384 /* Delete HTML tags from the text.  The contents of the tag are added as words
2385 before being deleted.  <P>, <BR> and &nbsp; are replaced by spaces at this
2386 stage while other HTML things get replaced by nothing. */
2387 
2388 static size_t TokenizerPassRemoveHTMLTags (
2389   char *BufferPntr,
2390   size_t NumberOfBytes,
2391   char PrefixCharacter,
2392   set<string> &WordSet)
2393 {
2394   size_t BytesRemaining = NumberOfBytes;
2395 
2396   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2397     BytesRemaining, PrefixCharacter, WordSet, "&nbsp;", "", true);
2398   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2399     BytesRemaining, PrefixCharacter, WordSet, "<p", ">", true);
2400   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2401     BytesRemaining, PrefixCharacter, WordSet, "<br", ">", true);
2402   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2403     BytesRemaining, PrefixCharacter, WordSet, "<", ">", false);
2404   return BytesRemaining;
2405 }
2406 
2407 
2408 
2409 /******************************************************************************
2410  * Implementation of the ABSApp class, constructor, destructor and the rest of
2411  * the member functions in mostly alphabetical order.
2412  */
2413 
2414 ABSApp::ABSApp ()
2415 : BApplication (g_ABSAppSignature),
2416   m_DatabaseHasChanged (false),
2417   m_SettingsHaveChanged (false)
2418 {
2419   status_t    ErrorCode;
2420   int         HalvingCount;
2421   int         i;
2422   const void *ResourceData;
2423   size_t      ResourceSize;
2424   BResources *ResourcesPntr;
2425 
2426   MakeDatabaseEmpty ();
2427 
2428   /* Set up the pathname which identifies our settings directory.  Note that
2429   the actual settings are loaded later on (or set to defaults) by the main()
2430   function, before this BApplication starts running.  So we don't bother
2431   initialising the other setting related variables here. */
2432 
2433   ErrorCode =
2434     find_directory (B_USER_SETTINGS_DIRECTORY, &m_SettingsDirectoryPath);
2435   if (ErrorCode == B_OK)
2436     ErrorCode = m_SettingsDirectoryPath.Append (g_SettingsDirectoryName);
2437   if (ErrorCode != B_OK)
2438     m_SettingsDirectoryPath.SetTo (".");
2439 
2440   /* Set up the table which identifies which characters are spaces and which
2441   are not.  Spaces are all control characters and all punctuation except for:
2442   apostrophe (so "it's" and possessive versions of words get stored), dash (for
2443   hyphenated words), dollar sign (for cash amounts), period (for IP addresses,
2444   we later remove trailing periods). */
2445 
2446   memset (g_SpaceCharacters, 1, sizeof (g_SpaceCharacters));
2447   g_SpaceCharacters['\''] = false;
2448   g_SpaceCharacters['-'] = false;
2449   g_SpaceCharacters['$'] = false;
2450   g_SpaceCharacters['.'] = false;
2451   for (i = '0'; i <= '9'; i++)
2452     g_SpaceCharacters[i] = false;
2453   for (i = 'A'; i <= 'Z'; i++)
2454     g_SpaceCharacters[i] = false;
2455   for (i = 'a'; i <= 'z'; i++)
2456     g_SpaceCharacters[i] = false;
2457 
2458   /* Initialise the busy cursor from data in the application's resources. */
2459 
2460   if ((ResourcesPntr = AppResources ()) != NULL && (ResourceData =
2461   ResourcesPntr->LoadResource ('CURS', "Busy Cursor", &ResourceSize)) != NULL
2462   && ResourceSize >= 68 /* Size of a raw 2x16x16x8+4 cursor is 68 bytes */)
2463     g_BusyCursor = new BCursor (ResourceData);
2464 
2465   /* Find out the smallest usable double by seeing how small we can make it. */
2466 
2467   m_SmallestUseableDouble = 1.0;
2468   HalvingCount = 0;
2469   while (HalvingCount < 10000 && m_SmallestUseableDouble > 0.0)
2470   {
2471     HalvingCount++;
2472     m_SmallestUseableDouble /= 2;
2473   }
2474 
2475   /* Recreate the number.  But don't make quite as small, we want to allow some
2476   precision bits and a bit of extra margin for intermediate results in future
2477   calculations. */
2478 
2479   HalvingCount -= 50 + sizeof (double) * 8;
2480 
2481   m_SmallestUseableDouble = 1.0;
2482   while (HalvingCount > 0)
2483   {
2484     HalvingCount--;
2485     m_SmallestUseableDouble /= 2;
2486   }
2487 }
2488 
2489 
2490 ABSApp::~ABSApp ()
2491 {
2492   status_t ErrorCode;
2493   char     ErrorMessage [PATH_MAX + 1024];
2494 
2495   if (m_SettingsHaveChanged)
2496     LoadSaveSettings (false /* DoLoad */);
2497   if ((ErrorCode = SaveDatabaseIfNeeded (ErrorMessage)) != B_OK)
2498     DisplayErrorMessage (ErrorMessage, ErrorCode, "Exiting Error");
2499   delete g_BusyCursor;
2500   g_BusyCursor = NULL;
2501 }
2502 
2503 
2504 /* Display a box showing information about this program. */
2505 
2506 void
2507 ABSApp::AboutRequested ()
2508 {
2509   BAlert *AboutAlertPntr;
2510 
2511   AboutAlertPntr = new BAlert ("About",
2512 "SpamDBM - Spam Database Manager\n\n"
2513 
2514 "This is a BeOS program for classifying e-mail messages as spam (unwanted \
2515 junk mail) or as genuine mail using a Bayesian statistical approach.  There \
2516 is also a Mail Daemon Replacement add-on to filter mail using the \
2517 classification statistics collected earlier.\n\n"
2518 
2519 "Written by Alexander G. M. Smith, fall 2002.\n\n"
2520 
2521 "The original idea was from Paul Graham's algorithm, which has an excellent \
2522 writeup at: http://www.paulgraham.com/spam.html\n\n"
2523 
2524 "Gary Robinson came up with the improved algorithm, which you can read about \
2525 at: http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html\n\n"
2526 
2527 "Mr. Robinson, Tim Peters and the SpamBayes mailing list people then \
2528 developed the even better chi-squared scoring method.\n\n"
2529 
2530 "Icon courtesy of Isaac Yonemoto, though it is no longer used since Hormel \
2531 doesn't want their meat product associated with junk e-mail.\n\n"
2532 
2533 "Tokenising code updated in 2005 to use some of the tricks that SpamBayes \
2534 uses to extract words from messages.  In particular, HTML is now handled.\n\n"
2535 
2536 "Released to the public domain, with no warranty.\n"
2537 "$Revision: 30630 $\n"
2538 "Compiled on " __DATE__ " at " __TIME__ ".", "Done");
2539   if (AboutAlertPntr != NULL)
2540   {
2541     AboutAlertPntr->SetFlags(AboutAlertPntr->Flags() | B_CLOSE_ON_ESCAPE);
2542     AboutAlertPntr->Go ();
2543   }
2544 }
2545 
2546 
2547 /* Add the text in the given file to the database as an example of a spam or
2548 genuine message, or removes it from the database if you claim it is
2549 CL_UNCERTAIN.  Also resets the spam ratio attribute to show the effect of the
2550 database change. */
2551 
2552 status_t ABSApp::AddFileToDatabase (
2553   ClassificationTypes IsSpamOrWhat,
2554   const char *FileName,
2555   char *ErrorMessage)
2556 {
2557   status_t ErrorCode;
2558   BFile    MessageFile;
2559   BMessage TempBMessage;
2560 
2561   ErrorCode = MessageFile.SetTo (FileName, B_READ_ONLY);
2562   if (ErrorCode != B_OK)
2563   {
2564     sprintf (ErrorMessage, "Unable to open file \"%s\" for reading", FileName);
2565     return ErrorCode;
2566   }
2567 
2568   ErrorCode = AddPositionIOToDatabase (IsSpamOrWhat,
2569     &MessageFile, FileName, ErrorMessage);
2570   MessageFile.Unset ();
2571   if (ErrorCode != B_OK)
2572     return ErrorCode;
2573 
2574   /* Re-evaluate the file so that the user sees the new ratio attribute. */
2575   return EvaluateFile (FileName, &TempBMessage, ErrorMessage);
2576 }
2577 
2578 
2579 /* Add the given text to the database.  The unique words found in MessageIOPntr
2580 will be added to the database (incrementing the count for the number of
2581 messages using each word, either the spam or genuine count depending on
2582 IsSpamOrWhat).  It will remove the message (decrement the word counts) if you
2583 specify CL_UNCERTAIN as the new classification.  And if it switches from spam
2584 to genuine or vice versa, it will do both - decrement the counts for the old
2585 class and increment the counts for the new one.  An attribute will be added to
2586 MessageIOPntr (if it is a file) to record that it has been marked as Spam or
2587 Genuine (so that it doesn't get added to the database a second time).  If it is
2588 being removed from the database, the classification attribute gets removed too.
2589 If things go wrong, a non-zero error code will be returned and an explanation
2590 written to ErrorMessage (assumed to be at least PATH_MAX + 1024 bytes long).
2591 OptionalFileName is just used in the error message to identify the file to the
2592 user. */
2593 
2594 status_t ABSApp::AddPositionIOToDatabase (
2595   ClassificationTypes IsSpamOrWhat,
2596   BPositionIO *MessageIOPntr,
2597   const char *OptionalFileName,
2598   char *ErrorMessage)
2599 {
2600   BNode                             *BNodePntr;
2601   char                               ClassificationString [NAME_MAX];
2602   StatisticsMap::iterator            DataIter;
2603   status_t                           ErrorCode = 0;
2604   pair<StatisticsMap::iterator,bool> InsertResult;
2605   uint32                             NewAge;
2606   StatisticsRecord                   NewStatistics;
2607   ClassificationTypes                PreviousClassification;
2608   StatisticsPointer                  StatisticsPntr;
2609   set<string>::iterator              WordEndIter;
2610   set<string>::iterator              WordIter;
2611   set<string>                        WordSet;
2612 
2613   NewAge = m_TotalGenuineMessages + m_TotalSpamMessages;
2614   if (NewAge >= 0xFFFFFFF0UL)
2615   {
2616     sprintf (ErrorMessage, "The database is full!  There are %lu messages in "
2617       "it and we can't add any more without overflowing the maximum integer "
2618       "representation in 32 bits", NewAge);
2619     return B_NO_MEMORY;
2620   }
2621 
2622   /* Check that this file hasn't already been added to the database. */
2623 
2624   PreviousClassification = CL_UNCERTAIN;
2625   BNodePntr = dynamic_cast<BNode *> (MessageIOPntr);
2626   if (BNodePntr != NULL) /* If this thing might have attributes. */
2627   {
2628     ErrorCode = BNodePntr->ReadAttr (g_AttributeNameClassification,
2629       B_STRING_TYPE, 0 /* offset */, ClassificationString,
2630       sizeof (ClassificationString) - 1);
2631     if (ErrorCode <= 0) /* Positive values for the number of bytes read */
2632       strcpy (ClassificationString, "none");
2633     else /* Just in case it needs a NUL at the end. */
2634       ClassificationString [ErrorCode] = 0;
2635 
2636     if (strcasecmp (ClassificationString, g_ClassifiedSpam) == 0)
2637       PreviousClassification = CL_SPAM;
2638     else if (strcasecmp (ClassificationString, g_ClassifiedGenuine) == 0)
2639       PreviousClassification = CL_GENUINE;
2640   }
2641 
2642   if (!m_IgnorePreviousClassification &&
2643   PreviousClassification != CL_UNCERTAIN)
2644   {
2645     if (IsSpamOrWhat == PreviousClassification)
2646     {
2647       sprintf (ErrorMessage, "Ignoring file \"%s\" since it seems to have "
2648         "already been classified as %s.", OptionalFileName,
2649         g_ClassificationTypeNames [IsSpamOrWhat]);
2650     }
2651     else
2652     {
2653       sprintf (ErrorMessage, "Changing existing classification of file \"%s\" "
2654         "from %s to %s.", OptionalFileName,
2655         g_ClassificationTypeNames [PreviousClassification],
2656         g_ClassificationTypeNames [IsSpamOrWhat]);
2657     }
2658     DisplayErrorMessage (ErrorMessage, 0, "Note");
2659   }
2660 
2661   if (!m_IgnorePreviousClassification &&
2662   IsSpamOrWhat == PreviousClassification)
2663     /* Nothing to do if it is already classified correctly and the user doesn't
2664     want double classification. */
2665     return B_OK;
2666 
2667   /* Get the list of unique words in the file. */
2668 
2669   ErrorCode = GetWordsFromPositionIO (MessageIOPntr, OptionalFileName,
2670     WordSet, ErrorMessage);
2671   if (ErrorCode != B_OK)
2672     return ErrorCode;
2673 
2674   /* Update the count of the number of messages processed, with corrections if
2675   reclassifying a message. */
2676 
2677   m_DatabaseHasChanged = true;
2678 
2679   if (!m_IgnorePreviousClassification &&
2680   PreviousClassification == CL_SPAM && m_TotalSpamMessages > 0)
2681     m_TotalSpamMessages--;
2682 
2683   if (IsSpamOrWhat == CL_SPAM)
2684     m_TotalSpamMessages++;
2685 
2686   if (!m_IgnorePreviousClassification &&
2687   PreviousClassification == CL_GENUINE && m_TotalGenuineMessages > 0)
2688       m_TotalGenuineMessages--;
2689 
2690   if (IsSpamOrWhat == CL_GENUINE)
2691     m_TotalGenuineMessages++;
2692 
2693   /* Mark the file's attributes with the new classification.  Don't care if it
2694   fails. */
2695 
2696   if (BNodePntr != NULL) /* If this thing might have attributes. */
2697   {
2698     ErrorCode = BNodePntr->RemoveAttr (g_AttributeNameClassification);
2699     if (IsSpamOrWhat != CL_UNCERTAIN)
2700     {
2701       strcpy (ClassificationString, g_ClassificationTypeNames [IsSpamOrWhat]);
2702       ErrorCode = BNodePntr->WriteAttr (g_AttributeNameClassification,
2703         B_STRING_TYPE, 0 /* offset */,
2704         ClassificationString, strlen (ClassificationString) + 1);
2705     }
2706   }
2707 
2708   /* Add the words to the database by incrementing or decrementing the counts
2709   for each word as appropriate. */
2710 
2711   WordEndIter = WordSet.end ();
2712   for (WordIter = WordSet.begin (); WordIter != WordEndIter; WordIter++)
2713   {
2714     if ((DataIter = m_WordMap.find (*WordIter)) == m_WordMap.end ())
2715     {
2716       /* No record in the database for the word. */
2717 
2718       if (IsSpamOrWhat == CL_UNCERTAIN)
2719         continue; /* Not adding words, don't have to subtract from nothing. */
2720 
2721       /* Create a new one record in the database for the new word. */
2722 
2723       memset (&NewStatistics, 0, sizeof (NewStatistics));
2724       InsertResult = m_WordMap.insert (
2725         StatisticsMap::value_type (*WordIter, NewStatistics));
2726       if (!InsertResult.second)
2727       {
2728         sprintf (ErrorMessage, "Failed to insert new database entry for "
2729           "word \"%s\", while processing file \"%s\"",
2730           WordIter->c_str (), OptionalFileName);
2731         return B_NO_MEMORY;
2732       }
2733       DataIter = InsertResult.first;
2734       m_WordCount++;
2735     }
2736 
2737     /* Got the database record for the word, update the statistics. */
2738 
2739     StatisticsPntr = &DataIter->second;
2740 
2741     StatisticsPntr->age = NewAge;
2742 
2743     /* Can't update m_OldestAge here, since it would take a lot of effort to
2744     find the next older age.  Since it's only used for display, we'll let it be
2745     slightly incorrect.  The next database load or purge will fix it. */
2746 
2747     if (IsSpamOrWhat == CL_SPAM)
2748       StatisticsPntr->spamCount++;
2749 
2750     if (IsSpamOrWhat == CL_GENUINE)
2751       StatisticsPntr->genuineCount++;
2752 
2753     if (!m_IgnorePreviousClassification &&
2754     PreviousClassification == CL_SPAM && StatisticsPntr->spamCount > 0)
2755       StatisticsPntr->spamCount--;
2756 
2757     if (!m_IgnorePreviousClassification &&
2758     PreviousClassification == CL_GENUINE && StatisticsPntr->genuineCount > 0)
2759       StatisticsPntr->genuineCount--;
2760   }
2761 
2762   return B_OK;
2763 }
2764 
2765 
2766 /* Add the text in the string to the database as an example of a spam or
2767 genuine message. */
2768 
2769 status_t ABSApp::AddStringToDatabase (
2770   ClassificationTypes IsSpamOrWhat,
2771   const char *String,
2772   char *ErrorMessage)
2773 {
2774   BMemoryIO MemoryIO (String, strlen (String));
2775 
2776   return AddPositionIOToDatabase (IsSpamOrWhat, &MemoryIO,
2777    "Memory Buffer" /* OptionalFileName */, ErrorMessage);
2778 }
2779 
2780 
2781 /* Given a bunch of text, find the words within it (doing special tricks to
2782 extract words from HTML), and add them to the set.  Allow NULs in the text.  If
2783 the PrefixCharacter isn't zero then it is prepended to all words found (so you
2784 can distinguish words as being from a header or from the body text).  See also
2785 TokenizeWhole which does something similar. */
2786 
2787 void
2788 ABSApp::AddWordsToSet (
2789   const char *InputString,
2790   size_t NumberOfBytes,
2791   char PrefixCharacter,
2792   set<string> &WordSet)
2793 {
2794   char   *BufferPntr;
2795   size_t  CurrentSize;
2796   int     PassNumber;
2797 
2798   /* Copy the input buffer.  The code will be modifying it in-place as HTML
2799   fragments and other junk are deleted. */
2800 
2801   BufferPntr = new char [NumberOfBytes];
2802   if (BufferPntr == NULL)
2803     return;
2804   memcpy (BufferPntr, InputString, NumberOfBytes);
2805 
2806   /* Do the tokenization.  Each pass does something to the text in the buffer,
2807   and may add words to the word set. */
2808 
2809   CurrentSize = NumberOfBytes;
2810   for (PassNumber = 1; PassNumber <= 8 && CurrentSize > 0 ; PassNumber++)
2811   {
2812     switch (PassNumber)
2813     {
2814       case 1: /* Lowercase first, rest of them assume lower case inputs. */
2815         CurrentSize = TokenizerPassLowerCase (BufferPntr, CurrentSize);
2816         break;
2817       case 2: CurrentSize = TokenizerPassJapanesePeriodsToSpaces (
2818         BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
2819       case 3: CurrentSize = TokenizerPassTruncateLongAsianWords (
2820         BufferPntr, CurrentSize); break;
2821       case 4: CurrentSize = TokenizerPassRemoveHTMLComments (
2822         BufferPntr, CurrentSize, 'Z', WordSet); break;
2823       case 5: CurrentSize = TokenizerPassRemoveHTMLStyle (
2824         BufferPntr, CurrentSize, 'Z', WordSet); break;
2825       case 6: CurrentSize = TokenizerPassExtractURLs (
2826         BufferPntr, CurrentSize, 'Z', WordSet); break;
2827       case 7: CurrentSize = TokenizerPassRemoveHTMLTags (
2828         BufferPntr, CurrentSize, 'Z', WordSet); break;
2829       case 8: CurrentSize = TokenizerPassGetPlainWords (
2830         BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
2831       default: break;
2832     }
2833   }
2834 
2835   delete [] BufferPntr;
2836 }
2837 
2838 
2839 /* The user has provided a command line.  This could actually be from a
2840 separate attempt to invoke the program (this application's resource/attributes
2841 have the launch flags set to "single launch", so the shell doesn't start the
2842 program but instead sends the arguments to the already running instance).  In
2843 either case, the command is sent to an intermediary thread where it is
2844 asynchronously converted into a scripting message(s) that are sent back to this
2845 BApplication.  The intermediary is needed since we can't recursively execute
2846 scripting messages while processing a message (this ArgsReceived one). */
2847 
2848 void
2849 ABSApp::ArgvReceived (int32 argc, char **argv)
2850 {
2851   if (g_CommanderLooperPntr != NULL)
2852     g_CommanderLooperPntr->CommandArguments (argc, argv);
2853 }
2854 
2855 
2856 /* Create a new empty database.  Note that we have to write out the new file
2857 immediately, otherwise other operations will see the empty database and then
2858 try to load the file, and complain that it doesn't exist.  Now they will see
2859 the empty database and redundantly load the empty file. */
2860 
2861 status_t ABSApp::CreateDatabaseFile (char *ErrorMessage)
2862 {
2863   MakeDatabaseEmpty ();
2864   m_DatabaseHasChanged = true;
2865   return SaveDatabaseIfNeeded (ErrorMessage); /* Make it now. */
2866 }
2867 
2868 
2869 /* Set the settings to the defaults.  Needed in case there isn't a settings
2870 file or it is obsolete. */
2871 
2872 void
2873 ABSApp::DefaultSettings ()
2874 {
2875   status_t ErrorCode;
2876   BPath    DatabasePath (m_SettingsDirectoryPath);
2877   char     TempString [PATH_MAX];
2878 
2879   /* The default database file is in the settings directory. */
2880 
2881   ErrorCode = DatabasePath.Append (g_DefaultDatabaseFileName);
2882   if (ErrorCode != B_OK)
2883     strcpy (TempString, g_DefaultDatabaseFileName); /* Unlikely to happen. */
2884   else
2885     strcpy (TempString, DatabasePath.Path ());
2886   m_DatabaseFileName.SetTo (TempString);
2887 
2888   // Users need to be allowed to undo their mistakes...
2889   m_IgnorePreviousClassification = true;
2890   g_ServerMode = true;
2891   m_PurgeAge = 2000;
2892   m_PurgePopularity = 2;
2893   m_ScoringMode = SM_CHISQUARED;
2894   m_TokenizeMode = TM_ANY_TEXT_HEADER;
2895 
2896   m_SettingsHaveChanged = true;
2897 }
2898 
2899 
2900 /* Deletes the database file, and the backup file, and clears the database but
2901 marks it as not changed so that it doesn't get written out when the program
2902 exits. */
2903 
2904 status_t ABSApp::DeleteDatabaseFile (char *ErrorMessage)
2905 {
2906   BEntry   FileEntry;
2907   status_t ErrorCode;
2908   int      i;
2909   char     TempString [PATH_MAX+20];
2910 
2911   /* Clear the in-memory database. */
2912 
2913   MakeDatabaseEmpty ();
2914   m_DatabaseHasChanged = false;
2915 
2916   /* Delete the backup files first.  Don't care if it fails. */
2917 
2918   for (i = 0; i < g_MaxBackups; i++)
2919   {
2920     strcpy (TempString, m_DatabaseFileName.String ());
2921     sprintf (TempString + strlen (TempString), g_BackupSuffix, i);
2922     ErrorCode = FileEntry.SetTo (TempString);
2923     if (ErrorCode == B_OK)
2924       FileEntry.Remove ();
2925   }
2926 
2927   /* Delete the main database file. */
2928 
2929   strcpy (TempString, m_DatabaseFileName.String ());
2930   ErrorCode = FileEntry.SetTo (TempString);
2931   if (ErrorCode != B_OK)
2932   {
2933     sprintf (ErrorMessage, "While deleting, failed to make BEntry for "
2934       "\"%s\" (does the directory exist?)", TempString);
2935     return ErrorCode;
2936   }
2937 
2938   ErrorCode = FileEntry.Remove ();
2939   if (ErrorCode != B_OK)
2940     sprintf (ErrorMessage, "While deleting, failed to remove file "
2941       "\"%s\"", TempString);
2942 
2943   return ErrorCode;
2944 }
2945 
2946 
2947 /* Evaluate the given file as being a spam message, and tag it with the
2948 resulting spam probability ratio.  If it also has an e-mail subject attribute,
2949 remove the [Spam 99.9%] prefix since the number usually changes. */
2950 
2951 status_t ABSApp::EvaluateFile (
2952   const char *PathName,
2953   BMessage *ReplyMessagePntr,
2954   char *ErrorMessage)
2955 {
2956   status_t ErrorCode;
2957   float    TempFloat;
2958   BFile    TextFile;
2959 
2960   /* Open the specified file. */
2961 
2962   ErrorCode = TextFile.SetTo (PathName, B_READ_ONLY);
2963   if (ErrorCode != B_OK)
2964   {
2965     sprintf (ErrorMessage, "Problems opening file \"%s\" for evaluating",
2966       PathName);
2967     return ErrorCode;
2968   }
2969 
2970   ErrorCode =
2971     EvaluatePositionIO (&TextFile, PathName, ReplyMessagePntr, ErrorMessage);
2972 
2973   if (ErrorCode == B_OK &&
2974   ReplyMessagePntr->FindFloat (g_ResultName, &TempFloat) == B_OK)
2975   {
2976     TextFile.WriteAttr (g_AttributeNameSpamRatio, B_FLOAT_TYPE,
2977       0 /* offset */, &TempFloat, sizeof (TempFloat));
2978     /* Don't know the spam cutoff ratio, that's in the e-mail filter, so just
2979     blindly remove the prefix, which would have the wrong percentage. */
2980     RemoveSpamPrefixFromSubjectAttribute (&TextFile);
2981   }
2982 
2983   return ErrorCode;
2984 }
2985 
2986 
2987 /* Evaluate a given file or memory buffer (a BPositionIO handles both cases)
2988 for spaminess.  The output is added to the ReplyMessagePntr message, with the
2989 probability ratio stored in "result" (0.0 means genuine and 1.0 means spam).
2990 It also adds the most significant words (used in the ratio calculation) to the
2991 array "words" and the associated per-word probability ratios in "ratios".  If
2992 it fails, an error code is returned and an error message written to the
2993 ErrorMessage string (which is at least PATH_MAX + 1024 bytes long).
2994 OptionalFileName is only used in the error message.
2995 
2996 The math used for combining the individual word probabilities in my method is
2997 based on Gary Robinson's method (formerly it was a variation of Paul Graham's
2998 method) or the Chi-Squared method.  Its input is the database of words that
2999 has a count of the number of spam and number of genuine messages each word
3000 appears in (doesn't matter if it appears more than once in a message, it still
3001 counts as 1).
3002 
3003 The spam word count is divided by the total number of spam e-mail messages
3004 in the database to get the probability of spam and probability of genuineness
3005 is similarly computed for a particular word.  The spam probability is divided
3006 by the sum of the spam and genuine probabilities to get the Raw Spam Ratio for
3007 the word.  It's nearer to 0.0 for genuine and nearer to 1.0 for spam, and can
3008 be exactly zero or one too.
3009 
3010 To avoid multiplying later results by zero, and to compensate for a lack of
3011 data points, the Raw Spam Ratio is adjusted towards the 0.5 halfway point.  The
3012 0.5 is combined with the raw spam ratio, with a weight of 0.45 (determined to
3013 be a good value by the "spambayes" mailing list tests) messages applied to the
3014 half way point and a weight of the number of spam + genuine messages applied to
3015 the raw spam ratio.  This gives you the compensated spam ratio for the word.
3016 
3017 The top N (150 was good in the spambayes tests) extreme words are selected by
3018 the distance of each word's compensated spam ratio from 0.5.  Then the ratios
3019 of the words are combined.
3020 
3021 The Gary Robinson combining (scoring) method gets one value from the Nth root
3022 of the product of all the word ratios.  The other is the Nth root of the
3023 product of (1 - ratio) for all the words.  The final result is the first value
3024 divided by the sum of the two values.  The Nth root helps spread the resulting
3025 range of values more evenly between 0.0 and 1.0, otherwise the values all clump
3026 together at 0 or 1.  Also you can think of the Nth root as a kind of average
3027 for products; it's like a generic word probability which when multiplied by
3028 itself N times gives you the same result as the N separate actual word
3029 probabilities multiplied together.
3030 
3031 The Chi-Squared combining (scoring) method assumes that the spam word
3032 probabilities are uniformly distributed and computes an error measurement
3033 (called chi squared - see http://bmj.com/collections/statsbk/8.shtml for a good
3034 tutorial) and then sees how likely that error value would be observed in
3035 practice.  If it's rare to observe, then the words are likely not just randomly
3036 occurring and it's spammy.  The same is done for genuine words.  The two
3037 resulting unlikelynesses are compared to see which is more unlikely, if neither
3038 is, then the method says it can't decide.  The SpamBayes notes (see the
3039 classifier.py file in CVS in http://sourceforge.net/projects/spambayes) say:
3040 
3041 "Across vectors of length n, containing random uniformly-distributed
3042 probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution with 2*n
3043 degrees of freedom.  This has been proven (in some appropriate sense) to be the
3044 most sensitive possible test for rejecting the hypothesis that a vector of
3045 probabilities is uniformly distributed.  Gary Robinson's original scheme was
3046 monotonic *with* this test, but skipped the details.  Turns out that getting
3047 closer to the theoretical roots gives a much sharper classification, with a
3048 very small (in # of msgs), but also very broad (in range of scores), "middle
3049 ground", where most of the mistakes live.  In particular, this scheme seems
3050 immune to all forms of "cancellation disease": if there are many strong ham
3051 *and* spam clues, this reliably scores close to 0.5.  Most other schemes are
3052 extremely certain then -- and often wrong."
3053 
3054 I did a test with 448 example genuine messages including personal mail (some
3055 with HTML attachments) and mailing lists, and 267 spam messages for 27471 words
3056 total.  Test messages were more recent messages in the same groups.  Out of 100
3057 test genuine messages, with Gary Robinson (0.56 cutoff limit), 1 (1%) was
3058 falsely identified as spam and 8 of 73 (11%) spam messages were incorrectly
3059 classified as genuine.  With my variation of Paul Graham's scheme (0.90 cutoff)
3060 I got 6 of 100 (6%) genuine messages incorrectly marked as spam and 2 of 73
3061 (3%) spam messages were incorrectly classified as genuine.  Pretty close, but
3062 Robinson's values are more evenly spread out so you can tell just how spammy it
3063 is by looking at the number. */
3064 
/* A word paired with its compensated spam ratio.  The structure is also used
as the ordering functor for the priority queue of words: an item ranks lower
when its ratio is nearer to the neutral 0.5 value, so the top of the queue is
always the most extreme (most informative) word. */

struct WordAndRatioStruct
{
  double        probabilityRatio; /* The compensated ratio, not the raw one. */
  const string *wordPntr;         /* Text of the word, stored elsewhere. */

  /* Less-than test for sorting: true if ItemA is strictly closer to 0.5 than
  ItemB is. */

  bool operator() (
    const WordAndRatioStruct &ItemA,
    const WordAndRatioStruct &ItemB) const
  {
    return
      (fabs (ItemB.probabilityRatio - 0.5) >
      fabs (ItemA.probabilityRatio - 0.5));
  }
};
3079 
3080 status_t ABSApp::EvaluatePositionIO (
3081   BPositionIO *PositionIOPntr,
3082   const char *OptionalFileName,
3083   BMessage *ReplyMessagePntr,
3084   char *ErrorMessage)
3085 {
3086   StatisticsMap::iterator            DataEndIter;
3087   StatisticsMap::iterator            DataIter;
3088   status_t                           ErrorCode;
3089   double                             GenuineProbability;
3090   uint32                             GenuineSpamSum;
3091   int                                i;
3092   priority_queue<
3093     WordAndRatioStruct /* Data type stored in the queue */,
3094     vector<WordAndRatioStruct> /* Underlying container */,
3095     WordAndRatioStruct /* Function for comparing elements */>
3096                                      PriorityQueue;
3097   double                             ProductGenuine;
3098   double                             ProductLogGenuine;
3099   double                             ProductLogSpam;
3100   double                             ProductSpam;
3101   double                             RawProbabilityRatio;
3102   float                              ResultRatio;
3103   double                             SpamProbability;
3104   StatisticsPointer                  StatisticsPntr;
3105   double                             TempDouble;
3106   double                             TotalGenuine;
3107   double                             TotalSpam;
3108   WordAndRatioStruct                 WordAndRatio;
3109   set<string>::iterator              WordEndIter;
3110   set<string>::iterator              WordIter;
3111   const WordAndRatioStruct          *WordRatioPntr;
3112   set<string>                        WordSet;
3113 
3114   /* Get the list of unique words in the file / memory buffer. */
3115 
3116   ErrorCode = GetWordsFromPositionIO (PositionIOPntr, OptionalFileName,
3117     WordSet, ErrorMessage);
3118   if (ErrorCode != B_OK)
3119     return ErrorCode;
3120 
3121   /* Prepare a few variables.  Mostly these are stored double values of some of
3122   the numbers involved (to avoid the overhead of multiple conversions from
3123   integer to double), with extra precautions to avoid divide by zero. */
3124 
3125   if (m_TotalGenuineMessages <= 0)
3126     TotalGenuine = 1.0;
3127   else
3128     TotalGenuine = m_TotalGenuineMessages;
3129 
3130   if (m_TotalSpamMessages <= 0)
3131     TotalSpam = 1.0;
3132   else
3133     TotalSpam = m_TotalSpamMessages;
3134 
3135   /* Look up the words in the database and calculate their compensated spam
3136   ratio.  The results are stored in a priority queue so that we can later find
3137   the top g_MaxInterestingWords for doing the actual determination. */
3138 
3139   WordEndIter = WordSet.end ();
3140   DataEndIter = m_WordMap.end ();
3141   for (WordIter = WordSet.begin (); WordIter != WordEndIter; WordIter++)
3142   {
3143     WordAndRatio.wordPntr = &(*WordIter);
3144 
3145     if ((DataIter = m_WordMap.find (*WordIter)) != DataEndIter)
3146     {
3147       StatisticsPntr = &DataIter->second;
3148 
3149       /* Calculate the probability the word is spam and the probability it is
3150       genuine.  Then the raw probability ratio. */
3151 
3152       SpamProbability = StatisticsPntr->spamCount / TotalSpam;
3153       GenuineProbability = StatisticsPntr->genuineCount / TotalGenuine;
3154 
3155       if (SpamProbability + GenuineProbability > 0)
3156         RawProbabilityRatio =
3157         SpamProbability / (SpamProbability + GenuineProbability);
3158       else /* Word with zero statistics, perhaps due to reclassification. */
3159         RawProbabilityRatio = 0.5;
3160 
3161       /* The compensated ratio leans towards 0.5 (g_RobinsonX) more for fewer
3162       data points, with a weight of 0.45 (g_RobinsonS). */
3163 
3164       GenuineSpamSum =
3165         StatisticsPntr->spamCount + StatisticsPntr->genuineCount;
3166 
3167       WordAndRatio.probabilityRatio =
3168         (g_RobinsonS * g_RobinsonX + GenuineSpamSum * RawProbabilityRatio) /
3169         (g_RobinsonS + GenuineSpamSum);
3170     }
3171     else /* Unknown word. With N=0, compensated ratio equation is RobinsonX. */
3172       WordAndRatio.probabilityRatio = g_RobinsonX;
3173 
3174      PriorityQueue.push (WordAndRatio);
3175   }
3176 
3177   /* Compute the combined probability (multiply them together) of the top few
3178   words.  To avoid numeric underflow (doubles can only get as small as 1E-300),
3179   logarithms are also used.  But avoid the logarithms (sum of logs of numbers
3180   is the same as the product of numbers) as much as possible due to reduced
3181   accuracy and slowness. */
3182 
3183   ProductGenuine = 1.0;
3184   ProductLogGenuine = 0.0;
3185   ProductSpam = 1.0;
3186   ProductLogSpam = 0.0;
3187   for (i = 0;
3188   i < g_MaxInterestingWords && !PriorityQueue.empty();
3189   i++, PriorityQueue.pop())
3190   {
3191     WordRatioPntr = &PriorityQueue.top();
3192     ProductSpam *= WordRatioPntr->probabilityRatio;
3193     ProductGenuine *= 1.0 - WordRatioPntr->probabilityRatio;
3194 
3195     /* Check for the numbers getting dangerously small, close to underflowing.
3196     If they are, move the value into the logarithm storage part. */
3197 
3198     if (ProductSpam < m_SmallestUseableDouble)
3199     {
3200       ProductLogSpam += log (ProductSpam);
3201       ProductSpam = 1.0;
3202     }
3203 
3204     if (ProductGenuine < m_SmallestUseableDouble)
3205     {
3206       ProductLogGenuine += log (ProductGenuine);
3207       ProductGenuine = 1.0;
3208     }
3209 
3210     ReplyMessagePntr->AddString ("words", WordRatioPntr->wordPntr->c_str ());
3211     ReplyMessagePntr->AddFloat ("ratios", WordRatioPntr->probabilityRatio);
3212   }
3213 
3214   /* Get the resulting log of the complete products. */
3215 
3216   if (i > 0)
3217   {
3218     ProductLogSpam += log (ProductSpam);
3219     ProductLogGenuine += log (ProductGenuine);
3220   }
3221 
3222   if (m_ScoringMode == SM_ROBINSON)
3223   {
3224     /* Apply Gary Robinson's scoring method where we take the Nth root of the
3225     products.  This is easiest in logarithm form. */
3226 
3227     if (i > 0)
3228     {
3229       ProductSpam = exp (ProductLogSpam / i);
3230       ProductGenuine = exp (ProductLogGenuine / i);
3231       ResultRatio = ProductSpam / (ProductGenuine + ProductSpam);
3232     }
3233     else /* Somehow got no words! */
3234       ResultRatio = g_RobinsonX;
3235   }
3236   else if (m_ScoringMode == SM_CHISQUARED)
3237   {
3238     /* From the SpamBayes notes: "We compute two chi-squared statistics, one
3239     for ham and one for spam.  The sum-of-the-logs business is more sensitive
3240     to probs near 0 than to probs near 1, so the spam measure uses 1-p (so that
3241     high-spamprob words have greatest effect), and the ham measure uses p
3242     directly (so that lo-spamprob words have greatest effect)."  That means we
3243     just reversed the meaning of the previously calculated spam and genuine
3244     products!  Oh well. */
3245 
3246     TempDouble = ProductLogSpam;
3247     ProductLogSpam = ProductLogGenuine;
3248     ProductLogGenuine = TempDouble;
3249 
3250     if (i > 0)
3251     {
3252       ProductSpam =
3253         1.0 - ChiSquaredProbability (-2.0 * ProductLogSpam, 2 * i);
3254       ProductGenuine =
3255         1.0 - ChiSquaredProbability (-2.0 * ProductLogGenuine, 2 * i);
3256 
3257       /* The SpamBayes notes say: "How to combine these into a single spam
3258       score?  We originally used (S-H)/(S+H) scaled into [0., 1.], which equals
3259       S/(S+H).  A systematic problem is that we could end up being near-certain
3260       a thing was (for example) spam, even if S was small, provided that H was
3261       much smaller.  Rob Hooft stared at these problems and invented the
3262       measure we use now, the simpler S-H, scaled into [0., 1.]." */
3263 
3264       ResultRatio = (ProductSpam - ProductGenuine + 1.0) / 2.0;
3265     }
3266     else /* No words to analyse. */
3267       ResultRatio = 0.5;
3268   }
3269   else /* Unknown scoring mode. */
3270   {
3271     strcpy (ErrorMessage, "Unknown scoring mode specified in settings");
3272     return B_BAD_VALUE;
3273   }
3274 
3275   ReplyMessagePntr->AddFloat (g_ResultName, ResultRatio);
3276   return B_OK;
3277 }
3278 
3279 
3280 /* Just evaluate the given string as being spam text. */
3281 
3282 status_t ABSApp::EvaluateString (
3283   const char *BufferPntr,
3284   ssize_t BufferSize,
3285   BMessage *ReplyMessagePntr,
3286   char *ErrorMessage)
3287 {
3288   BMemoryIO MemoryIO (BufferPntr, BufferSize);
3289 
3290   return EvaluatePositionIO (&MemoryIO, "Memory Buffer",
3291     ReplyMessagePntr, ErrorMessage);
3292 }
3293 
3294 
3295 /* Tell other programs about the scripting commands we support.  Try this
3296 command: "hey application/x-vnd.agmsmith.spamdbm getsuites" to
3297 see it in action (this program has to be already running for it to work). */
3298 
3299 status_t ABSApp::GetSupportedSuites (BMessage *MessagePntr)
3300 {
3301   BPropertyInfo TempPropInfo (g_ScriptingPropertyList);
3302 
3303   MessagePntr->AddString ("suites", "suite/x-vnd.agmsmith.spamdbm");
3304   MessagePntr->AddFlat ("messages", &TempPropInfo);
3305   return BApplication::GetSupportedSuites (MessagePntr);
3306 }
3307 
3308 
3309 /* Add all the words in the given file or memory buffer to the supplied set.
3310 The file name is only there for error messages, it assumes you have already
3311 opened the PositionIO to the right file.  If things go wrong, a non-zero error
3312 code will be returned and an explanation written to ErrorMessage (assumed to be
3313 at least PATH_MAX + 1024 bytes long). */
3314 
3315 status_t ABSApp::GetWordsFromPositionIO (
3316   BPositionIO *PositionIOPntr,
3317   const char *OptionalFileName,
3318   set<string> &WordSet,
3319   char *ErrorMessage)
3320 {
3321   status_t ErrorCode;
3322 
3323   if (m_TokenizeMode == TM_WHOLE)
3324     ErrorCode = TokenizeWhole (PositionIOPntr, OptionalFileName,
3325       WordSet, ErrorMessage);
3326   else
3327     ErrorCode = TokenizeParts (PositionIOPntr, OptionalFileName,
3328       WordSet, ErrorMessage);
3329 
3330   if (ErrorCode == B_OK && WordSet.empty ())
3331   {
3332     /* ENOMSG usually means no message found in queue, but I'm using it to show
3333     no words, a good indicator of spam which is pure HTML. */
3334 
3335     sprintf (ErrorMessage, "No words were found in \"%s\"", OptionalFileName);
3336     ErrorCode = ENOMSG;
3337   }
3338 
3339   return ErrorCode;
3340 }
3341 
3342 
3343 /* Set up indices for attributes MAIL:classification (string) and
3344 MAIL:ratio_spam (float) on all mounted disk volumes that support queries.  Also
3345 tell the system to make those attributes visible to the user (so they can see
3346 them in Tracker) and associate them with e-mail messages.  Also set up the
3347 database file MIME type (provide a description and associate it with this
3348 program so that it picks up the right icon).  And register the names for our
3349 sound effects. */
3350 
3351 status_t ABSApp::InstallThings (char *ErrorMessage)
3352 {
3353   int32       Cookie;
3354   dev_t       DeviceID;
3355   status_t    ErrorCode = B_OK;
3356   fs_info     FSInfo;
3357   int32       i;
3358   int32       iClassification;
3359   int32       iProbability;
3360   int32       j;
3361   index_info  IndexInfo;
3362   BMimeType   MimeType;
3363   BMessage    Parameters;
3364   const char *StringPntr;
3365   bool        TempBool;
3366   int32       TempInt32;
3367 
3368   /* Iterate through all mounted devices and try to make the indices on each
3369   one.  Don't bother if the index exists or the device doesn't support indices
3370   (actually queries). */
3371 
3372   Cookie = 0;
3373   while ((DeviceID = next_dev (&Cookie)) >= 0)
3374   {
3375     if (!fs_stat_dev (DeviceID, &FSInfo) && (FSInfo.flags & B_FS_HAS_QUERY))
3376     {
3377       if (fs_stat_index (DeviceID, g_AttributeNameClassification, &IndexInfo)
3378       && errno == B_ENTRY_NOT_FOUND)
3379       {
3380         if (fs_create_index (DeviceID, g_AttributeNameClassification,
3381         B_STRING_TYPE, 0 /* flags */))
3382         {
3383           ErrorCode = errno;
3384           sprintf (ErrorMessage, "Unable to make string index %s on "
3385             "volume #%d, volume name \"%s\", file system type \"%s\", "
3386             "on device \"%s\"", g_AttributeNameClassification,
3387             (int) DeviceID, FSInfo.volume_name, FSInfo.fsh_name,
3388             FSInfo.device_name);
3389         }
3390       }
3391 
3392       if (fs_stat_index (DeviceID, g_AttributeNameSpamRatio,
3393       &IndexInfo) && errno == B_ENTRY_NOT_FOUND)
3394       {
3395         if (fs_create_index (DeviceID, g_AttributeNameSpamRatio,
3396         B_FLOAT_TYPE, 0 /* flags */))
3397         {
3398           ErrorCode = errno;
3399           sprintf (ErrorMessage, "Unable to make float index %s on "
3400             "volume #%d, volume name \"%s\", file system type \"%s\", "
3401             "on device \"%s\"", g_AttributeNameSpamRatio,
3402             (int) DeviceID, FSInfo.volume_name, FSInfo.fsh_name,
3403             FSInfo.device_name);
3404         }
3405       }
3406     }
3407   }
3408   if (ErrorCode != B_OK)
3409     return ErrorCode;
3410 
3411   /* Set up the MIME types for the classification attributes, associate them
3412   with e-mail and make them visible to the user (but not editable).  First need
3413   to get the existing MIME settings, then add ours to them (otherwise the
3414   existing ones get wiped out). */
3415 
3416   ErrorCode = MimeType.SetTo ("text/x-email");
3417   if (ErrorCode != B_OK || !MimeType.IsInstalled ())
3418   {
3419     sprintf (ErrorMessage, "No e-mail MIME type (%s) in the system, can't "
3420       "update it to add our special attributes, and without e-mail this "
3421       "program is useless!", MimeType.Type ());
3422     if (ErrorCode == B_OK)
3423       ErrorCode = -1;
3424     return ErrorCode;
3425   }
3426 
3427   ErrorCode = MimeType.GetAttrInfo (&Parameters);
3428   if (ErrorCode != B_OK)
3429   {
3430     sprintf (ErrorMessage, "Unable to retrieve list of attributes "
3431       "associated with e-mail messages in the MIME database");
3432     return ErrorCode;
3433   }
3434 
3435   for (i = 0, iClassification = -1, iProbability = -1;
3436   i < 1000 && (iClassification < 0 || iProbability < 0);
3437   i++)
3438   {
3439     ErrorCode = Parameters.FindString ("attr:name", i, &StringPntr);
3440     if (ErrorCode != B_OK)
3441       break; /* Reached the end of the attributes. */
3442     if (strcmp (StringPntr, g_AttributeNameClassification) == 0)
3443       iClassification = i;
3444     else if (strcmp (StringPntr, g_AttributeNameSpamRatio) == 0)
3445       iProbability = i;
3446   }
3447 
3448   /* Add extra default settings for those programs which previously didn't
3449   update the MIME database with all the attributes that exist (so our new
3450   additions don't show up at the wrong index). */
3451 
3452   i--; /* Set i to index of last valid attribute. */
3453 
3454   for (j = 0; j <= i; j++)
3455   {
3456     if (Parameters.FindString ("attr:public_name", j, &StringPntr) ==
3457     B_BAD_INDEX)
3458     {
3459       if (Parameters.FindString ("attr:name", j, &StringPntr) != B_OK)
3460         StringPntr = "None!";
3461       Parameters.AddString ("attr:public_name", StringPntr);
3462     }
3463   }
3464 
3465   while (Parameters.FindInt32 ("attr:type", i, &TempInt32) == B_BAD_INDEX)
3466     Parameters.AddInt32 ("attr:type", B_STRING_TYPE);
3467 
3468   while (Parameters.FindBool ("attr:viewable", i, &TempBool) == B_BAD_INDEX)
3469     Parameters.AddBool ("attr:viewable", true);
3470 
3471   while (Parameters.FindBool ("attr:editable", i, &TempBool) == B_BAD_INDEX)
3472     Parameters.AddBool ("attr:editable", false);
3473 
3474   while (Parameters.FindInt32 ("attr:width", i, &TempInt32) == B_BAD_INDEX)
3475     Parameters.AddInt32 ("attr:width", 60);
3476 
3477   while (Parameters.FindInt32 ("attr:alignment", i, &TempInt32) == B_BAD_INDEX)
3478     Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3479 
3480   while (Parameters.FindBool ("attr:extra", i, &TempBool) == B_BAD_INDEX)
3481     Parameters.AddBool ("attr:extra", false);
3482 
3483   /* Add our new attributes to e-mail related things, if not already there. */
3484 
3485   if (iClassification < 0)
3486   {
3487     Parameters.AddString ("attr:name", g_AttributeNameClassification);
3488     Parameters.AddString ("attr:public_name", "Classification Group");
3489     Parameters.AddInt32 ("attr:type", B_STRING_TYPE);
3490     Parameters.AddBool ("attr:viewable", true);
3491     Parameters.AddBool ("attr:editable", false);
3492     Parameters.AddInt32 ("attr:width", 45);
3493     Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3494     Parameters.AddBool ("attr:extra", false);
3495   }
3496 
3497   if (iProbability < 0)
3498   {
3499     Parameters.AddString ("attr:name", g_AttributeNameSpamRatio);
3500     Parameters.AddString ("attr:public_name", "Spam/Genuine Estimate");
3501     Parameters.AddInt32 ("attr:type", B_FLOAT_TYPE);
3502     Parameters.AddBool ("attr:viewable", true);
3503     Parameters.AddBool ("attr:editable", false);
3504     Parameters.AddInt32 ("attr:width", 50);
3505     Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3506     Parameters.AddBool ("attr:extra", false);
3507   }
3508 
3509   if (iClassification < 0 || iProbability < 0)
3510   {
3511     ErrorCode = MimeType.SetAttrInfo (&Parameters);
3512     if (ErrorCode != B_OK)
3513     {
3514       sprintf (ErrorMessage, "Unable to associate the classification "
3515         "attributes with e-mail messages in the MIME database");
3516       return ErrorCode;
3517     }
3518   }
3519 
3520   /* Set up the MIME type for the database file. */
3521 
3522   sprintf (ErrorMessage, "Problems with setting up MIME type (%s) for "
3523     "the database files", g_ABSDatabaseFileMIMEType); /* A generic message. */
3524 
3525   ErrorCode = MimeType.SetTo (g_ABSDatabaseFileMIMEType);
3526   if (ErrorCode != B_OK)
3527     return ErrorCode;
3528 
3529   MimeType.Delete ();
3530   ErrorCode = MimeType.Install ();
3531   if (ErrorCode != B_OK)
3532   {
3533     sprintf (ErrorMessage, "Failed to install MIME type (%s) in the system",
3534       MimeType.Type ());
3535     return ErrorCode;
3536   }
3537 
3538   MimeType.SetShortDescription ("Spam Database");
3539   MimeType.SetLongDescription ("Bayesian Statistical Database for "
3540     "Classifying Junk E-Mail");
3541   sprintf (ErrorMessage, "1.0 ('%s')", g_DatabaseRecognitionString);
3542   MimeType.SetSnifferRule (ErrorMessage);
3543   MimeType.SetPreferredApp (g_ABSAppSignature);
3544 
3545   /* Set up the names of the sound effects.  Later on the user can associate
3546   sound files with the names by using the Sounds preferences panel or the
3547   installsound command.  The MDR add-on filter will trigger these sounds. */
3548 
3549   add_system_beep_event (g_BeepGenuine);
3550   add_system_beep_event (g_BeepSpam);
3551   add_system_beep_event (g_BeepUncertain);
3552 
3553   return B_OK;
3554 }
3555 
3556 
3557 /* Load the database if it hasn't been loaded yet.  Otherwise do nothing. */
3558 
3559 status_t ABSApp::LoadDatabaseIfNeeded (char *ErrorMessage)
3560 {
3561   if (m_WordMap.empty ())
3562     return LoadSaveDatabase (true /* DoLoad */, ErrorMessage);
3563 
3564   return B_OK;
3565 }
3566 
3567 
3568 /* Either load the database of spam words (DoLoad is TRUE) from the file
3569 specified in the settings, or write (DoLoad is FALSE) the database to it.  If
3570 it doesn't exist (and its parent directories do exist) then it will be created
3571 when saving.  If it doesn't exist when loading, the in-memory database will be
3572 set to an empty one and an error will be returned with an explanation put into
3573 ErrorMessage (should be big enough for a path name and a couple of lines of
3574 text).
3575 
3576 The database file format is a UTF-8 text file (well, there could be some
3577 latin-1 characters and other junk in there - it just copies the bytes from the
3578 e-mail messages directly), with tab characters to separate fields (so that you
3579 can also load it into a spreadsheet).  The first line identifies the overall
3580 file type.  The second lists pairs of classifications plus the number of
3581 messages in each class.  Currently it is just Genuine and Spam, but for future
3582 compatibility, that could be followed by more classification pairs.  The
3583 remaining lines each contain a word, the date it was last updated (actually
3584 it's the number of messages in the database when the word was added, smaller
3585 numbers mean it was updated longer ago), the genuine count and the spam count.
3586 */
3587 
3588 status_t ABSApp::LoadSaveDatabase (bool DoLoad, char *ErrorMessage)
3589 {
3590   time_t                             CurrentTime;
3591   FILE                              *DatabaseFile = NULL;
3592   BNode                              DatabaseNode;
3593   BNodeInfo                          DatabaseNodeInfo;
3594   StatisticsMap::iterator            DataIter;
3595   StatisticsMap::iterator            EndIter;
3596   status_t                           ErrorCode;
3597   int                                i;
3598   pair<StatisticsMap::iterator,bool> InsertResult;
3599   char                               LineString [10240];
3600   StatisticsRecord                   Statistics;
3601   const char                        *StringPntr;
3602   char                              *TabPntr;
3603   const char                        *WordPntr;
3604 
3605   if (DoLoad)
3606   {
3607     MakeDatabaseEmpty ();
3608     m_DatabaseHasChanged = false; /* In case of early error exit. */
3609   }
3610   else /* Saving the database, backup the old version on disk. */
3611   {
3612     ErrorCode = MakeBackup (ErrorMessage);
3613     if (ErrorCode != B_OK) /* Usually because the directory isn't there. */
3614       return ErrorCode;
3615   }
3616 
3617   DatabaseFile = fopen (m_DatabaseFileName.String (), DoLoad ? "rb" : "wb");
3618   if (DatabaseFile == NULL)
3619   {
3620     ErrorCode = errno;
3621     sprintf (ErrorMessage, "Can't open database file \"%s\" for %s",
3622       m_DatabaseFileName.String (), DoLoad ? "reading" : "writing");
3623     goto ErrorExit;
3624   }
3625 
3626   /* Process the first line, which identifies the file. */
3627 
3628   if (DoLoad)
3629   {
3630     sprintf (ErrorMessage, "Can't read first line of database file \"%s\", "
3631       "expected it to start with \"%s\"",
3632       m_DatabaseFileName.String (), g_DatabaseRecognitionString);
3633     ErrorCode = -1;
3634 
3635     if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3636       goto ErrorExit;
3637     if (strncmp (LineString, g_DatabaseRecognitionString,
3638     strlen (g_DatabaseRecognitionString)) != 0)
3639       goto ErrorExit;
3640   }
3641   else /* Saving */
3642   {
3643     CurrentTime = time (NULL);
3644     if (fprintf (DatabaseFile, "%s V1 (word, age, genuine count, spam count)\t"
3645     "Written by SpamDBM $Revision: 30630 $\t"
3646     "Compiled on " __DATE__ " at " __TIME__ "\tThis file saved on %s",
3647     g_DatabaseRecognitionString, ctime (&CurrentTime)) <= 0)
3648     {
3649       ErrorCode = errno;
3650       sprintf (ErrorMessage, "Problems when writing to database file \"%s\"",
3651         m_DatabaseFileName.String ());
3652       goto ErrorExit;
3653     }
3654   }
3655 
3656   /* The second line lists the different classifications.  We just check to see
3657   that the first two are Genuine and Spam.  If there are others, they'll be
3658   ignored and lost when the database is saved. */
3659 
3660   if (DoLoad)
3661   {
3662     sprintf (ErrorMessage, "Can't read second line of database file \"%s\", "
3663       "expected it to list classifications %s and %s along with their totals",
3664       m_DatabaseFileName.String (), g_ClassifiedGenuine, g_ClassifiedSpam);
3665     ErrorCode = B_BAD_VALUE;
3666 
3667     if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3668       goto ErrorExit;
3669     i = strlen (LineString);
3670     if (i > 0 && LineString[i-1] == '\n')
3671       LineString[i-1] = 0; /* Remove trailing line feed character. */
3672 
3673     /* Look for the title word at the start of the line. */
3674 
3675     TabPntr = LineString;
3676     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3677       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3678 
3679     if (strncmp (StringPntr, "Classifications", 15) != 0)
3680       goto ErrorExit;
3681 
3682     /* Look for the Genuine class and count. */
3683 
3684     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3685       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3686 
3687     if (strcmp (StringPntr, g_ClassifiedGenuine) != 0)
3688       goto ErrorExit;
3689 
3690     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3691       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3692 
3693     m_TotalGenuineMessages = atoll (StringPntr);
3694 
3695     /* Look for the Spam class and count. */
3696 
3697     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3698       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3699 
3700     if (strcmp (StringPntr, g_ClassifiedSpam) != 0)
3701       goto ErrorExit;
3702 
3703     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3704       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3705 
3706     m_TotalSpamMessages = atoll (StringPntr);
3707   }
3708   else /* Saving */
3709   {
3710     fprintf (DatabaseFile,
3711       "Classifications and total messages:\t%s\t%lu\t%s\t%lu\n",
3712       g_ClassifiedGenuine, m_TotalGenuineMessages,
3713       g_ClassifiedSpam, m_TotalSpamMessages);
3714   }
3715 
3716   /* The remainder of the file is the list of words and statistics.  Each line
3717   has a word, a tab, the time when the word was last changed in the database
3718   (sequence number of message addition, starts at 0 and goes up by one for each
3719   message added to the database), a tab then the number of messages in the
3720   first class (genuine) that had that word, then a tab, then the number of
3721   messages in the second class (spam) with that word, and so on. */
3722 
3723   if (DoLoad)
3724   {
3725     while (!feof (DatabaseFile))
3726     {
3727       if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3728       {
3729         ErrorCode = errno;
3730         if (feof (DatabaseFile))
3731           break;
3732         if (ErrorCode == B_OK)
3733           ErrorCode = -1;
3734         sprintf (ErrorMessage, "Error while reading words and statistics "
3735           "from database file \"%s\"", m_DatabaseFileName.String ());
3736         goto ErrorExit;
3737       }
3738 
3739       i = strlen (LineString);
3740       if (i > 0 && LineString[i-1] == '\n')
3741         LineString[i-1] = 0; /* Remove trailing line feed character. */
3742 
3743       /* Get the word at the start of the line, save in WordPntr. */
3744 
3745       TabPntr = LineString;
3746       for (WordPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3747         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3748 
3749       /* Get the date stamp.  Actually a sequence number, not a date. */
3750 
3751       for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3752         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3753 
3754       Statistics.age = atoll (StringPntr);
3755 
3756       /* Get the Genuine count. */
3757 
3758       for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3759         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3760 
3761       Statistics.genuineCount = atoll (StringPntr);
3762 
3763       /* Get the Spam count. */
3764 
3765       for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3766         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3767 
3768       Statistics.spamCount = atoll (StringPntr);
3769 
3770       /* Ignore empty words, totally unused words and ones which are too long
3771       (avoids lots of length checking everywhere). */
3772 
3773       if (WordPntr[0] == 0 || strlen (WordPntr) > g_MaxWordLength ||
3774       (Statistics.genuineCount <= 0 && Statistics.spamCount <= 0))
3775         continue; /* Ignore this line of text, start on next one. */
3776 
3777       /* Add the combination to the database. */
3778 
3779       InsertResult = m_WordMap.insert (
3780         StatisticsMap::value_type (WordPntr, Statistics));
3781       if (InsertResult.second == false)
3782       {
3783         ErrorCode = B_BAD_VALUE;
3784         sprintf (ErrorMessage, "Error while inserting word \"%s\" from "
3785           "database \"%s\", perhaps it is a duplicate",
3786           WordPntr, m_DatabaseFileName.String ());
3787         goto ErrorExit;
3788       }
3789       m_WordCount++;
3790 
3791       /* And the hunt for the oldest word. */
3792 
3793       if (Statistics.age < m_OldestAge)
3794         m_OldestAge = Statistics.age;
3795     }
3796   }
3797   else /* Saving, dump all words and statistics to the file. */
3798   {
3799     EndIter = m_WordMap.end ();
3800     for (DataIter = m_WordMap.begin (); DataIter != EndIter; DataIter++)
3801     {
3802       if (fprintf (DatabaseFile, "%s\t%lu\t%lu\t%lu\n",
3803       DataIter->first.c_str (), DataIter->second.age,
3804       DataIter->second.genuineCount, DataIter->second.spamCount) <= 0)
3805       {
3806         ErrorCode = errno;
3807         sprintf (ErrorMessage, "Error while writing word \"%s\" to "
3808           "database \"%s\"",
3809           DataIter->first.c_str(), m_DatabaseFileName.String ());
3810         goto ErrorExit;
3811       }
3812     }
3813   }
3814 
3815   /* Set the file type so that the new file gets associated with this program,
3816   and picks up the right icon. */
3817 
3818   if (!DoLoad)
3819   {
3820     sprintf (ErrorMessage, "Unable to set attributes (file type) of database "
3821       "file \"%s\"", m_DatabaseFileName.String ());
3822     ErrorCode = DatabaseNode.SetTo (m_DatabaseFileName.String ());
3823     if (ErrorCode != B_OK)
3824       goto ErrorExit;
3825     DatabaseNodeInfo.SetTo (&DatabaseNode);
3826     ErrorCode = DatabaseNodeInfo.SetType (g_ABSDatabaseFileMIMEType);
3827     if (ErrorCode != B_OK)
3828       goto ErrorExit;
3829   }
3830 
3831   /* Success! */
3832   m_DatabaseHasChanged = false;
3833   ErrorCode = B_OK;
3834 
3835 ErrorExit:
3836   if (DatabaseFile != NULL)
3837     fclose (DatabaseFile);
3838   return ErrorCode;
3839 }
3840 
3841 
3842 /* Either load the settings (DoLoad is TRUE) from the configuration file or
3843 write them (DoLoad is FALSE) to it.  The configuration file is a flattened
3844 BMessage containing the various program settings.  If it doesn't exist (and its
3845 parent directories don't exist) then it will be created when saving.  If it
3846 doesn't exist when loading, the settings will be set to default values. */
3847 
3848 status_t ABSApp::LoadSaveSettings (bool DoLoad)
3849 {
3850   status_t    ErrorCode;
3851   const char *NamePntr;
3852   BMessage    Settings;
3853   BDirectory  SettingsDirectory;
3854   BFile       SettingsFile;
3855   const char *StringPntr;
3856   bool        TempBool;
3857   int32       TempInt32;
3858   char        TempString [PATH_MAX + 100];
3859 
3860   /* Preset things to default values if loading, in case of an error or it's an
3861   older version of the settings file which doesn't have every field defined. */
3862 
3863   if (DoLoad)
3864     DefaultSettings ();
3865 
3866   /* Look for our settings directory.  When saving we can try to create it. */
3867 
3868   ErrorCode = SettingsDirectory.SetTo (m_SettingsDirectoryPath.Path ());
3869   if (ErrorCode != B_OK)
3870   {
3871     if (DoLoad || ErrorCode != B_ENTRY_NOT_FOUND)
3872     {
3873       sprintf (TempString, "Can't find settings directory \"%s\"",
3874         m_SettingsDirectoryPath.Path ());
3875       goto ErrorExit;
3876     }
3877     ErrorCode = create_directory (m_SettingsDirectoryPath.Path (), 0755);
3878     if (ErrorCode == B_OK)
3879       ErrorCode = SettingsDirectory.SetTo (m_SettingsDirectoryPath.Path ());
3880     if (ErrorCode != B_OK)
3881     {
3882       sprintf (TempString, "Can't create settings directory \"%s\"",
3883         m_SettingsDirectoryPath.Path ());
3884       goto ErrorExit;
3885     }
3886   }
3887 
3888   ErrorCode = SettingsFile.SetTo (&SettingsDirectory, g_SettingsFileName,
3889     DoLoad ? B_READ_ONLY : B_READ_WRITE | B_CREATE_FILE | B_ERASE_FILE);
3890   if (ErrorCode != B_OK)
3891   {
3892     sprintf (TempString, "Can't open settings file \"%s\" in directory \"%s\" "
3893       "for %s", g_SettingsFileName, m_SettingsDirectoryPath.Path(),
3894       DoLoad ? "reading" : "writing");
3895     goto ErrorExit;
3896   }
3897 
3898   if (DoLoad)
3899   {
3900     ErrorCode = Settings.Unflatten (&SettingsFile);
3901     if (ErrorCode != 0 || Settings.what != g_SettingsWhatCode)
3902     {
3903       sprintf (TempString, "Corrupt data detected while reading settings "
3904         "file \"%s\" in directory \"%s\", will revert to defaults",
3905         g_SettingsFileName, m_SettingsDirectoryPath.Path());
3906       goto ErrorExit;
3907     }
3908   }
3909 
3910   /* Transfer the settings between the BMessage and our various global
3911   variables.  For loading, if the setting isn't present, leave it at the
3912   default value.  Note that loading and saving are intermingled here to make
3913   code maintenance easier (less chance of forgetting to update it if load and
3914   save were separate functions). */
3915 
3916   ErrorCode = B_OK; /* So that saving settings can record an error. */
3917 
3918   NamePntr = "DatabaseFileName";
3919   if (DoLoad)
3920   {
3921     if (Settings.FindString (NamePntr, &StringPntr) == B_OK)
3922       m_DatabaseFileName.SetTo (StringPntr);
3923   }
3924   else if (ErrorCode == B_OK)
3925     ErrorCode = Settings.AddString (NamePntr, m_DatabaseFileName);
3926 
3927   NamePntr = "ServerMode";
3928   if (DoLoad)
3929   {
3930     if (Settings.FindBool (NamePntr, &TempBool) == B_OK)
3931       g_ServerMode = TempBool;
3932   }
3933   else if (ErrorCode == B_OK)
3934     ErrorCode = Settings.AddBool (NamePntr, g_ServerMode);
3935 
3936   NamePntr = "IgnorePreviousClassification";
3937   if (DoLoad)
3938   {
3939     if (Settings.FindBool (NamePntr, &TempBool) == B_OK)
3940       m_IgnorePreviousClassification = TempBool;
3941   }
3942   else if (ErrorCode == B_OK)
3943     ErrorCode = Settings.AddBool (NamePntr, m_IgnorePreviousClassification);
3944 
3945   NamePntr = "PurgeAge";
3946   if (DoLoad)
3947   {
3948     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3949       m_PurgeAge = TempInt32;
3950   }
3951   else if (ErrorCode == B_OK)
3952     ErrorCode = Settings.AddInt32 (NamePntr, m_PurgeAge);
3953 
3954   NamePntr = "PurgePopularity";
3955   if (DoLoad)
3956   {
3957     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3958       m_PurgePopularity = TempInt32;
3959   }
3960   else if (ErrorCode == B_OK)
3961     ErrorCode = Settings.AddInt32 (NamePntr, m_PurgePopularity);
3962 
3963   NamePntr = "ScoringMode";
3964   if (DoLoad)
3965   {
3966     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3967       m_ScoringMode = (ScoringModes) TempInt32;
3968     if (m_ScoringMode < 0 || m_ScoringMode >= SM_MAX)
3969       m_ScoringMode = (ScoringModes) 0;
3970   }
3971   else if (ErrorCode == B_OK)
3972     ErrorCode = Settings.AddInt32 (NamePntr, m_ScoringMode);
3973 
3974   NamePntr = "TokenizeMode";
3975   if (DoLoad)
3976   {
3977     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3978       m_TokenizeMode = (TokenizeModes) TempInt32;
3979     if (m_TokenizeMode < 0 || m_TokenizeMode >= TM_MAX)
3980       m_TokenizeMode = (TokenizeModes) 0;
3981   }
3982   else if (ErrorCode == B_OK)
3983     ErrorCode = Settings.AddInt32 (NamePntr, m_TokenizeMode);
3984 
3985   if (ErrorCode != B_OK)
3986   {
3987     strcpy (TempString, "Unable to stuff the program settings into a "
3988       "temporary BMessage, settings not saved");
3989     goto ErrorExit;
3990   }
3991 
3992   /* Save the settings BMessage to the settings file. */
3993 
3994   if (!DoLoad)
3995   {
3996     Settings.what = g_SettingsWhatCode;
3997     ErrorCode = Settings.Flatten (&SettingsFile);
3998     if (ErrorCode != 0)
3999     {
4000       sprintf (TempString, "Problems while writing settings file \"%s\" in "
4001         "directory \"%s\"", g_SettingsFileName,
4002         m_SettingsDirectoryPath.Path ());
4003       goto ErrorExit;
4004     }
4005   }
4006 
4007   m_SettingsHaveChanged = false;
4008   return B_OK;
4009 
4010 ErrorExit: /* Error message in TempString, code in ErrorCode. */
4011   DisplayErrorMessage (TempString, ErrorCode, DoLoad ?
4012     "Loading Settings Error" : "Saving Settings Error");
4013   return ErrorCode;
4014 }
4015 
4016 
4017 void
4018 ABSApp::MessageReceived (BMessage *MessagePntr)
4019 {
4020   const char           *PropertyName;
4021   struct property_info *PropInfoPntr;
4022   int32                 SpecifierIndex;
4023   int32                 SpecifierKind;
4024   BMessage              SpecifierMessage;
4025 
4026   /* See if it is a scripting message that applies to the database or one of
4027   the other operations this program supports.  Pass on other scripting messages
4028   to the inherited parent MessageReceived function (they're usually scripting
4029   messages for the BApplication). */
4030 
4031   switch (MessagePntr->what)
4032   {
4033     case B_GET_PROPERTY:
4034     case B_SET_PROPERTY:
4035     case B_COUNT_PROPERTIES:
4036     case B_CREATE_PROPERTY:
4037     case B_DELETE_PROPERTY:
4038     case B_EXECUTE_PROPERTY:
4039       if (MessagePntr->GetCurrentSpecifier (&SpecifierIndex, &SpecifierMessage,
4040       &SpecifierKind, &PropertyName) == B_OK &&
4041       SpecifierKind == B_DIRECT_SPECIFIER)
4042       {
4043         for (PropInfoPntr = g_ScriptingPropertyList + 0; true; PropInfoPntr++)
4044         {
4045           if (PropInfoPntr->name == 0)
4046             break; /* Ran out of commands. */
4047 
4048           if (PropInfoPntr->commands[0] == MessagePntr->what &&
4049           strcasecmp (PropInfoPntr->name, PropertyName) == 0)
4050           {
4051             ProcessScriptingMessage (MessagePntr, PropInfoPntr);
4052             return;
4053           }
4054         }
4055       }
4056       break;
4057   }
4058 
4059   /* Pass the unprocessed message to the inherited function, maybe it knows
4060   what to do.  This includes replies to messages we sent ourselves. */
4061 
4062   BApplication::MessageReceived (MessagePntr);
4063 }
4064 
4065 
4066 /* Rename the existing database file to a backup file name, potentially
4067 replacing an older backup.  If something goes wrong, returns an error code and
4068 puts an explanation in ErrorMessage. */
4069 
4070 status_t ABSApp::MakeBackup (char *ErrorMessage)
4071 {
4072   BEntry   Entry;
4073   status_t ErrorCode;
4074   int      i;
4075   char     LeafName [NAME_MAX];
4076   char     NewName [PATH_MAX+20];
4077   char     OldName [PATH_MAX+20];
4078 
4079   ErrorCode = Entry.SetTo (m_DatabaseFileName.String ());
4080   if (ErrorCode != B_OK)
4081   {
4082     sprintf (ErrorMessage, "While making backup, failed to make a BEntry for "
4083       "\"%s\" (maybe the directory doesn't exist?)",
4084       m_DatabaseFileName.String ());
4085     return ErrorCode;
4086   }
4087   if (!Entry.Exists ())
4088     return B_OK; /* No existing file to worry about overwriting. */
4089   Entry.GetName (LeafName);
4090 
4091   /* Find the first hole (no file) where we will stop the renaming chain. */
4092 
4093   for (i = 0; i < g_MaxBackups - 1; i++)
4094   {
4095     strcpy (OldName, m_DatabaseFileName.String ());
4096     sprintf (OldName + strlen (OldName), g_BackupSuffix, i);
4097     Entry.SetTo (OldName);
4098     if (!Entry.Exists ())
4099       break;
4100   }
4101 
4102   /* Move the files down by one to fill in the hole in the name series. */
4103 
4104   for (i--; i >= 0; i--)
4105   {
4106     strcpy (OldName, m_DatabaseFileName.String ());
4107     sprintf (OldName + strlen (OldName), g_BackupSuffix, i);
4108     Entry.SetTo (OldName);
4109     strcpy (NewName, LeafName);
4110     sprintf (NewName + strlen (NewName), g_BackupSuffix, i + 1);
4111     ErrorCode = Entry.Rename (NewName, true /* clobber */);
4112   }
4113 
4114   Entry.SetTo (m_DatabaseFileName.String ());
4115   strcpy (NewName, LeafName);
4116   sprintf (NewName + strlen (NewName), g_BackupSuffix, 0);
4117   ErrorCode = Entry.Rename (NewName, true /* clobber */);
4118   if (ErrorCode != B_OK)
4119     sprintf (ErrorMessage, "While making backup, failed to rename "
4120       "\"%s\" to \"%s\"", m_DatabaseFileName.String (), NewName);
4121 
4122   return ErrorCode;
4123 }
4124 
4125 
4126 void
4127 ABSApp::MakeDatabaseEmpty ()
4128 {
4129   m_WordMap.clear (); /* Sets the map to empty, deallocating any old data. */
4130   m_WordCount = 0;
4131   m_TotalGenuineMessages = 0;
4132   m_TotalSpamMessages = 0;
4133   m_OldestAge = (uint32) -1 /* makes largest number possible */;
4134 }
4135 
4136 
/* Do what the scripting command says.  A reply message will be sent back with
several fields: "error" containing the numerical error code (0 for success),
"CommandText" with a text representation of the command, "result" with the
resulting data for a get or count command.  If it isn't understood, then rather
than a B_REPLY kind of message, it will be a B_MESSAGE_NOT_UNDERSTOOD message
with an "error" number and a "message" string with a description. */
4143 
4144 void
4145 ABSApp::ProcessScriptingMessage (
4146   BMessage *MessagePntr,
4147   struct property_info *PropInfoPntr)
4148 {
4149   bool        ArgumentBool = false;
4150   bool        ArgumentGotBool = false;
4151   bool        ArgumentGotInt32 = false;
4152   bool        ArgumentGotString = false;
4153   int32       ArgumentInt32 = 0;
4154   const char *ArgumentString = NULL;
4155   BString     CommandText;
4156   status_t    ErrorCode;
4157   int         i;
4158   BMessage    ReplyMessage (B_MESSAGE_NOT_UNDERSTOOD);
4159   ssize_t     StringBufferSize;
4160   BMessage    TempBMessage;
4161   BPath       TempPath;
4162   char        TempString [PATH_MAX + 1024];
4163 
4164   if (g_QuitCountdown >= 0 && !g_CommandLineMode)
4165   {
4166     g_QuitCountdown = -1;
4167     cerr << "Quit countdown aborted due to a scripting command arriving.\n";
4168   }
4169 
4170   if (g_BusyCursor != NULL)
4171     SetCursor (g_BusyCursor);
4172 
4173   ErrorCode = MessagePntr->FindData (g_DataName, B_STRING_TYPE,
4174     (const void **) &ArgumentString, &StringBufferSize);
4175   if (ErrorCode == B_OK)
4176   {
4177     if (PropInfoPntr->extra_data != PN_EVALUATE_STRING &&
4178     PropInfoPntr->extra_data != PN_SPAM_STRING &&
4179     PropInfoPntr->extra_data != PN_GENUINE_STRING &&
4180     strlen (ArgumentString) >= PATH_MAX)
4181     {
4182       sprintf (TempString, "\"data\" string of a scripting message is too "
4183         "long, for SET %s action", PropInfoPntr->name);
4184       ErrorCode = B_NAME_TOO_LONG;
4185       goto ErrorExit;
4186     }
4187     ArgumentGotString = true;
4188   }
4189   else if (MessagePntr->FindBool (g_DataName, &ArgumentBool) == B_OK)
4190     ArgumentGotBool = true;
4191   else if (MessagePntr->FindInt32 (g_DataName, &ArgumentInt32) == B_OK)
4192     ArgumentGotInt32 = true;
4193 
4194   /* Prepare a Human readable description of the scripting command. */
4195 
4196   switch (PropInfoPntr->commands[0])
4197   {
4198     case B_SET_PROPERTY:
4199       CommandText.SetTo ("Set ");
4200       break;
4201 
4202     case B_GET_PROPERTY:
4203       CommandText.SetTo ("Get ");
4204       break;
4205 
4206     case B_COUNT_PROPERTIES:
4207       CommandText.SetTo ("Count ");
4208       break;
4209 
4210     case B_CREATE_PROPERTY:
4211       CommandText.SetTo ("Create ");
4212       break;
4213 
4214     case B_DELETE_PROPERTY:
4215       CommandText.SetTo ("Delete ");
4216       break;
4217 
4218     case B_EXECUTE_PROPERTY:
4219       CommandText.SetTo ("Execute ");
4220       break;
4221 
4222     default:
4223       sprintf (TempString, "Bug: scripting command for \"%s\" has an unknown "
4224         "action code %d", PropInfoPntr->name,
4225         (int) PropInfoPntr->commands[0]);
4226       ErrorCode = -1;
4227       goto ErrorExit;
4228   }
4229   CommandText.Append (PropInfoPntr->name);
4230 
4231   /* Add on the argument value to our readable command, if there is one. */
4232 
4233   if (ArgumentGotString)
4234   {
4235     CommandText.Append (" \"");
4236     CommandText.Append (ArgumentString);
4237     CommandText.Append ("\"");
4238   }
4239   if (ArgumentGotBool)
4240     CommandText.Append (ArgumentBool ? " true" : " false");
4241   if (ArgumentGotInt32)
4242   {
4243     sprintf (TempString, " %ld", ArgumentInt32);
4244     CommandText.Append (TempString);
4245   }
4246 
4247   /* From now on the scripting command has been recognized and is in the
4248   correct format, so it always returns a B_REPLY message.  A readable version
4249   of the command is also added to make debugging easier. */
4250 
4251   ReplyMessage.what = B_REPLY;
4252   ReplyMessage.AddString ("CommandText", CommandText);
4253 
4254   /* Now actually do the command.  First prepare a default error message. */
4255 
4256   sprintf (TempString, "Operation code %d (get, set, count, etc) "
4257     "unsupported for property %s",
4258     (int) PropInfoPntr->commands[0], PropInfoPntr->name);
4259   ErrorCode = B_BAD_INDEX;
4260 
4261   switch (PropInfoPntr->extra_data)
4262   {
4263     case PN_DATABASE_FILE:
4264       switch (PropInfoPntr->commands[0])
4265       {
4266         case B_GET_PROPERTY: /* Get the database file name. */
4267           ReplyMessage.AddString (g_ResultName, m_DatabaseFileName);
4268           break;
4269 
4270         case B_SET_PROPERTY: /* Set the database file name to a new one. */
4271           if (!ArgumentGotString)
4272           {
4273             ErrorCode = B_BAD_TYPE;
4274             sprintf (TempString, "You need to specify a string for the "
4275               "SET %s command", PropInfoPntr->name);
4276             goto ErrorExit;
4277           }
4278           ErrorCode = TempPath.SetTo (ArgumentString, NULL /* leaf */,
4279             true /* normalize - verifies parent directories exist */);
4280           if (ErrorCode != B_OK)
4281           {
4282             sprintf (TempString, "New database path name of \"%s\" is invalid "
4283               "(parent directories must exist)", ArgumentString);
4284             goto ErrorExit;
4285           }
4286           if ((ErrorCode = SaveDatabaseIfNeeded (TempString)) != B_OK)
4287             goto ErrorExit;
4288           MakeDatabaseEmpty (); /* So that the new one gets loaded if used. */
4289 
4290           if (strlen (TempPath.Leaf ()) > NAME_MAX-strlen(g_BackupSuffix)-1)
4291           {
4292             /* Truncate the name so that there is enough space for the backup
4293             extension.  Approximately. */
4294             strcpy (TempString, TempPath.Leaf ());
4295             TempString [NAME_MAX - strlen (g_BackupSuffix) - 1] = 0;
4296             TempPath.GetParent (&TempPath);
4297             TempPath.Append (TempString);
4298           }
4299           m_DatabaseFileName.SetTo (TempPath.Path ());
4300           m_SettingsHaveChanged = true;
4301           break;
4302 
4303         case B_CREATE_PROPERTY: /* Make a new database file plus more. */
4304           if ((ErrorCode = CreateDatabaseFile (TempString)) != B_OK)
4305             goto ErrorExit;
4306           break;
4307 
4308         case B_DELETE_PROPERTY: /* Delete the file and its backups too. */
4309           if ((ErrorCode = DeleteDatabaseFile (TempString)) != B_OK)
4310             goto ErrorExit;
4311           break;
4312 
4313         case B_COUNT_PROPERTIES:
4314           if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
4315             goto ErrorExit;
4316           ReplyMessage.AddInt32 (g_ResultName, m_WordCount);
4317           break;
4318 
4319         default: /* Unknown operation code, error message already set. */
4320           goto ErrorExit;
4321       }
4322       break;
4323 
4324     case PN_SPAM:
4325     case PN_SPAM_STRING:
4326     case PN_GENUINE:
4327     case PN_GENUINE_STRING:
4328     case PN_UNCERTAIN:
4329       switch (PropInfoPntr->commands[0])
4330       {
4331         case B_COUNT_PROPERTIES: /* Get the number of spam/genuine messages. */
4332           if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
4333             goto ErrorExit;
4334           if (PropInfoPntr->extra_data == PN_SPAM ||
4335           PropInfoPntr->extra_data == PN_SPAM_STRING)
4336             ReplyMessage.AddInt32 (g_ResultName, m_TotalSpamMessages);
4337           else
4338             ReplyMessage.AddInt32 (g_ResultName, m_TotalGenuineMessages);
4339           break;
4340 
4341         case B_SET_PROPERTY: /* Add spam/genuine/uncertain to database. */
4342           if (!ArgumentGotString)
4343           {
4344             ErrorCode = B_BAD_TYPE;
4345             sprintf (TempString, "You need to specify a string (%s) "
4346               "for the SET %s command",
4347               (PropInfoPntr->extra_data == PN_GENUINE_STRING ||
4348               PropInfoPntr->extra_data == PN_SPAM_STRING)
4349               ? "text of the message to be added"
4350               : "pathname of the file containing the text to be added",
4351               PropInfoPntr->name);
4352             goto ErrorExit;
4353           }
4354           if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
4355             goto ErrorExit;
4356           if (PropInfoPntr->extra_data == PN_GENUINE ||
4357           PropInfoPntr->extra_data == PN_SPAM ||
4358           PropInfoPntr->extra_data == PN_UNCERTAIN)
4359             ErrorCode = AddFileToDatabase (
4360               (PropInfoPntr->extra_data == PN_SPAM) ? CL_SPAM :
4361               ((PropInfoPntr->extra_data == PN_GENUINE) ? CL_GENUINE :
4362               CL_UNCERTAIN),
4363               ArgumentString, TempString /* ErrorMessage */);
4364           else
4365             ErrorCode = AddStringToDatabase (
4366               (PropInfoPntr->extra_data == PN_SPAM_STRING) ?
4367               CL_SPAM : CL_GENUINE,
4368               ArgumentString, TempString /* ErrorMessage */);
4369           if (ErrorCode != B_OK)
4370             goto ErrorExit;
4371           break;
4372 
4373         default: /* Unknown operation code, error message already set. */
4374           goto ErrorExit;
4375       }
4376       break;
4377 
4378     case PN_IGNORE_PREVIOUS_CLASSIFICATION:
4379       switch (PropInfoPntr->commands[0])
4380       {
4381         case B_GET_PROPERTY:
4382           ReplyMessage.AddBool (g_ResultName, m_IgnorePreviousClassification);
4383           break;
4384 
4385         case B_SET_PROPERTY:
4386           if (!ArgumentGotBool)
4387           {
4388             ErrorCode = B_BAD_TYPE;
4389             sprintf (TempString, "You need to specify a boolean (true/yes, "
4390               "false/no) for the SET %s command", PropInfoPntr->name);
4391             goto ErrorExit;
4392           }
4393           m_IgnorePreviousClassification = ArgumentBool;
4394           m_SettingsHaveChanged = true;
4395           break;
4396 
4397         default: /* Unknown operation code, error message already set. */
4398           goto ErrorExit;
4399       }
4400       break;
4401 
4402     case PN_SERVER_MODE:
4403       switch (PropInfoPntr->commands[0])
4404       {
4405         case B_GET_PROPERTY:
4406           ReplyMessage.AddBool (g_ResultName, g_ServerMode);
4407           break;
4408 
4409         case B_SET_PROPERTY:
4410           if (!ArgumentGotBool)
4411           {
4412             ErrorCode = B_BAD_TYPE;
4413             sprintf (TempString, "You need to specify a boolean (true/yes, "
4414               "false/no) for the SET %s command", PropInfoPntr->name);
4415             goto ErrorExit;
4416           }
4417           g_ServerMode = ArgumentBool;
4418           m_SettingsHaveChanged = true;
4419           break;
4420 
4421         default: /* Unknown operation code, error message already set. */
4422           goto ErrorExit;
4423       }
4424       break;
4425 
4426     case PN_FLUSH:
4427       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
4428       (ErrorCode = SaveDatabaseIfNeeded (TempString)) == B_OK)
4429         break;
4430       goto ErrorExit;
4431 
4432     case PN_PURGE_AGE:
4433       switch (PropInfoPntr->commands[0])
4434       {
4435         case B_GET_PROPERTY:
4436           ReplyMessage.AddInt32 (g_ResultName, m_PurgeAge);
4437           break;
4438 
4439         case B_SET_PROPERTY:
4440           if (!ArgumentGotInt32)
4441           {
4442             ErrorCode = B_BAD_TYPE;
4443             sprintf (TempString, "You need to specify a 32 bit integer "
4444               "for the SET %s command", PropInfoPntr->name);
4445             goto ErrorExit;
4446           }
4447           m_PurgeAge = ArgumentInt32;
4448           m_SettingsHaveChanged = true;
4449           break;
4450 
4451         default: /* Unknown operation code, error message already set. */
4452           goto ErrorExit;
4453       }
4454       break;
4455 
4456     case PN_PURGE_POPULARITY:
4457       switch (PropInfoPntr->commands[0])
4458       {
4459         case B_GET_PROPERTY:
4460           ReplyMessage.AddInt32 (g_ResultName, m_PurgePopularity);
4461           break;
4462 
4463         case B_SET_PROPERTY:
4464           if (!ArgumentGotInt32)
4465           {
4466             ErrorCode = B_BAD_TYPE;
4467             sprintf (TempString, "You need to specify a 32 bit integer "
4468               "for the SET %s command", PropInfoPntr->name);
4469             goto ErrorExit;
4470           }
4471           m_PurgePopularity = ArgumentInt32;
4472           m_SettingsHaveChanged = true;
4473           break;
4474 
4475         default: /* Unknown operation code, error message already set. */
4476           goto ErrorExit;
4477       }
4478       break;
4479 
4480     case PN_PURGE:
4481       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
4482       (ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK &&
4483       (ErrorCode = PurgeOldWords (TempString)) == B_OK)
4484         break;
4485       goto ErrorExit;
4486 
4487     case PN_OLDEST:
4488       if (PropInfoPntr->commands[0] == B_GET_PROPERTY &&
4489       (ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK)
4490       {
4491         ReplyMessage.AddInt32 (g_ResultName, m_OldestAge);
4492         break;
4493       }
4494       goto ErrorExit;
4495 
4496     case PN_EVALUATE:
4497     case PN_EVALUATE_STRING:
4498       if (PropInfoPntr->commands[0] == B_SET_PROPERTY)
4499       {
4500         if (!ArgumentGotString)
4501         {
4502           ErrorCode = B_BAD_TYPE;
4503           sprintf (TempString, "You need to specify a string for the "
4504             "SET %s command", PropInfoPntr->name);
4505           goto ErrorExit;
4506         }
4507         if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK)
4508         {
4509           if (PropInfoPntr->extra_data == PN_EVALUATE)
4510           {
4511             if ((ErrorCode = EvaluateFile (ArgumentString, &ReplyMessage,
4512             TempString)) == B_OK)
4513               break;
4514           }
4515           else /* PN_EVALUATE_STRING */
4516           {
4517             if ((ErrorCode = EvaluateString (ArgumentString, StringBufferSize,
4518             &ReplyMessage, TempString)) == B_OK)
4519               break;
4520           }
4521         }
4522       }
4523       goto ErrorExit;
4524 
4525     case PN_RESET_TO_DEFAULTS:
4526       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY)
4527       {
4528         DefaultSettings ();
4529         break;
4530       }
4531       goto ErrorExit;
4532 
4533     case PN_INSTALL_THINGS:
4534       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
4535       (ErrorCode = InstallThings (TempString)) == B_OK)
4536         break;
4537       goto ErrorExit;
4538 
4539     case PN_SCORING_MODE:
4540       switch (PropInfoPntr->commands[0])
4541       {
4542         case B_GET_PROPERTY:
4543           ReplyMessage.AddString (g_ResultName,
4544             g_ScoringModeNames[m_ScoringMode]);
4545           break;
4546 
4547         case B_SET_PROPERTY:
4548           i = SM_MAX;
4549           if (ArgumentGotString)
4550             for (i = 0; i < SM_MAX; i++)
4551             {
4552               if (strcasecmp (ArgumentString, g_ScoringModeNames [i]) == 0)
4553               {
4554                 m_ScoringMode = (ScoringModes) i;
4555                 m_SettingsHaveChanged = true;
4556                 break;
4557               }
4558             }
4559           if (i >= SM_MAX) /* Didn't find a valid scoring mode word. */
4560           {
4561             ErrorCode = B_BAD_TYPE;
4562             sprintf (TempString, "You used the unrecognized \"%s\" as "
4563               "a scoring mode for the SET %s command.  Should be one of: ",
4564               ArgumentGotString ? ArgumentString : "not specified",
4565               PropInfoPntr->name);
4566             for (i = 0; i < SM_MAX; i++)
4567             {
4568               strcat (TempString, g_ScoringModeNames [i]);
4569               if (i < SM_MAX - 1)
4570                 strcat (TempString, ", ");
4571             }
4572             goto ErrorExit;
4573           }
4574           break;
4575 
4576         default: /* Unknown operation code, error message already set. */
4577           goto ErrorExit;
4578       }
4579       break;
4580 
4581     case PN_TOKENIZE_MODE:
4582       switch (PropInfoPntr->commands[0])
4583       {
4584         case B_GET_PROPERTY:
4585           ReplyMessage.AddString (g_ResultName,
4586             g_TokenizeModeNames[m_TokenizeMode]);
4587           break;
4588 
4589         case B_SET_PROPERTY:
4590           i = TM_MAX;
4591           if (ArgumentGotString)
4592             for (i = 0; i < TM_MAX; i++)
4593             {
4594               if (strcasecmp (ArgumentString, g_TokenizeModeNames [i]) == 0)
4595               {
4596                 m_TokenizeMode = (TokenizeModes) i;
4597                 m_SettingsHaveChanged = true;
4598                 break;
4599               }
4600             }
4601           if (i >= TM_MAX) /* Didn't find a valid tokenize mode word. */
4602           {
4603             ErrorCode = B_BAD_TYPE;
4604             sprintf (TempString, "You used the unrecognized \"%s\" as "
4605               "a tokenize mode for the SET %s command.  Should be one of: ",
4606               ArgumentGotString ? ArgumentString : "not specified",
4607               PropInfoPntr->name);
4608             for (i = 0; i < TM_MAX; i++)
4609             {
4610               strcat (TempString, g_TokenizeModeNames [i]);
4611               if (i < TM_MAX - 1)
4612                 strcat (TempString, ", ");
4613             }
4614             goto ErrorExit;
4615           }
4616           break;
4617 
4618         default: /* Unknown operation code, error message already set. */
4619           goto ErrorExit;
4620       }
4621       break;
4622 
4623     default:
4624       sprintf (TempString, "Bug!  Unrecognized property identification "
4625         "number %d (should be between 0 and %d).  Fix the entry in "
4626         "the g_ScriptingPropertyList array!",
4627         (int) PropInfoPntr->extra_data, PN_MAX - 1);
4628       goto ErrorExit;
4629   }
4630 
4631   /* Success. */
4632 
4633   ReplyMessage.AddInt32 ("error", B_OK);
4634   ErrorCode = MessagePntr->SendReply (&ReplyMessage,
4635     this /* Reply's reply handler */, 500000 /* send timeout */);
4636   if (ErrorCode != B_OK)
4637     cerr << "ProcessScriptingMessage failed to send a reply message, code " <<
4638     ErrorCode << " (" << strerror (ErrorCode) << ")" << " for " <<
4639     CommandText.String () << endl;
4640   SetCursor (B_CURSOR_SYSTEM_DEFAULT);
4641   return;
4642 
4643 ErrorExit: /* Error message in TempString, return code in ErrorCode. */
4644   ReplyMessage.AddInt32 ("error", ErrorCode);
4645   ReplyMessage.AddString ("message", TempString);
4646   DisplayErrorMessage (TempString, ErrorCode);
4647   ErrorCode = MessagePntr->SendReply (&ReplyMessage,
4648     this /* Reply's reply handler */, 500000 /* send timeout */);
4649   if (ErrorCode != B_OK)
4650     cerr << "ProcessScriptingMessage failed to send an error message, code " <<
4651     ErrorCode << " (" << strerror (ErrorCode) << ")" << " for " <<
4652     CommandText.String () << endl;
4653   SetCursor (B_CURSOR_SYSTEM_DEFAULT);
4654 }
4655 
4656 
4657 /* Since quitting stops the program before the results of a script command are
4658 received, we use a time delay to do the quit and make sure there are no pending
4659 commands being processed by the auxiliary looper which is sending us commands.
4660 Also, we have a countdown which can be interrupted by an incoming scripting
4661 message in case one client tells us to quit while another one is still using us
4662 (happens when you have two or more e-mail accounts).  But if the system is
4663 shutting down, quit immediately! */
4664 
4665 void
4666 ABSApp::Pulse ()
4667 {
4668   if (g_QuitCountdown == 0)
4669   {
4670     if (g_CommanderLooperPntr == NULL ||
4671     !g_CommanderLooperPntr->IsBusy ())
4672       PostMessage (B_QUIT_REQUESTED);
4673   }
4674   else if (g_QuitCountdown > 0)
4675   {
4676     cerr << "SpamDBM quitting in " << g_QuitCountdown << ".\n";
4677     g_QuitCountdown--;
4678   }
4679 }
4680 
4681 
4682 /* A quit request message has come in.  If the quit countdown has reached zero,
4683 allow the request, otherwise reject it (and start the countdown if it hasn't
4684 been started). */
4685 
4686 bool
4687 ABSApp::QuitRequested ()
4688 {
4689   BMessage  *QuitMessage;
4690   team_info  RemoteInfo;
4691   BMessenger RemoteMessenger;
4692   team_id    RemoteTeam;
4693 
4694   /* See if the quit is from the system shutdown command (which goes through
4695   the registrar server), if so, quit immediately. */
4696 
4697   QuitMessage = CurrentMessage ();
4698   if (QuitMessage != NULL && QuitMessage->IsSourceRemote ())
4699   {
4700     RemoteMessenger = QuitMessage->ReturnAddress ();
4701     RemoteTeam = RemoteMessenger.Team ();
4702     if (get_team_info (RemoteTeam, &RemoteInfo) == B_OK &&
4703     strstr (RemoteInfo.args, "registrar") != NULL)
4704       g_QuitCountdown = 0;
4705   }
4706 
4707   if (g_QuitCountdown == 0)
4708     return BApplication::QuitRequested ();
4709 
4710   if (g_QuitCountdown < 0)
4711 //    g_QuitCountdown = 10; /* Start the countdown. */
4712     g_QuitCountdown = 5; /* Quit more quickly */
4713 
4714   return false;
4715 }
4716 
4717 
4718 /* Go through the current database and delete words which are too old (time is
4719 equivalent to the number of messages added to the database) and too unpopular
4720 (words not used by many messages).  Hopefully this will get rid of words which
4721 are just hunks of binary or other garbage.  The database has been loaded
4722 elsewhere. */
4723 
4724 status_t
4725 ABSApp::PurgeOldWords (char *ErrorMessage)
4726 {
4727   uint32                  CurrentTime;
4728   StatisticsMap::iterator CurrentIter;
4729   StatisticsMap::iterator EndIter;
4730   StatisticsMap::iterator NextIter;
4731   char                    TempString [80];
4732 
4733   strcpy (ErrorMessage, "Purge can't fail"); /* So argument gets used. */
4734   CurrentTime = m_TotalGenuineMessages + m_TotalSpamMessages - 1;
4735   m_OldestAge = (uint32) -1 /* makes largest number possible */;
4736 
4737   EndIter = m_WordMap.end ();
4738   NextIter = m_WordMap.begin ();
4739   while (NextIter != EndIter) {
4740     CurrentIter = NextIter++;
4741 
4742     if (CurrentTime - CurrentIter->second.age >= m_PurgeAge &&
4743     CurrentIter->second.genuineCount + CurrentIter->second.spamCount <=
4744     m_PurgePopularity) {
4745       /* Delete this word, it is unpopular and old.  Sob. */
4746 
4747       m_WordMap.erase (CurrentIter);
4748       if (m_WordCount > 0)
4749         m_WordCount--;
4750 
4751       m_DatabaseHasChanged = true;
4752     }
4753     else /* This word is still in the database.  Update oldest age. */
4754     {
4755       if (CurrentIter->second.age < m_OldestAge)
4756         m_OldestAge = CurrentIter->second.age;
4757     }
4758   }
4759 
4760   /* Just a little bug check here.  Just in case. */
4761 
4762   if (m_WordCount != m_WordMap.size ()) {
4763     sprintf (TempString, "Our word count of %lu doesn't match the "
4764       "size of the database, %lu", m_WordCount, m_WordMap.size());
4765     DisplayErrorMessage (TempString, -1, "Bug!");
4766     m_WordCount = m_WordMap.size ();
4767   }
4768 
4769   return B_OK;
4770 }
4771 
4772 
4773 void
4774 ABSApp::ReadyToRun ()
4775 {
4776   DatabaseWindow *DatabaseWindowPntr;
4777   float           JunkFloat;
4778   BButton        *TempButtonPntr;
4779   BCheckBox      *TempCheckBoxPntr;
4780   font_height     TempFontHeight;
4781   BMenuBar       *TempMenuBarPntr;
4782   BMenuItem      *TempMenuItemPntr;
4783   BPopUpMenu     *TempPopUpMenuPntr;
4784   BRadioButton   *TempRadioButtonPntr;
4785   BRect           TempRect;
4786   const char     *TempString = "Testing My Things";
4787   BStringView    *TempStringViewPntr;
4788   BTextControl   *TempTextPntr;
4789   BWindow        *TempWindowPntr;
4790 
4791   /* This batch of code gets some measurements which will be used for laying
4792   out controls and other GUI elements.  Set the spacing between buttons and
4793   other controls to the width of the letter "M" in the user's desired font. */
4794 
4795  g_MarginBetweenControls = (int) be_plain_font->StringWidth ("M");
4796 
4797   /* Also find out how much space a line of text uses. */
4798 
4799   be_plain_font->GetHeight (&TempFontHeight);
4800   g_LineOfTextHeight = ceilf (
4801     TempFontHeight.ascent + TempFontHeight.descent + TempFontHeight.leading);
4802 
4803   /* Start finding out the height of various user interface gadgets, which can
4804   vary based on the current font size.  Make a temporary gadget, which is
4805   attached to our window, then resize it to its prefered size so that it
4806   accomodates the font size and other frills it needs. */
4807 
4808   TempWindowPntr = new (std::nothrow) BWindow (BRect (10, 20, 200, 200),
4809 	"Temporary Window", B_DOCUMENT_WINDOW,
4810 	B_NO_WORKSPACE_ACTIVATION | B_ASYNCHRONOUS_CONTROLS);
4811   if (TempWindowPntr == NULL) {
4812     DisplayErrorMessage ("Unable to create temporary window for finding "
4813       "sizes of controls.");
4814     g_QuitCountdown = 0;
4815     return;
4816   }
4817 
4818   TempRect = TempWindowPntr->Bounds ();
4819 
4820   /* Find the height of a single line of text in a BStringView. */
4821 
4822   TempStringViewPntr = new (std::nothrow) BStringView (TempRect, TempString, TempString);
4823   if (TempStringViewPntr != NULL) {
4824     TempWindowPntr->Lock();
4825     TempWindowPntr->AddChild (TempStringViewPntr);
4826     TempStringViewPntr->GetPreferredSize (&JunkFloat, &g_StringViewHeight);
4827     TempWindowPntr->RemoveChild (TempStringViewPntr);
4828     TempWindowPntr->Unlock();
4829     delete TempStringViewPntr;
4830   }
4831 
4832   /* Find the height of a button, which seems to be larger than a text
4833   control and can make life difficult.  Make a temporary button, which
4834   is attached to our window so that it resizes to accomodate the font size. */
4835 
4836   TempButtonPntr = new (std::nothrow) BButton (TempRect, TempString, TempString, NULL);
4837   if (TempButtonPntr != NULL) {
4838     TempWindowPntr->Lock();
4839     TempWindowPntr->AddChild (TempButtonPntr);
4840     TempButtonPntr->GetPreferredSize (&JunkFloat, &g_ButtonHeight);
4841     TempWindowPntr->RemoveChild (TempButtonPntr);
4842     TempWindowPntr->Unlock();
4843     delete TempButtonPntr;
4844   }
4845 
4846   /* Find the height of a text box. */
4847 
4848   TempTextPntr = new (std::nothrow) BTextControl (TempRect, TempString, NULL /* label */,
4849     TempString, NULL);
4850   if (TempTextPntr != NULL) {
4851     TempWindowPntr->Lock ();
4852     TempWindowPntr->AddChild (TempTextPntr);
4853     TempTextPntr->GetPreferredSize (&JunkFloat, &g_TextBoxHeight);
4854     TempWindowPntr->RemoveChild (TempTextPntr);
4855     TempWindowPntr->Unlock ();
4856     delete TempTextPntr;
4857   }
4858 
4859   /* Find the height of a checkbox control. */
4860 
4861   TempCheckBoxPntr = new (std::nothrow) BCheckBox (TempRect, TempString, TempString, NULL);
4862   if (TempCheckBoxPntr != NULL) {
4863     TempWindowPntr->Lock ();
4864     TempWindowPntr->AddChild (TempCheckBoxPntr);
4865     TempCheckBoxPntr->GetPreferredSize (&JunkFloat, &g_CheckBoxHeight);
4866     TempWindowPntr->RemoveChild (TempCheckBoxPntr);
4867     TempWindowPntr->Unlock ();
4868     delete TempCheckBoxPntr;
4869   }
4870 
4871   /* Find the height of a radio button control. */
4872 
4873   TempRadioButtonPntr =
4874     new (std::nothrow) BRadioButton (TempRect, TempString, TempString, NULL);
4875   if (TempRadioButtonPntr != NULL) {
4876     TempWindowPntr->Lock ();
4877     TempWindowPntr->AddChild (TempRadioButtonPntr);
4878     TempRadioButtonPntr->GetPreferredSize (&JunkFloat, &g_RadioButtonHeight);
4879     TempWindowPntr->RemoveChild (TempRadioButtonPntr);
4880     TempWindowPntr->Unlock ();
4881     delete TempRadioButtonPntr;
4882   }
4883 
4884   /* Find the height of a pop-up menu. */
4885 
4886   TempMenuBarPntr = new (std::nothrow) BMenuBar (TempRect, TempString,
4887     B_FOLLOW_LEFT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
4888     true /* resize to fit items */);
4889   TempPopUpMenuPntr = new (std::nothrow) BPopUpMenu (TempString);
4890   TempMenuItemPntr = new (std::nothrow) BMenuItem (TempString, new BMessage (12345), 'g');
4891 
4892   if (TempMenuBarPntr != NULL && TempPopUpMenuPntr != NULL &&
4893   TempMenuItemPntr != NULL)
4894   {
4895     TempPopUpMenuPntr->AddItem (TempMenuItemPntr);
4896     TempMenuBarPntr->AddItem (TempPopUpMenuPntr);
4897 
4898     TempWindowPntr->Lock ();
4899     TempWindowPntr->AddChild (TempMenuBarPntr);
4900     TempMenuBarPntr->GetPreferredSize (&JunkFloat, &g_PopUpMenuHeight);
4901     TempWindowPntr->RemoveChild (TempMenuBarPntr);
4902     TempWindowPntr->Unlock ();
4903     delete TempMenuBarPntr; // It will delete contents too.
4904   }
4905 
4906   TempWindowPntr->Lock ();
4907   TempWindowPntr->Quit ();
4908 
4909   SetPulseRate (500000);
4910 
4911   if (g_CommandLineMode)
4912     g_QuitCountdown = 0; /* Quit as soon as queued up commands done. */
4913   else /* GUI mode, make a window. */
4914   {
4915     DatabaseWindowPntr = new (std::nothrow) DatabaseWindow ();
4916     if (DatabaseWindowPntr == NULL) {
4917       DisplayErrorMessage ("Unable to create window.");
4918       g_QuitCountdown = 0;
4919     } else {
4920       DatabaseWindowPntr->Show (); /* Starts the window's message loop. */
4921     }
4922   }
4923 
4924   g_AppReadyToRunCompleted = true;
4925 }
4926 
4927 
4928 /* Given a mail component (body text, attachment, whatever), look for words in
4929 it.  If the tokenize mode specifies that it isn't one of the ones we are
4930 looking for, just skip it.  For container type components, recursively examine
4931 their contents, up to the maximum depth specified. */
4932 
4933 status_t
4934 ABSApp::RecursivelyTokenizeMailComponent (
4935   BMailComponent *ComponentPntr,
4936   const char *OptionalFileName,
4937   set<string> &WordSet,
4938   char *ErrorMessage,
4939   int RecursionLevel,
4940   int MaxRecursionLevel)
4941 {
4942   char                        AttachmentName [B_FILE_NAME_LENGTH];
4943   BMailAttachment            *AttachmentPntr;
4944   BMimeType                   ComponentMIMEType;
4945   BMailContainer             *ContainerPntr;
4946   BMallocIO                   ContentsIO;
4947   const char                 *ContentsBufferPntr;
4948   size_t                      ContentsBufferSize;
4949   status_t                    ErrorCode;
4950   bool                        ExamineComponent;
4951   const char                 *HeaderKeyPntr;
4952   const char                 *HeaderValuePntr;
4953   int                         i;
4954   int                         j;
4955   const char                 *NameExtension;
4956   int                         NumComponents;
4957   BMimeType                   TextAnyMIMEType ("text");
4958   BMimeType                   TextPlainMIMEType ("text/plain");
4959 
4960   if (ComponentPntr == NULL)
4961     return B_OK;
4962 
4963   /* Add things in the sub-headers that might be useful.  Things like the file
4964   name of attachments, the encoding type, etc. */
4965 
4966   if (m_TokenizeMode == TM_PLAIN_TEXT_HEADER ||
4967   m_TokenizeMode == TM_ANY_TEXT_HEADER ||
4968   m_TokenizeMode == TM_ALL_PARTS_HEADER ||
4969   m_TokenizeMode == TM_JUST_HEADER)
4970   {
4971     for (i = 0; i < 1000; i++)
4972     {
4973       HeaderKeyPntr = ComponentPntr->HeaderAt (i);
4974       if (HeaderKeyPntr == NULL)
4975         break;
4976       AddWordsToSet (HeaderKeyPntr, strlen (HeaderKeyPntr),
4977         'H' /* Prefix for Headers, uppercase unlike normal words. */, WordSet);
4978       for (j = 0; j < 1000; j++)
4979       {
4980         HeaderValuePntr = ComponentPntr->HeaderField (HeaderKeyPntr, j);
4981         if (HeaderValuePntr == NULL)
4982           break;
4983         AddWordsToSet (HeaderValuePntr, strlen (HeaderValuePntr),
4984           'H', WordSet);
4985       }
4986     }
4987   }
4988 
4989   /* Check the MIME type of the thing.  It's used to decide if the contents are
4990   worth examining for words. */
4991 
4992   ErrorCode = ComponentPntr->MIMEType (&ComponentMIMEType);
4993   if (ErrorCode != B_OK)
4994   {
4995     sprintf (ErrorMessage, "ABSApp::RecursivelyTokenizeMailComponent: "
4996       "Unable to get MIME type at level %d in \"%s\"",
4997       RecursionLevel, OptionalFileName);
4998     return ErrorCode;
4999   }
5000   if (ComponentMIMEType.Type() == NULL)
5001   {
5002     /* Have to make up a MIME type for things which don't have them, such as
5003     the main body text, otherwise it would get ignored. */
5004 
5005     if (NULL != dynamic_cast<BTextMailComponent *>(ComponentPntr))
5006       ComponentMIMEType.SetType ("text/plain");
5007   }
5008   if (!TextAnyMIMEType.Contains (&ComponentMIMEType) &&
5009   NULL != (AttachmentPntr = dynamic_cast<BMailAttachment *>(ComponentPntr)))
5010   {
5011     /* Sometimes spam doesn't give a text MIME type for text when they do an
5012     attachment (which is often base64 encoded).  Use the file name extension to
5013     see if it really is text. */
5014     NameExtension = NULL;
5015     if (AttachmentPntr->FileName (AttachmentName) >= 0)
5016       NameExtension = strrchr (AttachmentName, '.');
5017     if (NameExtension != NULL)
5018     {
5019       if (strcasecmp (NameExtension, ".txt") == 0)
5020         ComponentMIMEType.SetType ("text/plain");
5021       else if (strcasecmp (NameExtension, ".htm") == 0 ||
5022       strcasecmp (NameExtension, ".html") == 0)
5023         ComponentMIMEType.SetType ("text/html");
5024     }
5025   }
5026 
5027   switch (m_TokenizeMode)
5028   {
5029     case TM_PLAIN_TEXT:
5030     case TM_PLAIN_TEXT_HEADER:
5031       ExamineComponent = TextPlainMIMEType.Contains (&ComponentMIMEType);
5032       break;
5033 
5034     case TM_ANY_TEXT:
5035     case TM_ANY_TEXT_HEADER:
5036       ExamineComponent = TextAnyMIMEType.Contains (&ComponentMIMEType);
5037       break;
5038 
5039     case TM_ALL_PARTS:
5040     case TM_ALL_PARTS_HEADER:
5041       ExamineComponent = true;
5042       break;
5043 
5044     default:
5045       ExamineComponent = false;
5046       break;
5047   }
5048 
5049   if (ExamineComponent)
5050   {
5051     /* Get the contents of the component.  This will be UTF-8 text (converted
5052     from whatever encoding was used) for text attachments.  For other ones,
5053     it's just the raw data, or perhaps decoded from base64 encoding. */
5054 
5055     ContentsIO.SetBlockSize (16 * 1024);
5056     ErrorCode = ComponentPntr->GetDecodedData (&ContentsIO);
5057     if (ErrorCode == B_OK) /* Can fail for container components: no data. */
5058     {
5059       /* Look for words in the decoded data. */
5060 
5061       ContentsBufferPntr = (const char *) ContentsIO.Buffer ();
5062       ContentsBufferSize = ContentsIO.BufferLength ();
5063       if (ContentsBufferPntr != NULL /* can be empty */)
5064         AddWordsToSet (ContentsBufferPntr, ContentsBufferSize,
5065           0 /* no prefix character, this is body text */, WordSet);
5066     }
5067   }
5068 
5069   /* Examine any sub-components in the message. */
5070 
5071   if (RecursionLevel + 1 <= MaxRecursionLevel &&
5072   NULL != (ContainerPntr = dynamic_cast<BMailContainer *>(ComponentPntr)))
5073   {
5074     NumComponents = ContainerPntr->CountComponents ();
5075 
5076     for (i = 0; i < NumComponents; i++)
5077     {
5078       ComponentPntr = ContainerPntr->GetComponent (i);
5079 
5080       ErrorCode = RecursivelyTokenizeMailComponent (ComponentPntr,
5081         OptionalFileName, WordSet, ErrorMessage, RecursionLevel + 1,
5082         MaxRecursionLevel);
5083       if (ErrorCode != B_OK)
5084         break;
5085     }
5086   }
5087 
5088   return ErrorCode;
5089 }
5090 
5091 
5092 /* The user has tried to open a file or several files with this application,
5093 via Tracker's open-with menu item.  If it is a database type file, then change
5094 the database file name to it.  Otherwise, ask the user whether they want to
5095 classify it as spam or non-spam.  There will be at most around 100 files, BeOS
5096 R5.0.3's Tracker crashes if it tries to pass on more than that many using Open
5097 With... etc.  The command is sent to an intermediary thread where it is
5098 asynchronously converted into a scripting message(s) that are sent back to this
5099 BApplication.  The intermediary is needed since we can't recursively execute
5100 scripting messages while processing a message (this RefsReceived one). */
5101 
5102 void
5103 ABSApp::RefsReceived (BMessage *MessagePntr)
5104 {
5105   if (g_CommanderLooperPntr != NULL)
5106     g_CommanderLooperPntr->CommandReferences (MessagePntr);
5107 }
5108 
5109 
5110 /* A scripting command is looking for something to execute it.  See if it is
5111 targetted at our database. */
5112 
5113 BHandler * ABSApp::ResolveSpecifier (
5114   BMessage *MessagePntr,
5115   int32 Index,
5116   BMessage *SpecifierMsgPntr,
5117   int32 SpecificationKind,
5118   const char *PropertyPntr)
5119 {
5120   int i;
5121 
5122   /* See if it is one of our commands. */
5123 
5124   if (SpecificationKind == B_DIRECT_SPECIFIER)
5125   {
5126     for (i = PN_MAX - 1; i >= 0; i--)
5127     {
5128       if (strcasecmp (PropertyPntr, g_PropertyNames [i]) == 0)
5129         return this; /* Found it!  Return the Handler (which is us). */
5130     }
5131   }
5132 
5133   /* Handle an unrecognized scripting command, let the parent figure it out. */
5134 
5135   return BApplication::ResolveSpecifier (
5136     MessagePntr, Index, SpecifierMsgPntr, SpecificationKind, PropertyPntr);
5137 }
5138 
5139 
5140 /* Save the database if it hasn't been saved yet.  Otherwise do nothing. */
5141 
5142 status_t ABSApp::SaveDatabaseIfNeeded (char *ErrorMessage)
5143 {
5144   if (m_DatabaseHasChanged)
5145     return LoadSaveDatabase (false /* DoLoad */, ErrorMessage);
5146 
5147   return B_OK;
5148 }
5149 
5150 
5151 /* Presumably the file is an e-mail message (or at least the header portion of
5152 one).  Break it into parts: header, body and MIME components.  Then add the
5153 words in the portions that match the current tokenization settings to the set
5154 of words. */
5155 
5156 status_t ABSApp::TokenizeParts (
5157   BPositionIO *PositionIOPntr,
5158   const char *OptionalFileName,
5159   set<string> &WordSet,
5160   char *ErrorMessage)
5161 {
5162   status_t        ErrorCode = B_OK;
5163   BEmailMessage   WholeEMail;
5164 
5165   sprintf (ErrorMessage, "ABSApp::TokenizeParts: While getting e-mail "
5166     "headers, had problems with \"%s\"", OptionalFileName);
5167 
5168   ErrorCode = WholeEMail.SetToRFC822 (
5169     PositionIOPntr /* it does its own seeking to the start */,
5170     -1 /* length */, true /* parse_now */);
5171   if (ErrorCode < 0) goto ErrorExit;
5172 
5173   ErrorCode = RecursivelyTokenizeMailComponent (&WholeEMail,
5174     OptionalFileName, WordSet, ErrorMessage, 0 /* Initial recursion level */,
5175     (m_TokenizeMode == TM_JUST_HEADER) ? 0 : 500 /* Max recursion level */);
5176 
5177 ErrorExit:
5178   return ErrorCode;
5179 }
5180 
5181 
5182 /* Add all the words in the whole file or memory buffer to the supplied set.
5183 The file doesn't have to be an e-mail message since it isn't parsed for e-mail
5184 headers or MIME headers or anything.  It blindly adds everything that looks
5185 like a word, though it does convert quoted printable codes to the characters
5186 they represent.  See also AddWordsToSet which does something more advanced. */
5187 
5188 status_t ABSApp::TokenizeWhole (
5189   BPositionIO *PositionIOPntr,
5190   const char *OptionalFileName,
5191   set<string> &WordSet,
5192   char *ErrorMessage)
5193 {
5194   string                AccumulatedWord;
5195   uint8                 Buffer [16 * 1024];
5196   uint8                *BufferCurrentPntr = Buffer + 0;
5197   uint8                *BufferEndPntr = Buffer + 0;
5198   const char           *IOErrorString =
5199                           "TokenizeWhole: Error %ld while reading \"%s\"";
5200   size_t                Length;
5201   int                   Letter = ' ';
5202   char                  HexString [4];
5203   int                   NextLetter = ' ';
5204   int                   NextNextLetter = ' ';
5205 
5206   /* Use a buffer since reading single characters from a BFile is so slow.
5207   BufferCurrentPntr is the position of the next character to be read.  When it
5208   reaches BufferEndPntr, it is time to fill the buffer again. */
5209 
5210 #define ReadChar(CharVar) \
5211   { \
5212     if (BufferCurrentPntr < BufferEndPntr) \
5213       CharVar = *BufferCurrentPntr++; \
5214     else /* Try to fill the buffer. */ \
5215     { \
5216       ssize_t AmountRead; \
5217       AmountRead = PositionIOPntr->Read (Buffer, sizeof (Buffer)); \
5218       if (AmountRead < 0) \
5219       { \
5220         sprintf (ErrorMessage, IOErrorString, AmountRead, OptionalFileName); \
5221         return AmountRead; \
5222       } \
5223       else if (AmountRead == 0) \
5224         CharVar = EOF; \
5225       else \
5226       { \
5227         BufferEndPntr = Buffer + AmountRead; \
5228         BufferCurrentPntr = Buffer + 0; \
5229         CharVar = *BufferCurrentPntr++; \
5230       } \
5231     } \
5232   }
5233 
5234   /* Read all the words in the file and add them to our local set of words.  A
5235   set is used since we don't care how many times a word occurs. */
5236 
5237   while (true)
5238   {
5239     /* We read two letters ahead so that we can decode quoted printable
5240     characters (an equals sign followed by two hex digits or a new line).  Note
5241     that Letter can become EOF (-1) when end of file is reached. */
5242 
5243     Letter = NextLetter;
5244     NextLetter = NextNextLetter;
5245     ReadChar (NextNextLetter);
5246 
5247     /* Decode quoted printable codes first, so that the rest of the code just
5248     sees an ordinary character.  Or even nothing, if it is the hidden line
5249     break combination.  This may falsely corrupt stuff following an equals
5250     sign, but usually won't. */
5251 
5252     if (Letter == '=')
5253     {
5254       if ((NextLetter == '\r' && NextNextLetter == '\n') ||
5255       (NextLetter == '\n' && NextNextLetter == '\r'))
5256       {
5257         /* Make the "=\r\n" pair disappear.  It's not even white space. */
5258         ReadChar (NextLetter);
5259         ReadChar (NextNextLetter);
5260         continue;
5261       }
5262       if (NextLetter == '\n' || NextLetter == '\r')
5263       {
5264         /* Make the "=\n" pair disappear.  It's not even white space. */
5265         NextLetter = NextNextLetter;
5266         ReadChar (NextNextLetter);
5267         continue;
5268       }
5269       if (NextNextLetter != EOF &&
5270       isxdigit (NextLetter) && isxdigit (NextNextLetter))
5271       {
5272         /* Convert the hex code to a letter. */
5273         HexString[0] = NextLetter;
5274         HexString[1] = NextNextLetter;
5275         HexString[2] = 0;
5276         Letter = strtoul (HexString, NULL, 16 /* number system base */);
5277         ReadChar (NextLetter);
5278         ReadChar (NextNextLetter);
5279       }
5280     }
5281 
5282     /* Convert to lower case to improve word matches.  Of course this loses a
5283     bit of information, such as MONEY vs Money, an indicator of spam.  Well,
5284     apparently that isn't all that useful a distinction, so do it. */
5285 
5286     if (Letter >= 'A' && Letter < 'Z')
5287       Letter = Letter + ('a' - 'A');
5288 
5289     /* See if it is a letter we treat as white space - all control characters
5290     and all punctuation except for: apostrophe (so "it's" and possessive
5291     versions of words get stored), dash (for hyphenated words), dollar sign
5292     (for cash amounts), period (for IP addresses, we later remove trailing
5293     (periods).  Note that codes above 127 are UTF-8 characters, which we
5294     consider non-space. */
5295 
5296     if (Letter < 0 /* EOF */ || (Letter < 128 && g_SpaceCharacters[Letter]))
5297     {
5298       /* That space finished off a word.  Remove trailing periods... */
5299 
5300       while ((Length = AccumulatedWord.size()) > 0 &&
5301       AccumulatedWord [Length-1] == '.')
5302         AccumulatedWord.resize (Length - 1);
5303 
5304       /* If there's anything left in the word, add it to the set.  Also ignore
5305       words which are too big (it's probably some binary encoded data).  But
5306       leave room for supercalifragilisticexpialidoceous.  According to one web
5307       site, pneumonoultramicroscopicsilicovolcanoconiosis is the longest word
5308       currently in English.  Note that some uuencoded data was seen with a 60
5309       character line length. */
5310 
5311       if (Length > 0 && Length <= g_MaxWordLength)
5312         WordSet.insert (AccumulatedWord);
5313 
5314       /* Empty out the string to get ready for the next word. */
5315 
5316       AccumulatedWord.resize (0);
5317     }
5318     else /* Not a space-like character, add it to the word. */
5319       AccumulatedWord.append (1 /* one copy of the char */, (char) Letter);
5320 
5321     /* Stop at end of file or error.  Don't care which.  Exit here so that last
5322     word got processed. */
5323 
5324     if (Letter == EOF)
5325       break;
5326   }
5327 
5328   return B_OK;
5329 }
5330 
5331 
5332 
/******************************************************************************
 * Implementation of the ClassificationChoicesWindow class, constructor,
 * destructor and the rest of the member functions in mostly alphabetical
 * order.
 */
5338 
5339 ClassificationChoicesWindow::ClassificationChoicesWindow (
5340   BRect FrameRect,
5341   const char *FileName,
5342   int NumberOfFiles)
5343 : BWindow (FrameRect, "Classification Choices", B_TITLED_WINDOW,
5344     B_NOT_ZOOMABLE | B_NOT_RESIZABLE | B_ASYNCHRONOUS_CONTROLS),
5345   m_BulkModeSelectedPntr (NULL),
5346   m_ChoosenClassificationPntr (NULL)
5347 {
5348   ClassificationChoicesView *SubViewPntr;
5349 
5350   SubViewPntr = new ClassificationChoicesView (Bounds(),
5351     FileName, NumberOfFiles);
5352   AddChild (SubViewPntr);
5353   SubViewPntr->ResizeToPreferred ();
5354   ResizeTo (SubViewPntr->Frame().Width(), SubViewPntr->Frame().Height());
5355 }
5356 
5357 
5358 void
5359 ClassificationChoicesWindow::MessageReceived (BMessage *MessagePntr)
5360 {
5361   BControl *ControlPntr;
5362 
5363   if (MessagePntr->what >= MSG_CLASS_BUTTONS &&
5364   MessagePntr->what < MSG_CLASS_BUTTONS + CL_MAX)
5365   {
5366     if (m_ChoosenClassificationPntr != NULL)
5367       *m_ChoosenClassificationPntr =
5368         (ClassificationTypes) (MessagePntr->what - MSG_CLASS_BUTTONS);
5369     PostMessage (B_QUIT_REQUESTED); // Close and destroy the window.
5370     return;
5371   }
5372 
5373   if (MessagePntr->what == MSG_BULK_CHECKBOX)
5374   {
5375     if (m_BulkModeSelectedPntr != NULL &&
5376     MessagePntr->FindPointer ("source", (void **) &ControlPntr) == B_OK)
5377       *m_BulkModeSelectedPntr = (ControlPntr->Value() == B_CONTROL_ON);
5378     return;
5379   }
5380 
5381   if (MessagePntr->what == MSG_CANCEL_BUTTON)
5382   {
5383     PostMessage (B_QUIT_REQUESTED); // Close and destroy the window.
5384     return;
5385   }
5386 
5387   BWindow::MessageReceived (MessagePntr);
5388 }
5389 
5390 
5391 void
5392 ClassificationChoicesWindow::Go (
5393   bool *BulkModeSelectedPntr,
5394   ClassificationTypes *ChoosenClassificationPntr)
5395 {
5396   status_t  ErrorCode = 0;
5397   BView    *MainViewPntr;
5398   thread_id WindowThreadID;
5399 
5400   m_BulkModeSelectedPntr = BulkModeSelectedPntr;
5401   m_ChoosenClassificationPntr = ChoosenClassificationPntr;
5402   if (m_ChoosenClassificationPntr != NULL)
5403     *m_ChoosenClassificationPntr = CL_MAX;
5404 
5405   Show (); // Starts the window thread running.
5406 
5407   /* Move the window to the center of the screen it is now being displayed on
5408   (have to wait for it to be showing). */
5409 
5410   Lock ();
5411   MainViewPntr = FindView ("ClassificationChoicesView");
5412   if (MainViewPntr != NULL)
5413   {
5414     BRect   TempRect;
5415     BScreen TempScreen (this);
5416     float   X;
5417     float   Y;
5418 
5419     TempRect = TempScreen.Frame ();
5420     X = TempRect.Width() / 2;
5421     Y = TempRect.Height() / 2;
5422     TempRect = MainViewPntr->Frame();
5423     X -= TempRect.Width() / 2;
5424     Y -= TempRect.Height() / 2;
5425     MoveTo (ceilf (X), ceilf (Y));
5426   }
5427   Unlock ();
5428 
5429   /* Wait for the window to go away. */
5430 
5431   WindowThreadID = Thread ();
5432   if (WindowThreadID >= 0)
5433     // Delay until the window thread has died, presumably window deleted now.
5434     wait_for_thread (WindowThreadID, &ErrorCode);
5435 }
5436 
5437 
5438 
5439 /******************************************************************************
5440  * Implementation of the ClassificationChoicesView class, constructor,
5441  * destructor and the rest of the member functions in mostly alphabetical
5442  * order.
5443  */
5444 
5445 ClassificationChoicesView::ClassificationChoicesView (
5446   BRect FrameRect,
5447   const char *FileName,
5448   int NumberOfFiles)
5449 : BView (FrameRect, "ClassificationChoicesView",
5450     B_FOLLOW_TOP | B_FOLLOW_LEFT, B_WILL_DRAW | B_NAVIGABLE_JUMP),
5451   m_FileName (FileName),
5452   m_NumberOfFiles (NumberOfFiles),
5453   m_PreferredBottomY (ceilf (g_ButtonHeight * 10))
5454 {
5455 }
5456 
5457 
5458 void
5459 ClassificationChoicesView::AttachedToWindow ()
5460 {
5461   BButton            *ButtonPntr;
5462   BCheckBox          *CheckBoxPntr;
5463   ClassificationTypes Classification;
5464   float               Margin;
5465   float               RowHeight;
5466   float               RowTop;
5467   BTextView          *TextViewPntr;
5468   BRect               TempRect;
5469   char                TempString [2048];
5470   BRect               TextRect;
5471   float               X;
5472 
5473   SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
5474 
5475   RowHeight = g_ButtonHeight;
5476   if (g_CheckBoxHeight > RowHeight)
5477     RowHeight = g_CheckBoxHeight;
5478   RowHeight = ceilf (RowHeight * 1.1);
5479 
5480   TempRect = Bounds ();
5481   RowTop = TempRect.top;
5482 
5483   /* Show the file name text. */
5484 
5485   Margin = ceilf ((RowHeight - g_StringViewHeight) / 2);
5486   TempRect = Bounds ();
5487   TempRect.top = RowTop + Margin;
5488   TextRect = TempRect;
5489   TextRect.OffsetTo (0, 0);
5490   TextRect.InsetBy (g_MarginBetweenControls, 2);
5491   sprintf (TempString, "How do you want to classify the file named \"%s\"?",
5492     m_FileName);
5493   TextViewPntr = new BTextView (TempRect, "FileText", TextRect,
5494     B_FOLLOW_TOP | B_FOLLOW_LEFT, B_WILL_DRAW | B_FULL_UPDATE_ON_RESIZE);
5495   AddChild (TextViewPntr);
5496   TextViewPntr->SetText (TempString);
5497   TextViewPntr->MakeEditable (false);
5498   TextViewPntr->SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
5499   TextViewPntr->ResizeTo (TempRect.Width (),
5500     3 + TextViewPntr->TextHeight (0, sizeof (TempString)));
5501   RowTop = TextViewPntr->Frame().bottom + Margin;
5502 
5503   /* Make the classification buttons. */
5504 
5505   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
5506   TempRect = Bounds ();
5507   TempRect.top = RowTop + Margin;
5508   X = Bounds().left + g_MarginBetweenControls;
5509   for (Classification = (ClassificationTypes) 0; Classification < CL_MAX;
5510   Classification = (ClassificationTypes) ((int) Classification + 1))
5511   {
5512     TempRect = Bounds ();
5513     TempRect.top = RowTop + Margin;
5514     TempRect.left = X;
5515     sprintf (TempString, "%s Button",
5516       g_ClassificationTypeNames [Classification]);
5517     ButtonPntr = new BButton (TempRect, TempString,
5518       g_ClassificationTypeNames [Classification], new BMessage (
5519       ClassificationChoicesWindow::MSG_CLASS_BUTTONS + Classification));
5520     AddChild (ButtonPntr);
5521     ButtonPntr->ResizeToPreferred ();
5522     X = ButtonPntr->Frame().right + 3 * g_MarginBetweenControls;
5523   }
5524   RowTop += ceilf (RowHeight * 1.2);
5525 
5526   /* Make the Cancel button. */
5527 
5528   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
5529   TempRect = Bounds ();
5530   TempRect.top = RowTop + Margin;
5531   TempRect.left += g_MarginBetweenControls;
5532   ButtonPntr = new BButton (TempRect, "Cancel Button",
5533     "Cancel", new BMessage (ClassificationChoicesWindow::MSG_CANCEL_BUTTON));
5534   AddChild (ButtonPntr);
5535   ButtonPntr->ResizeToPreferred ();
5536   X = ButtonPntr->Frame().right + g_MarginBetweenControls;
5537 
5538   /* Make the checkbox for bulk operations. */
5539 
5540   if (m_NumberOfFiles > 1)
5541   {
5542     Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
5543     TempRect = Bounds ();
5544     TempRect.top = RowTop + Margin;
5545     TempRect.left = X;
5546     sprintf (TempString, "Mark all %d remaining messages the same way.",
5547       m_NumberOfFiles - 1);
5548     CheckBoxPntr = new BCheckBox (TempRect, "BulkBox", TempString,
5549       new BMessage (ClassificationChoicesWindow::MSG_BULK_CHECKBOX));
5550     AddChild (CheckBoxPntr);
5551     CheckBoxPntr->ResizeToPreferred ();
5552   }
5553   RowTop += RowHeight;
5554 
5555   m_PreferredBottomY = RowTop;
5556 }
5557 
5558 
5559 void
5560 ClassificationChoicesView::GetPreferredSize (float *width, float *height)
5561 {
5562   if (width != NULL)
5563     *width = Bounds().Width();
5564   if (height != NULL)
5565     *height = m_PreferredBottomY;
5566 }
5567 
5568 
5569 
5570 /******************************************************************************
5571  * Implementation of the CommanderLooper class, constructor, destructor and the
5572  * rest of the member functions in mostly alphabetical order.
5573  */
5574 
5575 CommanderLooper::CommanderLooper ()
5576 : BLooper ("CommanderLooper", B_NORMAL_PRIORITY),
5577   m_IsBusy (false)
5578 {
5579 }
5580 
5581 
5582 CommanderLooper::~CommanderLooper ()
5583 {
5584   g_CommanderLooperPntr = NULL;
5585   delete g_CommanderMessenger;
5586   g_CommanderMessenger = NULL;
5587 }
5588 
5589 
5590 /* Process some command line arguments.  Basically just send a message to this
5591 looper itself to do the work later.  That way the caller can continue doing
5592 whatever they're doing, particularly if it's the BApplication. */
5593 
5594 void
5595 CommanderLooper::CommandArguments (int argc, char **argv)
5596 {
5597   int      i;
5598   BMessage InternalMessage;
5599 
5600   InternalMessage.what = MSG_COMMAND_ARGUMENTS;
5601   for (i = 0; i < argc; i++)
5602     InternalMessage.AddString ("arg", argv[i]);
5603 
5604   PostMessage (&InternalMessage);
5605 }
5606 
5607 
5608 /* Copy the refs out of the given message and stuff them into an internal
5609 message to ourself (so that the original message can be returned to the caller,
5610 and if it is Tracker, it can close the file handles it has open).  Optionally
5611 allow preset classification rather than asking the user (set BulkMode to TRUE
5612 and specify the class with BulkClassification). */
5613 
5614 void
5615 CommanderLooper::CommandReferences (
5616   BMessage *MessagePntr,
5617   bool BulkMode,
5618   ClassificationTypes BulkClassification)
5619 {
5620   entry_ref EntryRef;
5621   int       i;
5622   BMessage  InternalMessage;
5623 
5624   InternalMessage.what = MSG_COMMAND_FILE_REFS;
5625   for (i = 0; MessagePntr->FindRef ("refs", i, &EntryRef) == B_OK; i++)
5626     InternalMessage.AddRef ("refs", &EntryRef);
5627   InternalMessage.AddBool ("BulkMode", BulkMode);
5628   InternalMessage.AddInt32 ("BulkClassification", BulkClassification);
5629 
5630   PostMessage (&InternalMessage);
5631 }
5632 
5633 
5634 /* This function is called by other threads to see if the CommanderLooper is
5635 busy working on something. */
5636 
5637 bool
5638 CommanderLooper::IsBusy ()
5639 {
5640   if (m_IsBusy)
5641     return true;
5642 
5643   if (IsLocked () || !MessageQueue()->IsEmpty ())
5644     return true;
5645 
5646   return false;
5647 }
5648 
5649 
5650 void
5651 
5652 CommanderLooper::MessageReceived (BMessage *MessagePntr)
5653 {
5654   m_IsBusy = true;
5655 
5656   if (MessagePntr->what == MSG_COMMAND_ARGUMENTS)
5657     ProcessArgs (MessagePntr);
5658   else if (MessagePntr->what == MSG_COMMAND_FILE_REFS)
5659     ProcessRefs (MessagePntr);
5660   else
5661     BLooper::MessageReceived (MessagePntr);
5662 
5663   m_IsBusy = false;
5664 }
5665 
5666 
5667 /* Process the command line by converting it into a series of scripting
5668 messages (possibly thousands) and sent them to the BApplication synchronously
5669 (so we can print the result). */
5670 
5671 void
5672 CommanderLooper::ProcessArgs (BMessage *MessagePntr)
5673 {
5674   int32                 argc = 0;
5675   const char          **argv = NULL;
5676   int                   ArgumentIndex;
5677   uint32                CommandCode;
5678   const char           *CommandWord;
5679   status_t              ErrorCode;
5680   const char           *ErrorTitle = "ProcessArgs";
5681   char                 *EndPntr;
5682   int32                 i;
5683   BMessage              ReplyMessage;
5684   BMessage              ScriptMessage;
5685   struct property_info *PropInfoPntr;
5686   const char           *PropertyName;
5687   bool                  TempBool;
5688   float                 TempFloat;
5689   int32                 TempInt32;
5690   const char           *TempStringPntr;
5691   type_code             TypeCode;
5692   const char           *ValuePntr;
5693 
5694   /* Get the argument count and pointers to arguments out of the message and
5695   into our argc and argv. */
5696 
5697   ErrorCode = MessagePntr->GetInfo ("arg", &TypeCode, &argc);
5698   if (ErrorCode != B_OK || TypeCode != B_STRING_TYPE)
5699   {
5700     DisplayErrorMessage ("Unable to find argument strings in message",
5701       ErrorCode, ErrorTitle);
5702     goto ErrorExit;
5703   }
5704 
5705   if (argc < 2)
5706   {
5707     cerr << PrintUsage;
5708     DisplayErrorMessage ("You need to specify a command word, like GET, SET "
5709       "and so on followed by a property, like DatabaseFile, and maybe "
5710       "followed by a value of some sort", -1, ErrorTitle);
5711     goto ErrorExit;
5712   }
5713 
5714   argv = (const char **) malloc (sizeof (char *) * argc);
5715   if (argv == NULL)
5716   {
5717     DisplayErrorMessage ("Out of memory when allocating argv array",
5718       ENOMEM, ErrorTitle);
5719     goto ErrorExit;
5720   }
5721 
5722   for (i = 0; i < argc; i++)
5723   {
5724     if ((ErrorCode = MessagePntr->FindString ("arg", i, &argv[i])) != B_OK)
5725     {
5726       DisplayErrorMessage ("Unable to find argument in the BMessage",
5727         ErrorCode, ErrorTitle);
5728       goto ErrorExit;
5729     }
5730   }
5731 
5732   CommandWord = argv[1];
5733 
5734   /* Special case for the Quit command since it isn't a scripting command. */
5735 
5736   if (strcasecmp (CommandWord, "quit") == 0)
5737   {
5738     g_QuitCountdown = 10;
5739     goto ErrorExit;
5740   }
5741 
5742   /* Find the corresponding scripting command. */
5743 
5744   if (strcasecmp (CommandWord, "set") == 0)
5745     CommandCode = B_SET_PROPERTY;
5746   else if (strcasecmp (CommandWord, "get") == 0)
5747     CommandCode = B_GET_PROPERTY;
5748   else if (strcasecmp (CommandWord, "count") == 0)
5749     CommandCode = B_COUNT_PROPERTIES;
5750   else if (strcasecmp (CommandWord, "create") == 0)
5751     CommandCode = B_CREATE_PROPERTY;
5752   else if (strcasecmp (CommandWord, "delete") == 0)
5753     CommandCode = B_DELETE_PROPERTY;
5754   else
5755     CommandCode = B_EXECUTE_PROPERTY;
5756 
5757   if (CommandCode == B_EXECUTE_PROPERTY)
5758   {
5759     PropertyName = CommandWord;
5760     ArgumentIndex = 2; /* Arguments to the command start at this index. */
5761   }
5762   else
5763   {
5764     if (CommandCode == B_SET_PROPERTY)
5765     {
5766       /* SET commands require at least one argument value. */
5767       if (argc < 4)
5768       {
5769         cerr << PrintUsage;
5770         DisplayErrorMessage ("SET commands require at least one "
5771           "argument value after the property name", -1, ErrorTitle);
5772         goto ErrorExit;
5773       }
5774     }
5775     else
5776       if (argc < 3)
5777       {
5778         cerr << PrintUsage;
5779         DisplayErrorMessage ("You need to specify a property to act on",
5780           -1, ErrorTitle);
5781         goto ErrorExit;
5782       }
5783     PropertyName = argv[2];
5784     ArgumentIndex = 3;
5785   }
5786 
5787   /* See if it is one of our commands. */
5788 
5789   for (PropInfoPntr = g_ScriptingPropertyList + 0; true; PropInfoPntr++)
5790   {
5791     if (PropInfoPntr->name == 0)
5792     {
5793       cerr << PrintUsage;
5794       DisplayErrorMessage ("The property specified isn't known or "
5795         "doesn't support the requested action (usually means it is an "
5796         "unknown command)", -1, ErrorTitle);
5797       goto ErrorExit; /* Unrecognized command. */
5798     }
5799 
5800     if (PropInfoPntr->commands[0] == CommandCode &&
5801     strcasecmp (PropertyName, PropInfoPntr->name) == 0)
5802       break;
5803   }
5804 
5805   /* Make the equivalent command message.  For commands with multiple
5806   arguments, repeat the message for each single argument and just change the
5807   data portion for each extra argument.  Send the command and wait for a reply,
5808   which we'll print out. */
5809 
5810   ScriptMessage.MakeEmpty ();
5811   ScriptMessage.what = CommandCode;
5812   ScriptMessage.AddSpecifier (PropertyName);
5813   while (true)
5814   {
5815     if (ArgumentIndex < argc) /* If there are arguments to be added. */
5816     {
5817       ValuePntr = argv[ArgumentIndex];
5818 
5819       /* Convert the value into the likely kind of data. */
5820 
5821       if (strcasecmp (ValuePntr, "yes") == 0 ||
5822       strcasecmp (ValuePntr, "true") == 0)
5823         ScriptMessage.AddBool (g_DataName, true);
5824       else if (strcasecmp (ValuePntr, "no") == 0 ||
5825       strcasecmp (ValuePntr, "false") == 0)
5826         ScriptMessage.AddBool (g_DataName, false);
5827       else
5828       {
5829         /* See if it is a number. */
5830         i = strtol (ValuePntr, &EndPntr, 0);
5831         if (*EndPntr == 0)
5832           ScriptMessage.AddInt32 (g_DataName, i);
5833         else /* Nope, it's just a string. */
5834           ScriptMessage.AddString (g_DataName, ValuePntr);
5835       }
5836     }
5837 
5838     ErrorCode = be_app_messenger.SendMessage (&ScriptMessage, &ReplyMessage);
5839     if (ErrorCode != B_OK)
5840     {
5841       DisplayErrorMessage ("Unable to send scripting command",
5842         ErrorCode, ErrorTitle);
5843       goto ErrorExit;
5844     }
5845 
5846     /* Print the reply to the scripting command.  Even in server mode.  To
5847     standard output. */
5848 
5849     if (ReplyMessage.FindString ("CommandText", &TempStringPntr) == B_OK)
5850     {
5851       TempInt32 = -1;
5852       if (ReplyMessage.FindInt32 ("error", &TempInt32) == B_OK &&
5853       TempInt32 == B_OK)
5854       {
5855         /* It's a successful reply to one of our scripting messages.  Print out
5856         the returned values code for command line users to see. */
5857 
5858         cout << "Result of command to " << TempStringPntr << " is:\t";
5859         if (ReplyMessage.FindString (g_ResultName, &TempStringPntr) == B_OK)
5860           cout << "\"" << TempStringPntr << "\"";
5861         else if (ReplyMessage.FindInt32 (g_ResultName, &TempInt32) == B_OK)
5862           cout << TempInt32;
5863         else if (ReplyMessage.FindFloat (g_ResultName, &TempFloat) == B_OK)
5864           cout << TempFloat;
5865         else if (ReplyMessage.FindBool (g_ResultName, &TempBool) == B_OK)
5866           cout << (TempBool ? "true" : "false");
5867         else
5868           cout << "just plain success";
5869         if (ReplyMessage.FindInt32 ("count", &TempInt32) == B_OK)
5870           cout << "\t(count " << TempInt32 << ")";
5871         for (i = 0; (i < 50) &&
5872         ReplyMessage.FindString ("words", i, &TempStringPntr) == B_OK &&
5873         ReplyMessage.FindFloat ("ratios", i, &TempFloat) == B_OK;
5874         i++)
5875         {
5876           if (i == 0)
5877             cout << "\twith top words:\t";
5878           else
5879             cout << "\t";
5880           cout << TempStringPntr << "/" << TempFloat;
5881         }
5882         cout << endl;
5883       }
5884       else /* An error reply, print out the error, even in server mode. */
5885       {
5886         cout << "Failure of command " << TempStringPntr << ", error ";
5887         cout << TempInt32 << " (" << strerror (TempInt32) << ")";
5888         if (ReplyMessage.FindString ("message", &TempStringPntr) == B_OK)
5889           cout << ", message: " << TempStringPntr;
5890         cout << "." << endl;
5891       }
5892     }
5893 
5894     /* Advance to the next argument and its scripting message. */
5895 
5896     ScriptMessage.RemoveName (g_DataName);
5897     if (++ArgumentIndex >= argc)
5898       break;
5899   }
5900 
5901 ErrorExit:
5902   free (argv);
5903 }
5904 
5905 
5906 /* Given a bunch of references to files, open the files.  If it's a database
5907 file, switch to using it as a database.  Otherwise, treat them as text files
5908 and add them to the database.  Prompt the user for the spam or genuine or
5909 uncertain (declassification) choice, with the option to bulk mark many files at
5910 once. */
5911 
5912 void
5913 CommanderLooper::ProcessRefs (BMessage *MessagePntr)
5914 {
5915   bool                         BulkMode = false;
5916   ClassificationTypes          BulkClassification = CL_GENUINE;
5917   ClassificationChoicesWindow *ChoiceWindowPntr;
5918   BEntry                       Entry;
5919   entry_ref                    EntryRef;
5920   status_t                     ErrorCode;
5921   const char                  *ErrorTitle = "CommanderLooper::ProcessRefs";
5922   int32                        NumberOfRefs = 0;
5923   BPath                        Path;
5924   int                          RefIndex;
5925   BMessage                     ReplyMessage;
5926   BMessage                     ScriptingMessage;
5927   bool                         TempBool;
5928   BFile                        TempFile;
5929   int32                        TempInt32;
5930   char                         TempString [PATH_MAX + 1024];
5931   type_code                    TypeCode;
5932 
5933   // Wait for ReadyToRun to finish initializing the globals with the sizes of
5934   // the controls, since they are needed when we show the custom alert box for
5935   // choosing the message type.
5936 
5937   TempInt32 = 0;
5938   while (!g_AppReadyToRunCompleted && TempInt32++ < 10)
5939     snooze (200000);
5940 
5941   ErrorCode = MessagePntr->GetInfo ("refs", &TypeCode, &NumberOfRefs);
5942   if (ErrorCode != B_OK || TypeCode != B_REF_TYPE || NumberOfRefs <= 0)
5943   {
5944     DisplayErrorMessage ("Unable to get refs from the message",
5945       ErrorCode, ErrorTitle);
5946     return;
5947   }
5948 
5949   if (MessagePntr->FindBool ("BulkMode", &TempBool) == B_OK)
5950     BulkMode = TempBool;
5951   if (MessagePntr->FindInt32 ("BulkClassification", &TempInt32) == B_OK &&
5952   TempInt32 >= 0 && TempInt32 < CL_MAX)
5953     BulkClassification = (ClassificationTypes) TempInt32;
5954 
5955   for (RefIndex = 0;
5956   MessagePntr->FindRef ("refs", RefIndex, &EntryRef) == B_OK;
5957   RefIndex++)
5958   {
5959     ScriptingMessage.MakeEmpty ();
5960     ScriptingMessage.what = 0; /* Haven't figured out what to do yet. */
5961 
5962     /* See if the entry is a valid file or directory or other thing. */
5963 
5964     ErrorCode = Entry.SetTo (&EntryRef, true /* traverse symbolic links */);
5965     if (ErrorCode != B_OK ||
5966     ((ErrorCode = /* assignment */ B_ENTRY_NOT_FOUND) != 0 /* this pacifies
5967     mwcc -nwhitehorn */ && !Entry.Exists ()) ||
5968     ((ErrorCode = Entry.GetPath (&Path)) != B_OK))
5969     {
5970       DisplayErrorMessage ("Bad entry reference encountered, will skip it",
5971         ErrorCode, ErrorTitle);
5972       BulkMode = false;
5973       continue; /* Bad file reference, try the next one. */
5974     }
5975 
5976     /* If it's a file, check if it is a spam database file.  Go by the magic
5977     text at the start of the file, in case someone has edited the file with a
5978     spreadsheet or other tool and lost the MIME type. */
5979 
5980     if (Entry.IsFile ())
5981     {
5982       ErrorCode = TempFile.SetTo (&Entry, B_READ_ONLY);
5983       if (ErrorCode != B_OK)
5984       {
5985         sprintf (TempString, "Unable to open file \"%s\" for reading, will "
5986           "skip it", Path.Path ());
5987         DisplayErrorMessage (TempString, ErrorCode, ErrorTitle);
5988         BulkMode = false;
5989         continue;
5990       }
5991       if (TempFile.Read (TempString, strlen (g_DatabaseRecognitionString)) ==
5992       (int) strlen (g_DatabaseRecognitionString) && strncmp (TempString,
5993       g_DatabaseRecognitionString, strlen (g_DatabaseRecognitionString)) == 0)
5994       {
5995         ScriptingMessage.what = B_SET_PROPERTY;
5996         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
5997         ScriptingMessage.AddString (g_DataName, Path.Path ());
5998       }
5999       TempFile.Unset ();
6000     }
6001 
6002     /* Not a database file.  Could be a directory or a file.  Submit it as
6003     something to be marked spam or genuine. */
6004 
6005     if (ScriptingMessage.what == 0)
6006     {
6007       if (!Entry.IsFile ())
6008       {
6009         sprintf (TempString, "\"%s\" is not a file, can't do anything with it",
6010           Path.Path ());
6011         DisplayErrorMessage (TempString, -1, ErrorTitle);
6012         BulkMode = false;
6013         continue;
6014       }
6015 
6016       if (!BulkMode) /* Have to ask the user. */
6017       {
6018         ChoiceWindowPntr = new ClassificationChoicesWindow (
6019           BRect (40, 40, 40 + 50 * g_MarginBetweenControls,
6020           40 + g_ButtonHeight * 5), Path.Path (), NumberOfRefs - RefIndex);
6021         ChoiceWindowPntr->Go (&BulkMode, &BulkClassification);
6022         if (BulkClassification == CL_MAX)
6023           break; /* Cancel was picked. */
6024       }
6025 
6026       /* Format the command for classifying the file. */
6027 
6028       ScriptingMessage.what = B_SET_PROPERTY;
6029 
6030       if (BulkClassification == CL_GENUINE)
6031         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_GENUINE]);
6032       else if (BulkClassification == CL_SPAM)
6033         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_SPAM]);
6034       else if (BulkClassification == CL_UNCERTAIN)
6035         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_UNCERTAIN]);
6036       else /* Broken code */
6037         break;
6038       ScriptingMessage.AddString (g_DataName, Path.Path ());
6039     }
6040 
6041     /* Tell the BApplication to do the work, and wait for it to finish.  The
6042     BApplication will display any error messages for us. */
6043 
6044     ErrorCode =
6045       be_app_messenger.SendMessage (&ScriptingMessage, &ReplyMessage);
6046     if (ErrorCode != B_OK)
6047     {
6048       DisplayErrorMessage ("Unable to send scripting command",
6049         ErrorCode, ErrorTitle);
6050       return;
6051     }
6052 
6053     /* If there was an error, allow the user to stop by switching off bulk
6054     mode.  The message will already have been displayed in an alert box, if
6055     server mode is off. */
6056 
6057     if (ReplyMessage.FindInt32 ("error", &TempInt32) != B_OK ||
6058     TempInt32 != B_OK)
6059       BulkMode = false;
6060   }
6061 }
6062 
6063 
6064 
6065 /******************************************************************************
6066  * Implementation of the ControlsView class, constructor, destructor and the
6067  * rest of the member functions in mostly alphabetical order.
6068  */
6069 
6070 ControlsView::ControlsView (BRect NewBounds)
6071 : BView (NewBounds, "ControlsView", B_FOLLOW_TOP | B_FOLLOW_LEFT_RIGHT,
6072     B_WILL_DRAW | B_PULSE_NEEDED | B_NAVIGABLE_JUMP | B_FRAME_EVENTS),
6073   m_AboutButtonPntr (NULL),
6074   m_AddExampleButtonPntr (NULL),
6075   m_BrowseButtonPntr (NULL),
6076   m_BrowseFilePanelPntr (NULL),
6077   m_CreateDatabaseButtonPntr (NULL),
6078   m_DatabaseFileNameTextboxPntr (NULL),
6079   m_DatabaseLoadDone (false),
6080   m_EstimateSpamButtonPntr (NULL),
6081   m_EstimateSpamFilePanelPntr (NULL),
6082   m_GenuineCountTextboxPntr (NULL),
6083   m_IgnorePreviousClassCheckboxPntr (NULL),
6084   m_InstallThingsButtonPntr (NULL),
6085   m_PurgeAgeTextboxPntr (NULL),
6086   m_PurgeButtonPntr (NULL),
6087   m_PurgePopularityTextboxPntr (NULL),
6088   m_ResetToDefaultsButtonPntr (NULL),
6089   m_ScoringModeMenuBarPntr (NULL),
6090   m_ScoringModePopUpMenuPntr (NULL),
6091   m_ServerModeCheckboxPntr (NULL),
6092   m_SpamCountTextboxPntr (NULL),
6093   m_TimeOfLastPoll (0),
6094   m_TokenizeModeMenuBarPntr (NULL),
6095   m_TokenizeModePopUpMenuPntr (NULL),
6096   m_WordCountTextboxPntr (NULL)
6097 {
6098 }
6099 
6100 
6101 ControlsView::~ControlsView ()
6102 {
6103   if (m_BrowseFilePanelPntr != NULL)
6104   {
6105     delete m_BrowseFilePanelPntr;
6106     m_BrowseFilePanelPntr = NULL;
6107   }
6108 
6109   if (m_EstimateSpamFilePanelPntr != NULL)
6110   {
6111     delete m_EstimateSpamFilePanelPntr;
6112     m_EstimateSpamFilePanelPntr = NULL;
6113   }
6114 }
6115 
6116 
6117 void
6118 ControlsView::AttachedToWindow ()
6119 {
6120   float         BigPurgeButtonTop;
6121   BMessage      CommandMessage;
6122   const char   *EightDigitsString = " 12345678 ";
6123   float         Height;
6124   float         Margin;
6125   float         RowHeight;
6126   float         RowTop;
6127   ScoringModes  ScoringMode;
6128   const char   *StringPntr;
6129   BMenuItem    *TempMenuItemPntr;
6130   BRect         TempRect;
6131   char          TempString [PATH_MAX];
6132   TokenizeModes TokenizeMode;
6133   float         Width;
6134   float         X;
6135 
6136   SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
6137 
6138   TempRect = Bounds ();
6139   X = TempRect.right;
6140   RowTop = TempRect.top;
6141   RowHeight = g_ButtonHeight;
6142   if (g_TextBoxHeight > RowHeight)
6143     RowHeight = g_TextBoxHeight;
6144   RowHeight = ceilf (RowHeight * 1.1);
6145 
6146   /* Make the Create button at the far right of the first row of controls,
6147   which are all database file related. */
6148 
6149   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6150   TempRect = Bounds ();
6151   TempRect.top = RowTop + Margin;
6152   TempRect.bottom = TempRect.top + g_ButtonHeight;
6153 
6154   CommandMessage.MakeEmpty ();
6155   CommandMessage.what = B_CREATE_PROPERTY;
6156   CommandMessage.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
6157   m_CreateDatabaseButtonPntr = new BButton (TempRect, "Create Button",
6158     "Create", new BMessage (CommandMessage), B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6159   if (m_CreateDatabaseButtonPntr == NULL) goto ErrorExit;
6160   AddChild (m_CreateDatabaseButtonPntr);
6161   m_CreateDatabaseButtonPntr->SetTarget (be_app);
6162   m_CreateDatabaseButtonPntr->ResizeToPreferred ();
6163   m_CreateDatabaseButtonPntr->GetPreferredSize (&Width, &Height);
6164   m_CreateDatabaseButtonPntr->MoveTo (X - Width, TempRect.top);
6165   X -= Width + g_MarginBetweenControls;
6166 
6167   /* Make the Browse button, middle of the first row. */
6168 
6169   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6170   TempRect = Bounds ();
6171   TempRect.top = RowTop + Margin;
6172   TempRect.bottom = TempRect.top + g_ButtonHeight;
6173 
6174   m_BrowseButtonPntr = new BButton (TempRect, "Browse Button",
6175     "Browse…", new BMessage (MSG_BROWSE_BUTTON), B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6176   if (m_BrowseButtonPntr == NULL) goto ErrorExit;
6177   AddChild (m_BrowseButtonPntr);
6178   m_BrowseButtonPntr->SetTarget (this);
6179   m_BrowseButtonPntr->ResizeToPreferred ();
6180   m_BrowseButtonPntr->GetPreferredSize (&Width, &Height);
6181   m_BrowseButtonPntr->MoveTo (X - Width, TempRect.top);
6182   X -= Width + g_MarginBetweenControls;
6183 
6184   /* Fill the rest of the space on the first row with the file name box. */
6185 
6186   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6187   TempRect = Bounds ();
6188   TempRect.top = RowTop + Margin;
6189   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6190   TempRect.right = X;
6191 
6192   StringPntr = "Word Database:";
6193   strcpy (m_DatabaseFileNameCachedValue, "Unknown...");
6194   m_DatabaseFileNameTextboxPntr = new BTextControl (TempRect,
6195     "File Name",
6196     StringPntr /* label */,
6197     m_DatabaseFileNameCachedValue /* text */,
6198     new BMessage (MSG_DATABASE_NAME),
6199     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP,
6200     B_WILL_DRAW | B_NAVIGABLE | B_NAVIGABLE_JUMP);
6201   AddChild (m_DatabaseFileNameTextboxPntr);
6202   m_DatabaseFileNameTextboxPntr->SetTarget (this);
6203   m_DatabaseFileNameTextboxPntr->SetDivider (
6204     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6205 
6206   /* Second row contains the purge age, and a long line explaining it.  There
6207   is space to the right where the top half of the big purge button will go. */
6208 
6209   RowTop += RowHeight /* previous row's RowHeight */;
6210   BigPurgeButtonTop = RowTop;
6211   TempRect = Bounds ();
6212   X = TempRect.left;
6213   RowHeight = g_TextBoxHeight;
6214   RowHeight = ceilf (RowHeight * 1.1);
6215 
6216   StringPntr = "Number of occurrences needed to store a word:";
6217   m_PurgeAgeCachedValue = 12345678;
6218 
6219   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6220   TempRect.top = RowTop + Margin;
6221   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6222   TempRect.left = X;
6223   TempRect.right = TempRect.left +
6224     be_plain_font->StringWidth (StringPntr) +
6225     be_plain_font->StringWidth (EightDigitsString) +
6226     3 * g_MarginBetweenControls;
6227 
6228   sprintf (TempString, "%d", (int) m_PurgeAgeCachedValue);
6229   m_PurgeAgeTextboxPntr = new BTextControl (TempRect,
6230     "Purge Age",
6231     StringPntr /* label */,
6232     TempString /* text */,
6233     new BMessage (MSG_PURGE_AGE),
6234     B_FOLLOW_LEFT | B_FOLLOW_TOP,
6235     B_WILL_DRAW | B_NAVIGABLE);
6236   AddChild (m_PurgeAgeTextboxPntr);
6237   m_PurgeAgeTextboxPntr->SetTarget (this);
6238   m_PurgeAgeTextboxPntr->SetDivider (
6239     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6240 
6241   /* Third row contains the purge popularity and bottom half of the purge
6242   button. */
6243 
6244   RowTop += RowHeight /* previous row's RowHeight */;
6245   TempRect = Bounds ();
6246   X = TempRect.left;
6247   RowHeight = g_TextBoxHeight;
6248   RowHeight = ceilf (RowHeight * 1.1);
6249 
6250   StringPntr = "Number of messages to store words from:";
6251   m_PurgePopularityCachedValue = 87654321;
6252   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6253   TempRect.top = RowTop + Margin;
6254   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6255   TempRect.left = X;
6256   TempRect.right = TempRect.left +
6257     be_plain_font->StringWidth (StringPntr) +
6258     be_plain_font->StringWidth (EightDigitsString) +
6259     3 * g_MarginBetweenControls;
6260   X = TempRect.right + g_MarginBetweenControls;
6261 
6262   sprintf (TempString, "%d", (int) m_PurgePopularityCachedValue);
6263   m_PurgePopularityTextboxPntr = new BTextControl (TempRect,
6264     "Purge Popularity",
6265     StringPntr /* label */,
6266     TempString /* text */,
6267     new BMessage (MSG_PURGE_POPULARITY),
6268     B_FOLLOW_LEFT | B_FOLLOW_TOP,
6269     B_WILL_DRAW | B_NAVIGABLE);
6270   AddChild (m_PurgePopularityTextboxPntr);
6271   m_PurgePopularityTextboxPntr->SetTarget (this);
6272   m_PurgePopularityTextboxPntr->SetDivider (
6273     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6274 
6275   /* Make the purge button, which will take up space in the 2nd and 3rd rows,
6276   on the right side.  Twice as tall as a regular button too. */
6277 
6278   StringPntr = "Remove Old Words";
6279   Margin = ceilf ((((RowTop + RowHeight) - BigPurgeButtonTop) -
6280     2 * g_TextBoxHeight) / 2);
6281   TempRect.top = BigPurgeButtonTop + Margin;
6282   TempRect.bottom = TempRect.top + 2 * g_TextBoxHeight;
6283   TempRect.left = X;
6284   TempRect.right = X + ceilf (2 * be_plain_font->StringWidth (StringPntr));
6285 
6286   CommandMessage.MakeEmpty ();
6287   CommandMessage.what = B_EXECUTE_PROPERTY;
6288   CommandMessage.AddSpecifier (g_PropertyNames[PN_PURGE]);
6289   m_PurgeButtonPntr = new BButton (TempRect, "Purge Button",
6290     StringPntr, new BMessage (CommandMessage), B_FOLLOW_LEFT | B_FOLLOW_TOP);
6291   if (m_PurgeButtonPntr == NULL) goto ErrorExit;
6292   m_PurgeButtonPntr->ResizeToPreferred();
6293   AddChild (m_PurgeButtonPntr);
6294   m_PurgeButtonPntr->SetTarget (be_app);
6295 
6296   /* The fourth row contains the ignore previous classification checkbox. */
6297 
6298   RowTop += RowHeight /* previous row's RowHeight */;
6299   TempRect = Bounds ();
6300   X = TempRect.left;
6301   RowHeight = g_CheckBoxHeight;
6302   RowHeight = ceilf (RowHeight * 1.1);
6303 
6304   StringPntr = "Allow Retraining on a Message";
6305   m_IgnorePreviousClassCachedValue = false;
6306 
6307   Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
6308   TempRect.top = RowTop + Margin;
6309   TempRect.bottom = TempRect.top + g_CheckBoxHeight;
6310   TempRect.left = X;
6311   m_IgnorePreviousClassCheckboxPntr = new BCheckBox (TempRect,
6312     "Ignore Check",
6313     StringPntr,
6314     new BMessage (MSG_IGNORE_CLASSIFICATION),
6315     B_FOLLOW_TOP | B_FOLLOW_LEFT);
6316   if (m_IgnorePreviousClassCheckboxPntr == NULL) goto ErrorExit;
6317   AddChild (m_IgnorePreviousClassCheckboxPntr);
6318   m_IgnorePreviousClassCheckboxPntr->SetTarget (this);
6319   m_IgnorePreviousClassCheckboxPntr->ResizeToPreferred ();
6320   m_IgnorePreviousClassCheckboxPntr->GetPreferredSize (&Width, &Height);
6321   X += Width + g_MarginBetweenControls;
6322 
6323   /* The fifth row contains the server mode checkbox. */
6324 
6325   RowTop += RowHeight /* previous row's RowHeight */;
6326   TempRect = Bounds ();
6327   RowHeight = g_CheckBoxHeight;
6328   RowHeight = ceilf (RowHeight * 1.1);
6329 
6330   StringPntr = "Print errors to Terminal";
6331   m_ServerModeCachedValue = false;
6332 
6333   Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
6334   TempRect.top = RowTop + Margin;
6335   TempRect.bottom = TempRect.top + g_CheckBoxHeight;
6336   m_ServerModeCheckboxPntr = new BCheckBox (TempRect,
6337     "ServerMode Check",
6338     StringPntr,
6339     new BMessage (MSG_SERVER_MODE),
6340     B_FOLLOW_TOP | B_FOLLOW_LEFT);
6341   if (m_ServerModeCheckboxPntr == NULL) goto ErrorExit;
6342   AddChild (m_ServerModeCheckboxPntr);
6343   m_ServerModeCheckboxPntr->SetTarget (this);
6344   m_ServerModeCheckboxPntr->ResizeToPreferred ();
6345   m_ServerModeCheckboxPntr->GetPreferredSize (&Width, &Height);
6346 
6347   /* This row just contains a huge pop-up menu which shows the tokenize mode
6348   and an explanation of what each mode does. */
6349 
6350   RowTop += RowHeight /* previous row's RowHeight */;
6351   TempRect = Bounds ();
6352   RowHeight = g_PopUpMenuHeight;
6353   RowHeight = ceilf (RowHeight * 1.1);
6354 
6355   Margin = ceilf ((RowHeight - g_PopUpMenuHeight) / 2);
6356   TempRect.top = RowTop + Margin;
6357   TempRect.bottom = TempRect.top + g_PopUpMenuHeight;
6358 
6359   m_TokenizeModeCachedValue = TM_MAX; /* Illegal value will force redraw. */
6360   m_TokenizeModeMenuBarPntr = new BMenuBar (TempRect, "TokenizeModeMenuBar",
6361     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
6362     false /* resize to fit items */);
6363   if (m_TokenizeModeMenuBarPntr == NULL) goto ErrorExit;
6364   m_TokenizeModePopUpMenuPntr = new BPopUpMenu ("TokenizeModePopUpMenu");
6365   if (m_TokenizeModePopUpMenuPntr == NULL) goto ErrorExit;
6366 
6367   for (TokenizeMode = (TokenizeModes) 0;
6368   TokenizeMode < TM_MAX;
6369   TokenizeMode = (TokenizeModes) ((int) TokenizeMode + 1))
6370   {
6371     /* Each different tokenize mode gets its own menu item.  Selecting the item
6372     will send a canned command to the application to switch to the appropriate
6373     tokenize mode.  An optional explanation of each mode is added to the mode
6374     name string. */
6375 
6376     CommandMessage.MakeEmpty ();
6377     CommandMessage.what = B_SET_PROPERTY;
6378     CommandMessage.AddSpecifier (g_PropertyNames[PN_TOKENIZE_MODE]);
6379     CommandMessage.AddString (g_DataName, g_TokenizeModeNames[TokenizeMode]);
6380     strcpy (TempString, g_TokenizeModeNames[TokenizeMode]);
6381     switch (TokenizeMode)
6382     {
6383       case TM_WHOLE:
6384         strcat (TempString, " - Scan everything");
6385         break;
6386 
6387       case TM_PLAIN_TEXT:
6388         strcat (TempString, " - Scan e-mail body text except rich text");
6389         break;
6390 
6391       case TM_PLAIN_TEXT_HEADER:
6392         strcat (TempString, " - Scan entire e-mail text except rich text");
6393         break;
6394 
6395       case TM_ANY_TEXT:
6396         strcat (TempString, " - Scan e-mail body text and text attachments");
6397         break;
6398 
6399       case TM_ANY_TEXT_HEADER:
6400        strcat (TempString, " - Scan entire e-mail text and text attachments (recommended)");
6401         break;
6402 
6403       case TM_ALL_PARTS:
6404         strcat (TempString, " - Scan e-mail body and all attachments");
6405         break;
6406 
6407       case TM_ALL_PARTS_HEADER:
6408         strcat (TempString, " - Scan all parts of the e-mail");
6409         break;
6410 
6411       case TM_JUST_HEADER:
6412         strcat (TempString, " - Scan just the header (mail routing information)");
6413         break;
6414 
6415       default:
6416         break;
6417     }
6418     TempMenuItemPntr =
6419       new BMenuItem (TempString, new BMessage (CommandMessage));
6420     if (TempMenuItemPntr == NULL) goto ErrorExit;
6421     TempMenuItemPntr->SetTarget (be_app);
6422     m_TokenizeModePopUpMenuPntr->AddItem (TempMenuItemPntr);
6423   }
6424   m_TokenizeModeMenuBarPntr->AddItem (m_TokenizeModePopUpMenuPntr);
6425   AddChild (m_TokenizeModeMenuBarPntr);
6426 
6427   /* This row just contains a huge pop-up menu which shows the scoring mode
6428   and an explanation of what each mode does. */
6429 
6430   RowTop += RowHeight /* previous row's RowHeight */;
6431   TempRect = Bounds ();
6432   RowHeight = g_PopUpMenuHeight;
6433   RowHeight = ceilf (RowHeight * 1.1);
6434 
6435   Margin = ceilf ((RowHeight - g_PopUpMenuHeight) / 2);
6436   TempRect.top = RowTop + Margin;
6437   TempRect.bottom = TempRect.top + g_PopUpMenuHeight;
6438 
6439   m_ScoringModeCachedValue = SM_MAX; /* Illegal value will force redraw. */
6440   m_ScoringModeMenuBarPntr = new BMenuBar (TempRect, "ScoringModeMenuBar",
6441     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
6442     false /* resize to fit items */);
6443   if (m_ScoringModeMenuBarPntr == NULL) goto ErrorExit;
6444   m_ScoringModePopUpMenuPntr = new BPopUpMenu ("ScoringModePopUpMenu");
6445   if (m_ScoringModePopUpMenuPntr == NULL) goto ErrorExit;
6446 
6447   for (ScoringMode = (ScoringModes) 0;
6448   ScoringMode < SM_MAX;
6449   ScoringMode = (ScoringModes) ((int) ScoringMode + 1))
6450   {
6451     /* Each different scoring mode gets its own menu item.  Selecting the item
6452     will send a canned command to the application to switch to the appropriate
6453     scoring mode.  An optional explanation of each mode is added to the mode
6454     name string. */
6455 
6456     CommandMessage.MakeEmpty ();
6457     CommandMessage.what = B_SET_PROPERTY;
6458     CommandMessage.AddSpecifier (g_PropertyNames[PN_SCORING_MODE]);
6459     CommandMessage.AddString (g_DataName, g_ScoringModeNames[ScoringMode]);
6460 /*
6461     strcpy (TempString, g_ScoringModeNames[ScoringMode]);
6462     switch (ScoringMode)
6463     {
6464       case SM_ROBINSON:
6465         strcat (TempString, " - Learning Method 1: Naive Bayesian");
6466         break;
6467 
6468       case SM_CHISQUARED:
6469         strcat (TempString, " - Learning Method 2: Chi-Squared");
6470         break;
6471 
6472       default:
6473         break;
6474     }
6475 */
6476     switch (ScoringMode)
6477     {
6478       case SM_ROBINSON:
6479         strcpy (TempString, "Learning method 1: Naive Bayesian");
6480         break;
6481 
6482       case SM_CHISQUARED:
6483         strcpy (TempString, "Learning method 2: Chi-Squared");
6484         break;
6485 
6486       default:
6487         break;
6488     }
6489     TempMenuItemPntr =
6490       new BMenuItem (TempString, new BMessage (CommandMessage));
6491     if (TempMenuItemPntr == NULL) goto ErrorExit;
6492     TempMenuItemPntr->SetTarget (be_app);
6493     m_ScoringModePopUpMenuPntr->AddItem (TempMenuItemPntr);
6494   }
6495   m_ScoringModeMenuBarPntr->AddItem (m_ScoringModePopUpMenuPntr);
6496   AddChild (m_ScoringModeMenuBarPntr);
6497 
6498   /* The next row has the install MIME types button and the reset to defaults
6499   button, one on the left and the other on the right. */
6500 
6501   RowTop += RowHeight /* previous row's RowHeight */;
6502   TempRect = Bounds ();
6503   RowHeight = g_ButtonHeight;
6504   RowHeight = ceilf (RowHeight * 1.1);
6505 
6506   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6507   TempRect.top = RowTop + Margin;
6508   TempRect.bottom = TempRect.top + g_ButtonHeight;
6509 
6510   CommandMessage.MakeEmpty ();
6511   CommandMessage.what = B_EXECUTE_PROPERTY;
6512   CommandMessage.AddSpecifier (g_PropertyNames[PN_INSTALL_THINGS]);
6513   m_InstallThingsButtonPntr = new BButton (TempRect, "Install Button",
6514     "Install spam types",
6515     new BMessage (CommandMessage),
6516     B_FOLLOW_LEFT | B_FOLLOW_TOP);
6517   if (m_InstallThingsButtonPntr == NULL) goto ErrorExit;
6518   AddChild (m_InstallThingsButtonPntr);
6519   m_InstallThingsButtonPntr->SetTarget (be_app);
6520   m_InstallThingsButtonPntr->ResizeToPreferred ();
6521 
6522   /* The Reset to Defaults button.  On the right side of the row. */
6523 
6524   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6525   TempRect = Bounds ();
6526   TempRect.top = RowTop + Margin;
6527   TempRect.bottom = TempRect.top + g_ButtonHeight;
6528 
6529   CommandMessage.MakeEmpty ();
6530   CommandMessage.what = B_EXECUTE_PROPERTY;
6531   CommandMessage.AddSpecifier (g_PropertyNames[PN_RESET_TO_DEFAULTS]);
6532   m_ResetToDefaultsButtonPntr = new BButton (TempRect, "Reset Button",
6533     "Default settings", new BMessage (CommandMessage),
6534     B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6535   if (m_ResetToDefaultsButtonPntr == NULL) goto ErrorExit;
6536   AddChild (m_ResetToDefaultsButtonPntr);
6537   m_ResetToDefaultsButtonPntr->SetTarget (be_app);
6538   m_ResetToDefaultsButtonPntr->ResizeToPreferred ();
6539   m_ResetToDefaultsButtonPntr->GetPreferredSize (&Width, &Height);
6540   m_ResetToDefaultsButtonPntr->MoveTo (TempRect.right - Width, TempRect.top);
6541 
6542   /* The next row contains the Estimate, Add Examples and About buttons. */
6543 
6544   RowTop += RowHeight /* previous row's RowHeight */;
6545   TempRect = Bounds ();
6546   X = TempRect.left;
6547   RowHeight = g_ButtonHeight;
6548   RowHeight = ceilf (RowHeight * 1.1);
6549 
6550   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6551   TempRect.top = RowTop + Margin;
6552   TempRect.bottom = TempRect.top + g_ButtonHeight;
6553   TempRect.left = X;
6554 
6555   m_EstimateSpamButtonPntr = new BButton (TempRect, "Estimate Button",
6556     "Scan a message",
6557     new BMessage (MSG_ESTIMATE_BUTTON),
6558     B_FOLLOW_LEFT | B_FOLLOW_TOP);
6559   if (m_EstimateSpamButtonPntr == NULL) goto ErrorExit;
6560   AddChild (m_EstimateSpamButtonPntr);
6561   m_EstimateSpamButtonPntr->SetTarget (this);
6562   m_EstimateSpamButtonPntr->ResizeToPreferred ();
6563   X = m_EstimateSpamButtonPntr->Frame().right + g_MarginBetweenControls;
6564 
6565   /* The Add Example button in the middle.  Does the same as the browse button,
6566   but don't tell anyone that! */
6567 
6568   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6569   TempRect.top = RowTop + Margin;
6570   TempRect.bottom = TempRect.top + g_ButtonHeight;
6571   TempRect.left = X;
6572 
6573   m_AddExampleButtonPntr = new BButton (TempRect, "Example Button",
6574     "Train spam filter on a message",
6575     new BMessage (MSG_BROWSE_BUTTON),
6576     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP,
6577     B_WILL_DRAW | B_NAVIGABLE | B_FULL_UPDATE_ON_RESIZE);
6578   if (m_AddExampleButtonPntr == NULL) goto ErrorExit;
6579   AddChild (m_AddExampleButtonPntr);
6580   m_AddExampleButtonPntr->SetTarget (this);
6581   m_AddExampleButtonPntr->ResizeToPreferred ();
6582   X = m_AddExampleButtonPntr->Frame().right + g_MarginBetweenControls;
6583 
6584   /* Add the About button on the right. */
6585 
6586   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6587   TempRect = Bounds ();
6588   TempRect.top = RowTop + Margin;
6589   TempRect.bottom = TempRect.top + g_ButtonHeight;
6590   TempRect.left = X;
6591 
6592   m_AboutButtonPntr = new BButton (TempRect, "About Button",
6593     "About…",
6594     new BMessage (B_ABOUT_REQUESTED),
6595     B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6596   if (m_AboutButtonPntr == NULL) goto ErrorExit;
6597   AddChild (m_AboutButtonPntr);
6598   m_AboutButtonPntr->SetTarget (be_app);
6599 
6600   /* This row displays various counters.  Starting with the genuine messages
6601   count on the left. */
6602 
6603   RowTop += RowHeight /* previous row's RowHeight */;
6604   TempRect = Bounds ();
6605   RowHeight = g_TextBoxHeight;
6606   RowHeight = ceilf (RowHeight * 1.1);
6607 
6608   StringPntr = "Genuine messages:";
6609   m_GenuineCountCachedValue = 87654321;
6610   sprintf (TempString, "%d", (int) m_GenuineCountCachedValue);
6611 
6612   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6613   TempRect = Bounds ();
6614   TempRect.top = RowTop + Margin;
6615   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6616   TempRect.right = TempRect.left +
6617     be_plain_font->StringWidth (StringPntr) +
6618     be_plain_font->StringWidth (TempString) +
6619     3 * g_MarginBetweenControls;
6620 
6621   m_GenuineCountTextboxPntr = new BTextControl (TempRect,
6622     "Genuine count",
6623     StringPntr /* label */,
6624     TempString /* text */,
6625     NULL /* no message */,
6626     B_FOLLOW_LEFT | B_FOLLOW_TOP,
6627     B_WILL_DRAW /* not B_NAVIGABLE */);
6628   AddChild (m_GenuineCountTextboxPntr);
6629   m_GenuineCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6630   m_GenuineCountTextboxPntr->SetDivider (
6631     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6632   m_GenuineCountTextboxPntr->SetEnabled (false); /* For display only. */
6633 
6634   /* The word count in the center. */
6635 
6636   StringPntr = "Word count:";
6637   m_WordCountCachedValue = 87654321;
6638   sprintf (TempString, "%d", (int) m_WordCountCachedValue);
6639 
6640   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6641   TempRect = Bounds ();
6642   TempRect.top = RowTop + Margin;
6643   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6644   Width = be_plain_font->StringWidth (StringPntr) +
6645     be_plain_font->StringWidth (TempString) +
6646     3 * g_MarginBetweenControls;
6647   TempRect.left = ceilf ((TempRect.right - TempRect.left) / 2 - Width / 2);
6648   TempRect.right = TempRect.left + Width;
6649 
6650   m_WordCountTextboxPntr = new BTextControl (TempRect,
6651     "Word count",
6652     StringPntr /* label */,
6653     TempString /* text */,
6654     NULL /* no message */,
6655     B_FOLLOW_H_CENTER | B_FOLLOW_TOP,
6656     B_WILL_DRAW /* not B_NAVIGABLE */);
6657   AddChild (m_WordCountTextboxPntr);
6658   m_WordCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6659   m_WordCountTextboxPntr->SetDivider (
6660     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6661   m_WordCountTextboxPntr->SetEnabled (false); /* For display only. */
6662 
6663   /* The spam count on the far right. */
6664 
6665   StringPntr = "Spam messages:";
6666   m_SpamCountCachedValue = 87654321;
6667   sprintf (TempString, "%d", (int) m_SpamCountCachedValue);
6668 
6669   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6670   TempRect = Bounds ();
6671   TempRect.top = RowTop + Margin;
6672   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6673   TempRect.left = TempRect.right -
6674     be_plain_font->StringWidth (StringPntr) -
6675     be_plain_font->StringWidth (TempString) -
6676     3 * g_MarginBetweenControls;
6677 
6678   m_SpamCountTextboxPntr = new BTextControl (TempRect,
6679     "Spam count",
6680     StringPntr /* label */,
6681     TempString /* text */,
6682     NULL /* no message */,
6683     B_FOLLOW_RIGHT | B_FOLLOW_TOP,
6684     B_WILL_DRAW /* not B_NAVIGABLE */);
6685   AddChild (m_SpamCountTextboxPntr);
6686   m_SpamCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6687   m_SpamCountTextboxPntr->SetDivider (
6688     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6689   m_SpamCountTextboxPntr->SetEnabled (false); /* For display only. */
6690 
6691   /* Change the size of our view so it only takes up the space needed by the
6692   buttons. */
6693 
6694   RowTop += RowHeight /* previous row's RowHeight */;
6695   ResizeTo (Bounds().Width(), RowTop - Bounds().top + 1);
6696 
6697   return; /* Successful. */
6698 
6699 ErrorExit:
6700   DisplayErrorMessage ("Unable to initialise the controls view.");
6701 }
6702 
6703 
6704 void
6705 ControlsView::BrowseForDatabaseFile ()
6706 {
6707   if (m_BrowseFilePanelPntr == NULL)
6708   {
6709     BEntry      DirectoryEntry;
6710     entry_ref   DirectoryEntryRef;
6711     BMessage    GetDatabasePathCommand;
6712     BMessage    GetDatabasePathResult;
6713     const char *StringPntr = NULL;
6714 
6715     /* Create a new file panel.  First set up the entry ref stuff so that the
6716     file panel can open to show the initial directory (the one where the
6717     database file currently is).  Note that we have to create it after the
6718     window and view are up and running, otherwise the BMessenger won't point to
6719     a valid looper/handler.  First find out the current database file name to
6720     use as a starting point. */
6721 
6722     GetDatabasePathCommand.what = B_GET_PROPERTY;
6723     GetDatabasePathCommand.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
6724     be_app_messenger.SendMessage (&GetDatabasePathCommand,
6725       &GetDatabasePathResult, 5000000 /* delivery timeout */,
6726       5000000 /* reply timeout */);
6727     if (GetDatabasePathResult.FindString (g_ResultName, &StringPntr) != B_OK ||
6728     DirectoryEntry.SetTo (StringPntr) != B_OK ||
6729     DirectoryEntry.GetParent (&DirectoryEntry) != B_OK)
6730       DirectoryEntry.SetTo ("."); /* Default directory if we can't find it. */
6731     if (DirectoryEntry.GetRef (&DirectoryEntryRef) != B_OK)
6732     {
6733       DisplayErrorMessage (
6734         "Unable to set up the file requestor starting directory.  Sorry.");
6735       return;
6736     }
6737 
6738     m_BrowseFilePanelPntr = new BFilePanel (
6739       B_OPEN_PANEL /* mode */,
6740       &be_app_messenger /* target for event messages */,
6741       &DirectoryEntryRef /* starting directory */,
6742       B_FILE_NODE,
6743       true /* true for multiple selections */,
6744       NULL /* canned message */,
6745       NULL /* ref filter */,
6746       false /* true for modal */,
6747       true /* true to hide when done */);
6748   }
6749 
6750   if (m_BrowseFilePanelPntr != NULL)
6751     m_BrowseFilePanelPntr->Show (); /* Answer returned later in RefsReceived. */
6752 }
6753 
6754 
6755 void
6756 ControlsView::BrowseForFileToEstimate ()
6757 {
6758   if (m_EstimateSpamFilePanelPntr == NULL)
6759   {
6760     BEntry      DirectoryEntry;
6761     entry_ref   DirectoryEntryRef;
6762     status_t    ErrorCode;
6763     BMessenger  MessengerToSelf (this);
6764     BPath       PathToMailDirectory;
6765 
6766     /* Create a new file panel.  First set up the entry ref stuff so that the
6767     file panel can open to show the initial directory (the user's mail
6768     directory).  Note that we have to create the panel after the window and
6769     view are up and running, otherwise the BMessenger won't point to a valid
6770     looper/handler. */
6771 
6772     ErrorCode = find_directory (B_USER_DIRECTORY, &PathToMailDirectory);
6773     if (ErrorCode == B_OK)
6774     {
6775       PathToMailDirectory.Append ("mail");
6776       ErrorCode = DirectoryEntry.SetTo (PathToMailDirectory.Path(),
6777         true /* traverse symbolic links*/);
6778       if (ErrorCode != B_OK || !DirectoryEntry.Exists ())
6779       {
6780         /* If no mail directory, try home directory. */
6781         find_directory (B_USER_DIRECTORY, &PathToMailDirectory);
6782         ErrorCode = DirectoryEntry.SetTo (PathToMailDirectory.Path(), true);
6783       }
6784     }
6785     if (ErrorCode != B_OK)
6786       PathToMailDirectory.SetTo (".");
6787 
6788     DirectoryEntry.SetTo (PathToMailDirectory.Path(), true);
6789     if (DirectoryEntry.GetRef (&DirectoryEntryRef) != B_OK)
6790     {
6791       DisplayErrorMessage (
6792         "Unable to set up the file requestor starting directory.  Sorry.");
6793       return;
6794     }
6795 
6796     m_EstimateSpamFilePanelPntr = new BFilePanel (
6797       B_OPEN_PANEL /* mode */,
6798       &MessengerToSelf /* target for event messages */,
6799       &DirectoryEntryRef /* starting directory */,
6800       B_FILE_NODE,
6801       true /* true for multiple selections */,
6802       new BMessage (MSG_ESTIMATE_FILE_REFS) /* canned message */,
6803       NULL /* ref filter */,
6804       false /* true for modal */,
6805       true /* true to hide when done */);
6806   }
6807 
6808   if (m_EstimateSpamFilePanelPntr != NULL)
6809     m_EstimateSpamFilePanelPntr->Show (); /* Answer sent via a message. */
6810 }
6811 
6812 
6813 /* The display has been resized.  Have to manually adjust the popup menu bar to
6814 show the new size (the sub-items need to be resized too).  Then make it redraw.
6815 Well, actually just resetting the mark on the current item will resize it
6816 properly. */
6817 
6818 void
6819 ControlsView::FrameResized (float, float)
6820 {
6821   m_ScoringModeCachedValue = SM_MAX; /* Force it to reset the mark. */
6822   m_TokenizeModeCachedValue = TM_MAX; /* Force it to reset the mark. */
6823 }
6824 
6825 
6826 void
6827 ControlsView::MessageReceived (BMessage *MessagePntr)
6828 {
6829   BMessage CommandMessage;
6830   bool     TempBool;
6831   uint32   TempUint32;
6832 
6833   switch (MessagePntr->what)
6834   {
6835     case MSG_BROWSE_BUTTON:
6836       BrowseForDatabaseFile ();
6837       break;
6838 
6839     case MSG_DATABASE_NAME:
6840       if (strcmp (m_DatabaseFileNameCachedValue,
6841       m_DatabaseFileNameTextboxPntr->Text ()) != 0)
6842         SubmitCommandString (PN_DATABASE_FILE, B_SET_PROPERTY,
6843         m_DatabaseFileNameTextboxPntr->Text ());
6844       break;
6845 
6846     case MSG_ESTIMATE_BUTTON:
6847       BrowseForFileToEstimate ();
6848       break;
6849 
6850     case MSG_ESTIMATE_FILE_REFS:
6851       EstimateRefFilesAndDisplay (MessagePntr);
6852       break;
6853 
6854     case MSG_IGNORE_CLASSIFICATION:
6855       TempBool = (m_IgnorePreviousClassCheckboxPntr->Value() == B_CONTROL_ON);
6856       if (m_IgnorePreviousClassCachedValue != TempBool)
6857         SubmitCommandBool (PN_IGNORE_PREVIOUS_CLASSIFICATION,
6858         B_SET_PROPERTY, TempBool);
6859       break;
6860 
6861     case MSG_PURGE_AGE:
6862       TempUint32 = strtoul (m_PurgeAgeTextboxPntr->Text (), NULL, 10);
6863       if (m_PurgeAgeCachedValue != TempUint32)
6864         SubmitCommandInt32 (PN_PURGE_AGE, B_SET_PROPERTY, TempUint32);
6865       break;
6866 
6867     case MSG_PURGE_POPULARITY:
6868       TempUint32 = strtoul (m_PurgePopularityTextboxPntr->Text (), NULL, 10);
6869       if (m_PurgePopularityCachedValue != TempUint32)
6870         SubmitCommandInt32 (PN_PURGE_POPULARITY, B_SET_PROPERTY, TempUint32);
6871       break;
6872 
6873     case MSG_SERVER_MODE:
6874       TempBool = (m_ServerModeCheckboxPntr->Value() == B_CONTROL_ON);
6875       if (m_ServerModeCachedValue != TempBool)
6876         SubmitCommandBool (PN_SERVER_MODE, B_SET_PROPERTY, TempBool);
6877       break;
6878 
6879     default:
6880       BView::MessageReceived (MessagePntr);
6881   }
6882 }
6883 
6884 
6885 /* Check the server for changes in the state of the database, and if there are
6886 any changes, update the displayed values.  Since this is a read only
6887 examination of the server, we go directly to the application rather than
6888 sending it messages.  Also, when sending messages, we can't find out what it is
6889 doing while it is busy with a batch of spam additions (all the spam add
6890 commands will be in the queue ahead of our requests for info).  Instead, we
6891 lock the BApplication (so it isn't changing things while we're looking) and
6892 retrieve our values. */
6893 
6894 void
6895 ControlsView::PollServerForChanges ()
6896 {
6897   ABSApp     *MyAppPntr;
6898   BMenuItem  *TempMenuItemPntr;
6899   char        TempString [PATH_MAX];
6900   BWindow    *WindowPntr;
6901 
6902   /* We need a pointer to our window, for changing the title etc. */
6903 
6904   WindowPntr = Window ();
6905   if (WindowPntr == NULL)
6906     return; /* No window, no point in updating the display! */
6907 
6908   /* Check the server mode flag.  If the mode is off, then the window has to be
6909   minimized.  Similarly, if it gets turned on, maximize the window.  Note that
6910   the user can maximize the window manually, even while still in server mode.
6911   */
6912 
6913   if (g_ServerMode != m_ServerModeCachedValue &&
6914   m_ServerModeCheckboxPntr != NULL)
6915   {
6916     m_ServerModeCachedValue = g_ServerMode;
6917     m_ServerModeCheckboxPntr->SetValue (
6918       m_ServerModeCachedValue ? B_CONTROL_ON : B_CONTROL_OFF);
6919     WindowPntr->Minimize (m_ServerModeCachedValue);
6920   }
6921 
6922   if (WindowPntr->IsMinimized ())
6923     return; /* Window isn't visible, don't waste time updating it. */
6924 
6925   /* So that people don't stare at a blank screen, request a database load if
6926   nothing is there.  But only do it once, so the user doesn't get a lot of
6927   invalid database messages if one doesn't exist yet.  In server mode, we never
6928   get this far so it is only loaded when the user wants to see something. */
6929 
6930   if (!m_DatabaseLoadDone)
6931   {
6932     m_DatabaseLoadDone = true;
6933     /* Counting the number of words will load the database. */
6934     SubmitCommandString (PN_DATABASE_FILE, B_COUNT_PROPERTIES, "");
6935   }
6936 
6937   /* Check various read only values, which can be read from the BApplication
6938   without having to lock it.  This is useful for displaying the number of words
6939   as it is changing.  First up is the purge age setting. */
6940 
6941   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
6942   if (MyAppPntr == NULL)
6943     return; /* Doesn't exist or is the wrong class.  Not likely! */
6944 
6945   if (MyAppPntr->m_PurgeAge != m_PurgeAgeCachedValue &&
6946   m_PurgeAgeTextboxPntr != NULL)
6947   {
6948     m_PurgeAgeCachedValue = MyAppPntr->m_PurgeAge;
6949     sprintf (TempString, "%lu", m_PurgeAgeCachedValue);
6950     m_PurgeAgeTextboxPntr->SetText (TempString);
6951   }
6952 
6953   /* Check the purge popularity. */
6954 
6955   if (MyAppPntr->m_PurgePopularity != m_PurgePopularityCachedValue &&
6956   m_PurgePopularityTextboxPntr != NULL)
6957   {
6958     m_PurgePopularityCachedValue = MyAppPntr->m_PurgePopularity;
6959     sprintf (TempString, "%lu", m_PurgePopularityCachedValue);
6960     m_PurgePopularityTextboxPntr->SetText (TempString);
6961   }
6962 
6963   /* Check the Ignore Previous Classification flag. */
6964 
6965   if (MyAppPntr->m_IgnorePreviousClassification !=
6966   m_IgnorePreviousClassCachedValue &&
6967   m_IgnorePreviousClassCheckboxPntr != NULL)
6968   {
6969     m_IgnorePreviousClassCachedValue =
6970       MyAppPntr->m_IgnorePreviousClassification;
6971     m_IgnorePreviousClassCheckboxPntr->SetValue (
6972       m_IgnorePreviousClassCachedValue ? B_CONTROL_ON : B_CONTROL_OFF);
6973   }
6974 
6975   /* Update the genuine count. */
6976 
6977   if (MyAppPntr->m_TotalGenuineMessages != m_GenuineCountCachedValue &&
6978   m_GenuineCountTextboxPntr != NULL)
6979   {
6980     m_GenuineCountCachedValue = MyAppPntr->m_TotalGenuineMessages;
6981     sprintf (TempString, "%lu", m_GenuineCountCachedValue);
6982     m_GenuineCountTextboxPntr->SetText (TempString);
6983   }
6984 
6985   /* Update the spam count. */
6986 
6987   if (MyAppPntr->m_TotalSpamMessages != m_SpamCountCachedValue &&
6988   m_SpamCountTextboxPntr != NULL)
6989   {
6990     m_SpamCountCachedValue = MyAppPntr->m_TotalSpamMessages;
6991     sprintf (TempString, "%lu", m_SpamCountCachedValue);
6992     m_SpamCountTextboxPntr->SetText (TempString);
6993   }
6994 
6995   /* Update the word count. */
6996 
6997   if (MyAppPntr->m_WordCount != m_WordCountCachedValue &&
6998   m_WordCountTextboxPntr != NULL)
6999   {
7000     m_WordCountCachedValue = MyAppPntr->m_WordCount;
7001     sprintf (TempString, "%lu", m_WordCountCachedValue);
7002     m_WordCountTextboxPntr->SetText (TempString);
7003   }
7004 
7005   /* Update the tokenize mode pop-up menu. */
7006 
7007   if (MyAppPntr->m_TokenizeMode != m_TokenizeModeCachedValue &&
7008   m_TokenizeModePopUpMenuPntr != NULL)
7009   {
7010     m_TokenizeModeCachedValue = MyAppPntr->m_TokenizeMode;
7011     TempMenuItemPntr =
7012       m_TokenizeModePopUpMenuPntr->ItemAt ((int) m_TokenizeModeCachedValue);
7013     if (TempMenuItemPntr != NULL)
7014       TempMenuItemPntr->SetMarked (true);
7015   }
7016 
7017   /* Update the scoring mode pop-up menu. */
7018 
7019   if (MyAppPntr->m_ScoringMode != m_ScoringModeCachedValue &&
7020   m_ScoringModePopUpMenuPntr != NULL)
7021   {
7022     m_ScoringModeCachedValue = MyAppPntr->m_ScoringMode;
7023     TempMenuItemPntr =
7024       m_ScoringModePopUpMenuPntr->ItemAt ((int) m_ScoringModeCachedValue);
7025     if (TempMenuItemPntr != NULL)
7026       TempMenuItemPntr->SetMarked (true);
7027   }
7028 
7029   /* Lock the application.  This will stop it from processing any further
7030   messages until we are done.  Or if it is busy, the lock will fail. */
7031 
7032   if (MyAppPntr->LockWithTimeout (100000) != B_OK)
7033     return; /* It's probably busy doing something. */
7034 
7035   /* See if the database file name has changed. */
7036 
7037   if (strcmp (MyAppPntr->m_DatabaseFileName.String (),
7038   m_DatabaseFileNameCachedValue) != 0 &&
7039   m_DatabaseFileNameTextboxPntr != NULL)
7040   {
7041     strcpy (m_DatabaseFileNameCachedValue,
7042       MyAppPntr->m_DatabaseFileName.String ());
7043     m_DatabaseFileNameTextboxPntr->SetText (m_DatabaseFileNameCachedValue);
7044     WindowPntr->SetTitle (m_DatabaseFileNameCachedValue);
7045   }
7046 
7047   /* Done.  Let the BApplication continue processing messages. */
7048 
7049   MyAppPntr->Unlock ();
7050 }
7051 
7052 
7053 void
7054 ControlsView::Pulse ()
7055 {
7056   if (system_time () > m_TimeOfLastPoll + 200000)
7057   {
7058     PollServerForChanges ();
7059     m_TimeOfLastPoll = system_time ();
7060   }
7061 }
7062 
7063 
7064 
7065 /******************************************************************************
7066  * Implementation of the DatabaseWindow class, constructor, destructor and the
7067  * rest of the member functions in mostly alphabetical order.
7068  */
7069 
7070 DatabaseWindow::DatabaseWindow ()
7071 : BWindow (BRect (30, 30, 620, 400),
7072     "Haiku spam filter server",
7073     B_DOCUMENT_WINDOW, B_ASYNCHRONOUS_CONTROLS)
7074 {
7075   BRect TempRect;
7076 
7077   /* Add the controls view. */
7078 
7079   m_ControlsViewPntr = new ControlsView (Bounds ());
7080   if (m_ControlsViewPntr == NULL)
7081     goto ErrorExit;
7082   AddChild (m_ControlsViewPntr);
7083 
7084   /* Add the word view in the remaining space under the controls view. */
7085 
7086 
7087   TempRect = Bounds ();
7088   TempRect.top = m_ControlsViewPntr->Frame().bottom + 1;
7089   m_WordsViewPntr = new WordsView (TempRect);
7090   if (m_WordsViewPntr == NULL)
7091     goto ErrorExit;
7092   AddChild (m_WordsViewPntr);
7093 
7094  /* Minimize the window if we are starting up in server mode.  This is done
7095 	before the window is open so it doesn't flash onto the screen, and possibly
7096 	steal a keystroke or two.  The ControlsView will further update the minimize
7097 	mode when it detects changes in the server mode. */
7098   Minimize (g_ServerMode);
7099 
7100   return;
7101 
7102 ErrorExit:
7103   DisplayErrorMessage ("Unable to initialise the window contents.");
7104 }
7105 
7106 
7107 void
7108 DatabaseWindow::MessageReceived (BMessage *MessagePntr)
7109 {
7110   if (MessagePntr->what == B_MOUSE_WHEEL_CHANGED)
7111   {
7112     /* Pass the mouse wheel stuff down to the words view, since that's the only
7113     one which does scrolling so we don't need to worry about whether it has
7114     focus or not. */
7115 
7116     if (m_WordsViewPntr != NULL)
7117       m_WordsViewPntr->MessageReceived (MessagePntr);
7118   }
7119   else
7120     BWindow::MessageReceived (MessagePntr);
7121 }
7122 
7123 
7124 bool
7125 DatabaseWindow::QuitRequested ()
7126 {
7127   be_app->PostMessage (B_QUIT_REQUESTED);
7128   return true;
7129 }
7130 
7131 
7132 
7133 /******************************************************************************
7134  * Implementation of the word display view.
7135  */
7136 
7137 WordsView::WordsView (BRect NewBounds)
7138 : BView (NewBounds, "WordsView", B_FOLLOW_ALL_SIDES,
7139     B_WILL_DRAW | B_FULL_UPDATE_ON_RESIZE | B_NAVIGABLE | B_PULSE_NEEDED),
7140   m_ArrowLineDownPntr (NULL),
7141   m_ArrowLineUpPntr (NULL),
7142   m_ArrowPageDownPntr (NULL),
7143   m_ArrowPageUpPntr (NULL),
7144   m_LastTimeAKeyWasPressed (0)
7145 {
7146   font_height TempFontHeight;
7147 
7148   GetFont (&m_TextFont); /* Modify the default font to be our own. */
7149   m_TextFont.SetSize (ceilf (m_TextFont.Size() * 1.1));
7150   m_TextFont.GetHeight (&TempFontHeight);
7151   SetFont (&m_TextFont);
7152 
7153   m_LineHeight = ceilf (TempFontHeight.ascent +
7154     TempFontHeight.descent + TempFontHeight.leading);
7155   m_AscentHeight = ceilf (TempFontHeight.ascent);
7156   m_TextHeight = ceilf (TempFontHeight.ascent +
7157     TempFontHeight.descent);
7158 
7159   m_FocusedColour.red = 255;
7160   m_FocusedColour.green = 255;
7161   m_FocusedColour.blue = 255;
7162   m_FocusedColour.alpha = 255;
7163 
7164   m_UnfocusedColour.red = 245;
7165   m_UnfocusedColour.green = 245;
7166   m_UnfocusedColour.blue = 255;
7167   m_UnfocusedColour.alpha = 255;
7168 
7169   m_BackgroundColour = m_UnfocusedColour;
7170   SetViewColor (m_BackgroundColour);
7171   SetLowColor (m_BackgroundColour);
7172   SetHighColor (0, 0, 0);
7173 
7174   strcpy (m_FirstDisplayedWord, "a");
7175 }
7176 
7177 
7178 void
7179 WordsView::AttachedToWindow ()
7180 {
7181   BPolygon        DownLinePolygon (g_DownLinePoints,
7182                     sizeof (g_DownLinePoints) /
7183                     sizeof (g_DownLinePoints[0]));
7184 
7185   BPolygon        DownPagePolygon (g_DownPagePoints,
7186                     sizeof (g_DownPagePoints) /
7187                     sizeof (g_DownPagePoints[0]));
7188 
7189   BPolygon        UpLinePolygon (g_UpLinePoints,
7190                     sizeof (g_UpLinePoints) /
7191                     sizeof (g_UpLinePoints[0]));
7192 
7193   BPolygon        UpPagePolygon (g_UpPagePoints,
7194                     sizeof (g_UpPagePoints) /
7195                     sizeof (g_UpPagePoints[0]));
7196 
7197   BPicture        TempOffPicture;
7198   BPicture        TempOnPicture;
7199   BRect           TempRect;
7200 
7201   /* Make the buttons and associated polygon images for the forward and
7202   backwards a word or a page of words buttons.  They're the width of the scroll
7203   bar area on the right, but twice as tall as usual, since there is no scroll
7204   bar and that will make it easier to use them.  First the up a line button. */
7205 
7206   SetHighColor (0, 0, 0);
7207   BeginPicture (&TempOffPicture);
7208   FillPolygon (&UpLinePolygon);
7209   SetHighColor (180, 180, 180);
7210   StrokePolygon (&UpLinePolygon);
7211   EndPicture ();
7212 
7213   SetHighColor (128, 128, 128);
7214   BeginPicture (&TempOnPicture);
7215   FillPolygon (&UpLinePolygon);
7216   EndPicture ();
7217 
7218   TempRect = Bounds ();
7219   TempRect.bottom = TempRect.top + 2 * B_H_SCROLL_BAR_HEIGHT;
7220   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7221   m_ArrowLineUpPntr = new BPictureButton (TempRect, "Up Line",
7222     &TempOffPicture, &TempOnPicture,
7223     new BMessage (MSG_LINE_UP), B_ONE_STATE_BUTTON,
7224     B_FOLLOW_RIGHT | B_FOLLOW_TOP, B_WILL_DRAW | B_NAVIGABLE);
7225   if (m_ArrowLineUpPntr == NULL) goto ErrorExit;
7226   AddChild (m_ArrowLineUpPntr);
7227   m_ArrowLineUpPntr->SetTarget (this);
7228 
7229   /* Up a page button. */
7230 
7231   SetHighColor (0, 0, 0);
7232   BeginPicture (&TempOffPicture);
7233   FillPolygon (&UpPagePolygon);
7234   SetHighColor (180, 180, 180);
7235   StrokePolygon (&UpPagePolygon);
7236   EndPicture ();
7237 
7238   SetHighColor (128, 128, 128);
7239   BeginPicture (&TempOnPicture);
7240   FillPolygon (&UpPagePolygon);
7241   EndPicture ();
7242 
7243   TempRect = Bounds ();
7244   TempRect.top += 2 * B_H_SCROLL_BAR_HEIGHT + 1;
7245   TempRect.bottom = TempRect.top + 2 * B_H_SCROLL_BAR_HEIGHT;
7246   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7247   m_ArrowPageUpPntr = new BPictureButton (TempRect, "Up Page",
7248     &TempOffPicture, &TempOnPicture,
7249     new BMessage (MSG_PAGE_UP), B_ONE_STATE_BUTTON,
7250     B_FOLLOW_RIGHT | B_FOLLOW_TOP, B_WILL_DRAW | B_NAVIGABLE);
7251   if (m_ArrowPageUpPntr == NULL) goto ErrorExit;
7252   AddChild (m_ArrowPageUpPntr);
7253   m_ArrowPageUpPntr->SetTarget (this);
7254 
7255   /* Down a page button. */
7256 
7257   SetHighColor (0, 0, 0);
7258   BeginPicture (&TempOffPicture);
7259   FillPolygon (&DownPagePolygon);
7260   SetHighColor (180, 180, 180);
7261   StrokePolygon (&DownPagePolygon);
7262   EndPicture ();
7263 
7264   SetHighColor (128, 128, 128);
7265   BeginPicture (&TempOnPicture);
7266   FillPolygon (&DownPagePolygon);
7267   EndPicture ();
7268 
7269   TempRect = Bounds ();
7270   TempRect.bottom -= 3 * B_H_SCROLL_BAR_HEIGHT + 1;
7271   TempRect.top = TempRect.bottom - 2 * B_H_SCROLL_BAR_HEIGHT;
7272   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7273   m_ArrowPageDownPntr = new BPictureButton (TempRect, "Down Page",
7274     &TempOffPicture, &TempOnPicture,
7275     new BMessage (MSG_PAGE_DOWN), B_ONE_STATE_BUTTON,
7276     B_FOLLOW_RIGHT | B_FOLLOW_BOTTOM, B_WILL_DRAW | B_NAVIGABLE);
7277   if (m_ArrowPageDownPntr == NULL) goto ErrorExit;
7278   AddChild (m_ArrowPageDownPntr);
7279   m_ArrowPageDownPntr->SetTarget (this);
7280 
7281   /* Down a line button. */
7282 
7283   SetHighColor (0, 0, 0);
7284   BeginPicture (&TempOffPicture);
7285   FillPolygon (&DownLinePolygon);
7286   SetHighColor (180, 180, 180);
7287   StrokePolygon (&DownLinePolygon);
7288   EndPicture ();
7289 
7290   SetHighColor (128, 128, 128);
7291   BeginPicture (&TempOnPicture);
7292   FillPolygon (&DownLinePolygon);
7293   EndPicture ();
7294 
7295   TempRect = Bounds ();
7296   TempRect.bottom -= B_H_SCROLL_BAR_HEIGHT;
7297   TempRect.top = TempRect.bottom - 2 * B_H_SCROLL_BAR_HEIGHT;
7298   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7299   m_ArrowLineDownPntr = new BPictureButton (TempRect, "Down Line",
7300     &TempOffPicture, &TempOnPicture,
7301     new BMessage (MSG_LINE_DOWN), B_ONE_STATE_BUTTON,
7302     B_FOLLOW_RIGHT | B_FOLLOW_BOTTOM, B_WILL_DRAW | B_NAVIGABLE);
7303   if (m_ArrowLineDownPntr == NULL) goto ErrorExit;
7304   AddChild (m_ArrowLineDownPntr);
7305   m_ArrowLineDownPntr->SetTarget (this);
7306 
7307   return;
7308 
7309 ErrorExit:
7310   DisplayErrorMessage ("Problems while making view displaying the words.");
7311 }
7312 
7313 
7314 /* Draw the words starting with the one at or after m_FirstDisplayedWord.  This
7315 requires looking at the database in the BApplication, which may or may not be
7316 available (if it isn't, don't draw, a redraw will usually be requested by the
7317 Pulse member function when it keeps on noticing that the stuff on the display
7318 doesn't match the database). */
7319 
7320 void
7321 WordsView::Draw (BRect UpdateRect)
7322 {
7323   float                   AgeDifference;
7324   float                   AgeProportion;
7325   float                   CenterX;
7326   float                   ColumnLeftCenterX;
7327   float                   ColumnMiddleCenterX;
7328   float                   ColumnRightCenterX;
7329   float                   CompensatedRatio;
7330   StatisticsMap::iterator DataIter;
7331   StatisticsMap::iterator EndIter;
7332   rgb_color               FillColour;
7333   float                   GenuineProportion;
7334   uint32                  GenuineSpamSum;
7335   float                   HeightPixels;
7336   float                   HeightProportion;
7337   float                   LeftBounds;
7338   ABSApp                 *MyAppPntr;
7339   uint32                  NewestAge;
7340   uint32                  OldestAge;
7341   float                   OneFifthTotalGenuine;
7342   float                   OneFifthTotalSpam;
7343   double                  RawProbabilityRatio;
7344   float                   RightBounds;
7345   float                   SpamProportion;
7346   StatisticsPointer       StatisticsPntr;
7347   BRect                   TempRect;
7348   char                    TempString [PATH_MAX];
7349   float                   TotalGenuineMessages = 1.0; /* Avoid divide by 0. */
7350   float                   TotalSpamMessages = 1.0;
7351   float                   Width;
7352   float                   Y;
7353 
7354   /* Lock the application.  This will stop it from processing any further
7355   messages until we are done.  Or if it is busy, the lock will fail. */
7356 
7357   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7358   if (MyAppPntr == NULL || MyAppPntr->LockWithTimeout (100000) != B_OK)
7359     return; /* It's probably busy doing something. */
7360 
7361   /* Set up various loop invariant variables. */
7362 
7363   if (MyAppPntr->m_TotalGenuineMessages > 0)
7364     TotalGenuineMessages = MyAppPntr->m_TotalGenuineMessages;
7365   OneFifthTotalGenuine = TotalGenuineMessages / 5;
7366 
7367   if (MyAppPntr->m_TotalSpamMessages > 0)
7368     TotalSpamMessages = MyAppPntr->m_TotalSpamMessages;
7369   OneFifthTotalSpam = TotalSpamMessages / 5;
7370 
7371   EndIter = MyAppPntr->m_WordMap.end ();
7372 
7373   OldestAge = MyAppPntr->m_OldestAge;
7374   NewestAge = /* actually newest age plus one */
7375     MyAppPntr->m_TotalGenuineMessages + MyAppPntr->m_TotalSpamMessages;
7376 
7377   if (NewestAge == 0)
7378     goto NormalExit; /* No words to display, or something is badly wrong. */
7379 
7380   NewestAge--; /* The newest message has age NewestAge. */
7381   AgeDifference = NewestAge - OldestAge; /* Can be zero if just one message. */
7382 
7383   LeftBounds = Bounds().left;
7384   RightBounds = Bounds().right - B_V_SCROLL_BAR_WIDTH;
7385   Width = RightBounds - LeftBounds;
7386   FillColour.alpha = 255;
7387 
7388   CenterX = ceilf (LeftBounds + Width * 0.5);
7389   ColumnLeftCenterX = ceilf (LeftBounds + Width * 0.05);
7390   ColumnMiddleCenterX = CenterX;
7391   ColumnRightCenterX = ceilf (LeftBounds + Width * 0.95);
7392 
7393   for (DataIter = MyAppPntr->m_WordMap.lower_bound (m_FirstDisplayedWord),
7394   Y = Bounds().top;
7395   DataIter != EndIter && Y < UpdateRect.bottom;
7396   DataIter++, Y += m_LineHeight)
7397   {
7398     if (Y + m_LineHeight < UpdateRect.top)
7399       continue; /* Not in the visible area yet, don't actually draw. */
7400 
7401     /* Draw the colour bar behind the word.  It reflects the spamness or
7402     genuineness of that particular word, plus the importance of the word and
7403     the age of the word.
7404 
7405     First calculate the compensated spam ratio (described elsewhere).  It is
7406     close to 0.0 for genuine words and close to 1.0 for pure spam.  It is drawn
7407     as a blue bar to the left of center if it is less than 0.5, and a red bar
7408     on the right of center if it is greater than 0.5.  At exactly 0.5 nothing
7409     is drawn; the word is worthless as an indicator.
7410 
7411     The height of the bar corresponds to the number of messages the word was
7412     found in.  Make the height proportional to the total of spam and genuine
7413     messages for the word divided by the sum of the most extreme spam and
7414     genuine counts in the database.
7415 
7416     The staturation of the colour corresponds to the age of the word, with old
7417     words being almost white rather than solid blue or red. */
7418 
7419     StatisticsPntr = &DataIter->second;
7420 
7421     SpamProportion = StatisticsPntr->spamCount / TotalSpamMessages;
7422     GenuineProportion = StatisticsPntr->genuineCount / TotalGenuineMessages;
7423     if (SpamProportion + GenuineProportion > 0.0f)
7424       RawProbabilityRatio =
7425       SpamProportion / (SpamProportion + GenuineProportion);
7426     else
7427       RawProbabilityRatio = g_RobinsonX;
7428 
7429     /* The compensated ratio leans towards 0.5 (RobinsonX) more for fewer
7430     data points, with a weight of 0.45 (RobinsonS). */
7431 
7432     GenuineSpamSum =
7433       StatisticsPntr->spamCount + StatisticsPntr->genuineCount;
7434     CompensatedRatio =
7435       (g_RobinsonS * g_RobinsonX + GenuineSpamSum * RawProbabilityRatio) /
7436       (g_RobinsonS + GenuineSpamSum);
7437 
7438     /* Used to use the height based on the most frequent word, but some words,
7439     like "From", show up in all messages which made most other words just
7440     appear as a thin line.  I did a histogram plot of the sizes in my test
7441     database, and figured that you get better coverage of 90% of the messages
7442     if you use 1/5 of the total number as the count which gives you 100%
7443     height.  The other 10% get a full height bar, but most people wouldn't care
7444     that they're super frequently used. */
7445 
7446     HeightProportion = 0.5f * (StatisticsPntr->genuineCount /
7447       OneFifthTotalGenuine + StatisticsPntr->spamCount / OneFifthTotalSpam);
7448 
7449     if (HeightProportion > 1.0f)
7450       HeightProportion = 1.0f;
7451     HeightPixels = ceilf (HeightProportion * m_TextHeight);
7452 
7453     if (AgeDifference <= 0.0f)
7454       AgeProportion = 1.0; /* New is 1.0, old is 0.0 */
7455     else
7456       AgeProportion = (StatisticsPntr->age - OldestAge) / AgeDifference;
7457 
7458     TempRect.top = ceilf (Y + m_TextHeight / 2 - HeightPixels / 2);
7459     TempRect.bottom = TempRect.top + HeightPixels;
7460 
7461     if (CompensatedRatio < 0.5f)
7462     {
7463       TempRect.left = ceilf (
7464         CenterX - 1.6f * (0.5f - CompensatedRatio) * (CenterX - LeftBounds));
7465       TempRect.right = CenterX;
7466       FillColour.red = 230 - (int) (AgeProportion * 230.0f);
7467       FillColour.green = FillColour.red;
7468       FillColour.blue = 255;
7469     }
7470     else /* Ratio >= 0.5, red spam block. */
7471     {
7472       TempRect.left = CenterX;
7473       TempRect.right = ceilf (
7474         CenterX + 1.6f * (CompensatedRatio - 0.5f) * (RightBounds - CenterX));
7475       FillColour.blue = 230 - (int) (AgeProportion * 230.0f);
7476       FillColour.green = FillColour.blue;
7477       FillColour.red = 255;
7478     }
7479     SetHighColor (FillColour);
7480     SetDrawingMode (B_OP_COPY);
7481     FillRect (TempRect);
7482 
7483     /* Print the text centered in columns of various widths.  The number of
7484     genuine messages in the left 10% of the width, the word in the middle 80%,
7485     and the number of spam messages using the word in the right 10%. */
7486 
7487     SetHighColor (0, 0, 0);
7488     SetDrawingMode (B_OP_OVER); /* So that antialiased text mixes better. */
7489 
7490     sprintf (TempString, "%lu", StatisticsPntr->genuineCount);
7491     Width = m_TextFont.StringWidth (TempString);
7492     MovePenTo (ceilf (ColumnLeftCenterX - Width / 2), Y + m_AscentHeight);
7493     DrawString (TempString);
7494 
7495     strcpy (TempString, DataIter->first.c_str ());
7496     Width = m_TextFont.StringWidth (TempString);
7497     MovePenTo (ceilf (ColumnMiddleCenterX - Width / 2), Y + m_AscentHeight);
7498     DrawString (TempString);
7499 
7500     sprintf (TempString, "%lu", StatisticsPntr->spamCount);
7501     Width = m_TextFont.StringWidth (TempString);
7502     MovePenTo (ceilf (ColumnRightCenterX - Width / 2), Y + m_AscentHeight);
7503     DrawString (TempString);
7504   }
7505 
7506   /* Draw the first word (the one which the user types in to select the first
7507   displayed word) on the right, in the scroll bar margin, rotated 90 degrees to
7508   fit between the page up and page down buttons. */
7509 
7510   Width = m_TextFont.StringWidth (m_FirstDisplayedWord);
7511   if (Width > 0)
7512   {
7513     TempRect = Bounds ();
7514     TempRect.top += 4 * B_H_SCROLL_BAR_HEIGHT + 1;
7515     TempRect.bottom -= 5 * B_H_SCROLL_BAR_HEIGHT + 1;
7516 
7517     MovePenTo (TempRect.right - m_TextHeight + m_AscentHeight - 1,
7518       ceilf ((TempRect.bottom + TempRect.top) / 2 + Width / 2));
7519     m_TextFont.SetRotation (90);
7520     SetFont (&m_TextFont, B_FONT_ROTATION);
7521     DrawString (m_FirstDisplayedWord);
7522     m_TextFont.SetRotation (0);
7523     SetFont (&m_TextFont, B_FONT_ROTATION);
7524   }
7525 
7526 NormalExit:
7527 
7528   /* Successfully finished drawing.  Update the cached values to match what we
7529   have drawn. */
7530   m_CachedTotalGenuineMessages = MyAppPntr->m_TotalGenuineMessages;
7531   m_CachedTotalSpamMessages = MyAppPntr->m_TotalSpamMessages;
7532   m_CachedWordCount = MyAppPntr->m_WordCount;
7533 
7534   /* Done.  Let the BApplication continue processing messages. */
7535   MyAppPntr->Unlock ();
7536 }
7537 
7538 
7539 /* When the user presses keys, they select the first word to be displayed in
7540 the view (it's the word at or lexicographically after the word typed in).  The
7541 keys are appended to the starting word, until the user stops typing for a
7542 while, then the next key will be the first letter of a new starting word. */
7543 
7544 void
7545 WordsView::KeyDown (const char *BufferPntr, int32 NumBytes)
7546 {
7547   int32          CharLength;
7548   bigtime_t      CurrentTime;
7549   char           TempString [40];
7550 
7551   CurrentTime = system_time ();
7552 
7553   if (NumBytes < (int32) sizeof (TempString))
7554   {
7555     memcpy (TempString, BufferPntr, NumBytes);
7556     TempString [NumBytes] = 0;
7557     CharLength = strlen (TempString); /* So NUL bytes don't get through. */
7558 
7559     /* Check for arrow keys, which move the view up and down. */
7560 
7561     if (CharLength == 1 &&
7562     (TempString[0] == B_UP_ARROW ||
7563     TempString[0] == B_DOWN_ARROW ||
7564     TempString[0] == B_PAGE_UP ||
7565     TempString[0] == B_PAGE_DOWN))
7566     {
7567       MoveTextUpOrDown ((TempString[0] == B_UP_ARROW) ? MSG_LINE_UP :
7568         ((TempString[0] == B_DOWN_ARROW) ? MSG_LINE_DOWN :
7569         ((TempString[0] == B_PAGE_UP) ? MSG_PAGE_UP : MSG_PAGE_DOWN)));
7570     }
7571     else if (CharLength > 1 ||
7572     (CharLength == 1 && 32 <= (uint8) TempString[0]))
7573     {
7574       /* Have a non-control character, or some sort of multibyte char.  Add it
7575       to the word and mark things for redisplay starting at the resulting word.
7576       */
7577 
7578       if (CurrentTime - m_LastTimeAKeyWasPressed >= 1000000 /* microseconds */)
7579         strcpy (m_FirstDisplayedWord, TempString); /* Starting a new word. */
7580       else if (strlen (m_FirstDisplayedWord) + CharLength <= g_MaxWordLength)
7581         strcat (m_FirstDisplayedWord, TempString); /* Append to existing. */
7582 
7583       Invalidate ();
7584     }
7585   }
7586 
7587   m_LastTimeAKeyWasPressed = CurrentTime;
7588   BView::KeyDown (BufferPntr, NumBytes);
7589 }
7590 
7591 
7592 /* Change the background colour to show that we have the focus.  When we have
7593 it, keystrokes will select the word to be displayed at the top of the list. */
7594 
7595 void
7596 WordsView::MakeFocus (bool Focused)
7597 {
7598   if (Focused)
7599     m_BackgroundColour = m_FocusedColour;
7600   else
7601     m_BackgroundColour = m_UnfocusedColour;
7602   SetViewColor (m_BackgroundColour);
7603   SetLowColor (m_BackgroundColour);
7604 
7605   /* Also need to set the background colour for the scroll buttons, since they
7606   can't be made transparent. */
7607 
7608   if (m_ArrowLineDownPntr != NULL)
7609   {
7610     m_ArrowLineDownPntr->SetViewColor (m_BackgroundColour);
7611     m_ArrowLineDownPntr->Invalidate ();
7612   }
7613 
7614   if (m_ArrowLineUpPntr != NULL)
7615   {
7616     m_ArrowLineUpPntr->SetViewColor (m_BackgroundColour);
7617     m_ArrowLineUpPntr->Invalidate ();
7618   }
7619 
7620   if (m_ArrowPageDownPntr != NULL)
7621   {
7622     m_ArrowPageDownPntr->SetViewColor (m_BackgroundColour);
7623     m_ArrowPageDownPntr->Invalidate ();
7624   }
7625 
7626   if (m_ArrowPageUpPntr != NULL)
7627   {
7628     m_ArrowPageUpPntr->SetViewColor (m_BackgroundColour);
7629     m_ArrowPageUpPntr->Invalidate ();
7630   }
7631 
7632   Invalidate ();
7633 
7634   BView::MakeFocus (Focused);
7635 }
7636 
7637 
7638 void
7639 WordsView::MessageReceived (BMessage *MessagePntr)
7640 {
7641   int32     CountFound;
7642   float     DeltaY; /* Usually -1.0, 0.0 or +1.0. */
7643   type_code TypeFound;
7644 
7645   switch (MessagePntr->what)
7646   {
7647     case B_MOUSE_WHEEL_CHANGED:
7648       if (MessagePntr->FindFloat ("be:wheel_delta_y", &DeltaY) != 0) break;
7649       if (DeltaY < 0)
7650         MoveTextUpOrDown (MSG_LINE_UP);
7651       else if (DeltaY > 0)
7652         MoveTextUpOrDown (MSG_LINE_DOWN);
7653       break;
7654 
7655     case MSG_LINE_DOWN:
7656     case MSG_LINE_UP:
7657     case MSG_PAGE_DOWN:
7658     case MSG_PAGE_UP:
7659       MoveTextUpOrDown (MessagePntr->what);
7660       break;
7661 
7662     case B_SIMPLE_DATA: /* Something has been dropped in our view. */
7663       if (MessagePntr->GetInfo ("refs", &TypeFound, &CountFound) == B_OK &&
7664       CountFound > 0 && TypeFound == B_REF_TYPE)
7665       {
7666         RefsDroppedHere (MessagePntr);
7667         break;
7668       }
7669       /* Else fall through to the default case, in case it is something else
7670       dropped that the system knows about. */
7671 
7672     default:
7673       BView::MessageReceived (MessagePntr);
7674   }
7675 }
7676 
7677 
7678 /* If the user clicks on our view, take over the focus. */
7679 
7680 void
7681 WordsView::MouseDown (BPoint)
7682 {
7683   if (!IsFocus ())
7684     MakeFocus (true);
7685 }
7686 
7687 
7688 void
7689 WordsView::MoveTextUpOrDown (uint32 MovementType)
7690 {
7691   StatisticsMap::iterator  DataIter;
7692   int                      i;
7693   ABSApp                  *MyAppPntr;
7694   int                      PageSize;
7695 
7696   /* Lock the application.  This will stop it from processing any further
7697   messages until we are done (we need to look at the word list directly).  Or
7698   if it is busy, the lock will fail. */
7699 
7700   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7701   if (MyAppPntr == NULL || MyAppPntr->LockWithTimeout (2000000) != B_OK)
7702     return; /* It's probably busy doing something. */
7703 
7704   PageSize = (int) (Bounds().Height() / m_LineHeight - 1);
7705   if (PageSize < 1)
7706     PageSize = 1;
7707 
7708   DataIter = MyAppPntr->m_WordMap.lower_bound (m_FirstDisplayedWord);
7709 
7710   switch (MovementType)
7711   {
7712     case MSG_LINE_UP:
7713       if (DataIter != MyAppPntr->m_WordMap.begin ())
7714         DataIter--;
7715       break;
7716 
7717     case MSG_LINE_DOWN:
7718       if (DataIter != MyAppPntr->m_WordMap.end ())
7719         DataIter++;
7720       break;
7721 
7722     case MSG_PAGE_UP:
7723       for (i = 0; i < PageSize; i++)
7724       {
7725         if (DataIter == MyAppPntr->m_WordMap.begin ())
7726           break;
7727         DataIter--;
7728       }
7729       break;
7730 
7731     case MSG_PAGE_DOWN:
7732       for (i = 0; i < PageSize; i++)
7733       {
7734         if (DataIter == MyAppPntr->m_WordMap.end ())
7735           break;
7736         DataIter++;
7737       }
7738       break;
7739   }
7740 
7741   if (DataIter != MyAppPntr->m_WordMap.end ())
7742     strcpy (m_FirstDisplayedWord, DataIter->first.c_str ());
7743 
7744   Invalidate ();
7745 
7746   MyAppPntr->Unlock ();
7747 }
7748 
7749 
7750 /* This function periodically polls the BApplication to see if anything has
7751 changed.  If the word list is different or the display has changed in some
7752 other way, it will then try to refresh the display, repeating the attempt until
7753 it gets successfully drawn. */
7754 
7755 void
7756 WordsView::Pulse ()
7757 {
7758   ABSApp *MyAppPntr;
7759 
7760   /* Probe the BApplication to see if it has changed. */
7761 
7762   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7763   if (MyAppPntr == NULL)
7764     return; /* Something is wrong, give up. */
7765 
7766   if (MyAppPntr->m_TotalGenuineMessages != m_CachedTotalGenuineMessages ||
7767   MyAppPntr->m_TotalSpamMessages != m_CachedTotalSpamMessages ||
7768   MyAppPntr->m_WordCount != m_CachedWordCount)
7769     Invalidate ();
7770 }
7771 
7772 
7773 /* The user has dragged and dropped some file references on the words view.  If
7774 it is in the left third, add the file(s) as examples of genuine messages, right
7775 third for spam messages and if it is in the middle third then evaluate the
7776 file(s) for spaminess. */
7777 
7778 void
7779 WordsView::RefsDroppedHere (BMessage *MessagePntr)
7780 {
7781   float  Left;
7782   bool   SpamExample = true; /* TRUE if example is of spam, FALSE genuine. */
7783   float  Third;
7784   BPoint WhereDropped;
7785 
7786   /* Find out which third of the view it was dropped into. */
7787 
7788   if (MessagePntr->FindPoint ("_drop_point_", &WhereDropped) != B_OK)
7789     return;  /* Need to know where it was dropped. */
7790   ConvertFromScreen (&WhereDropped);
7791   Third = Bounds().Width() / 3;
7792   Left = Bounds().left;
7793   if (WhereDropped.x < Left + Third)
7794     SpamExample = false;
7795   else if (WhereDropped.x < Left + 2 * Third)
7796   {
7797     /* In the middle third, evaluate all files for spaminess. */
7798     EstimateRefFilesAndDisplay (MessagePntr);
7799     return;
7800   }
7801 
7802   if (g_CommanderLooperPntr != NULL)
7803     g_CommanderLooperPntr->CommandReferences (
7804     MessagePntr, true /* BulkMode */, SpamExample ? CL_SPAM : CL_GENUINE);
7805 }
7806 
7807 
7808 
7809 /******************************************************************************
7810  * Finally, the main program which drives it all.
7811  */
7812 
7813 int main (int argc, char**)
7814 {
7815   g_CommandLineMode = (argc > 1);
7816   if (!g_CommandLineMode)
7817     cout << PrintUsage; /* In case no arguments specified. */
7818 
7819   g_CommanderLooperPntr = new CommanderLooper;
7820   if (g_CommanderLooperPntr != NULL)
7821   {
7822     g_CommanderMessenger = new BMessenger (NULL, g_CommanderLooperPntr);
7823     g_CommanderLooperPntr->Run ();
7824   }
7825 
7826   ABSApp MyApp;
7827 
7828   if (MyApp.InitCheck () == 0)
7829   {
7830     MyApp.LoadSaveSettings (true /* DoLoad */);
7831     MyApp.Run ();
7832   }
7833 
7834   if (g_CommanderLooperPntr != NULL)
7835   {
7836     g_CommanderLooperPntr->PostMessage (B_QUIT_REQUESTED);
7837     snooze (100000); /* Let the CommanderLooper thread run so it quits. */
7838   }
7839 
7840   cerr << "SpamDBM shutting down..." << endl;
7841   return 0; /* And implicitly destroys MyApp, which writes out the database. */
7842 }
7843