xref: /haiku/src/bin/mail_utils/spamdbm.cpp (revision 1f52c921e27aa442370e1bd4adc021acf2b78b64)
1 /******************************************************************************
2  * $Id: spamdbm.cpp 30630 2009-05-05 01:31:01Z bga $
3  *
4  * This is a BeOS program for classifying e-mail messages as spam (unwanted
5  * junk mail) or as genuine mail using a Bayesian statistical approach.  There
6  * is also a Mail Daemon Replacement add-on to filter mail using the
7  * classification statistics collected earlier.
8  *
9  * See also http://www.paulgraham.com/spam.html for a good writeup and
10  * http://www.tuxedo.org/~esr/bogofilter/ for another implementation.
11  * And more recently, Gary Robinson's write up of his improved algorithm
12  * at http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
13  * which gives a better spread in spam ratios and slightly fewer
14  * misclassifications.
15  *
16  * Note that this uses the AGMS vacation coding style, not the OpenTracker one.
17  * That means no tabs, indents are two spaces, m_ is the prefix for member
18  * variables, g_ is the prefix for global names, C style comments, constants
19  * are in all capital letters and most other things are mixed case, it's word
20  * wrapped to fit in 79 characters per line to make proofreading on paper
21  * easier, and functions are listed in reverse dependency order so that forward
22  * declarations (function prototypes with no code) aren't needed.
23  *
24  * The Original Design:
25  * There is a spam database (just a file listing words and number of times they
26  * were used in spam and non-spam messages) that a BeMailDaemon input filter
27  * will use when scanning email.  It will mark the mail with the spam
28  * probability (an attribute, optionally a mail header field) and optionally do
29  * something if the probability exceeds a user defined level (delete message,
30  * change subject, file in a different folder).  Or should that be a different
31  * filter?  Outside the mail system, the probability can be used in queries to
32  * find spam.
33  *
34  * A second user application will be used to update the database.  Besides
35  * showing you the current list of words, you can drag and drop files to mark
36  * them as spam or non-spam (a balanced binary tree is used internally to make
37  * word storage fast).  It will add a second attribute to the files to show how
38  * they have been classified by the user (and won't update the database if you
39  * accidentally try to classify a file again).  Besides drag and drop, there
40  * will be a command line interface and a message passing interface.  BeMail
41  * (or other programs) will then communicate via messages to tell it when the
42  * user marks a message as spam or not (via having separate delete spam /
43  * delete genuine mail buttons and a menu item or two).
44  *
45  * Plus lots of details, like the rename swap method to update the database
46  * file (so programs with the old file open aren't affected).  A nice tab text
47  * format so you can open the database in a spreadsheet.  Startup and shutdown
48  * control of the updater from BeMail.  Automatic creation of the indices
49  * needed by the filter.  MIME types for the database file.  Icons for the app.
50  * System settings to enable tracker to display the new attributes when viewing
51  * e-mail (and maybe news articles if someone ever gets around to an NNTP as
52  * files reader).  Documentation.  Recursive directory traversal for the
53  * command line or directory drag and drop.  Options for the updater to warn or
54  * ignore non-email files.  Etc.
55  *
56  * The Actual Implementation:
57  * The spam database updates and the test for spam have been combined into one
58  * program which runs as a server.  That way there won't be as long a delay
59  * when the e-mail system wants to check for spam, because the database is
60  * already loaded by the server and in memory.  The MDR mail filter add-on
61  * simply sends scripting commands to the server (and starts it up if it isn't
62  * already running).  The filter takes care of marking the messages when it
63  * gets the rating back from the server, and then the rest of the mail system
64  * rule chain can delete the message or otherwise manipulate it.
65  *
66  * Revision History (now manually updated due to SVN's philosophy)
67  * $Log: spamdbm.cpp,v $
68  * ------------------------------------------------------------------------
69  * r15195 | agmsmith | 2005-11-27 21:07:55 -0500 (Sun, 27 Nov 2005) | 4 lines
70  * Just a few minutes after checking in, I mentioned it to Japanese expert Koki
71  * and he suggested also including the Japanese comma.  So before I forget to
72  * do it...
73  *
74  * ------------------------------------------------------------------------
75  * r15194 | agmsmith | 2005-11-27 20:37:13 -0500 (Sun, 27 Nov 2005) | 5 lines
76  * Truncate overly long URLs to the maximum word length.  Convert Japanese
77  * periods to spaces so that more "words" are found.  Fix UTF-8 comparison
78  * problems with tolower() incorrectly converting characters with the high bit
79  * set.
80  *
81  * r15098 | agmsmith | 2005-11-23 23:17:00 -0500 (Wed, 23 Nov 2005) | 5 lines
82  * Added better tokenization so that HTML is parsed and things like tags
83  * between letters of a word no longer hide that word.  After testing, the
84  * result seems to be a tighter spread of ratings when done in full text plus
85  * header mode.
86  *
87  * Revision 1.10  2005/11/24 02:08:39  agmsmith
88  * Fixed up prefix codes, Z for things that are inside other things.
89  *
90  * Revision 1.9  2005/11/21 03:28:03  agmsmith
91  * Added a function for extracting URLs.
92  *
93  * Revision 1.8  2005/11/09 03:36:18  agmsmith
94  * Removed noframes detection (doesn't show up in e-mails).  Now use
95  * just H for headers and Z for HTML tag junk.
96  *
97  * Revision 1.7  2005/10/24 00:00:08  agmsmith
98  * Adding HTML tag removal, which also affected the search function so it
98  * could search for single part things like &nbsp;.
100  *
101  * Revision 1.6  2005/10/17 01:55:08  agmsmith
102  * Remove HTML comments and a few other similar things.
103  *
104  * Revision 1.5  2005/10/16 18:35:36  agmsmith
105  * Under construction - looking into HTML not being in UTF-8.
106  *
107  * Revision 1.4  2005/10/11 01:51:21  agmsmith
108  * Starting on the tokenising passes.  Still need to test asian truncation.
109  *
110  * Revision 1.3  2005/10/06 11:54:07  agmsmith
111  * Not much.
112  *
113  * Revision 1.2  2005/09/12 01:49:37  agmsmith
114  * Enable case folding for the whole file tokenizer.
115  *
116  * r13961 | agmsmith | 2005-08-13 22:25:28 -0400 (Sat, 13 Aug 2005) | 2 lines
117  * Source code changes so that mboxtobemail now compiles and is in the build
118  * system.
119  *
120  * r13959 | agmsmith | 2005-08-13 22:05:27 -0400 (Sat, 13 Aug 2005) | 2 lines
121  * Rename the directory before doing anything else, otherwise svn dies badly.
122  *
123  * r13952 | agmsmith | 2005-08-13 15:31:42 -0400 (Sat, 13 Aug 2005) | 3 lines
124  * Added the resources and file type associations, changed the application
125  * signature and otherwise made the spam detection system work properly again.
126  *
127  * r13951 | agmsmith | 2005-08-13 11:40:01 -0400 (Sat, 13 Aug 2005) | 2 lines
128  * Had to do the file rename as a separate operation due to SVN limitations.
129  *
130  * r13950 | agmsmith | 2005-08-13 11:38:44 -0400 (Sat, 13 Aug 2005) | 3 lines
131  * Oops, "spamdb" is already used for a Unix package.  And spamdatabase is
132  * already reserved by a domain name squatter.  Use "spamdbm" instead.
133  *
134  * r13949 | agmsmith | 2005-08-13 11:17:52 -0400 (Sat, 13 Aug 2005) | 3 lines
135  * Renamed spamfilter to be the more meaningful spamdb (spam database) and
136  * moved it into its own source directory in preparation for adding resources.
137  *
138  * r13628 | agmsmith | 2005-07-10 20:11:29 -0400 (Sun, 10 Jul 2005) | 3 lines
139  * Updated keyword expansion to use SVN keywords.  Also seeing if svn is
140  * working well enough for me to update files from BeOS R5.
141  *
142  * r11909 | axeld | 2005-03-18 19:09:19 -0500 (Fri, 18 Mar 2005) | 2 lines
143  * Moved bin/ directory out of apps/.
144  *
145  * r11769 | bonefish | 2005-03-17 03:30:54 -0500 (Thu, 17 Mar 2005) | 1 line
146  * Move trunk into respective module.
147  *
148  * r10362 | nwhitehorn | 2004-12-06 20:14:05 -0500 (Mon, 06 Dec 2004) | 2 lines
149  * Fixed the spam filter so it works correctly now.
150  *
151  * r9934 | nwhitehorn | 2004-11-11 21:55:05 -0500 (Thu, 11 Nov 2004) | 2 lines
152  * Added AGMS's excellent spam detection software.  Still some weirdness with
153  * the configuration interface from E-mail prefs.
154  *
155  * Revision 1.2  2004/12/07 01:14:05  nwhitehorn
156  * Fixed the spam filter so it works correctly now.
157  *
158  * Revision 1.87  2004/09/20 15:57:26  nwhitehorn
159  * Mostly updated the tree to Be/Haiku style identifier naming conventions.  I
160  * have a few more things to work out, mostly in mail_util.h, and then I'm
161  * proceeding to jamify the build system.  Then we go into Haiku CVS.
162  *
163  * Revision 1.86  2003/07/26 16:47:46  agmsmith
164  * Bug - wasn't allowing double classification if the user had turned on
165  * the option to ignore the previous classification.
166  *
167  * Revision 1.85  2003/07/08 14:52:57  agmsmith
168  * Fix bug with classification choices dialog box coming up with weird
169  * sizes due to RefsReceived message coming in before ReadyToRun had
170  * finished setting up the default sizes of the controls.
171  *
172  * Revision 1.84  2003/07/04 19:59:29  agmsmith
173  * Now with a GUI option to let you declassify messages (set them back
174  * to uncertain, rather than spam or genuine).  Required a BAlert
175  * replacement since BAlerts can't do four buttons.
176  *
177  * Revision 1.83  2003/07/03 20:40:36  agmsmith
178  * Added Uncertain option for declassifying messages.
179  *
180  * Revision 1.82  2003/06/16 14:57:13  agmsmith
181  * Detect spam which uses mislabeled text attachments, going by the file name
182  * extension.
183  *
184  * Revision 1.81  2003/04/08 20:27:04  agmsmith
185  * AGMSBayesianSpamServer now shuts down immediately and returns true if
186  * it is asked to quit by the registrar.
187  *
188  * Revision 1.80  2003/04/07 19:20:27  agmsmith
189  * Ooops, int64 doesn't exist, use long long instead.
190  *
191  * Revision 1.79  2003/04/07 19:05:22  agmsmith
192  * Now with Allen Brunson's atoll for PPC (you need the %Ld, but that
193  * becomes %lld on other systems).
194  *
195  * Revision 1.78  2003/04/04 22:43:53  agmsmith
196  * Fixed up atoll PPC processor hack so it would actually work, was just
197  * returning zero which meant that it wouldn't load in the database file
198  * (read the size as zero).
199  *
200  * Revision 1.77  2003/01/22 03:19:48  agmsmith
201  * Don't convert words to lower case, the case is important for spam.
202  * Particularly sentences which start with exciting words, which you
203  * normally won't use at the start of a sentence (and thus capitalize).
204  *
205  * Revision 1.76  2002/12/18 02:29:22  agmsmith
206  * Add space for the Uncertain display in Tracker.
207  *
208  * Revision 1.75  2002/12/18 01:54:37  agmsmith
209  * Added uncertain sound effect.
210  *
211  * Revision 1.74  2002/12/13 23:53:12  agmsmith
212  * Minimize the window before opening it so that it doesn't flash on the
213  * screen in server mode.  Also load the database when the window is
214  * displayed so that the user can see the words.
215  *
216  * Revision 1.73  2002/12/13 20:55:57  agmsmith
217  * Documentation.
218  *
219  * Revision 1.72  2002/12/13 20:26:11  agmsmith
220  * Fixed bug with adding messages in strings to database (was limited to
221  * messages at most 1K long).  Also changed default server mode to true
222  * since that's what people use most.
223  *
224  * Revision 1.71  2002/12/11 22:37:30  agmsmith
225  * Added commands to train on spam and genuine e-mail messages passed
225  * in string arguments rather than via external files.
227  *
228  * Revision 1.70  2002/12/10 22:12:41  agmsmith
229  * Adding a message to the database now uses a BPositionIO rather than a
230  * file and file name (for future string rather than file additions).  Also
231  * now re-evaluate a file after reclassifying it so that the user can see
232  * the new ratio.  Also remove the [Spam 99.9%] subject prefix when doing
233  * a re-evaluation or classification (the number would be wrong).
234  *
235  * Revision 1.69  2002/12/10 01:46:04  agmsmith
236  * Added the Chi-Squared scoring method.
237  *
238  * Revision 1.68  2002/11/29 22:08:25  agmsmith
239  * Change default purge age to 2000 so that hitting the purge button
240  * doesn't erase stuff from the new sample database.
241  *
242  * Revision 1.67  2002/11/25 20:39:39  agmsmith
243  * Don't need to massage the MIME type since the mail library now does
244  * the lower case conversion and converts TEXT to text/plain too.
245  *
246  * Revision 1.66  2002/11/20 22:57:12  nwhitehorn
247  * PPC Compatibility Fixes
248  *
249  * Revision 1.65  2002/11/10 18:43:55  agmsmith
250  * Added a time delay to some quitting operations so that scripting commands
251  * from a second client (like a second e-mail account) will make the program
252  * abort the quit operation.
253  *
254  * Revision 1.64  2002/11/05 18:05:16  agmsmith
255  * Looked at Nathan's PPC changes (thanks!), modified style a bit.
256  *
257  * Revision 1.63  2002/11/04 03:30:22  nwhitehorn
258  * Now works (or compiles at least) on PowerPC.  I'll get around to testing it
259  * later.
260  *
261  * Revision 1.62  2002/11/04 01:03:33  agmsmith
262  * Fixed warnings so it compiles under the bemaildaemon system.
263  *
264  * Revision 1.61  2002/11/03 23:00:37  agmsmith
265  * Added to the bemaildaemon project on SourceForge.  Hmmmm, seems to switch to
266  * a new version if I commit and specify a message, but doesn't accept the
267  * message and puts up the text editor.  Must be a bug where cvs eats the first
268  * option after "commit".
269  *
270  * Revision 1.60.1.1  2002/10/22 14:29:27  agmsmith
271  * Needed to recompile with the original Libmail.so from Beta/1 since
272  * the current library uses a different constructor, and thus wouldn't
273  * run when used with the old library.
274  *
275  * Revision 1.60  2002/10/21 16:41:27  agmsmith
276  * Return a special error code when no words are found in a message,
277  * so that messages without text/plain parts can be recognized as
278  * spam by the mail filter.
279  *
280  * Revision 1.59  2002/10/20 21:29:47  agmsmith
281  * Watch out for MIME types of "text", treat as text/plain.
282  *
283  * Revision 1.58  2002/10/20 18:29:07  agmsmith
284  * *** empty log message ***
285  *
286  * Revision 1.57  2002/10/20 18:25:02  agmsmith
287  * Fix case sensitivity in MIME type tests, and fix text/any test.
288  *
289  * Revision 1.56  2002/10/19 17:00:10  agmsmith
290  * Added the pop-up menu for the tokenize modes.
291  *
292  * Revision 1.55  2002/10/19 14:54:06  agmsmith
293  * Fudge MIME type of body text components so that they get
294  * treated as text.
295  *
296  * Revision 1.54  2002/10/19 00:56:37  agmsmith
297  * The parsing of e-mail messages seems to be working now, just need
298  * to add some user interface stuff for the tokenizing mode.
299  *
300  * Revision 1.53  2002/10/18 23:37:56  agmsmith
301  * More mail kit usage, can now decode headers, but more to do.
302  *
303  * Revision 1.52  2002/10/16 23:52:33  agmsmith
304  * Getting ready to add more tokenizing modes, exploring Mail Kit to break
305  * apart messages into components (and decode BASE64 and other encodings).
306  *
307  * Revision 1.51  2002/10/11 20:05:31  agmsmith
308  * Added installation of sound effect names, which the filter will use.
309  *
310  * Revision 1.50  2002/10/02 16:50:02  agmsmith
311  * Forgot to add credits to the algorithm inventors.
312  *
313  * Revision 1.49  2002/10/01 00:39:29  agmsmith
314  * Added drag and drop to evaluate files or to add them to the list.
315  *
316  * Revision 1.48  2002/09/30 19:44:17  agmsmith
317  * Switched to Gary Robinson's method, removed max spam/genuine word.
318  *
319  * Revision 1.47  2002/09/23 17:08:55  agmsmith
320  * Add an attribute with the spam ratio to files which have been evaluated.
321  *
322  * Revision 1.46  2002/09/23 02:50:32  agmsmith
323  * Fiddling with display width of e-mail attributes.
324  *
325  * Revision 1.45  2002/09/23 01:13:56  agmsmith
326  * Oops, bug in string evaluation scripting.
327  *
328  * Revision 1.44  2002/09/22 21:00:55  agmsmith
329  * Added EvaluateString so that the BeMail add-on can pass the info without
330  * having to create a temporary file.
331  *
332  * Revision 1.43  2002/09/20 19:56:02  agmsmith
333  * Added about box and button for estimating the spam ratio of a file.
334  *
335  * Revision 1.42  2002/09/20 01:22:26  agmsmith
336  * More testing, decide that an extreme ratio bias point of 0.5 is good.
337  *
338  * Revision 1.41  2002/09/19 21:17:12  agmsmith
339  * Changed a few names and proofread the program.
340  *
341  * Revision 1.40  2002/09/19 14:27:17  agmsmith
342  * Rearranged execution of commands, moving them to a separate looper
343  * rather than the BApplication, so that thousands of files could be
344  * processed without worrying about the message queue filling up.
345  *
346  * Revision 1.39  2002/09/18 18:47:16  agmsmith
347  * Stop flickering when the view is partially obscured, update cached
348  * values in all situations except when app is busy.
349  *
350  * Revision 1.38  2002/09/18 18:08:11  agmsmith
351  * Add a function for evaluating the spam ratio of a message.
352  *
353  * Revision 1.37  2002/09/16 01:30:16  agmsmith
354  * Added Get Oldest command.
355  *
356  * Revision 1.36  2002/09/16 00:47:52  agmsmith
357  * Change the display to counter-weigh the spam ratio by the number of
358  * messages.
359  *
360  * Revision 1.35  2002/09/15 20:49:35  agmsmith
361  * Scrolling improved, buttons, keys and mouse wheel added.
362  *
363  * Revision 1.34  2002/09/15 03:46:10  agmsmith
364  * Up and down buttons under construction.
365  *
366  * Revision 1.33  2002/09/15 02:09:21  agmsmith
367  * Took out scroll bar.
368  *
369  * Revision 1.32  2002/09/15 02:05:30  agmsmith
370  * Trying to add a scroll bar, but it isn't very useful.
371  *
372  * Revision 1.31  2002/09/14 23:06:28  agmsmith
373  * Now has live updates of the list of words.
374  *
375  * Revision 1.30  2002/09/14 19:53:11  agmsmith
376  * Now with a better display of the words.
377  *
378  * Revision 1.29  2002/09/13 21:33:54  agmsmith
379  * Now draws the words in the word display view, but still primitive.
380  *
381  * Revision 1.28  2002/09/13 19:28:02  agmsmith
382  * Added display of most genuine and most spamiest, fixed up cursor.
383  *
384  * Revision 1.27  2002/09/13 03:08:42  agmsmith
385  * Show current word and message counts, and a busy cursor.
386  *
387  * Revision 1.26  2002/09/13 00:00:08  agmsmith
388  * Fixed up some deadlock problems, now using asynchronous message replies.
389  *
390  * Revision 1.25  2002/09/12 17:56:58  agmsmith
391  * Keep track of words which are spamiest and genuinest.
392  *
393  * Revision 1.24  2002/09/12 01:57:10  agmsmith
394  * Added server mode.
395  *
396  * Revision 1.23  2002/09/11 23:30:45  agmsmith
397  * Added Purge button and ignore classification checkbox.
398  *
399  * Revision 1.22  2002/09/11 21:23:13  agmsmith
400  * Added bulk update choice, purge button, moved to a BView container
401  * for all the controls (so background colour could be set, and Pulse
402  * works normally for it too).
403  *
404  * Revision 1.21  2002/09/10 22:52:49  agmsmith
405  * You can now change the database name in the GUI.
406  *
407  * Revision 1.20  2002/09/09 14:20:42  agmsmith
408  * Now can have multiple backups, and implemented refs received.
409  *
410  * Revision 1.19  2002/09/07 19:14:56  agmsmith
411  * Added standard GUI measurement code.
412  *
413  * Revision 1.18  2002/09/06 21:03:03  agmsmith
414  * Rearranging code to avoid forward references when adding a window class.
415  *
416  * Revision 1.17  2002/09/06 02:54:00  agmsmith
417  * Added the ability to purge old words from the database.
418  *
419  * Revision 1.16  2002/09/05 00:46:03  agmsmith
420  * Now adds spam to the database!
421  *
422  * Revision 1.15  2002/09/04 20:32:15  agmsmith
423  * Read ahead a couple of letters to decode quoted-printable better.
424  *
425  * Revision 1.14  2002/09/04 03:10:03  agmsmith
426  * Can now tokenize (break into words) a text file.
427  *
428  * Revision 1.13  2002/09/03 21:50:54  agmsmith
429  * Count database command, set up MIME type for the database file.
430  *
431  * Revision 1.12  2002/09/03 19:55:54  agmsmith
432  * Added loading and saving the database.
433  *
434  * Revision 1.11  2002/09/02 03:35:33  agmsmith
435  * Create indices and set up attribute associations with the e-mail MIME type.
436  *
437  * Revision 1.10  2002/09/01 15:52:49  agmsmith
438  * Can now delete the database.
439  *
440  * Revision 1.9  2002/08/31 21:55:32  agmsmith
441  * Yet more scripting.
442  *
443  * Revision 1.8  2002/08/31 21:41:37  agmsmith
444  * Under construction, with example code to decode a B_REPLY.
445  *
446  * Revision 1.7  2002/08/30 19:29:06  agmsmith
447  * Combined loading and saving settings into one function.
448  *
449  * Revision 1.6  2002/08/30 02:01:10  agmsmith
450  * Working on loading and saving settings.
451  *
452  * Revision 1.5  2002/08/29 23:17:42  agmsmith
453  * More scripting.
454  *
455  * Revision 1.4  2002/08/28 00:40:52  agmsmith
456  * Scripting now seems to work, at least the messages flow properly.
457  *
458  * Revision 1.3  2002/08/25 21:51:44  agmsmith
459  * Getting the about text formatting right.
460  *
461  * Revision 1.2  2002/08/25 21:28:20  agmsmith
462  * Trying out the BeOS scripting system as a way of implementing the program.
463  *
464  * Revision 1.1  2002/08/24 02:27:51  agmsmith
465  * Initial revision
466  */
467 
468 /* Standard C Library. */
469 
470 #include <errno.h>
471 #include <stdio.h>
472 #include <stdlib.h>
473 #include <strings.h>
474 
475 /* Standard C++ library. */
476 
477 #include <iostream>
478 
479 /* STL (Standard Template Library) headers. */
480 
481 #include <map>
482 #include <queue>
483 #include <set>
484 #include <string>
485 #include <vector>
486 
487 using namespace std;
488 
489 /* BeOS (Be Operating System) headers. */
490 
491 #include <Alert.h>
492 #include <Application.h>
493 #include <Beep.h>
494 #include <Button.h>
495 #include <CheckBox.h>
496 #include <Cursor.h>
497 #include <Directory.h>
498 #include <Entry.h>
499 #include <File.h>
500 #include <FilePanel.h>
501 #include <FindDirectory.h>
502 #include <fs_index.h>
503 #include <fs_info.h>
504 #include <MenuBar.h>
505 #include <MenuItem.h>
506 #include <Message.h>
507 #include <MessageQueue.h>
508 #include <MessageRunner.h>
509 #include <Mime.h>
510 #include <NodeInfo.h>
511 #include <Path.h>
512 #include <Picture.h>
513 #include <PictureButton.h>
514 #include <Point.h>
515 #include <Polygon.h>
516 #include <PopUpMenu.h>
517 #include <PropertyInfo.h>
518 #include <RadioButton.h>
519 #include <Resources.h>
520 #include <Screen.h>
521 #include <ScrollBar.h>
522 #include <String.h>
523 #include <StringView.h>
524 #include <TextControl.h>
525 #include <View.h>
526 
527 /* Included from the Mail Daemon Replacement project (MDR) include/public
528 directory, available from http://sourceforge.net/projects/bemaildaemon/ */
529 
530 #include <MailMessage.h>
531 #include <MailAttachment.h>
532 
533 
534 /******************************************************************************
535  * Global variables, and not-so-variable things too.  Grouped by functionality.
536  */
537 
538 static float g_MarginBetweenControls; /* Space of a letter "M" between them. */
539 static float g_LineOfTextHeight;      /* Height of text in the current font. */
540 static float g_StringViewHeight;      /* Height of a string view text box. */
541 static float g_ButtonHeight;          /* How many pixels tall buttons are. */
542 static float g_CheckBoxHeight;        /* Height of check boxes, in pixels. */
543 static float g_RadioButtonHeight;     /* Height of radio buttons, in pixels. */
544 static float g_PopUpMenuHeight;       /* Height of pop-up menus, in pixels. */
545 static float g_TextBoxHeight;         /* Height of editable text controls. */
546 
547 static const char *g_ABSAppSignature =
548   "application/x-vnd.agmsmith.spamdbm"; /* This program's signature. */
549 
550 static const char *g_ABSDatabaseFileMIMEType =
551   "text/x-vnd.agmsmith.spam_probability_database"; /* Database's MIME type. */
552 
553 static const char *g_DefaultDatabaseFileName =
554   "SpamDBM Database"; /* Leaf name of the default database path. */
555 
556 static const char *g_DatabaseRecognitionString =
557   "Spam Database File"; /* Presumably marks database files - confirm. */
558 
559 static const char *g_AttributeNameClassification = "MAIL:classification";
560 static const char *g_AttributeNameSpamRatio = "MAIL:ratio_spam";
561 static const char *g_BeepGenuine = "SpamFilter-Genuine"; /* Sound names. */
562 static const char *g_BeepSpam = "SpamFilter-Spam";
563 static const char *g_BeepUncertain = "SpamFilter-Uncertain";
564 static const char *g_ClassifiedSpam = "Spam"; /* MAIL:classification value. */
565 static const char *g_ClassifiedGenuine = "Genuine"; /* Genuine mail's value. */
566 static const char *g_DataName = "data"; /* Presumably message field names. */
567 static const char *g_ResultName = "result";
568 
569 static const char *g_SettingsDirectoryName = "Mail";
570 static const char *g_SettingsFileName = "SpamDBM Settings";
571 static const uint32 g_SettingsWhatCode = 'SDBM';
572 static const char *g_BackupSuffix = ".backup %d"; /* %d = backup number. */
573 static const int g_MaxBackups = 10; /* Numbered from 0 to g_MaxBackups - 1. */
574 static const size_t g_MaxWordLength = 50; /* Words longer than this aren't. */
575 static const int g_MaxInterestingWords = 150; /* Top N words are examined. */
576 static const double g_RobinsonS = 0.45; /* Default weight for no data. */
577 static const double g_RobinsonX = 0.5; /* Halfway point for no data. */
578 
579 static bool g_CommandLineMode;
580   /* TRUE if the program was started from the command line (and thus should
581   exit after processing the command), FALSE if it is running with a graphical
582   user interface. */
583 
584 static bool g_ServerMode;
585   /* When TRUE the program runs in server mode - error messages don't result in
586   pop-up dialog boxes, but you can still see them in stderr.  Also the window
587   is minimized, if it exists. */
588 
589 static int g_QuitCountdown = -1;
590   /* Set to the number of pulse timing events (about one every half second) to
591   count down before the program quits.  Negative means stop counting.  Zero
592   means quit at the next pulse event.  This is used to keep the program alive
593   for a short while after someone requests that it quit, in case more scripting
594   commands come in, which will stop the countdown.  Needed to handle the case
595   where there are multiple e-mail accounts all requesting spam identification,
596   and one finishes first and tells the server to quit.  It also checks to see
597   that there is no more work to do before trying to quit. */
598 
599 static volatile bool g_AppReadyToRunCompleted = false;
600   /* The BApplication starts processing messages before ReadyToRun finishes,
601   which can lead to initialisation problems (button heights not determined).
602   So wait for this to turn TRUE in code that might run early, like
603   RefsReceived.  NOTE(review): volatile alone doesn't synchronise threads. */
604 
605 static class CommanderLooper *g_CommanderLooperPntr = NULL;
606 static BMessenger *g_CommanderMessenger = NULL;
607   /* Some globals for use with the looper which processes external commands
608   (arguments received, file references received), needed for avoiding deadlocks
609   which would happen if the BApplication sent a scripting message to itself. */
610 
611 static BCursor *g_BusyCursor = NULL;
612   /* The busy cursor, will be loaded from the resource file during application
613   startup. */
614 
615 typedef enum PropertyNumbersEnum /* Indices into g_PropertyNames. */
616 {
617   PN_DATABASE_FILE = 0, /* Values double as property_info extra_data. */
618   PN_SPAM,
619   PN_SPAM_STRING,
620   PN_GENUINE,
621   PN_GENUINE_STRING,
622   PN_UNCERTAIN,
623   PN_IGNORE_PREVIOUS_CLASSIFICATION,
624   PN_SERVER_MODE,
625   PN_FLUSH,
626   PN_PURGE_AGE,
627   PN_PURGE_POPULARITY,
628   PN_PURGE,
629   PN_OLDEST,
630   PN_EVALUATE,
631   PN_EVALUATE_STRING,
632   PN_RESET_TO_DEFAULTS,
633   PN_INSTALL_THINGS,
634   PN_TOKENIZE_MODE,
635   PN_SCORING_MODE,
636   PN_MAX /* Count of the properties above; sizes g_PropertyNames. */
637 } PropertyNumbers;
638 
639 static const char * g_PropertyNames [PN_MAX] = /* In PropertyNumbers order. */
640 {
641   "DatabaseFile",
642   "Spam",
643   "SpamString",
644   "Genuine",
645   "GenuineString",
646   "Uncertain",
647   "IgnorePreviousClassification",
648   "ServerMode",
649   "Flush",
650   "PurgeAge",
651   "PurgePopularity",
652   "Purge",
653   "Oldest",
654   "Evaluate",
655   "EvaluateString",
656   "ResetToDefaults",
657   "InstallThings",
658   "TokenizeMode",
659   "ScoringMode" /* Last entry, index PN_SCORING_MODE (PN_MAX - 1). */
660 };
661 
662 /* This array lists the scripting commands we can handle, in a format that the
663 scripting system can understand too. */
664 
665 static struct property_info g_ScriptingPropertyList [] =
666 {
667   /* *name; commands[10]; specifiers[10]; *usage; extra_data; ... */
668   {g_PropertyNames[PN_DATABASE_FILE], {B_GET_PROPERTY, 0},
669     {B_DIRECT_SPECIFIER, 0}, "Get the pathname of the current database file.  "
670     "The default name is something like B_USER_SETTINGS_DIRECTORY / "
671     "Mail / SpamDBM Database", PN_DATABASE_FILE,
672     {}, {}, {}},
673   {g_PropertyNames[PN_DATABASE_FILE], {B_SET_PROPERTY, 0},
674     {B_DIRECT_SPECIFIER, 0}, "Change the pathname of the database file to "
675     "use.  It will automatically be converted to an absolute path name, "
676     "so make sure the parent directories exist before setting it.  If it "
677     "doesn't exist, you'll have to use the create command next.",
678     PN_DATABASE_FILE, {}, {}, {}},
679   {g_PropertyNames[PN_DATABASE_FILE], {B_CREATE_PROPERTY, 0},
680     {B_DIRECT_SPECIFIER, 0}, "Creates a new empty database, will replace "
681     "the existing database file too.", PN_DATABASE_FILE, {}, {}, {}},
682   {g_PropertyNames[PN_DATABASE_FILE], {B_DELETE_PROPERTY, 0},
683     {B_DIRECT_SPECIFIER, 0}, "Deletes the database file and all backup copies "
684     "of that file too.  Really only of use for uninstallers.",
685     PN_DATABASE_FILE, {}, {}, {}},
686   {g_PropertyNames[PN_DATABASE_FILE], {B_COUNT_PROPERTIES, 0},
687     {B_DIRECT_SPECIFIER, 0}, "Returns the number of words in the database.",
688     PN_DATABASE_FILE, {}, {}, {}},
689   {g_PropertyNames[PN_SPAM], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
690     "Adds the spam in the given file (specify full pathname to be safe) to "
691     "the database.  The words in the files will be added to the list of words "
692     "in the database that identify spam messages.  The files processed will "
693     "also have the attribute MAIL:classification added with a value of "
694     "\"Spam\" or \"Genuine\" as specified.  They also have their spam ratio "
695     "attribute updated, as if you had also used the Evaluate command on "
696     "them.  If they already have the MAIL:classification "
697     "attribute and it matches the new classification then they won't get "
698     "processed (and if it is different, they will get removed from the "
699     "statistics for the old class and added to the statistics for the new "
700     "one).  You can turn off that behaviour with the "
701     "IgnorePreviousClassification property.  The command line version lets "
702     "you specify more than one pathname.", PN_SPAM, {}, {}, {}},
703   {g_PropertyNames[PN_SPAM], {B_COUNT_PROPERTIES, 0}, {B_DIRECT_SPECIFIER, 0},
704     "Returns the number of spam messages in the database.", PN_SPAM,
705     {}, {}, {}},
706   {g_PropertyNames[PN_SPAM_STRING], {B_SET_PROPERTY, 0},
707     {B_DIRECT_SPECIFIER, 0}, "Adds the spam in the given string (assumed to "
708     "be the text of a whole e-mail message, not just a file name) to the "
709     "database.", PN_SPAM_STRING, {}, {}, {}},
710   {g_PropertyNames[PN_GENUINE], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
711     "Similar to adding spam except that the message file is added to the "
712     "genuine statistics.", PN_GENUINE, {}, {}, {}},
713   {g_PropertyNames[PN_GENUINE], {B_COUNT_PROPERTIES, 0},
714     {B_DIRECT_SPECIFIER, 0}, "Returns the number of genuine messages in the "
715     "database.", PN_GENUINE, {}, {}, {}},
716   {g_PropertyNames[PN_GENUINE_STRING], {B_SET_PROPERTY, 0},
717     {B_DIRECT_SPECIFIER, 0}, "Adds the genuine message in the given string "
718     "(assumed to be the text of a whole e-mail message, not just a file name) "
719     "to the database.", PN_GENUINE_STRING, {}, {}, {}},
720   {g_PropertyNames[PN_UNCERTAIN], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
721     "Similar to adding spam except that the message file is removed from the "
722     "database, undoing the previous classification.  Obviously, it needs to "
723     "have been classified previously (using the file attributes) so it can "
724     "tell if it is removing spam or genuine words.", PN_UNCERTAIN, {}, {}, {}},
725   {g_PropertyNames[PN_IGNORE_PREVIOUS_CLASSIFICATION], {B_SET_PROPERTY, 0},
726     {B_DIRECT_SPECIFIER, 0}, "If set to true then the previous classification "
727     "(which was saved as an attribute of the e-mail message file) will be "
728     "ignored, so that you can add the message to the database again.  If set "
729     "to false (the normal case), the attribute will be examined, and if the "
730     "message has already been classified as what you claim it is, nothing "
731     "will be done.  If it was misclassified, then the message will be removed "
732     "from the statistics for the old class and added to the stats for the "
733     "new classification you have requested.",
734     PN_IGNORE_PREVIOUS_CLASSIFICATION, {}, {}, {}},
735   {g_PropertyNames[PN_IGNORE_PREVIOUS_CLASSIFICATION], {B_GET_PROPERTY, 0},
736     {B_DIRECT_SPECIFIER, 0}, "Find out the current setting of the flag for "
737     "ignoring the previously recorded classification.",
738     PN_IGNORE_PREVIOUS_CLASSIFICATION, {}, {}, {}},
739   {g_PropertyNames[PN_SERVER_MODE], {B_SET_PROPERTY, 0},
740     {B_DIRECT_SPECIFIER, 0}, "If set to true then error messages get printed "
741     "to the standard error stream rather than showing up in an alert box.  "
742     "It also starts up with the window minimized.", PN_SERVER_MODE,
743     {}, {}, {}},
744   {g_PropertyNames[PN_SERVER_MODE], {B_GET_PROPERTY, 0},
745     {B_DIRECT_SPECIFIER, 0}, "Find out the setting of the server mode flag.",
746     PN_SERVER_MODE, {}, {}, {}},
747   {g_PropertyNames[PN_FLUSH], {B_EXECUTE_PROPERTY, 0},
748     {B_DIRECT_SPECIFIER, 0}, "Writes out the database file to disk, if it has "
749     "been updated in memory but hasn't been saved to disk.  It will "
750     "automatically get written when the program exits, so this command is "
751     "mostly useful for server mode.", PN_FLUSH, {}, {}, {}},
752   {g_PropertyNames[PN_PURGE_AGE], {B_SET_PROPERTY, 0},
753     {B_DIRECT_SPECIFIER, 0}, "Sets the old age limit.  Words which haven't "
754       "been updated since this many message additions to the database may be "
755       "deleted when you do a purge.  A good value is 1000, meaning that if a "
756       "word hasn't appeared in the last 1000 spam/genuine messages, it will "
757       "be forgotten.  Zero will purge all words, 1 will purge words not in "
758       "the last message added to the database, 2 will purge words not in the "
759       "last two messages added, and so on.  This is mostly useful for "
760       "removing those one time words which are often hunks of binary garbage, "
761       "not real words.  This acts in combination with the popularity limit; "
762       "both conditions have to be valid before the word gets deleted.",
763       PN_PURGE_AGE, {}, {}, {}},
764   {g_PropertyNames[PN_PURGE_AGE], {B_GET_PROPERTY, 0},
765     {B_DIRECT_SPECIFIER, 0}, "Gets the old age limit.", PN_PURGE_AGE,
766     {}, {}, {}},
767   {g_PropertyNames[PN_PURGE_POPULARITY], {B_SET_PROPERTY, 0},
768     {B_DIRECT_SPECIFIER, 0}, "Sets the popularity limit.  Words which aren't "
769     "this popular may be deleted when you do a purge.  A good value is 5, "
770     "which means that the word is safe from purging if it has been seen in 6 "
771     "or more e-mail messages.  If it's only in 5 or less, then it may get "
772     "purged.  The extreme is zero, where only words that haven't been seen "
773     "in any message are deleted (usually means no words).  This acts in "
774     "combination with the old age limit; both conditions have to be valid "
775     "before the word gets deleted.", PN_PURGE_POPULARITY, {}, {}, {}},
776   {g_PropertyNames[PN_PURGE_POPULARITY], {B_GET_PROPERTY, 0},
777     {B_DIRECT_SPECIFIER, 0}, "Gets the purge popularity limit.",
778     PN_PURGE_POPULARITY, {}, {}, {}},
779   {g_PropertyNames[PN_PURGE], {B_EXECUTE_PROPERTY, 0},
780     {B_DIRECT_SPECIFIER, 0}, "Purges the old obsolete words from the "
781     "database, if they are old enough according to the age limit and also "
782     "unpopular enough according to the popularity limit.", PN_PURGE,
783     {}, {}, {}},
784   {g_PropertyNames[PN_OLDEST], {B_GET_PROPERTY, 0},
785     {B_DIRECT_SPECIFIER, 0}, "Gets the age of the oldest message in the "
786     "database.  It's relative to the beginning of time, so you need to do "
787     "(total messages - age - 1) to see how many messages ago it was added.",
788     PN_OLDEST, {}, {}, {}},
789   {g_PropertyNames[PN_EVALUATE], {B_SET_PROPERTY, 0},
790     {B_DIRECT_SPECIFIER, 0}, "Evaluates a given file (by path name) to see "
791     "if it is spam or not.  Returns the ratio of spam probability vs genuine "
792     "probability, 0.0 meaning completely genuine, 1.0 for completely spam.  "
793     "Normally you should safely be able to consider it as spam if it is over "
794     "0.56 for the Robinson scoring method.  For the ChiSquared method, the "
795     "numbers are near 0 for genuine, near 1 for spam, and anywhere in the "
796     "middle means it can't decide.  The program attaches a MAIL:ratio_spam "
797     "attribute with the ratio as its "
798     "float32 value to the file.  Also returns the top few interesting words "
799     "in \"words\" and the associated per-word probability ratios in "
800     "\"ratios\".", PN_EVALUATE, {}, {}, {}},
801   {g_PropertyNames[PN_EVALUATE_STRING], {B_SET_PROPERTY, 0},
802     {B_DIRECT_SPECIFIER, 0}, "Like Evaluate, but rather than a file name, "
803     "the string argument contains the entire text of the message to be "
804     "evaluated.", PN_EVALUATE_STRING, {}, {}, {}},
805   {g_PropertyNames[PN_RESET_TO_DEFAULTS], {B_EXECUTE_PROPERTY, 0},
806     {B_DIRECT_SPECIFIER, 0}, "Resets all the configuration options to the "
807     "default values, including the database name.", PN_RESET_TO_DEFAULTS,
808     {}, {}, {}},
809   {g_PropertyNames[PN_INSTALL_THINGS], {B_EXECUTE_PROPERTY, 0},
810     {B_DIRECT_SPECIFIER, 0}, "Creates indices for the MAIL:classification and "
811     "MAIL:ratio_spam attributes on all volumes which support BeOS queries, "
812     "identifies them to the system as e-mail related attributes (modifies "
813     "the text/x-email MIME type), and sets up the new MIME type "
814     "(text/x-vnd.agmsmith.spam_probability_database) for the database file.  "
815     "Also registers names for the sound effects used by the separate filter "
816     "program (use the installsound BeOS program or the Sounds preferences "
817     "program to associate sound files with the names).", PN_INSTALL_THINGS,
818     {}, {}, {}},
819   {g_PropertyNames[PN_TOKENIZE_MODE], {B_SET_PROPERTY, 0},
820     {B_DIRECT_SPECIFIER, 0}, "Sets the method used for breaking up the "
821     "message into words.  Use \"Whole\" for the whole file (also use it for "
822     "non-email files).  The file isn't broken into parts; the whole thing is "
823     "converted into words, headers and attachments are just more raw data.  "
824     "Well, not quite raw data since it converts quoted-printable codes "
825     "(equals sign followed by hex digits or end of line) to the equivalent "
826     "single characters.  \"PlainText\" breaks the file into MIME components "
827     "and only looks at the ones which are of MIME type text/plain.  "
828     "\"AnyText\" will look for words in all text/* things, including "
829     "text/html attachments.  \"AllParts\" will decode all message components "
830     "and look for words in them, including binary attachments.  "
831     "\"JustHeader\" will only look for words in the message header.  "
832     "\"AllPartsAndHeader\", \"PlainTextAndHeader\" and \"AnyTextAndHeader\" "
833     "will also include the words from the message headers.", PN_TOKENIZE_MODE,
834     {}, {}, {}},
835   {g_PropertyNames[PN_TOKENIZE_MODE], {B_GET_PROPERTY, 0},
836     {B_DIRECT_SPECIFIER, 0}, "Gets the method used for breaking up the "
837     "message into words.", PN_TOKENIZE_MODE, {}, {}, {}},
838   {g_PropertyNames[PN_SCORING_MODE], {B_SET_PROPERTY, 0},
839     {B_DIRECT_SPECIFIER, 0}, "Sets the method used for combining the "
840     "probabilities of individual words into an overall score.  "
841     "\"Robinson\" mode will use Gary Robinson's nth root of the product "
842     "method.  It gives a nice range of values between 0 and 1 so you can "
843     "see shades of spaminess.  The cutoff point between spam and genuine "
844     "varies depending on your database of words (0.56 was one point in "
845     "some experiments).  \"ChiSquared\" mode will use chi-squared "
846     "statistics to evaluate the difference in probabilities that the lists "
847     "of word ratios are random.  The result is very close to 0 for genuine "
848     "and very close to 1 for spam, and near the middle if it is uncertain.",
849     PN_SCORING_MODE, {}, {}, {}},
850   {g_PropertyNames[PN_SCORING_MODE], {B_GET_PROPERTY, 0},
851     {B_DIRECT_SPECIFIER, 0}, "Gets the method used for combining the "
852     "individual word ratios into an overall score.", PN_SCORING_MODE,
853     {}, {}, {}},
854 
855   { 0 }
856 };
857 
858 
/* Scoring modes: the methods for combining the probabilities of individual
words into one overall score.  The enum gives each mode an index (SM_MAX is
the number of modes, not a mode itself) and g_ScoringModeNames holds the text
name for each index.  See PN_SCORING_MODE. */

enum ScoringModeEnum
{
  SM_ROBINSON = 0,
  SM_CHISQUARED,
  SM_MAX
};

typedef enum ScoringModeEnum ScoringModes;

static const char * g_ScoringModeNames [SM_MAX] =
{
  "Robinson",   /* SM_ROBINSON */
  "ChiSquared"  /* SM_CHISQUARED */
};
873 
874 
/* Tokenizing modes: the methods for breaking up a message into words.  The
enum gives each mode an index (TM_MAX is the number of modes, not a mode
itself) and g_TokenizeModeNames holds the text name for each index.  See
PN_TOKENIZE_MODE.

NOTE(review): the PN_TOKENIZE_MODE help text in the scripting property list
quotes names such as "Whole", "PlainText" and "AllPartsAndHeader", which are
spelled differently from the strings in this table - confirm which spelling
the scripting commands are really meant to accept. */

enum TokenizeModeEnum
{
  TM_WHOLE = 0,
  TM_PLAIN_TEXT,
  TM_PLAIN_TEXT_HEADER,
  TM_ANY_TEXT,
  TM_ANY_TEXT_HEADER,
  TM_ALL_PARTS,
  TM_ALL_PARTS_HEADER,
  TM_JUST_HEADER,
  TM_MAX
};

typedef enum TokenizeModeEnum TokenizeModes;

static const char * g_TokenizeModeNames [TM_MAX] =
{
  "All",                    /* TM_WHOLE */
  "Plain text",             /* TM_PLAIN_TEXT */
  "Plain text and header",  /* TM_PLAIN_TEXT_HEADER */
  "Any text",               /* TM_ANY_TEXT */
  "Any text and header",    /* TM_ANY_TEXT_HEADER */
  "All parts",              /* TM_ALL_PARTS */
  "All parts and header",   /* TM_ALL_PARTS_HEADER */
  "Just header"             /* TM_JUST_HEADER */
};
901 
902 
/* The classifications a message can have.  CL_MAX is the number of
classifications rather than a classification itself. */

enum ClassificationTypesEnum
{
  CL_GENUINE = 0,
  CL_SPAM,
  CL_UNCERTAIN,
  CL_MAX
};

typedef enum ClassificationTypesEnum ClassificationTypes;
912 
913 static const char * g_ClassificationTypeNames [CL_MAX] =
914 {
915   g_ClassifiedGenuine,
916   g_ClassifiedSpam,
917   "Uncertain"
918 };
919 
920 
921 /* Some polygon graphics for the scroll arrows. */
922 
923 static BPoint g_UpLinePoints [] =
924 {
925   BPoint (8, 2 * (1)),
926   BPoint (14, 2 * (6)),
927   BPoint (10, 2 * (6)),
928   BPoint (10, 2 * (13)),
929   BPoint (6, 2 * (13)),
930   BPoint (6, 2 * (6)),
931   BPoint (2, 2 * (6))
932 };
933 
934 static BPoint g_DownLinePoints [] =
935 {
936   BPoint (8, 2 * (14-1)),
937   BPoint (14, 2 * (14-6)),
938   BPoint (10, 2 * (14-6)),
939   BPoint (10, 2 * (14-13)),
940   BPoint (6, 2 * (14-13)),
941   BPoint (6, 2 * (14-6)),
942   BPoint (2, 2 * (14-6))
943 };
944 
945 static BPoint g_UpPagePoints [] =
946 {
947   BPoint (8, 2 * (1)),
948   BPoint (13, 2 * (6)),
949   BPoint (10, 2 * (6)),
950   BPoint (14, 2 * (10)),
951   BPoint (10, 2 * (10)),
952   BPoint (10, 2 * (13)),
953   BPoint (6, 2 * (13)),
954   BPoint (6, 2 * (10)),
955   BPoint (2, 2 * (10)),
956   BPoint (6, 2 * (6)),
957   BPoint (3, 2 * (6))
958 };
959 
960 static BPoint g_DownPagePoints [] =
961 {
962   BPoint (8, 2 * (14-1)),
963   BPoint (13, 2 * (14-6)),
964   BPoint (10, 2 * (14-6)),
965   BPoint (14, 2 * (14-10)),
966   BPoint (10, 2 * (14-10)),
967   BPoint (10, 2 * (14-13)),
968   BPoint (6, 2 * (14-13)),
969   BPoint (6, 2 * (14-10)),
970   BPoint (2, 2 * (14-10)),
971   BPoint (6, 2 * (14-6)),
972   BPoint (3, 2 * (14-6))
973 };
974 
975 
/* An array of flags to identify characters which are considered to be spaces.
If character code X has g_SpaceCharacters[X] set to true then it is a
space-like character.  The table only covers codes 0 to 127, so check that a
character code is in that range before using it as an index.  Character codes
128 and above are always non-space since they are UTF-8 characters.  Being a
static array, every entry starts out false; the real values are initialised
in the ABSApp constructor. */

static bool g_SpaceCharacters [128];
982 
983 
984 
985 /******************************************************************************
986  * Each word in the spam database gets one of these structures.  The database
987  * has a string (the word) as the key and this structure as the value
988  * (statistics for that word).
989  */
990 
991 typedef struct StatisticsStruct
992 {
993   uint32 age;
994     /* Sequence number for the time when this word was last updated in the
995     database, so that we can remove old words (haven't been seen in recent
996     spam).  It's zero for the first file ever added (spam or genuine) to the
997     database, 1 for all words added or updated by the second file, etc.  If a
998     later file updates an existing word, it gets the age of the later file. */
999 
1000   uint32 genuineCount;
1001     /* Number of genuine messages that have this word. */
1002 
1003   uint32 spamCount;
1004     /* A count of the number of spam e-mail messages which contain the word. */
1005 
1006 } StatisticsRecord, *StatisticsPointer;
1007 
typedef map<string, StatisticsRecord> StatisticsMap;
  /* The type used for our main data storage facility: each word (the string
  key) maps to the statistics record for that word.  Giving the type a name
  lets us more conveniently specify things that are derived from it, like
  iterators. */
1012 
1013 
1014 
1015 /******************************************************************************
1016  * An alert box asking how the user wants to mark messages.  There are buttons
1017  * for each classification category, and a checkbox to mark all remaining N
1018  * messages the same way.  And a cancel button.  To use it, first create the
1019  * ClassificationChoicesWindow, specifying the input arguments.  Then call the
1020  * Go method which will show the window, stuff the user's answer into your
1021  * output arguments (class set to CL_MAX if the user cancels), and destroy the
1022  * window.  Implemented because BAlert only allows 3 buttons, max!
1023  */
1024 
1025 class ClassificationChoicesWindow : public BWindow
1026 {
1027 public:
1028   /* Constructor and destructor. */
1029   ClassificationChoicesWindow (BRect FrameRect,
1030     const char *FileName, int NumberOfFiles);
1031 
1032   /* BeOS virtual functions. */
1033   virtual void MessageReceived (BMessage *MessagePntr);
1034 
1035   /* Our methods. */
1036   void Go (bool *BulkModeSelectedPntr,
1037     ClassificationTypes *ChoosenClassificationPntr);
1038 
1039   /* Various message codes for various buttons etc. */
1040   static const uint32 MSG_CLASS_BUTTONS = 'ClB0';
1041   static const uint32 MSG_CANCEL_BUTTON = 'Cncl';
1042   static const uint32 MSG_BULK_CHECKBOX = 'BlkK';
1043 
1044 private:
1045   /* Member variables. */
1046   bool *m_BulkModeSelectedPntr;
1047   ClassificationTypes *m_ChoosenClassificationPntr;
1048 };
1049 
1050 class ClassificationChoicesView : public BView
1051 {
1052 public:
1053   /* Constructor and destructor. */
1054   ClassificationChoicesView (BRect FrameRect,
1055     const char *FileName, int NumberOfFiles);
1056 
1057   /* BeOS virtual functions. */
1058   virtual void AttachedToWindow ();
1059   virtual void GetPreferredSize (float *width, float *height);
1060 
1061 private:
1062   /* Member variables. */
1063   const char *m_FileName;
1064   int         m_NumberOfFiles;
1065   float       m_PreferredBottomY;
1066 };
1067 
1068 
1069 
1070 /******************************************************************************
1071  * Due to deadlock problems with the BApplication posting scripting messages to
1072  * itself, we need to add a second Looper.  Its job is to just to convert
1073  * command line arguments and arguments from the Tracker (refs received) into a
1074  * series of scripting commands sent to the main BApplication.  It also prints
1075  * out the replies received (to stdout for command line replies).  An instance
1076  * of this class will be created and run by the main() function, and shut down
1077  * by it too.
1078  */
1079 
1080 class CommanderLooper : public BLooper
1081 {
1082 public:
1083   CommanderLooper ();
1084   ~CommanderLooper ();
1085   virtual void MessageReceived (BMessage *MessagePntr);
1086 
1087   void CommandArguments (int argc, char **argv);
1088   void CommandReferences (BMessage *MessagePntr,
1089     bool BulkMode = false,
1090     ClassificationTypes BulkClassification = CL_GENUINE);
1091   bool IsBusy ();
1092 
1093 private:
1094   void ProcessArgs (BMessage *MessagePntr);
1095   void ProcessRefs (BMessage *MessagePntr);
1096 
1097   static const uint32 MSG_COMMAND_ARGUMENTS = 'CArg';
1098   static const uint32 MSG_COMMAND_FILE_REFS = 'CRef';
1099 
1100   bool m_IsBusy;
1101 };
1102 
1103 
1104 
1105 /******************************************************************************
1106  * This view contains the various buttons and other controls for setting
1107  * configuration options and displaying the state of the database (but not the
1108  * actual list of words).  It will appear in the top half of the
1109  * DatabaseWindow.
1110  */
1111 
1112 class ControlsView : public BView
1113 {
1114 public:
1115   /* Constructor and destructor. */
1116   ControlsView (BRect NewBounds);
1117   ~ControlsView ();
1118 
1119   /* BeOS virtual functions. */
1120   virtual void AttachedToWindow ();
1121   virtual void FrameResized (float Width, float Height);
1122   virtual void MessageReceived (BMessage *MessagePntr);
1123   virtual void Pulse ();
1124 
1125 private:
1126   /* Various message codes for various buttons etc. */
1127   static const uint32 MSG_BROWSE_BUTTON = 'Brws';
1128   static const uint32 MSG_DATABASE_NAME = 'DbNm';
1129   static const uint32 MSG_ESTIMATE_BUTTON = 'Estm';
1130   static const uint32 MSG_ESTIMATE_FILE_REFS = 'ERef';
1131   static const uint32 MSG_IGNORE_CLASSIFICATION = 'IPCl';
1132   static const uint32 MSG_PURGE_AGE = 'PuAg';
1133   static const uint32 MSG_PURGE_BUTTON = 'Purg';
1134   static const uint32 MSG_PURGE_POPULARITY = 'PuPo';
1135   static const uint32 MSG_SERVER_MODE = 'SrvM';
1136 
1137   /* Our member functions. */
1138   void BrowseForDatabaseFile ();
1139   void BrowseForFileToEstimate ();
1140   void PollServerForChanges ();
1141 
1142   /* Member variables. */
1143   BButton        *m_AboutButtonPntr;
1144   BButton        *m_AddExampleButtonPntr;
1145   BButton        *m_BrowseButtonPntr;
1146   BFilePanel     *m_BrowseFilePanelPntr;
1147   BButton        *m_CreateDatabaseButtonPntr;
1148   char            m_DatabaseFileNameCachedValue [PATH_MAX];
1149   BTextControl   *m_DatabaseFileNameTextboxPntr;
1150   bool            m_DatabaseLoadDone;
1151   BButton        *m_EstimateSpamButtonPntr;
1152   BFilePanel     *m_EstimateSpamFilePanelPntr;
1153   uint32          m_GenuineCountCachedValue;
1154   BTextControl   *m_GenuineCountTextboxPntr;
1155   bool            m_IgnorePreviousClassCachedValue;
1156   BCheckBox      *m_IgnorePreviousClassCheckboxPntr;
1157   BButton        *m_InstallThingsButtonPntr;
1158   uint32          m_PurgeAgeCachedValue;
1159   BTextControl   *m_PurgeAgeTextboxPntr;
1160   BButton        *m_PurgeButtonPntr;
1161   uint32          m_PurgePopularityCachedValue;
1162   BTextControl   *m_PurgePopularityTextboxPntr;
1163   BButton        *m_ResetToDefaultsButtonPntr;
1164   ScoringModes    m_ScoringModeCachedValue;
1165   BMenuBar       *m_ScoringModeMenuBarPntr;
1166   BPopUpMenu     *m_ScoringModePopUpMenuPntr;
1167   bool            m_ServerModeCachedValue;
1168   BCheckBox      *m_ServerModeCheckboxPntr;
1169   uint32          m_SpamCountCachedValue;
1170   BTextControl   *m_SpamCountTextboxPntr;
1171   bigtime_t       m_TimeOfLastPoll;
1172   TokenizeModes   m_TokenizeModeCachedValue;
1173   BMenuBar       *m_TokenizeModeMenuBarPntr;
1174   BPopUpMenu     *m_TokenizeModePopUpMenuPntr;
1175   uint32          m_WordCountCachedValue;
1176   BTextControl   *m_WordCountTextboxPntr;
1177 };
1178 
1179 
1180 /* Various message codes for various buttons etc. */
1181 static const uint32 MSG_LINE_DOWN = 'LnDn';
1182 static const uint32 MSG_LINE_UP = 'LnUp';
1183 static const uint32 MSG_PAGE_DOWN = 'PgDn';
1184 static const uint32 MSG_PAGE_UP = 'PgUp';
1185 
1186 /******************************************************************************
1187  * This view contains the list of words.  It displays as many as can fit in the
1188  * view rectangle, starting at a specified word (so it can simulate scrolling).
1189  * Usually it will appear in the bottom half of the DatabaseWindow.
1190  */
1191 
1192 class WordsView : public BView
1193 {
1194 public:
1195   /* Constructor and destructor. */
1196   WordsView (BRect NewBounds);
1197 
1198   /* BeOS virtual functions. */
1199   virtual void AttachedToWindow ();
1200   virtual void Draw (BRect UpdateRect);
1201   virtual void KeyDown (const char *BufferPntr, int32 NumBytes);
1202   virtual void MakeFocus (bool Focused);
1203   virtual void MessageReceived (BMessage *MessagePntr);
1204   virtual void MouseDown (BPoint point);
1205   virtual void Pulse ();
1206 
1207 private:
1208   /* Our member functions. */
1209   void MoveTextUpOrDown (uint32 MovementType);
1210   void RefsDroppedHere (BMessage *MessagePntr);
1211 
1212   /* Member variables. */
1213   BPictureButton *m_ArrowLineDownPntr;
1214   BPictureButton *m_ArrowLineUpPntr;
1215   BPictureButton *m_ArrowPageDownPntr;
1216   BPictureButton *m_ArrowPageUpPntr;
1217     /* Various buttons for controlling scrolling, since we can't use a scroll
1218     bar.  To make them less obvious, their background view colour needs to be
1219     changed whenever the main view's colour changes. */
1220 
1221   float m_AscentHeight;
1222     /* The ascent height for the font used to draw words.  Height from the top
1223     of the highest letter to the base line (which is near the middle bottom of
1224     the letters, the line where you would align your writing of the text by
1225     hand, all letters have part above, some also have descenders below this
1226     line). */
1227 
1228   rgb_color m_BackgroundColour;
1229     /* The current background colour.  Changes when the focus changes. */
1230 
1231   uint32 m_CachedTotalGenuineMessages;
1232   uint32 m_CachedTotalSpamMessages;
1233   uint32 m_CachedWordCount;
1234     /* These are cached copies of the similar values in the BApplication.  They
1235     reflect what's currently displayed.  If they are different than the values
1236     from the BApplication then the polling loop will try to redraw the display.
1237     They get set to the values actually used during drawing when drawing is
1238     successful. */
1239 
1240   char m_FirstDisplayedWord [g_MaxWordLength + 1];
1241     /* The scrolling display starts at this word.  Since we can't use index
1242     numbers (word[12345] for example), we use the word itself.  The scroll
1243     buttons set this to the next or previous word in the database.  Typing by
1244     the user when the view has the focus will also change this starting word.
1245     */
1246 
1247   rgb_color m_FocusedColour;
1248     /* The colour to use for focused mode (typing by the user is received by
1249     our view). */
1250 
1251   bigtime_t m_LastTimeAKeyWasPressed;
1252     /* Records the time when a key was last pressed.  Used for determining when
1253     the user has stopped typing a batch of letters. */
1254 
1255   float m_LineHeight;
1256     /* Height of a line of text in the font used for the word display.
1257     Includes the height of the letters plus a bit of extra space for between
1258     the lines (called leading). */
1259 
1260   BFont m_TextFont;
1261     /* The font used to draw the text in the window. */
1262 
1263   float m_TextHeight;
1264     /* Maximum total height of the letters in the text, includes the part above
1265     the baseline and the part below.  Doesn't include the sliver of space
1266     between lines. */
1267 
1268   rgb_color m_UnfocusedColour;
1269     /* The colour to use for unfocused mode, when user typing isn't active. */
1270 };
1271 
1272 
1273 
1274 /******************************************************************************
1275  * The BWindow class for this program.  It displays the database in real time,
1276  * and has various buttons and gadgets in the top half for changing settings
1277  * (live changes, no OK button, and they reflect changes done by other programs
1278  * using the server too).  The bottom half is a scrolling view listing all the
1279  * words in the database.  A simple graphic blotch behind each word shows
1280  * whether the word is strongly or weakly related to spam or genuine messages.
1281  * Most operations go through the scripting message system, but it also peeks
1282  * at the BApplication data for examining simple things and when redrawing the
1283  * list of words.
1284  */
1285 
1286 class DatabaseWindow : public BWindow
1287 {
1288 public:
1289   /* Constructor and destructor. */
1290   DatabaseWindow ();
1291 
1292   /* BeOS virtual functions. */
1293   virtual void MessageReceived (BMessage *MessagePntr);
1294   virtual bool QuitRequested ();
1295 
1296 private:
1297   /* Member variables. */
1298   ControlsView *m_ControlsViewPntr;
1299   WordsView    *m_WordsViewPntr;
1300 };
1301 
1302 
1303 
1304 /******************************************************************************
1305  * ABSApp is the BApplication class for this program.  This handles messages
1306  * from the outside world (requests to load a database, or to add files to the
1307  * collection).  It responds to command line arguments (if you start up the
1308  * program a second time, the system will just send the arguments to the
1309  * existing running program).  It responds to scripting messages.  And it
1310  * responds to messages from the window.  Its thread does the main work of
1311  * updating the database and reading / writing files.
1312  */
1313 
1314 class ABSApp : public BApplication
1315 {
1316 public:
1317   /* Constructor and destructor. */
1318   ABSApp ();
1319   ~ABSApp ();
1320 
1321   /* BeOS virtual functions. */
1322   virtual void AboutRequested ();
1323   virtual void ArgvReceived (int32 argc, char **argv);
1324   virtual status_t GetSupportedSuites (BMessage *MessagePntr);
1325   virtual void MessageReceived (BMessage *MessagePntr);
1326   virtual void Pulse ();
1327   virtual bool QuitRequested ();
1328   virtual void ReadyToRun ();
1329   virtual void RefsReceived (BMessage *MessagePntr);
1330   virtual BHandler *ResolveSpecifier (BMessage *MessagePntr, int32 Index,
1331     BMessage *SpecifierMsgPntr, int32 SpecificationKind, const char *Property);
1332 
1333 private:
1334   /* Our member functions. */
1335   status_t AddFileToDatabase (ClassificationTypes IsSpamOrWhat,
1336     const char *FileName, char *ErrorMessage);
1337   status_t AddPositionIOToDatabase (ClassificationTypes IsSpamOrWhat,
1338     BPositionIO *MessageIOPntr, const char *OptionalFileName,
1339     char *ErrorMessage);
1340   status_t AddStringToDatabase (ClassificationTypes IsSpamOrWhat,
1341     const char *String, char *ErrorMessage);
1342   void AddWordsToSet (const char *InputString, size_t NumberOfBytes,
1343     char PrefixCharacter, set<string> &WordSet);
1344   status_t CreateDatabaseFile (char *ErrorMessage);
1345   void DefaultSettings ();
1346   status_t DeleteDatabaseFile (char *ErrorMessage);
1347   status_t EvaluateFile (const char *PathName, BMessage *ReplyMessagePntr,
1348     char *ErrorMessage);
1349   status_t EvaluatePositionIO (BPositionIO *PositionIOPntr,
1350     const char *OptionalFileName, BMessage *ReplyMessagePntr,
1351     char *ErrorMessage);
1352   status_t EvaluateString (const char *BufferPntr, ssize_t BufferSize,
1353     BMessage *ReplyMessagePntr, char *ErrorMessage);
1354   status_t GetWordsFromPositionIO (BPositionIO *PositionIOPntr,
1355     const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
1356   status_t InstallThings (char *ErrorMessage);
1357   status_t LoadDatabaseIfNeeded (char *ErrorMessage);
1358   status_t LoadSaveDatabase (bool DoLoad, char *ErrorMessage);
1359 public:
1360   status_t LoadSaveSettings (bool DoLoad);
1361 private:
1362   status_t MakeBackup (char *ErrorMessage);
1363   void MakeDatabaseEmpty ();
1364   void ProcessScriptingMessage (BMessage *MessagePntr,
1365     struct property_info *PropInfoPntr);
1366   status_t PurgeOldWords (char *ErrorMessage);
1367   status_t RecursivelyTokenizeMailComponent (
1368     BMailComponent *ComponentPntr, const char *OptionalFileName,
1369     set<string> &WordSet, char *ErrorMessage,
1370     int RecursionLevel, int MaxRecursionLevel);
1371   status_t SaveDatabaseIfNeeded (char *ErrorMessage);
1372   status_t TokenizeParts (BPositionIO *PositionIOPntr,
1373     const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
1374   status_t TokenizeWhole (BPositionIO *PositionIOPntr,
1375     const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
1376 
1377 public:
  /* Member variables.  Many are read by the window thread to see if it needs
  updating, and to draw the words.  However, the other threads will lock the
  BApplication or use scripting commands if they want to make changes. */
1381 
1382   bool m_DatabaseHasChanged;
1383     /* Set to TRUE when the in-memory database (stored in m_WordMap) has
1384     changed and is different from the on-disk database file.  When the
1385     application exits, the database will be written out if it has changed. */
1386 
1387   BString m_DatabaseFileName;
1388     /* The absolute path name to use for the database file on disk. */
1389 
1390   bool m_IgnorePreviousClassification;
1391     /* If TRUE then the previous classification of a message (stored in an
1392     attribute on the message file) will be ignored, and the message will be
1393     added to the requested spam/genuine list.  If this is FALSE then the spam
1394     won't be added to the list if it has already been classified as specified,
1395     but if it was mis-classified, it will be removed from the old list and
1396     added to the new list. */
1397 
1398   uint32 m_OldestAge;
1399     /* The age of the oldest word.  This will be the smallest age number in the
1400     database.  Mostly useful for scaling graphics representing age in the word
1401     display.  If the oldest word is no longer the oldest, this variable won't
1402     get immediately updated since it would take a lot of effort to find the
1403     next older age.  Since it's only used for display, we'll let it be slightly
1404     incorrect.  The next database load or purge will fix it. */
1405 
1406   uint32 m_PurgeAge;
1407     /* When purging old words, they have to be at least this old to be eligible
1408     for deletion.  Age is measured as the number of e-mails added to the
1409     database since the word was last updated in the database.  Zero means all
1410     words are old. */
1411 
1412   uint32 m_PurgePopularity;
1413     /* When purging old words, they have to be less than or equal to this
1414     popularity limit to be eligible for deletion.  Popularity is measured as
1415     the number of messages (spam and genuine) which have the word.  Zero means
1416     no words. */
1417 
1418   ScoringModes m_ScoringMode;
1419     /* Controls how to combine the word probabilities into an overall score.
1420     See the PN_SCORING_MODE comments for details. */
1421 
1422   BPath m_SettingsDirectoryPath;
1423     /* The constructor initialises this to the settings directory path.  It
1424     never changes after that. */
1425 
1426   bool m_SettingsHaveChanged;
1427     /* Set to TRUE when the settings are changed (different than the ones which
1428     were loaded).  When the application exits, the settings will be written out
1429     if they have changed. */
1430 
1431   double m_SmallestUseableDouble;
1432     /* When multiplying fractional numbers together, avoid using numbers
1433     smaller than this because the double exponent range is close to being
1434     exhausted.  The IEEE STANDARD 754 floating-point arithmetic (used on the
1435     Intel i8087 and later math processors) has 64 bit numbers with 53 bits of
1436     mantissa, giving it an underflow starting at 0.5**1022 = 2.2e-308 where it
1437     rounds off to the nearest multiple of 0.5**1074 = 4.9e-324. */
1438 
1439   TokenizeModes m_TokenizeMode;
1440     /* Controls how to convert the raw message text into words.  See the
1441     PN_TOKENIZE_MODE comments for details. */
1442 
1443   uint32 m_TotalGenuineMessages;
1444     /* Number of genuine messages which are in the database. */
1445 
1446   uint32 m_TotalSpamMessages;
1447     /* Number of spam messages which are in the database. */
1448 
1449   uint32 m_WordCount;
1450     /* The number of words currently in the database.  Stored separately as a
1451     member variable to avoid having to call m_WordMap.size() all the time,
1452     which other threads can't do while the database is being updated (but they
1453     can look at the word count variable). */
1454 
1455   StatisticsMap m_WordMap;
1456     /* The in-memory data structure holding the set of words and their
1457     associated statistics.  When the database isn't in use, it is an empty
1458     collection.  You should lock the BApplication if you are using the word
1459     collection (reading or writing) from another thread. */
1460 };
1461 
1462 
1463 
1464 /******************************************************************************
1465  * Global utility function to display an error message and return.  The message
1466  * part describes the error, and if ErrorNumber is non-zero, gets the string
1467  * ", error code $X (standard description)." appended to it.  If the message
1468  * is NULL then it gets defaulted to "Something went wrong".  The title part
1469  * doesn't get displayed (no title bar in the dialog box, but you can see it in
1470  * the debugger as the window thread name), and defaults to "Error Message" if
1471  * you didn't specify one.  If running in command line mode, the error gets
1472  * printed to stderr rather than showing up in a dialog box.
1473  */
1474 
1475 static void
1476 DisplayErrorMessage (
1477   const char *MessageString = NULL,
1478   int ErrorNumber = 0,
1479   const char *TitleString = NULL)
1480 {
1481   BAlert *AlertPntr;
1482   char ErrorBuffer [PATH_MAX + 1500];
1483 
1484   if (TitleString == NULL)
1485     TitleString = "SpamDBM Error Message";
1486 
1487   if (MessageString == NULL)
1488   {
1489     if (ErrorNumber == 0)
1490       MessageString = "No error, no message, why bother?";
1491     else
1492       MessageString = "Something went wrong";
1493   }
1494 
1495   if (ErrorNumber != 0)
1496   {
1497     sprintf (ErrorBuffer, "%s, error code $%X/%d (%s) has occured.",
1498       MessageString, ErrorNumber, ErrorNumber, strerror (ErrorNumber));
1499     MessageString = ErrorBuffer;
1500   }
1501 
1502   if (g_CommandLineMode || g_ServerMode)
1503     cerr << TitleString << ": " << MessageString << endl;
1504   else
1505   {
1506     AlertPntr = new BAlert (TitleString, MessageString,
1507       "Acknowledge", NULL, NULL, B_WIDTH_AS_USUAL, B_STOP_ALERT);
1508     if (AlertPntr != NULL) {
1509       AlertPntr->SetFlags(AlertPntr->Flags() | B_CLOSE_ON_ESCAPE);
1510       AlertPntr->Go ();
1511     }
1512   }
1513 }
1514 
1515 
1516 
/******************************************************************************
 * Word wrap a long line of text into shorter lines of at most 79 columns and
 * print the result on the given output stream, with an endl after each line.
 * White space at the start of each output line is skipped.  Lines are broken
 * at the last white space which fits; a run of 79 or more characters with no
 * white space in it is just chopped into 79 character pieces.  Text which
 * fits on one line is printed unchanged (including any white space inside or
 * after it).  Nothing is printed for an empty or all white space string.
 */

static void
WrapTextToStream (ostream& OutputStream, const char *TextPntr)
{
  const int LineLength = 79;
  char     *StringPntr;
  char      TempString [LineLength+1];

  TempString[LineLength] = 0; /* Only needs to be done once. */

  while (*TextPntr != 0)
  {
    /* Skip leading spaces.  The ctype functions are only defined for
    unsigned char values (and EOF), so cast in case plain char is signed and
    the text contains UTF-8 bytes with the high bit set. */

    while (isspace ((unsigned char) *TextPntr))
      TextPntr++;
    if (*TextPntr == 0)
      break; /* It was all spaces, don't print any more. */

    /* Grab up to one line's worth of text.  If the text is shorter, strncpy
    pads the rest with NULs, otherwise the NUL stored at the end of the
    buffer before the loop terminates it. */

    strncpy (TempString, TextPntr, LineLength);

    /* Advance StringPntr to the end of the temp string, partly to see how long
    it is (rather than doing strlen). */

    StringPntr = TempString;
    while (*StringPntr != 0)
      StringPntr++;

    if (StringPntr - TempString < LineLength)
    {
      /* This line fits completely. */
      OutputStream << TempString << endl;
      TextPntr += StringPntr - TempString;
      continue;
    }

    /* We have a full line's worth of text.  If the character right after it
    is the end of the text or white space, then the chunk ends exactly on a
    word boundary and can be used whole.  Otherwise a word got cut in half,
    so back up StringPntr to the last white space in the temp string. */

    if (TextPntr[LineLength] != 0 &&
    !isspace ((unsigned char) TextPntr[LineLength]))
    {
      while (StringPntr > TempString &&
      !isspace ((unsigned char) *StringPntr))
        StringPntr--;
    }

    /* Remove more trailing spaces at the end of the line, in case there were
    several spaces in a row. */

    while (StringPntr > TempString &&
    isspace ((unsigned char) StringPntr[-1]))
      StringPntr--;

    /* Print the line of text and advance the text pointer too. */

    if (StringPntr == TempString)
    {
      /* This line has no spaces, don't wrap it, just split off a chunk. */
      OutputStream << TempString << endl;
      TextPntr += strlen (TempString);
      continue;
    }

    *StringPntr = 0; /* Cut off just after the last word which fits. */
    OutputStream << TempString << endl;
    TextPntr += StringPntr - TempString;
  }
}
1586 
1587 
1588 
1589 /******************************************************************************
1590  * Print the usage info to the stream.  Includes a list of all commands.
1591  */
1592 ostream& PrintUsage (ostream& OutputStream);
1593 
1594 ostream& PrintUsage (ostream& OutputStream)
1595 {
1596   struct property_info *PropInfoPntr;
1597 
1598   OutputStream << "\nSpamDBM - A Spam Database Manager\n";
1599   OutputStream << "Copyright © 2002 by Alexander G. M. Smith.  ";
1600   OutputStream << "Released to the public domain.\n\n";
1601   WrapTextToStream (OutputStream, "Compiled on " __DATE__ " at " __TIME__
1602 ".  $Id: spamdbm.cpp 30630 2009-05-05 01:31:01Z bga $  $HeadURL: http://svn.haiku-os.org/haiku/haiku/trunk/src/bin/mail_utils/spamdbm.cpp $");
1603   OutputStream << "\n"
1604 "This is a program for classifying e-mail messages as spam (junk mail which\n"
1605 "you don't want to read) and regular genuine messages.  It can learn what's\n"
1606 "spam and what's genuine.  You just give it a bunch of spam messages and a\n"
1607 "bunch of non-spam ones.  It uses them to make a list of the words from the\n"
1608 "messages with the probability that each word is from a spam message or from\n"
1609 "a genuine message.  Later on, it can use those probabilities to classify\n"
1610 "new messages as spam or not spam.  If the classifier stops working well\n"
1611 "(because the spammers have changed their writing style and vocabulary, or\n"
1612 "your regular correspondants are writing like spammers), you can use this\n"
1613 "program to update the list of words to identify the new messages\n"
1614 "correctly.\n"
1615 "\n"
1616 "The original idea was from Paul Graham's algorithm, which has an excellent\n"
1617 "writeup at: http://www.paulgraham.com/spam.html\n"
1618 "\n"
1619 "Gary Robinson came up with the improved algorithm, which you can read about at:\n"
1620 "http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html\n"
1621 "\n"
1622 "Then he, Tim Peters and the SpamBayes mailing list developed the Chi-Squared\n"
1623 "test, see http://mail.python.org/pipermail/spambayes/2002-October/001036.html\n"
1624 "for one of the earlier messages leading from the central limit theorem to\n"
1625 "the current chi-squared scoring method.\n"
1626 "\n"
1627 "Thanks go to Isaac Yonemoto for providing a better icon, which we can\n"
1628 "unfortunately no longer use, since the Hormel company wants people to\n"
1629 "avoid associating their meat product with junk e-mail.\n"
1630 "\n"
1631 "Tokenising code updated in 2005 to use some of the tricks that SpamBayes\n"
1632 "uses to extract words from messages.  In particular, HTML is now handled.\n"
1633 "\n"
1634 "Usage: Specify the operation as the first argument followed by more\n"
1635 "information as appropriate.  The program's configuration will affect the\n"
1636 "actual operation (things like the name of the database file to use, or\n"
1637 "whether it should allow non-email messages to be added).  In command line\n"
1638 "mode it will do the operation and exit.  In GUI/server mode a command line\n"
1639 "invocation will just send the command to the running server.  You can also\n"
1640 "use BeOS scripting (see the \"Hey\" command which you can get from\n"
1641 "http://www.bebits.com/app/2042 ) to control the Spam server.  And finally,\n"
1642 "there's also a GUI interface which shows up if you start it without any\n"
1643 "command line arguments.\n"
1644 "\n"
1645 "Commands:\n"
1646 "\n"
1647 "Quit\n"
1648 "Stop the program.  Useful if it's running as a server.\n"
1649 "\n";
1650 
1651   /* Go through all our scripting commands and add a description of each one to
1652   the usage text. */
1653 
1654   for (PropInfoPntr = g_ScriptingPropertyList + 0;
1655   PropInfoPntr->name != 0;
1656   PropInfoPntr++)
1657   {
1658     switch (PropInfoPntr->commands[0])
1659     {
1660       case B_GET_PROPERTY:
1661         OutputStream << "Get " << PropInfoPntr->name << endl;
1662         break;
1663 
1664       case B_SET_PROPERTY:
1665         OutputStream << "Set " << PropInfoPntr->name << " NewValue" << endl;
1666         break;
1667 
1668       case B_COUNT_PROPERTIES:
1669         OutputStream << "Count " << PropInfoPntr->name << endl;
1670         break;
1671 
1672       case B_CREATE_PROPERTY:
1673         OutputStream << "Create " << PropInfoPntr->name << endl;
1674         break;
1675 
1676       case B_DELETE_PROPERTY:
1677         OutputStream << "Delete " << PropInfoPntr->name << endl;
1678         break;
1679 
1680       case B_EXECUTE_PROPERTY:
1681         OutputStream << PropInfoPntr->name << endl;
1682         break;
1683 
1684       default:
1685         OutputStream << "Buggy Command: " << PropInfoPntr->name << endl;
1686         break;
1687     }
1688     WrapTextToStream (OutputStream, (char *)PropInfoPntr->usage);
1689     OutputStream << endl;
1690   }
1691 
1692   return OutputStream;
1693 }
1694 
1695 
1696 
1697 /******************************************************************************
1698  * A utility function to send a command to the application, will return after a
1699  * short delay if the application is busy (doesn't wait for it to be executed).
1700  * The reply from the application is also thrown away.  It used to be an
1701  * overloaded function, but the system couldn't distinguish between bool and
1702  * int, so now it has slightly different names depending on the arguments.
1703  */
1704 
1705 static void
1706 SubmitCommand (BMessage& CommandMessage)
1707 {
1708   status_t ErrorCode;
1709 
1710   ErrorCode = be_app_messenger.SendMessage (&CommandMessage,
1711     be_app_messenger /* reply messenger, throw away the reply */,
1712     1000000 /* delivery timeout */);
1713 
1714   if (ErrorCode != B_OK)
1715     cerr << "SubmitCommand failed to send a command, code " <<
1716     ErrorCode << " (" << strerror (ErrorCode) << ")." << endl;
1717 }
1718 
1719 
1720 static void
1721 SubmitCommandString (
1722   PropertyNumbers Property,
1723   uint32 CommandCode,
1724   const char *StringArgument = NULL)
1725 {
1726   BMessage CommandMessage (CommandCode);
1727 
1728   if (Property < 0 || Property >= PN_MAX)
1729   {
1730     DisplayErrorMessage ("SubmitCommandString bug.");
1731     return;
1732   }
1733   CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1734   if (StringArgument != NULL)
1735     CommandMessage.AddString (g_DataName, StringArgument);
1736   SubmitCommand (CommandMessage);
1737 }
1738 
1739 
1740 static void
1741 SubmitCommandInt32 (
1742   PropertyNumbers Property,
1743   uint32 CommandCode,
1744   int32 Int32Argument)
1745 {
1746   BMessage CommandMessage (CommandCode);
1747 
1748   if (Property < 0 || Property >= PN_MAX)
1749   {
1750     DisplayErrorMessage ("SubmitCommandInt32 bug.");
1751     return;
1752   }
1753   CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1754   CommandMessage.AddInt32 (g_DataName, Int32Argument);
1755   SubmitCommand (CommandMessage);
1756 }
1757 
1758 
1759 static void
1760 SubmitCommandBool (
1761   PropertyNumbers Property,
1762   uint32 CommandCode,
1763   bool BoolArgument)
1764 {
1765   BMessage CommandMessage (CommandCode);
1766 
1767   if (Property < 0 || Property >= PN_MAX)
1768   {
1769     DisplayErrorMessage ("SubmitCommandBool bug.");
1770     return;
1771   }
1772   CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1773   CommandMessage.AddBool (g_DataName, BoolArgument);
1774   SubmitCommand (CommandMessage);
1775 }
1776 
1777 
1778 
1779 /******************************************************************************
1780  * A utility function which will estimate the spaminess of file(s), not
1781  * callable from the application thread since it sends a scripting command to
1782  * the application and waits for results.  For each file there will be an entry
1783  * reference in the message.  For each of those, run it through the spam
1784  * estimator and display a box with the results.  This function is used both by
1785  * the file requestor and by dragging and dropping into the middle of the words
1786  * view.
1787  */
1788 
1789 static void
1790 EstimateRefFilesAndDisplay (BMessage *MessagePntr)
1791 {
1792   BAlert     *AlertPntr;
1793   BEntry      Entry;
1794   entry_ref   EntryRef;
1795   status_t    ErrorCode;
1796   int         i, j;
1797   BPath       Path;
1798   BMessage    ReplyMessage;
1799   BMessage    ScriptingMessage;
1800   const char *StringPntr;
1801   float       TempFloat;
1802   int32       TempInt32;
1803   char        TempString [PATH_MAX + 1024 +
1804                 g_MaxInterestingWords * (g_MaxWordLength + 16)];
1805 
1806   for (i = 0; MessagePntr->FindRef ("refs", i, &EntryRef) == B_OK; i++)
1807   {
1808     /* See if the entry is a valid file or directory or other thing. */
1809 
1810     ErrorCode = Entry.SetTo (&EntryRef, true /* traverse symbolic links */);
1811     if (ErrorCode != B_OK || !Entry.Exists () || Entry.GetPath (&Path) != B_OK)
1812       continue;
1813 
1814     /* Evaluate the spaminess of the file. */
1815 
1816     ScriptingMessage.MakeEmpty ();
1817     ScriptingMessage.what = B_SET_PROPERTY;
1818     ScriptingMessage.AddSpecifier (g_PropertyNames[PN_EVALUATE]);
1819     ScriptingMessage.AddString (g_DataName, Path.Path ());
1820 
1821     if (be_app_messenger.SendMessage (&ScriptingMessage,&ReplyMessage) != B_OK)
1822       break; /* App has died or something is wrong. */
1823 
1824     if (ReplyMessage.FindInt32 ("error", &TempInt32) != B_OK ||
1825     TempInt32 != B_OK)
1826       break; /* Error messages will be displayed elsewhere. */
1827 
1828     ReplyMessage.FindFloat (g_ResultName, &TempFloat);
1829     sprintf (TempString, "%f spam ratio for \"%s\".\nThe top words are:",
1830       (double) TempFloat, Path.Path ());
1831 
1832     for (j = 0; j < 20 /* Don't print too many! */; j++)
1833     {
1834       if (ReplyMessage.FindString ("words", j, &StringPntr) != B_OK ||
1835       ReplyMessage.FindFloat ("ratios", j, &TempFloat) != B_OK)
1836         break;
1837 
1838       sprintf (TempString + strlen (TempString), "\n%s / %f",
1839         StringPntr, TempFloat);
1840     }
1841     if (j >= 20 && j < g_MaxInterestingWords)
1842       sprintf (TempString + strlen (TempString), "\nAnd up to %d more words.",
1843         g_MaxInterestingWords - j);
1844 
1845     AlertPntr = new BAlert ("Estimate", TempString, "OK");
1846     if (AlertPntr != NULL) {
1847       AlertPntr->SetFlags(AlertPntr->Flags() | B_CLOSE_ON_ESCAPE);
1848       AlertPntr->Go ();
1849     }
1850   }
1851 }
1852 
1853 
1854 
/******************************************************************************
 * A utility function from the http://sourceforge.net/projects/spambayes
 * SpamBayes project.  Return prob(chisq >= x2, with v degrees of freedom),
 * the probability that a chi-squared value (a kind of normalized error
 * measurement) with v degrees of freedom would be larger than the given
 * number x2 (chi is the Greek letter X, thus x2).  A result near zero means
 * that your measured error number is unusually large (actual chi-squared is
 * rarely above it merely due to random effects), a larger result means that
 * chi-squared goes over your number fairly often by chance alone.  v must be
 * even for this calculation to work; -1.0 is returned if it is odd.  The
 * result is otherwise never more than 1.0.
 */

static double ChiSquaredProbability (double x2, int v)
{
  if ((v & 1) != 0)
    return -1.0; /* Out of range return value as a hint v is odd. */

  const int    NumberOfTerms = v / 2;
  const double HalfX2 = x2 / 2.0;

  /* The result is exp(-m) * (1 + m + m^2/2! + ... + m^(v/2-1)/(v/2-1)!) with
  m = x2/2, with each term derived from the previous one.  If x2 is very
  large, exp(-m) will underflow to 0. */

  double CurrentTerm = exp (-HalfX2);
  double RunningSum = CurrentTerm;

  for (int TermIndex = 1; TermIndex < NumberOfTerms; TermIndex++)
  {
    CurrentTerm *= HalfX2 / TermIndex;
    RunningSum += CurrentTerm;
  }

  /* With small x2 and large v, accumulated roundoff error, plus error in the
  platform exp(), can cause the sum to spill a few ULP above 1.0.  Returning a
  value even a teensy bit over 1.0 is no good, so clamp it. */

  return (RunningSum > 1.0) ? 1.0 : RunningSum;
}
1898 
1899 
1900 
1901 /******************************************************************************
1902  * A utility function to remove the "[Spam 99.9%] " from in front of the
1903  * MAIL:subject attribute of a file.
1904  */
1905 
1906 static status_t RemoveSpamPrefixFromSubjectAttribute (BNode *BNodePntr)
1907 {
1908   status_t    ErrorCode;
1909   const char *MailSubjectName = "MAIL:subject";
1910   char       *StringPntr;
1911   char        SubjectString [2000];
1912 
1913   ErrorCode = BNodePntr->ReadAttr (MailSubjectName,
1914     B_STRING_TYPE, 0 /* offset */, SubjectString,
1915     sizeof (SubjectString) - 1);
1916   if (ErrorCode <= 0)
1917     return 0; /* The attribute isn't there so we don't care. */
1918   if (ErrorCode >= (int) sizeof (SubjectString) - 1)
1919     return 0; /* Can't handle subjects which are too long. */
1920 
1921   SubjectString [ErrorCode] = 0;
1922   ErrorCode = 0; /* So do-nothing exit returns zero. */
1923   if (strncmp (SubjectString, "[Spam ", 6) == 0)
1924   {
1925     for (StringPntr = SubjectString;
1926     *StringPntr != 0 && *StringPntr != ']'; StringPntr++)
1927       ; /* No body in this for loop. */
1928     if (StringPntr[0] == ']' && StringPntr[1] == ' ')
1929     {
1930       ErrorCode = BNodePntr->RemoveAttr (MailSubjectName);
1931       ErrorCode = BNodePntr->WriteAttr (MailSubjectName,
1932         B_STRING_TYPE, 0 /* offset */,
1933         StringPntr + 2, strlen (StringPntr + 2) + 1);
1934       if (ErrorCode > 0)
1935         ErrorCode = 0;
1936     }
1937   }
1938 
1939   return ErrorCode;
1940 }
1941 
1942 
1943 
1944 /******************************************************************************
1945  * The tokenizing functions.  To make tokenization of the text easier to
1946  * understand, it is broken up into several passes.  Each pass goes over the
1947  * text (can include NUL bytes) and extracts all the words it can recognise
1948  * (can be none).  The extracted words are added to the WordSet, with the
1949  * PrefixCharacter prepended (zero if none) so we can distinguish between words
1950  * found in headers and in the text body.  It also modifies the input text
1951  * buffer in-place to change the text that the next pass will see (blanking out
1952  * words that it wants to delete, but not inserting much new text since the
1953  * buffer can't be enlarged).  They all return the number of bytes remaining in
1954  * InputString after it has been modified to be input for the next pass.
1955  * Returns zero if it has exhausted the possibility of getting more words, or
1956  * if something goes wrong.
1957  */
1958 
/* Convert the ASCII capital letters A to Z in the buffer to lower case, in
place.  All other bytes (including NUL bytes and bytes with the high bit set)
are left alone.  Returns NumberOfBytes, since the text doesn't change size. */

static size_t TokenizerPassLowerCase (
  char *BufferPntr,
  size_t NumberOfBytes)
{
  const char CaseOffset = 'a' - 'A';

  /* Do our own lower case conversion; tolower () has problems with UTF-8
  characters that have the high bit set. */

  for (size_t ByteIndex = 0; ByteIndex < NumberOfBytes; ByteIndex++)
  {
    const char Letter = BufferPntr [ByteIndex];

    if (Letter >= 'A' && Letter <= 'Z')
      BufferPntr [ByteIndex] = Letter + CaseOffset;
  }
  return NumberOfBytes;
}
1978 
1979 
1980 /* A utility function for some commonly repeated code.  If this was Modula-2,
1981 we could use a nested procedure.  But it's not.  Adds the given word to the set
1982 of words, checking for maximum word length and prepending the prefix to the
1983 word, which gets modified by this function to reflect the word actually added
1984 to the set. */
1985 
1986 static void
1987 AddWordAndPrefixToSet (
1988   string &Word,
1989   const char *PrefixString,
1990   set<string> &WordSet)
1991 {
1992   if (Word.empty ())
1993     return;
1994 
1995   if (Word.size () > g_MaxWordLength)
1996     Word.resize (g_MaxWordLength);
1997   Word.insert (0, PrefixString);
1998   WordSet.insert (Word);
1999 }
2000 
2001 
2002 /* Hunt through the text for various URLs and extract the components as
2003 separate words.  Doesn't affect the text in the buffer.  Looks for
2004 protocol://user:password@computer:port/path?query=key#anchor strings.  Also
2005 www.blah strings are detected and broken down.  Doesn't do HREF="" strings
2006 where the string has a relative path (no host computer name).  Assumes the
2007 input buffer is already in lower case. */
2008 
2009 static size_t TokenizerPassExtractURLs (
2010   char *BufferPntr,
2011   size_t NumberOfBytes,
2012   char PrefixCharacter,
2013   set<string> &WordSet)
2014 {
2015   char   *AtSignStringPntr;
2016   char   *HostStringPntr;
2017   char   *InputStringEndPntr;
2018   char   *InputStringPntr;
2019   char   *OptionsStringPntr;
2020   char   *PathStringPntr;
2021   char    PrefixString [2];
2022   char   *ProtocolStringPntr;
2023   string  Word;
2024 
2025   InputStringPntr = BufferPntr;
2026   InputStringEndPntr = BufferPntr + NumberOfBytes;
2027   PrefixString [0] = PrefixCharacter;
2028   PrefixString [1] = 0;
2029 
2030   while (InputStringPntr < InputStringEndPntr - 4)
2031   {
2032     HostStringPntr = NULL;
2033     if (memcmp (InputStringPntr, "www.", 4) == 0)
2034       HostStringPntr = InputStringPntr;
2035     else if (memcmp (InputStringPntr, "://", 3) == 0)
2036     {
2037       /* Find the protocol name, and add it as a word such as "ftp:" "http:" */
2038       ProtocolStringPntr = InputStringPntr;
2039       while (ProtocolStringPntr > BufferPntr &&
2040       isalpha (ProtocolStringPntr[-1]))
2041         ProtocolStringPntr--;
2042       Word.assign (ProtocolStringPntr,
2043         (InputStringPntr - ProtocolStringPntr) + 1 /* for the colon */);
2044       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2045       HostStringPntr = InputStringPntr + 3; /* Skip past the "://" */
2046     }
2047     if (HostStringPntr == NULL)
2048     {
2049       InputStringPntr++;
2050       continue;
2051     }
2052 
2053     /* Got a host name string starting at HostStringPntr.  It's everything
2054     until the next slash or space, like "user:password@computer:port". */
2055 
2056     InputStringPntr = HostStringPntr;
2057     AtSignStringPntr = NULL;
2058     while (InputStringPntr < InputStringEndPntr &&
2059     (*InputStringPntr != '/' && !isspace (*InputStringPntr)))
2060     {
2061       if (*InputStringPntr == '@')
2062         AtSignStringPntr = InputStringPntr;
2063       InputStringPntr++;
2064     }
2065     if (AtSignStringPntr != NULL)
2066     {
2067       /* Add a word with the user and password, unseparated. */
2068       Word.assign (HostStringPntr,
2069         AtSignStringPntr - HostStringPntr + 1 /* for the @ sign */);
2070       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2071       HostStringPntr = AtSignStringPntr + 1;
2072     }
2073 
2074     /* Add a word with the computer and port, unseparated. */
2075 
2076     Word.assign (HostStringPntr, InputStringPntr - HostStringPntr);
2077     AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2078 
2079     /* Now get the path name, not including the extra junk after ?  and #
2080     separators (they're stored as separate options).  Stops at white space or a
2081     double quote mark. */
2082 
2083     PathStringPntr = InputStringPntr;
2084     OptionsStringPntr = NULL;
2085     while (InputStringPntr < InputStringEndPntr &&
2086     (*InputStringPntr != '"' && !isspace (*InputStringPntr)))
2087     {
2088       if (OptionsStringPntr == NULL &&
2089       (*InputStringPntr == '?' || *InputStringPntr == '#'))
2090         OptionsStringPntr = InputStringPntr;
2091       InputStringPntr++;
2092     }
2093 
2094     if (OptionsStringPntr == NULL)
2095     {
2096       /* No options, all path. */
2097       Word.assign (PathStringPntr, InputStringPntr - PathStringPntr);
2098       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2099     }
2100     else
2101     {
2102       /* Insert the path before the options. */
2103       Word.assign (PathStringPntr, OptionsStringPntr - PathStringPntr);
2104       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2105 
2106       /* Insert all the options as a word. */
2107       Word.assign (OptionsStringPntr, InputStringPntr - OptionsStringPntr);
2108       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2109     }
2110   }
2111   return NumberOfBytes;
2112 }
2113 
2114 
2115 /* Replace long Asian words (likely to actually be sentences) with the first
2116 character in the word. */
2117 
2118 static size_t TokenizerPassTruncateLongAsianWords (
2119   char *BufferPntr,
2120   size_t NumberOfBytes)
2121 {
2122   char *EndOfStringPntr;
2123   char *InputStringPntr;
2124   int   Letter;
2125   char *OutputStringPntr;
2126   char *StartOfInputLongUnicodeWord;
2127   char *StartOfOutputLongUnicodeWord;
2128 
2129   InputStringPntr = BufferPntr;
2130   EndOfStringPntr = InputStringPntr + NumberOfBytes;
2131   OutputStringPntr = InputStringPntr;
2132   StartOfInputLongUnicodeWord = NULL; /* Non-NULL flags it as started. */
2133   StartOfOutputLongUnicodeWord = NULL;
2134 
2135   /* Copy the text from the input to the output (same buffer), but when we find
2136   a sequence of UTF-8 characters that is too long then truncate it down to one
2137   character and reset the output pointer to be after that character, thus
2138   deleting the word.  Replacing the deleted characters after it with spaces
2139   won't work since we need to preserve the lack of space to handle those sneaky
2140   HTML artificial word breakers.  So that Thelongword<blah>ing becomes
2141   "T<blah>ing" rather than "T <blah>ing", so the next step joins them up into
2142   "Ting" rather than "T" and "ing".  The first code in a UTF-8 character is
2143   11xxxxxx and subsequent ones are 10xxxxxx. */
2144 
2145   while (InputStringPntr < EndOfStringPntr)
2146   {
2147     Letter = (unsigned char) *InputStringPntr;
2148     if (Letter < 128) // Got a regular ASCII letter?
2149     {
2150       if (StartOfInputLongUnicodeWord != NULL)
2151       {
2152         if (InputStringPntr - StartOfInputLongUnicodeWord >
2153         (int) g_MaxWordLength * 2)
2154         {
2155           /* Need to truncate the long word (100 bytes or about 50 characters)
2156           back down to the first UTF-8 character, so find out where the first
2157           character ends (skip past the 10xxxxxx bytes), and rewind the output
2158           pointer to be just after that (ignoring the rest of the long word in
2159           effect). */
2160 
2161           OutputStringPntr = StartOfOutputLongUnicodeWord + 1;
2162           while (OutputStringPntr < InputStringPntr)
2163           {
2164             Letter = (unsigned char) *OutputStringPntr;
2165             if (Letter < 128 || Letter >= 192)
2166               break;
2167             ++OutputStringPntr; // Still a UTF-8 middle of the character code.
2168           }
2169         }
2170         StartOfInputLongUnicodeWord = NULL;
2171       }
2172     }
2173     else if (Letter >= 192 && StartOfInputLongUnicodeWord == NULL)
2174     {
2175       /* Got the start of a UTF-8 character.  Remember the spot so we can see
2176       if this is a too long UTF-8 word, which is often a whole sentence in
2177       asian languages, since they sort of use a single character per word. */
2178 
2179       StartOfInputLongUnicodeWord = InputStringPntr;
2180       StartOfOutputLongUnicodeWord = OutputStringPntr;
2181     }
2182     *OutputStringPntr++ = *InputStringPntr++;
2183   }
2184   return OutputStringPntr - BufferPntr;
2185 }
2186 
2187 
2188 /* Find all the words in the string and add them to our local set of words.
2189 The characters considered white space are defined by g_SpaceCharacters.  This
2190 function is also used as a subroutine by other tokenizer functions when they
2191 have a bunch of presumably plain text they want broken into words and added. */
2192 
2193 static size_t TokenizerPassGetPlainWords (
2194   char *BufferPntr,
2195   size_t NumberOfBytes,
2196   char PrefixCharacter,
2197   set<string> &WordSet)
2198 {
2199   string  AccumulatedWord;
2200   char   *EndOfStringPntr;
2201   size_t  Length;
2202   int     Letter;
2203 
2204   if (NumberOfBytes <= 0)
2205     return 0; /* Nothing to process. */
2206 
2207   if (PrefixCharacter != 0)
2208     AccumulatedWord = PrefixCharacter;
2209   EndOfStringPntr = BufferPntr + NumberOfBytes;
2210   while (true)
2211   {
2212     if (BufferPntr >= EndOfStringPntr)
2213       Letter = EOF; // Usually a negative number.
2214     else
2215       Letter = (unsigned char) *BufferPntr++;
2216 
2217     /* See if it is a letter we treat as white space.  Some word separators
2218     like dashes and periods aren't considered as space.  Note that codes above
2219     127 are UTF-8 characters, which we consider non-space. */
2220 
2221     if (Letter < 0 /* EOF is -1 */ ||
2222     (Letter < 128 && g_SpaceCharacters[Letter]))
2223     {
2224       /* That space finished off a word.  Remove trailing periods... */
2225 
2226       while ((Length = AccumulatedWord.size()) > 0 &&
2227       AccumulatedWord [Length-1] == '.')
2228         AccumulatedWord.resize (Length - 1);
2229 
2230       /* If there's anything left in the word, add it to the set.  Also ignore
2231       words which are too big (it's probably some binary encoded data).  But
2232       leave room for supercalifragilisticexpialidoceous.  According to one web
2233       site, pneumonoultramicroscopicsilicovolcanoconiosis is the longest word
2234       currently in English.  Note that some uuencoded data was seen with a 60
2235       character line length. */
2236 
2237       if (PrefixCharacter != 0)
2238         Length--; // Don't count prefix when judging size or emptiness.
2239       if (Length > 0 && Length <= g_MaxWordLength)
2240         WordSet.insert (AccumulatedWord);
2241 
2242       /* Empty out the string to get ready for the next word.  Not quite empty,
2243       start it off with the prefix character if any. */
2244 
2245       if (PrefixCharacter != 0)
2246         AccumulatedWord = PrefixCharacter;
2247       else
2248         AccumulatedWord.resize (0);
2249     }
2250     else /* Not a space-like character, add it to the word. */
2251       AccumulatedWord.append (1 /* one copy of the char */, (char) Letter);
2252 
2253     if (Letter < 0)
2254       break; /* End of data.  Exit here so that last word got processed. */
2255   }
2256   return NumberOfBytes;
2257 }
2258 
2259 
2260 /* Delete Things from the text.  The Thing is marked by a start string and an
2261 end string, such as "<!--" and "--> for HTML comment things.  All the text
2262 between the markers will be added to the word list before it gets deleted from
2263 the buffer.  The markers must be prepared in lower case and the buffer is
2264 assumed to have already been converted to lower case.  You can specify an empty
2265 string for the end marker if you're just matching a string constant like
2266 "&nbsp;", which you would put in the starting marker.  This is a utility
2267 function used by other tokenizer functions. */
2268 
2269 static size_t TokenizerUtilRemoveStartEndThing (
2270   char *BufferPntr,
2271   size_t NumberOfBytes,
2272   char PrefixCharacter,
2273   set<string> &WordSet,
2274   const char *ThingStartCode,
2275   const char *ThingEndCode,
2276   bool ReplaceWithSpace)
2277 {
2278   char *EndOfStringPntr;
2279   bool  FoundAndDeletedThing;
2280   char *InputStringPntr;
2281   char *OutputStringPntr;
2282   int   ThingEndLength;
2283   char *ThingEndPntr;
2284   int   ThingStartLength;
2285 
2286   InputStringPntr = BufferPntr;
2287   EndOfStringPntr = InputStringPntr + NumberOfBytes;
2288   OutputStringPntr = InputStringPntr;
2289   ThingStartLength = strlen (ThingStartCode);
2290   ThingEndLength = strlen (ThingEndCode);
2291 
2292   if (ThingStartLength <= 0)
2293     return NumberOfBytes; /* Need some things to look for first! */
2294 
2295   while (InputStringPntr < EndOfStringPntr)
2296   {
2297     /* Search for the starting marker. */
2298 
2299     FoundAndDeletedThing = false;
2300     if (EndOfStringPntr - InputStringPntr >=
2301     ThingStartLength + ThingEndLength /* space remains for start + end */ &&
2302     *InputStringPntr == *ThingStartCode &&
2303     memcmp (InputStringPntr, ThingStartCode, ThingStartLength) == 0)
2304     {
2305       /* Found the start marker.  Look for the terminating string.  If it is an
2306       empty string, then we've found it right now! */
2307 
2308       ThingEndPntr = InputStringPntr + ThingStartLength;
2309       while (EndOfStringPntr - ThingEndPntr >= ThingEndLength)
2310       {
2311         if (ThingEndLength == 0 ||
2312         (*ThingEndPntr == *ThingEndCode &&
2313         memcmp (ThingEndPntr, ThingEndCode, ThingEndLength) == 0))
2314         {
2315           /* Got the end of the Thing.  First dump the text inbetween the start
2316           and end markers into the words list. */
2317 
2318           TokenizerPassGetPlainWords (InputStringPntr + ThingStartLength,
2319             ThingEndPntr - (InputStringPntr + ThingStartLength),
2320             PrefixCharacter, WordSet);
2321 
2322           /* Delete by not updating the output pointer while moving the input
2323           pointer to just after the ending tag. */
2324 
2325           InputStringPntr = ThingEndPntr + ThingEndLength;
2326           if (ReplaceWithSpace)
2327             *OutputStringPntr++ = ' ';
2328           FoundAndDeletedThing = true;
2329           break;
2330         }
2331         ThingEndPntr++;
2332       } /* End while ThingEndPntr */
2333     }
2334     if (!FoundAndDeletedThing)
2335       *OutputStringPntr++ = *InputStringPntr++;
2336   } /* End while InputStringPntr */
2337 
2338   return OutputStringPntr - BufferPntr;
2339 }
2340 
2341 
2342 static size_t TokenizerPassRemoveHTMLComments (
2343   char *BufferPntr,
2344   size_t NumberOfBytes,
2345   char PrefixCharacter,
2346   set<string> &WordSet)
2347 {
2348   return TokenizerUtilRemoveStartEndThing (BufferPntr, NumberOfBytes,
2349     PrefixCharacter, WordSet, "<!--", "-->", false);
2350 }
2351 
2352 
2353 static size_t TokenizerPassRemoveHTMLStyle (
2354   char *BufferPntr,
2355   size_t NumberOfBytes,
2356   char PrefixCharacter,
2357   set<string> &WordSet)
2358 {
2359   return TokenizerUtilRemoveStartEndThing (BufferPntr, NumberOfBytes,
2360     PrefixCharacter, WordSet,
2361     "<style", "/style>", false /* replace with space if true */);
2362 }
2363 
2364 
2365 /* Convert Japanese periods (a round hollow dot symbol) to spaces so that the
2366 start of the next sentence is recognised at least as the start of a very long
2367 word.  The Japanese comma also does the same job. */
2368 
2369 static size_t TokenizerPassJapanesePeriodsToSpaces (
2370   char *BufferPntr,
2371   size_t NumberOfBytes,
2372   char PrefixCharacter,
2373   set<string> &WordSet)
2374 {
2375   size_t BytesRemaining = NumberOfBytes;
2376 
2377   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2378     BytesRemaining, PrefixCharacter, WordSet, "。" /* period */, "", true);
2379   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2380     BytesRemaining, PrefixCharacter, WordSet, "、" /* comma */, "", true);
2381   return BytesRemaining;
2382 }
2383 
2384 
2385 /* Delete HTML tags from the text.  The contents of the tag are added as words
2386 before being deleted.  <P>, <BR> and &nbsp; are replaced by spaces at this
2387 stage while other HTML things get replaced by nothing. */
2388 
2389 static size_t TokenizerPassRemoveHTMLTags (
2390   char *BufferPntr,
2391   size_t NumberOfBytes,
2392   char PrefixCharacter,
2393   set<string> &WordSet)
2394 {
2395   size_t BytesRemaining = NumberOfBytes;
2396 
2397   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2398     BytesRemaining, PrefixCharacter, WordSet, "&nbsp;", "", true);
2399   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2400     BytesRemaining, PrefixCharacter, WordSet, "<p", ">", true);
2401   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2402     BytesRemaining, PrefixCharacter, WordSet, "<br", ">", true);
2403   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2404     BytesRemaining, PrefixCharacter, WordSet, "<", ">", false);
2405   return BytesRemaining;
2406 }
2407 
2408 
2409 
2410 /******************************************************************************
2411  * Implementation of the ABSApp class, constructor, destructor and the rest of
2412  * the member functions in mostly alphabetical order.
2413  */
2414 
2415 ABSApp::ABSApp ()
2416 : BApplication (g_ABSAppSignature),
2417   m_DatabaseHasChanged (false),
2418   m_SettingsHaveChanged (false)
2419 {
2420   status_t    ErrorCode;
2421   int         HalvingCount;
2422   int         i;
2423   const void *ResourceData;
2424   size_t      ResourceSize;
2425   BResources *ResourcesPntr;
2426 
2427   MakeDatabaseEmpty ();
2428 
2429   /* Set up the pathname which identifies our settings directory.  Note that
2430   the actual settings are loaded later on (or set to defaults) by the main()
2431   function, before this BApplication starts running.  So we don't bother
2432   initialising the other setting related variables here. */
2433 
2434   ErrorCode =
2435     find_directory (B_USER_SETTINGS_DIRECTORY, &m_SettingsDirectoryPath);
2436   if (ErrorCode == B_OK)
2437     ErrorCode = m_SettingsDirectoryPath.Append (g_SettingsDirectoryName);
2438   if (ErrorCode != B_OK)
2439     m_SettingsDirectoryPath.SetTo (".");
2440 
2441   /* Set up the table which identifies which characters are spaces and which
2442   are not.  Spaces are all control characters and all punctuation except for:
2443   apostrophe (so "it's" and possessive versions of words get stored), dash (for
2444   hyphenated words), dollar sign (for cash amounts), period (for IP addresses,
2445   we later remove trailing periods). */
2446 
2447   memset (g_SpaceCharacters, 1, sizeof (g_SpaceCharacters));
2448   g_SpaceCharacters['\''] = false;
2449   g_SpaceCharacters['-'] = false;
2450   g_SpaceCharacters['$'] = false;
2451   g_SpaceCharacters['.'] = false;
2452   for (i = '0'; i <= '9'; i++)
2453     g_SpaceCharacters[i] = false;
2454   for (i = 'A'; i <= 'Z'; i++)
2455     g_SpaceCharacters[i] = false;
2456   for (i = 'a'; i <= 'z'; i++)
2457     g_SpaceCharacters[i] = false;
2458 
2459   /* Initialise the busy cursor from data in the application's resources. */
2460 
2461   if ((ResourcesPntr = AppResources ()) != NULL && (ResourceData =
2462   ResourcesPntr->LoadResource ('CURS', "Busy Cursor", &ResourceSize)) != NULL
2463   && ResourceSize >= 68 /* Size of a raw 2x16x16x8+4 cursor is 68 bytes */)
2464     g_BusyCursor = new BCursor (ResourceData);
2465 
2466   /* Find out the smallest usable double by seeing how small we can make it. */
2467 
2468   m_SmallestUseableDouble = 1.0;
2469   HalvingCount = 0;
2470   while (HalvingCount < 10000 && m_SmallestUseableDouble > 0.0)
2471   {
2472     HalvingCount++;
2473     m_SmallestUseableDouble /= 2;
2474   }
2475 
2476   /* Recreate the number.  But don't make quite as small, we want to allow some
2477   precision bits and a bit of extra margin for intermediate results in future
2478   calculations. */
2479 
2480   HalvingCount -= 50 + sizeof (double) * 8;
2481 
2482   m_SmallestUseableDouble = 1.0;
2483   while (HalvingCount > 0)
2484   {
2485     HalvingCount--;
2486     m_SmallestUseableDouble /= 2;
2487   }
2488 }
2489 
2490 
2491 ABSApp::~ABSApp ()
2492 {
2493   status_t ErrorCode;
2494   char     ErrorMessage [PATH_MAX + 1024];
2495 
2496   if (m_SettingsHaveChanged)
2497     LoadSaveSettings (false /* DoLoad */);
2498   if ((ErrorCode = SaveDatabaseIfNeeded (ErrorMessage)) != B_OK)
2499     DisplayErrorMessage (ErrorMessage, ErrorCode, "Exiting Error");
2500   delete g_BusyCursor;
2501   g_BusyCursor = NULL;
2502 }
2503 
2504 
2505 /* Display a box showing information about this program. */
2506 
2507 void
2508 ABSApp::AboutRequested ()
2509 {
2510   BAlert *AboutAlertPntr;
2511 
2512   AboutAlertPntr = new BAlert ("About",
2513 "SpamDBM - Spam Database Manager\n\n"
2514 
2515 "This is a BeOS program for classifying e-mail messages as spam (unwanted \
2516 junk mail) or as genuine mail using a Bayesian statistical approach.  There \
2517 is also a Mail Daemon Replacement add-on to filter mail using the \
2518 classification statistics collected earlier.\n\n"
2519 
2520 "Written by Alexander G. M. Smith, fall 2002.\n\n"
2521 
2522 "The original idea was from Paul Graham's algorithm, which has an excellent \
2523 writeup at: http://www.paulgraham.com/spam.html\n\n"
2524 
2525 "Gary Robinson came up with the improved algorithm, which you can read about \
2526 at: http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html\n\n"
2527 
2528 "Mr. Robinson, Tim Peters and the SpamBayes mailing list people then \
2529 developed the even better chi-squared scoring method.\n\n"
2530 
2531 "Icon courtesy of Isaac Yonemoto, though it is no longer used since Hormel \
2532 doesn't want their meat product associated with junk e-mail.\n\n"
2533 
2534 "Tokenising code updated in 2005 to use some of the tricks that SpamBayes \
2535 uses to extract words from messages.  In particular, HTML is now handled.\n\n"
2536 
2537 "Released to the public domain, with no warranty.\n"
2538 "$Revision: 30630 $\n"
2539 "Compiled on " __DATE__ " at " __TIME__ ".", "Done");
2540   if (AboutAlertPntr != NULL)
2541   {
2542     AboutAlertPntr->SetFlags(AboutAlertPntr->Flags() | B_CLOSE_ON_ESCAPE);
2543     AboutAlertPntr->Go ();
2544   }
2545 }
2546 
2547 
2548 /* Add the text in the given file to the database as an example of a spam or
2549 genuine message, or removes it from the database if you claim it is
2550 CL_UNCERTAIN.  Also resets the spam ratio attribute to show the effect of the
2551 database change. */
2552 
2553 status_t ABSApp::AddFileToDatabase (
2554   ClassificationTypes IsSpamOrWhat,
2555   const char *FileName,
2556   char *ErrorMessage)
2557 {
2558   status_t ErrorCode;
2559   BFile    MessageFile;
2560   BMessage TempBMessage;
2561 
2562   ErrorCode = MessageFile.SetTo (FileName, B_READ_ONLY);
2563   if (ErrorCode != B_OK)
2564   {
2565     sprintf (ErrorMessage, "Unable to open file \"%s\" for reading", FileName);
2566     return ErrorCode;
2567   }
2568 
2569   ErrorCode = AddPositionIOToDatabase (IsSpamOrWhat,
2570     &MessageFile, FileName, ErrorMessage);
2571   MessageFile.Unset ();
2572   if (ErrorCode != B_OK)
2573     return ErrorCode;
2574 
2575   /* Re-evaluate the file so that the user sees the new ratio attribute. */
2576   return EvaluateFile (FileName, &TempBMessage, ErrorMessage);
2577 }
2578 
2579 
2580 /* Add the given text to the database.  The unique words found in MessageIOPntr
2581 will be added to the database (incrementing the count for the number of
2582 messages using each word, either the spam or genuine count depending on
2583 IsSpamOrWhat).  It will remove the message (decrement the word counts) if you
2584 specify CL_UNCERTAIN as the new classification.  And if it switches from spam
2585 to genuine or vice versa, it will do both - decrement the counts for the old
2586 class and increment the counts for the new one.  An attribute will be added to
2587 MessageIOPntr (if it is a file) to record that it has been marked as Spam or
2588 Genuine (so that it doesn't get added to the database a second time).  If it is
2589 being removed from the database, the classification attribute gets removed too.
2590 If things go wrong, a non-zero error code will be returned and an explanation
2591 written to ErrorMessage (assumed to be at least PATH_MAX + 1024 bytes long).
2592 OptionalFileName is just used in the error message to identify the file to the
2593 user. */
2594 
2595 status_t ABSApp::AddPositionIOToDatabase (
2596   ClassificationTypes IsSpamOrWhat,
2597   BPositionIO *MessageIOPntr,
2598   const char *OptionalFileName,
2599   char *ErrorMessage)
2600 {
2601   BNode                             *BNodePntr;
2602   char                               ClassificationString [NAME_MAX];
2603   StatisticsMap::iterator            DataIter;
2604   status_t                           ErrorCode = 0;
2605   pair<StatisticsMap::iterator,bool> InsertResult;
2606   uint32                             NewAge;
2607   StatisticsRecord                   NewStatistics;
2608   ClassificationTypes                PreviousClassification;
2609   StatisticsPointer                  StatisticsPntr;
2610   set<string>::iterator              WordEndIter;
2611   set<string>::iterator              WordIter;
2612   set<string>                        WordSet;
2613 
2614   NewAge = m_TotalGenuineMessages + m_TotalSpamMessages;
2615   if (NewAge >= 0xFFFFFFF0UL)
2616   {
2617     sprintf (ErrorMessage,
2618       "The database is full!  There are %" B_PRIu32 " messages in "
2619       "it and we can't add any more without overflowing the maximum integer "
2620       "representation in 32 bits", NewAge);
2621     return B_NO_MEMORY;
2622   }
2623 
2624   /* Check that this file hasn't already been added to the database. */
2625 
2626   PreviousClassification = CL_UNCERTAIN;
2627   BNodePntr = dynamic_cast<BNode *> (MessageIOPntr);
2628   if (BNodePntr != NULL) /* If this thing might have attributes. */
2629   {
2630     ErrorCode = BNodePntr->ReadAttr (g_AttributeNameClassification,
2631       B_STRING_TYPE, 0 /* offset */, ClassificationString,
2632       sizeof (ClassificationString) - 1);
2633     if (ErrorCode <= 0) /* Positive values for the number of bytes read */
2634       strcpy (ClassificationString, "none");
2635     else /* Just in case it needs a NUL at the end. */
2636       ClassificationString [ErrorCode] = 0;
2637 
2638     if (strcasecmp (ClassificationString, g_ClassifiedSpam) == 0)
2639       PreviousClassification = CL_SPAM;
2640     else if (strcasecmp (ClassificationString, g_ClassifiedGenuine) == 0)
2641       PreviousClassification = CL_GENUINE;
2642   }
2643 
2644   if (!m_IgnorePreviousClassification &&
2645   PreviousClassification != CL_UNCERTAIN)
2646   {
2647     if (IsSpamOrWhat == PreviousClassification)
2648     {
2649       sprintf (ErrorMessage, "Ignoring file \"%s\" since it seems to have "
2650         "already been classified as %s.", OptionalFileName,
2651         g_ClassificationTypeNames [IsSpamOrWhat]);
2652     }
2653     else
2654     {
2655       sprintf (ErrorMessage, "Changing existing classification of file \"%s\" "
2656         "from %s to %s.", OptionalFileName,
2657         g_ClassificationTypeNames [PreviousClassification],
2658         g_ClassificationTypeNames [IsSpamOrWhat]);
2659     }
2660     DisplayErrorMessage (ErrorMessage, 0, "Note");
2661   }
2662 
2663   if (!m_IgnorePreviousClassification &&
2664   IsSpamOrWhat == PreviousClassification)
2665     /* Nothing to do if it is already classified correctly and the user doesn't
2666     want double classification. */
2667     return B_OK;
2668 
2669   /* Get the list of unique words in the file. */
2670 
2671   ErrorCode = GetWordsFromPositionIO (MessageIOPntr, OptionalFileName,
2672     WordSet, ErrorMessage);
2673   if (ErrorCode != B_OK)
2674     return ErrorCode;
2675 
2676   /* Update the count of the number of messages processed, with corrections if
2677   reclassifying a message. */
2678 
2679   m_DatabaseHasChanged = true;
2680 
2681   if (!m_IgnorePreviousClassification &&
2682   PreviousClassification == CL_SPAM && m_TotalSpamMessages > 0)
2683     m_TotalSpamMessages--;
2684 
2685   if (IsSpamOrWhat == CL_SPAM)
2686     m_TotalSpamMessages++;
2687 
2688   if (!m_IgnorePreviousClassification &&
2689   PreviousClassification == CL_GENUINE && m_TotalGenuineMessages > 0)
2690       m_TotalGenuineMessages--;
2691 
2692   if (IsSpamOrWhat == CL_GENUINE)
2693     m_TotalGenuineMessages++;
2694 
2695   /* Mark the file's attributes with the new classification.  Don't care if it
2696   fails. */
2697 
2698   if (BNodePntr != NULL) /* If this thing might have attributes. */
2699   {
2700     ErrorCode = BNodePntr->RemoveAttr (g_AttributeNameClassification);
2701     if (IsSpamOrWhat != CL_UNCERTAIN)
2702     {
2703       strcpy (ClassificationString, g_ClassificationTypeNames [IsSpamOrWhat]);
2704       ErrorCode = BNodePntr->WriteAttr (g_AttributeNameClassification,
2705         B_STRING_TYPE, 0 /* offset */,
2706         ClassificationString, strlen (ClassificationString) + 1);
2707     }
2708   }
2709 
2710   /* Add the words to the database by incrementing or decrementing the counts
2711   for each word as appropriate. */
2712 
2713   WordEndIter = WordSet.end ();
2714   for (WordIter = WordSet.begin (); WordIter != WordEndIter; WordIter++)
2715   {
2716     if ((DataIter = m_WordMap.find (*WordIter)) == m_WordMap.end ())
2717     {
2718       /* No record in the database for the word. */
2719 
2720       if (IsSpamOrWhat == CL_UNCERTAIN)
2721         continue; /* Not adding words, don't have to subtract from nothing. */
2722 
2723       /* Create a new one record in the database for the new word. */
2724 
2725       memset (&NewStatistics, 0, sizeof (NewStatistics));
2726       InsertResult = m_WordMap.insert (
2727         StatisticsMap::value_type (*WordIter, NewStatistics));
2728       if (!InsertResult.second)
2729       {
2730         sprintf (ErrorMessage, "Failed to insert new database entry for "
2731           "word \"%s\", while processing file \"%s\"",
2732           WordIter->c_str (), OptionalFileName);
2733         return B_NO_MEMORY;
2734       }
2735       DataIter = InsertResult.first;
2736       m_WordCount++;
2737     }
2738 
2739     /* Got the database record for the word, update the statistics. */
2740 
2741     StatisticsPntr = &DataIter->second;
2742 
2743     StatisticsPntr->age = NewAge;
2744 
2745     /* Can't update m_OldestAge here, since it would take a lot of effort to
2746     find the next older age.  Since it's only used for display, we'll let it be
2747     slightly incorrect.  The next database load or purge will fix it. */
2748 
2749     if (IsSpamOrWhat == CL_SPAM)
2750       StatisticsPntr->spamCount++;
2751 
2752     if (IsSpamOrWhat == CL_GENUINE)
2753       StatisticsPntr->genuineCount++;
2754 
2755     if (!m_IgnorePreviousClassification &&
2756     PreviousClassification == CL_SPAM && StatisticsPntr->spamCount > 0)
2757       StatisticsPntr->spamCount--;
2758 
2759     if (!m_IgnorePreviousClassification &&
2760     PreviousClassification == CL_GENUINE && StatisticsPntr->genuineCount > 0)
2761       StatisticsPntr->genuineCount--;
2762   }
2763 
2764   return B_OK;
2765 }
2766 
2767 
2768 /* Add the text in the string to the database as an example of a spam or
2769 genuine message. */
2770 
2771 status_t ABSApp::AddStringToDatabase (
2772   ClassificationTypes IsSpamOrWhat,
2773   const char *String,
2774   char *ErrorMessage)
2775 {
2776   BMemoryIO MemoryIO (String, strlen (String));
2777 
2778   return AddPositionIOToDatabase (IsSpamOrWhat, &MemoryIO,
2779    "Memory Buffer" /* OptionalFileName */, ErrorMessage);
2780 }
2781 
2782 
2783 /* Given a bunch of text, find the words within it (doing special tricks to
2784 extract words from HTML), and add them to the set.  Allow NULs in the text.  If
2785 the PrefixCharacter isn't zero then it is prepended to all words found (so you
2786 can distinguish words as being from a header or from the body text).  See also
2787 TokenizeWhole which does something similar. */
2788 
2789 void
2790 ABSApp::AddWordsToSet (
2791   const char *InputString,
2792   size_t NumberOfBytes,
2793   char PrefixCharacter,
2794   set<string> &WordSet)
2795 {
2796   char   *BufferPntr;
2797   size_t  CurrentSize;
2798   int     PassNumber;
2799 
2800   /* Copy the input buffer.  The code will be modifying it in-place as HTML
2801   fragments and other junk are deleted. */
2802 
2803   BufferPntr = new char [NumberOfBytes];
2804   if (BufferPntr == NULL)
2805     return;
2806   memcpy (BufferPntr, InputString, NumberOfBytes);
2807 
2808   /* Do the tokenization.  Each pass does something to the text in the buffer,
2809   and may add words to the word set. */
2810 
2811   CurrentSize = NumberOfBytes;
2812   for (PassNumber = 1; PassNumber <= 8 && CurrentSize > 0 ; PassNumber++)
2813   {
2814     switch (PassNumber)
2815     {
2816       case 1: /* Lowercase first, rest of them assume lower case inputs. */
2817         CurrentSize = TokenizerPassLowerCase (BufferPntr, CurrentSize);
2818         break;
2819       case 2: CurrentSize = TokenizerPassJapanesePeriodsToSpaces (
2820         BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
2821       case 3: CurrentSize = TokenizerPassTruncateLongAsianWords (
2822         BufferPntr, CurrentSize); break;
2823       case 4: CurrentSize = TokenizerPassRemoveHTMLComments (
2824         BufferPntr, CurrentSize, 'Z', WordSet); break;
2825       case 5: CurrentSize = TokenizerPassRemoveHTMLStyle (
2826         BufferPntr, CurrentSize, 'Z', WordSet); break;
2827       case 6: CurrentSize = TokenizerPassExtractURLs (
2828         BufferPntr, CurrentSize, 'Z', WordSet); break;
2829       case 7: CurrentSize = TokenizerPassRemoveHTMLTags (
2830         BufferPntr, CurrentSize, 'Z', WordSet); break;
2831       case 8: CurrentSize = TokenizerPassGetPlainWords (
2832         BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
2833       default: break;
2834     }
2835   }
2836 
2837   delete [] BufferPntr;
2838 }
2839 
2840 
2841 /* The user has provided a command line.  This could actually be from a
2842 separate attempt to invoke the program (this application's resource/attributes
2843 have the launch flags set to "single launch", so the shell doesn't start the
2844 program but instead sends the arguments to the already running instance).  In
2845 either case, the command is sent to an intermediary thread where it is
2846 asynchronously converted into a scripting message(s) that are sent back to this
2847 BApplication.  The intermediary is needed since we can't recursively execute
2848 scripting messages while processing a message (this ArgsReceived one). */
2849 
2850 void
2851 ABSApp::ArgvReceived (int32 argc, char **argv)
2852 {
2853   if (g_CommanderLooperPntr != NULL)
2854     g_CommanderLooperPntr->CommandArguments (argc, argv);
2855 }
2856 
2857 
2858 /* Create a new empty database.  Note that we have to write out the new file
2859 immediately, otherwise other operations will see the empty database and then
2860 try to load the file, and complain that it doesn't exist.  Now they will see
2861 the empty database and redundantly load the empty file. */
2862 
2863 status_t ABSApp::CreateDatabaseFile (char *ErrorMessage)
2864 {
2865   MakeDatabaseEmpty ();
2866   m_DatabaseHasChanged = true;
2867   return SaveDatabaseIfNeeded (ErrorMessage); /* Make it now. */
2868 }
2869 
2870 
2871 /* Set the settings to the defaults.  Needed in case there isn't a settings
2872 file or it is obsolete. */
2873 
2874 void
2875 ABSApp::DefaultSettings ()
2876 {
2877   status_t ErrorCode;
2878   BPath    DatabasePath (m_SettingsDirectoryPath);
2879   char     TempString [PATH_MAX];
2880 
2881   /* The default database file is in the settings directory. */
2882 
2883   ErrorCode = DatabasePath.Append (g_DefaultDatabaseFileName);
2884   if (ErrorCode != B_OK)
2885     strcpy (TempString, g_DefaultDatabaseFileName); /* Unlikely to happen. */
2886   else
2887     strcpy (TempString, DatabasePath.Path ());
2888   m_DatabaseFileName.SetTo (TempString);
2889 
2890   // Users need to be allowed to undo their mistakes...
2891   m_IgnorePreviousClassification = true;
2892   g_ServerMode = true;
2893   m_PurgeAge = 2000;
2894   m_PurgePopularity = 2;
2895   m_ScoringMode = SM_CHISQUARED;
2896   m_TokenizeMode = TM_ANY_TEXT_HEADER;
2897 
2898   m_SettingsHaveChanged = true;
2899 }
2900 
2901 
2902 /* Deletes the database file, and the backup file, and clears the database but
2903 marks it as not changed so that it doesn't get written out when the program
2904 exits. */
2905 
2906 status_t ABSApp::DeleteDatabaseFile (char *ErrorMessage)
2907 {
2908   BEntry   FileEntry;
2909   status_t ErrorCode;
2910   int      i;
2911   char     TempString [PATH_MAX+20];
2912 
2913   /* Clear the in-memory database. */
2914 
2915   MakeDatabaseEmpty ();
2916   m_DatabaseHasChanged = false;
2917 
2918   /* Delete the backup files first.  Don't care if it fails. */
2919 
2920   for (i = 0; i < g_MaxBackups; i++)
2921   {
2922     strcpy (TempString, m_DatabaseFileName.String ());
2923     sprintf (TempString + strlen (TempString), g_BackupSuffix, i);
2924     ErrorCode = FileEntry.SetTo (TempString);
2925     if (ErrorCode == B_OK)
2926       FileEntry.Remove ();
2927   }
2928 
2929   /* Delete the main database file. */
2930 
2931   strcpy (TempString, m_DatabaseFileName.String ());
2932   ErrorCode = FileEntry.SetTo (TempString);
2933   if (ErrorCode != B_OK)
2934   {
2935     sprintf (ErrorMessage, "While deleting, failed to make BEntry for "
2936       "\"%s\" (does the directory exist?)", TempString);
2937     return ErrorCode;
2938   }
2939 
2940   ErrorCode = FileEntry.Remove ();
2941   if (ErrorCode != B_OK)
2942     sprintf (ErrorMessage, "While deleting, failed to remove file "
2943       "\"%s\"", TempString);
2944 
2945   return ErrorCode;
2946 }
2947 
2948 
2949 /* Evaluate the given file as being a spam message, and tag it with the
2950 resulting spam probability ratio.  If it also has an e-mail subject attribute,
2951 remove the [Spam 99.9%] prefix since the number usually changes. */
2952 
2953 status_t ABSApp::EvaluateFile (
2954   const char *PathName,
2955   BMessage *ReplyMessagePntr,
2956   char *ErrorMessage)
2957 {
2958   status_t ErrorCode;
2959   float    TempFloat;
2960   BFile    TextFile;
2961 
2962   /* Open the specified file. */
2963 
2964   ErrorCode = TextFile.SetTo (PathName, B_READ_ONLY);
2965   if (ErrorCode != B_OK)
2966   {
2967     sprintf (ErrorMessage, "Problems opening file \"%s\" for evaluating",
2968       PathName);
2969     return ErrorCode;
2970   }
2971 
2972   ErrorCode =
2973     EvaluatePositionIO (&TextFile, PathName, ReplyMessagePntr, ErrorMessage);
2974 
2975   if (ErrorCode == B_OK &&
2976   ReplyMessagePntr->FindFloat (g_ResultName, &TempFloat) == B_OK)
2977   {
2978     TextFile.WriteAttr (g_AttributeNameSpamRatio, B_FLOAT_TYPE,
2979       0 /* offset */, &TempFloat, sizeof (TempFloat));
2980     /* Don't know the spam cutoff ratio, that's in the e-mail filter, so just
2981     blindly remove the prefix, which would have the wrong percentage. */
2982     RemoveSpamPrefixFromSubjectAttribute (&TextFile);
2983   }
2984 
2985   return ErrorCode;
2986 }
2987 
2988 
2989 /* Evaluate a given file or memory buffer (a BPositionIO handles both cases)
2990 for spaminess.  The output is added to the ReplyMessagePntr message, with the
2991 probability ratio stored in "result" (0.0 means genuine and 1.0 means spam).
2992 It also adds the most significant words (used in the ratio calculation) to the
2993 array "words" and the associated per-word probability ratios in "ratios".  If
2994 it fails, an error code is returned and an error message written to the
ErrorMessage string (which is at least PATH_MAX + 1024 bytes long).
2996 OptionalFileName is only used in the error message.
2997 
2998 The math used for combining the individual word probabilities in my method is
2999 based on Gary Robinson's method (formerly it was a variation of Paul Graham's
method) or the Chi-Squared method.  Its input is the database of words that
3001 has a count of the number of spam and number of genuine messages each word
3002 appears in (doesn't matter if it appears more than once in a message, it still
3003 counts as 1).
3004 
The spam word count is divided by the total number of spam e-mail messages
3006 in the database to get the probability of spam and probability of genuineness
3007 is similarly computed for a particular word.  The spam probability is divided
3008 by the sum of the spam and genuine probabilities to get the Raw Spam Ratio for
3009 the word.  It's nearer to 0.0 for genuine and nearer to 1.0 for spam, and can
3010 be exactly zero or one too.
3011 
3012 To avoid multiplying later results by zero, and to compensate for a lack of
3013 data points, the Raw Spam Ratio is adjusted towards the 0.5 halfway point.  The
3014 0.5 is combined with the raw spam ratio, with a weight of 0.45 (determined to
3015 be a good value by the "spambayes" mailing list tests) messages applied to the
3016 half way point and a weight of the number of spam + genuine messages applied to
3017 the raw spam ratio.  This gives you the compensated spam ratio for the word.
3018 
3019 The top N (150 was good in the spambayes tests) extreme words are selected by
3020 the distance of each word's compensated spam ratio from 0.5.  Then the ratios
3021 of the words are combined.
3022 
3023 The Gary Robinson combining (scoring) method gets one value from the Nth root
3024 of the product of all the word ratios.  The other is the Nth root of the
3025 product of (1 - ratio) for all the words.  The final result is the first value
3026 divided by the sum of the two values.  The Nth root helps spread the resulting
3027 range of values more evenly between 0.0 and 1.0, otherwise the values all clump
3028 together at 0 or 1.  Also you can think of the Nth root as a kind of average
3029 for products; it's like a generic word probability which when multiplied by
3030 itself N times gives you the same result as the N separate actual word
3031 probabilities multiplied together.
3032 
3033 The Chi-Squared combining (scoring) method assumes that the spam word
3034 probabilities are uniformly distributed and computes an error measurement
3035 (called chi squared - see http://bmj.com/collections/statsbk/8.shtml for a good
3036 tutorial) and then sees how likely that error value would be observed in
3037 practice.  If it's rare to observe, then the words are likely not just randomly
3038 occurring and it's spammy.  The same is done for genuine words.  The two
3039 resulting unlikelynesses are compared to see which is more unlikely, if neither
3040 is, then the method says it can't decide.  The SpamBayes notes (see the
3041 classifier.py file in CVS in http://sourceforge.net/projects/spambayes) say:
3042 
3043 "Across vectors of length n, containing random uniformly-distributed
3044 probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution with 2*n
3045 degrees of freedom.  This has been proven (in some appropriate sense) to be the
3046 most sensitive possible test for rejecting the hypothesis that a vector of
3047 probabilities is uniformly distributed.  Gary Robinson's original scheme was
3048 monotonic *with* this test, but skipped the details.  Turns out that getting
3049 closer to the theoretical roots gives a much sharper classification, with a
3050 very small (in # of msgs), but also very broad (in range of scores), "middle
3051 ground", where most of the mistakes live.  In particular, this scheme seems
3052 immune to all forms of "cancellation disease": if there are many strong ham
3053 *and* spam clues, this reliably scores close to 0.5.  Most other schemes are
3054 extremely certain then -- and often wrong."
3055 
3056 I did a test with 448 example genuine messages including personal mail (some
3057 with HTML attachments) and mailing lists, and 267 spam messages for 27471 words
3058 total.  Test messages were more recent messages in the same groups.  Out of 100
3059 test genuine messages, with Gary Robinson (0.56 cutoff limit), 1 (1%) was
3060 falsely identified as spam and 8 of 73 (11%) spam messages were incorrectly
3061 classified as genuine.  With my variation of Paul Graham's scheme (0.90 cutoff)
3062 I got 6 of 100 (6%) genuine messages incorrectly marked as spam and 2 of 73
3063 (3%) spam messages were incorrectly classified as genuine.  Pretty close, but
3064 Robinson's values are more evenly spread out so you can tell just how spammy it
3065 is by looking at the number. */
3066 
/* One candidate word for the spam calculation: the word's compensated spam
ratio and a pointer to the word's text.  The struct doubles as its own
comparison functor, so that a priority_queue of these keeps the word whose
ratio is furthest from the neutral 0.5 at the top. */

struct WordAndRatioStruct
{
  double             probabilityRatio; /* Compensated ratio, not the raw one. */
  const std::string *wordPntr; /* The word's text, not freed by this struct. */

  /* Less-than test for ordering: ItemA comes before ItemB when its ratio is
  closer to 0.5 (less significant) than ItemB's ratio. */

  bool operator() (
    const WordAndRatioStruct &ItemA,
    const WordAndRatioStruct &ItemB) const
  {
    const double DistanceA = fabs (ItemA.probabilityRatio - 0.5);
    const double DistanceB = fabs (ItemB.probabilityRatio - 0.5);

    return DistanceA < DistanceB;
  }
};
3081 
3082 status_t ABSApp::EvaluatePositionIO (
3083   BPositionIO *PositionIOPntr,
3084   const char *OptionalFileName,
3085   BMessage *ReplyMessagePntr,
3086   char *ErrorMessage)
3087 {
3088   StatisticsMap::iterator            DataEndIter;
3089   StatisticsMap::iterator            DataIter;
3090   status_t                           ErrorCode;
3091   double                             GenuineProbability;
3092   uint32                             GenuineSpamSum;
3093   int                                i;
3094   priority_queue<
3095     WordAndRatioStruct /* Data type stored in the queue */,
3096     vector<WordAndRatioStruct> /* Underlying container */,
3097     WordAndRatioStruct /* Function for comparing elements */>
3098                                      PriorityQueue;
3099   double                             ProductGenuine;
3100   double                             ProductLogGenuine;
3101   double                             ProductLogSpam;
3102   double                             ProductSpam;
3103   double                             RawProbabilityRatio;
3104   float                              ResultRatio;
3105   double                             SpamProbability;
3106   StatisticsPointer                  StatisticsPntr;
3107   double                             TempDouble;
3108   double                             TotalGenuine;
3109   double                             TotalSpam;
3110   WordAndRatioStruct                 WordAndRatio;
3111   set<string>::iterator              WordEndIter;
3112   set<string>::iterator              WordIter;
3113   const WordAndRatioStruct          *WordRatioPntr;
3114   set<string>                        WordSet;
3115 
3116   /* Get the list of unique words in the file / memory buffer. */
3117 
3118   ErrorCode = GetWordsFromPositionIO (PositionIOPntr, OptionalFileName,
3119     WordSet, ErrorMessage);
3120   if (ErrorCode != B_OK)
3121     return ErrorCode;
3122 
3123   /* Prepare a few variables.  Mostly these are stored double values of some of
3124   the numbers involved (to avoid the overhead of multiple conversions from
3125   integer to double), with extra precautions to avoid divide by zero. */
3126 
3127   if (m_TotalGenuineMessages <= 0)
3128     TotalGenuine = 1.0;
3129   else
3130     TotalGenuine = m_TotalGenuineMessages;
3131 
3132   if (m_TotalSpamMessages <= 0)
3133     TotalSpam = 1.0;
3134   else
3135     TotalSpam = m_TotalSpamMessages;
3136 
3137   /* Look up the words in the database and calculate their compensated spam
3138   ratio.  The results are stored in a priority queue so that we can later find
3139   the top g_MaxInterestingWords for doing the actual determination. */
3140 
3141   WordEndIter = WordSet.end ();
3142   DataEndIter = m_WordMap.end ();
3143   for (WordIter = WordSet.begin (); WordIter != WordEndIter; WordIter++)
3144   {
3145     WordAndRatio.wordPntr = &(*WordIter);
3146 
3147     if ((DataIter = m_WordMap.find (*WordIter)) != DataEndIter)
3148     {
3149       StatisticsPntr = &DataIter->second;
3150 
3151       /* Calculate the probability the word is spam and the probability it is
3152       genuine.  Then the raw probability ratio. */
3153 
3154       SpamProbability = StatisticsPntr->spamCount / TotalSpam;
3155       GenuineProbability = StatisticsPntr->genuineCount / TotalGenuine;
3156 
3157       if (SpamProbability + GenuineProbability > 0)
3158         RawProbabilityRatio =
3159         SpamProbability / (SpamProbability + GenuineProbability);
3160       else /* Word with zero statistics, perhaps due to reclassification. */
3161         RawProbabilityRatio = 0.5;
3162 
3163       /* The compensated ratio leans towards 0.5 (g_RobinsonX) more for fewer
3164       data points, with a weight of 0.45 (g_RobinsonS). */
3165 
3166       GenuineSpamSum =
3167         StatisticsPntr->spamCount + StatisticsPntr->genuineCount;
3168 
3169       WordAndRatio.probabilityRatio =
3170         (g_RobinsonS * g_RobinsonX + GenuineSpamSum * RawProbabilityRatio) /
3171         (g_RobinsonS + GenuineSpamSum);
3172     }
3173     else /* Unknown word. With N=0, compensated ratio equation is RobinsonX. */
3174       WordAndRatio.probabilityRatio = g_RobinsonX;
3175 
3176      PriorityQueue.push (WordAndRatio);
3177   }
3178 
3179   /* Compute the combined probability (multiply them together) of the top few
3180   words.  To avoid numeric underflow (doubles can only get as small as 1E-300),
3181   logarithms are also used.  But avoid the logarithms (sum of logs of numbers
3182   is the same as the product of numbers) as much as possible due to reduced
3183   accuracy and slowness. */
3184 
3185   ProductGenuine = 1.0;
3186   ProductLogGenuine = 0.0;
3187   ProductSpam = 1.0;
3188   ProductLogSpam = 0.0;
3189   for (i = 0;
3190   i < g_MaxInterestingWords && !PriorityQueue.empty();
3191   i++, PriorityQueue.pop())
3192   {
3193     WordRatioPntr = &PriorityQueue.top();
3194     ProductSpam *= WordRatioPntr->probabilityRatio;
3195     ProductGenuine *= 1.0 - WordRatioPntr->probabilityRatio;
3196 
3197     /* Check for the numbers getting dangerously small, close to underflowing.
3198     If they are, move the value into the logarithm storage part. */
3199 
3200     if (ProductSpam < m_SmallestUseableDouble)
3201     {
3202       ProductLogSpam += log (ProductSpam);
3203       ProductSpam = 1.0;
3204     }
3205 
3206     if (ProductGenuine < m_SmallestUseableDouble)
3207     {
3208       ProductLogGenuine += log (ProductGenuine);
3209       ProductGenuine = 1.0;
3210     }
3211 
3212     ReplyMessagePntr->AddString ("words", WordRatioPntr->wordPntr->c_str ());
3213     ReplyMessagePntr->AddFloat ("ratios", WordRatioPntr->probabilityRatio);
3214   }
3215 
3216   /* Get the resulting log of the complete products. */
3217 
3218   if (i > 0)
3219   {
3220     ProductLogSpam += log (ProductSpam);
3221     ProductLogGenuine += log (ProductGenuine);
3222   }
3223 
3224   if (m_ScoringMode == SM_ROBINSON)
3225   {
3226     /* Apply Gary Robinson's scoring method where we take the Nth root of the
3227     products.  This is easiest in logarithm form. */
3228 
3229     if (i > 0)
3230     {
3231       ProductSpam = exp (ProductLogSpam / i);
3232       ProductGenuine = exp (ProductLogGenuine / i);
3233       ResultRatio = ProductSpam / (ProductGenuine + ProductSpam);
3234     }
3235     else /* Somehow got no words! */
3236       ResultRatio = g_RobinsonX;
3237   }
3238   else if (m_ScoringMode == SM_CHISQUARED)
3239   {
3240     /* From the SpamBayes notes: "We compute two chi-squared statistics, one
3241     for ham and one for spam.  The sum-of-the-logs business is more sensitive
3242     to probs near 0 than to probs near 1, so the spam measure uses 1-p (so that
3243     high-spamprob words have greatest effect), and the ham measure uses p
3244     directly (so that lo-spamprob words have greatest effect)."  That means we
3245     just reversed the meaning of the previously calculated spam and genuine
3246     products!  Oh well. */
3247 
3248     TempDouble = ProductLogSpam;
3249     ProductLogSpam = ProductLogGenuine;
3250     ProductLogGenuine = TempDouble;
3251 
3252     if (i > 0)
3253     {
3254       ProductSpam =
3255         1.0 - ChiSquaredProbability (-2.0 * ProductLogSpam, 2 * i);
3256       ProductGenuine =
3257         1.0 - ChiSquaredProbability (-2.0 * ProductLogGenuine, 2 * i);
3258 
3259       /* The SpamBayes notes say: "How to combine these into a single spam
3260       score?  We originally used (S-H)/(S+H) scaled into [0., 1.], which equals
3261       S/(S+H).  A systematic problem is that we could end up being near-certain
3262       a thing was (for example) spam, even if S was small, provided that H was
3263       much smaller.  Rob Hooft stared at these problems and invented the
3264       measure we use now, the simpler S-H, scaled into [0., 1.]." */
3265 
3266       ResultRatio = (ProductSpam - ProductGenuine + 1.0) / 2.0;
3267     }
3268     else /* No words to analyse. */
3269       ResultRatio = 0.5;
3270   }
3271   else /* Unknown scoring mode. */
3272   {
3273     strcpy (ErrorMessage, "Unknown scoring mode specified in settings");
3274     return B_BAD_VALUE;
3275   }
3276 
3277   ReplyMessagePntr->AddFloat (g_ResultName, ResultRatio);
3278   return B_OK;
3279 }
3280 
3281 
3282 /* Just evaluate the given string as being spam text. */
3283 
3284 status_t ABSApp::EvaluateString (
3285   const char *BufferPntr,
3286   ssize_t BufferSize,
3287   BMessage *ReplyMessagePntr,
3288   char *ErrorMessage)
3289 {
3290   BMemoryIO MemoryIO (BufferPntr, BufferSize);
3291 
3292   return EvaluatePositionIO (&MemoryIO, "Memory Buffer",
3293     ReplyMessagePntr, ErrorMessage);
3294 }
3295 
3296 
3297 /* Tell other programs about the scripting commands we support.  Try this
3298 command: "hey application/x-vnd.agmsmith.spamdbm getsuites" to
3299 see it in action (this program has to be already running for it to work). */
3300 
3301 status_t ABSApp::GetSupportedSuites (BMessage *MessagePntr)
3302 {
3303   BPropertyInfo TempPropInfo (g_ScriptingPropertyList);
3304 
3305   MessagePntr->AddString ("suites", "suite/x-vnd.agmsmith.spamdbm");
3306   MessagePntr->AddFlat ("messages", &TempPropInfo);
3307   return BApplication::GetSupportedSuites (MessagePntr);
3308 }
3309 
3310 
3311 /* Add all the words in the given file or memory buffer to the supplied set.
3312 The file name is only there for error messages, it assumes you have already
3313 opened the PositionIO to the right file.  If things go wrong, a non-zero error
3314 code will be returned and an explanation written to ErrorMessage (assumed to be
3315 at least PATH_MAX + 1024 bytes long). */
3316 
3317 status_t ABSApp::GetWordsFromPositionIO (
3318   BPositionIO *PositionIOPntr,
3319   const char *OptionalFileName,
3320   set<string> &WordSet,
3321   char *ErrorMessage)
3322 {
3323   status_t ErrorCode;
3324 
3325   if (m_TokenizeMode == TM_WHOLE)
3326     ErrorCode = TokenizeWhole (PositionIOPntr, OptionalFileName,
3327       WordSet, ErrorMessage);
3328   else
3329     ErrorCode = TokenizeParts (PositionIOPntr, OptionalFileName,
3330       WordSet, ErrorMessage);
3331 
3332   if (ErrorCode == B_OK && WordSet.empty ())
3333   {
3334     /* ENOMSG usually means no message found in queue, but I'm using it to show
3335     no words, a good indicator of spam which is pure HTML. */
3336 
3337     sprintf (ErrorMessage, "No words were found in \"%s\"", OptionalFileName);
3338     ErrorCode = ENOMSG;
3339   }
3340 
3341   return ErrorCode;
3342 }
3343 
3344 
3345 /* Set up indices for attributes MAIL:classification (string) and
3346 MAIL:ratio_spam (float) on all mounted disk volumes that support queries.  Also
3347 tell the system to make those attributes visible to the user (so they can see
3348 them in Tracker) and associate them with e-mail messages.  Also set up the
3349 database file MIME type (provide a description and associate it with this
3350 program so that it picks up the right icon).  And register the names for our
3351 sound effects. */
3352 
3353 status_t ABSApp::InstallThings (char *ErrorMessage)
3354 {
3355   int32       Cookie;
3356   dev_t       DeviceID;
3357   status_t    ErrorCode = B_OK;
3358   fs_info     FSInfo;
3359   int32       i;
3360   int32       iClassification;
3361   int32       iProbability;
3362   int32       j;
3363   index_info  IndexInfo;
3364   BMimeType   MimeType;
3365   BMessage    Parameters;
3366   const char *StringPntr;
3367   bool        TempBool;
3368   int32       TempInt32;
3369 
3370   /* Iterate through all mounted devices and try to make the indices on each
3371   one.  Don't bother if the index exists or the device doesn't support indices
3372   (actually queries). */
3373 
3374   Cookie = 0;
3375   while ((DeviceID = next_dev (&Cookie)) >= 0)
3376   {
3377     if (!fs_stat_dev (DeviceID, &FSInfo) && (FSInfo.flags & B_FS_HAS_QUERY))
3378     {
3379       if (fs_stat_index (DeviceID, g_AttributeNameClassification, &IndexInfo)
3380       && errno == B_ENTRY_NOT_FOUND)
3381       {
3382         if (fs_create_index (DeviceID, g_AttributeNameClassification,
3383         B_STRING_TYPE, 0 /* flags */))
3384         {
3385           ErrorCode = errno;
3386           sprintf (ErrorMessage, "Unable to make string index %s on "
3387             "volume #%d, volume name \"%s\", file system type \"%s\", "
3388             "on device \"%s\"", g_AttributeNameClassification,
3389             (int) DeviceID, FSInfo.volume_name, FSInfo.fsh_name,
3390             FSInfo.device_name);
3391         }
3392       }
3393 
3394       if (fs_stat_index (DeviceID, g_AttributeNameSpamRatio,
3395       &IndexInfo) && errno == B_ENTRY_NOT_FOUND)
3396       {
3397         if (fs_create_index (DeviceID, g_AttributeNameSpamRatio,
3398         B_FLOAT_TYPE, 0 /* flags */))
3399         {
3400           ErrorCode = errno;
3401           sprintf (ErrorMessage, "Unable to make float index %s on "
3402             "volume #%d, volume name \"%s\", file system type \"%s\", "
3403             "on device \"%s\"", g_AttributeNameSpamRatio,
3404             (int) DeviceID, FSInfo.volume_name, FSInfo.fsh_name,
3405             FSInfo.device_name);
3406         }
3407       }
3408     }
3409   }
3410   if (ErrorCode != B_OK)
3411     return ErrorCode;
3412 
3413   /* Set up the MIME types for the classification attributes, associate them
3414   with e-mail and make them visible to the user (but not editable).  First need
3415   to get the existing MIME settings, then add ours to them (otherwise the
3416   existing ones get wiped out). */
3417 
3418   ErrorCode = MimeType.SetTo ("text/x-email");
3419   if (ErrorCode != B_OK || !MimeType.IsInstalled ())
3420   {
3421     sprintf (ErrorMessage, "No e-mail MIME type (%s) in the system, can't "
3422       "update it to add our special attributes, and without e-mail this "
3423       "program is useless!", MimeType.Type ());
3424     if (ErrorCode == B_OK)
3425       ErrorCode = -1;
3426     return ErrorCode;
3427   }
3428 
3429   ErrorCode = MimeType.GetAttrInfo (&Parameters);
3430   if (ErrorCode != B_OK)
3431   {
3432     sprintf (ErrorMessage, "Unable to retrieve list of attributes "
3433       "associated with e-mail messages in the MIME database");
3434     return ErrorCode;
3435   }
3436 
3437   for (i = 0, iClassification = -1, iProbability = -1;
3438   i < 1000 && (iClassification < 0 || iProbability < 0);
3439   i++)
3440   {
3441     ErrorCode = Parameters.FindString ("attr:name", i, &StringPntr);
3442     if (ErrorCode != B_OK)
3443       break; /* Reached the end of the attributes. */
3444     if (strcmp (StringPntr, g_AttributeNameClassification) == 0)
3445       iClassification = i;
3446     else if (strcmp (StringPntr, g_AttributeNameSpamRatio) == 0)
3447       iProbability = i;
3448   }
3449 
3450   /* Add extra default settings for those programs which previously didn't
3451   update the MIME database with all the attributes that exist (so our new
3452   additions don't show up at the wrong index). */
3453 
3454   i--; /* Set i to index of last valid attribute. */
3455 
3456   for (j = 0; j <= i; j++)
3457   {
3458     if (Parameters.FindString ("attr:public_name", j, &StringPntr) ==
3459     B_BAD_INDEX)
3460     {
3461       if (Parameters.FindString ("attr:name", j, &StringPntr) != B_OK)
3462         StringPntr = "None!";
3463       Parameters.AddString ("attr:public_name", StringPntr);
3464     }
3465   }
3466 
3467   while (Parameters.FindInt32 ("attr:type", i, &TempInt32) == B_BAD_INDEX)
3468     Parameters.AddInt32 ("attr:type", B_STRING_TYPE);
3469 
3470   while (Parameters.FindBool ("attr:viewable", i, &TempBool) == B_BAD_INDEX)
3471     Parameters.AddBool ("attr:viewable", true);
3472 
3473   while (Parameters.FindBool ("attr:editable", i, &TempBool) == B_BAD_INDEX)
3474     Parameters.AddBool ("attr:editable", false);
3475 
3476   while (Parameters.FindInt32 ("attr:width", i, &TempInt32) == B_BAD_INDEX)
3477     Parameters.AddInt32 ("attr:width", 60);
3478 
3479   while (Parameters.FindInt32 ("attr:alignment", i, &TempInt32) == B_BAD_INDEX)
3480     Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3481 
3482   while (Parameters.FindBool ("attr:extra", i, &TempBool) == B_BAD_INDEX)
3483     Parameters.AddBool ("attr:extra", false);
3484 
3485   /* Add our new attributes to e-mail related things, if not already there. */
3486 
3487   if (iClassification < 0)
3488   {
3489     Parameters.AddString ("attr:name", g_AttributeNameClassification);
3490     Parameters.AddString ("attr:public_name", "Classification Group");
3491     Parameters.AddInt32 ("attr:type", B_STRING_TYPE);
3492     Parameters.AddBool ("attr:viewable", true);
3493     Parameters.AddBool ("attr:editable", false);
3494     Parameters.AddInt32 ("attr:width", 45);
3495     Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3496     Parameters.AddBool ("attr:extra", false);
3497   }
3498 
3499   if (iProbability < 0)
3500   {
3501     Parameters.AddString ("attr:name", g_AttributeNameSpamRatio);
3502     Parameters.AddString ("attr:public_name", "Spam/Genuine Estimate");
3503     Parameters.AddInt32 ("attr:type", B_FLOAT_TYPE);
3504     Parameters.AddBool ("attr:viewable", true);
3505     Parameters.AddBool ("attr:editable", false);
3506     Parameters.AddInt32 ("attr:width", 50);
3507     Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3508     Parameters.AddBool ("attr:extra", false);
3509   }
3510 
3511   if (iClassification < 0 || iProbability < 0)
3512   {
3513     ErrorCode = MimeType.SetAttrInfo (&Parameters);
3514     if (ErrorCode != B_OK)
3515     {
3516       sprintf (ErrorMessage, "Unable to associate the classification "
3517         "attributes with e-mail messages in the MIME database");
3518       return ErrorCode;
3519     }
3520   }
3521 
3522   /* Set up the MIME type for the database file. */
3523 
3524   sprintf (ErrorMessage, "Problems with setting up MIME type (%s) for "
3525     "the database files", g_ABSDatabaseFileMIMEType); /* A generic message. */
3526 
3527   ErrorCode = MimeType.SetTo (g_ABSDatabaseFileMIMEType);
3528   if (ErrorCode != B_OK)
3529     return ErrorCode;
3530 
3531   MimeType.Delete ();
3532   ErrorCode = MimeType.Install ();
3533   if (ErrorCode != B_OK)
3534   {
3535     sprintf (ErrorMessage, "Failed to install MIME type (%s) in the system",
3536       MimeType.Type ());
3537     return ErrorCode;
3538   }
3539 
3540   MimeType.SetShortDescription ("Spam Database");
3541   MimeType.SetLongDescription ("Bayesian Statistical Database for "
3542     "Classifying Junk E-Mail");
3543   sprintf (ErrorMessage, "1.0 ('%s')", g_DatabaseRecognitionString);
3544   MimeType.SetSnifferRule (ErrorMessage);
3545   MimeType.SetPreferredApp (g_ABSAppSignature);
3546 
3547   /* Set up the names of the sound effects.  Later on the user can associate
3548   sound files with the names by using the Sounds preferences panel or the
3549   installsound command.  The MDR add-on filter will trigger these sounds. */
3550 
3551   add_system_beep_event (g_BeepGenuine);
3552   add_system_beep_event (g_BeepSpam);
3553   add_system_beep_event (g_BeepUncertain);
3554 
3555   return B_OK;
3556 }
3557 
3558 
3559 /* Load the database if it hasn't been loaded yet.  Otherwise do nothing. */
3560 
3561 status_t ABSApp::LoadDatabaseIfNeeded (char *ErrorMessage)
3562 {
3563   if (m_WordMap.empty ())
3564     return LoadSaveDatabase (true /* DoLoad */, ErrorMessage);
3565 
3566   return B_OK;
3567 }
3568 
3569 
3570 /* Either load the database of spam words (DoLoad is TRUE) from the file
3571 specified in the settings, or write (DoLoad is FALSE) the database to it.  If
3572 it doesn't exist (and its parent directories do exist) then it will be created
3573 when saving.  If it doesn't exist when loading, the in-memory database will be
3574 set to an empty one and an error will be returned with an explanation put into
3575 ErrorMessage (should be big enough for a path name and a couple of lines of
3576 text).
3577 
3578 The database file format is a UTF-8 text file (well, there could be some
3579 latin-1 characters and other junk in there - it just copies the bytes from the
3580 e-mail messages directly), with tab characters to separate fields (so that you
3581 can also load it into a spreadsheet).  The first line identifies the overall
3582 file type.  The second lists pairs of classifications plus the number of
3583 messages in each class.  Currently it is just Genuine and Spam, but for future
3584 compatibility, that could be followed by more classification pairs.  The
3585 remaining lines each contain a word, the date it was last updated (actually
3586 it's the number of messages in the database when the word was added, smaller
3587 numbers mean it was updated longer ago), the genuine count and the spam count.
3588 */
3589 
3590 status_t ABSApp::LoadSaveDatabase (bool DoLoad, char *ErrorMessage)
3591 {
3592   time_t                             CurrentTime;
3593   FILE                              *DatabaseFile = NULL;
3594   BNode                              DatabaseNode;
3595   BNodeInfo                          DatabaseNodeInfo;
3596   StatisticsMap::iterator            DataIter;
3597   StatisticsMap::iterator            EndIter;
3598   status_t                           ErrorCode;
3599   int                                i;
3600   pair<StatisticsMap::iterator,bool> InsertResult;
3601   char                               LineString [10240];
3602   StatisticsRecord                   Statistics;
3603   const char                        *StringPntr;
3604   char                              *TabPntr;
3605   const char                        *WordPntr;
3606 
3607   if (DoLoad)
3608   {
3609     MakeDatabaseEmpty ();
3610     m_DatabaseHasChanged = false; /* In case of early error exit. */
3611   }
3612   else /* Saving the database, backup the old version on disk. */
3613   {
3614     ErrorCode = MakeBackup (ErrorMessage);
3615     if (ErrorCode != B_OK) /* Usually because the directory isn't there. */
3616       return ErrorCode;
3617   }
3618 
3619   DatabaseFile = fopen (m_DatabaseFileName.String (), DoLoad ? "rb" : "wb");
3620   if (DatabaseFile == NULL)
3621   {
3622     ErrorCode = errno;
3623     sprintf (ErrorMessage, "Can't open database file \"%s\" for %s",
3624       m_DatabaseFileName.String (), DoLoad ? "reading" : "writing");
3625     goto ErrorExit;
3626   }
3627 
3628   /* Process the first line, which identifies the file. */
3629 
3630   if (DoLoad)
3631   {
3632     sprintf (ErrorMessage, "Can't read first line of database file \"%s\", "
3633       "expected it to start with \"%s\"",
3634       m_DatabaseFileName.String (), g_DatabaseRecognitionString);
3635     ErrorCode = -1;
3636 
3637     if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3638       goto ErrorExit;
3639     if (strncmp (LineString, g_DatabaseRecognitionString,
3640     strlen (g_DatabaseRecognitionString)) != 0)
3641       goto ErrorExit;
3642   }
3643   else /* Saving */
3644   {
3645     CurrentTime = time (NULL);
3646     if (fprintf (DatabaseFile, "%s V1 (word, age, genuine count, spam count)\t"
3647     "Written by SpamDBM $Revision: 30630 $\t"
3648     "Compiled on " __DATE__ " at " __TIME__ "\tThis file saved on %s",
3649     g_DatabaseRecognitionString, ctime (&CurrentTime)) <= 0)
3650     {
3651       ErrorCode = errno;
3652       sprintf (ErrorMessage, "Problems when writing to database file \"%s\"",
3653         m_DatabaseFileName.String ());
3654       goto ErrorExit;
3655     }
3656   }
3657 
3658   /* The second line lists the different classifications.  We just check to see
3659   that the first two are Genuine and Spam.  If there are others, they'll be
3660   ignored and lost when the database is saved. */
3661 
3662   if (DoLoad)
3663   {
3664     sprintf (ErrorMessage, "Can't read second line of database file \"%s\", "
3665       "expected it to list classifications %s and %s along with their totals",
3666       m_DatabaseFileName.String (), g_ClassifiedGenuine, g_ClassifiedSpam);
3667     ErrorCode = B_BAD_VALUE;
3668 
3669     if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3670       goto ErrorExit;
3671     i = strlen (LineString);
3672     if (i > 0 && LineString[i-1] == '\n')
3673       LineString[i-1] = 0; /* Remove trailing line feed character. */
3674 
3675     /* Look for the title word at the start of the line. */
3676 
3677     TabPntr = LineString;
3678     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3679       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3680 
3681     if (strncmp (StringPntr, "Classifications", 15) != 0)
3682       goto ErrorExit;
3683 
3684     /* Look for the Genuine class and count. */
3685 
3686     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3687       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3688 
3689     if (strcmp (StringPntr, g_ClassifiedGenuine) != 0)
3690       goto ErrorExit;
3691 
3692     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3693       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3694 
3695     m_TotalGenuineMessages = atoll (StringPntr);
3696 
3697     /* Look for the Spam class and count. */
3698 
3699     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3700       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3701 
3702     if (strcmp (StringPntr, g_ClassifiedSpam) != 0)
3703       goto ErrorExit;
3704 
3705     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3706       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3707 
3708     m_TotalSpamMessages = atoll (StringPntr);
3709   }
3710   else /* Saving */
3711   {
3712     fprintf (DatabaseFile,
3713       "Classifications and total messages:\t%s\t%" B_PRIu32
3714         "\t%s\t%" B_PRIu32 "\n",
3715       g_ClassifiedGenuine, m_TotalGenuineMessages,
3716       g_ClassifiedSpam, m_TotalSpamMessages);
3717   }
3718 
3719   /* The remainder of the file is the list of words and statistics.  Each line
3720   has a word, a tab, the time when the word was last changed in the database
3721   (sequence number of message addition, starts at 0 and goes up by one for each
3722   message added to the database), a tab then the number of messages in the
3723   first class (genuine) that had that word, then a tab, then the number of
3724   messages in the second class (spam) with that word, and so on. */
3725 
3726   if (DoLoad)
3727   {
3728     while (!feof (DatabaseFile))
3729     {
3730       if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3731       {
3732         ErrorCode = errno;
3733         if (feof (DatabaseFile))
3734           break;
3735         if (ErrorCode == B_OK)
3736           ErrorCode = -1;
3737         sprintf (ErrorMessage, "Error while reading words and statistics "
3738           "from database file \"%s\"", m_DatabaseFileName.String ());
3739         goto ErrorExit;
3740       }
3741 
3742       i = strlen (LineString);
3743       if (i > 0 && LineString[i-1] == '\n')
3744         LineString[i-1] = 0; /* Remove trailing line feed character. */
3745 
3746       /* Get the word at the start of the line, save in WordPntr. */
3747 
3748       TabPntr = LineString;
3749       for (WordPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3750         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3751 
3752       /* Get the date stamp.  Actually a sequence number, not a date. */
3753 
3754       for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3755         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3756 
3757       Statistics.age = atoll (StringPntr);
3758 
3759       /* Get the Genuine count. */
3760 
3761       for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3762         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3763 
3764       Statistics.genuineCount = atoll (StringPntr);
3765 
3766       /* Get the Spam count. */
3767 
3768       for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3769         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3770 
3771       Statistics.spamCount = atoll (StringPntr);
3772 
3773       /* Ignore empty words, totally unused words and ones which are too long
3774       (avoids lots of length checking everywhere). */
3775 
3776       if (WordPntr[0] == 0 || strlen (WordPntr) > g_MaxWordLength ||
3777       (Statistics.genuineCount <= 0 && Statistics.spamCount <= 0))
3778         continue; /* Ignore this line of text, start on next one. */
3779 
3780       /* Add the combination to the database. */
3781 
3782       InsertResult = m_WordMap.insert (
3783         StatisticsMap::value_type (WordPntr, Statistics));
3784       if (InsertResult.second == false)
3785       {
3786         ErrorCode = B_BAD_VALUE;
3787         sprintf (ErrorMessage, "Error while inserting word \"%s\" from "
3788           "database \"%s\", perhaps it is a duplicate",
3789           WordPntr, m_DatabaseFileName.String ());
3790         goto ErrorExit;
3791       }
3792       m_WordCount++;
3793 
3794       /* And the hunt for the oldest word. */
3795 
3796       if (Statistics.age < m_OldestAge)
3797         m_OldestAge = Statistics.age;
3798     }
3799   }
3800   else /* Saving, dump all words and statistics to the file. */
3801   {
3802     EndIter = m_WordMap.end ();
3803     for (DataIter = m_WordMap.begin (); DataIter != EndIter; DataIter++)
3804     {
3805       if (fprintf (DatabaseFile,
3806       "%s\t%" B_PRIu32 "\t%" B_PRIu32 "\t%" B_PRIu32 "\n",
3807       DataIter->first.c_str (), DataIter->second.age,
3808       DataIter->second.genuineCount, DataIter->second.spamCount) <= 0)
3809       {
3810         ErrorCode = errno;
3811         sprintf (ErrorMessage, "Error while writing word \"%s\" to "
3812           "database \"%s\"",
3813           DataIter->first.c_str(), m_DatabaseFileName.String ());
3814         goto ErrorExit;
3815       }
3816     }
3817   }
3818 
3819   /* Set the file type so that the new file gets associated with this program,
3820   and picks up the right icon. */
3821 
3822   if (!DoLoad)
3823   {
3824     sprintf (ErrorMessage, "Unable to set attributes (file type) of database "
3825       "file \"%s\"", m_DatabaseFileName.String ());
3826     ErrorCode = DatabaseNode.SetTo (m_DatabaseFileName.String ());
3827     if (ErrorCode != B_OK)
3828       goto ErrorExit;
3829     DatabaseNodeInfo.SetTo (&DatabaseNode);
3830     ErrorCode = DatabaseNodeInfo.SetType (g_ABSDatabaseFileMIMEType);
3831     if (ErrorCode != B_OK)
3832       goto ErrorExit;
3833   }
3834 
3835   /* Success! */
3836   m_DatabaseHasChanged = false;
3837   ErrorCode = B_OK;
3838 
3839 ErrorExit:
3840   if (DatabaseFile != NULL)
3841     fclose (DatabaseFile);
3842   return ErrorCode;
3843 }
3844 
3845 
3846 /* Either load the settings (DoLoad is TRUE) from the configuration file or
3847 write them (DoLoad is FALSE) to it.  The configuration file is a flattened
3848 BMessage containing the various program settings.  If it doesn't exist (and its
3849 parent directories don't exist) then it will be created when saving.  If it
3850 doesn't exist when loading, the settings will be set to default values. */
3851 
3852 status_t ABSApp::LoadSaveSettings (bool DoLoad)
3853 {
3854   status_t    ErrorCode;
3855   const char *NamePntr;
3856   BMessage    Settings;
3857   BDirectory  SettingsDirectory;
3858   BFile       SettingsFile;
3859   const char *StringPntr;
3860   bool        TempBool;
3861   int32       TempInt32;
3862   char        TempString [PATH_MAX + 100];
3863 
3864   /* Preset things to default values if loading, in case of an error or it's an
3865   older version of the settings file which doesn't have every field defined. */
3866 
3867   if (DoLoad)
3868     DefaultSettings ();
3869 
3870   /* Look for our settings directory.  When saving we can try to create it. */
3871 
3872   ErrorCode = SettingsDirectory.SetTo (m_SettingsDirectoryPath.Path ());
3873   if (ErrorCode != B_OK)
3874   {
3875     if (DoLoad || ErrorCode != B_ENTRY_NOT_FOUND)
3876     {
3877       sprintf (TempString, "Can't find settings directory \"%s\"",
3878         m_SettingsDirectoryPath.Path ());
3879       goto ErrorExit;
3880     }
3881     ErrorCode = create_directory (m_SettingsDirectoryPath.Path (), 0755);
3882     if (ErrorCode == B_OK)
3883       ErrorCode = SettingsDirectory.SetTo (m_SettingsDirectoryPath.Path ());
3884     if (ErrorCode != B_OK)
3885     {
3886       sprintf (TempString, "Can't create settings directory \"%s\"",
3887         m_SettingsDirectoryPath.Path ());
3888       goto ErrorExit;
3889     }
3890   }
3891 
3892   ErrorCode = SettingsFile.SetTo (&SettingsDirectory, g_SettingsFileName,
3893     DoLoad ? B_READ_ONLY : B_READ_WRITE | B_CREATE_FILE | B_ERASE_FILE);
3894   if (ErrorCode != B_OK)
3895   {
3896     sprintf (TempString, "Can't open settings file \"%s\" in directory \"%s\" "
3897       "for %s", g_SettingsFileName, m_SettingsDirectoryPath.Path(),
3898       DoLoad ? "reading" : "writing");
3899     goto ErrorExit;
3900   }
3901 
3902   if (DoLoad)
3903   {
3904     ErrorCode = Settings.Unflatten (&SettingsFile);
3905     if (ErrorCode != 0 || Settings.what != g_SettingsWhatCode)
3906     {
3907       sprintf (TempString, "Corrupt data detected while reading settings "
3908         "file \"%s\" in directory \"%s\", will revert to defaults",
3909         g_SettingsFileName, m_SettingsDirectoryPath.Path());
3910       goto ErrorExit;
3911     }
3912   }
3913 
3914   /* Transfer the settings between the BMessage and our various global
3915   variables.  For loading, if the setting isn't present, leave it at the
3916   default value.  Note that loading and saving are intermingled here to make
3917   code maintenance easier (less chance of forgetting to update it if load and
3918   save were separate functions). */
3919 
3920   ErrorCode = B_OK; /* So that saving settings can record an error. */
3921 
3922   NamePntr = "DatabaseFileName";
3923   if (DoLoad)
3924   {
3925     if (Settings.FindString (NamePntr, &StringPntr) == B_OK)
3926       m_DatabaseFileName.SetTo (StringPntr);
3927   }
3928   else if (ErrorCode == B_OK)
3929     ErrorCode = Settings.AddString (NamePntr, m_DatabaseFileName);
3930 
3931   NamePntr = "ServerMode";
3932   if (DoLoad)
3933   {
3934     if (Settings.FindBool (NamePntr, &TempBool) == B_OK)
3935       g_ServerMode = TempBool;
3936   }
3937   else if (ErrorCode == B_OK)
3938     ErrorCode = Settings.AddBool (NamePntr, g_ServerMode);
3939 
3940   NamePntr = "IgnorePreviousClassification";
3941   if (DoLoad)
3942   {
3943     if (Settings.FindBool (NamePntr, &TempBool) == B_OK)
3944       m_IgnorePreviousClassification = TempBool;
3945   }
3946   else if (ErrorCode == B_OK)
3947     ErrorCode = Settings.AddBool (NamePntr, m_IgnorePreviousClassification);
3948 
3949   NamePntr = "PurgeAge";
3950   if (DoLoad)
3951   {
3952     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3953       m_PurgeAge = TempInt32;
3954   }
3955   else if (ErrorCode == B_OK)
3956     ErrorCode = Settings.AddInt32 (NamePntr, m_PurgeAge);
3957 
3958   NamePntr = "PurgePopularity";
3959   if (DoLoad)
3960   {
3961     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3962       m_PurgePopularity = TempInt32;
3963   }
3964   else if (ErrorCode == B_OK)
3965     ErrorCode = Settings.AddInt32 (NamePntr, m_PurgePopularity);
3966 
3967   NamePntr = "ScoringMode";
3968   if (DoLoad)
3969   {
3970     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3971       m_ScoringMode = (ScoringModes) TempInt32;
3972     if (m_ScoringMode < 0 || m_ScoringMode >= SM_MAX)
3973       m_ScoringMode = (ScoringModes) 0;
3974   }
3975   else if (ErrorCode == B_OK)
3976     ErrorCode = Settings.AddInt32 (NamePntr, m_ScoringMode);
3977 
3978   NamePntr = "TokenizeMode";
3979   if (DoLoad)
3980   {
3981     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3982       m_TokenizeMode = (TokenizeModes) TempInt32;
3983     if (m_TokenizeMode < 0 || m_TokenizeMode >= TM_MAX)
3984       m_TokenizeMode = (TokenizeModes) 0;
3985   }
3986   else if (ErrorCode == B_OK)
3987     ErrorCode = Settings.AddInt32 (NamePntr, m_TokenizeMode);
3988 
3989   if (ErrorCode != B_OK)
3990   {
3991     strcpy (TempString, "Unable to stuff the program settings into a "
3992       "temporary BMessage, settings not saved");
3993     goto ErrorExit;
3994   }
3995 
3996   /* Save the settings BMessage to the settings file. */
3997 
3998   if (!DoLoad)
3999   {
4000     Settings.what = g_SettingsWhatCode;
4001     ErrorCode = Settings.Flatten (&SettingsFile);
4002     if (ErrorCode != 0)
4003     {
4004       sprintf (TempString, "Problems while writing settings file \"%s\" in "
4005         "directory \"%s\"", g_SettingsFileName,
4006         m_SettingsDirectoryPath.Path ());
4007       goto ErrorExit;
4008     }
4009   }
4010 
4011   m_SettingsHaveChanged = false;
4012   return B_OK;
4013 
4014 ErrorExit: /* Error message in TempString, code in ErrorCode. */
4015   DisplayErrorMessage (TempString, ErrorCode, DoLoad ?
4016     "Loading Settings Error" : "Saving Settings Error");
4017   return ErrorCode;
4018 }
4019 
4020 
4021 void
4022 ABSApp::MessageReceived (BMessage *MessagePntr)
4023 {
4024   const char           *PropertyName;
4025   struct property_info *PropInfoPntr;
4026   int32                 SpecifierIndex;
4027   int32                 SpecifierKind;
4028   BMessage              SpecifierMessage;
4029 
4030   /* See if it is a scripting message that applies to the database or one of
4031   the other operations this program supports.  Pass on other scripting messages
4032   to the inherited parent MessageReceived function (they're usually scripting
4033   messages for the BApplication). */
4034 
4035   switch (MessagePntr->what)
4036   {
4037     case B_GET_PROPERTY:
4038     case B_SET_PROPERTY:
4039     case B_COUNT_PROPERTIES:
4040     case B_CREATE_PROPERTY:
4041     case B_DELETE_PROPERTY:
4042     case B_EXECUTE_PROPERTY:
4043       if (MessagePntr->GetCurrentSpecifier (&SpecifierIndex, &SpecifierMessage,
4044       &SpecifierKind, &PropertyName) == B_OK &&
4045       SpecifierKind == B_DIRECT_SPECIFIER)
4046       {
4047         for (PropInfoPntr = g_ScriptingPropertyList + 0; true; PropInfoPntr++)
4048         {
4049           if (PropInfoPntr->name == 0)
4050             break; /* Ran out of commands. */
4051 
4052           if (PropInfoPntr->commands[0] == MessagePntr->what &&
4053           strcasecmp (PropInfoPntr->name, PropertyName) == 0)
4054           {
4055             ProcessScriptingMessage (MessagePntr, PropInfoPntr);
4056             return;
4057           }
4058         }
4059       }
4060       break;
4061   }
4062 
4063   /* Pass the unprocessed message to the inherited function, maybe it knows
4064   what to do.  This includes replies to messages we sent ourselves. */
4065 
4066   BApplication::MessageReceived (MessagePntr);
4067 }
4068 
4069 
4070 /* Rename the existing database file to a backup file name, potentially
4071 replacing an older backup.  If something goes wrong, returns an error code and
4072 puts an explanation in ErrorMessage. */
4073 
4074 status_t ABSApp::MakeBackup (char *ErrorMessage)
4075 {
4076   BEntry   Entry;
4077   status_t ErrorCode;
4078   int      i;
4079   char     LeafName [NAME_MAX];
4080   char     NewName [PATH_MAX+20];
4081   char     OldName [PATH_MAX+20];
4082 
4083   ErrorCode = Entry.SetTo (m_DatabaseFileName.String ());
4084   if (ErrorCode != B_OK)
4085   {
4086     sprintf (ErrorMessage, "While making backup, failed to make a BEntry for "
4087       "\"%s\" (maybe the directory doesn't exist?)",
4088       m_DatabaseFileName.String ());
4089     return ErrorCode;
4090   }
4091   if (!Entry.Exists ())
4092     return B_OK; /* No existing file to worry about overwriting. */
4093   Entry.GetName (LeafName);
4094 
4095   /* Find the first hole (no file) where we will stop the renaming chain. */
4096 
4097   for (i = 0; i < g_MaxBackups - 1; i++)
4098   {
4099     strcpy (OldName, m_DatabaseFileName.String ());
4100     sprintf (OldName + strlen (OldName), g_BackupSuffix, i);
4101     Entry.SetTo (OldName);
4102     if (!Entry.Exists ())
4103       break;
4104   }
4105 
4106   /* Move the files down by one to fill in the hole in the name series. */
4107 
4108   for (i--; i >= 0; i--)
4109   {
4110     strcpy (OldName, m_DatabaseFileName.String ());
4111     sprintf (OldName + strlen (OldName), g_BackupSuffix, i);
4112     Entry.SetTo (OldName);
4113     strcpy (NewName, LeafName);
4114     sprintf (NewName + strlen (NewName), g_BackupSuffix, i + 1);
4115     ErrorCode = Entry.Rename (NewName, true /* clobber */);
4116   }
4117 
4118   Entry.SetTo (m_DatabaseFileName.String ());
4119   strcpy (NewName, LeafName);
4120   sprintf (NewName + strlen (NewName), g_BackupSuffix, 0);
4121   ErrorCode = Entry.Rename (NewName, true /* clobber */);
4122   if (ErrorCode != B_OK)
4123     sprintf (ErrorMessage, "While making backup, failed to rename "
4124       "\"%s\" to \"%s\"", m_DatabaseFileName.String (), NewName);
4125 
4126   return ErrorCode;
4127 }
4128 
4129 
4130 void
4131 ABSApp::MakeDatabaseEmpty ()
4132 {
4133   m_WordMap.clear (); /* Sets the map to empty, deallocating any old data. */
4134   m_WordCount = 0;
4135   m_TotalGenuineMessages = 0;
4136   m_TotalSpamMessages = 0;
4137   m_OldestAge = (uint32) -1 /* makes largest number possible */;
4138 }
4139 
4140 
4141 /* Do what the scripting command says.  A reply message will be sent back with
4142 several fields: "error" containing the numerical error code (0 for success),
4143 "CommandText" with a text representation of the command, "result" with the
4144 resulting data for a get or count command.  If it isn't understood, then rather
4145 than a B_REPLY kind of message, it will be a B_MESSAGE_NOT_UNDERSTOOD message
with an "error" number and a "message" string with a description. */
4147 
4148 void
4149 ABSApp::ProcessScriptingMessage (
4150   BMessage *MessagePntr,
4151   struct property_info *PropInfoPntr)
4152 {
4153   bool        ArgumentBool = false;
4154   bool        ArgumentGotBool = false;
4155   bool        ArgumentGotInt32 = false;
4156   bool        ArgumentGotString = false;
4157   int32       ArgumentInt32 = 0;
4158   const char *ArgumentString = NULL;
4159   BString     CommandText;
4160   status_t    ErrorCode;
4161   int         i;
4162   BMessage    ReplyMessage (B_MESSAGE_NOT_UNDERSTOOD);
4163   ssize_t     StringBufferSize;
4164   BMessage    TempBMessage;
4165   BPath       TempPath;
4166   char        TempString [PATH_MAX + 1024];
4167 
4168   if (g_QuitCountdown >= 0 && !g_CommandLineMode)
4169   {
4170     g_QuitCountdown = -1;
4171     cerr << "Quit countdown aborted due to a scripting command arriving.\n";
4172   }
4173 
4174   if (g_BusyCursor != NULL)
4175     SetCursor (g_BusyCursor);
4176 
4177   ErrorCode = MessagePntr->FindData (g_DataName, B_STRING_TYPE,
4178     (const void **) &ArgumentString, &StringBufferSize);
4179   if (ErrorCode == B_OK)
4180   {
4181     if (PropInfoPntr->extra_data != PN_EVALUATE_STRING &&
4182     PropInfoPntr->extra_data != PN_SPAM_STRING &&
4183     PropInfoPntr->extra_data != PN_GENUINE_STRING &&
4184     strlen (ArgumentString) >= PATH_MAX)
4185     {
4186       sprintf (TempString, "\"data\" string of a scripting message is too "
4187         "long, for SET %s action", PropInfoPntr->name);
4188       ErrorCode = B_NAME_TOO_LONG;
4189       goto ErrorExit;
4190     }
4191     ArgumentGotString = true;
4192   }
4193   else if (MessagePntr->FindBool (g_DataName, &ArgumentBool) == B_OK)
4194     ArgumentGotBool = true;
4195   else if (MessagePntr->FindInt32 (g_DataName, &ArgumentInt32) == B_OK)
4196     ArgumentGotInt32 = true;
4197 
4198   /* Prepare a Human readable description of the scripting command. */
4199 
4200   switch (PropInfoPntr->commands[0])
4201   {
4202     case B_SET_PROPERTY:
4203       CommandText.SetTo ("Set ");
4204       break;
4205 
4206     case B_GET_PROPERTY:
4207       CommandText.SetTo ("Get ");
4208       break;
4209 
4210     case B_COUNT_PROPERTIES:
4211       CommandText.SetTo ("Count ");
4212       break;
4213 
4214     case B_CREATE_PROPERTY:
4215       CommandText.SetTo ("Create ");
4216       break;
4217 
4218     case B_DELETE_PROPERTY:
4219       CommandText.SetTo ("Delete ");
4220       break;
4221 
4222     case B_EXECUTE_PROPERTY:
4223       CommandText.SetTo ("Execute ");
4224       break;
4225 
4226     default:
4227       sprintf (TempString, "Bug: scripting command for \"%s\" has an unknown "
4228         "action code %d", PropInfoPntr->name,
4229         (int) PropInfoPntr->commands[0]);
4230       ErrorCode = -1;
4231       goto ErrorExit;
4232   }
4233   CommandText.Append (PropInfoPntr->name);
4234 
4235   /* Add on the argument value to our readable command, if there is one. */
4236 
4237   if (ArgumentGotString)
4238   {
4239     CommandText.Append (" \"");
4240     CommandText.Append (ArgumentString);
4241     CommandText.Append ("\"");
4242   }
4243   if (ArgumentGotBool)
4244     CommandText.Append (ArgumentBool ? " true" : " false");
4245   if (ArgumentGotInt32)
4246   {
4247     sprintf (TempString, " %" B_PRId32, ArgumentInt32);
4248     CommandText.Append (TempString);
4249   }
4250 
4251   /* From now on the scripting command has been recognized and is in the
4252   correct format, so it always returns a B_REPLY message.  A readable version
4253   of the command is also added to make debugging easier. */
4254 
4255   ReplyMessage.what = B_REPLY;
4256   ReplyMessage.AddString ("CommandText", CommandText);
4257 
4258   /* Now actually do the command.  First prepare a default error message. */
4259 
4260   sprintf (TempString, "Operation code %d (get, set, count, etc) "
4261     "unsupported for property %s",
4262     (int) PropInfoPntr->commands[0], PropInfoPntr->name);
4263   ErrorCode = B_BAD_INDEX;
4264 
4265   switch (PropInfoPntr->extra_data)
4266   {
4267     case PN_DATABASE_FILE:
4268       switch (PropInfoPntr->commands[0])
4269       {
4270         case B_GET_PROPERTY: /* Get the database file name. */
4271           ReplyMessage.AddString (g_ResultName, m_DatabaseFileName);
4272           break;
4273 
4274         case B_SET_PROPERTY: /* Set the database file name to a new one. */
4275           if (!ArgumentGotString)
4276           {
4277             ErrorCode = B_BAD_TYPE;
4278             sprintf (TempString, "You need to specify a string for the "
4279               "SET %s command", PropInfoPntr->name);
4280             goto ErrorExit;
4281           }
4282           ErrorCode = TempPath.SetTo (ArgumentString, NULL /* leaf */,
4283             true /* normalize - verifies parent directories exist */);
4284           if (ErrorCode != B_OK)
4285           {
4286             sprintf (TempString, "New database path name of \"%s\" is invalid "
4287               "(parent directories must exist)", ArgumentString);
4288             goto ErrorExit;
4289           }
4290           if ((ErrorCode = SaveDatabaseIfNeeded (TempString)) != B_OK)
4291             goto ErrorExit;
4292           MakeDatabaseEmpty (); /* So that the new one gets loaded if used. */
4293 
4294           if (strlen (TempPath.Leaf ()) > NAME_MAX-strlen(g_BackupSuffix)-1)
4295           {
4296             /* Truncate the name so that there is enough space for the backup
4297             extension.  Approximately. */
4298             strcpy (TempString, TempPath.Leaf ());
4299             TempString [NAME_MAX - strlen (g_BackupSuffix) - 1] = 0;
4300             TempPath.GetParent (&TempPath);
4301             TempPath.Append (TempString);
4302           }
4303           m_DatabaseFileName.SetTo (TempPath.Path ());
4304           m_SettingsHaveChanged = true;
4305           break;
4306 
4307         case B_CREATE_PROPERTY: /* Make a new database file plus more. */
4308           if ((ErrorCode = CreateDatabaseFile (TempString)) != B_OK)
4309             goto ErrorExit;
4310           break;
4311 
4312         case B_DELETE_PROPERTY: /* Delete the file and its backups too. */
4313           if ((ErrorCode = DeleteDatabaseFile (TempString)) != B_OK)
4314             goto ErrorExit;
4315           break;
4316 
4317         case B_COUNT_PROPERTIES:
4318           if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
4319             goto ErrorExit;
4320           ReplyMessage.AddInt32 (g_ResultName, m_WordCount);
4321           break;
4322 
4323         default: /* Unknown operation code, error message already set. */
4324           goto ErrorExit;
4325       }
4326       break;
4327 
4328     case PN_SPAM:
4329     case PN_SPAM_STRING:
4330     case PN_GENUINE:
4331     case PN_GENUINE_STRING:
4332     case PN_UNCERTAIN:
4333       switch (PropInfoPntr->commands[0])
4334       {
4335         case B_COUNT_PROPERTIES: /* Get the number of spam/genuine messages. */
4336           if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
4337             goto ErrorExit;
4338           if (PropInfoPntr->extra_data == PN_SPAM ||
4339           PropInfoPntr->extra_data == PN_SPAM_STRING)
4340             ReplyMessage.AddInt32 (g_ResultName, m_TotalSpamMessages);
4341           else
4342             ReplyMessage.AddInt32 (g_ResultName, m_TotalGenuineMessages);
4343           break;
4344 
4345         case B_SET_PROPERTY: /* Add spam/genuine/uncertain to database. */
4346           if (!ArgumentGotString)
4347           {
4348             ErrorCode = B_BAD_TYPE;
4349             sprintf (TempString, "You need to specify a string (%s) "
4350               "for the SET %s command",
4351               (PropInfoPntr->extra_data == PN_GENUINE_STRING ||
4352               PropInfoPntr->extra_data == PN_SPAM_STRING)
4353               ? "text of the message to be added"
4354               : "pathname of the file containing the text to be added",
4355               PropInfoPntr->name);
4356             goto ErrorExit;
4357           }
4358           if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
4359             goto ErrorExit;
4360           if (PropInfoPntr->extra_data == PN_GENUINE ||
4361           PropInfoPntr->extra_data == PN_SPAM ||
4362           PropInfoPntr->extra_data == PN_UNCERTAIN)
4363             ErrorCode = AddFileToDatabase (
4364               (PropInfoPntr->extra_data == PN_SPAM) ? CL_SPAM :
4365               ((PropInfoPntr->extra_data == PN_GENUINE) ? CL_GENUINE :
4366               CL_UNCERTAIN),
4367               ArgumentString, TempString /* ErrorMessage */);
4368           else
4369             ErrorCode = AddStringToDatabase (
4370               (PropInfoPntr->extra_data == PN_SPAM_STRING) ?
4371               CL_SPAM : CL_GENUINE,
4372               ArgumentString, TempString /* ErrorMessage */);
4373           if (ErrorCode != B_OK)
4374             goto ErrorExit;
4375           break;
4376 
4377         default: /* Unknown operation code, error message already set. */
4378           goto ErrorExit;
4379       }
4380       break;
4381 
4382     case PN_IGNORE_PREVIOUS_CLASSIFICATION:
4383       switch (PropInfoPntr->commands[0])
4384       {
4385         case B_GET_PROPERTY:
4386           ReplyMessage.AddBool (g_ResultName, m_IgnorePreviousClassification);
4387           break;
4388 
4389         case B_SET_PROPERTY:
4390           if (!ArgumentGotBool)
4391           {
4392             ErrorCode = B_BAD_TYPE;
4393             sprintf (TempString, "You need to specify a boolean (true/yes, "
4394               "false/no) for the SET %s command", PropInfoPntr->name);
4395             goto ErrorExit;
4396           }
4397           m_IgnorePreviousClassification = ArgumentBool;
4398           m_SettingsHaveChanged = true;
4399           break;
4400 
4401         default: /* Unknown operation code, error message already set. */
4402           goto ErrorExit;
4403       }
4404       break;
4405 
4406     case PN_SERVER_MODE:
4407       switch (PropInfoPntr->commands[0])
4408       {
4409         case B_GET_PROPERTY:
4410           ReplyMessage.AddBool (g_ResultName, g_ServerMode);
4411           break;
4412 
4413         case B_SET_PROPERTY:
4414           if (!ArgumentGotBool)
4415           {
4416             ErrorCode = B_BAD_TYPE;
4417             sprintf (TempString, "You need to specify a boolean (true/yes, "
4418               "false/no) for the SET %s command", PropInfoPntr->name);
4419             goto ErrorExit;
4420           }
4421           g_ServerMode = ArgumentBool;
4422           m_SettingsHaveChanged = true;
4423           break;
4424 
4425         default: /* Unknown operation code, error message already set. */
4426           goto ErrorExit;
4427       }
4428       break;
4429 
4430     case PN_FLUSH:
4431       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
4432       (ErrorCode = SaveDatabaseIfNeeded (TempString)) == B_OK)
4433         break;
4434       goto ErrorExit;
4435 
4436     case PN_PURGE_AGE:
4437       switch (PropInfoPntr->commands[0])
4438       {
4439         case B_GET_PROPERTY:
4440           ReplyMessage.AddInt32 (g_ResultName, m_PurgeAge);
4441           break;
4442 
4443         case B_SET_PROPERTY:
4444           if (!ArgumentGotInt32)
4445           {
4446             ErrorCode = B_BAD_TYPE;
4447             sprintf (TempString, "You need to specify a 32 bit integer "
4448               "for the SET %s command", PropInfoPntr->name);
4449             goto ErrorExit;
4450           }
4451           m_PurgeAge = ArgumentInt32;
4452           m_SettingsHaveChanged = true;
4453           break;
4454 
4455         default: /* Unknown operation code, error message already set. */
4456           goto ErrorExit;
4457       }
4458       break;
4459 
4460     case PN_PURGE_POPULARITY:
4461       switch (PropInfoPntr->commands[0])
4462       {
4463         case B_GET_PROPERTY:
4464           ReplyMessage.AddInt32 (g_ResultName, m_PurgePopularity);
4465           break;
4466 
4467         case B_SET_PROPERTY:
4468           if (!ArgumentGotInt32)
4469           {
4470             ErrorCode = B_BAD_TYPE;
4471             sprintf (TempString, "You need to specify a 32 bit integer "
4472               "for the SET %s command", PropInfoPntr->name);
4473             goto ErrorExit;
4474           }
4475           m_PurgePopularity = ArgumentInt32;
4476           m_SettingsHaveChanged = true;
4477           break;
4478 
4479         default: /* Unknown operation code, error message already set. */
4480           goto ErrorExit;
4481       }
4482       break;
4483 
4484     case PN_PURGE:
4485       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
4486       (ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK &&
4487       (ErrorCode = PurgeOldWords (TempString)) == B_OK)
4488         break;
4489       goto ErrorExit;
4490 
4491     case PN_OLDEST:
4492       if (PropInfoPntr->commands[0] == B_GET_PROPERTY &&
4493       (ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK)
4494       {
4495         ReplyMessage.AddInt32 (g_ResultName, m_OldestAge);
4496         break;
4497       }
4498       goto ErrorExit;
4499 
4500     case PN_EVALUATE:
4501     case PN_EVALUATE_STRING:
4502       if (PropInfoPntr->commands[0] == B_SET_PROPERTY)
4503       {
4504         if (!ArgumentGotString)
4505         {
4506           ErrorCode = B_BAD_TYPE;
4507           sprintf (TempString, "You need to specify a string for the "
4508             "SET %s command", PropInfoPntr->name);
4509           goto ErrorExit;
4510         }
4511         if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK)
4512         {
4513           if (PropInfoPntr->extra_data == PN_EVALUATE)
4514           {
4515             if ((ErrorCode = EvaluateFile (ArgumentString, &ReplyMessage,
4516             TempString)) == B_OK)
4517               break;
4518           }
4519           else /* PN_EVALUATE_STRING */
4520           {
4521             if ((ErrorCode = EvaluateString (ArgumentString, StringBufferSize,
4522             &ReplyMessage, TempString)) == B_OK)
4523               break;
4524           }
4525         }
4526       }
4527       goto ErrorExit;
4528 
4529     case PN_RESET_TO_DEFAULTS:
4530       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY)
4531       {
4532         DefaultSettings ();
4533         break;
4534       }
4535       goto ErrorExit;
4536 
4537     case PN_INSTALL_THINGS:
4538       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
4539       (ErrorCode = InstallThings (TempString)) == B_OK)
4540         break;
4541       goto ErrorExit;
4542 
4543     case PN_SCORING_MODE:
4544       switch (PropInfoPntr->commands[0])
4545       {
4546         case B_GET_PROPERTY:
4547           ReplyMessage.AddString (g_ResultName,
4548             g_ScoringModeNames[m_ScoringMode]);
4549           break;
4550 
4551         case B_SET_PROPERTY:
4552           i = SM_MAX;
4553           if (ArgumentGotString)
4554             for (i = 0; i < SM_MAX; i++)
4555             {
4556               if (strcasecmp (ArgumentString, g_ScoringModeNames [i]) == 0)
4557               {
4558                 m_ScoringMode = (ScoringModes) i;
4559                 m_SettingsHaveChanged = true;
4560                 break;
4561               }
4562             }
4563           if (i >= SM_MAX) /* Didn't find a valid scoring mode word. */
4564           {
4565             ErrorCode = B_BAD_TYPE;
4566             sprintf (TempString, "You used the unrecognized \"%s\" as "
4567               "a scoring mode for the SET %s command.  Should be one of: ",
4568               ArgumentGotString ? ArgumentString : "not specified",
4569               PropInfoPntr->name);
4570             for (i = 0; i < SM_MAX; i++)
4571             {
4572               strcat (TempString, g_ScoringModeNames [i]);
4573               if (i < SM_MAX - 1)
4574                 strcat (TempString, ", ");
4575             }
4576             goto ErrorExit;
4577           }
4578           break;
4579 
4580         default: /* Unknown operation code, error message already set. */
4581           goto ErrorExit;
4582       }
4583       break;
4584 
4585     case PN_TOKENIZE_MODE:
4586       switch (PropInfoPntr->commands[0])
4587       {
4588         case B_GET_PROPERTY:
4589           ReplyMessage.AddString (g_ResultName,
4590             g_TokenizeModeNames[m_TokenizeMode]);
4591           break;
4592 
4593         case B_SET_PROPERTY:
4594           i = TM_MAX;
4595           if (ArgumentGotString)
4596             for (i = 0; i < TM_MAX; i++)
4597             {
4598               if (strcasecmp (ArgumentString, g_TokenizeModeNames [i]) == 0)
4599               {
4600                 m_TokenizeMode = (TokenizeModes) i;
4601                 m_SettingsHaveChanged = true;
4602                 break;
4603               }
4604             }
4605           if (i >= TM_MAX) /* Didn't find a valid tokenize mode word. */
4606           {
4607             ErrorCode = B_BAD_TYPE;
4608             sprintf (TempString, "You used the unrecognized \"%s\" as "
4609               "a tokenize mode for the SET %s command.  Should be one of: ",
4610               ArgumentGotString ? ArgumentString : "not specified",
4611               PropInfoPntr->name);
4612             for (i = 0; i < TM_MAX; i++)
4613             {
4614               strcat (TempString, g_TokenizeModeNames [i]);
4615               if (i < TM_MAX - 1)
4616                 strcat (TempString, ", ");
4617             }
4618             goto ErrorExit;
4619           }
4620           break;
4621 
4622         default: /* Unknown operation code, error message already set. */
4623           goto ErrorExit;
4624       }
4625       break;
4626 
4627     default:
4628       sprintf (TempString, "Bug!  Unrecognized property identification "
4629         "number %d (should be between 0 and %d).  Fix the entry in "
4630         "the g_ScriptingPropertyList array!",
4631         (int) PropInfoPntr->extra_data, PN_MAX - 1);
4632       goto ErrorExit;
4633   }
4634 
4635   /* Success. */
4636 
4637   ReplyMessage.AddInt32 ("error", B_OK);
4638   ErrorCode = MessagePntr->SendReply (&ReplyMessage,
4639     this /* Reply's reply handler */, 500000 /* send timeout */);
4640   if (ErrorCode != B_OK)
4641     cerr << "ProcessScriptingMessage failed to send a reply message, code " <<
4642     ErrorCode << " (" << strerror (ErrorCode) << ")" << " for " <<
4643     CommandText.String () << endl;
4644   SetCursor (B_CURSOR_SYSTEM_DEFAULT);
4645   return;
4646 
4647 ErrorExit: /* Error message in TempString, return code in ErrorCode. */
4648   ReplyMessage.AddInt32 ("error", ErrorCode);
4649   ReplyMessage.AddString ("message", TempString);
4650   DisplayErrorMessage (TempString, ErrorCode);
4651   ErrorCode = MessagePntr->SendReply (&ReplyMessage,
4652     this /* Reply's reply handler */, 500000 /* send timeout */);
4653   if (ErrorCode != B_OK)
4654     cerr << "ProcessScriptingMessage failed to send an error message, code " <<
4655     ErrorCode << " (" << strerror (ErrorCode) << ")" << " for " <<
4656     CommandText.String () << endl;
4657   SetCursor (B_CURSOR_SYSTEM_DEFAULT);
4658 }
4659 
4660 
4661 /* Since quitting stops the program before the results of a script command are
4662 received, we use a time delay to do the quit and make sure there are no pending
4663 commands being processed by the auxiliary looper which is sending us commands.
4664 Also, we have a countdown which can be interrupted by an incoming scripting
4665 message in case one client tells us to quit while another one is still using us
4666 (happens when you have two or more e-mail accounts).  But if the system is
4667 shutting down, quit immediately! */
4668 
4669 void
4670 ABSApp::Pulse ()
4671 {
4672   if (g_QuitCountdown == 0)
4673   {
4674     if (g_CommanderLooperPntr == NULL ||
4675     !g_CommanderLooperPntr->IsBusy ())
4676       PostMessage (B_QUIT_REQUESTED);
4677   }
4678   else if (g_QuitCountdown > 0)
4679   {
4680     cerr << "SpamDBM quitting in " << g_QuitCountdown << ".\n";
4681     g_QuitCountdown--;
4682   }
4683 }
4684 
4685 
4686 /* A quit request message has come in.  If the quit countdown has reached zero,
4687 allow the request, otherwise reject it (and start the countdown if it hasn't
4688 been started). */
4689 
4690 bool
4691 ABSApp::QuitRequested ()
4692 {
4693   BMessage  *QuitMessage;
4694   team_info  RemoteInfo;
4695   BMessenger RemoteMessenger;
4696   team_id    RemoteTeam;
4697 
4698   /* See if the quit is from the system shutdown command (which goes through
4699   the registrar server), if so, quit immediately. */
4700 
4701   QuitMessage = CurrentMessage ();
4702   if (QuitMessage != NULL && QuitMessage->IsSourceRemote ())
4703   {
4704     RemoteMessenger = QuitMessage->ReturnAddress ();
4705     RemoteTeam = RemoteMessenger.Team ();
4706     if (get_team_info (RemoteTeam, &RemoteInfo) == B_OK &&
4707     strstr (RemoteInfo.args, "registrar") != NULL)
4708       g_QuitCountdown = 0;
4709   }
4710 
4711   if (g_QuitCountdown == 0)
4712     return BApplication::QuitRequested ();
4713 
4714   if (g_QuitCountdown < 0)
4715 //    g_QuitCountdown = 10; /* Start the countdown. */
4716     g_QuitCountdown = 5; /* Quit more quickly */
4717 
4718   return false;
4719 }
4720 
4721 
4722 /* Go through the current database and delete words which are too old (time is
4723 equivalent to the number of messages added to the database) and too unpopular
4724 (words not used by many messages).  Hopefully this will get rid of words which
4725 are just hunks of binary or other garbage.  The database has been loaded
4726 elsewhere. */
4727 
4728 status_t
4729 ABSApp::PurgeOldWords (char *ErrorMessage)
4730 {
4731   uint32                  CurrentTime;
4732   StatisticsMap::iterator CurrentIter;
4733   StatisticsMap::iterator EndIter;
4734   StatisticsMap::iterator NextIter;
4735   char                    TempString [80];
4736 
4737   strcpy (ErrorMessage, "Purge can't fail"); /* So argument gets used. */
4738   CurrentTime = m_TotalGenuineMessages + m_TotalSpamMessages - 1;
4739   m_OldestAge = (uint32) -1 /* makes largest number possible */;
4740 
4741   EndIter = m_WordMap.end ();
4742   NextIter = m_WordMap.begin ();
4743   while (NextIter != EndIter) {
4744     CurrentIter = NextIter++;
4745 
4746     if (CurrentTime - CurrentIter->second.age >= m_PurgeAge &&
4747     CurrentIter->second.genuineCount + CurrentIter->second.spamCount <=
4748     m_PurgePopularity) {
4749       /* Delete this word, it is unpopular and old.  Sob. */
4750 
4751       m_WordMap.erase (CurrentIter);
4752       if (m_WordCount > 0)
4753         m_WordCount--;
4754 
4755       m_DatabaseHasChanged = true;
4756     }
4757     else /* This word is still in the database.  Update oldest age. */
4758     {
4759       if (CurrentIter->second.age < m_OldestAge)
4760         m_OldestAge = CurrentIter->second.age;
4761     }
4762   }
4763 
4764   /* Just a little bug check here.  Just in case. */
4765 
4766   if (m_WordCount != m_WordMap.size ()) {
4767     sprintf (TempString, "Our word count of %" B_PRIu32 " doesn't match the "
4768       "size of the database, %lu", m_WordCount, m_WordMap.size());
4769     DisplayErrorMessage (TempString, -1, "Bug!");
4770     m_WordCount = m_WordMap.size ();
4771   }
4772 
4773   return B_OK;
4774 }
4775 
4776 
4777 void
4778 ABSApp::ReadyToRun ()
4779 {
4780   DatabaseWindow *DatabaseWindowPntr;
4781   float           JunkFloat;
4782   BButton        *TempButtonPntr;
4783   BCheckBox      *TempCheckBoxPntr;
4784   font_height     TempFontHeight;
4785   BMenuBar       *TempMenuBarPntr;
4786   BMenuItem      *TempMenuItemPntr;
4787   BPopUpMenu     *TempPopUpMenuPntr;
4788   BRadioButton   *TempRadioButtonPntr;
4789   BRect           TempRect;
4790   const char     *TempString = "Testing My Things";
4791   BStringView    *TempStringViewPntr;
4792   BTextControl   *TempTextPntr;
4793   BWindow        *TempWindowPntr;
4794 
4795   /* This batch of code gets some measurements which will be used for laying
4796   out controls and other GUI elements.  Set the spacing between buttons and
4797   other controls to the width of the letter "M" in the user's desired font. */
4798 
4799  g_MarginBetweenControls = (int) be_plain_font->StringWidth ("M");
4800 
4801   /* Also find out how much space a line of text uses. */
4802 
4803   be_plain_font->GetHeight (&TempFontHeight);
4804   g_LineOfTextHeight = ceilf (
4805     TempFontHeight.ascent + TempFontHeight.descent + TempFontHeight.leading);
4806 
4807   /* Start finding out the height of various user interface gadgets, which can
4808   vary based on the current font size.  Make a temporary gadget, which is
4809   attached to our window, then resize it to its prefered size so that it
4810   accomodates the font size and other frills it needs. */
4811 
4812   TempWindowPntr = new (std::nothrow) BWindow (BRect (10, 20, 200, 200),
4813 	"Temporary Window", B_DOCUMENT_WINDOW,
4814 	B_NO_WORKSPACE_ACTIVATION | B_ASYNCHRONOUS_CONTROLS);
4815   if (TempWindowPntr == NULL) {
4816     DisplayErrorMessage ("Unable to create temporary window for finding "
4817       "sizes of controls.");
4818     g_QuitCountdown = 0;
4819     return;
4820   }
4821 
4822   TempRect = TempWindowPntr->Bounds ();
4823 
4824   /* Find the height of a single line of text in a BStringView. */
4825 
4826   TempStringViewPntr = new (std::nothrow) BStringView (TempRect, TempString, TempString);
4827   if (TempStringViewPntr != NULL) {
4828     TempWindowPntr->Lock();
4829     TempWindowPntr->AddChild (TempStringViewPntr);
4830     TempStringViewPntr->GetPreferredSize (&JunkFloat, &g_StringViewHeight);
4831     TempWindowPntr->RemoveChild (TempStringViewPntr);
4832     TempWindowPntr->Unlock();
4833     delete TempStringViewPntr;
4834   }
4835 
4836   /* Find the height of a button, which seems to be larger than a text
4837   control and can make life difficult.  Make a temporary button, which
4838   is attached to our window so that it resizes to accomodate the font size. */
4839 
4840   TempButtonPntr = new (std::nothrow) BButton (TempRect, TempString, TempString, NULL);
4841   if (TempButtonPntr != NULL) {
4842     TempWindowPntr->Lock();
4843     TempWindowPntr->AddChild (TempButtonPntr);
4844     TempButtonPntr->GetPreferredSize (&JunkFloat, &g_ButtonHeight);
4845     TempWindowPntr->RemoveChild (TempButtonPntr);
4846     TempWindowPntr->Unlock();
4847     delete TempButtonPntr;
4848   }
4849 
4850   /* Find the height of a text box. */
4851 
4852   TempTextPntr = new (std::nothrow) BTextControl (TempRect, TempString, NULL /* label */,
4853     TempString, NULL);
4854   if (TempTextPntr != NULL) {
4855     TempWindowPntr->Lock ();
4856     TempWindowPntr->AddChild (TempTextPntr);
4857     TempTextPntr->GetPreferredSize (&JunkFloat, &g_TextBoxHeight);
4858     TempWindowPntr->RemoveChild (TempTextPntr);
4859     TempWindowPntr->Unlock ();
4860     delete TempTextPntr;
4861   }
4862 
4863   /* Find the height of a checkbox control. */
4864 
4865   TempCheckBoxPntr = new (std::nothrow) BCheckBox (TempRect, TempString, TempString, NULL);
4866   if (TempCheckBoxPntr != NULL) {
4867     TempWindowPntr->Lock ();
4868     TempWindowPntr->AddChild (TempCheckBoxPntr);
4869     TempCheckBoxPntr->GetPreferredSize (&JunkFloat, &g_CheckBoxHeight);
4870     TempWindowPntr->RemoveChild (TempCheckBoxPntr);
4871     TempWindowPntr->Unlock ();
4872     delete TempCheckBoxPntr;
4873   }
4874 
4875   /* Find the height of a radio button control. */
4876 
4877   TempRadioButtonPntr =
4878     new (std::nothrow) BRadioButton (TempRect, TempString, TempString, NULL);
4879   if (TempRadioButtonPntr != NULL) {
4880     TempWindowPntr->Lock ();
4881     TempWindowPntr->AddChild (TempRadioButtonPntr);
4882     TempRadioButtonPntr->GetPreferredSize (&JunkFloat, &g_RadioButtonHeight);
4883     TempWindowPntr->RemoveChild (TempRadioButtonPntr);
4884     TempWindowPntr->Unlock ();
4885     delete TempRadioButtonPntr;
4886   }
4887 
4888   /* Find the height of a pop-up menu. */
4889 
4890   TempMenuBarPntr = new (std::nothrow) BMenuBar (TempRect, TempString,
4891     B_FOLLOW_LEFT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
4892     true /* resize to fit items */);
4893   TempPopUpMenuPntr = new (std::nothrow) BPopUpMenu (TempString);
4894   TempMenuItemPntr = new (std::nothrow) BMenuItem (TempString, new BMessage (12345), 'g');
4895 
4896   if (TempMenuBarPntr != NULL && TempPopUpMenuPntr != NULL &&
4897   TempMenuItemPntr != NULL)
4898   {
4899     TempPopUpMenuPntr->AddItem (TempMenuItemPntr);
4900     TempMenuBarPntr->AddItem (TempPopUpMenuPntr);
4901 
4902     TempWindowPntr->Lock ();
4903     TempWindowPntr->AddChild (TempMenuBarPntr);
4904     TempMenuBarPntr->GetPreferredSize (&JunkFloat, &g_PopUpMenuHeight);
4905     TempWindowPntr->RemoveChild (TempMenuBarPntr);
4906     TempWindowPntr->Unlock ();
4907     delete TempMenuBarPntr; // It will delete contents too.
4908   }
4909 
4910   TempWindowPntr->Lock ();
4911   TempWindowPntr->Quit ();
4912 
4913   SetPulseRate (500000);
4914 
4915   if (g_CommandLineMode)
4916     g_QuitCountdown = 0; /* Quit as soon as queued up commands done. */
4917   else /* GUI mode, make a window. */
4918   {
4919     DatabaseWindowPntr = new (std::nothrow) DatabaseWindow ();
4920     if (DatabaseWindowPntr == NULL) {
4921       DisplayErrorMessage ("Unable to create window.");
4922       g_QuitCountdown = 0;
4923     } else {
4924       DatabaseWindowPntr->Show (); /* Starts the window's message loop. */
4925     }
4926   }
4927 
4928   g_AppReadyToRunCompleted = true;
4929 }
4930 
4931 
4932 /* Given a mail component (body text, attachment, whatever), look for words in
4933 it.  If the tokenize mode specifies that it isn't one of the ones we are
4934 looking for, just skip it.  For container type components, recursively examine
4935 their contents, up to the maximum depth specified. */
4936 
4937 status_t
4938 ABSApp::RecursivelyTokenizeMailComponent (
4939   BMailComponent *ComponentPntr,
4940   const char *OptionalFileName,
4941   set<string> &WordSet,
4942   char *ErrorMessage,
4943   int RecursionLevel,
4944   int MaxRecursionLevel)
4945 {
4946   char                        AttachmentName [B_FILE_NAME_LENGTH];
4947   BMailAttachment            *AttachmentPntr;
4948   BMimeType                   ComponentMIMEType;
4949   BMailContainer             *ContainerPntr;
4950   BMallocIO                   ContentsIO;
4951   const char                 *ContentsBufferPntr;
4952   size_t                      ContentsBufferSize;
4953   status_t                    ErrorCode;
4954   bool                        ExamineComponent;
4955   const char                 *HeaderKeyPntr;
4956   const char                 *HeaderValuePntr;
4957   int                         i;
4958   int                         j;
4959   const char                 *NameExtension;
4960   int                         NumComponents;
4961   BMimeType                   TextAnyMIMEType ("text");
4962   BMimeType                   TextPlainMIMEType ("text/plain");
4963 
4964   if (ComponentPntr == NULL)
4965     return B_OK;
4966 
4967   /* Add things in the sub-headers that might be useful.  Things like the file
4968   name of attachments, the encoding type, etc. */
4969 
4970   if (m_TokenizeMode == TM_PLAIN_TEXT_HEADER ||
4971   m_TokenizeMode == TM_ANY_TEXT_HEADER ||
4972   m_TokenizeMode == TM_ALL_PARTS_HEADER ||
4973   m_TokenizeMode == TM_JUST_HEADER)
4974   {
4975     for (i = 0; i < 1000; i++)
4976     {
4977       HeaderKeyPntr = ComponentPntr->HeaderAt (i);
4978       if (HeaderKeyPntr == NULL)
4979         break;
4980       AddWordsToSet (HeaderKeyPntr, strlen (HeaderKeyPntr),
4981         'H' /* Prefix for Headers, uppercase unlike normal words. */, WordSet);
4982       for (j = 0; j < 1000; j++)
4983       {
4984         HeaderValuePntr = ComponentPntr->HeaderField (HeaderKeyPntr, j);
4985         if (HeaderValuePntr == NULL)
4986           break;
4987         AddWordsToSet (HeaderValuePntr, strlen (HeaderValuePntr),
4988           'H', WordSet);
4989       }
4990     }
4991   }
4992 
4993   /* Check the MIME type of the thing.  It's used to decide if the contents are
4994   worth examining for words. */
4995 
4996   ErrorCode = ComponentPntr->MIMEType (&ComponentMIMEType);
4997   if (ErrorCode != B_OK)
4998   {
4999     sprintf (ErrorMessage, "ABSApp::RecursivelyTokenizeMailComponent: "
5000       "Unable to get MIME type at level %d in \"%s\"",
5001       RecursionLevel, OptionalFileName);
5002     return ErrorCode;
5003   }
5004   if (ComponentMIMEType.Type() == NULL)
5005   {
5006     /* Have to make up a MIME type for things which don't have them, such as
5007     the main body text, otherwise it would get ignored. */
5008 
5009     if (NULL != dynamic_cast<BTextMailComponent *>(ComponentPntr))
5010       ComponentMIMEType.SetType ("text/plain");
5011   }
5012   if (!TextAnyMIMEType.Contains (&ComponentMIMEType) &&
5013   NULL != (AttachmentPntr = dynamic_cast<BMailAttachment *>(ComponentPntr)))
5014   {
5015     /* Sometimes spam doesn't give a text MIME type for text when they do an
5016     attachment (which is often base64 encoded).  Use the file name extension to
5017     see if it really is text. */
5018     NameExtension = NULL;
5019     if (AttachmentPntr->FileName (AttachmentName) >= 0)
5020       NameExtension = strrchr (AttachmentName, '.');
5021     if (NameExtension != NULL)
5022     {
5023       if (strcasecmp (NameExtension, ".txt") == 0)
5024         ComponentMIMEType.SetType ("text/plain");
5025       else if (strcasecmp (NameExtension, ".htm") == 0 ||
5026       strcasecmp (NameExtension, ".html") == 0)
5027         ComponentMIMEType.SetType ("text/html");
5028     }
5029   }
5030 
5031   switch (m_TokenizeMode)
5032   {
5033     case TM_PLAIN_TEXT:
5034     case TM_PLAIN_TEXT_HEADER:
5035       ExamineComponent = TextPlainMIMEType.Contains (&ComponentMIMEType);
5036       break;
5037 
5038     case TM_ANY_TEXT:
5039     case TM_ANY_TEXT_HEADER:
5040       ExamineComponent = TextAnyMIMEType.Contains (&ComponentMIMEType);
5041       break;
5042 
5043     case TM_ALL_PARTS:
5044     case TM_ALL_PARTS_HEADER:
5045       ExamineComponent = true;
5046       break;
5047 
5048     default:
5049       ExamineComponent = false;
5050       break;
5051   }
5052 
5053   if (ExamineComponent)
5054   {
5055     /* Get the contents of the component.  This will be UTF-8 text (converted
5056     from whatever encoding was used) for text attachments.  For other ones,
5057     it's just the raw data, or perhaps decoded from base64 encoding. */
5058 
5059     ContentsIO.SetBlockSize (16 * 1024);
5060     ErrorCode = ComponentPntr->GetDecodedData (&ContentsIO);
5061     if (ErrorCode == B_OK) /* Can fail for container components: no data. */
5062     {
5063       /* Look for words in the decoded data. */
5064 
5065       ContentsBufferPntr = (const char *) ContentsIO.Buffer ();
5066       ContentsBufferSize = ContentsIO.BufferLength ();
5067       if (ContentsBufferPntr != NULL /* can be empty */)
5068         AddWordsToSet (ContentsBufferPntr, ContentsBufferSize,
5069           0 /* no prefix character, this is body text */, WordSet);
5070     }
5071   }
5072 
5073   /* Examine any sub-components in the message. */
5074 
5075   if (RecursionLevel + 1 <= MaxRecursionLevel &&
5076   NULL != (ContainerPntr = dynamic_cast<BMailContainer *>(ComponentPntr)))
5077   {
5078     NumComponents = ContainerPntr->CountComponents ();
5079 
5080     for (i = 0; i < NumComponents; i++)
5081     {
5082       ComponentPntr = ContainerPntr->GetComponent (i);
5083 
5084       ErrorCode = RecursivelyTokenizeMailComponent (ComponentPntr,
5085         OptionalFileName, WordSet, ErrorMessage, RecursionLevel + 1,
5086         MaxRecursionLevel);
5087       if (ErrorCode != B_OK)
5088         break;
5089     }
5090   }
5091 
5092   return ErrorCode;
5093 }
5094 
5095 
5096 /* The user has tried to open a file or several files with this application,
5097 via Tracker's open-with menu item.  If it is a database type file, then change
5098 the database file name to it.  Otherwise, ask the user whether they want to
5099 classify it as spam or non-spam.  There will be at most around 100 files, BeOS
5100 R5.0.3's Tracker crashes if it tries to pass on more than that many using Open
5101 With... etc.  The command is sent to an intermediary thread where it is
5102 asynchronously converted into a scripting message(s) that are sent back to this
5103 BApplication.  The intermediary is needed since we can't recursively execute
5104 scripting messages while processing a message (this RefsReceived one). */
5105 
5106 void
5107 ABSApp::RefsReceived (BMessage *MessagePntr)
5108 {
5109   if (g_CommanderLooperPntr != NULL)
5110     g_CommanderLooperPntr->CommandReferences (MessagePntr);
5111 }
5112 
5113 
5114 /* A scripting command is looking for something to execute it.  See if it is
5115 targetted at our database. */
5116 
5117 BHandler * ABSApp::ResolveSpecifier (
5118   BMessage *MessagePntr,
5119   int32 Index,
5120   BMessage *SpecifierMsgPntr,
5121   int32 SpecificationKind,
5122   const char *PropertyPntr)
5123 {
5124   int i;
5125 
5126   /* See if it is one of our commands. */
5127 
5128   if (SpecificationKind == B_DIRECT_SPECIFIER)
5129   {
5130     for (i = PN_MAX - 1; i >= 0; i--)
5131     {
5132       if (strcasecmp (PropertyPntr, g_PropertyNames [i]) == 0)
5133         return this; /* Found it!  Return the Handler (which is us). */
5134     }
5135   }
5136 
5137   /* Handle an unrecognized scripting command, let the parent figure it out. */
5138 
5139   return BApplication::ResolveSpecifier (
5140     MessagePntr, Index, SpecifierMsgPntr, SpecificationKind, PropertyPntr);
5141 }
5142 
5143 
5144 /* Save the database if it hasn't been saved yet.  Otherwise do nothing. */
5145 
5146 status_t ABSApp::SaveDatabaseIfNeeded (char *ErrorMessage)
5147 {
5148   if (m_DatabaseHasChanged)
5149     return LoadSaveDatabase (false /* DoLoad */, ErrorMessage);
5150 
5151   return B_OK;
5152 }
5153 
5154 
5155 /* Presumably the file is an e-mail message (or at least the header portion of
5156 one).  Break it into parts: header, body and MIME components.  Then add the
5157 words in the portions that match the current tokenization settings to the set
5158 of words. */
5159 
5160 status_t ABSApp::TokenizeParts (
5161   BPositionIO *PositionIOPntr,
5162   const char *OptionalFileName,
5163   set<string> &WordSet,
5164   char *ErrorMessage)
5165 {
5166   status_t        ErrorCode = B_OK;
5167   BEmailMessage   WholeEMail;
5168 
5169   sprintf (ErrorMessage, "ABSApp::TokenizeParts: While getting e-mail "
5170     "headers, had problems with \"%s\"", OptionalFileName);
5171 
5172   ErrorCode = WholeEMail.SetToRFC822 (
5173     PositionIOPntr /* it does its own seeking to the start */,
5174     -1 /* length */, true /* parse_now */);
5175   if (ErrorCode < 0) goto ErrorExit;
5176 
5177   ErrorCode = RecursivelyTokenizeMailComponent (&WholeEMail,
5178     OptionalFileName, WordSet, ErrorMessage, 0 /* Initial recursion level */,
5179     (m_TokenizeMode == TM_JUST_HEADER) ? 0 : 500 /* Max recursion level */);
5180 
5181 ErrorExit:
5182   return ErrorCode;
5183 }
5184 
5185 
5186 /* Add all the words in the whole file or memory buffer to the supplied set.
5187 The file doesn't have to be an e-mail message since it isn't parsed for e-mail
5188 headers or MIME headers or anything.  It blindly adds everything that looks
5189 like a word, though it does convert quoted printable codes to the characters
5190 they represent.  See also AddWordsToSet which does something more advanced. */
5191 
5192 status_t ABSApp::TokenizeWhole (
5193   BPositionIO *PositionIOPntr,
5194   const char *OptionalFileName,
5195   set<string> &WordSet,
5196   char *ErrorMessage)
5197 {
5198   string                AccumulatedWord;
5199   uint8                 Buffer [16 * 1024];
5200   uint8                *BufferCurrentPntr = Buffer + 0;
5201   uint8                *BufferEndPntr = Buffer + 0;
5202   const char           *IOErrorString =
5203                           "TokenizeWhole: Error %ld while reading \"%s\"";
5204   size_t                Length;
5205   int                   Letter = ' ';
5206   char                  HexString [4];
5207   int                   NextLetter = ' ';
5208   int                   NextNextLetter = ' ';
5209 
5210   /* Use a buffer since reading single characters from a BFile is so slow.
5211   BufferCurrentPntr is the position of the next character to be read.  When it
5212   reaches BufferEndPntr, it is time to fill the buffer again. */
5213 
5214 #define ReadChar(CharVar) \
5215   { \
5216     if (BufferCurrentPntr < BufferEndPntr) \
5217       CharVar = *BufferCurrentPntr++; \
5218     else /* Try to fill the buffer. */ \
5219     { \
5220       ssize_t AmountRead; \
5221       AmountRead = PositionIOPntr->Read (Buffer, sizeof (Buffer)); \
5222       if (AmountRead < 0) \
5223       { \
5224         sprintf (ErrorMessage, IOErrorString, AmountRead, OptionalFileName); \
5225         return AmountRead; \
5226       } \
5227       else if (AmountRead == 0) \
5228         CharVar = EOF; \
5229       else \
5230       { \
5231         BufferEndPntr = Buffer + AmountRead; \
5232         BufferCurrentPntr = Buffer + 0; \
5233         CharVar = *BufferCurrentPntr++; \
5234       } \
5235     } \
5236   }
5237 
5238   /* Read all the words in the file and add them to our local set of words.  A
5239   set is used since we don't care how many times a word occurs. */
5240 
5241   while (true)
5242   {
5243     /* We read two letters ahead so that we can decode quoted printable
5244     characters (an equals sign followed by two hex digits or a new line).  Note
5245     that Letter can become EOF (-1) when end of file is reached. */
5246 
5247     Letter = NextLetter;
5248     NextLetter = NextNextLetter;
5249     ReadChar (NextNextLetter);
5250 
5251     /* Decode quoted printable codes first, so that the rest of the code just
5252     sees an ordinary character.  Or even nothing, if it is the hidden line
5253     break combination.  This may falsely corrupt stuff following an equals
5254     sign, but usually won't. */
5255 
5256     if (Letter == '=')
5257     {
5258       if ((NextLetter == '\r' && NextNextLetter == '\n') ||
5259       (NextLetter == '\n' && NextNextLetter == '\r'))
5260       {
5261         /* Make the "=\r\n" pair disappear.  It's not even white space. */
5262         ReadChar (NextLetter);
5263         ReadChar (NextNextLetter);
5264         continue;
5265       }
5266       if (NextLetter == '\n' || NextLetter == '\r')
5267       {
5268         /* Make the "=\n" pair disappear.  It's not even white space. */
5269         NextLetter = NextNextLetter;
5270         ReadChar (NextNextLetter);
5271         continue;
5272       }
5273       if (NextNextLetter != EOF &&
5274       isxdigit (NextLetter) && isxdigit (NextNextLetter))
5275       {
5276         /* Convert the hex code to a letter. */
5277         HexString[0] = NextLetter;
5278         HexString[1] = NextNextLetter;
5279         HexString[2] = 0;
5280         Letter = strtoul (HexString, NULL, 16 /* number system base */);
5281         ReadChar (NextLetter);
5282         ReadChar (NextNextLetter);
5283       }
5284     }
5285 
5286     /* Convert to lower case to improve word matches.  Of course this loses a
5287     bit of information, such as MONEY vs Money, an indicator of spam.  Well,
5288     apparently that isn't all that useful a distinction, so do it. */
5289 
5290     if (Letter >= 'A' && Letter < 'Z')
5291       Letter = Letter + ('a' - 'A');
5292 
5293     /* See if it is a letter we treat as white space - all control characters
5294     and all punctuation except for: apostrophe (so "it's" and possessive
5295     versions of words get stored), dash (for hyphenated words), dollar sign
5296     (for cash amounts), period (for IP addresses, we later remove trailing
5297     (periods).  Note that codes above 127 are UTF-8 characters, which we
5298     consider non-space. */
5299 
5300     if (Letter < 0 /* EOF */ || (Letter < 128 && g_SpaceCharacters[Letter]))
5301     {
5302       /* That space finished off a word.  Remove trailing periods... */
5303 
5304       while ((Length = AccumulatedWord.size()) > 0 &&
5305       AccumulatedWord [Length-1] == '.')
5306         AccumulatedWord.resize (Length - 1);
5307 
5308       /* If there's anything left in the word, add it to the set.  Also ignore
5309       words which are too big (it's probably some binary encoded data).  But
5310       leave room for supercalifragilisticexpialidoceous.  According to one web
5311       site, pneumonoultramicroscopicsilicovolcanoconiosis is the longest word
5312       currently in English.  Note that some uuencoded data was seen with a 60
5313       character line length. */
5314 
5315       if (Length > 0 && Length <= g_MaxWordLength)
5316         WordSet.insert (AccumulatedWord);
5317 
5318       /* Empty out the string to get ready for the next word. */
5319 
5320       AccumulatedWord.resize (0);
5321     }
5322     else /* Not a space-like character, add it to the word. */
5323       AccumulatedWord.append (1 /* one copy of the char */, (char) Letter);
5324 
5325     /* Stop at end of file or error.  Don't care which.  Exit here so that last
5326     word got processed. */
5327 
5328     if (Letter == EOF)
5329       break;
5330   }
5331 
5332   return B_OK;
5333 }
5334 
5335 
5336 
/******************************************************************************
 * Implementation of the ClassificationChoicesWindow class, constructor,
 * destructor and the rest of the member functions in mostly alphabetical
 * order.
 */
5342 
5343 ClassificationChoicesWindow::ClassificationChoicesWindow (
5344   BRect FrameRect,
5345   const char *FileName,
5346   int NumberOfFiles)
5347 : BWindow (FrameRect, "Classification Choices", B_TITLED_WINDOW,
5348     B_NOT_ZOOMABLE | B_NOT_RESIZABLE | B_ASYNCHRONOUS_CONTROLS),
5349   m_BulkModeSelectedPntr (NULL),
5350   m_ChoosenClassificationPntr (NULL)
5351 {
5352   ClassificationChoicesView *SubViewPntr;
5353 
5354   SubViewPntr = new ClassificationChoicesView (Bounds(),
5355     FileName, NumberOfFiles);
5356   AddChild (SubViewPntr);
5357   SubViewPntr->ResizeToPreferred ();
5358   ResizeTo (SubViewPntr->Frame().Width(), SubViewPntr->Frame().Height());
5359 }
5360 
5361 
5362 void
5363 ClassificationChoicesWindow::MessageReceived (BMessage *MessagePntr)
5364 {
5365   BControl *ControlPntr;
5366 
5367   if (MessagePntr->what >= MSG_CLASS_BUTTONS &&
5368   MessagePntr->what < MSG_CLASS_BUTTONS + CL_MAX)
5369   {
5370     if (m_ChoosenClassificationPntr != NULL)
5371       *m_ChoosenClassificationPntr =
5372         (ClassificationTypes) (MessagePntr->what - MSG_CLASS_BUTTONS);
5373     PostMessage (B_QUIT_REQUESTED); // Close and destroy the window.
5374     return;
5375   }
5376 
5377   if (MessagePntr->what == MSG_BULK_CHECKBOX)
5378   {
5379     if (m_BulkModeSelectedPntr != NULL &&
5380     MessagePntr->FindPointer ("source", (void **) &ControlPntr) == B_OK)
5381       *m_BulkModeSelectedPntr = (ControlPntr->Value() == B_CONTROL_ON);
5382     return;
5383   }
5384 
5385   if (MessagePntr->what == MSG_CANCEL_BUTTON)
5386   {
5387     PostMessage (B_QUIT_REQUESTED); // Close and destroy the window.
5388     return;
5389   }
5390 
5391   BWindow::MessageReceived (MessagePntr);
5392 }
5393 
5394 
5395 void
5396 ClassificationChoicesWindow::Go (
5397   bool *BulkModeSelectedPntr,
5398   ClassificationTypes *ChoosenClassificationPntr)
5399 {
5400   status_t  ErrorCode = 0;
5401   BView    *MainViewPntr;
5402   thread_id WindowThreadID;
5403 
5404   m_BulkModeSelectedPntr = BulkModeSelectedPntr;
5405   m_ChoosenClassificationPntr = ChoosenClassificationPntr;
5406   if (m_ChoosenClassificationPntr != NULL)
5407     *m_ChoosenClassificationPntr = CL_MAX;
5408 
5409   Show (); // Starts the window thread running.
5410 
5411   /* Move the window to the center of the screen it is now being displayed on
5412   (have to wait for it to be showing). */
5413 
5414   Lock ();
5415   MainViewPntr = FindView ("ClassificationChoicesView");
5416   if (MainViewPntr != NULL)
5417   {
5418     BRect   TempRect;
5419     BScreen TempScreen (this);
5420     float   X;
5421     float   Y;
5422 
5423     TempRect = TempScreen.Frame ();
5424     X = TempRect.Width() / 2;
5425     Y = TempRect.Height() / 2;
5426     TempRect = MainViewPntr->Frame();
5427     X -= TempRect.Width() / 2;
5428     Y -= TempRect.Height() / 2;
5429     MoveTo (ceilf (X), ceilf (Y));
5430   }
5431   Unlock ();
5432 
5433   /* Wait for the window to go away. */
5434 
5435   WindowThreadID = Thread ();
5436   if (WindowThreadID >= 0)
5437     // Delay until the window thread has died, presumably window deleted now.
5438     wait_for_thread (WindowThreadID, &ErrorCode);
5439 }
5440 
5441 
5442 
5443 /******************************************************************************
5444  * Implementation of the ClassificationChoicesView class, constructor,
5445  * destructor and the rest of the member functions in mostly alphabetical
5446  * order.
5447  */
5448 
5449 ClassificationChoicesView::ClassificationChoicesView (
5450   BRect FrameRect,
5451   const char *FileName,
5452   int NumberOfFiles)
5453 : BView (FrameRect, "ClassificationChoicesView",
5454     B_FOLLOW_TOP | B_FOLLOW_LEFT, B_WILL_DRAW | B_NAVIGABLE_JUMP),
5455   m_FileName (FileName),
5456   m_NumberOfFiles (NumberOfFiles),
5457   m_PreferredBottomY (ceilf (g_ButtonHeight * 10))
5458 {
5459 }
5460 
5461 
5462 void
5463 ClassificationChoicesView::AttachedToWindow ()
5464 {
5465   BButton            *ButtonPntr;
5466   BCheckBox          *CheckBoxPntr;
5467   ClassificationTypes Classification;
5468   float               Margin;
5469   float               RowHeight;
5470   float               RowTop;
5471   BTextView          *TextViewPntr;
5472   BRect               TempRect;
5473   char                TempString [2048];
5474   BRect               TextRect;
5475   float               X;
5476 
5477   SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
5478 
5479   RowHeight = g_ButtonHeight;
5480   if (g_CheckBoxHeight > RowHeight)
5481     RowHeight = g_CheckBoxHeight;
5482   RowHeight = ceilf (RowHeight * 1.1);
5483 
5484   TempRect = Bounds ();
5485   RowTop = TempRect.top;
5486 
5487   /* Show the file name text. */
5488 
5489   Margin = ceilf ((RowHeight - g_StringViewHeight) / 2);
5490   TempRect = Bounds ();
5491   TempRect.top = RowTop + Margin;
5492   TextRect = TempRect;
5493   TextRect.OffsetTo (0, 0);
5494   TextRect.InsetBy (g_MarginBetweenControls, 2);
5495   sprintf (TempString, "How do you want to classify the file named \"%s\"?",
5496     m_FileName);
5497   TextViewPntr = new BTextView (TempRect, "FileText", TextRect,
5498     B_FOLLOW_TOP | B_FOLLOW_LEFT, B_WILL_DRAW | B_FULL_UPDATE_ON_RESIZE);
5499   AddChild (TextViewPntr);
5500   TextViewPntr->SetText (TempString);
5501   TextViewPntr->MakeEditable (false);
5502   TextViewPntr->SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
5503   TextViewPntr->ResizeTo (TempRect.Width (),
5504     3 + TextViewPntr->TextHeight (0, sizeof (TempString)));
5505   RowTop = TextViewPntr->Frame().bottom + Margin;
5506 
5507   /* Make the classification buttons. */
5508 
5509   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
5510   TempRect = Bounds ();
5511   TempRect.top = RowTop + Margin;
5512   X = Bounds().left + g_MarginBetweenControls;
5513   for (Classification = (ClassificationTypes) 0; Classification < CL_MAX;
5514   Classification = (ClassificationTypes) ((int) Classification + 1))
5515   {
5516     TempRect = Bounds ();
5517     TempRect.top = RowTop + Margin;
5518     TempRect.left = X;
5519     sprintf (TempString, "%s Button",
5520       g_ClassificationTypeNames [Classification]);
5521     ButtonPntr = new BButton (TempRect, TempString,
5522       g_ClassificationTypeNames [Classification], new BMessage (
5523       ClassificationChoicesWindow::MSG_CLASS_BUTTONS + Classification));
5524     AddChild (ButtonPntr);
5525     ButtonPntr->ResizeToPreferred ();
5526     X = ButtonPntr->Frame().right + 3 * g_MarginBetweenControls;
5527   }
5528   RowTop += ceilf (RowHeight * 1.2);
5529 
5530   /* Make the Cancel button. */
5531 
5532   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
5533   TempRect = Bounds ();
5534   TempRect.top = RowTop + Margin;
5535   TempRect.left += g_MarginBetweenControls;
5536   ButtonPntr = new BButton (TempRect, "Cancel Button",
5537     "Cancel", new BMessage (ClassificationChoicesWindow::MSG_CANCEL_BUTTON));
5538   AddChild (ButtonPntr);
5539   ButtonPntr->ResizeToPreferred ();
5540   X = ButtonPntr->Frame().right + g_MarginBetweenControls;
5541 
5542   /* Make the checkbox for bulk operations. */
5543 
5544   if (m_NumberOfFiles > 1)
5545   {
5546     Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
5547     TempRect = Bounds ();
5548     TempRect.top = RowTop + Margin;
5549     TempRect.left = X;
5550     sprintf (TempString, "Mark all %d remaining messages the same way.",
5551       m_NumberOfFiles - 1);
5552     CheckBoxPntr = new BCheckBox (TempRect, "BulkBox", TempString,
5553       new BMessage (ClassificationChoicesWindow::MSG_BULK_CHECKBOX));
5554     AddChild (CheckBoxPntr);
5555     CheckBoxPntr->ResizeToPreferred ();
5556   }
5557   RowTop += RowHeight;
5558 
5559   m_PreferredBottomY = RowTop;
5560 }
5561 
5562 
5563 void
5564 ClassificationChoicesView::GetPreferredSize (float *width, float *height)
5565 {
5566   if (width != NULL)
5567     *width = Bounds().Width();
5568   if (height != NULL)
5569     *height = m_PreferredBottomY;
5570 }
5571 
5572 
5573 
5574 /******************************************************************************
5575  * Implementation of the CommanderLooper class, constructor, destructor and the
5576  * rest of the member functions in mostly alphabetical order.
5577  */
5578 
5579 CommanderLooper::CommanderLooper ()
5580 : BLooper ("CommanderLooper", B_NORMAL_PRIORITY),
5581   m_IsBusy (false)
5582 {
5583 }
5584 
5585 
5586 CommanderLooper::~CommanderLooper ()
5587 {
5588   g_CommanderLooperPntr = NULL;
5589   delete g_CommanderMessenger;
5590   g_CommanderMessenger = NULL;
5591 }
5592 
5593 
5594 /* Process some command line arguments.  Basically just send a message to this
5595 looper itself to do the work later.  That way the caller can continue doing
5596 whatever they're doing, particularly if it's the BApplication. */
5597 
5598 void
5599 CommanderLooper::CommandArguments (int argc, char **argv)
5600 {
5601   int      i;
5602   BMessage InternalMessage;
5603 
5604   InternalMessage.what = MSG_COMMAND_ARGUMENTS;
5605   for (i = 0; i < argc; i++)
5606     InternalMessage.AddString ("arg", argv[i]);
5607 
5608   PostMessage (&InternalMessage);
5609 }
5610 
5611 
5612 /* Copy the refs out of the given message and stuff them into an internal
5613 message to ourself (so that the original message can be returned to the caller,
5614 and if it is Tracker, it can close the file handles it has open).  Optionally
5615 allow preset classification rather than asking the user (set BulkMode to TRUE
5616 and specify the class with BulkClassification). */
5617 
5618 void
5619 CommanderLooper::CommandReferences (
5620   BMessage *MessagePntr,
5621   bool BulkMode,
5622   ClassificationTypes BulkClassification)
5623 {
5624   entry_ref EntryRef;
5625   int       i;
5626   BMessage  InternalMessage;
5627 
5628   InternalMessage.what = MSG_COMMAND_FILE_REFS;
5629   for (i = 0; MessagePntr->FindRef ("refs", i, &EntryRef) == B_OK; i++)
5630     InternalMessage.AddRef ("refs", &EntryRef);
5631   InternalMessage.AddBool ("BulkMode", BulkMode);
5632   InternalMessage.AddInt32 ("BulkClassification", BulkClassification);
5633 
5634   PostMessage (&InternalMessage);
5635 }
5636 
5637 
5638 /* This function is called by other threads to see if the CommanderLooper is
5639 busy working on something. */
5640 
5641 bool
5642 CommanderLooper::IsBusy ()
5643 {
5644   if (m_IsBusy)
5645     return true;
5646 
5647   if (IsLocked () || !MessageQueue()->IsEmpty ())
5648     return true;
5649 
5650   return false;
5651 }
5652 
5653 
5654 void
5655 
5656 CommanderLooper::MessageReceived (BMessage *MessagePntr)
5657 {
5658   m_IsBusy = true;
5659 
5660   if (MessagePntr->what == MSG_COMMAND_ARGUMENTS)
5661     ProcessArgs (MessagePntr);
5662   else if (MessagePntr->what == MSG_COMMAND_FILE_REFS)
5663     ProcessRefs (MessagePntr);
5664   else
5665     BLooper::MessageReceived (MessagePntr);
5666 
5667   m_IsBusy = false;
5668 }
5669 
5670 
5671 /* Process the command line by converting it into a series of scripting
5672 messages (possibly thousands) and sent them to the BApplication synchronously
5673 (so we can print the result). */
5674 
5675 void
5676 CommanderLooper::ProcessArgs (BMessage *MessagePntr)
5677 {
5678   int32                 argc = 0;
5679   const char          **argv = NULL;
5680   int                   ArgumentIndex;
5681   uint32                CommandCode;
5682   const char           *CommandWord;
5683   status_t              ErrorCode;
5684   const char           *ErrorTitle = "ProcessArgs";
5685   char                 *EndPntr;
5686   int32                 i;
5687   BMessage              ReplyMessage;
5688   BMessage              ScriptMessage;
5689   struct property_info *PropInfoPntr;
5690   const char           *PropertyName;
5691   bool                  TempBool;
5692   float                 TempFloat;
5693   int32                 TempInt32;
5694   const char           *TempStringPntr;
5695   type_code             TypeCode;
5696   const char           *ValuePntr;
5697 
5698   /* Get the argument count and pointers to arguments out of the message and
5699   into our argc and argv. */
5700 
5701   ErrorCode = MessagePntr->GetInfo ("arg", &TypeCode, &argc);
5702   if (ErrorCode != B_OK || TypeCode != B_STRING_TYPE)
5703   {
5704     DisplayErrorMessage ("Unable to find argument strings in message",
5705       ErrorCode, ErrorTitle);
5706     goto ErrorExit;
5707   }
5708 
5709   if (argc < 2)
5710   {
5711     cerr << PrintUsage;
5712     DisplayErrorMessage ("You need to specify a command word, like GET, SET "
5713       "and so on followed by a property, like DatabaseFile, and maybe "
5714       "followed by a value of some sort", -1, ErrorTitle);
5715     goto ErrorExit;
5716   }
5717 
5718   argv = (const char **) malloc (sizeof (char *) * argc);
5719   if (argv == NULL)
5720   {
5721     DisplayErrorMessage ("Out of memory when allocating argv array",
5722       ENOMEM, ErrorTitle);
5723     goto ErrorExit;
5724   }
5725 
5726   for (i = 0; i < argc; i++)
5727   {
5728     if ((ErrorCode = MessagePntr->FindString ("arg", i, &argv[i])) != B_OK)
5729     {
5730       DisplayErrorMessage ("Unable to find argument in the BMessage",
5731         ErrorCode, ErrorTitle);
5732       goto ErrorExit;
5733     }
5734   }
5735 
5736   CommandWord = argv[1];
5737 
5738   /* Special case for the Quit command since it isn't a scripting command. */
5739 
5740   if (strcasecmp (CommandWord, "quit") == 0)
5741   {
5742     g_QuitCountdown = 10;
5743     goto ErrorExit;
5744   }
5745 
5746   /* Find the corresponding scripting command. */
5747 
5748   if (strcasecmp (CommandWord, "set") == 0)
5749     CommandCode = B_SET_PROPERTY;
5750   else if (strcasecmp (CommandWord, "get") == 0)
5751     CommandCode = B_GET_PROPERTY;
5752   else if (strcasecmp (CommandWord, "count") == 0)
5753     CommandCode = B_COUNT_PROPERTIES;
5754   else if (strcasecmp (CommandWord, "create") == 0)
5755     CommandCode = B_CREATE_PROPERTY;
5756   else if (strcasecmp (CommandWord, "delete") == 0)
5757     CommandCode = B_DELETE_PROPERTY;
5758   else
5759     CommandCode = B_EXECUTE_PROPERTY;
5760 
5761   if (CommandCode == B_EXECUTE_PROPERTY)
5762   {
5763     PropertyName = CommandWord;
5764     ArgumentIndex = 2; /* Arguments to the command start at this index. */
5765   }
5766   else
5767   {
5768     if (CommandCode == B_SET_PROPERTY)
5769     {
5770       /* SET commands require at least one argument value. */
5771       if (argc < 4)
5772       {
5773         cerr << PrintUsage;
5774         DisplayErrorMessage ("SET commands require at least one "
5775           "argument value after the property name", -1, ErrorTitle);
5776         goto ErrorExit;
5777       }
5778     }
5779     else
5780       if (argc < 3)
5781       {
5782         cerr << PrintUsage;
5783         DisplayErrorMessage ("You need to specify a property to act on",
5784           -1, ErrorTitle);
5785         goto ErrorExit;
5786       }
5787     PropertyName = argv[2];
5788     ArgumentIndex = 3;
5789   }
5790 
5791   /* See if it is one of our commands. */
5792 
5793   for (PropInfoPntr = g_ScriptingPropertyList + 0; true; PropInfoPntr++)
5794   {
5795     if (PropInfoPntr->name == 0)
5796     {
5797       cerr << PrintUsage;
5798       DisplayErrorMessage ("The property specified isn't known or "
5799         "doesn't support the requested action (usually means it is an "
5800         "unknown command)", -1, ErrorTitle);
5801       goto ErrorExit; /* Unrecognized command. */
5802     }
5803 
5804     if (PropInfoPntr->commands[0] == CommandCode &&
5805     strcasecmp (PropertyName, PropInfoPntr->name) == 0)
5806       break;
5807   }
5808 
5809   /* Make the equivalent command message.  For commands with multiple
5810   arguments, repeat the message for each single argument and just change the
5811   data portion for each extra argument.  Send the command and wait for a reply,
5812   which we'll print out. */
5813 
5814   ScriptMessage.MakeEmpty ();
5815   ScriptMessage.what = CommandCode;
5816   ScriptMessage.AddSpecifier (PropertyName);
5817   while (true)
5818   {
5819     if (ArgumentIndex < argc) /* If there are arguments to be added. */
5820     {
5821       ValuePntr = argv[ArgumentIndex];
5822 
5823       /* Convert the value into the likely kind of data. */
5824 
5825       if (strcasecmp (ValuePntr, "yes") == 0 ||
5826       strcasecmp (ValuePntr, "true") == 0)
5827         ScriptMessage.AddBool (g_DataName, true);
5828       else if (strcasecmp (ValuePntr, "no") == 0 ||
5829       strcasecmp (ValuePntr, "false") == 0)
5830         ScriptMessage.AddBool (g_DataName, false);
5831       else
5832       {
5833         /* See if it is a number. */
5834         i = strtol (ValuePntr, &EndPntr, 0);
5835         if (*EndPntr == 0)
5836           ScriptMessage.AddInt32 (g_DataName, i);
5837         else /* Nope, it's just a string. */
5838           ScriptMessage.AddString (g_DataName, ValuePntr);
5839       }
5840     }
5841 
5842     ErrorCode = be_app_messenger.SendMessage (&ScriptMessage, &ReplyMessage);
5843     if (ErrorCode != B_OK)
5844     {
5845       DisplayErrorMessage ("Unable to send scripting command",
5846         ErrorCode, ErrorTitle);
5847       goto ErrorExit;
5848     }
5849 
5850     /* Print the reply to the scripting command.  Even in server mode.  To
5851     standard output. */
5852 
5853     if (ReplyMessage.FindString ("CommandText", &TempStringPntr) == B_OK)
5854     {
5855       TempInt32 = -1;
5856       if (ReplyMessage.FindInt32 ("error", &TempInt32) == B_OK &&
5857       TempInt32 == B_OK)
5858       {
5859         /* It's a successful reply to one of our scripting messages.  Print out
5860         the returned values code for command line users to see. */
5861 
5862         cout << "Result of command to " << TempStringPntr << " is:\t";
5863         if (ReplyMessage.FindString (g_ResultName, &TempStringPntr) == B_OK)
5864           cout << "\"" << TempStringPntr << "\"";
5865         else if (ReplyMessage.FindInt32 (g_ResultName, &TempInt32) == B_OK)
5866           cout << TempInt32;
5867         else if (ReplyMessage.FindFloat (g_ResultName, &TempFloat) == B_OK)
5868           cout << TempFloat;
5869         else if (ReplyMessage.FindBool (g_ResultName, &TempBool) == B_OK)
5870           cout << (TempBool ? "true" : "false");
5871         else
5872           cout << "just plain success";
5873         if (ReplyMessage.FindInt32 ("count", &TempInt32) == B_OK)
5874           cout << "\t(count " << TempInt32 << ")";
5875         for (i = 0; (i < 50) &&
5876         ReplyMessage.FindString ("words", i, &TempStringPntr) == B_OK &&
5877         ReplyMessage.FindFloat ("ratios", i, &TempFloat) == B_OK;
5878         i++)
5879         {
5880           if (i == 0)
5881             cout << "\twith top words:\t";
5882           else
5883             cout << "\t";
5884           cout << TempStringPntr << "/" << TempFloat;
5885         }
5886         cout << endl;
5887       }
5888       else /* An error reply, print out the error, even in server mode. */
5889       {
5890         cout << "Failure of command " << TempStringPntr << ", error ";
5891         cout << TempInt32 << " (" << strerror (TempInt32) << ")";
5892         if (ReplyMessage.FindString ("message", &TempStringPntr) == B_OK)
5893           cout << ", message: " << TempStringPntr;
5894         cout << "." << endl;
5895       }
5896     }
5897 
5898     /* Advance to the next argument and its scripting message. */
5899 
5900     ScriptMessage.RemoveName (g_DataName);
5901     if (++ArgumentIndex >= argc)
5902       break;
5903   }
5904 
5905 ErrorExit:
5906   free (argv);
5907 }
5908 
5909 
5910 /* Given a bunch of references to files, open the files.  If it's a database
5911 file, switch to using it as a database.  Otherwise, treat them as text files
5912 and add them to the database.  Prompt the user for the spam or genuine or
5913 uncertain (declassification) choice, with the option to bulk mark many files at
5914 once. */
5915 
5916 void
5917 CommanderLooper::ProcessRefs (BMessage *MessagePntr)
5918 {
5919   bool                         BulkMode = false;
5920   ClassificationTypes          BulkClassification = CL_GENUINE;
5921   ClassificationChoicesWindow *ChoiceWindowPntr;
5922   BEntry                       Entry;
5923   entry_ref                    EntryRef;
5924   status_t                     ErrorCode;
5925   const char                  *ErrorTitle = "CommanderLooper::ProcessRefs";
5926   int32                        NumberOfRefs = 0;
5927   BPath                        Path;
5928   int                          RefIndex;
5929   BMessage                     ReplyMessage;
5930   BMessage                     ScriptingMessage;
5931   bool                         TempBool;
5932   BFile                        TempFile;
5933   int32                        TempInt32;
5934   char                         TempString [PATH_MAX + 1024];
5935   type_code                    TypeCode;
5936 
5937   // Wait for ReadyToRun to finish initializing the globals with the sizes of
5938   // the controls, since they are needed when we show the custom alert box for
5939   // choosing the message type.
5940 
5941   TempInt32 = 0;
5942   while (!g_AppReadyToRunCompleted && TempInt32++ < 10)
5943     snooze (200000);
5944 
5945   ErrorCode = MessagePntr->GetInfo ("refs", &TypeCode, &NumberOfRefs);
5946   if (ErrorCode != B_OK || TypeCode != B_REF_TYPE || NumberOfRefs <= 0)
5947   {
5948     DisplayErrorMessage ("Unable to get refs from the message",
5949       ErrorCode, ErrorTitle);
5950     return;
5951   }
5952 
5953   if (MessagePntr->FindBool ("BulkMode", &TempBool) == B_OK)
5954     BulkMode = TempBool;
5955   if (MessagePntr->FindInt32 ("BulkClassification", &TempInt32) == B_OK &&
5956   TempInt32 >= 0 && TempInt32 < CL_MAX)
5957     BulkClassification = (ClassificationTypes) TempInt32;
5958 
5959   for (RefIndex = 0;
5960   MessagePntr->FindRef ("refs", RefIndex, &EntryRef) == B_OK;
5961   RefIndex++)
5962   {
5963     ScriptingMessage.MakeEmpty ();
5964     ScriptingMessage.what = 0; /* Haven't figured out what to do yet. */
5965 
5966     /* See if the entry is a valid file or directory or other thing. */
5967 
5968     ErrorCode = Entry.SetTo (&EntryRef, true /* traverse symbolic links */);
5969     if (ErrorCode != B_OK ||
5970     ((ErrorCode = /* assignment */ B_ENTRY_NOT_FOUND) != 0 /* this pacifies
5971     mwcc -nwhitehorn */ && !Entry.Exists ()) ||
5972     ((ErrorCode = Entry.GetPath (&Path)) != B_OK))
5973     {
5974       DisplayErrorMessage ("Bad entry reference encountered, will skip it",
5975         ErrorCode, ErrorTitle);
5976       BulkMode = false;
5977       continue; /* Bad file reference, try the next one. */
5978     }
5979 
5980     /* If it's a file, check if it is a spam database file.  Go by the magic
5981     text at the start of the file, in case someone has edited the file with a
5982     spreadsheet or other tool and lost the MIME type. */
5983 
5984     if (Entry.IsFile ())
5985     {
5986       ErrorCode = TempFile.SetTo (&Entry, B_READ_ONLY);
5987       if (ErrorCode != B_OK)
5988       {
5989         sprintf (TempString, "Unable to open file \"%s\" for reading, will "
5990           "skip it", Path.Path ());
5991         DisplayErrorMessage (TempString, ErrorCode, ErrorTitle);
5992         BulkMode = false;
5993         continue;
5994       }
5995       if (TempFile.Read (TempString, strlen (g_DatabaseRecognitionString)) ==
5996       (int) strlen (g_DatabaseRecognitionString) && strncmp (TempString,
5997       g_DatabaseRecognitionString, strlen (g_DatabaseRecognitionString)) == 0)
5998       {
5999         ScriptingMessage.what = B_SET_PROPERTY;
6000         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
6001         ScriptingMessage.AddString (g_DataName, Path.Path ());
6002       }
6003       TempFile.Unset ();
6004     }
6005 
6006     /* Not a database file.  Could be a directory or a file.  Submit it as
6007     something to be marked spam or genuine. */
6008 
6009     if (ScriptingMessage.what == 0)
6010     {
6011       if (!Entry.IsFile ())
6012       {
6013         sprintf (TempString, "\"%s\" is not a file, can't do anything with it",
6014           Path.Path ());
6015         DisplayErrorMessage (TempString, -1, ErrorTitle);
6016         BulkMode = false;
6017         continue;
6018       }
6019 
6020       if (!BulkMode) /* Have to ask the user. */
6021       {
6022         ChoiceWindowPntr = new ClassificationChoicesWindow (
6023           BRect (40, 40, 40 + 50 * g_MarginBetweenControls,
6024           40 + g_ButtonHeight * 5), Path.Path (), NumberOfRefs - RefIndex);
6025         ChoiceWindowPntr->Go (&BulkMode, &BulkClassification);
6026         if (BulkClassification == CL_MAX)
6027           break; /* Cancel was picked. */
6028       }
6029 
6030       /* Format the command for classifying the file. */
6031 
6032       ScriptingMessage.what = B_SET_PROPERTY;
6033 
6034       if (BulkClassification == CL_GENUINE)
6035         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_GENUINE]);
6036       else if (BulkClassification == CL_SPAM)
6037         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_SPAM]);
6038       else if (BulkClassification == CL_UNCERTAIN)
6039         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_UNCERTAIN]);
6040       else /* Broken code */
6041         break;
6042       ScriptingMessage.AddString (g_DataName, Path.Path ());
6043     }
6044 
6045     /* Tell the BApplication to do the work, and wait for it to finish.  The
6046     BApplication will display any error messages for us. */
6047 
6048     ErrorCode =
6049       be_app_messenger.SendMessage (&ScriptingMessage, &ReplyMessage);
6050     if (ErrorCode != B_OK)
6051     {
6052       DisplayErrorMessage ("Unable to send scripting command",
6053         ErrorCode, ErrorTitle);
6054       return;
6055     }
6056 
6057     /* If there was an error, allow the user to stop by switching off bulk
6058     mode.  The message will already have been displayed in an alert box, if
6059     server mode is off. */
6060 
6061     if (ReplyMessage.FindInt32 ("error", &TempInt32) != B_OK ||
6062     TempInt32 != B_OK)
6063       BulkMode = false;
6064   }
6065 }
6066 
6067 
6068 
6069 /******************************************************************************
6070  * Implementation of the ControlsView class, constructor, destructor and the
6071  * rest of the member functions in mostly alphabetical order.
6072  */
6073 
6074 ControlsView::ControlsView (BRect NewBounds)
6075 : BView (NewBounds, "ControlsView", B_FOLLOW_TOP | B_FOLLOW_LEFT_RIGHT,
6076     B_WILL_DRAW | B_PULSE_NEEDED | B_NAVIGABLE_JUMP | B_FRAME_EVENTS),
6077   m_AboutButtonPntr (NULL),
6078   m_AddExampleButtonPntr (NULL),
6079   m_BrowseButtonPntr (NULL),
6080   m_BrowseFilePanelPntr (NULL),
6081   m_CreateDatabaseButtonPntr (NULL),
6082   m_DatabaseFileNameTextboxPntr (NULL),
6083   m_DatabaseLoadDone (false),
6084   m_EstimateSpamButtonPntr (NULL),
6085   m_EstimateSpamFilePanelPntr (NULL),
6086   m_GenuineCountTextboxPntr (NULL),
6087   m_IgnorePreviousClassCheckboxPntr (NULL),
6088   m_InstallThingsButtonPntr (NULL),
6089   m_PurgeAgeTextboxPntr (NULL),
6090   m_PurgeButtonPntr (NULL),
6091   m_PurgePopularityTextboxPntr (NULL),
6092   m_ResetToDefaultsButtonPntr (NULL),
6093   m_ScoringModeMenuBarPntr (NULL),
6094   m_ScoringModePopUpMenuPntr (NULL),
6095   m_ServerModeCheckboxPntr (NULL),
6096   m_SpamCountTextboxPntr (NULL),
6097   m_TimeOfLastPoll (0),
6098   m_TokenizeModeMenuBarPntr (NULL),
6099   m_TokenizeModePopUpMenuPntr (NULL),
6100   m_WordCountTextboxPntr (NULL)
6101 {
6102 }
6103 
6104 
6105 ControlsView::~ControlsView ()
6106 {
6107   if (m_BrowseFilePanelPntr != NULL)
6108   {
6109     delete m_BrowseFilePanelPntr;
6110     m_BrowseFilePanelPntr = NULL;
6111   }
6112 
6113   if (m_EstimateSpamFilePanelPntr != NULL)
6114   {
6115     delete m_EstimateSpamFilePanelPntr;
6116     m_EstimateSpamFilePanelPntr = NULL;
6117   }
6118 }
6119 
6120 
6121 void
6122 ControlsView::AttachedToWindow ()
6123 {
6124   float         BigPurgeButtonTop;
6125   BMessage      CommandMessage;
6126   const char   *EightDigitsString = " 12345678 ";
6127   float         Height;
6128   float         Margin;
6129   float         RowHeight;
6130   float         RowTop;
6131   ScoringModes  ScoringMode;
6132   const char   *StringPntr;
6133   BMenuItem    *TempMenuItemPntr;
6134   BRect         TempRect;
6135   char          TempString [PATH_MAX];
6136   TokenizeModes TokenizeMode;
6137   float         Width;
6138   float         X;
6139 
6140   SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
6141 
6142   TempRect = Bounds ();
6143   X = TempRect.right;
6144   RowTop = TempRect.top;
6145   RowHeight = g_ButtonHeight;
6146   if (g_TextBoxHeight > RowHeight)
6147     RowHeight = g_TextBoxHeight;
6148   RowHeight = ceilf (RowHeight * 1.1);
6149 
6150   /* Make the Create button at the far right of the first row of controls,
6151   which are all database file related. */
6152 
6153   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6154   TempRect = Bounds ();
6155   TempRect.top = RowTop + Margin;
6156   TempRect.bottom = TempRect.top + g_ButtonHeight;
6157 
6158   CommandMessage.MakeEmpty ();
6159   CommandMessage.what = B_CREATE_PROPERTY;
6160   CommandMessage.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
6161   m_CreateDatabaseButtonPntr = new BButton (TempRect, "Create Button",
6162     "Create", new BMessage (CommandMessage), B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6163   if (m_CreateDatabaseButtonPntr == NULL) goto ErrorExit;
6164   AddChild (m_CreateDatabaseButtonPntr);
6165   m_CreateDatabaseButtonPntr->SetTarget (be_app);
6166   m_CreateDatabaseButtonPntr->ResizeToPreferred ();
6167   m_CreateDatabaseButtonPntr->GetPreferredSize (&Width, &Height);
6168   m_CreateDatabaseButtonPntr->MoveTo (X - Width, TempRect.top);
6169   X -= Width + g_MarginBetweenControls;
6170 
6171   /* Make the Browse button, middle of the first row. */
6172 
6173   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6174   TempRect = Bounds ();
6175   TempRect.top = RowTop + Margin;
6176   TempRect.bottom = TempRect.top + g_ButtonHeight;
6177 
6178   m_BrowseButtonPntr = new BButton (TempRect, "Browse Button",
6179     "Browse…", new BMessage (MSG_BROWSE_BUTTON), B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6180   if (m_BrowseButtonPntr == NULL) goto ErrorExit;
6181   AddChild (m_BrowseButtonPntr);
6182   m_BrowseButtonPntr->SetTarget (this);
6183   m_BrowseButtonPntr->ResizeToPreferred ();
6184   m_BrowseButtonPntr->GetPreferredSize (&Width, &Height);
6185   m_BrowseButtonPntr->MoveTo (X - Width, TempRect.top);
6186   X -= Width + g_MarginBetweenControls;
6187 
6188   /* Fill the rest of the space on the first row with the file name box. */
6189 
6190   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6191   TempRect = Bounds ();
6192   TempRect.top = RowTop + Margin;
6193   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6194   TempRect.right = X;
6195 
6196   StringPntr = "Word Database:";
6197   strcpy (m_DatabaseFileNameCachedValue, "Unknown...");
6198   m_DatabaseFileNameTextboxPntr = new BTextControl (TempRect,
6199     "File Name",
6200     StringPntr /* label */,
6201     m_DatabaseFileNameCachedValue /* text */,
6202     new BMessage (MSG_DATABASE_NAME),
6203     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP,
6204     B_WILL_DRAW | B_NAVIGABLE | B_NAVIGABLE_JUMP);
6205   AddChild (m_DatabaseFileNameTextboxPntr);
6206   m_DatabaseFileNameTextboxPntr->SetTarget (this);
6207   m_DatabaseFileNameTextboxPntr->SetDivider (
6208     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6209 
6210   /* Second row contains the purge age, and a long line explaining it.  There
6211   is space to the right where the top half of the big purge button will go. */
6212 
6213   RowTop += RowHeight /* previous row's RowHeight */;
6214   BigPurgeButtonTop = RowTop;
6215   TempRect = Bounds ();
6216   X = TempRect.left;
6217   RowHeight = g_TextBoxHeight;
6218   RowHeight = ceilf (RowHeight * 1.1);
6219 
6220   StringPntr = "Number of occurrences needed to store a word:";
6221   m_PurgeAgeCachedValue = 12345678;
6222 
6223   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6224   TempRect.top = RowTop + Margin;
6225   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6226   TempRect.left = X;
6227   TempRect.right = TempRect.left +
6228     be_plain_font->StringWidth (StringPntr) +
6229     be_plain_font->StringWidth (EightDigitsString) +
6230     3 * g_MarginBetweenControls;
6231 
6232   sprintf (TempString, "%d", (int) m_PurgeAgeCachedValue);
6233   m_PurgeAgeTextboxPntr = new BTextControl (TempRect,
6234     "Purge Age",
6235     StringPntr /* label */,
6236     TempString /* text */,
6237     new BMessage (MSG_PURGE_AGE),
6238     B_FOLLOW_LEFT | B_FOLLOW_TOP,
6239     B_WILL_DRAW | B_NAVIGABLE);
6240   AddChild (m_PurgeAgeTextboxPntr);
6241   m_PurgeAgeTextboxPntr->SetTarget (this);
6242   m_PurgeAgeTextboxPntr->SetDivider (
6243     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6244 
6245   /* Third row contains the purge popularity and bottom half of the purge
6246   button. */
6247 
6248   RowTop += RowHeight /* previous row's RowHeight */;
6249   TempRect = Bounds ();
6250   X = TempRect.left;
6251   RowHeight = g_TextBoxHeight;
6252   RowHeight = ceilf (RowHeight * 1.1);
6253 
6254   StringPntr = "Number of messages to store words from:";
6255   m_PurgePopularityCachedValue = 87654321;
6256   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6257   TempRect.top = RowTop + Margin;
6258   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6259   TempRect.left = X;
6260   TempRect.right = TempRect.left +
6261     be_plain_font->StringWidth (StringPntr) +
6262     be_plain_font->StringWidth (EightDigitsString) +
6263     3 * g_MarginBetweenControls;
6264   X = TempRect.right + g_MarginBetweenControls;
6265 
6266   sprintf (TempString, "%d", (int) m_PurgePopularityCachedValue);
6267   m_PurgePopularityTextboxPntr = new BTextControl (TempRect,
6268     "Purge Popularity",
6269     StringPntr /* label */,
6270     TempString /* text */,
6271     new BMessage (MSG_PURGE_POPULARITY),
6272     B_FOLLOW_LEFT | B_FOLLOW_TOP,
6273     B_WILL_DRAW | B_NAVIGABLE);
6274   AddChild (m_PurgePopularityTextboxPntr);
6275   m_PurgePopularityTextboxPntr->SetTarget (this);
6276   m_PurgePopularityTextboxPntr->SetDivider (
6277     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6278 
6279   /* Make the purge button, which will take up space in the 2nd and 3rd rows,
6280   on the right side.  Twice as tall as a regular button too. */
6281 
6282   StringPntr = "Remove Old Words";
6283   Margin = ceilf ((((RowTop + RowHeight) - BigPurgeButtonTop) -
6284     2 * g_TextBoxHeight) / 2);
6285   TempRect.top = BigPurgeButtonTop + Margin;
6286   TempRect.bottom = TempRect.top + 2 * g_TextBoxHeight;
6287   TempRect.left = X;
6288   TempRect.right = X + ceilf (2 * be_plain_font->StringWidth (StringPntr));
6289 
6290   CommandMessage.MakeEmpty ();
6291   CommandMessage.what = B_EXECUTE_PROPERTY;
6292   CommandMessage.AddSpecifier (g_PropertyNames[PN_PURGE]);
6293   m_PurgeButtonPntr = new BButton (TempRect, "Purge Button",
6294     StringPntr, new BMessage (CommandMessage), B_FOLLOW_LEFT | B_FOLLOW_TOP);
6295   if (m_PurgeButtonPntr == NULL) goto ErrorExit;
6296   m_PurgeButtonPntr->ResizeToPreferred();
6297   AddChild (m_PurgeButtonPntr);
6298   m_PurgeButtonPntr->SetTarget (be_app);
6299 
6300   /* The fourth row contains the ignore previous classification checkbox. */
6301 
6302   RowTop += RowHeight /* previous row's RowHeight */;
6303   TempRect = Bounds ();
6304   X = TempRect.left;
6305   RowHeight = g_CheckBoxHeight;
6306   RowHeight = ceilf (RowHeight * 1.1);
6307 
6308   StringPntr = "Allow Retraining on a Message";
6309   m_IgnorePreviousClassCachedValue = false;
6310 
6311   Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
6312   TempRect.top = RowTop + Margin;
6313   TempRect.bottom = TempRect.top + g_CheckBoxHeight;
6314   TempRect.left = X;
6315   m_IgnorePreviousClassCheckboxPntr = new BCheckBox (TempRect,
6316     "Ignore Check",
6317     StringPntr,
6318     new BMessage (MSG_IGNORE_CLASSIFICATION),
6319     B_FOLLOW_TOP | B_FOLLOW_LEFT);
6320   if (m_IgnorePreviousClassCheckboxPntr == NULL) goto ErrorExit;
6321   AddChild (m_IgnorePreviousClassCheckboxPntr);
6322   m_IgnorePreviousClassCheckboxPntr->SetTarget (this);
6323   m_IgnorePreviousClassCheckboxPntr->ResizeToPreferred ();
6324   m_IgnorePreviousClassCheckboxPntr->GetPreferredSize (&Width, &Height);
6325   X += Width + g_MarginBetweenControls;
6326 
6327   /* The fifth row contains the server mode checkbox. */
6328 
6329   RowTop += RowHeight /* previous row's RowHeight */;
6330   TempRect = Bounds ();
6331   RowHeight = g_CheckBoxHeight;
6332   RowHeight = ceilf (RowHeight * 1.1);
6333 
6334   StringPntr = "Print errors to Terminal";
6335   m_ServerModeCachedValue = false;
6336 
6337   Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
6338   TempRect.top = RowTop + Margin;
6339   TempRect.bottom = TempRect.top + g_CheckBoxHeight;
6340   m_ServerModeCheckboxPntr = new BCheckBox (TempRect,
6341     "ServerMode Check",
6342     StringPntr,
6343     new BMessage (MSG_SERVER_MODE),
6344     B_FOLLOW_TOP | B_FOLLOW_LEFT);
6345   if (m_ServerModeCheckboxPntr == NULL) goto ErrorExit;
6346   AddChild (m_ServerModeCheckboxPntr);
6347   m_ServerModeCheckboxPntr->SetTarget (this);
6348   m_ServerModeCheckboxPntr->ResizeToPreferred ();
6349   m_ServerModeCheckboxPntr->GetPreferredSize (&Width, &Height);
6350 
6351   /* This row just contains a huge pop-up menu which shows the tokenize mode
6352   and an explanation of what each mode does. */
6353 
6354   RowTop += RowHeight /* previous row's RowHeight */;
6355   TempRect = Bounds ();
6356   RowHeight = g_PopUpMenuHeight;
6357   RowHeight = ceilf (RowHeight * 1.1);
6358 
6359   Margin = ceilf ((RowHeight - g_PopUpMenuHeight) / 2);
6360   TempRect.top = RowTop + Margin;
6361   TempRect.bottom = TempRect.top + g_PopUpMenuHeight;
6362 
6363   m_TokenizeModeCachedValue = TM_MAX; /* Illegal value will force redraw. */
6364   m_TokenizeModeMenuBarPntr = new BMenuBar (TempRect, "TokenizeModeMenuBar",
6365     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
6366     false /* resize to fit items */);
6367   if (m_TokenizeModeMenuBarPntr == NULL) goto ErrorExit;
6368   m_TokenizeModePopUpMenuPntr = new BPopUpMenu ("TokenizeModePopUpMenu");
6369   if (m_TokenizeModePopUpMenuPntr == NULL) goto ErrorExit;
6370 
6371   for (TokenizeMode = (TokenizeModes) 0;
6372   TokenizeMode < TM_MAX;
6373   TokenizeMode = (TokenizeModes) ((int) TokenizeMode + 1))
6374   {
6375     /* Each different tokenize mode gets its own menu item.  Selecting the item
6376     will send a canned command to the application to switch to the appropriate
6377     tokenize mode.  An optional explanation of each mode is added to the mode
6378     name string. */
6379 
6380     CommandMessage.MakeEmpty ();
6381     CommandMessage.what = B_SET_PROPERTY;
6382     CommandMessage.AddSpecifier (g_PropertyNames[PN_TOKENIZE_MODE]);
6383     CommandMessage.AddString (g_DataName, g_TokenizeModeNames[TokenizeMode]);
6384     strcpy (TempString, g_TokenizeModeNames[TokenizeMode]);
6385     switch (TokenizeMode)
6386     {
6387       case TM_WHOLE:
6388         strcat (TempString, " - Scan everything");
6389         break;
6390 
6391       case TM_PLAIN_TEXT:
6392         strcat (TempString, " - Scan e-mail body text except rich text");
6393         break;
6394 
6395       case TM_PLAIN_TEXT_HEADER:
6396         strcat (TempString, " - Scan entire e-mail text except rich text");
6397         break;
6398 
6399       case TM_ANY_TEXT:
6400         strcat (TempString, " - Scan e-mail body text and text attachments");
6401         break;
6402 
6403       case TM_ANY_TEXT_HEADER:
6404        strcat (TempString, " - Scan entire e-mail text and text attachments (recommended)");
6405         break;
6406 
6407       case TM_ALL_PARTS:
6408         strcat (TempString, " - Scan e-mail body and all attachments");
6409         break;
6410 
6411       case TM_ALL_PARTS_HEADER:
6412         strcat (TempString, " - Scan all parts of the e-mail");
6413         break;
6414 
6415       case TM_JUST_HEADER:
6416         strcat (TempString, " - Scan just the header (mail routing information)");
6417         break;
6418 
6419       default:
6420         break;
6421     }
6422     TempMenuItemPntr =
6423       new BMenuItem (TempString, new BMessage (CommandMessage));
6424     if (TempMenuItemPntr == NULL) goto ErrorExit;
6425     TempMenuItemPntr->SetTarget (be_app);
6426     m_TokenizeModePopUpMenuPntr->AddItem (TempMenuItemPntr);
6427   }
6428   m_TokenizeModeMenuBarPntr->AddItem (m_TokenizeModePopUpMenuPntr);
6429   AddChild (m_TokenizeModeMenuBarPntr);
6430 
6431   /* This row just contains a huge pop-up menu which shows the scoring mode
6432   and an explanation of what each mode does. */
6433 
6434   RowTop += RowHeight /* previous row's RowHeight */;
6435   TempRect = Bounds ();
6436   RowHeight = g_PopUpMenuHeight;
6437   RowHeight = ceilf (RowHeight * 1.1);
6438 
6439   Margin = ceilf ((RowHeight - g_PopUpMenuHeight) / 2);
6440   TempRect.top = RowTop + Margin;
6441   TempRect.bottom = TempRect.top + g_PopUpMenuHeight;
6442 
6443   m_ScoringModeCachedValue = SM_MAX; /* Illegal value will force redraw. */
6444   m_ScoringModeMenuBarPntr = new BMenuBar (TempRect, "ScoringModeMenuBar",
6445     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
6446     false /* resize to fit items */);
6447   if (m_ScoringModeMenuBarPntr == NULL) goto ErrorExit;
6448   m_ScoringModePopUpMenuPntr = new BPopUpMenu ("ScoringModePopUpMenu");
6449   if (m_ScoringModePopUpMenuPntr == NULL) goto ErrorExit;
6450 
6451   for (ScoringMode = (ScoringModes) 0;
6452   ScoringMode < SM_MAX;
6453   ScoringMode = (ScoringModes) ((int) ScoringMode + 1))
6454   {
6455     /* Each different scoring mode gets its own menu item.  Selecting the item
6456     will send a canned command to the application to switch to the appropriate
6457     scoring mode.  An optional explanation of each mode is added to the mode
6458     name string. */
6459 
6460     CommandMessage.MakeEmpty ();
6461     CommandMessage.what = B_SET_PROPERTY;
6462     CommandMessage.AddSpecifier (g_PropertyNames[PN_SCORING_MODE]);
6463     CommandMessage.AddString (g_DataName, g_ScoringModeNames[ScoringMode]);
6464 /*
6465     strcpy (TempString, g_ScoringModeNames[ScoringMode]);
6466     switch (ScoringMode)
6467     {
6468       case SM_ROBINSON:
6469         strcat (TempString, " - Learning Method 1: Naive Bayesian");
6470         break;
6471 
6472       case SM_CHISQUARED:
6473         strcat (TempString, " - Learning Method 2: Chi-Squared");
6474         break;
6475 
6476       default:
6477         break;
6478     }
6479 */
6480     switch (ScoringMode)
6481     {
6482       case SM_ROBINSON:
6483         strcpy (TempString, "Learning method 1: Naive Bayesian");
6484         break;
6485 
6486       case SM_CHISQUARED:
6487         strcpy (TempString, "Learning method 2: Chi-Squared");
6488         break;
6489 
6490       default:
6491         break;
6492     }
6493     TempMenuItemPntr =
6494       new BMenuItem (TempString, new BMessage (CommandMessage));
6495     if (TempMenuItemPntr == NULL) goto ErrorExit;
6496     TempMenuItemPntr->SetTarget (be_app);
6497     m_ScoringModePopUpMenuPntr->AddItem (TempMenuItemPntr);
6498   }
6499   m_ScoringModeMenuBarPntr->AddItem (m_ScoringModePopUpMenuPntr);
6500   AddChild (m_ScoringModeMenuBarPntr);
6501 
6502   /* The next row has the install MIME types button and the reset to defaults
6503   button, one on the left and the other on the right. */
6504 
6505   RowTop += RowHeight /* previous row's RowHeight */;
6506   TempRect = Bounds ();
6507   RowHeight = g_ButtonHeight;
6508   RowHeight = ceilf (RowHeight * 1.1);
6509 
6510   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6511   TempRect.top = RowTop + Margin;
6512   TempRect.bottom = TempRect.top + g_ButtonHeight;
6513 
6514   CommandMessage.MakeEmpty ();
6515   CommandMessage.what = B_EXECUTE_PROPERTY;
6516   CommandMessage.AddSpecifier (g_PropertyNames[PN_INSTALL_THINGS]);
6517   m_InstallThingsButtonPntr = new BButton (TempRect, "Install Button",
6518     "Install spam types",
6519     new BMessage (CommandMessage),
6520     B_FOLLOW_LEFT | B_FOLLOW_TOP);
6521   if (m_InstallThingsButtonPntr == NULL) goto ErrorExit;
6522   AddChild (m_InstallThingsButtonPntr);
6523   m_InstallThingsButtonPntr->SetTarget (be_app);
6524   m_InstallThingsButtonPntr->ResizeToPreferred ();
6525 
6526   /* The Reset to Defaults button.  On the right side of the row. */
6527 
6528   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6529   TempRect = Bounds ();
6530   TempRect.top = RowTop + Margin;
6531   TempRect.bottom = TempRect.top + g_ButtonHeight;
6532 
6533   CommandMessage.MakeEmpty ();
6534   CommandMessage.what = B_EXECUTE_PROPERTY;
6535   CommandMessage.AddSpecifier (g_PropertyNames[PN_RESET_TO_DEFAULTS]);
6536   m_ResetToDefaultsButtonPntr = new BButton (TempRect, "Reset Button",
6537     "Default settings", new BMessage (CommandMessage),
6538     B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6539   if (m_ResetToDefaultsButtonPntr == NULL) goto ErrorExit;
6540   AddChild (m_ResetToDefaultsButtonPntr);
6541   m_ResetToDefaultsButtonPntr->SetTarget (be_app);
6542   m_ResetToDefaultsButtonPntr->ResizeToPreferred ();
6543   m_ResetToDefaultsButtonPntr->GetPreferredSize (&Width, &Height);
6544   m_ResetToDefaultsButtonPntr->MoveTo (TempRect.right - Width, TempRect.top);
6545 
6546   /* The next row contains the Estimate, Add Examples and About buttons. */
6547 
6548   RowTop += RowHeight /* previous row's RowHeight */;
6549   TempRect = Bounds ();
6550   X = TempRect.left;
6551   RowHeight = g_ButtonHeight;
6552   RowHeight = ceilf (RowHeight * 1.1);
6553 
6554   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6555   TempRect.top = RowTop + Margin;
6556   TempRect.bottom = TempRect.top + g_ButtonHeight;
6557   TempRect.left = X;
6558 
6559   m_EstimateSpamButtonPntr = new BButton (TempRect, "Estimate Button",
6560     "Scan a message",
6561     new BMessage (MSG_ESTIMATE_BUTTON),
6562     B_FOLLOW_LEFT | B_FOLLOW_TOP);
6563   if (m_EstimateSpamButtonPntr == NULL) goto ErrorExit;
6564   AddChild (m_EstimateSpamButtonPntr);
6565   m_EstimateSpamButtonPntr->SetTarget (this);
6566   m_EstimateSpamButtonPntr->ResizeToPreferred ();
6567   X = m_EstimateSpamButtonPntr->Frame().right + g_MarginBetweenControls;
6568 
6569   /* The Add Example button in the middle.  Does the same as the browse button,
6570   but don't tell anyone that! */
6571 
6572   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6573   TempRect.top = RowTop + Margin;
6574   TempRect.bottom = TempRect.top + g_ButtonHeight;
6575   TempRect.left = X;
6576 
6577   m_AddExampleButtonPntr = new BButton (TempRect, "Example Button",
6578     "Train spam filter on a message",
6579     new BMessage (MSG_BROWSE_BUTTON),
6580     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP,
6581     B_WILL_DRAW | B_NAVIGABLE | B_FULL_UPDATE_ON_RESIZE);
6582   if (m_AddExampleButtonPntr == NULL) goto ErrorExit;
6583   AddChild (m_AddExampleButtonPntr);
6584   m_AddExampleButtonPntr->SetTarget (this);
6585   m_AddExampleButtonPntr->ResizeToPreferred ();
6586   X = m_AddExampleButtonPntr->Frame().right + g_MarginBetweenControls;
6587 
6588   /* Add the About button on the right. */
6589 
6590   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6591   TempRect = Bounds ();
6592   TempRect.top = RowTop + Margin;
6593   TempRect.bottom = TempRect.top + g_ButtonHeight;
6594   TempRect.left = X;
6595 
6596   m_AboutButtonPntr = new BButton (TempRect, "About Button",
6597     "About…",
6598     new BMessage (B_ABOUT_REQUESTED),
6599     B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6600   if (m_AboutButtonPntr == NULL) goto ErrorExit;
6601   AddChild (m_AboutButtonPntr);
6602   m_AboutButtonPntr->SetTarget (be_app);
6603 
6604   /* This row displays various counters.  Starting with the genuine messages
6605   count on the left. */
6606 
6607   RowTop += RowHeight /* previous row's RowHeight */;
6608   TempRect = Bounds ();
6609   RowHeight = g_TextBoxHeight;
6610   RowHeight = ceilf (RowHeight * 1.1);
6611 
6612   StringPntr = "Genuine messages:";
6613   m_GenuineCountCachedValue = 87654321;
6614   sprintf (TempString, "%d", (int) m_GenuineCountCachedValue);
6615 
6616   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6617   TempRect = Bounds ();
6618   TempRect.top = RowTop + Margin;
6619   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6620   TempRect.right = TempRect.left +
6621     be_plain_font->StringWidth (StringPntr) +
6622     be_plain_font->StringWidth (TempString) +
6623     3 * g_MarginBetweenControls;
6624 
6625   m_GenuineCountTextboxPntr = new BTextControl (TempRect,
6626     "Genuine count",
6627     StringPntr /* label */,
6628     TempString /* text */,
6629     NULL /* no message */,
6630     B_FOLLOW_LEFT | B_FOLLOW_TOP,
6631     B_WILL_DRAW /* not B_NAVIGABLE */);
6632   AddChild (m_GenuineCountTextboxPntr);
6633   m_GenuineCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6634   m_GenuineCountTextboxPntr->SetDivider (
6635     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6636   m_GenuineCountTextboxPntr->SetEnabled (false); /* For display only. */
6637 
6638   /* The word count in the center. */
6639 
6640   StringPntr = "Word count:";
6641   m_WordCountCachedValue = 87654321;
6642   sprintf (TempString, "%d", (int) m_WordCountCachedValue);
6643 
6644   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6645   TempRect = Bounds ();
6646   TempRect.top = RowTop + Margin;
6647   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6648   Width = be_plain_font->StringWidth (StringPntr) +
6649     be_plain_font->StringWidth (TempString) +
6650     3 * g_MarginBetweenControls;
6651   TempRect.left = ceilf ((TempRect.right - TempRect.left) / 2 - Width / 2);
6652   TempRect.right = TempRect.left + Width;
6653 
6654   m_WordCountTextboxPntr = new BTextControl (TempRect,
6655     "Word count",
6656     StringPntr /* label */,
6657     TempString /* text */,
6658     NULL /* no message */,
6659     B_FOLLOW_H_CENTER | B_FOLLOW_TOP,
6660     B_WILL_DRAW /* not B_NAVIGABLE */);
6661   AddChild (m_WordCountTextboxPntr);
6662   m_WordCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6663   m_WordCountTextboxPntr->SetDivider (
6664     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6665   m_WordCountTextboxPntr->SetEnabled (false); /* For display only. */
6666 
6667   /* The spam count on the far right. */
6668 
6669   StringPntr = "Spam messages:";
6670   m_SpamCountCachedValue = 87654321;
6671   sprintf (TempString, "%d", (int) m_SpamCountCachedValue);
6672 
6673   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6674   TempRect = Bounds ();
6675   TempRect.top = RowTop + Margin;
6676   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6677   TempRect.left = TempRect.right -
6678     be_plain_font->StringWidth (StringPntr) -
6679     be_plain_font->StringWidth (TempString) -
6680     3 * g_MarginBetweenControls;
6681 
6682   m_SpamCountTextboxPntr = new BTextControl (TempRect,
6683     "Spam count",
6684     StringPntr /* label */,
6685     TempString /* text */,
6686     NULL /* no message */,
6687     B_FOLLOW_RIGHT | B_FOLLOW_TOP,
6688     B_WILL_DRAW /* not B_NAVIGABLE */);
6689   AddChild (m_SpamCountTextboxPntr);
6690   m_SpamCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6691   m_SpamCountTextboxPntr->SetDivider (
6692     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6693   m_SpamCountTextboxPntr->SetEnabled (false); /* For display only. */
6694 
6695   /* Change the size of our view so it only takes up the space needed by the
6696   buttons. */
6697 
6698   RowTop += RowHeight /* previous row's RowHeight */;
6699   ResizeTo (Bounds().Width(), RowTop - Bounds().top + 1);
6700 
6701   return; /* Successful. */
6702 
6703 ErrorExit:
6704   DisplayErrorMessage ("Unable to initialise the controls view.");
6705 }
6706 
6707 
6708 void
6709 ControlsView::BrowseForDatabaseFile ()
6710 {
6711   if (m_BrowseFilePanelPntr == NULL)
6712   {
6713     BEntry      DirectoryEntry;
6714     entry_ref   DirectoryEntryRef;
6715     BMessage    GetDatabasePathCommand;
6716     BMessage    GetDatabasePathResult;
6717     const char *StringPntr = NULL;
6718 
6719     /* Create a new file panel.  First set up the entry ref stuff so that the
6720     file panel can open to show the initial directory (the one where the
6721     database file currently is).  Note that we have to create it after the
6722     window and view are up and running, otherwise the BMessenger won't point to
6723     a valid looper/handler.  First find out the current database file name to
6724     use as a starting point. */
6725 
6726     GetDatabasePathCommand.what = B_GET_PROPERTY;
6727     GetDatabasePathCommand.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
6728     be_app_messenger.SendMessage (&GetDatabasePathCommand,
6729       &GetDatabasePathResult, 5000000 /* delivery timeout */,
6730       5000000 /* reply timeout */);
6731     if (GetDatabasePathResult.FindString (g_ResultName, &StringPntr) != B_OK ||
6732     DirectoryEntry.SetTo (StringPntr) != B_OK ||
6733     DirectoryEntry.GetParent (&DirectoryEntry) != B_OK)
6734       DirectoryEntry.SetTo ("."); /* Default directory if we can't find it. */
6735     if (DirectoryEntry.GetRef (&DirectoryEntryRef) != B_OK)
6736     {
6737       DisplayErrorMessage (
6738         "Unable to set up the file requestor starting directory.  Sorry.");
6739       return;
6740     }
6741 
6742     m_BrowseFilePanelPntr = new BFilePanel (
6743       B_OPEN_PANEL /* mode */,
6744       &be_app_messenger /* target for event messages */,
6745       &DirectoryEntryRef /* starting directory */,
6746       B_FILE_NODE,
6747       true /* true for multiple selections */,
6748       NULL /* canned message */,
6749       NULL /* ref filter */,
6750       false /* true for modal */,
6751       true /* true to hide when done */);
6752   }
6753 
6754   if (m_BrowseFilePanelPntr != NULL)
6755     m_BrowseFilePanelPntr->Show (); /* Answer returned later in RefsReceived. */
6756 }
6757 
6758 
6759 void
6760 ControlsView::BrowseForFileToEstimate ()
6761 {
6762   if (m_EstimateSpamFilePanelPntr == NULL)
6763   {
6764     BEntry      DirectoryEntry;
6765     entry_ref   DirectoryEntryRef;
6766     status_t    ErrorCode;
6767     BMessenger  MessengerToSelf (this);
6768     BPath       PathToMailDirectory;
6769 
6770     /* Create a new file panel.  First set up the entry ref stuff so that the
6771     file panel can open to show the initial directory (the user's mail
6772     directory).  Note that we have to create the panel after the window and
6773     view are up and running, otherwise the BMessenger won't point to a valid
6774     looper/handler. */
6775 
6776     ErrorCode = find_directory (B_USER_DIRECTORY, &PathToMailDirectory);
6777     if (ErrorCode == B_OK)
6778     {
6779       PathToMailDirectory.Append ("mail");
6780       ErrorCode = DirectoryEntry.SetTo (PathToMailDirectory.Path(),
6781         true /* traverse symbolic links*/);
6782       if (ErrorCode != B_OK || !DirectoryEntry.Exists ())
6783       {
6784         /* If no mail directory, try home directory. */
6785         find_directory (B_USER_DIRECTORY, &PathToMailDirectory);
6786         ErrorCode = DirectoryEntry.SetTo (PathToMailDirectory.Path(), true);
6787       }
6788     }
6789     if (ErrorCode != B_OK)
6790       PathToMailDirectory.SetTo (".");
6791 
6792     DirectoryEntry.SetTo (PathToMailDirectory.Path(), true);
6793     if (DirectoryEntry.GetRef (&DirectoryEntryRef) != B_OK)
6794     {
6795       DisplayErrorMessage (
6796         "Unable to set up the file requestor starting directory.  Sorry.");
6797       return;
6798     }
6799 
6800     m_EstimateSpamFilePanelPntr = new BFilePanel (
6801       B_OPEN_PANEL /* mode */,
6802       &MessengerToSelf /* target for event messages */,
6803       &DirectoryEntryRef /* starting directory */,
6804       B_FILE_NODE,
6805       true /* true for multiple selections */,
6806       new BMessage (MSG_ESTIMATE_FILE_REFS) /* canned message */,
6807       NULL /* ref filter */,
6808       false /* true for modal */,
6809       true /* true to hide when done */);
6810   }
6811 
6812   if (m_EstimateSpamFilePanelPntr != NULL)
6813     m_EstimateSpamFilePanelPntr->Show (); /* Answer sent via a message. */
6814 }
6815 
6816 
6817 /* The display has been resized.  Have to manually adjust the popup menu bar to
6818 show the new size (the sub-items need to be resized too).  Then make it redraw.
6819 Well, actually just resetting the mark on the current item will resize it
6820 properly. */
6821 
6822 void
6823 ControlsView::FrameResized (float, float)
6824 {
6825   m_ScoringModeCachedValue = SM_MAX; /* Force it to reset the mark. */
6826   m_TokenizeModeCachedValue = TM_MAX; /* Force it to reset the mark. */
6827 }
6828 
6829 
6830 void
6831 ControlsView::MessageReceived (BMessage *MessagePntr)
6832 {
6833   BMessage CommandMessage;
6834   bool     TempBool;
6835   uint32   TempUint32;
6836 
6837   switch (MessagePntr->what)
6838   {
6839     case MSG_BROWSE_BUTTON:
6840       BrowseForDatabaseFile ();
6841       break;
6842 
6843     case MSG_DATABASE_NAME:
6844       if (strcmp (m_DatabaseFileNameCachedValue,
6845       m_DatabaseFileNameTextboxPntr->Text ()) != 0)
6846         SubmitCommandString (PN_DATABASE_FILE, B_SET_PROPERTY,
6847         m_DatabaseFileNameTextboxPntr->Text ());
6848       break;
6849 
6850     case MSG_ESTIMATE_BUTTON:
6851       BrowseForFileToEstimate ();
6852       break;
6853 
6854     case MSG_ESTIMATE_FILE_REFS:
6855       EstimateRefFilesAndDisplay (MessagePntr);
6856       break;
6857 
6858     case MSG_IGNORE_CLASSIFICATION:
6859       TempBool = (m_IgnorePreviousClassCheckboxPntr->Value() == B_CONTROL_ON);
6860       if (m_IgnorePreviousClassCachedValue != TempBool)
6861         SubmitCommandBool (PN_IGNORE_PREVIOUS_CLASSIFICATION,
6862         B_SET_PROPERTY, TempBool);
6863       break;
6864 
6865     case MSG_PURGE_AGE:
6866       TempUint32 = strtoul (m_PurgeAgeTextboxPntr->Text (), NULL, 10);
6867       if (m_PurgeAgeCachedValue != TempUint32)
6868         SubmitCommandInt32 (PN_PURGE_AGE, B_SET_PROPERTY, TempUint32);
6869       break;
6870 
6871     case MSG_PURGE_POPULARITY:
6872       TempUint32 = strtoul (m_PurgePopularityTextboxPntr->Text (), NULL, 10);
6873       if (m_PurgePopularityCachedValue != TempUint32)
6874         SubmitCommandInt32 (PN_PURGE_POPULARITY, B_SET_PROPERTY, TempUint32);
6875       break;
6876 
6877     case MSG_SERVER_MODE:
6878       TempBool = (m_ServerModeCheckboxPntr->Value() == B_CONTROL_ON);
6879       if (m_ServerModeCachedValue != TempBool)
6880         SubmitCommandBool (PN_SERVER_MODE, B_SET_PROPERTY, TempBool);
6881       break;
6882 
6883     default:
6884       BView::MessageReceived (MessagePntr);
6885   }
6886 }
6887 
6888 
6889 /* Check the server for changes in the state of the database, and if there are
6890 any changes, update the displayed values.  Since this is a read only
6891 examination of the server, we go directly to the application rather than
6892 sending it messages.  Also, when sending messages, we can't find out what it is
6893 doing while it is busy with a batch of spam additions (all the spam add
6894 commands will be in the queue ahead of our requests for info).  Instead, we
6895 lock the BApplication (so it isn't changing things while we're looking) and
6896 retrieve our values. */
6897 
6898 void
6899 ControlsView::PollServerForChanges ()
6900 {
6901   ABSApp     *MyAppPntr;
6902   BMenuItem  *TempMenuItemPntr;
6903   char        TempString [PATH_MAX];
6904   BWindow    *WindowPntr;
6905 
6906   /* We need a pointer to our window, for changing the title etc. */
6907 
6908   WindowPntr = Window ();
6909   if (WindowPntr == NULL)
6910     return; /* No window, no point in updating the display! */
6911 
6912   /* Check the server mode flag.  If the mode is off, then the window has to be
6913   minimized.  Similarly, if it gets turned on, maximize the window.  Note that
6914   the user can maximize the window manually, even while still in server mode.
6915   */
6916 
6917   if (g_ServerMode != m_ServerModeCachedValue &&
6918   m_ServerModeCheckboxPntr != NULL)
6919   {
6920     m_ServerModeCachedValue = g_ServerMode;
6921     m_ServerModeCheckboxPntr->SetValue (
6922       m_ServerModeCachedValue ? B_CONTROL_ON : B_CONTROL_OFF);
6923     WindowPntr->Minimize (m_ServerModeCachedValue);
6924   }
6925 
6926   if (WindowPntr->IsMinimized ())
6927     return; /* Window isn't visible, don't waste time updating it. */
6928 
6929   /* So that people don't stare at a blank screen, request a database load if
6930   nothing is there.  But only do it once, so the user doesn't get a lot of
6931   invalid database messages if one doesn't exist yet.  In server mode, we never
6932   get this far so it is only loaded when the user wants to see something. */
6933 
6934   if (!m_DatabaseLoadDone)
6935   {
6936     m_DatabaseLoadDone = true;
6937     /* Counting the number of words will load the database. */
6938     SubmitCommandString (PN_DATABASE_FILE, B_COUNT_PROPERTIES, "");
6939   }
6940 
6941   /* Check various read only values, which can be read from the BApplication
6942   without having to lock it.  This is useful for displaying the number of words
6943   as it is changing.  First up is the purge age setting. */
6944 
6945   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
6946   if (MyAppPntr == NULL)
6947     return; /* Doesn't exist or is the wrong class.  Not likely! */
6948 
6949   if (MyAppPntr->m_PurgeAge != m_PurgeAgeCachedValue &&
6950   m_PurgeAgeTextboxPntr != NULL)
6951   {
6952     m_PurgeAgeCachedValue = MyAppPntr->m_PurgeAge;
6953     sprintf (TempString, "%" B_PRIu32, m_PurgeAgeCachedValue);
6954     m_PurgeAgeTextboxPntr->SetText (TempString);
6955   }
6956 
6957   /* Check the purge popularity. */
6958 
6959   if (MyAppPntr->m_PurgePopularity != m_PurgePopularityCachedValue &&
6960   m_PurgePopularityTextboxPntr != NULL)
6961   {
6962     m_PurgePopularityCachedValue = MyAppPntr->m_PurgePopularity;
6963     sprintf (TempString, "%" B_PRIu32, m_PurgePopularityCachedValue);
6964     m_PurgePopularityTextboxPntr->SetText (TempString);
6965   }
6966 
6967   /* Check the Ignore Previous Classification flag. */
6968 
6969   if (MyAppPntr->m_IgnorePreviousClassification !=
6970   m_IgnorePreviousClassCachedValue &&
6971   m_IgnorePreviousClassCheckboxPntr != NULL)
6972   {
6973     m_IgnorePreviousClassCachedValue =
6974       MyAppPntr->m_IgnorePreviousClassification;
6975     m_IgnorePreviousClassCheckboxPntr->SetValue (
6976       m_IgnorePreviousClassCachedValue ? B_CONTROL_ON : B_CONTROL_OFF);
6977   }
6978 
6979   /* Update the genuine count. */
6980 
6981   if (MyAppPntr->m_TotalGenuineMessages != m_GenuineCountCachedValue &&
6982   m_GenuineCountTextboxPntr != NULL)
6983   {
6984     m_GenuineCountCachedValue = MyAppPntr->m_TotalGenuineMessages;
6985     sprintf (TempString, "%" B_PRIu32, m_GenuineCountCachedValue);
6986     m_GenuineCountTextboxPntr->SetText (TempString);
6987   }
6988 
6989   /* Update the spam count. */
6990 
6991   if (MyAppPntr->m_TotalSpamMessages != m_SpamCountCachedValue &&
6992   m_SpamCountTextboxPntr != NULL)
6993   {
6994     m_SpamCountCachedValue = MyAppPntr->m_TotalSpamMessages;
6995     sprintf (TempString, "%" B_PRIu32, m_SpamCountCachedValue);
6996     m_SpamCountTextboxPntr->SetText (TempString);
6997   }
6998 
6999   /* Update the word count. */
7000 
7001   if (MyAppPntr->m_WordCount != m_WordCountCachedValue &&
7002   m_WordCountTextboxPntr != NULL)
7003   {
7004     m_WordCountCachedValue = MyAppPntr->m_WordCount;
7005     sprintf (TempString, "%" B_PRIu32, m_WordCountCachedValue);
7006     m_WordCountTextboxPntr->SetText (TempString);
7007   }
7008 
7009   /* Update the tokenize mode pop-up menu. */
7010 
7011   if (MyAppPntr->m_TokenizeMode != m_TokenizeModeCachedValue &&
7012   m_TokenizeModePopUpMenuPntr != NULL)
7013   {
7014     m_TokenizeModeCachedValue = MyAppPntr->m_TokenizeMode;
7015     TempMenuItemPntr =
7016       m_TokenizeModePopUpMenuPntr->ItemAt ((int) m_TokenizeModeCachedValue);
7017     if (TempMenuItemPntr != NULL)
7018       TempMenuItemPntr->SetMarked (true);
7019   }
7020 
7021   /* Update the scoring mode pop-up menu. */
7022 
7023   if (MyAppPntr->m_ScoringMode != m_ScoringModeCachedValue &&
7024   m_ScoringModePopUpMenuPntr != NULL)
7025   {
7026     m_ScoringModeCachedValue = MyAppPntr->m_ScoringMode;
7027     TempMenuItemPntr =
7028       m_ScoringModePopUpMenuPntr->ItemAt ((int) m_ScoringModeCachedValue);
7029     if (TempMenuItemPntr != NULL)
7030       TempMenuItemPntr->SetMarked (true);
7031   }
7032 
7033   /* Lock the application.  This will stop it from processing any further
7034   messages until we are done.  Or if it is busy, the lock will fail. */
7035 
7036   if (MyAppPntr->LockWithTimeout (100000) != B_OK)
7037     return; /* It's probably busy doing something. */
7038 
7039   /* See if the database file name has changed. */
7040 
7041   if (strcmp (MyAppPntr->m_DatabaseFileName.String (),
7042   m_DatabaseFileNameCachedValue) != 0 &&
7043   m_DatabaseFileNameTextboxPntr != NULL)
7044   {
7045     strcpy (m_DatabaseFileNameCachedValue,
7046       MyAppPntr->m_DatabaseFileName.String ());
7047     m_DatabaseFileNameTextboxPntr->SetText (m_DatabaseFileNameCachedValue);
7048     WindowPntr->SetTitle (m_DatabaseFileNameCachedValue);
7049   }
7050 
7051   /* Done.  Let the BApplication continue processing messages. */
7052 
7053   MyAppPntr->Unlock ();
7054 }
7055 
7056 
7057 void
7058 ControlsView::Pulse ()
7059 {
7060   if (system_time () > m_TimeOfLastPoll + 200000)
7061   {
7062     PollServerForChanges ();
7063     m_TimeOfLastPoll = system_time ();
7064   }
7065 }
7066 
7067 
7068 
7069 /******************************************************************************
7070  * Implementation of the DatabaseWindow class, constructor, destructor and the
7071  * rest of the member functions in mostly alphabetical order.
7072  */
7073 
7074 DatabaseWindow::DatabaseWindow ()
7075 : BWindow (BRect (30, 30, 620, 400),
7076     "Haiku spam filter server",
7077     B_DOCUMENT_WINDOW, B_ASYNCHRONOUS_CONTROLS)
7078 {
7079   BRect TempRect;
7080 
7081   /* Add the controls view. */
7082 
7083   m_ControlsViewPntr = new ControlsView (Bounds ());
7084   if (m_ControlsViewPntr == NULL)
7085     goto ErrorExit;
7086   AddChild (m_ControlsViewPntr);
7087 
7088   /* Add the word view in the remaining space under the controls view. */
7089 
7090 
7091   TempRect = Bounds ();
7092   TempRect.top = m_ControlsViewPntr->Frame().bottom + 1;
7093   m_WordsViewPntr = new WordsView (TempRect);
7094   if (m_WordsViewPntr == NULL)
7095     goto ErrorExit;
7096   AddChild (m_WordsViewPntr);
7097 
7098  /* Minimize the window if we are starting up in server mode.  This is done
7099 	before the window is open so it doesn't flash onto the screen, and possibly
7100 	steal a keystroke or two.  The ControlsView will further update the minimize
7101 	mode when it detects changes in the server mode. */
7102   Minimize (g_ServerMode);
7103 
7104   return;
7105 
7106 ErrorExit:
7107   DisplayErrorMessage ("Unable to initialise the window contents.");
7108 }
7109 
7110 
7111 void
7112 DatabaseWindow::MessageReceived (BMessage *MessagePntr)
7113 {
7114   if (MessagePntr->what == B_MOUSE_WHEEL_CHANGED)
7115   {
7116     /* Pass the mouse wheel stuff down to the words view, since that's the only
7117     one which does scrolling so we don't need to worry about whether it has
7118     focus or not. */
7119 
7120     if (m_WordsViewPntr != NULL)
7121       m_WordsViewPntr->MessageReceived (MessagePntr);
7122   }
7123   else
7124     BWindow::MessageReceived (MessagePntr);
7125 }
7126 
7127 
7128 bool
7129 DatabaseWindow::QuitRequested ()
7130 {
7131   be_app->PostMessage (B_QUIT_REQUESTED);
7132   return true;
7133 }
7134 
7135 
7136 
7137 /******************************************************************************
7138  * Implementation of the word display view.
7139  */
7140 
7141 WordsView::WordsView (BRect NewBounds)
7142 : BView (NewBounds, "WordsView", B_FOLLOW_ALL_SIDES,
7143     B_WILL_DRAW | B_FULL_UPDATE_ON_RESIZE | B_NAVIGABLE | B_PULSE_NEEDED),
7144   m_ArrowLineDownPntr (NULL),
7145   m_ArrowLineUpPntr (NULL),
7146   m_ArrowPageDownPntr (NULL),
7147   m_ArrowPageUpPntr (NULL),
7148   m_LastTimeAKeyWasPressed (0)
7149 {
7150   font_height TempFontHeight;
7151 
7152   GetFont (&m_TextFont); /* Modify the default font to be our own. */
7153   m_TextFont.SetSize (ceilf (m_TextFont.Size() * 1.1));
7154   m_TextFont.GetHeight (&TempFontHeight);
7155   SetFont (&m_TextFont);
7156 
7157   m_LineHeight = ceilf (TempFontHeight.ascent +
7158     TempFontHeight.descent + TempFontHeight.leading);
7159   m_AscentHeight = ceilf (TempFontHeight.ascent);
7160   m_TextHeight = ceilf (TempFontHeight.ascent +
7161     TempFontHeight.descent);
7162 
7163   m_FocusedColour.red = 255;
7164   m_FocusedColour.green = 255;
7165   m_FocusedColour.blue = 255;
7166   m_FocusedColour.alpha = 255;
7167 
7168   m_UnfocusedColour.red = 245;
7169   m_UnfocusedColour.green = 245;
7170   m_UnfocusedColour.blue = 255;
7171   m_UnfocusedColour.alpha = 255;
7172 
7173   m_BackgroundColour = m_UnfocusedColour;
7174   SetViewColor (m_BackgroundColour);
7175   SetLowColor (m_BackgroundColour);
7176   SetHighColor (0, 0, 0);
7177 
7178   strcpy (m_FirstDisplayedWord, "a");
7179 }
7180 
7181 
7182 void
7183 WordsView::AttachedToWindow ()
7184 {
7185   BPolygon        DownLinePolygon (g_DownLinePoints,
7186                     sizeof (g_DownLinePoints) /
7187                     sizeof (g_DownLinePoints[0]));
7188 
7189   BPolygon        DownPagePolygon (g_DownPagePoints,
7190                     sizeof (g_DownPagePoints) /
7191                     sizeof (g_DownPagePoints[0]));
7192 
7193   BPolygon        UpLinePolygon (g_UpLinePoints,
7194                     sizeof (g_UpLinePoints) /
7195                     sizeof (g_UpLinePoints[0]));
7196 
7197   BPolygon        UpPagePolygon (g_UpPagePoints,
7198                     sizeof (g_UpPagePoints) /
7199                     sizeof (g_UpPagePoints[0]));
7200 
7201   BPicture        TempOffPicture;
7202   BPicture        TempOnPicture;
7203   BRect           TempRect;
7204 
7205   /* Make the buttons and associated polygon images for the forward and
7206   backwards a word or a page of words buttons.  They're the width of the scroll
7207   bar area on the right, but twice as tall as usual, since there is no scroll
7208   bar and that will make it easier to use them.  First the up a line button. */
7209 
7210   SetHighColor (0, 0, 0);
7211   BeginPicture (&TempOffPicture);
7212   FillPolygon (&UpLinePolygon);
7213   SetHighColor (180, 180, 180);
7214   StrokePolygon (&UpLinePolygon);
7215   EndPicture ();
7216 
7217   SetHighColor (128, 128, 128);
7218   BeginPicture (&TempOnPicture);
7219   FillPolygon (&UpLinePolygon);
7220   EndPicture ();
7221 
7222   TempRect = Bounds ();
7223   TempRect.bottom = TempRect.top + 2 * B_H_SCROLL_BAR_HEIGHT;
7224   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7225   m_ArrowLineUpPntr = new BPictureButton (TempRect, "Up Line",
7226     &TempOffPicture, &TempOnPicture,
7227     new BMessage (MSG_LINE_UP), B_ONE_STATE_BUTTON,
7228     B_FOLLOW_RIGHT | B_FOLLOW_TOP, B_WILL_DRAW | B_NAVIGABLE);
7229   if (m_ArrowLineUpPntr == NULL) goto ErrorExit;
7230   AddChild (m_ArrowLineUpPntr);
7231   m_ArrowLineUpPntr->SetTarget (this);
7232 
7233   /* Up a page button. */
7234 
7235   SetHighColor (0, 0, 0);
7236   BeginPicture (&TempOffPicture);
7237   FillPolygon (&UpPagePolygon);
7238   SetHighColor (180, 180, 180);
7239   StrokePolygon (&UpPagePolygon);
7240   EndPicture ();
7241 
7242   SetHighColor (128, 128, 128);
7243   BeginPicture (&TempOnPicture);
7244   FillPolygon (&UpPagePolygon);
7245   EndPicture ();
7246 
7247   TempRect = Bounds ();
7248   TempRect.top += 2 * B_H_SCROLL_BAR_HEIGHT + 1;
7249   TempRect.bottom = TempRect.top + 2 * B_H_SCROLL_BAR_HEIGHT;
7250   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7251   m_ArrowPageUpPntr = new BPictureButton (TempRect, "Up Page",
7252     &TempOffPicture, &TempOnPicture,
7253     new BMessage (MSG_PAGE_UP), B_ONE_STATE_BUTTON,
7254     B_FOLLOW_RIGHT | B_FOLLOW_TOP, B_WILL_DRAW | B_NAVIGABLE);
7255   if (m_ArrowPageUpPntr == NULL) goto ErrorExit;
7256   AddChild (m_ArrowPageUpPntr);
7257   m_ArrowPageUpPntr->SetTarget (this);
7258 
7259   /* Down a page button. */
7260 
7261   SetHighColor (0, 0, 0);
7262   BeginPicture (&TempOffPicture);
7263   FillPolygon (&DownPagePolygon);
7264   SetHighColor (180, 180, 180);
7265   StrokePolygon (&DownPagePolygon);
7266   EndPicture ();
7267 
7268   SetHighColor (128, 128, 128);
7269   BeginPicture (&TempOnPicture);
7270   FillPolygon (&DownPagePolygon);
7271   EndPicture ();
7272 
7273   TempRect = Bounds ();
7274   TempRect.bottom -= 3 * B_H_SCROLL_BAR_HEIGHT + 1;
7275   TempRect.top = TempRect.bottom - 2 * B_H_SCROLL_BAR_HEIGHT;
7276   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7277   m_ArrowPageDownPntr = new BPictureButton (TempRect, "Down Page",
7278     &TempOffPicture, &TempOnPicture,
7279     new BMessage (MSG_PAGE_DOWN), B_ONE_STATE_BUTTON,
7280     B_FOLLOW_RIGHT | B_FOLLOW_BOTTOM, B_WILL_DRAW | B_NAVIGABLE);
7281   if (m_ArrowPageDownPntr == NULL) goto ErrorExit;
7282   AddChild (m_ArrowPageDownPntr);
7283   m_ArrowPageDownPntr->SetTarget (this);
7284 
7285   /* Down a line button. */
7286 
7287   SetHighColor (0, 0, 0);
7288   BeginPicture (&TempOffPicture);
7289   FillPolygon (&DownLinePolygon);
7290   SetHighColor (180, 180, 180);
7291   StrokePolygon (&DownLinePolygon);
7292   EndPicture ();
7293 
7294   SetHighColor (128, 128, 128);
7295   BeginPicture (&TempOnPicture);
7296   FillPolygon (&DownLinePolygon);
7297   EndPicture ();
7298 
7299   TempRect = Bounds ();
7300   TempRect.bottom -= B_H_SCROLL_BAR_HEIGHT;
7301   TempRect.top = TempRect.bottom - 2 * B_H_SCROLL_BAR_HEIGHT;
7302   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7303   m_ArrowLineDownPntr = new BPictureButton (TempRect, "Down Line",
7304     &TempOffPicture, &TempOnPicture,
7305     new BMessage (MSG_LINE_DOWN), B_ONE_STATE_BUTTON,
7306     B_FOLLOW_RIGHT | B_FOLLOW_BOTTOM, B_WILL_DRAW | B_NAVIGABLE);
7307   if (m_ArrowLineDownPntr == NULL) goto ErrorExit;
7308   AddChild (m_ArrowLineDownPntr);
7309   m_ArrowLineDownPntr->SetTarget (this);
7310 
7311   return;
7312 
7313 ErrorExit:
7314   DisplayErrorMessage ("Problems while making view displaying the words.");
7315 }
7316 
7317 
7318 /* Draw the words starting with the one at or after m_FirstDisplayedWord.  This
7319 requires looking at the database in the BApplication, which may or may not be
7320 available (if it isn't, don't draw, a redraw will usually be requested by the
7321 Pulse member function when it keeps on noticing that the stuff on the display
7322 doesn't match the database). */
7323 
7324 void
7325 WordsView::Draw (BRect UpdateRect)
7326 {
7327   float                   AgeDifference;
7328   float                   AgeProportion;
7329   float                   CenterX;
7330   float                   ColumnLeftCenterX;
7331   float                   ColumnMiddleCenterX;
7332   float                   ColumnRightCenterX;
7333   float                   CompensatedRatio;
7334   StatisticsMap::iterator DataIter;
7335   StatisticsMap::iterator EndIter;
7336   rgb_color               FillColour;
7337   float                   GenuineProportion;
7338   uint32                  GenuineSpamSum;
7339   float                   HeightPixels;
7340   float                   HeightProportion;
7341   float                   LeftBounds;
7342   ABSApp                 *MyAppPntr;
7343   uint32                  NewestAge;
7344   uint32                  OldestAge;
7345   float                   OneFifthTotalGenuine;
7346   float                   OneFifthTotalSpam;
7347   double                  RawProbabilityRatio;
7348   float                   RightBounds;
7349   float                   SpamProportion;
7350   StatisticsPointer       StatisticsPntr;
7351   BRect                   TempRect;
7352   char                    TempString [PATH_MAX];
7353   float                   TotalGenuineMessages = 1.0; /* Avoid divide by 0. */
7354   float                   TotalSpamMessages = 1.0;
7355   float                   Width;
7356   float                   Y;
7357 
7358   /* Lock the application.  This will stop it from processing any further
7359   messages until we are done.  Or if it is busy, the lock will fail. */
7360 
7361   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7362   if (MyAppPntr == NULL || MyAppPntr->LockWithTimeout (100000) != B_OK)
7363     return; /* It's probably busy doing something. */
7364 
7365   /* Set up various loop invariant variables. */
7366 
7367   if (MyAppPntr->m_TotalGenuineMessages > 0)
7368     TotalGenuineMessages = MyAppPntr->m_TotalGenuineMessages;
7369   OneFifthTotalGenuine = TotalGenuineMessages / 5;
7370 
7371   if (MyAppPntr->m_TotalSpamMessages > 0)
7372     TotalSpamMessages = MyAppPntr->m_TotalSpamMessages;
7373   OneFifthTotalSpam = TotalSpamMessages / 5;
7374 
7375   EndIter = MyAppPntr->m_WordMap.end ();
7376 
7377   OldestAge = MyAppPntr->m_OldestAge;
7378   NewestAge = /* actually newest age plus one */
7379     MyAppPntr->m_TotalGenuineMessages + MyAppPntr->m_TotalSpamMessages;
7380 
7381   if (NewestAge == 0)
7382     goto NormalExit; /* No words to display, or something is badly wrong. */
7383 
7384   NewestAge--; /* The newest message has age NewestAge. */
7385   AgeDifference = NewestAge - OldestAge; /* Can be zero if just one message. */
7386 
7387   LeftBounds = Bounds().left;
7388   RightBounds = Bounds().right - B_V_SCROLL_BAR_WIDTH;
7389   Width = RightBounds - LeftBounds;
7390   FillColour.alpha = 255;
7391 
7392   CenterX = ceilf (LeftBounds + Width * 0.5);
7393   ColumnLeftCenterX = ceilf (LeftBounds + Width * 0.05);
7394   ColumnMiddleCenterX = CenterX;
7395   ColumnRightCenterX = ceilf (LeftBounds + Width * 0.95);
7396 
7397   for (DataIter = MyAppPntr->m_WordMap.lower_bound (m_FirstDisplayedWord),
7398   Y = Bounds().top;
7399   DataIter != EndIter && Y < UpdateRect.bottom;
7400   DataIter++, Y += m_LineHeight)
7401   {
7402     if (Y + m_LineHeight < UpdateRect.top)
7403       continue; /* Not in the visible area yet, don't actually draw. */
7404 
7405     /* Draw the colour bar behind the word.  It reflects the spamness or
7406     genuineness of that particular word, plus the importance of the word and
7407     the age of the word.
7408 
7409     First calculate the compensated spam ratio (described elsewhere).  It is
7410     close to 0.0 for genuine words and close to 1.0 for pure spam.  It is drawn
7411     as a blue bar to the left of center if it is less than 0.5, and a red bar
7412     on the right of center if it is greater than 0.5.  At exactly 0.5 nothing
7413     is drawn; the word is worthless as an indicator.
7414 
7415     The height of the bar corresponds to the number of messages the word was
7416     found in.  Make the height proportional to the total of spam and genuine
7417     messages for the word divided by the sum of the most extreme spam and
7418     genuine counts in the database.
7419 
7420     The staturation of the colour corresponds to the age of the word, with old
7421     words being almost white rather than solid blue or red. */
7422 
7423     StatisticsPntr = &DataIter->second;
7424 
7425     SpamProportion = StatisticsPntr->spamCount / TotalSpamMessages;
7426     GenuineProportion = StatisticsPntr->genuineCount / TotalGenuineMessages;
7427     if (SpamProportion + GenuineProportion > 0.0f)
7428       RawProbabilityRatio =
7429       SpamProportion / (SpamProportion + GenuineProportion);
7430     else
7431       RawProbabilityRatio = g_RobinsonX;
7432 
7433     /* The compensated ratio leans towards 0.5 (RobinsonX) more for fewer
7434     data points, with a weight of 0.45 (RobinsonS). */
7435 
7436     GenuineSpamSum =
7437       StatisticsPntr->spamCount + StatisticsPntr->genuineCount;
7438     CompensatedRatio =
7439       (g_RobinsonS * g_RobinsonX + GenuineSpamSum * RawProbabilityRatio) /
7440       (g_RobinsonS + GenuineSpamSum);
7441 
7442     /* Used to use the height based on the most frequent word, but some words,
7443     like "From", show up in all messages which made most other words just
7444     appear as a thin line.  I did a histogram plot of the sizes in my test
7445     database, and figured that you get better coverage of 90% of the messages
7446     if you use 1/5 of the total number as the count which gives you 100%
7447     height.  The other 10% get a full height bar, but most people wouldn't care
7448     that they're super frequently used. */
7449 
7450     HeightProportion = 0.5f * (StatisticsPntr->genuineCount /
7451       OneFifthTotalGenuine + StatisticsPntr->spamCount / OneFifthTotalSpam);
7452 
7453     if (HeightProportion > 1.0f)
7454       HeightProportion = 1.0f;
7455     HeightPixels = ceilf (HeightProportion * m_TextHeight);
7456 
7457     if (AgeDifference <= 0.0f)
7458       AgeProportion = 1.0; /* New is 1.0, old is 0.0 */
7459     else
7460       AgeProportion = (StatisticsPntr->age - OldestAge) / AgeDifference;
7461 
7462     TempRect.top = ceilf (Y + m_TextHeight / 2 - HeightPixels / 2);
7463     TempRect.bottom = TempRect.top + HeightPixels;
7464 
7465     if (CompensatedRatio < 0.5f)
7466     {
7467       TempRect.left = ceilf (
7468         CenterX - 1.6f * (0.5f - CompensatedRatio) * (CenterX - LeftBounds));
7469       TempRect.right = CenterX;
7470       FillColour.red = 230 - (int) (AgeProportion * 230.0f);
7471       FillColour.green = FillColour.red;
7472       FillColour.blue = 255;
7473     }
7474     else /* Ratio >= 0.5, red spam block. */
7475     {
7476       TempRect.left = CenterX;
7477       TempRect.right = ceilf (
7478         CenterX + 1.6f * (CompensatedRatio - 0.5f) * (RightBounds - CenterX));
7479       FillColour.blue = 230 - (int) (AgeProportion * 230.0f);
7480       FillColour.green = FillColour.blue;
7481       FillColour.red = 255;
7482     }
7483     SetHighColor (FillColour);
7484     SetDrawingMode (B_OP_COPY);
7485     FillRect (TempRect);
7486 
7487     /* Print the text centered in columns of various widths.  The number of
7488     genuine messages in the left 10% of the width, the word in the middle 80%,
7489     and the number of spam messages using the word in the right 10%. */
7490 
7491     SetHighColor (0, 0, 0);
7492     SetDrawingMode (B_OP_OVER); /* So that antialiased text mixes better. */
7493 
7494     sprintf (TempString, "%" B_PRIu32, StatisticsPntr->genuineCount);
7495     Width = m_TextFont.StringWidth (TempString);
7496     MovePenTo (ceilf (ColumnLeftCenterX - Width / 2), Y + m_AscentHeight);
7497     DrawString (TempString);
7498 
7499     strcpy (TempString, DataIter->first.c_str ());
7500     Width = m_TextFont.StringWidth (TempString);
7501     MovePenTo (ceilf (ColumnMiddleCenterX - Width / 2), Y + m_AscentHeight);
7502     DrawString (TempString);
7503 
7504     sprintf (TempString, "%" B_PRIu32, StatisticsPntr->spamCount);
7505     Width = m_TextFont.StringWidth (TempString);
7506     MovePenTo (ceilf (ColumnRightCenterX - Width / 2), Y + m_AscentHeight);
7507     DrawString (TempString);
7508   }
7509 
7510   /* Draw the first word (the one which the user types in to select the first
7511   displayed word) on the right, in the scroll bar margin, rotated 90 degrees to
7512   fit between the page up and page down buttons. */
7513 
7514   Width = m_TextFont.StringWidth (m_FirstDisplayedWord);
7515   if (Width > 0)
7516   {
7517     TempRect = Bounds ();
7518     TempRect.top += 4 * B_H_SCROLL_BAR_HEIGHT + 1;
7519     TempRect.bottom -= 5 * B_H_SCROLL_BAR_HEIGHT + 1;
7520 
7521     MovePenTo (TempRect.right - m_TextHeight + m_AscentHeight - 1,
7522       ceilf ((TempRect.bottom + TempRect.top) / 2 + Width / 2));
7523     m_TextFont.SetRotation (90);
7524     SetFont (&m_TextFont, B_FONT_ROTATION);
7525     DrawString (m_FirstDisplayedWord);
7526     m_TextFont.SetRotation (0);
7527     SetFont (&m_TextFont, B_FONT_ROTATION);
7528   }
7529 
7530 NormalExit:
7531 
7532   /* Successfully finished drawing.  Update the cached values to match what we
7533   have drawn. */
7534   m_CachedTotalGenuineMessages = MyAppPntr->m_TotalGenuineMessages;
7535   m_CachedTotalSpamMessages = MyAppPntr->m_TotalSpamMessages;
7536   m_CachedWordCount = MyAppPntr->m_WordCount;
7537 
7538   /* Done.  Let the BApplication continue processing messages. */
7539   MyAppPntr->Unlock ();
7540 }
7541 
7542 
7543 /* When the user presses keys, they select the first word to be displayed in
7544 the view (it's the word at or lexicographically after the word typed in).  The
7545 keys are appended to the starting word, until the user stops typing for a
7546 while, then the next key will be the first letter of a new starting word. */
7547 
7548 void
7549 WordsView::KeyDown (const char *BufferPntr, int32 NumBytes)
7550 {
7551   int32          CharLength;
7552   bigtime_t      CurrentTime;
7553   char           TempString [40];
7554 
7555   CurrentTime = system_time ();
7556 
7557   if (NumBytes < (int32) sizeof (TempString))
7558   {
7559     memcpy (TempString, BufferPntr, NumBytes);
7560     TempString [NumBytes] = 0;
7561     CharLength = strlen (TempString); /* So NUL bytes don't get through. */
7562 
7563     /* Check for arrow keys, which move the view up and down. */
7564 
7565     if (CharLength == 1 &&
7566     (TempString[0] == B_UP_ARROW ||
7567     TempString[0] == B_DOWN_ARROW ||
7568     TempString[0] == B_PAGE_UP ||
7569     TempString[0] == B_PAGE_DOWN))
7570     {
7571       MoveTextUpOrDown ((TempString[0] == B_UP_ARROW) ? MSG_LINE_UP :
7572         ((TempString[0] == B_DOWN_ARROW) ? MSG_LINE_DOWN :
7573         ((TempString[0] == B_PAGE_UP) ? MSG_PAGE_UP : MSG_PAGE_DOWN)));
7574     }
7575     else if (CharLength > 1 ||
7576     (CharLength == 1 && 32 <= (uint8) TempString[0]))
7577     {
7578       /* Have a non-control character, or some sort of multibyte char.  Add it
7579       to the word and mark things for redisplay starting at the resulting word.
7580       */
7581 
7582       if (CurrentTime - m_LastTimeAKeyWasPressed >= 1000000 /* microseconds */)
7583         strcpy (m_FirstDisplayedWord, TempString); /* Starting a new word. */
7584       else if (strlen (m_FirstDisplayedWord) + CharLength <= g_MaxWordLength)
7585         strcat (m_FirstDisplayedWord, TempString); /* Append to existing. */
7586 
7587       Invalidate ();
7588     }
7589   }
7590 
7591   m_LastTimeAKeyWasPressed = CurrentTime;
7592   BView::KeyDown (BufferPntr, NumBytes);
7593 }
7594 
7595 
7596 /* Change the background colour to show that we have the focus.  When we have
7597 it, keystrokes will select the word to be displayed at the top of the list. */
7598 
7599 void
7600 WordsView::MakeFocus (bool Focused)
7601 {
7602   if (Focused)
7603     m_BackgroundColour = m_FocusedColour;
7604   else
7605     m_BackgroundColour = m_UnfocusedColour;
7606   SetViewColor (m_BackgroundColour);
7607   SetLowColor (m_BackgroundColour);
7608 
7609   /* Also need to set the background colour for the scroll buttons, since they
7610   can't be made transparent. */
7611 
7612   if (m_ArrowLineDownPntr != NULL)
7613   {
7614     m_ArrowLineDownPntr->SetViewColor (m_BackgroundColour);
7615     m_ArrowLineDownPntr->Invalidate ();
7616   }
7617 
7618   if (m_ArrowLineUpPntr != NULL)
7619   {
7620     m_ArrowLineUpPntr->SetViewColor (m_BackgroundColour);
7621     m_ArrowLineUpPntr->Invalidate ();
7622   }
7623 
7624   if (m_ArrowPageDownPntr != NULL)
7625   {
7626     m_ArrowPageDownPntr->SetViewColor (m_BackgroundColour);
7627     m_ArrowPageDownPntr->Invalidate ();
7628   }
7629 
7630   if (m_ArrowPageUpPntr != NULL)
7631   {
7632     m_ArrowPageUpPntr->SetViewColor (m_BackgroundColour);
7633     m_ArrowPageUpPntr->Invalidate ();
7634   }
7635 
7636   Invalidate ();
7637 
7638   BView::MakeFocus (Focused);
7639 }
7640 
7641 
7642 void
7643 WordsView::MessageReceived (BMessage *MessagePntr)
7644 {
7645   int32     CountFound;
7646   float     DeltaY; /* Usually -1.0, 0.0 or +1.0. */
7647   type_code TypeFound;
7648 
7649   switch (MessagePntr->what)
7650   {
7651     case B_MOUSE_WHEEL_CHANGED:
7652       if (MessagePntr->FindFloat ("be:wheel_delta_y", &DeltaY) != 0) break;
7653       if (DeltaY < 0)
7654         MoveTextUpOrDown (MSG_LINE_UP);
7655       else if (DeltaY > 0)
7656         MoveTextUpOrDown (MSG_LINE_DOWN);
7657       break;
7658 
7659     case MSG_LINE_DOWN:
7660     case MSG_LINE_UP:
7661     case MSG_PAGE_DOWN:
7662     case MSG_PAGE_UP:
7663       MoveTextUpOrDown (MessagePntr->what);
7664       break;
7665 
7666     case B_SIMPLE_DATA: /* Something has been dropped in our view. */
7667       if (MessagePntr->GetInfo ("refs", &TypeFound, &CountFound) == B_OK &&
7668       CountFound > 0 && TypeFound == B_REF_TYPE)
7669       {
7670         RefsDroppedHere (MessagePntr);
7671         break;
7672       }
7673       /* Else fall through to the default case, in case it is something else
7674       dropped that the system knows about. */
7675 
7676     default:
7677       BView::MessageReceived (MessagePntr);
7678   }
7679 }
7680 
7681 
7682 /* If the user clicks on our view, take over the focus. */
7683 
7684 void
7685 WordsView::MouseDown (BPoint)
7686 {
7687   if (!IsFocus ())
7688     MakeFocus (true);
7689 }
7690 
7691 
7692 void
7693 WordsView::MoveTextUpOrDown (uint32 MovementType)
7694 {
7695   StatisticsMap::iterator  DataIter;
7696   int                      i;
7697   ABSApp                  *MyAppPntr;
7698   int                      PageSize;
7699 
7700   /* Lock the application.  This will stop it from processing any further
7701   messages until we are done (we need to look at the word list directly).  Or
7702   if it is busy, the lock will fail. */
7703 
7704   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7705   if (MyAppPntr == NULL || MyAppPntr->LockWithTimeout (2000000) != B_OK)
7706     return; /* It's probably busy doing something. */
7707 
7708   PageSize = (int) (Bounds().Height() / m_LineHeight - 1);
7709   if (PageSize < 1)
7710     PageSize = 1;
7711 
7712   DataIter = MyAppPntr->m_WordMap.lower_bound (m_FirstDisplayedWord);
7713 
7714   switch (MovementType)
7715   {
7716     case MSG_LINE_UP:
7717       if (DataIter != MyAppPntr->m_WordMap.begin ())
7718         DataIter--;
7719       break;
7720 
7721     case MSG_LINE_DOWN:
7722       if (DataIter != MyAppPntr->m_WordMap.end ())
7723         DataIter++;
7724       break;
7725 
7726     case MSG_PAGE_UP:
7727       for (i = 0; i < PageSize; i++)
7728       {
7729         if (DataIter == MyAppPntr->m_WordMap.begin ())
7730           break;
7731         DataIter--;
7732       }
7733       break;
7734 
7735     case MSG_PAGE_DOWN:
7736       for (i = 0; i < PageSize; i++)
7737       {
7738         if (DataIter == MyAppPntr->m_WordMap.end ())
7739           break;
7740         DataIter++;
7741       }
7742       break;
7743   }
7744 
7745   if (DataIter != MyAppPntr->m_WordMap.end ())
7746     strcpy (m_FirstDisplayedWord, DataIter->first.c_str ());
7747 
7748   Invalidate ();
7749 
7750   MyAppPntr->Unlock ();
7751 }
7752 
7753 
7754 /* This function periodically polls the BApplication to see if anything has
7755 changed.  If the word list is different or the display has changed in some
7756 other way, it will then try to refresh the display, repeating the attempt until
7757 it gets successfully drawn. */
7758 
7759 void
7760 WordsView::Pulse ()
7761 {
7762   ABSApp *MyAppPntr;
7763 
7764   /* Probe the BApplication to see if it has changed. */
7765 
7766   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7767   if (MyAppPntr == NULL)
7768     return; /* Something is wrong, give up. */
7769 
7770   if (MyAppPntr->m_TotalGenuineMessages != m_CachedTotalGenuineMessages ||
7771   MyAppPntr->m_TotalSpamMessages != m_CachedTotalSpamMessages ||
7772   MyAppPntr->m_WordCount != m_CachedWordCount)
7773     Invalidate ();
7774 }
7775 
7776 
7777 /* The user has dragged and dropped some file references on the words view.  If
7778 it is in the left third, add the file(s) as examples of genuine messages, right
7779 third for spam messages and if it is in the middle third then evaluate the
7780 file(s) for spaminess. */
7781 
7782 void
7783 WordsView::RefsDroppedHere (BMessage *MessagePntr)
7784 {
7785   float  Left;
7786   bool   SpamExample = true; /* TRUE if example is of spam, FALSE genuine. */
7787   float  Third;
7788   BPoint WhereDropped;
7789 
7790   /* Find out which third of the view it was dropped into. */
7791 
7792   if (MessagePntr->FindPoint ("_drop_point_", &WhereDropped) != B_OK)
7793     return;  /* Need to know where it was dropped. */
7794   ConvertFromScreen (&WhereDropped);
7795   Third = Bounds().Width() / 3;
7796   Left = Bounds().left;
7797   if (WhereDropped.x < Left + Third)
7798     SpamExample = false;
7799   else if (WhereDropped.x < Left + 2 * Third)
7800   {
7801     /* In the middle third, evaluate all files for spaminess. */
7802     EstimateRefFilesAndDisplay (MessagePntr);
7803     return;
7804   }
7805 
7806   if (g_CommanderLooperPntr != NULL)
7807     g_CommanderLooperPntr->CommandReferences (
7808     MessagePntr, true /* BulkMode */, SpamExample ? CL_SPAM : CL_GENUINE);
7809 }
7810 
7811 
7812 
7813 /******************************************************************************
7814  * Finally, the main program which drives it all.
7815  */
7816 
7817 int main (int argc, char**)
7818 {
7819   g_CommandLineMode = (argc > 1);
7820   if (!g_CommandLineMode)
7821     cout << PrintUsage; /* In case no arguments specified. */
7822 
7823   g_CommanderLooperPntr = new CommanderLooper;
7824   if (g_CommanderLooperPntr != NULL)
7825   {
7826     g_CommanderMessenger = new BMessenger (NULL, g_CommanderLooperPntr);
7827     g_CommanderLooperPntr->Run ();
7828   }
7829 
7830   ABSApp MyApp;
7831 
7832   if (MyApp.InitCheck () == 0)
7833   {
7834     MyApp.LoadSaveSettings (true /* DoLoad */);
7835     MyApp.Run ();
7836   }
7837 
7838   if (g_CommanderLooperPntr != NULL)
7839   {
7840     g_CommanderLooperPntr->PostMessage (B_QUIT_REQUESTED);
7841     snooze (100000); /* Let the CommanderLooper thread run so it quits. */
7842   }
7843 
7844   cerr << "SpamDBM shutting down..." << endl;
7845   return 0; /* And implicitly destroys MyApp, which writes out the database. */
7846 }
7847