xref: /haiku/src/bin/mail_utils/spamdbm.cpp (revision c9ad965c81b08802fed0827fd1dd16f45297928a)
1 /******************************************************************************
2  * $Id: spamdbm.cpp 30630 2009-05-05 01:31:01Z bga $
3  *
4  * This is a BeOS program for classifying e-mail messages as spam (unwanted
5  * junk mail) or as genuine mail using a Bayesian statistical approach.  There
6  * is also a Mail Daemon Replacement add-on to filter mail using the
7  * classification statistics collected earlier.
8  *
9  * See also http://www.paulgraham.com/spam.html for a good writeup and
10  * http://www.tuxedo.org/~esr/bogofilter/ for another implementation.
11  * And more recently, Gary Robinson's write up of his improved algorithm
12  * at http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
13  * which gives a better spread in spam ratios and slightly fewer
14  * misclassifications.
15  *
16  * Note that this uses the AGMS vacation coding style, not the OpenTracker one.
17  * That means no tabs, indents are two spaces, m_ is the prefix for member
18  * variables, g_ is the prefix for global names, C style comments, constants
19  * are in all capital letters and most other things are mixed case, it's word
20  * wrapped to fit in 79 characters per line to make proofreading on paper
21  * easier, and functions are listed in reverse dependency order so that forward
22  * declarations (function prototypes with no code) aren't needed.
23  *
24  * The Original Design:
25  * There is a spam database (just a file listing words and number of times they
26  * were used in spam and non-spam messages) that a BeMailDaemon input filter
27  * will use when scanning email.  It will mark the mail with the spam
28  * probability (an attribute, optionally a mail header field) and optionally do
29  * something if the probability exceeds a user defined level (delete message,
30  * change subject, file in a different folder).  Or should that be a different
31  * filter?  Outside the mail system, the probability can be used in queries to
32  * find spam.
33  *
34  * A second user application will be used to update the database.  Besides
35  * showing you the current list of words, you can drag and drop files to mark
36  * them as spam or non-spam (a balanced binary tree is used internally to make
37  * word storage fast).  It will add a second attribute to the files to show how
38  * they have been classified by the user (and won't update the database if you
39  * accidentally try to classify a file again).  Besides drag and drop, there
40  * will be a command line interface and a message passing interface.  BeMail
41  * (or other programs) will then communicate via messages to tell it when the
42  * user marks a message as spam or not (via having separate delete spam /
43  * delete genuine mail buttons and a menu item or two).
44  *
45  * Plus lots of details, like the rename swap method to update the database
46  * file (so programs with the old file open aren't affected).  A nice tab text
47  * format so you can open the database in a spreadsheet.  Startup and shutdown
48  * control of the updater from BeMail.  Automatic creation of the indices
49  * needed by the filter.  MIME types for the database file.  Icons for the app.
50  * System settings to enable tracker to display the new attributes when viewing
51  * e-mail (and maybe news articles if someone ever gets around to an NNTP as
52  * files reader).  Documentation.  Recursive directory traversal for the
53  * command line or directory drag and drop.  Options for the updater to warn or
54  * ignore non-email files.  Etc.
55  *
56  * The Actual Implementation:
57  * The spam database updates and the test for spam have been combined into one
58  * program which runs as a server.  That way there won't be as long a delay
59  * when the e-mail system wants to check for spam, because the database is
60  * already loaded by the server and in memory.  The MDR mail filter add-on
61  * simply sends scripting commands to the server (and starts it up if it isn't
62  * already running).  The filter takes care of marking the messages when it
63  * gets the rating back from the server, and then the rest of the mail system
64  * rule chain can delete the message or otherwise manipulate it.
65  *
66  * Revision History (now manually updated due to SVN's philosophy)
67  * $Log: spamdbm.cpp,v $
68  * ------------------------------------------------------------------------
69  * r15195 | agmsmith | 2005-11-27 21:07:55 -0500 (Sun, 27 Nov 2005) | 4 lines
70  * Just a few minutes after checking in, I mentioned it to Japanese expert Koki
71  * and he suggested also including the Japanese comma.  So before I forget to
72  * do it...
73  *
74  * ------------------------------------------------------------------------
75  * r15194 | agmsmith | 2005-11-27 20:37:13 -0500 (Sun, 27 Nov 2005) | 5 lines
76  * Truncate overly long URLs to the maximum word length.  Convert Japanese
77  * periods to spaces so that more "words" are found.  Fix UTF-8 comparison
78  * problems with tolower() incorrectly converting characters with the high bit
79  * set.
80  *
81  * r15098 | agmsmith | 2005-11-23 23:17:00 -0500 (Wed, 23 Nov 2005) | 5 lines
82  * Added better tokenization so that HTML is parsed and things like tags
83  * between letters of a word no longer hide that word.  After testing, the
84  * result seems to be a tighter spread of ratings when done in full text plus
85  * header mode.
86  *
87  * Revision 1.10  2005/11/24 02:08:39  agmsmith
88  * Fixed up prefix codes, Z for things that are inside other things.
89  *
90  * Revision 1.9  2005/11/21 03:28:03  agmsmith
91  * Added a function for extracting URLs.
92  *
93  * Revision 1.8  2005/11/09 03:36:18  agmsmith
94  * Removed noframes detection (doesn't show up in e-mails).  Now use
95  * just H for headers and Z for HTML tag junk.
96  *
97  * Revision 1.7  2005/10/24 00:00:08  agmsmith
98  * Adding HTML tag removal, which also affected the search function so it
99  * could search for single part things like  .
100  *
101  * Revision 1.6  2005/10/17 01:55:08  agmsmith
102  * Remove HTML comments and a few other similar things.
103  *
104  * Revision 1.5  2005/10/16 18:35:36  agmsmith
105  * Under construction - looking into HTML not being in UTF-8.
106  *
107  * Revision 1.4  2005/10/11 01:51:21  agmsmith
108  * Starting on the tokenising passes.  Still need to test asian truncation.
109  *
110  * Revision 1.3  2005/10/06 11:54:07  agmsmith
111  * Not much.
112  *
113  * Revision 1.2  2005/09/12 01:49:37  agmsmith
114  * Enable case folding for the whole file tokenizer.
115  *
116  * r13961 | agmsmith | 2005-08-13 22:25:28 -0400 (Sat, 13 Aug 2005) | 2 lines
117  * Source code changes so that mboxtobemail now compiles and is in the build
118  * system.
119  *
120  * r13959 | agmsmith | 2005-08-13 22:05:27 -0400 (Sat, 13 Aug 2005) | 2 lines
121  * Rename the directory before doing anything else, otherwise svn dies badly.
122  *
123  * r13952 | agmsmith | 2005-08-13 15:31:42 -0400 (Sat, 13 Aug 2005) | 3 lines
124  * Added the resources and file type associations, changed the application
125  * signature and otherwise made the spam detection system work properly again.
126  *
127  * r13951 | agmsmith | 2005-08-13 11:40:01 -0400 (Sat, 13 Aug 2005) | 2 lines
128  * Had to do the file rename as a separate operation due to SVN limitations.
129  *
130  * r13950 | agmsmith | 2005-08-13 11:38:44 -0400 (Sat, 13 Aug 2005) | 3 lines
131  * Oops, "spamdb" is already used for a Unix package.  And spamdatabase is
132  * already reserved by a domain name squatter.  Use "spamdbm" instead.
133  *
134  * r13949 | agmsmith | 2005-08-13 11:17:52 -0400 (Sat, 13 Aug 2005) | 3 lines
135  * Renamed spamfilter to be the more meaningful spamdb (spam database) and
136  * moved it into its own source directory in preparation for adding resources.
137  *
138  * r13628 | agmsmith | 2005-07-10 20:11:29 -0400 (Sun, 10 Jul 2005) | 3 lines
139  * Updated keyword expansion to use SVN keywords.  Also seeing if svn is
140  * working well enough for me to update files from BeOS R5.
141  *
142  * r11909 | axeld | 2005-03-18 19:09:19 -0500 (Fri, 18 Mar 2005) | 2 lines
143  * Moved bin/ directory out of apps/.
144  *
145  * r11769 | bonefish | 2005-03-17 03:30:54 -0500 (Thu, 17 Mar 2005) | 1 line
146  * Move trunk into respective module.
147  *
148  * r10362 | nwhitehorn | 2004-12-06 20:14:05 -0500 (Mon, 06 Dec 2004) | 2 lines
149  * Fixed the spam filter so it works correctly now.
150  *
151  * r9934 | nwhitehorn | 2004-11-11 21:55:05 -0500 (Thu, 11 Nov 2004) | 2 lines
152  * Added AGMS's excellent spam detection software.  Still some weirdness with
153  * the configuration interface from E-mail prefs.
154  *
155  * Revision 1.2  2004/12/07 01:14:05  nwhitehorn
156  * Fixed the spam filter so it works correctly now.
157  *
158  * Revision 1.87  2004/09/20 15:57:26  nwhitehorn
159  * Mostly updated the tree to Be/Haiku style identifier naming conventions.  I
160  * have a few more things to work out, mostly in mail_util.h, and then I'm
161  * proceeding to jamify the build system.  Then we go into Haiku CVS.
162  *
163  * Revision 1.86  2003/07/26 16:47:46  agmsmith
164  * Bug - wasn't allowing double classification if the user had turned on
165  * the option to ignore the previous classification.
166  *
167  * Revision 1.85  2003/07/08 14:52:57  agmsmith
168  * Fix bug with classification choices dialog box coming up with weird
169  * sizes due to RefsReceived message coming in before ReadyToRun had
170  * finished setting up the default sizes of the controls.
171  *
172  * Revision 1.84  2003/07/04 19:59:29  agmsmith
173  * Now with a GUI option to let you declassify messages (set them back
174  * to uncertain, rather than spam or genuine).  Required a BAlert
175  * replacement since BAlerts can't do four buttons.
176  *
177  * Revision 1.83  2003/07/03 20:40:36  agmsmith
178  * Added Uncertain option for declassifying messages.
179  *
180  * Revision 1.82  2003/06/16 14:57:13  agmsmith
181  * Detect spam which uses mislabeled text attachments, going by the file name
182  * extension.
183  *
184  * Revision 1.81  2003/04/08 20:27:04  agmsmith
185  * AGMSBayesianSpamServer now shuts down immediately and returns true if
186  * it is asked to quit by the registrar.
187  *
188  * Revision 1.80  2003/04/07 19:20:27  agmsmith
189  * Ooops, int64 doesn't exist, use long long instead.
190  *
191  * Revision 1.79  2003/04/07 19:05:22  agmsmith
192  * Now with Allen Brunson's atoll for PPC (you need the %Ld, but that
193  * becomes %lld on other systems).
194  *
195  * Revision 1.78  2003/04/04 22:43:53  agmsmith
196  * Fixed up atoll PPC processor hack so it would actually work, was just
197  * returning zero which meant that it wouldn't load in the database file
198  * (read the size as zero).
199  *
200  * Revision 1.77  2003/01/22 03:19:48  agmsmith
201  * Don't convert words to lower case, the case is important for spam.
202  * Particularly sentences which start with exciting words, which you
203  * normally won't use at the start of a sentence (and thus capitalize).
204  *
205  * Revision 1.76  2002/12/18 02:29:22  agmsmith
206  * Add space for the Uncertain display in Tracker.
207  *
208  * Revision 1.75  2002/12/18 01:54:37  agmsmith
209  * Added uncertain sound effect.
210  *
211  * Revision 1.74  2002/12/13 23:53:12  agmsmith
212  * Minimize the window before opening it so that it doesn't flash on the
213  * screen in server mode.  Also load the database when the window is
214  * displayed so that the user can see the words.
215  *
216  * Revision 1.73  2002/12/13 20:55:57  agmsmith
217  * Documentation.
218  *
219  * Revision 1.72  2002/12/13 20:26:11  agmsmith
220  * Fixed bug with adding messages in strings to database (was limited to
221  * messages at most 1K long).  Also changed default server mode to true
222  * since that's what people use most.
223  *
224  * Revision 1.71  2002/12/11 22:37:30  agmsmith
225  * Added commands to train on spam and genuine e-mail messages passed
226  * in string arguments rather than via external files.
227  *
228  * Revision 1.70  2002/12/10 22:12:41  agmsmith
229  * Adding a message to the database now uses a BPositionIO rather than a
230  * file and file name (for future string rather than file additions).  Also
231  * now re-evaluate a file after reclassifying it so that the user can see
232  * the new ratio.  Also remove the [Spam 99.9%] subject prefix when doing
233  * a re-evaluation or classification (the number would be wrong).
234  *
235  * Revision 1.69  2002/12/10 01:46:04  agmsmith
236  * Added the Chi-Squared scoring method.
237  *
238  * Revision 1.68  2002/11/29 22:08:25  agmsmith
239  * Change default purge age to 2000 so that hitting the purge button
240  * doesn't erase stuff from the new sample database.
241  *
242  * Revision 1.67  2002/11/25 20:39:39  agmsmith
243  * Don't need to massage the MIME type since the mail library now does
244  * the lower case conversion and converts TEXT to text/plain too.
245  *
246  * Revision 1.66  2002/11/20 22:57:12  nwhitehorn
247  * PPC Compatibility Fixes
248  *
249  * Revision 1.65  2002/11/10 18:43:55  agmsmith
250  * Added a time delay to some quitting operations so that scripting commands
251  * from a second client (like a second e-mail account) will make the program
252  * abort the quit operation.
253  *
254  * Revision 1.64  2002/11/05 18:05:16  agmsmith
255  * Looked at Nathan's PPC changes (thanks!), modified style a bit.
256  *
257  * Revision 1.63  2002/11/04 03:30:22  nwhitehorn
258  * Now works (or compiles at least) on PowerPC.  I'll get around to testing it
259  * later.
260  *
261  * Revision 1.62  2002/11/04 01:03:33  agmsmith
262  * Fixed warnings so it compiles under the bemaildaemon system.
263  *
264  * Revision 1.61  2002/11/03 23:00:37  agmsmith
265  * Added to the bemaildaemon project on SourceForge.  Hmmmm, seems to switch to
266  * a new version if I commit and specify a message, but doesn't accept the
267  * message and puts up the text editor.  Must be a bug where cvs eats the first
268  * option after "commit".
269  *
270  * Revision 1.60.1.1  2002/10/22 14:29:27  agmsmith
271  * Needed to recompile with the original Libmail.so from Beta/1 since
272  * the current library uses a different constructor, and thus wouldn't
273  * run when used with the old library.
274  *
275  * Revision 1.60  2002/10/21 16:41:27  agmsmith
276  * Return a special error code when no words are found in a message,
277  * so that messages without text/plain parts can be recognized as
278  * spam by the mail filter.
279  *
280  * Revision 1.59  2002/10/20 21:29:47  agmsmith
281  * Watch out for MIME types of "text", treat as text/plain.
282  *
283  * Revision 1.58  2002/10/20 18:29:07  agmsmith
284  * *** empty log message ***
285  *
286  * Revision 1.57  2002/10/20 18:25:02  agmsmith
287  * Fix case sensitivity in MIME type tests, and fix text/any test.
288  *
289  * Revision 1.56  2002/10/19 17:00:10  agmsmith
290  * Added the pop-up menu for the tokenize modes.
291  *
292  * Revision 1.55  2002/10/19 14:54:06  agmsmith
293  * Fudge MIME type of body text components so that they get
294  * treated as text.
295  *
296  * Revision 1.54  2002/10/19 00:56:37  agmsmith
297  * The parsing of e-mail messages seems to be working now, just need
298  * to add some user interface stuff for the tokenizing mode.
299  *
300  * Revision 1.53  2002/10/18 23:37:56  agmsmith
301  * More mail kit usage, can now decode headers, but more to do.
302  *
303  * Revision 1.52  2002/10/16 23:52:33  agmsmith
304  * Getting ready to add more tokenizing modes, exploring Mail Kit to break
305  * apart messages into components (and decode BASE64 and other encodings).
306  *
307  * Revision 1.51  2002/10/11 20:05:31  agmsmith
308  * Added installation of sound effect names, which the filter will use.
309  *
310  * Revision 1.50  2002/10/02 16:50:02  agmsmith
311  * Forgot to add credits to the algorithm inventors.
312  *
313  * Revision 1.49  2002/10/01 00:39:29  agmsmith
314  * Added drag and drop to evaluate files or to add them to the list.
315  *
316  * Revision 1.48  2002/09/30 19:44:17  agmsmith
317  * Switched to Gary Robinson's method, removed max spam/genuine word.
318  *
319  * Revision 1.47  2002/09/23 17:08:55  agmsmith
320  * Add an attribute with the spam ratio to files which have been evaluated.
321  *
322  * Revision 1.46  2002/09/23 02:50:32  agmsmith
323  * Fiddling with display width of e-mail attributes.
324  *
325  * Revision 1.45  2002/09/23 01:13:56  agmsmith
326  * Oops, bug in string evaluation scripting.
327  *
328  * Revision 1.44  2002/09/22 21:00:55  agmsmith
329  * Added EvaluateString so that the BeMail add-on can pass the info without
330  * having to create a temporary file.
331  *
332  * Revision 1.43  2002/09/20 19:56:02  agmsmith
333  * Added about box and button for estimating the spam ratio of a file.
334  *
335  * Revision 1.42  2002/09/20 01:22:26  agmsmith
336  * More testing, decide that an extreme ratio bias point of 0.5 is good.
337  *
338  * Revision 1.41  2002/09/19 21:17:12  agmsmith
339  * Changed a few names and proofread the program.
340  *
341  * Revision 1.40  2002/09/19 14:27:17  agmsmith
342  * Rearranged execution of commands, moving them to a separate looper
343  * rather than the BApplication, so that thousands of files could be
344  * processed without worrying about the message queue filling up.
345  *
346  * Revision 1.39  2002/09/18 18:47:16  agmsmith
347  * Stop flickering when the view is partially obscured, update cached
348  * values in all situations except when app is busy.
349  *
350  * Revision 1.38  2002/09/18 18:08:11  agmsmith
351  * Add a function for evaluating the spam ratio of a message.
352  *
353  * Revision 1.37  2002/09/16 01:30:16  agmsmith
354  * Added Get Oldest command.
355  *
356  * Revision 1.36  2002/09/16 00:47:52  agmsmith
357  * Change the display to counter-weigh the spam ratio by the number of
358  * messages.
359  *
360  * Revision 1.35  2002/09/15 20:49:35  agmsmith
361  * Scrolling improved, buttons, keys and mouse wheel added.
362  *
363  * Revision 1.34  2002/09/15 03:46:10  agmsmith
364  * Up and down buttons under construction.
365  *
366  * Revision 1.33  2002/09/15 02:09:21  agmsmith
367  * Took out scroll bar.
368  *
369  * Revision 1.32  2002/09/15 02:05:30  agmsmith
370  * Trying to add a scroll bar, but it isn't very useful.
371  *
372  * Revision 1.31  2002/09/14 23:06:28  agmsmith
373  * Now has live updates of the list of words.
374  *
375  * Revision 1.30  2002/09/14 19:53:11  agmsmith
376  * Now with a better display of the words.
377  *
378  * Revision 1.29  2002/09/13 21:33:54  agmsmith
379  * Now draws the words in the word display view, but still primitive.
380  *
381  * Revision 1.28  2002/09/13 19:28:02  agmsmith
382  * Added display of most genuine and most spamiest, fixed up cursor.
383  *
384  * Revision 1.27  2002/09/13 03:08:42  agmsmith
385  * Show current word and message counts, and a busy cursor.
386  *
387  * Revision 1.26  2002/09/13 00:00:08  agmsmith
388  * Fixed up some deadlock problems, now using asynchronous message replies.
389  *
390  * Revision 1.25  2002/09/12 17:56:58  agmsmith
391  * Keep track of words which are spamiest and genuinest.
392  *
393  * Revision 1.24  2002/09/12 01:57:10  agmsmith
394  * Added server mode.
395  *
396  * Revision 1.23  2002/09/11 23:30:45  agmsmith
397  * Added Purge button and ignore classification checkbox.
398  *
399  * Revision 1.22  2002/09/11 21:23:13  agmsmith
400  * Added bulk update choice, purge button, moved to a BView container
401  * for all the controls (so background colour could be set, and Pulse
402  * works normally for it too).
403  *
404  * Revision 1.21  2002/09/10 22:52:49  agmsmith
405  * You can now change the database name in the GUI.
406  *
407  * Revision 1.20  2002/09/09 14:20:42  agmsmith
408  * Now can have multiple backups, and implemented refs received.
409  *
410  * Revision 1.19  2002/09/07 19:14:56  agmsmith
411  * Added standard GUI measurement code.
412  *
413  * Revision 1.18  2002/09/06 21:03:03  agmsmith
414  * Rearranging code to avoid forward references when adding a window class.
415  *
416  * Revision 1.17  2002/09/06 02:54:00  agmsmith
417  * Added the ability to purge old words from the database.
418  *
419  * Revision 1.16  2002/09/05 00:46:03  agmsmith
420  * Now adds spam to the database!
421  *
422  * Revision 1.15  2002/09/04 20:32:15  agmsmith
423  * Read ahead a couple of letters to decode quoted-printable better.
424  *
425  * Revision 1.14  2002/09/04 03:10:03  agmsmith
426  * Can now tokenize (break into words) a text file.
427  *
428  * Revision 1.13  2002/09/03 21:50:54  agmsmith
429  * Count database command, set up MIME type for the database file.
430  *
431  * Revision 1.12  2002/09/03 19:55:54  agmsmith
432  * Added loading and saving the database.
433  *
434  * Revision 1.11  2002/09/02 03:35:33  agmsmith
435  * Create indices and set up attribute associations with the e-mail MIME type.
436  *
437  * Revision 1.10  2002/09/01 15:52:49  agmsmith
438  * Can now delete the database.
439  *
440  * Revision 1.9  2002/08/31 21:55:32  agmsmith
441  * Yet more scripting.
442  *
443  * Revision 1.8  2002/08/31 21:41:37  agmsmith
444  * Under construction, with example code to decode a B_REPLY.
445  *
446  * Revision 1.7  2002/08/30 19:29:06  agmsmith
447  * Combined loading and saving settings into one function.
448  *
449  * Revision 1.6  2002/08/30 02:01:10  agmsmith
450  * Working on loading and saving settings.
451  *
452  * Revision 1.5  2002/08/29 23:17:42  agmsmith
453  * More scripting.
454  *
455  * Revision 1.4  2002/08/28 00:40:52  agmsmith
456  * Scripting now seems to work, at least the messages flow properly.
457  *
458  * Revision 1.3  2002/08/25 21:51:44  agmsmith
459  * Getting the about text formatting right.
460  *
461  * Revision 1.2  2002/08/25 21:28:20  agmsmith
462  * Trying out the BeOS scripting system as a way of implementing the program.
463  *
464  * Revision 1.1  2002/08/24 02:27:51  agmsmith
465  * Initial revision
466  */
467 
468 /* Standard C Library. */
469 
470 #include <stdio.h>
471 #include <stdlib.h>
472 #include <errno.h>
473 
474 /* Standard C++ library. */
475 
476 #include <iostream>
477 
478 /* STL (Standard Template Library) headers. */
479 
480 #include <map>
481 #include <queue>
482 #include <set>
483 #include <string>
484 #include <vector>
485 
486 using namespace std;
487 
488 /* BeOS (Be Operating System) headers. */
489 
490 #include <Alert.h>
491 #include <Application.h>
492 #include <Beep.h>
493 #include <Button.h>
494 #include <CheckBox.h>
495 #include <Cursor.h>
496 #include <Directory.h>
497 #include <Entry.h>
498 #include <File.h>
499 #include <FilePanel.h>
500 #include <FindDirectory.h>
501 #include <fs_index.h>
502 #include <fs_info.h>
503 #include <MenuBar.h>
504 #include <MenuItem.h>
505 #include <Message.h>
506 #include <MessageQueue.h>
507 #include <MessageRunner.h>
508 #include <Mime.h>
509 #include <NodeInfo.h>
510 #include <Path.h>
511 #include <Picture.h>
512 #include <PictureButton.h>
513 #include <Point.h>
514 #include <Polygon.h>
515 #include <PopUpMenu.h>
516 #include <PropertyInfo.h>
517 #include <RadioButton.h>
518 #include <Resources.h>
519 #include <Screen.h>
520 #include <ScrollBar.h>
521 #include <String.h>
522 #include <StringView.h>
523 #include <TextControl.h>
524 #include <View.h>
525 
526 /* Included from the Mail Daemon Replacement project (MDR) include/public
527 directory, available from http://sourceforge.net/projects/bemaildaemon/ */
528 
529 #include <MailMessage.h>
530 #include <MailAttachment.h>
531 
532 
533 /******************************************************************************
534  * Global variables, and not-so-variable things too.  Grouped by functionality.
535  */
536 
537 static float g_MarginBetweenControls; /* Space of a letter "M" between them. */
538 static float g_LineOfTextHeight;      /* Height of text the current font. */
539 static float g_StringViewHeight;      /* Height of a string view text box. */
540 static float g_ButtonHeight;          /* How many pixels tall buttons are. */
541 static float g_CheckBoxHeight;        /* Same for check boxes. */
542 static float g_RadioButtonHeight;     /* Also for radio buttons. */
543 static float g_PopUpMenuHeight;       /* Again for pop-up menus. */
544 static float g_TextBoxHeight;         /* Ditto for editable text controls. */
545 
546 static const char *g_ABSAppSignature =
547   "application/x-vnd.agmsmith.spamdbm";
548 
549 static const char *g_ABSDatabaseFileMIMEType =
550   "text/x-vnd.agmsmith.spam_probability_database";
551 
552 static const char *g_DefaultDatabaseFileName =
553   "SpamDBM Database";
554 
555 static const char *g_DatabaseRecognitionString =
556   "Spam Database File";
557 
558 static const char *g_AttributeNameClassification = "MAIL:classification";
559 static const char *g_AttributeNameSpamRatio = "MAIL:ratio_spam";
560 static const char *g_BeepGenuine = "SpamFilter-Genuine";
561 static const char *g_BeepSpam = "SpamFilter-Spam";
562 static const char *g_BeepUncertain = "SpamFilter-Uncertain";
563 static const char *g_ClassifiedSpam = "Spam";
564 static const char *g_ClassifiedGenuine = "Genuine";
565 static const char *g_DataName = "data";
566 static const char *g_ResultName = "result";
567 
568 static const char *g_SettingsDirectoryName = "Mail";
569 static const char *g_SettingsFileName = "SpamDBM Settings";
570 static const uint32 g_SettingsWhatCode = 'SDBM';
571 static const char *g_BackupSuffix = ".backup %d";
572 static const int g_MaxBackups = 10; /* Numbered from 0 to g_MaxBackups - 1. */
573 static const size_t g_MaxWordLength = 50; /* Words longer than this aren't. */
574 static const int g_MaxInterestingWords = 150; /* Top N words are examined. */
575 static const double g_RobinsonS = 0.45; /* Default weight for no data. */
576 static const double g_RobinsonX = 0.5; /* Halfway point for no data. */
577 
578 static bool g_CommandLineMode;
579   /* TRUE if the program was started from the command line (and thus should
580   exit after processing the command), FALSE if it is running with a graphical
581   user interface. */
582 
583 static bool g_ServerMode;
584   /* When TRUE the program runs in server mode - error messages don't result in
585   pop-up dialog boxes, but you can still see them in stderr.  Also the window
586   is minimized, if it exists. */
587 
588 static int g_QuitCountdown = -1;
589   /* Set to the number of pulse timing events (about one every half second) to
590   count down before the program quits.  Negative means stop counting.  Zero
591   means quit at the next pulse event.  This is used to keep the program alive
592   for a short while after someone requests that it quit, in case more scripting
593   commands come in, which will stop the countdown.  Needed to handle the case
594   where there are multiple e-mail accounts all requesting spam identification,
595   and one finishes first and tells the server to quit.  It also checks to see
596   that there is no more work to do before trying to quit. */
597 
598 static volatile bool g_AppReadyToRunCompleted = false;
599   /* The BApplication starts processing messages before ReadyToRun finishes,
600   which can lead to initialisation problems (button heights not determined).
601   So wait for this to turn TRUE in code that might run early, like
602   RefsReceived. */
603 
604 static class CommanderLooper *g_CommanderLooperPntr = NULL;
605 static BMessenger *g_CommanderMessenger = NULL;
606   /* Some globals for use with the looper which processes external commands
607   (arguments received, file references received), needed for avoiding deadlocks
608   which would happen if the BApplication sent a scripting message to itself. */
609 
610 static BCursor *g_BusyCursor = NULL;
611   /* The busy cursor, will be loaded from the resource file during application
612   startup. */
613 
614 typedef enum PropertyNumbersEnum
615 {
616   PN_DATABASE_FILE = 0,
617   PN_SPAM,
618   PN_SPAM_STRING,
619   PN_GENUINE,
620   PN_GENUINE_STRING,
621   PN_UNCERTAIN,
622   PN_IGNORE_PREVIOUS_CLASSIFICATION,
623   PN_SERVER_MODE,
624   PN_FLUSH,
625   PN_PURGE_AGE,
626   PN_PURGE_POPULARITY,
627   PN_PURGE,
628   PN_OLDEST,
629   PN_EVALUATE,
630   PN_EVALUATE_STRING,
631   PN_RESET_TO_DEFAULTS,
632   PN_INSTALL_THINGS,
633   PN_TOKENIZE_MODE,
634   PN_SCORING_MODE,
635   PN_MAX
636 } PropertyNumbers;
637 
638 static char * g_PropertyNames [PN_MAX] =
639 {
640   "DatabaseFile",
641   "Spam",
642   "SpamString",
643   "Genuine",
644   "GenuineString",
645   "Uncertain",
646   "IgnorePreviousClassification",
647   "ServerMode",
648   "Flush",
649   "PurgeAge",
650   "PurgePopularity",
651   "Purge",
652   "Oldest",
653   "Evaluate",
654   "EvaluateString",
655   "ResetToDefaults",
656   "InstallThings",
657   "TokenizeMode",
658   "ScoringMode"
659 };
660 
661 /* This array lists the scripting commands we can handle, in a format that the
662 scripting system can understand too. */
663 
664 static struct property_info g_ScriptingPropertyList [] =
665 {
666   /* *name; commands[10]; specifiers[10]; *usage; extra_data; ... */
667   {g_PropertyNames[PN_DATABASE_FILE], {B_GET_PROPERTY, 0},
668     {B_DIRECT_SPECIFIER, 0}, "Get the pathname of the current database file.  "
669     "The default name is something like B_USER_SETTINGS_DIRECTORY / "
670     "Mail / SpamDBM Database", PN_DATABASE_FILE,
671     {}, {}, {}},
672   {g_PropertyNames[PN_DATABASE_FILE], {B_SET_PROPERTY, 0},
673     {B_DIRECT_SPECIFIER, 0}, "Change the pathname of the database file to "
674     "use.  It will automatically be converted to an absolute path name, "
675     "so make sure the parent directories exist before setting it.  If it "
676     "doesn't exist, you'll have to use the create command next.",
677     PN_DATABASE_FILE, {}, {}, {}},
678   {g_PropertyNames[PN_DATABASE_FILE], {B_CREATE_PROPERTY, 0},
679     {B_DIRECT_SPECIFIER, 0}, "Creates a new empty database, will replace "
680     "the existing database file too.", PN_DATABASE_FILE, {}, {}, {}},
681   {g_PropertyNames[PN_DATABASE_FILE], {B_DELETE_PROPERTY, 0},
682     {B_DIRECT_SPECIFIER, 0}, "Deletes the database file and all backup copies "
683     "of that file too.  Really only of use for uninstallers.",
684     PN_DATABASE_FILE, {}, {}, {}},
685   {g_PropertyNames[PN_DATABASE_FILE], {B_COUNT_PROPERTIES, 0},
686     {B_DIRECT_SPECIFIER, 0}, "Returns the number of words in the database.",
687     PN_DATABASE_FILE, {}, {}, {}},
688   {g_PropertyNames[PN_SPAM], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
689     "Adds the spam in the given file (specify full pathname to be safe) to "
690     "the database.  The words in the files will be added to the list of words "
691     "in the database that identify spam messages.  The files processed will "
692     "also have the attribute MAIL:classification added with a value of "
693     "\"Spam\" or \"Genuine\" as specified.  They also have their spam ratio "
694     "attribute updated, as if you had also used the Evaluate command on "
695     "them.  If they already have the MAIL:classification "
696     "attribute and it matches the new classification then they won't get "
697     "processed (and if it is different, they will get removed from the "
698     "statistics for the old class and added to the statistics for the new "
699     "one).  You can turn off that behaviour with the "
700     "IgnorePreviousClassification property.  The command line version lets "
701     "you specify more than one pathname.", PN_SPAM, {}, {}, {}},
702   {g_PropertyNames[PN_SPAM], {B_COUNT_PROPERTIES, 0}, {B_DIRECT_SPECIFIER, 0},
703     "Returns the number of spam messages in the database.", PN_SPAM,
704     {}, {}, {}},
705   {g_PropertyNames[PN_SPAM_STRING], {B_SET_PROPERTY, 0},
706     {B_DIRECT_SPECIFIER, 0}, "Adds the spam in the given string (assumed to "
707     "be the text of a whole e-mail message, not just a file name) to the "
708     "database.", PN_SPAM_STRING, {}, {}, {}},
709   {g_PropertyNames[PN_GENUINE], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
710     "Similar to adding spam except that the message file is added to the "
711     "genuine statistics.", PN_GENUINE, {}, {}, {}},
712   {g_PropertyNames[PN_GENUINE], {B_COUNT_PROPERTIES, 0},
713     {B_DIRECT_SPECIFIER, 0}, "Returns the number of genuine messages in the "
714     "database.", PN_GENUINE, {}, {}, {}},
715   {g_PropertyNames[PN_GENUINE_STRING], {B_SET_PROPERTY, 0},
716     {B_DIRECT_SPECIFIER, 0}, "Adds the genuine message in the given string "
717     "(assumed to be the text of a whole e-mail message, not just a file name) "
718     "to the database.", PN_GENUINE_STRING, {}, {}, {}},
719   {g_PropertyNames[PN_UNCERTAIN], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
720     "Similar to adding spam except that the message file is removed from the "
721     "database, undoing the previous classification.  Obviously, it needs to "
722     "have been classified previously (using the file attributes) so it can "
723     "tell if it is removing spam or genuine words.", PN_UNCERTAIN, {}, {}, {}},
724   {g_PropertyNames[PN_IGNORE_PREVIOUS_CLASSIFICATION], {B_SET_PROPERTY, 0},
725     {B_DIRECT_SPECIFIER, 0}, "If set to true then the previous classification "
726     "(which was saved as an attribute of the e-mail message file) will be "
727     "ignored, so that you can add the message to the database again.  If set "
728     "to false (the normal case), the attribute will be examined, and if the "
729     "message has already been classified as what you claim it is, nothing "
730     "will be done.  If it was misclassified, then the message will be removed "
731     "from the statistics for the old class and added to the stats for the "
732     "new classification you have requested.",
733     PN_IGNORE_PREVIOUS_CLASSIFICATION, {}, {}, {}},
734   {g_PropertyNames[PN_IGNORE_PREVIOUS_CLASSIFICATION], {B_GET_PROPERTY, 0},
735     {B_DIRECT_SPECIFIER, 0}, "Find out the current setting of the flag for "
736     "ignoring the previously recorded classification.",
737     PN_IGNORE_PREVIOUS_CLASSIFICATION, {}, {}, {}},
738   {g_PropertyNames[PN_SERVER_MODE], {B_SET_PROPERTY, 0},
739     {B_DIRECT_SPECIFIER, 0}, "If set to true then error messages get printed "
740     "to the standard error stream rather than showing up in an alert box.  "
741     "It also starts up with the window minimized.", PN_SERVER_MODE,
742     {}, {}, {}},
743   {g_PropertyNames[PN_SERVER_MODE], {B_GET_PROPERTY, 0},
744     {B_DIRECT_SPECIFIER, 0}, "Find out the setting of the server mode flag.",
745     PN_SERVER_MODE, {}, {}, {}},
746   {g_PropertyNames[PN_FLUSH], {B_EXECUTE_PROPERTY, 0},
747     {B_DIRECT_SPECIFIER, 0}, "Writes out the database file to disk, if it has "
748     "been updated in memory but hasn't been saved to disk.  It will "
749     "automatically get written when the program exits, so this command is "
750     "mostly useful for server mode.", PN_FLUSH, {}, {}, {}},
751   {g_PropertyNames[PN_PURGE_AGE], {B_SET_PROPERTY, 0},
752     {B_DIRECT_SPECIFIER, 0}, "Sets the old age limit.  Words which haven't "
753       "been updated since this many message additions to the database may be "
754       "deleted when you do a purge.  A good value is 1000, meaning that if a "
755       "word hasn't appeared in the last 1000 spam/genuine messages, it will "
756       "be forgotten.  Zero will purge all words, 1 will purge words not in "
757       "the last message added to the database, 2 will purge words not in the "
758       "last two messages added, and so on.  This is mostly useful for "
759       "removing those one time words which are often hunks of binary garbage, "
760       "not real words.  This acts in combination with the popularity limit; "
761       "both conditions have to be valid before the word gets deleted.",
762       PN_PURGE_AGE, {}, {}, {}},
763   {g_PropertyNames[PN_PURGE_AGE], {B_GET_PROPERTY, 0},
764     {B_DIRECT_SPECIFIER, 0}, "Gets the old age limit.", PN_PURGE_AGE,
765     {}, {}, {}},
766   {g_PropertyNames[PN_PURGE_POPULARITY], {B_SET_PROPERTY, 0},
767     {B_DIRECT_SPECIFIER, 0}, "Sets the popularity limit.  Words which aren't "
768     "this popular may be deleted when you do a purge.  A good value is 5, "
769     "which means that the word is safe from purging if it has been seen in 6 "
770     "or more e-mail messages.  If it's only in 5 or less, then it may get "
771     "purged.  The extreme is zero, where only words that haven't been seen "
772     "in any message are deleted (usually means no words).  This acts in "
773     "combination with the old age limit; both conditions have to be valid "
774     "before the word gets deleted.", PN_PURGE_POPULARITY, {}, {}, {}},
775   {g_PropertyNames[PN_PURGE_POPULARITY], {B_GET_PROPERTY, 0},
776     {B_DIRECT_SPECIFIER, 0}, "Gets the purge popularity limit.",
777     PN_PURGE_POPULARITY, {}, {}, {}},
778   {g_PropertyNames[PN_PURGE], {B_EXECUTE_PROPERTY, 0},
779     {B_DIRECT_SPECIFIER, 0}, "Purges the old obsolete words from the "
780     "database, if they are old enough according to the age limit and also "
781     "unpopular enough according to the popularity limit.", PN_PURGE,
782     {}, {}, {}},
783   {g_PropertyNames[PN_OLDEST], {B_GET_PROPERTY, 0},
784     {B_DIRECT_SPECIFIER, 0}, "Gets the age of the oldest message in the "
785     "database.  It's relative to the beginning of time, so you need to do "
786     "(total messages - age - 1) to see how many messages ago it was added.",
787     PN_OLDEST, {}, {}, {}},
788   {g_PropertyNames[PN_EVALUATE], {B_SET_PROPERTY, 0},
789     {B_DIRECT_SPECIFIER, 0}, "Evaluates a given file (by path name) to see "
790     "if it is spam or not.  Returns the ratio of spam probability vs genuine "
791     "probability, 0.0 meaning completely genuine, 1.0 for completely spam.  "
792     "Normally you should safely be able to consider it as spam if it is over "
793     "0.56 for the Robinson scoring method.  For the ChiSquared method, the "
794     "numbers are near 0 for genuine, near 1 for spam, and anywhere in the "
795     "middle means it can't decide.  The program attaches a MAIL:ratio_spam "
796     "attribute with the ratio as its "
797     "float32 value to the file.  Also returns the top few interesting words "
798     "in \"words\" and the associated per-word probability ratios in "
799     "\"ratios\".", PN_EVALUATE, {}, {}, {}},
800   {g_PropertyNames[PN_EVALUATE_STRING], {B_SET_PROPERTY, 0},
801     {B_DIRECT_SPECIFIER, 0}, "Like Evaluate, but rather than a file name, "
802     "the string argument contains the entire text of the message to be "
803     "evaluated.", PN_EVALUATE_STRING, {}, {}, {}},
804   {g_PropertyNames[PN_RESET_TO_DEFAULTS], {B_EXECUTE_PROPERTY, 0},
805     {B_DIRECT_SPECIFIER, 0}, "Resets all the configuration options to the "
806     "default values, including the database name.", PN_RESET_TO_DEFAULTS,
807     {}, {}, {}},
808   {g_PropertyNames[PN_INSTALL_THINGS], {B_EXECUTE_PROPERTY, 0},
809     {B_DIRECT_SPECIFIER, 0}, "Creates indices for the MAIL:classification and "
810     "MAIL:ratio_spam attributes on all volumes which support BeOS queries, "
811     "identifies them to the system as e-mail related attributes (modifies "
812     "the text/x-email MIME type), and sets up the new MIME type "
813     "(text/x-vnd.agmsmith.spam_probability_database) for the database file.  "
814     "Also registers names for the sound effects used by the separate filter "
815     "program (use the installsound BeOS program or the Sounds preferences "
816     "program to associate sound files with the names).", PN_INSTALL_THINGS,
817     {}, {}, {}},
818   {g_PropertyNames[PN_TOKENIZE_MODE], {B_SET_PROPERTY, 0},
819     {B_DIRECT_SPECIFIER, 0}, "Sets the method used for breaking up the "
820     "message into words.  Use \"Whole\" for the whole file (also use it for "
821     "non-email files).  The file isn't broken into parts; the whole thing is "
822     "converted into words, headers and attachments are just more raw data.  "
823     "Well, not quite raw data since it converts quoted-printable codes "
824     "(equals sign followed by hex digits or end of line) to the equivalent "
825     "single characters.  \"PlainText\" breaks the file into MIME components "
826     "and only looks at the ones which are of MIME type text/plain.  "
827     "\"AnyText\" will look for words in all text/* things, including "
828     "text/html attachments.  \"AllParts\" will decode all message components "
829     "and look for words in them, including binary attachments.  "
830     "\"JustHeader\" will only look for words in the message header.  "
831     "\"AllPartsAndHeader\", \"PlainTextAndHeader\" and \"AnyTextAndHeader\" "
832     "will also include the words from the message headers.", PN_TOKENIZE_MODE,
833     {}, {}, {}},
834   {g_PropertyNames[PN_TOKENIZE_MODE], {B_GET_PROPERTY, 0},
835     {B_DIRECT_SPECIFIER, 0}, "Gets the method used for breaking up the "
836     "message into words.", PN_TOKENIZE_MODE, {}, {}, {}},
837   {g_PropertyNames[PN_SCORING_MODE], {B_SET_PROPERTY, 0},
838     {B_DIRECT_SPECIFIER, 0}, "Sets the method used for combining the "
839     "probabilities of individual words into an overall score.  "
840     "\"Robinson\" mode will use Gary Robinson's nth root of the product "
841     "method.  It gives a nice range of values between 0 and 1 so you can "
842     "see shades of spaminess.  The cutoff point between spam and genuine "
843     "varies depending on your database of words (0.56 was one point in "
844     "some experiments).  \"ChiSquared\" mode will use chi-squared "
845     "statistics to evaluate the difference in probabilities that the lists "
846     "of word ratios are random.  The result is very close to 0 for genuine "
847     "and very close to 1 for spam, and near the middle if it is uncertain.",
848     PN_SCORING_MODE, {}, {}, {}},
849   {g_PropertyNames[PN_SCORING_MODE], {B_GET_PROPERTY, 0},
850     {B_DIRECT_SPECIFIER, 0}, "Gets the method used for combining the "
851     "individual word ratios into an overall score.", PN_SCORING_MODE,
852     {}, {}, {}},
853   {0, {0}, {0}, 0, 0, {}, {}, {}} /* End of list of property commands. */
854 };
855 
856 
857 /* The various scoring modes as text and enums.  See PN_SCORING_MODE.
    SM_MAX is not a mode; it is the number of modes and it sizes the name
    table.  The names are listed in the same order as the enum values,
    presumably so that a ScoringModes value can be used directly as an index
    into g_ScoringModeNames (confirm against the code which uses the table). */
858 
859 typedef enum ScoringModeEnum
860 {
861   SM_ROBINSON = 0,
        /* Described in the PN_SCORING_MODE help text as Gary Robinson's nth
        root of the product method. */
862   SM_CHISQUARED,
        /* Described in the PN_SCORING_MODE help text as using chi-squared
        statistics, giving results near 0 for genuine and near 1 for spam. */
863   SM_MAX
864 } ScoringModes;
865 
866 static char * g_ScoringModeNames [SM_MAX] =
867 {
868   "Robinson", /* Name for SM_ROBINSON. */
869   "ChiSquared" /* Name for SM_CHISQUARED. */
870 };
871 
872 
/* The various tokenizing modes as text and enums.  See PN_TOKENIZE_MODE.
TM_MAX is not a mode; it is the number of modes and it sizes the name table.
The names have to be spelled exactly the way the PN_TOKENIZE_MODE help text
documents them (single words, no spaces: "Whole", "PlainText", and so on),
and they have to stay in the same order as the enum values so that each name
lines up with its mode. */

typedef enum TokenizeModeEnum
{
  TM_WHOLE = 0,
    /* "Whole": the file isn't broken into parts, the whole thing is converted
    into words; headers and attachments are just more data. */
  TM_PLAIN_TEXT,
    /* "PlainText": only the MIME components of type text/plain. */
  TM_PLAIN_TEXT_HEADER,
    /* "PlainTextAndHeader": as PlainText, plus words from the headers. */
  TM_ANY_TEXT,
    /* "AnyText": all text/* components, including text/html attachments. */
  TM_ANY_TEXT_HEADER,
    /* "AnyTextAndHeader": as AnyText, plus words from the headers. */
  TM_ALL_PARTS,
    /* "AllParts": all message components, including binary attachments. */
  TM_ALL_PARTS_HEADER,
    /* "AllPartsAndHeader": as AllParts, plus words from the headers. */
  TM_JUST_HEADER,
    /* "JustHeader": only the words in the message header. */
  TM_MAX
} TokenizeModes;

static char * g_TokenizeModeNames [TM_MAX] =
{
  "Whole",              /* TM_WHOLE */
  "PlainText",          /* TM_PLAIN_TEXT */
  "PlainTextAndHeader", /* TM_PLAIN_TEXT_HEADER */
  "AnyText",            /* TM_ANY_TEXT */
  "AnyTextAndHeader",   /* TM_ANY_TEXT_HEADER */
  "AllParts",           /* TM_ALL_PARTS */
  "AllPartsAndHeader",  /* TM_ALL_PARTS_HEADER */
  "JustHeader"          /* TM_JUST_HEADER */
};
899 
900 
901 /* Possible message classifications.  CL_MAX is not a classification; it is
    the number of real ones and it sizes g_ClassificationTypeNames.  The
    ClassificationChoicesWindow notes later in this file also use CL_MAX as the
    "user cancelled" answer. */
902 
903 typedef enum ClassificationTypesEnum
904 {
905   CL_GENUINE = 0,
906   CL_SPAM,
907   CL_UNCERTAIN,
908   CL_MAX
909 } ClassificationTypes;
910 
911 static const char * g_ClassificationTypeNames [CL_MAX] =
        /* Names listed in the same order as the enum values.  The first two
        come from globals defined elsewhere in this file; presumably they are
        the "Genuine" and "Spam" strings which the help text says get stored in
        the MAIL:classification attribute (confirm at their definitions). */
912 {
913   g_ClassifiedGenuine,
914   g_ClassifiedSpam,
915   "Uncertain"
916 };
917 
918 
919 /* Some polygon graphics for the scroll arrows.  Each table is a list of
    polygon vertices.  Every Y value is written as 2 * (k).  Each Down table
    repeats the X values of the matching Up table, in the same order, with k
    replaced by (14 - k), which flips the shape top to bottom.  The Line
    shapes have 7 vertices and the Page shapes have 11.  The arrays have no
    explicit size, so their length comes from the initialiser lists. */
920 
921 static BPoint g_UpLinePoints [] =
922 {
923   BPoint (8, 2 * (1)),
924   BPoint (14, 2 * (6)),
925   BPoint (10, 2 * (6)),
926   BPoint (10, 2 * (13)),
927   BPoint (6, 2 * (13)),
928   BPoint (6, 2 * (6)),
929   BPoint (2, 2 * (6))
930 };
931 
932 static BPoint g_DownLinePoints [] =
        /* g_UpLinePoints flipped vertically. */
933 {
934   BPoint (8, 2 * (14-1)),
935   BPoint (14, 2 * (14-6)),
936   BPoint (10, 2 * (14-6)),
937   BPoint (10, 2 * (14-13)),
938   BPoint (6, 2 * (14-13)),
939   BPoint (6, 2 * (14-6)),
940   BPoint (2, 2 * (14-6))
941 };
942 
943 static BPoint g_UpPagePoints [] =
944 {
945   BPoint (8, 2 * (1)),
946   BPoint (13, 2 * (6)),
947   BPoint (10, 2 * (6)),
948   BPoint (14, 2 * (10)),
949   BPoint (10, 2 * (10)),
950   BPoint (10, 2 * (13)),
951   BPoint (6, 2 * (13)),
952   BPoint (6, 2 * (10)),
953   BPoint (2, 2 * (10)),
954   BPoint (6, 2 * (6)),
955   BPoint (3, 2 * (6))
956 };
957 
958 static BPoint g_DownPagePoints [] =
        /* g_UpPagePoints flipped vertically. */
959 {
960   BPoint (8, 2 * (14-1)),
961   BPoint (13, 2 * (14-6)),
962   BPoint (10, 2 * (14-6)),
963   BPoint (14, 2 * (14-10)),
964   BPoint (10, 2 * (14-10)),
965   BPoint (10, 2 * (14-13)),
966   BPoint (6, 2 * (14-13)),
967   BPoint (6, 2 * (14-10)),
968   BPoint (2, 2 * (14-10)),
969   BPoint (6, 2 * (14-6)),
970   BPoint (3, 2 * (14-6))
971 };
972 
973 
974 /* An array of flags to identify characters which are considered to be spaces.
975 If character code X has g_SpaceCharacters[X] set to true then it is a
976 space-like character.  Character codes 128 and above are always non-space since
977 they are UTF-8 characters.  Initialised in the ABSApp constructor.
    The table only has entries for codes 0 to 127, so a character code has to
    be checked against that range before it is used as an index.  Being a
    static array, every entry is false until the initialisation code runs. */
978 
979 static bool g_SpaceCharacters [128];
980 
981 
982 
983 /******************************************************************************
984  * Each word in the spam database gets one of these structures.  The database
985  * has a string (the word) as the key and this structure as the value
986  * (statistics for that word).
     *
     * The record itself only holds three unsigned 32 bit numbers; the word is
     * not stored in the record, it is the key of the StatisticsMap entry.
987  */
988 
989 typedef struct StatisticsStruct
990 {
991   uint32 age;
992     /* Sequence number for the time when this word was last updated in the
993     database, so that we can remove old words (haven't been seen in recent
994     spam).  It's zero for the first file ever added (spam or genuine) to the
995     database, 1 for all words added or updated by the second file, etc.  If a
996     later file updates an existing word, it gets the age of the later file. */
997 
998   uint32 genuineCount;
999     /* Number of genuine messages that have this word. */
1000 
1001   uint32 spamCount;
1002     /* A count of the number of spam e-mail messages which contain the word. */
1003 
1004 } StatisticsRecord, *StatisticsPointer;
       /* StatisticsRecord names the structure, StatisticsPointer is the
       matching pointer type. */
1005 
1006 typedef map<string, StatisticsRecord> StatisticsMap;
1007   /* Define this type which will be used for our main data storage facility, so
1008   we can more conveniently specify things that are derived from it, like
1009   iterators. */
1010 
1011 
1012 
1013 /******************************************************************************
1014  * An alert box asking how the user wants to mark messages.  There are buttons
1015  * for each classification category, and a checkbox to mark all remaining N
1016  * messages the same way.  And a cancel button.  To use it, first create the
1017  * ClassificationChoicesWindow, specifying the input arguments.  Then call the
1018  * Go method which will show the window, stuff the user's answer into your
1019  * output arguments (class set to CL_MAX if the user cancels), and destroy the
1020  * window.  Implemented because BAlert only allows 3 buttons, max!
1021  */
1022 
1023 class ClassificationChoicesWindow : public BWindow
1024 {
1025 public:
1026   /* Constructor.  No destructor is declared for this class. */
1027   ClassificationChoicesWindow (BRect FrameRect,
1028     const char *FileName, int NumberOfFiles);
         /* The input arguments.  FileName and NumberOfFiles are presumably
         the file being asked about and the N in "all remaining N messages"
         from the description above - confirm in the implementation. */
1029 
1030   /* BeOS virtual functions. */
1031   virtual void MessageReceived (BMessage *MessagePntr);
1032 
1033   /* Our methods. */
1034   void Go (bool *BulkModeSelectedPntr,
1035     ClassificationTypes *ChoosenClassificationPntr);
         /* Both arguments are output arguments which receive the user's
         answer, as described above. */
1036 
1037   /* Various message codes for various buttons etc. */
1038   static const uint32 MSG_CLASS_BUTTONS = 'ClB0';
         /* NOTE(review): there is only one code for all the classification
         buttons, so presumably the classification number is added to it or
         carried in the message - confirm in the implementation. */
1039   static const uint32 MSG_CANCEL_BUTTON = 'Cncl';
1040   static const uint32 MSG_BULK_CHECKBOX = 'BlkK';
1041 
1042 private:
1043   /* Member variables.  Their names and types match the arguments of Go, so
       presumably they are where Go saves its output pointers for use while the
       window is running - confirm in the implementation. */
1044   bool *m_BulkModeSelectedPntr;
1045   ClassificationTypes *m_ChoosenClassificationPntr;
1046 };
1047 
     /* A BView which takes the same arguments as ClassificationChoicesWindow.
     Presumably it is the view inside that window which holds the buttons and
     the checkbox - confirm in the implementation, the declaration doesn't say.
     */
1048 class ClassificationChoicesView : public BView
1049 {
1050 public:
1051   /* Constructor.  No destructor is declared for this class. */
1052   ClassificationChoicesView (BRect FrameRect,
1053     const char *FileName, int NumberOfFiles);
1054 
1055   /* BeOS virtual functions. */
1056   virtual void AttachedToWindow ();
1057   virtual void GetPreferredSize (float *width, float *height);
1058 
1059 private:
1060   /* Member variables. */
1061   const char *m_FileName;
         /* Only a pointer is kept here.  NOTE(review): whether the caller's
         string has to outlive the view can't be told from this declaration.
         */
1062   int         m_NumberOfFiles;
1063   float       m_PreferredBottomY;
         /* NOTE(review): presumably worked out while laying out the controls
         and reported through GetPreferredSize - confirm. */
1064 };
1065 
1066 
1067 
1068 /******************************************************************************
1069  * Due to deadlock problems with the BApplication posting scripting messages to
1070  * itself, we need to add a second Looper.  Its job is just to convert
1071  * command line arguments and arguments from the Tracker (refs received) into a
1072  * series of scripting commands sent to the main BApplication.  It also prints
1073  * out the replies received (to stdout for command line replies).  An instance
1074  * of this class will be created and run by the main() function, and shut down
1075  * by it too.
1076  */
1077 
1078 class CommanderLooper : public BLooper
1079 {
1080 public:
1081   CommanderLooper ();
1082   ~CommanderLooper ();
1083   virtual void MessageReceived (BMessage *MessagePntr);
1084 
1085   void CommandArguments (int argc, char **argv);
         /* Hands a set of command line arguments to the looper. */
1086   void CommandReferences (BMessage *MessagePntr,
1087     bool BulkMode = false,
1088     ClassificationTypes BulkClassification = CL_GENUINE);
         /* Hands a message of file references to the looper.  By default bulk
         mode is off and the bulk classification is CL_GENUINE.
         NOTE(review): presumably BulkClassification only matters when BulkMode
         is true - confirm in the implementation. */
1089   bool IsBusy ();
1090 
1091 private:
1092   void ProcessArgs (BMessage *MessagePntr);
1093   void ProcessRefs (BMessage *MessagePntr);
         /* NOTE(review): presumably these do the real work in the looper's own
         thread when MessageReceived gets one of the two codes below -
         confirm in the implementation. */
1094 
1095   static const uint32 MSG_COMMAND_ARGUMENTS = 'CArg';
1096   static const uint32 MSG_COMMAND_FILE_REFS = 'CRef';
1097 
1098   bool m_IsBusy;
         /* NOTE(review): presumably the value reported by IsBusy - confirm. */
1099 };
1100 
1101 
1102 
1103 /******************************************************************************
1104  * This view contains the various buttons and other controls for setting
1105  * configuration options and displaying the state of the database (but not the
1106  * actual list of words).  It will appear in the top half of the
1107  * DatabaseWindow.
1108  */
1109 
1110 class ControlsView : public BView
1111 {
1112 public:
1113   /* Constructor and destructor. */
1114   ControlsView (BRect NewBounds);
1115   ~ControlsView ();
1116 
1117   /* BeOS virtual functions. */
1118   virtual void AttachedToWindow ();
1119   virtual void FrameResized (float Width, float Height);
1120   virtual void MessageReceived (BMessage *MessagePntr);
1121   virtual void Pulse ();
1122 
1123 private:
1124   /* Various message codes for various buttons etc. */
1125   static const uint32 MSG_BROWSE_BUTTON = 'Brws';
1126   static const uint32 MSG_DATABASE_NAME = 'DbNm';
1127   static const uint32 MSG_ESTIMATE_BUTTON = 'Estm';
1128   static const uint32 MSG_ESTIMATE_FILE_REFS = 'ERef';
1129   static const uint32 MSG_IGNORE_CLASSIFICATION = 'IPCl';
1130   static const uint32 MSG_PURGE_AGE = 'PuAg';
1131   static const uint32 MSG_PURGE_BUTTON = 'Purg';
1132   static const uint32 MSG_PURGE_POPULARITY = 'PuPo';
1133   static const uint32 MSG_SERVER_MODE = 'SrvM';
1134 
1135   /* Our member functions. */
1136   void BrowseForDatabaseFile ();
1137   void BrowseForFileToEstimate ();
1138   void PollServerForChanges ();
         /* NOTE(review): presumably called from Pulse to compare the cached
         values below with the BApplication's current ones, with
         m_TimeOfLastPoll limiting how often - confirm in the implementation.
         */
1139 
1140   /* Member variables.  They are in alphabetical order, which puts each
       setting's ...CachedValue member next to the pointer for the control
       which displays that setting.  NOTE(review): presumably a cached value
       is what the control last showed - confirm in the implementation. */
1141   BButton        *m_AboutButtonPntr;
1142   BButton        *m_AddExampleButtonPntr;
1143   BButton        *m_BrowseButtonPntr;
1144   BFilePanel     *m_BrowseFilePanelPntr;
1145   BButton        *m_CreateDatabaseButtonPntr;
1146   char            m_DatabaseFileNameCachedValue [PATH_MAX];
1147   BTextControl   *m_DatabaseFileNameTextboxPntr;
1148   bool            m_DatabaseLoadDone;
1149   BButton        *m_EstimateSpamButtonPntr;
1150   BFilePanel     *m_EstimateSpamFilePanelPntr;
1151   uint32          m_GenuineCountCachedValue;
1152   BTextControl   *m_GenuineCountTextboxPntr;
1153   bool            m_IgnorePreviousClassCachedValue;
1154   BCheckBox      *m_IgnorePreviousClassCheckboxPntr;
1155   BButton        *m_InstallThingsButtonPntr;
1156   uint32          m_PurgeAgeCachedValue;
1157   BTextControl   *m_PurgeAgeTextboxPntr;
1158   BButton        *m_PurgeButtonPntr;
1159   uint32          m_PurgePopularityCachedValue;
1160   BTextControl   *m_PurgePopularityTextboxPntr;
1161   BButton        *m_ResetToDefaultsButtonPntr;
1162   ScoringModes    m_ScoringModeCachedValue;
1163   BMenuBar       *m_ScoringModeMenuBarPntr;
1164   BPopUpMenu     *m_ScoringModePopUpMenuPntr;
1165   bool            m_ServerModeCachedValue;
1166   BCheckBox      *m_ServerModeCheckboxPntr;
1167   uint32          m_SpamCountCachedValue;
1168   BTextControl   *m_SpamCountTextboxPntr;
1169   bigtime_t       m_TimeOfLastPoll;
1170   TokenizeModes   m_TokenizeModeCachedValue;
1171   BMenuBar       *m_TokenizeModeMenuBarPntr;
1172   BPopUpMenu     *m_TokenizeModePopUpMenuPntr;
1173   uint32          m_WordCountCachedValue;
1174   BTextControl   *m_WordCountTextboxPntr;
1175 };
1176 
1177 
1178 /* Various message codes for various buttons etc.  Unlike the other message
     codes in this file, these four are declared at file scope rather than
     inside a class.  NOTE(review): presumably they are the codes sent by the
     WordsView scroll arrow buttons and the MovementType values accepted by
     WordsView::MoveTextUpOrDown - confirm in the implementation. */
1179 static const uint32 MSG_LINE_DOWN = 'LnDn';
1180 static const uint32 MSG_LINE_UP = 'LnUp';
1181 static const uint32 MSG_PAGE_DOWN = 'PgDn';
1182 static const uint32 MSG_PAGE_UP = 'PgUp';
1183 
1184 /******************************************************************************
1185  * This view contains the list of words.  It displays as many as can fit in the
1186  * view rectangle, starting at a specified word (so it can simulate scrolling).
1187  * Usually it will appear in the bottom half of the DatabaseWindow.
1188  */
1189 
1190 class WordsView : public BView
1191 {
1192 public:
1193   /* Constructor.  No destructor is declared for this class. */
1194   WordsView (BRect NewBounds);
1195 
1196   /* BeOS virtual functions. */
1197   virtual void AttachedToWindow ();
1198   virtual void Draw (BRect UpdateRect);
1199   virtual void KeyDown (const char *BufferPntr, int32 NumBytes);
1200   virtual void MakeFocus (bool Focused);
1201   virtual void MessageReceived (BMessage *MessagePntr);
1202   virtual void MouseDown (BPoint point);
1203   virtual void Pulse ();
1204 
1205 private:
1206   /* Our member functions. */
1207   void MoveTextUpOrDown (uint32 MovementType);
         /* NOTE(review): MovementType is presumably one of the file scope
         MSG_LINE_UP, MSG_LINE_DOWN, MSG_PAGE_UP or MSG_PAGE_DOWN codes -
         confirm in the implementation. */
1208   void RefsDroppedHere (BMessage *MessagePntr);
1209 
1210   /* Member variables. */
1211   BPictureButton *m_ArrowLineDownPntr;
1212   BPictureButton *m_ArrowLineUpPntr;
1213   BPictureButton *m_ArrowPageDownPntr;
1214   BPictureButton *m_ArrowPageUpPntr;
1215     /* Various buttons for controlling scrolling, since we can't use a scroll
1216     bar.  To make them less obvious, their background view colour needs to be
1217     changed whenever the main view's colour changes. */
1218 
1219   float m_AscentHeight;
1220     /* The ascent height for the font used to draw words.  Height from the top
1221     of the highest letter to the base line (which is near the middle bottom of
1222     the letters, the line where you would align your writing of the text by
1223     hand, all letters have part above, some also have descenders below this
1224     line). */
1225 
1226   rgb_color m_BackgroundColour;
1227     /* The current background colour.  Changes when the focus changes. */
1228 
1229   uint32 m_CachedTotalGenuineMessages;
1230   uint32 m_CachedTotalSpamMessages;
1231   uint32 m_CachedWordCount;
1232     /* These are cached copies of the similar values in the BApplication.  They
1233     reflect what's currently displayed.  If they are different than the values
1234     from the BApplication then the polling loop will try to redraw the display.
1235     They get set to the values actually used during drawing when drawing is
1236     successful. */
1237 
1238   char m_FirstDisplayedWord [g_MaxWordLength + 1];
1239     /* The scrolling display starts at this word.  Since we can't use index
1240     numbers (word[12345] for example), we use the word itself.  The scroll
1241     buttons set this to the next or previous word in the database.  Typing by
1242     the user when the view has the focus will also change this starting word.
1243     The buffer is g_MaxWordLength + 1 bytes long, presumably leaving room for
         the terminating NUL after a maximum length word. */
1244 
1245   rgb_color m_FocusedColour;
1246     /* The colour to use for focused mode (typing by the user is received by
1247     our view). */
1248 
1249   bigtime_t m_LastTimeAKeyWasPressed;
1250     /* Records the time when a key was last pressed.  Used for determining when
1251     the user has stopped typing a batch of letters. */
1252 
1253   float m_LineHeight;
1254     /* Height of a line of text in the font used for the word display.
1255     Includes the height of the letters plus a bit of extra space for between
1256     the lines (called leading). */
1257 
1258   BFont m_TextFont;
1259     /* The font used to draw the text in the window. */
1260 
1261   float m_TextHeight;
1262     /* Maximum total height of the letters in the text, includes the part above
1263     the baseline and the part below.  Doesn't include the sliver of space
1264     between lines. */
1265 
1266   rgb_color m_UnfocusedColour;
1267     /* The colour to use for unfocused mode, when user typing isn't active. */
1268 };
1269 
1270 
1271 
1272 /******************************************************************************
1273  * The BWindow class for this program.  It displays the database in real time,
1274  * and has various buttons and gadgets in the top half for changing settings
1275  * (live changes, no OK button, and they reflect changes done by other programs
1276  * using the server too).  The bottom half is a scrolling view listing all the
1277  * words in the database.  A simple graphic blotch behind each word shows
1278  * whether the word is strongly or weakly related to spam or genuine messages.
1279  * Most operations go through the scripting message system, but it also peeks
1280  * at the BApplication data for examining simple things and when redrawing the
1281  * list of words.
1282  */
1283 
1284 class DatabaseWindow : public BWindow
1285 {
1286 public:
1287   /* Constructor.  It takes no arguments, and no destructor is declared for
       this class. */
1288   DatabaseWindow ();
1289 
1290   /* BeOS virtual functions. */
1291   virtual void MessageReceived (BMessage *MessagePntr);
1292   virtual bool QuitRequested ();
1293 
1294 private:
1295   /* Member variables. */
1296   ControlsView *m_ControlsViewPntr;
         /* The view with the buttons and settings, for the top half. */
1297   WordsView    *m_WordsViewPntr;
         /* The view with the list of words, for the bottom half. */
1298 };
1299 
1300 
1301 
1302 /******************************************************************************
1303  * ABSApp is the BApplication class for this program.  This handles messages
1304  * from the outside world (requests to load a database, or to add files to the
1305  * collection).  It responds to command line arguments (if you start up the
1306  * program a second time, the system will just send the arguments to the
1307  * existing running program).  It responds to scripting messages.  And it
1308  * responds to messages from the window.  Its thread does the main work of
1309  * updating the database and reading / writing files.
1310  */
1311 
1312 class ABSApp : public BApplication
1313 {
1314 public:
1315   /* Constructor and destructor. */
1316   ABSApp ();
1317   ~ABSApp ();
1318 
1319   /* BeOS virtual functions. */
1320   virtual void AboutRequested ();
1321   virtual void ArgvReceived (int32 argc, char **argv);
1322   virtual status_t GetSupportedSuites (BMessage *MessagePntr);
1323   virtual void MessageReceived (BMessage *MessagePntr);
1324   virtual void Pulse ();
1325   virtual bool QuitRequested ();
1326   virtual void ReadyToRun ();
1327   virtual void RefsReceived (BMessage *MessagePntr);
1328   virtual BHandler *ResolveSpecifier (BMessage *MessagePntr, int32 Index,
1329     BMessage *SpecifierMsgPntr, int32 SpecificationKind, const char *Property);
1330 
1331 private:
1332   /* Our member functions. */
1333   status_t AddFileToDatabase (ClassificationTypes IsSpamOrWhat,
1334     const char *FileName, char *ErrorMessage);
1335   status_t AddPositionIOToDatabase (ClassificationTypes IsSpamOrWhat,
1336     BPositionIO *MessageIOPntr, const char *OptionalFileName,
1337     char *ErrorMessage);
1338   status_t AddStringToDatabase (ClassificationTypes IsSpamOrWhat,
1339     const char *String, char *ErrorMessage);
1340   void AddWordsToSet (const char *InputString, size_t NumberOfBytes,
1341     char PrefixCharacter, set<string> &WordSet);
1342   status_t CreateDatabaseFile (char *ErrorMessage);
1343   void DefaultSettings ();
1344   status_t DeleteDatabaseFile (char *ErrorMessage);
1345   status_t EvaluateFile (const char *PathName, BMessage *ReplyMessagePntr,
1346     char *ErrorMessage);
1347   status_t EvaluatePositionIO (BPositionIO *PositionIOPntr,
1348     const char *OptionalFileName, BMessage *ReplyMessagePntr,
1349     char *ErrorMessage);
1350   status_t EvaluateString (const char *BufferPntr, ssize_t BufferSize,
1351     BMessage *ReplyMessagePntr, char *ErrorMessage);
1352   status_t GetWordsFromPositionIO (BPositionIO *PositionIOPntr,
1353     const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
1354   status_t InstallThings (char *ErrorMessage);
1355   status_t LoadDatabaseIfNeeded (char *ErrorMessage);
1356   status_t LoadSaveDatabase (bool DoLoad, char *ErrorMessage);
1357 public:
1358   status_t LoadSaveSettings (bool DoLoad);
1359 private:
1360   status_t MakeBackup (char *ErrorMessage);
1361   void MakeDatabaseEmpty ();
1362   void ProcessScriptingMessage (BMessage *MessagePntr,
1363     struct property_info *PropInfoPntr);
1364   status_t PurgeOldWords (char *ErrorMessage);
1365   status_t RecursivelyTokenizeMailComponent (
1366     BMailComponent *ComponentPntr, const char *OptionalFileName,
1367     set<string> &WordSet, char *ErrorMessage,
1368     int RecursionLevel, int MaxRecursionLevel);
1369   status_t SaveDatabaseIfNeeded (char *ErrorMessage);
1370   status_t TokenizeParts (BPositionIO *PositionIOPntr,
1371     const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
1372   status_t TokenizeWhole (BPositionIO *PositionIOPntr,
1373     const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
1374 
1375 public:
1376   /* Member variables.  Many are read by the window thread to see if it needs
1377   updating, and to draw the words.  However, the other threads will lock the
1378   BApplication or using scripting commands if they want to make changes. */
1379 
1380   bool m_DatabaseHasChanged;
1381     /* Set to TRUE when the in-memory database (stored in m_WordMap) has
1382     changed and is different from the on-disk database file.  When the
1383     application exits, the database will be written out if it has changed. */
1384 
1385   BString m_DatabaseFileName;
1386     /* The absolute path name to use for the database file on disk. */
1387 
1388   bool m_IgnorePreviousClassification;
1389     /* If TRUE then the previous classification of a message (stored in an
1390     attribute on the message file) will be ignored, and the message will be
1391     added to the requested spam/genuine list.  If this is FALSE then the spam
1392     won't be added to the list if it has already been classified as specified,
1393     but if it was mis-classified, it will be removed from the old list and
1394     added to the new list. */
1395 
1396   uint32 m_OldestAge;
1397     /* The age of the oldest word.  This will be the smallest age number in the
1398     database.  Mostly useful for scaling graphics representing age in the word
1399     display.  If the oldest word is no longer the oldest, this variable won't
1400     get immediately updated since it would take a lot of effort to find the
1401     next older age.  Since it's only used for display, we'll let it be slightly
1402     incorrect.  The next database load or purge will fix it. */
1403 
1404   uint32 m_PurgeAge;
1405     /* When purging old words, they have to be at least this old to be eligible
1406     for deletion.  Age is measured as the number of e-mails added to the
1407     database since the word was last updated in the database.  Zero means all
1408     words are old. */
1409 
1410   uint32 m_PurgePopularity;
1411     /* When purging old words, they have to be less than or equal to this
1412     popularity limit to be eligible for deletion.  Popularity is measured as
1413     the number of messages (spam and genuine) which have the word.  Zero means
1414     no words. */
1415 
1416   ScoringModes m_ScoringMode;
1417     /* Controls how to combine the word probabilities into an overall score.
1418     See the PN_SCORING_MODE comments for details. */
1419 
1420   BPath m_SettingsDirectoryPath;
1421     /* The constructor initialises this to the settings directory path.  It
1422     never changes after that. */
1423 
1424   bool m_SettingsHaveChanged;
1425     /* Set to TRUE when the settings are changed (different than the ones which
1426     were loaded).  When the application exits, the settings will be written out
1427     if they have changed. */
1428 
1429   double m_SmallestUseableDouble;
1430     /* When multiplying fractional numbers together, avoid using numbers
1431     smaller than this because the double exponent range is close to being
1432     exhausted.  The IEEE STANDARD 754 floating-point arithmetic (used on the
1433     Intel i8087 and later math processors) has 64 bit numbers with 53 bits of
1434     mantissa, giving it an underflow starting at 0.5**1022 = 2.2e-308 where it
1435     rounds off to the nearest multiple of 0.5**1074 = 4.9e-324. */
1436 
1437   TokenizeModes m_TokenizeMode;
1438     /* Controls how to convert the raw message text into words.  See the
1439     PN_TOKENIZE_MODE comments for details. */
1440 
1441   uint32 m_TotalGenuineMessages;
1442     /* Number of genuine messages which are in the database. */
1443 
1444   uint32 m_TotalSpamMessages;
1445     /* Number of spam messages which are in the database. */
1446 
1447   uint32 m_WordCount;
1448     /* The number of words currently in the database.  Stored separately as a
1449     member variable to avoid having to call m_WordMap.size() all the time,
1450     which other threads can't do while the database is being updated (but they
1451     can look at the word count variable). */
1452 
1453   StatisticsMap m_WordMap;
1454     /* The in-memory data structure holding the set of words and their
1455     associated statistics.  When the database isn't in use, it is an empty
1456     collection.  You should lock the BApplication if you are using the word
1457     collection (reading or writing) from another thread. */
1458 };
1459 
1460 
1461 
1462 /******************************************************************************
1463  * Global utility function to display an error message and return.  The message
1464  * part describes the error, and if ErrorNumber is non-zero, gets the string
1465  * ", error code $X (standard description)." appended to it.  If the message
1466  * is NULL then it gets defaulted to "Something went wrong".  The title part
1467  * doesn't get displayed (no title bar in the dialog box, but you can see it in
1468  * the debugger as the window thread name), and defaults to "Error Message" if
1469  * you didn't specify one.  If running in command line mode, the error gets
1470  * printed to stderr rather than showing up in a dialog box.
1471  */
1472 
1473 static void DisplayErrorMessage (
1474   const char *MessageString = NULL,
1475   int ErrorNumber = 0,
1476   const char *TitleString = NULL)
1477 {
1478   BAlert *AlertPntr;
1479   char ErrorBuffer [PATH_MAX + 1500];
1480 
1481   if (TitleString == NULL)
1482     TitleString = "SpamDBM Error Message";
1483 
1484   if (MessageString == NULL)
1485   {
1486     if (ErrorNumber == 0)
1487       MessageString = "No error, no message, why bother?";
1488     else
1489       MessageString = "Something went wrong";
1490   }
1491 
1492   if (ErrorNumber != 0)
1493   {
1494     sprintf (ErrorBuffer, "%s, error code $%X/%d (%s) has occured.",
1495       MessageString, ErrorNumber, ErrorNumber, strerror (ErrorNumber));
1496     MessageString = ErrorBuffer;
1497   }
1498 
1499   if (g_CommandLineMode || g_ServerMode)
1500     cerr << TitleString << ": " << MessageString << endl;
1501   else
1502   {
1503     AlertPntr = new BAlert (TitleString, MessageString,
1504       "Acknowledge", NULL, NULL, B_WIDTH_AS_USUAL, B_STOP_ALERT);
1505     if (AlertPntr != NULL)
1506       AlertPntr->Go ();
1507   }
1508 }
1509 
1510 
1511 
/******************************************************************************
 * Word wrap a long line of text into lines of at most 79 columns and print
 * the result on the given output stream, one endl terminated line at a time.
 * White space at the start of each output line is skipped.  Lines are broken
 * at the last white space that fits, and the white space just before the break
 * is dropped.  A run of text with no white space in it that is too long for
 * one line is simply chopped into 79 byte pieces.  Text which is empty or all
 * white space prints nothing.
 */

static void WrapTextToStream (std::ostream& OutputStream, const char *TextPntr)
{
  const int LineLength = 79;
  char     *StringPntr;
  char      TempString [LineLength+1];

  TempString[LineLength] = 0; /* Only needs to be done once. */

  while (*TextPntr != 0)
  {
    /* The character classification functions are only defined for values in
    the unsigned char range (and EOF), so cast to avoid passing negative
    numbers when the text has bytes with the high bit set. */

    while (isspace ((unsigned char) *TextPntr))
      TextPntr++; /* Skip leading spaces. */
    if (*TextPntr == 0)
      break; /* It was all spaces, don't print any more. */

    strncpy (TempString, TextPntr, LineLength);

    /* Advance StringPntr to the end of the temp string, partly to see how long
    it is (rather than doing strlen). */

    StringPntr = TempString;
    while (*StringPntr != 0)
      StringPntr++;

    if (StringPntr - TempString < LineLength)
    {
      /* This line fits completely. */
      OutputStream << TempString << std::endl;
      TextPntr += StringPntr - TempString;
      continue;
    }

    /* A full LineLength bytes got copied.  If the source text ends right
    there, or the very next character is white space, then the chunk is a
    whole line by itself and doesn't need to be broken at an earlier space.
    Reading TextPntr[LineLength] is safe since the LineLength bytes before it
    are all non-NUL. */

    if (TextPntr[LineLength] == 0 ||
    isspace ((unsigned char) TextPntr[LineLength]))
    {
      while (StringPntr > TempString &&
      isspace ((unsigned char) StringPntr[-1]))
        StringPntr--; /* Drop trailing spaces, like for wrapped lines. */
      *StringPntr = 0;
      OutputStream << TempString << std::endl;
      TextPntr += LineLength;
      continue;
    }

    /* Advance StringPntr to the last space in the temp string. */

    while (StringPntr > TempString)
    {
      if (isspace ((unsigned char) *StringPntr))
        break; /* Found the trailing space. */
      else /* Go backwards, looking for the trailing space. */
        StringPntr--;
    }

    /* Remove more trailing spaces at the end of the line, in case there were
    several spaces in a row. */

    while (StringPntr > TempString &&
    isspace ((unsigned char) StringPntr[-1]))
      StringPntr--;

    /* Print the line of text and advance the text pointer too. */

    if (StringPntr == TempString)
    {
      /* This line has no spaces, don't wrap it, just split off a chunk. */
      OutputStream << TempString << std::endl;
      TextPntr += strlen (TempString);
      continue;
    }

    *StringPntr = 0; /* Cut off after the first trailing space. */
    OutputStream << TempString << std::endl;
    TextPntr += StringPntr - TempString;
  }
}
1580 
1581 
1582 
1583 /******************************************************************************
1584  * Print the usage info to the stream.  Includes a list of all commands.
1585  */
1586 ostream& PrintUsage (ostream& OutputStream);
1587 
1588 ostream& PrintUsage (ostream& OutputStream)
1589 {
1590   struct property_info *PropInfoPntr;
1591 
1592   OutputStream << "\nSpamDBM - A Spam Database Manager\n";
1593   OutputStream << "Copyright © 2002 by Alexander G. M. Smith.  ";
1594   OutputStream << "Released to the public domain.\n\n";
1595   WrapTextToStream (OutputStream, "Compiled on " __DATE__ " at " __TIME__
1596 ".  $Id: spamdbm.cpp 30630 2009-05-05 01:31:01Z bga $  $HeadURL: http://svn.haiku-os.org/haiku/haiku/trunk/src/bin/mail_utils/spamdbm.cpp $");
1597   OutputStream << "\n"
1598 "This is a program for classifying e-mail messages as spam (junk mail which\n"
1599 "you don't want to read) and regular genuine messages.  It can learn what's\n"
1600 "spam and what's genuine.  You just give it a bunch of spam messages and a\n"
1601 "bunch of non-spam ones.  It uses them to make a list of the words from the\n"
1602 "messages with the probability that each word is from a spam message or from\n"
1603 "a genuine message.  Later on, it can use those probabilities to classify\n"
1604 "new messages as spam or not spam.  If the classifier stops working well\n"
1605 "(because the spammers have changed their writing style and vocabulary, or\n"
1606 "your regular correspondants are writing like spammers), you can use this\n"
1607 "program to update the list of words to identify the new messages\n"
1608 "correctly.\n"
1609 "\n"
1610 "The original idea was from Paul Graham's algorithm, which has an excellent\n"
1611 "writeup at: http://www.paulgraham.com/spam.html\n"
1612 "\n"
1613 "Gary Robinson came up with the improved algorithm, which you can read about at:\n"
1614 "http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html\n"
1615 "\n"
1616 "Then he, Tim Peters and the SpamBayes mailing list developed the Chi-Squared\n"
1617 "test, see http://mail.python.org/pipermail/spambayes/2002-October/001036.html\n"
1618 "for one of the earlier messages leading from the central limit theorem to\n"
1619 "the current chi-squared scoring method.\n"
1620 "\n"
1621 "Thanks go to Isaac Yonemoto for providing a better icon, which we can\n"
1622 "unfortunately no longer use, since the Hormel company wants people to\n"
1623 "avoid associating their meat product with junk e-mail.\n"
1624 "\n"
1625 "Tokenising code updated in 2005 to use some of the tricks that SpamBayes\n"
1626 "uses to extract words from messages.  In particular, HTML is now handled.\n"
1627 "\n"
1628 "Usage: Specify the operation as the first argument followed by more\n"
1629 "information as appropriate.  The program's configuration will affect the\n"
1630 "actual operation (things like the name of the database file to use, or\n"
1631 "whether it should allow non-email messages to be added).  In command line\n"
1632 "mode it will do the operation and exit.  In GUI/server mode a command line\n"
1633 "invocation will just send the command to the running server.  You can also\n"
1634 "use BeOS scripting (see the \"Hey\" command which you can get from\n"
1635 "http://www.bebits.com/app/2042 ) to control the Spam server.  And finally,\n"
1636 "there's also a GUI interface which shows up if you start it without any\n"
1637 "command line arguments.\n"
1638 "\n"
1639 "Commands:\n"
1640 "\n"
1641 "Quit\n"
1642 "Stop the program.  Useful if it's running as a server.\n"
1643 "\n";
1644 
1645   /* Go through all our scripting commands and add a description of each one to
1646   the usage text. */
1647 
1648   for (PropInfoPntr = g_ScriptingPropertyList + 0;
1649   PropInfoPntr->name != 0;
1650   PropInfoPntr++)
1651   {
1652     switch (PropInfoPntr->commands[0])
1653     {
1654       case B_GET_PROPERTY:
1655         OutputStream << "Get " << PropInfoPntr->name << endl;
1656         break;
1657 
1658       case B_SET_PROPERTY:
1659         OutputStream << "Set " << PropInfoPntr->name << " NewValue" << endl;
1660         break;
1661 
1662       case B_COUNT_PROPERTIES:
1663         OutputStream << "Count " << PropInfoPntr->name << endl;
1664         break;
1665 
1666       case B_CREATE_PROPERTY:
1667         OutputStream << "Create " << PropInfoPntr->name << endl;
1668         break;
1669 
1670       case B_DELETE_PROPERTY:
1671         OutputStream << "Delete " << PropInfoPntr->name << endl;
1672         break;
1673 
1674       case B_EXECUTE_PROPERTY:
1675         OutputStream << PropInfoPntr->name << endl;
1676         break;
1677 
1678       default:
1679         OutputStream << "Buggy Command: " << PropInfoPntr->name << endl;
1680         break;
1681     }
1682     WrapTextToStream (OutputStream, (char *)PropInfoPntr->usage);
1683     OutputStream << endl;
1684   }
1685 
1686   return OutputStream;
1687 }
1688 
1689 
1690 
1691 /******************************************************************************
1692  * A utility function to send a command to the application, will return after a
1693  * short delay if the application is busy (doesn't wait for it to be executed).
1694  * The reply from the application is also thrown away.  It used to be an
1695  * overloaded function, but the system couldn't distinguish between bool and
1696  * int, so now it has slightly different names depending on the arguments.
1697  */
1698 
1699 static void SubmitCommand (BMessage& CommandMessage)
1700 {
1701   status_t ErrorCode;
1702 
1703   ErrorCode = be_app_messenger.SendMessage (&CommandMessage,
1704     be_app_messenger /* reply messenger, throw away the reply */,
1705     1000000 /* delivery timeout */);
1706 
1707   if (ErrorCode != B_OK)
1708     cerr << "SubmitCommand failed to send a command, code " <<
1709     ErrorCode << " (" << strerror (ErrorCode) << ")." << endl;
1710 }
1711 
1712 
1713 static void SubmitCommandString (
1714   PropertyNumbers Property,
1715   uint32 CommandCode,
1716   const char *StringArgument = NULL)
1717 {
1718   BMessage CommandMessage (CommandCode);
1719 
1720   if (Property < 0 || Property >= PN_MAX)
1721   {
1722     DisplayErrorMessage ("SubmitCommandString bug.");
1723     return;
1724   }
1725   CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1726   if (StringArgument != NULL)
1727     CommandMessage.AddString (g_DataName, StringArgument);
1728   SubmitCommand (CommandMessage);
1729 }
1730 
1731 
1732 static void SubmitCommandInt32 (
1733   PropertyNumbers Property,
1734   uint32 CommandCode,
1735   int32 Int32Argument)
1736 {
1737   BMessage CommandMessage (CommandCode);
1738 
1739   if (Property < 0 || Property >= PN_MAX)
1740   {
1741     DisplayErrorMessage ("SubmitCommandInt32 bug.");
1742     return;
1743   }
1744   CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1745   CommandMessage.AddInt32 (g_DataName, Int32Argument);
1746   SubmitCommand (CommandMessage);
1747 }
1748 
1749 
1750 static void SubmitCommandBool (
1751   PropertyNumbers Property,
1752   uint32 CommandCode,
1753   bool BoolArgument)
1754 {
1755   BMessage CommandMessage (CommandCode);
1756 
1757   if (Property < 0 || Property >= PN_MAX)
1758   {
1759     DisplayErrorMessage ("SubmitCommandBool bug.");
1760     return;
1761   }
1762   CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1763   CommandMessage.AddBool (g_DataName, BoolArgument);
1764   SubmitCommand (CommandMessage);
1765 }
1766 
1767 
1768 
1769 /******************************************************************************
1770  * A utility function which will estimate the spaminess of file(s), not
1771  * callable from the application thread since it sends a scripting command to
1772  * the application and waits for results.  For each file there will be an entry
1773  * reference in the message.  For each of those, run it through the spam
1774  * estimator and display a box with the results.  This function is used both by
1775  * the file requestor and by dragging and dropping into the middle of the words
1776  * view.
1777  */
1778 
1779 static void EstimateRefFilesAndDisplay (BMessage *MessagePntr)
1780 {
1781   BAlert     *AlertPntr;
1782   BEntry      Entry;
1783   entry_ref   EntryRef;
1784   status_t    ErrorCode;
1785   int         i, j;
1786   BPath       Path;
1787   BMessage    ReplyMessage;
1788   BMessage    ScriptingMessage;
1789   const char *StringPntr;
1790   float       TempFloat;
1791   int32       TempInt32;
1792   char        TempString [PATH_MAX + 1024 +
1793                 g_MaxInterestingWords * (g_MaxWordLength + 16)];
1794 
1795   for (i = 0; MessagePntr->FindRef ("refs", i, &EntryRef) == B_OK; i++)
1796   {
1797     /* See if the entry is a valid file or directory or other thing. */
1798 
1799     ErrorCode = Entry.SetTo (&EntryRef, true /* traverse symbolic links */);
1800     if (ErrorCode != B_OK || !Entry.Exists () || Entry.GetPath (&Path) != B_OK)
1801       continue;
1802 
1803     /* Evaluate the spaminess of the file. */
1804 
1805     ScriptingMessage.MakeEmpty ();
1806     ScriptingMessage.what = B_SET_PROPERTY;
1807     ScriptingMessage.AddSpecifier (g_PropertyNames[PN_EVALUATE]);
1808     ScriptingMessage.AddString (g_DataName, Path.Path ());
1809 
1810     if (be_app_messenger.SendMessage (&ScriptingMessage,&ReplyMessage) != B_OK)
1811       break; /* App has died or something is wrong. */
1812 
1813     if (ReplyMessage.FindInt32 ("error", &TempInt32) != B_OK ||
1814     TempInt32 != B_OK)
1815       break; /* Error messages will be displayed elsewhere. */
1816 
1817     ReplyMessage.FindFloat (g_ResultName, &TempFloat);
1818     sprintf (TempString, "%f spam ratio for \"%s\".\nThe top words are:",
1819       (double) TempFloat, Path.Path ());
1820 
1821     for (j = 0; j < 20 /* Don't print too many! */; j++)
1822     {
1823       if (ReplyMessage.FindString ("words", j, &StringPntr) != B_OK ||
1824       ReplyMessage.FindFloat ("ratios", j, &TempFloat) != B_OK)
1825         break;
1826 
1827       sprintf (TempString + strlen (TempString), "\n%s / %f",
1828         StringPntr, TempFloat);
1829     }
1830     if (j >= 20 && j < g_MaxInterestingWords)
1831       sprintf (TempString + strlen (TempString), "\nAnd up to %d more words.",
1832         g_MaxInterestingWords - j);
1833 
1834     AlertPntr = new BAlert ("Estimate", TempString, "OK");
1835     if (AlertPntr != NULL)
1836       AlertPntr->Go ();
1837   }
1838 }
1839 
1840 
1841 
/******************************************************************************
 * A utility function from the http://sourceforge.net/projects/spambayes
 * SpamBayes project.  Returns prob(chisq >= x2, with v degrees of freedom),
 * the probability that a chi-squared value with v degrees of freedom would be
 * at least as large as the given number x2.  A result near zero means that x2
 * is unusually large, a result near one means values that big are common.
 *
 * The result is the sum of the first v/2 terms of the series
 * exp(-m) * (1 + m + m^2/2! + m^3/3! + ...) where m is x2/2, with each term
 * made from the previous one, and is limited to a maximum of 1.0.  This only
 * works when v is even; an odd v gets -1.0 returned as an out of range hint.
 */

static double ChiSquaredProbability (double x2, int v)
{
  if ((v & 1) != 0)
    return -1.0; /* Odd degrees of freedom aren't supported. */

  const int    NumberOfTerms = v / 2;
  const double HalfX2 = x2 / 2.0;
  double       CurrentTerm;
  double       RunningSum;
  int          TermIndex;

  /* The first term of the series.  For very large x2 the exponential
  underflows to zero, and then so does the whole sum. */

  CurrentTerm = exp (-HalfX2);
  RunningSum = CurrentTerm;

  TermIndex = 1;
  while (TermIndex < NumberOfTerms)
  {
    CurrentTerm *= HalfX2 / TermIndex;
    RunningSum += CurrentTerm;
    TermIndex++;
  }

  /* With small x2 and large v, accumulated roundoff error (plus error in the
  platform's exp function) can push the sum slightly above 1.0, which isn't a
  valid probability, so clip it. */

  return (RunningSum > 1.0) ? 1.0 : RunningSum;
}
1886 
1887 
1888 /******************************************************************************
1889  * A utility function to remove the "[Spam 99.9%] " from in front of the
1890  * MAIL:subject attribute of a file.
1891  */
1892 
1893 static status_t RemoveSpamPrefixFromSubjectAttribute (BNode *BNodePntr)
1894 {
1895   status_t    ErrorCode;
1896   const char *MailSubjectName = "MAIL:subject";
1897   char       *StringPntr;
1898   char        SubjectString [2000];
1899 
1900   ErrorCode = BNodePntr->ReadAttr (MailSubjectName,
1901     B_STRING_TYPE, 0 /* offset */, SubjectString,
1902     sizeof (SubjectString) - 1);
1903   if (ErrorCode <= 0)
1904     return 0; /* The attribute isn't there so we don't care. */
1905   if (ErrorCode >= (int) sizeof (SubjectString) - 1)
1906     return 0; /* Can't handle subjects which are too long. */
1907 
1908   SubjectString [ErrorCode] = 0;
1909   ErrorCode = 0; /* So do-nothing exit returns zero. */
1910   if (strncmp (SubjectString, "[Spam ", 6) == 0)
1911   {
1912     for (StringPntr = SubjectString;
1913     *StringPntr != 0 && *StringPntr != ']'; StringPntr++)
1914       ; /* No body in this for loop. */
1915     if (StringPntr[0] == ']' && StringPntr[1] == ' ')
1916     {
1917       ErrorCode = BNodePntr->RemoveAttr (MailSubjectName);
1918       ErrorCode = BNodePntr->WriteAttr (MailSubjectName,
1919         B_STRING_TYPE, 0 /* offset */,
1920         StringPntr + 2, strlen (StringPntr + 2) + 1);
1921       if (ErrorCode > 0)
1922         ErrorCode = 0;
1923     }
1924   }
1925 
1926   return ErrorCode;
1927 }
1928 
1929 
1930 
1931 /******************************************************************************
1932  * The tokenizing functions.  To make tokenization of the text easier to
1933  * understand, it is broken up into several passes.  Each pass goes over the
1934  * text (can include NUL bytes) and extracts all the words it can recognise
1935  * (can be none).  The extracted words are added to the WordSet, with the
1936  * PrefixCharacter prepended (zero if none) so we can distinguish between words
1937  * found in headers and in the text body.  It also modifies the input text
1938  * buffer in-place to change the text that the next pass will see (blanking out
1939  * words that it wants to delete, but not inserting much new text since the
1940  * buffer can't be enlarged).  They all return the number of bytes remaining in
1941  * InputString after it has been modified to be input for the next pass.
1942  * Returns zero if it has exhausted the possibility of getting more words, or
1943  * if something goes wrong.
1944  */
1945 
/* Tokenizer pass which converts the upper case ASCII letters A to Z in the
buffer to lower case, in place.  All other bytes, including NUL bytes and ones
with the high bit set, are left as they are.  Doesn't extract any words.
Returns NumberOfBytes since the size of the text doesn't change. */

static size_t TokenizerPassLowerCase (
  char *BufferPntr,
  size_t NumberOfBytes)
{
  size_t ByteIndex;

  for (ByteIndex = 0; ByteIndex < NumberOfBytes; ByteIndex++)
  {
    const char Letter = BufferPntr [ByteIndex];

    /* Do our own lower case conversion; tolower () has problems with UTF-8
    characters that have the high bit set. */

    if (Letter >= 'A' && Letter <= 'Z')
      BufferPntr [ByteIndex] = Letter + ('a' - 'A');
  }
  return NumberOfBytes;
}
1965 
1966 
1967 /* A utility function for some commonly repeated code.  If this was Modula-2,
1968 we could use a nested procedure.  But it's not.  Adds the given word to the set
1969 of words, checking for maximum word length and prepending the prefix to the
1970 word, which gets modified by this function to reflect the word actually added
1971 to the set. */
1972 
1973 static void AddWordAndPrefixToSet (
1974   string &Word,
1975   const char *PrefixString,
1976   set<string> &WordSet)
1977 {
1978   if (Word.empty ())
1979     return;
1980 
1981   if (Word.size () > g_MaxWordLength)
1982     Word.resize (g_MaxWordLength);
1983   Word.insert (0, PrefixString);
1984   WordSet.insert (Word);
1985 }
1986 
1987 
1988 /* Hunt through the text for various URLs and extract the components as
1989 separate words.  Doesn't affect the text in the buffer.  Looks for
1990 protocol://user:password@computer:port/path?query=key#anchor strings.  Also
1991 www.blah strings are detected and broken down.  Doesn't do HREF="" strings
1992 where the string has a relative path (no host computer name).  Assumes the
1993 input buffer is already in lower case. */
1994 
1995 static size_t TokenizerPassExtractURLs (
1996   char *BufferPntr,
1997   size_t NumberOfBytes,
1998   char PrefixCharacter,
1999   set<string> &WordSet)
2000 {
2001   char   *AtSignStringPntr;
2002   char   *HostStringPntr;
2003   char   *InputStringEndPntr;
2004   char   *InputStringPntr;
2005   char   *OptionsStringPntr;
2006   char   *PathStringPntr;
2007   char    PrefixString [2];
2008   char   *ProtocolStringPntr;
2009   string  Word;
2010 
2011   InputStringPntr = BufferPntr;
2012   InputStringEndPntr = BufferPntr + NumberOfBytes;
2013   PrefixString [0] = PrefixCharacter;
2014   PrefixString [1] = 0;
2015 
2016   while (InputStringPntr < InputStringEndPntr - 4)
2017   {
2018     HostStringPntr = NULL;
2019     if (memcmp (InputStringPntr, "www.", 4) == 0)
2020       HostStringPntr = InputStringPntr;
2021     else if (memcmp (InputStringPntr, "://", 3) == 0)
2022     {
2023       /* Find the protocol name, and add it as a word such as "ftp:" "http:" */
2024       ProtocolStringPntr = InputStringPntr;
2025       while (ProtocolStringPntr > BufferPntr &&
2026       isalpha (ProtocolStringPntr[-1]))
2027         ProtocolStringPntr--;
2028       Word.assign (ProtocolStringPntr,
2029         (InputStringPntr - ProtocolStringPntr) + 1 /* for the colon */);
2030       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2031       HostStringPntr = InputStringPntr + 3; /* Skip past the "://" */
2032     }
2033     if (HostStringPntr == NULL)
2034     {
2035       InputStringPntr++;
2036       continue;
2037     }
2038 
2039     /* Got a host name string starting at HostStringPntr.  It's everything
2040     until the next slash or space, like "user:password@computer:port". */
2041 
2042     InputStringPntr = HostStringPntr;
2043     AtSignStringPntr = NULL;
2044     while (InputStringPntr < InputStringEndPntr &&
2045     (*InputStringPntr != '/' && !isspace (*InputStringPntr)))
2046     {
2047       if (*InputStringPntr == '@')
2048         AtSignStringPntr = InputStringPntr;
2049       InputStringPntr++;
2050     }
2051     if (AtSignStringPntr != NULL)
2052     {
2053       /* Add a word with the user and password, unseparated. */
2054       Word.assign (HostStringPntr,
2055         AtSignStringPntr - HostStringPntr + 1 /* for the @ sign */);
2056       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2057       HostStringPntr = AtSignStringPntr + 1;
2058     }
2059 
2060     /* Add a word with the computer and port, unseparated. */
2061 
2062     Word.assign (HostStringPntr, InputStringPntr - HostStringPntr);
2063     AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2064 
2065     /* Now get the path name, not including the extra junk after ?  and #
2066     separators (they're stored as separate options).  Stops at white space or a
2067     double quote mark. */
2068 
2069     PathStringPntr = InputStringPntr;
2070     OptionsStringPntr = NULL;
2071     while (InputStringPntr < InputStringEndPntr &&
2072     (*InputStringPntr != '"' && !isspace (*InputStringPntr)))
2073     {
2074       if (OptionsStringPntr == NULL &&
2075       (*InputStringPntr == '?' || *InputStringPntr == '#'))
2076         OptionsStringPntr = InputStringPntr;
2077       InputStringPntr++;
2078     }
2079 
2080     if (OptionsStringPntr == NULL)
2081     {
2082       /* No options, all path. */
2083       Word.assign (PathStringPntr, InputStringPntr - PathStringPntr);
2084       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2085     }
2086     else
2087     {
2088       /* Insert the path before the options. */
2089       Word.assign (PathStringPntr, OptionsStringPntr - PathStringPntr);
2090       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2091 
2092       /* Insert all the options as a word. */
2093       Word.assign (OptionsStringPntr, InputStringPntr - OptionsStringPntr);
2094       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2095     }
2096   }
2097   return NumberOfBytes;
2098 }
2099 
2100 
2101 /* Replace long Asian words (likely to actually be sentences) with the first
2102 character in the word. */
2103 
2104 static size_t TokenizerPassTruncateLongAsianWords (
2105   char *BufferPntr,
2106   size_t NumberOfBytes)
2107 {
2108   char *EndOfStringPntr;
2109   char *InputStringPntr;
2110   int   Letter;
2111   char *OutputStringPntr;
2112   char *StartOfInputLongUnicodeWord;
2113   char *StartOfOutputLongUnicodeWord;
2114 
2115   InputStringPntr = BufferPntr;
2116   EndOfStringPntr = InputStringPntr + NumberOfBytes;
2117   OutputStringPntr = InputStringPntr;
2118   StartOfInputLongUnicodeWord = NULL; /* Non-NULL flags it as started. */
2119   StartOfOutputLongUnicodeWord = NULL;
2120 
2121   /* Copy the text from the input to the output (same buffer), but when we find
2122   a sequence of UTF-8 characters that is too long then truncate it down to one
2123   character and reset the output pointer to be after that character, thus
2124   deleting the word.  Replacing the deleted characters after it with spaces
2125   won't work since we need to preserve the lack of space to handle those sneaky
2126   HTML artificial word breakers.  So that Thelongword<blah>ing becomes
2127   "T<blah>ing" rather than "T <blah>ing", so the next step joins them up into
2128   "Ting" rather than "T" and "ing".  The first code in a UTF-8 character is
2129   11xxxxxx and subsequent ones are 10xxxxxx. */
2130 
2131   while (InputStringPntr < EndOfStringPntr)
2132   {
2133     Letter = (unsigned char) *InputStringPntr;
2134     if (Letter < 128) // Got a regular ASCII letter?
2135     {
2136       if (StartOfInputLongUnicodeWord != NULL)
2137       {
2138         if (InputStringPntr - StartOfInputLongUnicodeWord >
2139         (int) g_MaxWordLength * 2)
2140         {
2141           /* Need to truncate the long word (100 bytes or about 50 characters)
2142           back down to the first UTF-8 character, so find out where the first
2143           character ends (skip past the 10xxxxxx bytes), and rewind the output
2144           pointer to be just after that (ignoring the rest of the long word in
2145           effect). */
2146 
2147           OutputStringPntr = StartOfOutputLongUnicodeWord + 1;
2148           while (OutputStringPntr < InputStringPntr)
2149           {
2150             Letter = (unsigned char) *OutputStringPntr;
2151             if (Letter < 128 || Letter >= 192)
2152               break;
2153             ++OutputStringPntr; // Still a UTF-8 middle of the character code.
2154           }
2155         }
2156         StartOfInputLongUnicodeWord = NULL;
2157       }
2158     }
2159     else if (Letter >= 192 && StartOfInputLongUnicodeWord == NULL)
2160     {
2161       /* Got the start of a UTF-8 character.  Remember the spot so we can see
2162       if this is a too long UTF-8 word, which is often a whole sentence in
2163       asian languages, since they sort of use a single character per word. */
2164 
2165       StartOfInputLongUnicodeWord = InputStringPntr;
2166       StartOfOutputLongUnicodeWord = OutputStringPntr;
2167     }
2168     *OutputStringPntr++ = *InputStringPntr++;
2169   }
2170   return OutputStringPntr - BufferPntr;
2171 }
2172 
2173 
2174 /* Find all the words in the string and add them to our local set of words.
2175 The characters considered white space are defined by g_SpaceCharacters.  This
2176 function is also used as a subroutine by other tokenizer functions when they
2177 have a bunch of presumably plain text they want broken into words and added. */
2178 
2179 static size_t TokenizerPassGetPlainWords (
2180   char *BufferPntr,
2181   size_t NumberOfBytes,
2182   char PrefixCharacter,
2183   set<string> &WordSet)
2184 {
2185   string  AccumulatedWord;
2186   char   *EndOfStringPntr;
2187   size_t  Length;
2188   int     Letter;
2189 
2190   if (NumberOfBytes <= 0)
2191     return 0; /* Nothing to process. */
2192 
2193   if (PrefixCharacter != 0)
2194     AccumulatedWord = PrefixCharacter;
2195   EndOfStringPntr = BufferPntr + NumberOfBytes;
2196   while (true)
2197   {
2198     if (BufferPntr >= EndOfStringPntr)
2199       Letter = EOF; // Usually a negative number.
2200     else
2201       Letter = (unsigned char) *BufferPntr++;
2202 
2203     /* See if it is a letter we treat as white space.  Some word separators
2204     like dashes and periods aren't considered as space.  Note that codes above
2205     127 are UTF-8 characters, which we consider non-space. */
2206 
2207     if (Letter < 0 /* EOF is -1 */ ||
2208     (Letter < 128 && g_SpaceCharacters[Letter]))
2209     {
2210       /* That space finished off a word.  Remove trailing periods... */
2211 
2212       while ((Length = AccumulatedWord.size()) > 0 &&
2213       AccumulatedWord [Length-1] == '.')
2214         AccumulatedWord.resize (Length - 1);
2215 
2216       /* If there's anything left in the word, add it to the set.  Also ignore
2217       words which are too big (it's probably some binary encoded data).  But
2218       leave room for supercalifragilisticexpialidoceous.  According to one web
2219       site, pneumonoultramicroscopicsilicovolcanoconiosis is the longest word
2220       currently in English.  Note that some uuencoded data was seen with a 60
2221       character line length. */
2222 
2223       if (PrefixCharacter != 0)
2224         Length--; // Don't count prefix when judging size or emptiness.
2225       if (Length > 0 && Length <= g_MaxWordLength)
2226         WordSet.insert (AccumulatedWord);
2227 
2228       /* Empty out the string to get ready for the next word.  Not quite empty,
2229       start it off with the prefix character if any. */
2230 
2231       if (PrefixCharacter != 0)
2232         AccumulatedWord = PrefixCharacter;
2233       else
2234         AccumulatedWord.resize (0);
2235     }
2236     else /* Not a space-like character, add it to the word. */
2237       AccumulatedWord.append (1 /* one copy of the char */, (char) Letter);
2238 
2239     if (Letter < 0)
2240       break; /* End of data.  Exit here so that last word got processed. */
2241   }
2242   return NumberOfBytes;
2243 }
2244 
2245 
2246 /* Delete Things from the text.  The Thing is marked by a start string and an
2247 end string, such as "<!--" and "--> for HTML comment things.  All the text
2248 between the markers will be added to the word list before it gets deleted from
2249 the buffer.  The markers must be prepared in lower case and the buffer is
2250 assumed to have already been converted to lower case.  You can specify an empty
2251 string for the end marker if you're just matching a string constant like
2252 "&nbsp;", which you would put in the starting marker.  This is a utility
2253 function used by other tokenizer functions. */
2254 
2255 static size_t TokenizerUtilRemoveStartEndThing (
2256   char *BufferPntr,
2257   size_t NumberOfBytes,
2258   char PrefixCharacter,
2259   set<string> &WordSet,
2260   const char *ThingStartCode,
2261   const char *ThingEndCode,
2262   bool ReplaceWithSpace)
2263 {
2264   char *EndOfStringPntr;
2265   bool  FoundAndDeletedThing;
2266   char *InputStringPntr;
2267   char *OutputStringPntr;
2268   int   ThingEndLength;
2269   char *ThingEndPntr;
2270   int   ThingStartLength;
2271 
2272   InputStringPntr = BufferPntr;
2273   EndOfStringPntr = InputStringPntr + NumberOfBytes;
2274   OutputStringPntr = InputStringPntr;
2275   ThingStartLength = strlen (ThingStartCode);
2276   ThingEndLength = strlen (ThingEndCode);
2277 
2278   if (ThingStartLength <= 0)
2279     return NumberOfBytes; /* Need some things to look for first! */
2280 
2281   while (InputStringPntr < EndOfStringPntr)
2282   {
2283     /* Search for the starting marker. */
2284 
2285     FoundAndDeletedThing = false;
2286     if (EndOfStringPntr - InputStringPntr >=
2287     ThingStartLength + ThingEndLength /* space remains for start + end */ &&
2288     *InputStringPntr == *ThingStartCode &&
2289     memcmp (InputStringPntr, ThingStartCode, ThingStartLength) == 0)
2290     {
2291       /* Found the start marker.  Look for the terminating string.  If it is an
2292       empty string, then we've found it right now! */
2293 
2294       ThingEndPntr = InputStringPntr + ThingStartLength;
2295       while (EndOfStringPntr - ThingEndPntr >= ThingEndLength)
2296       {
2297         if (ThingEndLength == 0 ||
2298         (*ThingEndPntr == *ThingEndCode &&
2299         memcmp (ThingEndPntr, ThingEndCode, ThingEndLength) == 0))
2300         {
2301           /* Got the end of the Thing.  First dump the text inbetween the start
2302           and end markers into the words list. */
2303 
2304           TokenizerPassGetPlainWords (InputStringPntr + ThingStartLength,
2305             ThingEndPntr - (InputStringPntr + ThingStartLength),
2306             PrefixCharacter, WordSet);
2307 
2308           /* Delete by not updating the output pointer while moving the input
2309           pointer to just after the ending tag. */
2310 
2311           InputStringPntr = ThingEndPntr + ThingEndLength;
2312           if (ReplaceWithSpace)
2313             *OutputStringPntr++ = ' ';
2314           FoundAndDeletedThing = true;
2315           break;
2316         }
2317         ThingEndPntr++;
2318       } /* End while ThingEndPntr */
2319     }
2320     if (!FoundAndDeletedThing)
2321       *OutputStringPntr++ = *InputStringPntr++;
2322   } /* End while InputStringPntr */
2323 
2324   return OutputStringPntr - BufferPntr;
2325 }
2326 
2327 
2328 static size_t TokenizerPassRemoveHTMLComments (
2329   char *BufferPntr,
2330   size_t NumberOfBytes,
2331   char PrefixCharacter,
2332   set<string> &WordSet)
2333 {
2334   return TokenizerUtilRemoveStartEndThing (BufferPntr, NumberOfBytes,
2335     PrefixCharacter, WordSet, "<!--", "-->", false);
2336 }
2337 
2338 
2339 static size_t TokenizerPassRemoveHTMLStyle (
2340   char *BufferPntr,
2341   size_t NumberOfBytes,
2342   char PrefixCharacter,
2343   set<string> &WordSet)
2344 {
2345   return TokenizerUtilRemoveStartEndThing (BufferPntr, NumberOfBytes,
2346     PrefixCharacter, WordSet,
2347     "<style", "/style>", false /* replace with space if true */);
2348 }
2349 
2350 
2351 /* Convert Japanese periods (a round hollow dot symbol) to spaces so that the
2352 start of the next sentence is recognised at least as the start of a very long
2353 word.  The Japanese comma also does the same job. */
2354 
2355 static size_t TokenizerPassJapanesePeriodsToSpaces (
2356   char *BufferPntr,
2357   size_t NumberOfBytes,
2358   char PrefixCharacter,
2359   set<string> &WordSet)
2360 {
2361   size_t BytesRemaining = NumberOfBytes;
2362 
2363   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2364     BytesRemaining, PrefixCharacter, WordSet, "。" /* period */, "", true);
2365   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2366     BytesRemaining, PrefixCharacter, WordSet, "、" /* comma */, "", true);
2367   return BytesRemaining;
2368 }
2369 
2370 
2371 /* Delete HTML tags from the text.  The contents of the tag are added as words
2372 before being deleted.  <P>, <BR> and &nbsp; are replaced by spaces at this
2373 stage while other HTML things get replaced by nothing. */
2374 
2375 static size_t TokenizerPassRemoveHTMLTags (
2376   char *BufferPntr,
2377   size_t NumberOfBytes,
2378   char PrefixCharacter,
2379   set<string> &WordSet)
2380 {
2381   size_t BytesRemaining = NumberOfBytes;
2382 
2383   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2384     BytesRemaining, PrefixCharacter, WordSet, "&nbsp;", "", true);
2385   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2386     BytesRemaining, PrefixCharacter, WordSet, "<p", ">", true);
2387   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2388     BytesRemaining, PrefixCharacter, WordSet, "<br", ">", true);
2389   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2390     BytesRemaining, PrefixCharacter, WordSet, "<", ">", false);
2391   return BytesRemaining;
2392 }
2393 
2394 
2395 
2396 /******************************************************************************
2397  * Implementation of the ABSApp class, constructor, destructor and the rest of
2398  * the member functions in mostly alphabetical order.
2399  */
2400 
2401 ABSApp::ABSApp ()
2402 : BApplication (g_ABSAppSignature),
2403   m_DatabaseHasChanged (false),
2404   m_SettingsHaveChanged (false)
2405 {
2406   status_t    ErrorCode;
2407   int         HalvingCount;
2408   int         i;
2409   const void *ResourceData;
2410   size_t      ResourceSize;
2411   BResources *ResourcesPntr;
2412 
2413   MakeDatabaseEmpty ();
2414 
2415   /* Set up the pathname which identifies our settings directory.  Note that
2416   the actual settings are loaded later on (or set to defaults) by the main()
2417   function, before this BApplication starts running.  So we don't bother
2418   initialising the other setting related variables here. */
2419 
2420   ErrorCode =
2421     find_directory (B_USER_SETTINGS_DIRECTORY, &m_SettingsDirectoryPath);
2422   if (ErrorCode == B_OK)
2423     ErrorCode = m_SettingsDirectoryPath.Append (g_SettingsDirectoryName);
2424   if (ErrorCode != B_OK)
2425     m_SettingsDirectoryPath.SetTo (".");
2426 
2427   /* Set up the table which identifies which characters are spaces and which
2428   are not.  Spaces are all control characters and all punctuation except for:
2429   apostrophe (so "it's" and possessive versions of words get stored), dash (for
2430   hyphenated words), dollar sign (for cash amounts), period (for IP addresses,
2431   we later remove trailing periods). */
2432 
2433   memset (g_SpaceCharacters, 1, sizeof (g_SpaceCharacters));
2434   g_SpaceCharacters['\''] = false;
2435   g_SpaceCharacters['-'] = false;
2436   g_SpaceCharacters['$'] = false;
2437   g_SpaceCharacters['.'] = false;
2438   for (i = '0'; i <= '9'; i++)
2439     g_SpaceCharacters[i] = false;
2440   for (i = 'A'; i <= 'Z'; i++)
2441     g_SpaceCharacters[i] = false;
2442   for (i = 'a'; i <= 'z'; i++)
2443     g_SpaceCharacters[i] = false;
2444 
2445   /* Initialise the busy cursor from data in the application's resources. */
2446 
2447   if ((ResourcesPntr = AppResources ()) != NULL && (ResourceData =
2448   ResourcesPntr->LoadResource ('CURS', "Busy Cursor", &ResourceSize)) != NULL
2449   && ResourceSize >= 68 /* Size of a raw 2x16x16x8+4 cursor is 68 bytes */)
2450     g_BusyCursor = new BCursor (ResourceData);
2451 
2452   /* Find out the smallest usable double by seeing how small we can make it. */
2453 
2454   m_SmallestUseableDouble = 1.0;
2455   HalvingCount = 0;
2456   while (HalvingCount < 10000 && m_SmallestUseableDouble > 0.0)
2457   {
2458     HalvingCount++;
2459     m_SmallestUseableDouble /= 2;
2460   }
2461 
2462   /* Recreate the number.  But don't make quite as small, we want to allow some
2463   precision bits and a bit of extra margin for intermediate results in future
2464   calculations. */
2465 
2466   HalvingCount -= 50 + sizeof (double) * 8;
2467 
2468   m_SmallestUseableDouble = 1.0;
2469   while (HalvingCount > 0)
2470   {
2471     HalvingCount--;
2472     m_SmallestUseableDouble /= 2;
2473   }
2474 }
2475 
2476 
2477 ABSApp::~ABSApp ()
2478 {
2479   status_t ErrorCode;
2480   char     ErrorMessage [PATH_MAX + 1024];
2481 
2482   if (m_SettingsHaveChanged)
2483     LoadSaveSettings (false /* DoLoad */);
2484   if ((ErrorCode = SaveDatabaseIfNeeded (ErrorMessage)) != B_OK)
2485     DisplayErrorMessage (ErrorMessage, ErrorCode, "Exiting Error");
2486   delete g_BusyCursor;
2487   g_BusyCursor = NULL;
2488 }
2489 
2490 
2491 /* Display a box showing information about this program. */
2492 
2493 void ABSApp::AboutRequested ()
2494 {
2495   BAlert *AboutAlertPntr;
2496 
2497   AboutAlertPntr = new BAlert ("About",
2498 "SpamDBM - Spam Database Manager\n\n"
2499 
2500 "This is a BeOS program for classifying e-mail messages as spam (unwanted \
2501 junk mail) or as genuine mail using a Bayesian statistical approach.  There \
2502 is also a Mail Daemon Replacement add-on to filter mail using the \
2503 classification statistics collected earlier.\n\n"
2504 
2505 "Written by Alexander G. M. Smith, fall 2002.\n\n"
2506 
2507 "The original idea was from Paul Graham's algorithm, which has an excellent \
2508 writeup at: http://www.paulgraham.com/spam.html\n\n"
2509 
2510 "Gary Robinson came up with the improved algorithm, which you can read about \
2511 at: http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html\n\n"
2512 
2513 "Mr. Robinson, Tim Peters and the SpamBayes mailing list people then \
2514 developed the even better chi-squared scoring method.\n\n"
2515 
2516 "Icon courtesy of Isaac Yonemoto, though it is no longer used since Hormel \
2517 doesn't want their meat product associated with junk e-mail.\n\n"
2518 
2519 "Tokenising code updated in 2005 to use some of the tricks that SpamBayes \
2520 uses to extract words from messages.  In particular, HTML is now handled.\n\n"
2521 
2522 "Released to the public domain, with no warranty.\n"
2523 "$Revision: 30630 $\n"
2524 "Compiled on " __DATE__ " at " __TIME__ ".", "Done");
2525   if (AboutAlertPntr != NULL)
2526   {
2527     AboutAlertPntr->SetShortcut (0, B_ESCAPE);
2528     AboutAlertPntr->Go ();
2529   }
2530 }
2531 
2532 
2533 /* Add the text in the given file to the database as an example of a spam or
2534 genuine message, or removes it from the database if you claim it is
2535 CL_UNCERTAIN.  Also resets the spam ratio attribute to show the effect of the
2536 database change. */
2537 
2538 status_t ABSApp::AddFileToDatabase (
2539   ClassificationTypes IsSpamOrWhat,
2540   const char *FileName,
2541   char *ErrorMessage)
2542 {
2543   status_t ErrorCode;
2544   BFile    MessageFile;
2545   BMessage TempBMessage;
2546 
2547   ErrorCode = MessageFile.SetTo (FileName, B_READ_ONLY);
2548   if (ErrorCode != B_OK)
2549   {
2550     sprintf (ErrorMessage, "Unable to open file \"%s\" for reading", FileName);
2551     return ErrorCode;
2552   }
2553 
2554   ErrorCode = AddPositionIOToDatabase (IsSpamOrWhat,
2555     &MessageFile, FileName, ErrorMessage);
2556   MessageFile.Unset ();
2557   if (ErrorCode != B_OK)
2558     return ErrorCode;
2559 
2560   /* Re-evaluate the file so that the user sees the new ratio attribute. */
2561   return EvaluateFile (FileName, &TempBMessage, ErrorMessage);
2562 }
2563 
2564 
2565 /* Add the given text to the database.  The unique words found in MessageIOPntr
2566 will be added to the database (incrementing the count for the number of
2567 messages using each word, either the spam or genuine count depending on
2568 IsSpamOrWhat).  It will remove the message (decrement the word counts) if you
2569 specify CL_UNCERTAIN as the new classification.  And if it switches from spam
2570 to genuine or vice versa, it will do both - decrement the counts for the old
2571 class and increment the counts for the new one.  An attribute will be added to
2572 MessageIOPntr (if it is a file) to record that it has been marked as Spam or
2573 Genuine (so that it doesn't get added to the database a second time).  If it is
2574 being removed from the database, the classification attribute gets removed too.
2575 If things go wrong, a non-zero error code will be returned and an explanation
2576 written to ErrorMessage (assumed to be at least PATH_MAX + 1024 bytes long).
2577 OptionalFileName is just used in the error message to identify the file to the
2578 user. */
2579 
2580 status_t ABSApp::AddPositionIOToDatabase (
2581   ClassificationTypes IsSpamOrWhat,
2582   BPositionIO *MessageIOPntr,
2583   const char *OptionalFileName,
2584   char *ErrorMessage)
2585 {
2586   BNode                             *BNodePntr;
2587   char                               ClassificationString [NAME_MAX];
2588   StatisticsMap::iterator            DataIter;
2589   status_t                           ErrorCode = 0;
2590   pair<StatisticsMap::iterator,bool> InsertResult;
2591   uint32                             NewAge;
2592   StatisticsRecord                   NewStatistics;
2593   ClassificationTypes                PreviousClassification;
2594   StatisticsPointer                  StatisticsPntr;
2595   set<string>::iterator              WordEndIter;
2596   set<string>::iterator              WordIter;
2597   set<string>                        WordSet;
2598 
2599   NewAge = m_TotalGenuineMessages + m_TotalSpamMessages;
2600   if (NewAge >= 0xFFFFFFF0UL)
2601   {
2602     sprintf (ErrorMessage, "The database is full!  There are %lu messages in "
2603       "it and we can't add any more without overflowing the maximum integer "
2604       "representation in 32 bits", NewAge);
2605     return B_NO_MEMORY;
2606   }
2607 
2608   /* Check that this file hasn't already been added to the database. */
2609 
2610   PreviousClassification = CL_UNCERTAIN;
2611   BNodePntr = dynamic_cast<BNode *> (MessageIOPntr);
2612   if (BNodePntr != NULL) /* If this thing might have attributes. */
2613   {
2614     ErrorCode = BNodePntr->ReadAttr (g_AttributeNameClassification,
2615       B_STRING_TYPE, 0 /* offset */, ClassificationString,
2616       sizeof (ClassificationString) - 1);
2617     if (ErrorCode <= 0) /* Positive values for the number of bytes read */
2618       strcpy (ClassificationString, "none");
2619     else /* Just in case it needs a NUL at the end. */
2620       ClassificationString [ErrorCode] = 0;
2621 
2622     if (strcasecmp (ClassificationString, g_ClassifiedSpam) == 0)
2623       PreviousClassification = CL_SPAM;
2624     else if (strcasecmp (ClassificationString, g_ClassifiedGenuine) == 0)
2625       PreviousClassification = CL_GENUINE;
2626   }
2627 
2628   if (!m_IgnorePreviousClassification &&
2629   PreviousClassification != CL_UNCERTAIN)
2630   {
2631     if (IsSpamOrWhat == PreviousClassification)
2632     {
2633       sprintf (ErrorMessage, "Ignoring file \"%s\" since it seems to have "
2634         "already been classified as %s.", OptionalFileName,
2635         g_ClassificationTypeNames [IsSpamOrWhat]);
2636     }
2637     else
2638     {
2639       sprintf (ErrorMessage, "Changing existing classification of file \"%s\" "
2640         "from %s to %s.", OptionalFileName,
2641         g_ClassificationTypeNames [PreviousClassification],
2642         g_ClassificationTypeNames [IsSpamOrWhat]);
2643     }
2644     DisplayErrorMessage (ErrorMessage, 0, "Note");
2645   }
2646 
2647   if (!m_IgnorePreviousClassification &&
2648   IsSpamOrWhat == PreviousClassification)
2649     /* Nothing to do if it is already classified correctly and the user doesn't
2650     want double classification. */
2651     return B_OK;
2652 
2653   /* Get the list of unique words in the file. */
2654 
2655   ErrorCode = GetWordsFromPositionIO (MessageIOPntr, OptionalFileName,
2656     WordSet, ErrorMessage);
2657   if (ErrorCode != B_OK)
2658     return ErrorCode;
2659 
2660   /* Update the count of the number of messages processed, with corrections if
2661   reclassifying a message. */
2662 
2663   m_DatabaseHasChanged = true;
2664 
2665   if (!m_IgnorePreviousClassification &&
2666   PreviousClassification == CL_SPAM && m_TotalSpamMessages > 0)
2667     m_TotalSpamMessages--;
2668 
2669   if (IsSpamOrWhat == CL_SPAM)
2670     m_TotalSpamMessages++;
2671 
2672   if (!m_IgnorePreviousClassification &&
2673   PreviousClassification == CL_GENUINE && m_TotalGenuineMessages > 0)
2674       m_TotalGenuineMessages--;
2675 
2676   if (IsSpamOrWhat == CL_GENUINE)
2677     m_TotalGenuineMessages++;
2678 
2679   /* Mark the file's attributes with the new classification.  Don't care if it
2680   fails. */
2681 
2682   if (BNodePntr != NULL) /* If this thing might have attributes. */
2683   {
2684     ErrorCode = BNodePntr->RemoveAttr (g_AttributeNameClassification);
2685     if (IsSpamOrWhat != CL_UNCERTAIN)
2686     {
2687       strcpy (ClassificationString, g_ClassificationTypeNames [IsSpamOrWhat]);
2688       ErrorCode = BNodePntr->WriteAttr (g_AttributeNameClassification,
2689         B_STRING_TYPE, 0 /* offset */,
2690         ClassificationString, strlen (ClassificationString) + 1);
2691     }
2692   }
2693 
2694   /* Add the words to the database by incrementing or decrementing the counts
2695   for each word as appropriate. */
2696 
2697   WordEndIter = WordSet.end ();
2698   for (WordIter = WordSet.begin (); WordIter != WordEndIter; WordIter++)
2699   {
2700     if ((DataIter = m_WordMap.find (*WordIter)) == m_WordMap.end ())
2701     {
2702       /* No record in the database for the word. */
2703 
2704       if (IsSpamOrWhat == CL_UNCERTAIN)
2705         continue; /* Not adding words, don't have to subtract from nothing. */
2706 
2707       /* Create a new one record in the database for the new word. */
2708 
2709       memset (&NewStatistics, 0, sizeof (NewStatistics));
2710       InsertResult = m_WordMap.insert (
2711         StatisticsMap::value_type (*WordIter, NewStatistics));
2712       if (!InsertResult.second)
2713       {
2714         sprintf (ErrorMessage, "Failed to insert new database entry for "
2715           "word \"%s\", while processing file \"%s\"",
2716           WordIter->c_str (), OptionalFileName);
2717         return B_NO_MEMORY;
2718       }
2719       DataIter = InsertResult.first;
2720       m_WordCount++;
2721     }
2722 
2723     /* Got the database record for the word, update the statistics. */
2724 
2725     StatisticsPntr = &DataIter->second;
2726 
2727     StatisticsPntr->age = NewAge;
2728 
2729     /* Can't update m_OldestAge here, since it would take a lot of effort to
2730     find the next older age.  Since it's only used for display, we'll let it be
2731     slightly incorrect.  The next database load or purge will fix it. */
2732 
2733     if (IsSpamOrWhat == CL_SPAM)
2734       StatisticsPntr->spamCount++;
2735 
2736     if (IsSpamOrWhat == CL_GENUINE)
2737       StatisticsPntr->genuineCount++;
2738 
2739     if (!m_IgnorePreviousClassification &&
2740     PreviousClassification == CL_SPAM && StatisticsPntr->spamCount > 0)
2741       StatisticsPntr->spamCount--;
2742 
2743     if (!m_IgnorePreviousClassification &&
2744     PreviousClassification == CL_GENUINE && StatisticsPntr->genuineCount > 0)
2745       StatisticsPntr->genuineCount--;
2746   }
2747 
2748   return B_OK;
2749 }
2750 
2751 
2752 /* Add the text in the string to the database as an example of a spam or
2753 genuine message. */
2754 
2755 status_t ABSApp::AddStringToDatabase (
2756   ClassificationTypes IsSpamOrWhat,
2757   const char *String,
2758   char *ErrorMessage)
2759 {
2760   BMemoryIO MemoryIO (String, strlen (String));
2761 
2762   return AddPositionIOToDatabase (IsSpamOrWhat, &MemoryIO,
2763    "Memory Buffer" /* OptionalFileName */, ErrorMessage);
2764 }
2765 
2766 
2767 /* Given a bunch of text, find the words within it (doing special tricks to
2768 extract words from HTML), and add them to the set.  Allow NULs in the text.  If
2769 the PrefixCharacter isn't zero then it is prepended to all words found (so you
2770 can distinguish words as being from a header or from the body text).  See also
2771 TokenizeWhole which does something similar. */
2772 
2773 void ABSApp::AddWordsToSet (
2774   const char *InputString,
2775   size_t NumberOfBytes,
2776   char PrefixCharacter,
2777   set<string> &WordSet)
2778 {
2779   char   *BufferPntr;
2780   size_t  CurrentSize;
2781   int     PassNumber;
2782 
2783   /* Copy the input buffer.  The code will be modifying it in-place as HTML
2784   fragments and other junk are deleted. */
2785 
2786   BufferPntr = new char [NumberOfBytes];
2787   if (BufferPntr == NULL)
2788     return;
2789   memcpy (BufferPntr, InputString, NumberOfBytes);
2790 
2791   /* Do the tokenization.  Each pass does something to the text in the buffer,
2792   and may add words to the word set. */
2793 
2794   CurrentSize = NumberOfBytes;
2795   for (PassNumber = 1; PassNumber <= 8 && CurrentSize > 0 ; PassNumber++)
2796   {
2797     switch (PassNumber)
2798     {
2799       case 1: /* Lowercase first, rest of them assume lower case inputs. */
2800         CurrentSize = TokenizerPassLowerCase (BufferPntr, CurrentSize);
2801         break;
2802       case 2: CurrentSize = TokenizerPassJapanesePeriodsToSpaces (
2803         BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
2804       case 3: CurrentSize = TokenizerPassTruncateLongAsianWords (
2805         BufferPntr, CurrentSize); break;
2806       case 4: CurrentSize = TokenizerPassRemoveHTMLComments (
2807         BufferPntr, CurrentSize, 'Z', WordSet); break;
2808       case 5: CurrentSize = TokenizerPassRemoveHTMLStyle (
2809         BufferPntr, CurrentSize, 'Z', WordSet); break;
2810       case 6: CurrentSize = TokenizerPassExtractURLs (
2811         BufferPntr, CurrentSize, 'Z', WordSet); break;
2812       case 7: CurrentSize = TokenizerPassRemoveHTMLTags (
2813         BufferPntr, CurrentSize, 'Z', WordSet); break;
2814       case 8: CurrentSize = TokenizerPassGetPlainWords (
2815         BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
2816       default: break;
2817     }
2818   }
2819 
2820   delete [] BufferPntr;
2821 }
2822 
2823 
2824 /* The user has provided a command line.  This could actually be from a
2825 separate attempt to invoke the program (this application's resource/attributes
2826 have the launch flags set to "single launch", so the shell doesn't start the
2827 program but instead sends the arguments to the already running instance).  In
2828 either case, the command is sent to an intermediary thread where it is
2829 asynchronously converted into a scripting message(s) that are sent back to this
2830 BApplication.  The intermediary is needed since we can't recursively execute
2831 scripting messages while processing a message (this ArgsReceived one). */
2832 
2833 void ABSApp::ArgvReceived (int32 argc, char **argv)
2834 {
2835   if (g_CommanderLooperPntr != NULL)
2836     g_CommanderLooperPntr->CommandArguments (argc, argv);
2837 }
2838 
2839 
2840 /* Create a new empty database.  Note that we have to write out the new file
2841 immediately, otherwise other operations will see the empty database and then
2842 try to load the file, and complain that it doesn't exist.  Now they will see
2843 the empty database and redundantly load the empty file. */
2844 
2845 status_t ABSApp::CreateDatabaseFile (char *ErrorMessage)
2846 {
2847   MakeDatabaseEmpty ();
2848   m_DatabaseHasChanged = true;
2849   return SaveDatabaseIfNeeded (ErrorMessage); /* Make it now. */
2850 }
2851 
2852 
2853 /* Set the settings to the defaults.  Needed in case there isn't a settings
2854 file or it is obsolete. */
2855 
2856 void ABSApp::DefaultSettings ()
2857 {
2858   status_t ErrorCode;
2859   BPath    DatabasePath (m_SettingsDirectoryPath);
2860   char     TempString [PATH_MAX];
2861 
2862   /* The default database file is in the settings directory. */
2863 
2864   ErrorCode = DatabasePath.Append (g_DefaultDatabaseFileName);
2865   if (ErrorCode != B_OK)
2866     strcpy (TempString, g_DefaultDatabaseFileName); /* Unlikely to happen. */
2867   else
2868     strcpy (TempString, DatabasePath.Path ());
2869   m_DatabaseFileName.SetTo (TempString);
2870 
2871   // Users need to be allowed to undo their mistakes...
2872   m_IgnorePreviousClassification = true;
2873   g_ServerMode = true;
2874   m_PurgeAge = 2000;
2875   m_PurgePopularity = 2;
2876   m_ScoringMode = SM_CHISQUARED;
2877   m_TokenizeMode = TM_ANY_TEXT_HEADER;
2878 
2879   m_SettingsHaveChanged = true;
2880 }
2881 
2882 
2883 /* Deletes the database file, and the backup file, and clears the database but
2884 marks it as not changed so that it doesn't get written out when the program
2885 exits. */
2886 
2887 status_t ABSApp::DeleteDatabaseFile (char *ErrorMessage)
2888 {
2889   BEntry   FileEntry;
2890   status_t ErrorCode;
2891   int      i;
2892   char     TempString [PATH_MAX+20];
2893 
2894   /* Clear the in-memory database. */
2895 
2896   MakeDatabaseEmpty ();
2897   m_DatabaseHasChanged = false;
2898 
2899   /* Delete the backup files first.  Don't care if it fails. */
2900 
2901   for (i = 0; i < g_MaxBackups; i++)
2902   {
2903     strcpy (TempString, m_DatabaseFileName.String ());
2904     sprintf (TempString + strlen (TempString), g_BackupSuffix, i);
2905     ErrorCode = FileEntry.SetTo (TempString);
2906     if (ErrorCode == B_OK)
2907       FileEntry.Remove ();
2908   }
2909 
2910   /* Delete the main database file. */
2911 
2912   strcpy (TempString, m_DatabaseFileName.String ());
2913   ErrorCode = FileEntry.SetTo (TempString);
2914   if (ErrorCode != B_OK)
2915   {
2916     sprintf (ErrorMessage, "While deleting, failed to make BEntry for "
2917       "\"%s\" (does the directory exist?)", TempString);
2918     return ErrorCode;
2919   }
2920 
2921   ErrorCode = FileEntry.Remove ();
2922   if (ErrorCode != B_OK)
2923     sprintf (ErrorMessage, "While deleting, failed to remove file "
2924       "\"%s\"", TempString);
2925 
2926   return ErrorCode;
2927 }
2928 
2929 
2930 /* Evaluate the given file as being a spam message, and tag it with the
2931 resulting spam probability ratio.  If it also has an e-mail subject attribute,
2932 remove the [Spam 99.9%] prefix since the number usually changes. */
2933 
2934 status_t ABSApp::EvaluateFile (
2935   const char *PathName,
2936   BMessage *ReplyMessagePntr,
2937   char *ErrorMessage)
2938 {
2939   status_t ErrorCode;
2940   float    TempFloat;
2941   BFile    TextFile;
2942 
2943   /* Open the specified file. */
2944 
2945   ErrorCode = TextFile.SetTo (PathName, B_READ_ONLY);
2946   if (ErrorCode != B_OK)
2947   {
2948     sprintf (ErrorMessage, "Problems opening file \"%s\" for evaluating",
2949       PathName);
2950     return ErrorCode;
2951   }
2952 
2953   ErrorCode =
2954     EvaluatePositionIO (&TextFile, PathName, ReplyMessagePntr, ErrorMessage);
2955 
2956   if (ErrorCode == B_OK &&
2957   ReplyMessagePntr->FindFloat (g_ResultName, &TempFloat) == B_OK)
2958   {
2959     TextFile.WriteAttr (g_AttributeNameSpamRatio, B_FLOAT_TYPE,
2960       0 /* offset */, &TempFloat, sizeof (TempFloat));
2961     /* Don't know the spam cutoff ratio, that's in the e-mail filter, so just
2962     blindly remove the prefix, which would have the wrong percentage. */
2963     RemoveSpamPrefixFromSubjectAttribute (&TextFile);
2964   }
2965 
2966   return ErrorCode;
2967 }
2968 
2969 
2970 /* Evaluate a given file or memory buffer (a BPositionIO handles both cases)
2971 for spaminess.  The output is added to the ReplyMessagePntr message, with the
2972 probability ratio stored in "result" (0.0 means genuine and 1.0 means spam).
2973 It also adds the most significant words (used in the ratio calculation) to the
2974 array "words" and the associated per-word probability ratios in "ratios".  If
2975 it fails, an error code is returned and an error message written to the
2976 ErrorMessage string (which is at least PATH_MAX + 1024 bytes long).
2977 OptionalFileName is only used in the error message.
2978 
2979 The math used for combining the individual word probabilities in my method is
2980 based on Gary Robinson's method (formerly it was a variation of Paul Graham's
2981 method) or the Chi-Squared method.  Its input is the database of words that
2982 has a count of the number of spam and number of genuine messages each word
2983 appears in (doesn't matter if it appears more than once in a message, it still
2984 counts as 1).
2985 
2986 The spam word count is divided by the total number of spam e-mail messages
2987 in the database to get the probability of spam and probability of genuineness
2988 is similarly computed for a particular word.  The spam probability is divided
2989 by the sum of the spam and genuine probabilities to get the Raw Spam Ratio for
2990 the word.  It's nearer to 0.0 for genuine and nearer to 1.0 for spam, and can
2991 be exactly zero or one too.
2992 
2993 To avoid multiplying later results by zero, and to compensate for a lack of
2994 data points, the Raw Spam Ratio is adjusted towards the 0.5 halfway point.  The
2995 0.5 is combined with the raw spam ratio, with a weight of 0.45 (determined to
2996 be a good value by the "spambayes" mailing list tests) messages applied to the
2997 half way point and a weight of the number of spam + genuine messages applied to
2998 the raw spam ratio.  This gives you the compensated spam ratio for the word.
2999 
3000 The top N (150 was good in the spambayes tests) extreme words are selected by
3001 the distance of each word's compensated spam ratio from 0.5.  Then the ratios
3002 of the words are combined.
3003 
3004 The Gary Robinson combining (scoring) method gets one value from the Nth root
3005 of the product of all the word ratios.  The other is the Nth root of the
3006 product of (1 - ratio) for all the words.  The final result is the first value
3007 divided by the sum of the two values.  The Nth root helps spread the resulting
3008 range of values more evenly between 0.0 and 1.0, otherwise the values all clump
3009 together at 0 or 1.  Also you can think of the Nth root as a kind of average
3010 for products; it's like a generic word probability which when multiplied by
3011 itself N times gives you the same result as the N separate actual word
3012 probabilities multiplied together.
3013 
3014 The Chi-Squared combining (scoring) method assumes that the spam word
3015 probabilities are uniformly distributed and computes an error measurement
3016 (called chi squared - see http://bmj.com/collections/statsbk/8.shtml for a good
3017 tutorial) and then sees how likely that error value would be observed in
3018 practice.  If it's rare to observe, then the words are likely not just randomly
3019 occurring and it's spammy.  The same is done for genuine words.  The two
3020 resulting unlikelynesses are compared to see which is more unlikely, if neither
3021 is, then the method says it can't decide.  The SpamBayes notes (see the
3022 classifier.py file in CVS in http://sourceforge.net/projects/spambayes) say:
3023 
3024 "Across vectors of length n, containing random uniformly-distributed
3025 probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution with 2*n
3026 degrees of freedom.  This has been proven (in some appropriate sense) to be the
3027 most sensitive possible test for rejecting the hypothesis that a vector of
3028 probabilities is uniformly distributed.  Gary Robinson's original scheme was
3029 monotonic *with* this test, but skipped the details.  Turns out that getting
3030 closer to the theoretical roots gives a much sharper classification, with a
3031 very small (in # of msgs), but also very broad (in range of scores), "middle
3032 ground", where most of the mistakes live.  In particular, this scheme seems
3033 immune to all forms of "cancellation disease": if there are many strong ham
3034 *and* spam clues, this reliably scores close to 0.5.  Most other schemes are
3035 extremely certain then -- and often wrong."
3036 
3037 I did a test with 448 example genuine messages including personal mail (some
3038 with HTML attachments) and mailing lists, and 267 spam messages for 27471 words
3039 total.  Test messages were more recent messages in the same groups.  Out of 100
3040 test genuine messages, with Gary Robinson (0.56 cutoff limit), 1 (1%) was
3041 falsely identified as spam and 8 of 73 (11%) spam messages were incorrectly
3042 classified as genuine.  With my variation of Paul Graham's scheme (0.90 cutoff)
3043 I got 6 of 100 (6%) genuine messages incorrectly marked as spam and 2 of 73
3044 (3%) spam messages were incorrectly classified as genuine.  Pretty close, but
3045 Robinson's values are more evenly spread out so you can tell just how spammy it
3046 is by looking at the number. */
3047 
/* One candidate word for the spam calculation: a pointer to the word text
(borrowed, this structure never frees it) together with the word's compensated
spam ratio.  The structure also serves as its own ordering functor: item A is
"less than" item B when A's ratio lies closer to the neutral 0.5 point.  Used
as the comparison type of a priority_queue, that ordering makes the words
furthest from 0.5 (the most telling ones) come out first. */

struct WordAndRatioStruct
{
  double        probabilityRatio; /* Actually the compensated ratio. */
  const string *wordPntr;         /* The word's text, owned elsewhere. */

  bool operator() (
    const WordAndRatioStruct &Left,
    const WordAndRatioStruct &Right) const
  {
    /* Compare distances from the 0.5 midpoint; Left sorts first when Right
    is strictly further away. */

    return
      (fabs (Right.probabilityRatio - 0.5) >
      fabs (Left.probabilityRatio - 0.5));
  }
};
3062 
3063 status_t ABSApp::EvaluatePositionIO (
3064   BPositionIO *PositionIOPntr,
3065   const char *OptionalFileName,
3066   BMessage *ReplyMessagePntr,
3067   char *ErrorMessage)
3068 {
3069   StatisticsMap::iterator            DataEndIter;
3070   StatisticsMap::iterator            DataIter;
3071   status_t                           ErrorCode;
3072   double                             GenuineProbability;
3073   uint32                             GenuineSpamSum;
3074   int                                i;
3075   priority_queue<
3076     WordAndRatioStruct /* Data type stored in the queue */,
3077     vector<WordAndRatioStruct> /* Underlying container */,
3078     WordAndRatioStruct /* Function for comparing elements */>
3079                                      PriorityQueue;
3080   double                             ProductGenuine;
3081   double                             ProductLogGenuine;
3082   double                             ProductLogSpam;
3083   double                             ProductSpam;
3084   double                             RawProbabilityRatio;
3085   float                              ResultRatio;
3086   double                             SpamProbability;
3087   StatisticsPointer                  StatisticsPntr;
3088   double                             TempDouble;
3089   double                             TotalGenuine;
3090   double                             TotalSpam;
3091   WordAndRatioStruct                 WordAndRatio;
3092   set<string>::iterator              WordEndIter;
3093   set<string>::iterator              WordIter;
3094   const WordAndRatioStruct          *WordRatioPntr;
3095   set<string>                        WordSet;
3096 
3097   /* Get the list of unique words in the file / memory buffer. */
3098 
3099   ErrorCode = GetWordsFromPositionIO (PositionIOPntr, OptionalFileName,
3100     WordSet, ErrorMessage);
3101   if (ErrorCode != B_OK)
3102     return ErrorCode;
3103 
3104   /* Prepare a few variables.  Mostly these are stored double values of some of
3105   the numbers involved (to avoid the overhead of multiple conversions from
3106   integer to double), with extra precautions to avoid divide by zero. */
3107 
3108   if (m_TotalGenuineMessages <= 0)
3109     TotalGenuine = 1.0;
3110   else
3111     TotalGenuine = m_TotalGenuineMessages;
3112 
3113   if (m_TotalSpamMessages <= 0)
3114     TotalSpam = 1.0;
3115   else
3116     TotalSpam = m_TotalSpamMessages;
3117 
3118   /* Look up the words in the database and calculate their compensated spam
3119   ratio.  The results are stored in a priority queue so that we can later find
3120   the top g_MaxInterestingWords for doing the actual determination. */
3121 
3122   WordEndIter = WordSet.end ();
3123   DataEndIter = m_WordMap.end ();
3124   for (WordIter = WordSet.begin (); WordIter != WordEndIter; WordIter++)
3125   {
3126     WordAndRatio.wordPntr = &(*WordIter);
3127 
3128     if ((DataIter = m_WordMap.find (*WordIter)) != DataEndIter)
3129     {
3130       StatisticsPntr = &DataIter->second;
3131 
3132       /* Calculate the probability the word is spam and the probability it is
3133       genuine.  Then the raw probability ratio. */
3134 
3135       SpamProbability = StatisticsPntr->spamCount / TotalSpam;
3136       GenuineProbability = StatisticsPntr->genuineCount / TotalGenuine;
3137 
3138       if (SpamProbability + GenuineProbability > 0)
3139         RawProbabilityRatio =
3140         SpamProbability / (SpamProbability + GenuineProbability);
3141       else /* Word with zero statistics, perhaps due to reclassification. */
3142         RawProbabilityRatio = 0.5;
3143 
3144       /* The compensated ratio leans towards 0.5 (g_RobinsonX) more for fewer
3145       data points, with a weight of 0.45 (g_RobinsonS). */
3146 
3147       GenuineSpamSum =
3148         StatisticsPntr->spamCount + StatisticsPntr->genuineCount;
3149 
3150       WordAndRatio.probabilityRatio =
3151         (g_RobinsonS * g_RobinsonX + GenuineSpamSum * RawProbabilityRatio) /
3152         (g_RobinsonS + GenuineSpamSum);
3153     }
3154     else /* Unknown word. With N=0, compensated ratio equation is RobinsonX. */
3155       WordAndRatio.probabilityRatio = g_RobinsonX;
3156 
3157      PriorityQueue.push (WordAndRatio);
3158   }
3159 
3160   /* Compute the combined probability (multiply them together) of the top few
3161   words.  To avoid numeric underflow (doubles can only get as small as 1E-300),
3162   logarithms are also used.  But avoid the logarithms (sum of logs of numbers
3163   is the same as the product of numbers) as much as possible due to reduced
3164   accuracy and slowness. */
3165 
3166   ProductGenuine = 1.0;
3167   ProductLogGenuine = 0.0;
3168   ProductSpam = 1.0;
3169   ProductLogSpam = 0.0;
3170   for (i = 0;
3171   i < g_MaxInterestingWords && !PriorityQueue.empty();
3172   i++, PriorityQueue.pop())
3173   {
3174     WordRatioPntr = &PriorityQueue.top();
3175     ProductSpam *= WordRatioPntr->probabilityRatio;
3176     ProductGenuine *= 1.0 - WordRatioPntr->probabilityRatio;
3177 
3178     /* Check for the numbers getting dangerously small, close to underflowing.
3179     If they are, move the value into the logarithm storage part. */
3180 
3181     if (ProductSpam < m_SmallestUseableDouble)
3182     {
3183       ProductLogSpam += log (ProductSpam);
3184       ProductSpam = 1.0;
3185     }
3186 
3187     if (ProductGenuine < m_SmallestUseableDouble)
3188     {
3189       ProductLogGenuine += log (ProductGenuine);
3190       ProductGenuine = 1.0;
3191     }
3192 
3193     ReplyMessagePntr->AddString ("words", WordRatioPntr->wordPntr->c_str ());
3194     ReplyMessagePntr->AddFloat ("ratios", WordRatioPntr->probabilityRatio);
3195   }
3196 
3197   /* Get the resulting log of the complete products. */
3198 
3199   if (i > 0)
3200   {
3201     ProductLogSpam += log (ProductSpam);
3202     ProductLogGenuine += log (ProductGenuine);
3203   }
3204 
3205   if (m_ScoringMode == SM_ROBINSON)
3206   {
3207     /* Apply Gary Robinson's scoring method where we take the Nth root of the
3208     products.  This is easiest in logarithm form. */
3209 
3210     if (i > 0)
3211     {
3212       ProductSpam = exp (ProductLogSpam / i);
3213       ProductGenuine = exp (ProductLogGenuine / i);
3214       ResultRatio = ProductSpam / (ProductGenuine + ProductSpam);
3215     }
3216     else /* Somehow got no words! */
3217       ResultRatio = g_RobinsonX;
3218   }
3219   else if (m_ScoringMode == SM_CHISQUARED)
3220   {
3221     /* From the SpamBayes notes: "We compute two chi-squared statistics, one
3222     for ham and one for spam.  The sum-of-the-logs business is more sensitive
3223     to probs near 0 than to probs near 1, so the spam measure uses 1-p (so that
3224     high-spamprob words have greatest effect), and the ham measure uses p
3225     directly (so that lo-spamprob words have greatest effect)."  That means we
3226     just reversed the meaning of the previously calculated spam and genuine
3227     products!  Oh well. */
3228 
3229     TempDouble = ProductLogSpam;
3230     ProductLogSpam = ProductLogGenuine;
3231     ProductLogGenuine = TempDouble;
3232 
3233     if (i > 0)
3234     {
3235       ProductSpam =
3236         1.0 - ChiSquaredProbability (-2.0 * ProductLogSpam, 2 * i);
3237       ProductGenuine =
3238         1.0 - ChiSquaredProbability (-2.0 * ProductLogGenuine, 2 * i);
3239 
3240       /* The SpamBayes notes say: "How to combine these into a single spam
3241       score?  We originally used (S-H)/(S+H) scaled into [0., 1.], which equals
3242       S/(S+H).  A systematic problem is that we could end up being near-certain
3243       a thing was (for example) spam, even if S was small, provided that H was
3244       much smaller.  Rob Hooft stared at these problems and invented the
3245       measure we use now, the simpler S-H, scaled into [0., 1.]." */
3246 
3247       ResultRatio = (ProductSpam - ProductGenuine + 1.0) / 2.0;
3248     }
3249     else /* No words to analyse. */
3250       ResultRatio = 0.5;
3251   }
3252   else /* Unknown scoring mode. */
3253   {
3254     strcpy (ErrorMessage, "Unknown scoring mode specified in settings");
3255     return B_BAD_VALUE;
3256   }
3257 
3258   ReplyMessagePntr->AddFloat (g_ResultName, ResultRatio);
3259   return B_OK;
3260 }
3261 
3262 
3263 /* Just evaluate the given string as being spam text. */
3264 
3265 status_t ABSApp::EvaluateString (
3266   const char *BufferPntr,
3267   ssize_t BufferSize,
3268   BMessage *ReplyMessagePntr,
3269   char *ErrorMessage)
3270 {
3271   BMemoryIO MemoryIO (BufferPntr, BufferSize);
3272 
3273   return EvaluatePositionIO (&MemoryIO, "Memory Buffer",
3274     ReplyMessagePntr, ErrorMessage);
3275 }
3276 
3277 
3278 /* Tell other programs about the scripting commands we support.  Try this
3279 command: "hey application/x-vnd.agmsmith.spamdbm getsuites" to
3280 see it in action (this program has to be already running for it to work). */
3281 
3282 status_t ABSApp::GetSupportedSuites (BMessage *MessagePntr)
3283 {
3284   BPropertyInfo TempPropInfo (g_ScriptingPropertyList);
3285 
3286   MessagePntr->AddString ("suites", "suite/x-vnd.agmsmith.spamdbm");
3287   MessagePntr->AddFlat ("messages", &TempPropInfo);
3288   return BApplication::GetSupportedSuites (MessagePntr);
3289 }
3290 
3291 
3292 /* Add all the words in the given file or memory buffer to the supplied set.
3293 The file name is only there for error messages, it assumes you have already
3294 opened the PositionIO to the right file.  If things go wrong, a non-zero error
3295 code will be returned and an explanation written to ErrorMessage (assumed to be
3296 at least PATH_MAX + 1024 bytes long). */
3297 
3298 status_t ABSApp::GetWordsFromPositionIO (
3299   BPositionIO *PositionIOPntr,
3300   const char *OptionalFileName,
3301   set<string> &WordSet,
3302   char *ErrorMessage)
3303 {
3304   status_t ErrorCode;
3305 
3306   if (m_TokenizeMode == TM_WHOLE)
3307     ErrorCode = TokenizeWhole (PositionIOPntr, OptionalFileName,
3308       WordSet, ErrorMessage);
3309   else
3310     ErrorCode = TokenizeParts (PositionIOPntr, OptionalFileName,
3311       WordSet, ErrorMessage);
3312 
3313   if (ErrorCode == B_OK && WordSet.empty ())
3314   {
3315     /* ENOMSG usually means no message found in queue, but I'm using it to show
3316     no words, a good indicator of spam which is pure HTML. */
3317 
3318     sprintf (ErrorMessage, "No words were found in \"%s\"", OptionalFileName);
3319     ErrorCode = ENOMSG;
3320   }
3321 
3322   return ErrorCode;
3323 }
3324 
3325 
3326 /* Set up indices for attributes MAIL:classification (string) and
3327 MAIL:ratio_spam (float) on all mounted disk volumes that support queries.  Also
3328 tell the system to make those attributes visible to the user (so they can see
3329 them in Tracker) and associate them with e-mail messages.  Also set up the
3330 database file MIME type (provide a description and associate it with this
3331 program so that it picks up the right icon).  And register the names for our
3332 sound effects. */
3333 
3334 status_t ABSApp::InstallThings (char *ErrorMessage)
3335 {
3336   int32       Cookie;
3337   dev_t       DeviceID;
3338   status_t    ErrorCode = B_OK;
3339   fs_info     FSInfo;
3340   int32       i;
3341   int32       iClassification;
3342   int32       iProbability;
3343   int32       j;
3344   index_info  IndexInfo;
3345   BMimeType   MimeType;
3346   BMessage    Parameters;
3347   const char *StringPntr;
3348   bool        TempBool;
3349   int32       TempInt32;
3350 
3351   /* Iterate through all mounted devices and try to make the indices on each
3352   one.  Don't bother if the index exists or the device doesn't support indices
3353   (actually queries). */
3354 
3355   Cookie = 0;
3356   while ((DeviceID = next_dev (&Cookie)) >= 0)
3357   {
3358     if (!fs_stat_dev (DeviceID, &FSInfo) && (FSInfo.flags & B_FS_HAS_QUERY))
3359     {
3360       if (fs_stat_index (DeviceID, g_AttributeNameClassification, &IndexInfo)
3361       && errno == B_ENTRY_NOT_FOUND)
3362       {
3363         if (fs_create_index (DeviceID, g_AttributeNameClassification,
3364         B_STRING_TYPE, 0 /* flags */))
3365         {
3366           ErrorCode = errno;
3367           sprintf (ErrorMessage, "Unable to make string index %s on "
3368             "volume #%d, volume name \"%s\", file system type \"%s\", "
3369             "on device \"%s\"", g_AttributeNameClassification,
3370             (int) DeviceID, FSInfo.volume_name, FSInfo.fsh_name,
3371             FSInfo.device_name);
3372         }
3373       }
3374 
3375       if (fs_stat_index (DeviceID, g_AttributeNameSpamRatio,
3376       &IndexInfo) && errno == B_ENTRY_NOT_FOUND)
3377       {
3378         if (fs_create_index (DeviceID, g_AttributeNameSpamRatio,
3379         B_FLOAT_TYPE, 0 /* flags */))
3380         {
3381           ErrorCode = errno;
3382           sprintf (ErrorMessage, "Unable to make float index %s on "
3383             "volume #%d, volume name \"%s\", file system type \"%s\", "
3384             "on device \"%s\"", g_AttributeNameSpamRatio,
3385             (int) DeviceID, FSInfo.volume_name, FSInfo.fsh_name,
3386             FSInfo.device_name);
3387         }
3388       }
3389     }
3390   }
3391   if (ErrorCode != B_OK)
3392     return ErrorCode;
3393 
3394   /* Set up the MIME types for the classification attributes, associate them
3395   with e-mail and make them visible to the user (but not editable).  First need
3396   to get the existing MIME settings, then add ours to them (otherwise the
3397   existing ones get wiped out). */
3398 
3399   ErrorCode = MimeType.SetTo ("text/x-email");
3400   if (ErrorCode != B_OK || !MimeType.IsInstalled ())
3401   {
3402     sprintf (ErrorMessage, "No e-mail MIME type (%s) in the system, can't "
3403       "update it to add our special attributes, and without e-mail this "
3404       "program is useless!", MimeType.Type ());
3405     if (ErrorCode == B_OK)
3406       ErrorCode = -1;
3407     return ErrorCode;
3408   }
3409 
3410   ErrorCode = MimeType.GetAttrInfo (&Parameters);
3411   if (ErrorCode != B_OK)
3412   {
3413     sprintf (ErrorMessage, "Unable to retrieve list of attributes "
3414       "associated with e-mail messages in the MIME database");
3415     return ErrorCode;
3416   }
3417 
3418   for (i = 0, iClassification = -1, iProbability = -1;
3419   i < 1000 && (iClassification < 0 || iProbability < 0);
3420   i++)
3421   {
3422     ErrorCode = Parameters.FindString ("attr:name", i, &StringPntr);
3423     if (ErrorCode != B_OK)
3424       break; /* Reached the end of the attributes. */
3425     if (strcmp (StringPntr, g_AttributeNameClassification) == 0)
3426       iClassification = i;
3427     else if (strcmp (StringPntr, g_AttributeNameSpamRatio) == 0)
3428       iProbability = i;
3429   }
3430 
3431   /* Add extra default settings for those programs which previously didn't
3432   update the MIME database with all the attributes that exist (so our new
3433   additions don't show up at the wrong index). */
3434 
3435   i--; /* Set i to index of last valid attribute. */
3436 
3437   for (j = 0; j <= i; j++)
3438   {
3439     if (Parameters.FindString ("attr:public_name", j, &StringPntr) ==
3440     B_BAD_INDEX)
3441     {
3442       if (Parameters.FindString ("attr:name", j, &StringPntr) != B_OK)
3443         StringPntr = "None!";
3444       Parameters.AddString ("attr:public_name", StringPntr);
3445     }
3446   }
3447 
3448   while (Parameters.FindInt32 ("attr:type", i, &TempInt32) == B_BAD_INDEX)
3449     Parameters.AddInt32 ("attr:type", B_STRING_TYPE);
3450 
3451   while (Parameters.FindBool ("attr:viewable", i, &TempBool) == B_BAD_INDEX)
3452     Parameters.AddBool ("attr:viewable", true);
3453 
3454   while (Parameters.FindBool ("attr:editable", i, &TempBool) == B_BAD_INDEX)
3455     Parameters.AddBool ("attr:editable", false);
3456 
3457   while (Parameters.FindInt32 ("attr:width", i, &TempInt32) == B_BAD_INDEX)
3458     Parameters.AddInt32 ("attr:width", 60);
3459 
3460   while (Parameters.FindInt32 ("attr:alignment", i, &TempInt32) == B_BAD_INDEX)
3461     Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3462 
3463   while (Parameters.FindBool ("attr:extra", i, &TempBool) == B_BAD_INDEX)
3464     Parameters.AddBool ("attr:extra", false);
3465 
3466   /* Add our new attributes to e-mail related things, if not already there. */
3467 
3468   if (iClassification < 0)
3469   {
3470     Parameters.AddString ("attr:name", g_AttributeNameClassification);
3471     Parameters.AddString ("attr:public_name", "Classification Group");
3472     Parameters.AddInt32 ("attr:type", B_STRING_TYPE);
3473     Parameters.AddBool ("attr:viewable", true);
3474     Parameters.AddBool ("attr:editable", false);
3475     Parameters.AddInt32 ("attr:width", 45);
3476     Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3477     Parameters.AddBool ("attr:extra", false);
3478   }
3479 
3480   if (iProbability < 0)
3481   {
3482     Parameters.AddString ("attr:name", g_AttributeNameSpamRatio);
3483     Parameters.AddString ("attr:public_name", "Spam/Genuine Estimate");
3484     Parameters.AddInt32 ("attr:type", B_FLOAT_TYPE);
3485     Parameters.AddBool ("attr:viewable", true);
3486     Parameters.AddBool ("attr:editable", false);
3487     Parameters.AddInt32 ("attr:width", 50);
3488     Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3489     Parameters.AddBool ("attr:extra", false);
3490   }
3491 
3492   if (iClassification < 0 || iProbability < 0)
3493   {
3494     ErrorCode = MimeType.SetAttrInfo (&Parameters);
3495     if (ErrorCode != B_OK)
3496     {
3497       sprintf (ErrorMessage, "Unable to associate the classification "
3498         "attributes with e-mail messages in the MIME database");
3499       return ErrorCode;
3500     }
3501   }
3502 
3503   /* Set up the MIME type for the database file. */
3504 
3505   sprintf (ErrorMessage, "Problems with setting up MIME type (%s) for "
3506     "the database files", g_ABSDatabaseFileMIMEType); /* A generic message. */
3507 
3508   ErrorCode = MimeType.SetTo (g_ABSDatabaseFileMIMEType);
3509   if (ErrorCode != B_OK)
3510     return ErrorCode;
3511 
3512   MimeType.Delete ();
3513   ErrorCode = MimeType.Install ();
3514   if (ErrorCode != B_OK)
3515   {
3516     sprintf (ErrorMessage, "Failed to install MIME type (%s) in the system",
3517       MimeType.Type ());
3518     return ErrorCode;
3519   }
3520 
3521   MimeType.SetShortDescription ("Spam Database");
3522   MimeType.SetLongDescription ("Bayesian Statistical Database for "
3523     "Classifying Junk E-Mail");
3524   sprintf (ErrorMessage, "1.0 ('%s')", g_DatabaseRecognitionString);
3525   MimeType.SetSnifferRule (ErrorMessage);
3526   MimeType.SetPreferredApp (g_ABSAppSignature);
3527 
3528   /* Set up the names of the sound effects.  Later on the user can associate
3529   sound files with the names by using the Sounds preferences panel or the
3530   installsound command.  The MDR add-on filter will trigger these sounds. */
3531 
3532   add_system_beep_event (g_BeepGenuine);
3533   add_system_beep_event (g_BeepSpam);
3534   add_system_beep_event (g_BeepUncertain);
3535 
3536   return B_OK;
3537 }
3538 
3539 
3540 /* Load the database if it hasn't been loaded yet.  Otherwise do nothing. */
3541 
3542 status_t ABSApp::LoadDatabaseIfNeeded (char *ErrorMessage)
3543 {
3544   if (m_WordMap.empty ())
3545     return LoadSaveDatabase (true /* DoLoad */, ErrorMessage);
3546 
3547   return B_OK;
3548 }
3549 
3550 
3551 /* Either load the database of spam words (DoLoad is TRUE) from the file
3552 specified in the settings, or write (DoLoad is FALSE) the database to it.  If
3553 it doesn't exist (and its parent directories do exist) then it will be created
3554 when saving.  If it doesn't exist when loading, the in-memory database will be
3555 set to an empty one and an error will be returned with an explanation put into
3556 ErrorMessage (should be big enough for a path name and a couple of lines of
3557 text).
3558 
3559 The database file format is a UTF-8 text file (well, there could be some
3560 latin-1 characters and other junk in there - it just copies the bytes from the
3561 e-mail messages directly), with tab characters to separate fields (so that you
3562 can also load it into a spreadsheet).  The first line identifies the overall
3563 file type.  The second lists pairs of classifications plus the number of
3564 messages in each class.  Currently it is just Genuine and Spam, but for future
3565 compatibility, that could be followed by more classification pairs.  The
3566 remaining lines each contain a word, the date it was last updated (actually
3567 it's the number of messages in the database when the word was added, smaller
3568 numbers mean it was updated longer ago), the genuine count and the spam count.
3569 */
3570 
3571 status_t ABSApp::LoadSaveDatabase (bool DoLoad, char *ErrorMessage)
3572 {
3573   time_t                             CurrentTime;
3574   FILE                              *DatabaseFile = NULL;
3575   BNode                              DatabaseNode;
3576   BNodeInfo                          DatabaseNodeInfo;
3577   StatisticsMap::iterator            DataIter;
3578   StatisticsMap::iterator            EndIter;
3579   status_t                           ErrorCode;
3580   int                                i;
3581   pair<StatisticsMap::iterator,bool> InsertResult;
3582   char                               LineString [10240];
3583   StatisticsRecord                   Statistics;
3584   const char                        *StringPntr;
3585   char                              *TabPntr;
3586   const char                        *WordPntr;
3587 
3588   if (DoLoad)
3589   {
3590     MakeDatabaseEmpty ();
3591     m_DatabaseHasChanged = false; /* In case of early error exit. */
3592   }
3593   else /* Saving the database, backup the old version on disk. */
3594   {
3595     ErrorCode = MakeBackup (ErrorMessage);
3596     if (ErrorCode != B_OK) /* Usually because the directory isn't there. */
3597       return ErrorCode;
3598   }
3599 
3600   DatabaseFile = fopen (m_DatabaseFileName.String (), DoLoad ? "rb" : "wb");
3601   if (DatabaseFile == NULL)
3602   {
3603     ErrorCode = errno;
3604     sprintf (ErrorMessage, "Can't open database file \"%s\" for %s",
3605       m_DatabaseFileName.String (), DoLoad ? "reading" : "writing");
3606     goto ErrorExit;
3607   }
3608 
3609   /* Process the first line, which identifies the file. */
3610 
3611   if (DoLoad)
3612   {
3613     sprintf (ErrorMessage, "Can't read first line of database file \"%s\", "
3614       "expected it to start with \"%s\"",
3615       m_DatabaseFileName.String (), g_DatabaseRecognitionString);
3616     ErrorCode = -1;
3617 
3618     if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3619       goto ErrorExit;
3620     if (strncmp (LineString, g_DatabaseRecognitionString,
3621     strlen (g_DatabaseRecognitionString)) != 0)
3622       goto ErrorExit;
3623   }
3624   else /* Saving */
3625   {
3626     CurrentTime = time (NULL);
3627     if (fprintf (DatabaseFile, "%s V1 (word, age, genuine count, spam count)\t"
3628     "Written by SpamDBM $Revision: 30630 $\t"
3629     "Compiled on " __DATE__ " at " __TIME__ "\tThis file saved on %s",
3630     g_DatabaseRecognitionString, ctime (&CurrentTime)) <= 0)
3631     {
3632       ErrorCode = errno;
3633       sprintf (ErrorMessage, "Problems when writing to database file \"%s\"",
3634         m_DatabaseFileName.String ());
3635       goto ErrorExit;
3636     }
3637   }
3638 
3639   /* The second line lists the different classifications.  We just check to see
3640   that the first two are Genuine and Spam.  If there are others, they'll be
3641   ignored and lost when the database is saved. */
3642 
3643   if (DoLoad)
3644   {
3645     sprintf (ErrorMessage, "Can't read second line of database file \"%s\", "
3646       "expected it to list classifications %s and %s along with their totals",
3647       m_DatabaseFileName.String (), g_ClassifiedGenuine, g_ClassifiedSpam);
3648     ErrorCode = B_BAD_VALUE;
3649 
3650     if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3651       goto ErrorExit;
3652     i = strlen (LineString);
3653     if (i > 0 && LineString[i-1] == '\n')
3654       LineString[i-1] = 0; /* Remove trailing line feed character. */
3655 
3656     /* Look for the title word at the start of the line. */
3657 
3658     TabPntr = LineString;
3659     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3660       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3661 
3662     if (strncmp (StringPntr, "Classifications", 15) != 0)
3663       goto ErrorExit;
3664 
3665     /* Look for the Genuine class and count. */
3666 
3667     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3668       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3669 
3670     if (strcmp (StringPntr, g_ClassifiedGenuine) != 0)
3671       goto ErrorExit;
3672 
3673     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3674       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3675 
3676     m_TotalGenuineMessages = atoll (StringPntr);
3677 
3678     /* Look for the Spam class and count. */
3679 
3680     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3681       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3682 
3683     if (strcmp (StringPntr, g_ClassifiedSpam) != 0)
3684       goto ErrorExit;
3685 
3686     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3687       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3688 
3689     m_TotalSpamMessages = atoll (StringPntr);
3690   }
3691   else /* Saving */
3692   {
3693     fprintf (DatabaseFile,
3694       "Classifications and total messages:\t%s\t%lu\t%s\t%lu\n",
3695       g_ClassifiedGenuine, m_TotalGenuineMessages,
3696       g_ClassifiedSpam, m_TotalSpamMessages);
3697   }
3698 
3699   /* The remainder of the file is the list of words and statistics.  Each line
3700   has a word, a tab, the time when the word was last changed in the database
3701   (sequence number of message addition, starts at 0 and goes up by one for each
3702   message added to the database), a tab then the number of messages in the
3703   first class (genuine) that had that word, then a tab, then the number of
3704   messages in the second class (spam) with that word, and so on. */
3705 
3706   if (DoLoad)
3707   {
3708     while (!feof (DatabaseFile))
3709     {
3710       if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3711       {
3712         ErrorCode = errno;
3713         if (feof (DatabaseFile))
3714           break;
3715         if (ErrorCode == B_OK)
3716           ErrorCode = -1;
3717         sprintf (ErrorMessage, "Error while reading words and statistics "
3718           "from database file \"%s\"", m_DatabaseFileName.String ());
3719         goto ErrorExit;
3720       }
3721 
3722       i = strlen (LineString);
3723       if (i > 0 && LineString[i-1] == '\n')
3724         LineString[i-1] = 0; /* Remove trailing line feed character. */
3725 
3726       /* Get the word at the start of the line, save in WordPntr. */
3727 
3728       TabPntr = LineString;
3729       for (WordPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3730         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3731 
3732       /* Get the date stamp.  Actually a sequence number, not a date. */
3733 
3734       for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3735         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3736 
3737       Statistics.age = atoll (StringPntr);
3738 
3739       /* Get the Genuine count. */
3740 
3741       for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3742         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3743 
3744       Statistics.genuineCount = atoll (StringPntr);
3745 
3746       /* Get the Spam count. */
3747 
3748       for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3749         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3750 
3751       Statistics.spamCount = atoll (StringPntr);
3752 
3753       /* Ignore empty words, totally unused words and ones which are too long
3754       (avoids lots of length checking everywhere). */
3755 
3756       if (WordPntr[0] == 0 || strlen (WordPntr) > g_MaxWordLength ||
3757       (Statistics.genuineCount <= 0 && Statistics.spamCount <= 0))
3758         continue; /* Ignore this line of text, start on next one. */
3759 
3760       /* Add the combination to the database. */
3761 
3762       InsertResult = m_WordMap.insert (
3763         StatisticsMap::value_type (WordPntr, Statistics));
3764       if (InsertResult.second == false)
3765       {
3766         ErrorCode = B_BAD_VALUE;
3767         sprintf (ErrorMessage, "Error while inserting word \"%s\" from "
3768           "database \"%s\", perhaps it is a duplicate",
3769           WordPntr, m_DatabaseFileName.String ());
3770         goto ErrorExit;
3771       }
3772       m_WordCount++;
3773 
3774       /* And the hunt for the oldest word. */
3775 
3776       if (Statistics.age < m_OldestAge)
3777         m_OldestAge = Statistics.age;
3778     }
3779   }
3780   else /* Saving, dump all words and statistics to the file. */
3781   {
3782     EndIter = m_WordMap.end ();
3783     for (DataIter = m_WordMap.begin (); DataIter != EndIter; DataIter++)
3784     {
3785       if (fprintf (DatabaseFile, "%s\t%lu\t%lu\t%lu\n",
3786       DataIter->first.c_str (), DataIter->second.age,
3787       DataIter->second.genuineCount, DataIter->second.spamCount) <= 0)
3788       {
3789         ErrorCode = errno;
3790         sprintf (ErrorMessage, "Error while writing word \"%s\" to "
3791           "database \"%s\"",
3792           DataIter->first.c_str(), m_DatabaseFileName.String ());
3793         goto ErrorExit;
3794       }
3795     }
3796   }
3797 
3798   /* Set the file type so that the new file gets associated with this program,
3799   and picks up the right icon. */
3800 
3801   if (!DoLoad)
3802   {
3803     sprintf (ErrorMessage, "Unable to set attributes (file type) of database "
3804       "file \"%s\"", m_DatabaseFileName.String ());
3805     ErrorCode = DatabaseNode.SetTo (m_DatabaseFileName.String ());
3806     if (ErrorCode != B_OK)
3807       goto ErrorExit;
3808     DatabaseNodeInfo.SetTo (&DatabaseNode);
3809     ErrorCode = DatabaseNodeInfo.SetType (g_ABSDatabaseFileMIMEType);
3810     if (ErrorCode != B_OK)
3811       goto ErrorExit;
3812   }
3813 
3814   /* Success! */
3815   m_DatabaseHasChanged = false;
3816   ErrorCode = B_OK;
3817 
3818 ErrorExit:
3819   if (DatabaseFile != NULL)
3820     fclose (DatabaseFile);
3821   return ErrorCode;
3822 }
3823 
3824 
3825 /* Either load the settings (DoLoad is TRUE) from the configuration file or
3826 write them (DoLoad is FALSE) to it.  The configuration file is a flattened
3827 BMessage containing the various program settings.  If it doesn't exist (and its
3828 parent directories don't exist) then it will be created when saving.  If it
3829 doesn't exist when loading, the settings will be set to default values. */
3830 
3831 status_t ABSApp::LoadSaveSettings (bool DoLoad)
3832 {
3833   status_t    ErrorCode;
3834   const char *NamePntr;
3835   BMessage    Settings;
3836   BDirectory  SettingsDirectory;
3837   BFile       SettingsFile;
3838   const char *StringPntr;
3839   bool        TempBool;
3840   int32       TempInt32;
3841   char        TempString [PATH_MAX + 100];
3842 
3843   /* Preset things to default values if loading, in case of an error or it's an
3844   older version of the settings file which doesn't have every field defined. */
3845 
3846   if (DoLoad)
3847     DefaultSettings ();
3848 
3849   /* Look for our settings directory.  When saving we can try to create it. */
3850 
3851   ErrorCode = SettingsDirectory.SetTo (m_SettingsDirectoryPath.Path ());
3852   if (ErrorCode != B_OK)
3853   {
3854     if (DoLoad || ErrorCode != B_ENTRY_NOT_FOUND)
3855     {
3856       sprintf (TempString, "Can't find settings directory \"%s\"",
3857         m_SettingsDirectoryPath.Path ());
3858       goto ErrorExit;
3859     }
3860     ErrorCode = create_directory (m_SettingsDirectoryPath.Path (), 0755);
3861     if (ErrorCode == B_OK)
3862       ErrorCode = SettingsDirectory.SetTo (m_SettingsDirectoryPath.Path ());
3863     if (ErrorCode != B_OK)
3864     {
3865       sprintf (TempString, "Can't create settings directory \"%s\"",
3866         m_SettingsDirectoryPath.Path ());
3867       goto ErrorExit;
3868     }
3869   }
3870 
3871   ErrorCode = SettingsFile.SetTo (&SettingsDirectory, g_SettingsFileName,
3872     DoLoad ? B_READ_ONLY : B_READ_WRITE | B_CREATE_FILE | B_ERASE_FILE);
3873   if (ErrorCode != B_OK)
3874   {
3875     sprintf (TempString, "Can't open settings file \"%s\" in directory \"%s\" "
3876       "for %s", g_SettingsFileName, m_SettingsDirectoryPath.Path(),
3877       DoLoad ? "reading" : "writing");
3878     goto ErrorExit;
3879   }
3880 
3881   if (DoLoad)
3882   {
3883     ErrorCode = Settings.Unflatten (&SettingsFile);
3884     if (ErrorCode != 0 || Settings.what != g_SettingsWhatCode)
3885     {
3886       sprintf (TempString, "Corrupt data detected while reading settings "
3887         "file \"%s\" in directory \"%s\", will revert to defaults",
3888         g_SettingsFileName, m_SettingsDirectoryPath.Path());
3889       goto ErrorExit;
3890     }
3891   }
3892 
3893   /* Transfer the settings between the BMessage and our various global
3894   variables.  For loading, if the setting isn't present, leave it at the
3895   default value.  Note that loading and saving are intermingled here to make
3896   code maintenance easier (less chance of forgetting to update it if load and
3897   save were separate functions). */
3898 
3899   ErrorCode = B_OK; /* So that saving settings can record an error. */
3900 
3901   NamePntr = "DatabaseFileName";
3902   if (DoLoad)
3903   {
3904     if (Settings.FindString (NamePntr, &StringPntr) == B_OK)
3905       m_DatabaseFileName.SetTo (StringPntr);
3906   }
3907   else if (ErrorCode == B_OK)
3908     ErrorCode = Settings.AddString (NamePntr, m_DatabaseFileName);
3909 
3910   NamePntr = "ServerMode";
3911   if (DoLoad)
3912   {
3913     if (Settings.FindBool (NamePntr, &TempBool) == B_OK)
3914       g_ServerMode = TempBool;
3915   }
3916   else if (ErrorCode == B_OK)
3917     ErrorCode = Settings.AddBool (NamePntr, g_ServerMode);
3918 
3919   NamePntr = "IgnorePreviousClassification";
3920   if (DoLoad)
3921   {
3922     if (Settings.FindBool (NamePntr, &TempBool) == B_OK)
3923       m_IgnorePreviousClassification = TempBool;
3924   }
3925   else if (ErrorCode == B_OK)
3926     ErrorCode = Settings.AddBool (NamePntr, m_IgnorePreviousClassification);
3927 
3928   NamePntr = "PurgeAge";
3929   if (DoLoad)
3930   {
3931     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3932       m_PurgeAge = TempInt32;
3933   }
3934   else if (ErrorCode == B_OK)
3935     ErrorCode = Settings.AddInt32 (NamePntr, m_PurgeAge);
3936 
3937   NamePntr = "PurgePopularity";
3938   if (DoLoad)
3939   {
3940     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3941       m_PurgePopularity = TempInt32;
3942   }
3943   else if (ErrorCode == B_OK)
3944     ErrorCode = Settings.AddInt32 (NamePntr, m_PurgePopularity);
3945 
3946   NamePntr = "ScoringMode";
3947   if (DoLoad)
3948   {
3949     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3950       m_ScoringMode = (ScoringModes) TempInt32;
3951     if (m_ScoringMode < 0 || m_ScoringMode >= SM_MAX)
3952       m_ScoringMode = (ScoringModes) 0;
3953   }
3954   else if (ErrorCode == B_OK)
3955     ErrorCode = Settings.AddInt32 (NamePntr, m_ScoringMode);
3956 
3957   NamePntr = "TokenizeMode";
3958   if (DoLoad)
3959   {
3960     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3961       m_TokenizeMode = (TokenizeModes) TempInt32;
3962     if (m_TokenizeMode < 0 || m_TokenizeMode >= TM_MAX)
3963       m_TokenizeMode = (TokenizeModes) 0;
3964   }
3965   else if (ErrorCode == B_OK)
3966     ErrorCode = Settings.AddInt32 (NamePntr, m_TokenizeMode);
3967 
3968   if (ErrorCode != B_OK)
3969   {
3970     strcpy (TempString, "Unable to stuff the program settings into a "
3971       "temporary BMessage, settings not saved");
3972     goto ErrorExit;
3973   }
3974 
3975   /* Save the settings BMessage to the settings file. */
3976 
3977   if (!DoLoad)
3978   {
3979     Settings.what = g_SettingsWhatCode;
3980     ErrorCode = Settings.Flatten (&SettingsFile);
3981     if (ErrorCode != 0)
3982     {
3983       sprintf (TempString, "Problems while writing settings file \"%s\" in "
3984         "directory \"%s\"", g_SettingsFileName,
3985         m_SettingsDirectoryPath.Path ());
3986       goto ErrorExit;
3987     }
3988   }
3989 
3990   m_SettingsHaveChanged = false;
3991   return B_OK;
3992 
3993 ErrorExit: /* Error message in TempString, code in ErrorCode. */
3994   DisplayErrorMessage (TempString, ErrorCode, DoLoad ?
3995     "Loading Settings Error" : "Saving Settings Error");
3996   return ErrorCode;
3997 }
3998 
3999 
4000 void ABSApp::MessageReceived (BMessage *MessagePntr)
4001 {
4002   const char           *PropertyName;
4003   struct property_info *PropInfoPntr;
4004   int32                 SpecifierIndex;
4005   int32                 SpecifierKind;
4006   BMessage              SpecifierMessage;
4007 
4008   /* See if it is a scripting message that applies to the database or one of
4009   the other operations this program supports.  Pass on other scripting messages
4010   to the inherited parent MessageReceived function (they're usually scripting
4011   messages for the BApplication). */
4012 
4013   switch (MessagePntr->what)
4014   {
4015     case B_GET_PROPERTY:
4016     case B_SET_PROPERTY:
4017     case B_COUNT_PROPERTIES:
4018     case B_CREATE_PROPERTY:
4019     case B_DELETE_PROPERTY:
4020     case B_EXECUTE_PROPERTY:
4021       if (MessagePntr->GetCurrentSpecifier (&SpecifierIndex, &SpecifierMessage,
4022       &SpecifierKind, &PropertyName) == B_OK &&
4023       SpecifierKind == B_DIRECT_SPECIFIER)
4024       {
4025         for (PropInfoPntr = g_ScriptingPropertyList + 0; true; PropInfoPntr++)
4026         {
4027           if (PropInfoPntr->name == 0)
4028             break; /* Ran out of commands. */
4029 
4030           if (PropInfoPntr->commands[0] == MessagePntr->what &&
4031           strcasecmp (PropInfoPntr->name, PropertyName) == 0)
4032           {
4033             ProcessScriptingMessage (MessagePntr, PropInfoPntr);
4034             return;
4035           }
4036         }
4037       }
4038       break;
4039   }
4040 
4041   /* Pass the unprocessed message to the inherited function, maybe it knows
4042   what to do.  This includes replies to messages we sent ourselves. */
4043 
4044   BApplication::MessageReceived (MessagePntr);
4045 }
4046 
4047 
4048 /* Rename the existing database file to a backup file name, potentially
4049 replacing an older backup.  If something goes wrong, returns an error code and
4050 puts an explanation in ErrorMessage. */
4051 
4052 status_t ABSApp::MakeBackup (char *ErrorMessage)
4053 {
4054   BEntry   Entry;
4055   status_t ErrorCode;
4056   int      i;
4057   char     LeafName [NAME_MAX];
4058   char     NewName [PATH_MAX+20];
4059   char     OldName [PATH_MAX+20];
4060 
4061   ErrorCode = Entry.SetTo (m_DatabaseFileName.String ());
4062   if (ErrorCode != B_OK)
4063   {
4064     sprintf (ErrorMessage, "While making backup, failed to make a BEntry for "
4065       "\"%s\" (maybe the directory doesn't exist?)",
4066       m_DatabaseFileName.String ());
4067     return ErrorCode;
4068   }
4069   if (!Entry.Exists ())
4070     return B_OK; /* No existing file to worry about overwriting. */
4071   Entry.GetName (LeafName);
4072 
4073   /* Find the first hole (no file) where we will stop the renaming chain. */
4074 
4075   for (i = 0; i < g_MaxBackups - 1; i++)
4076   {
4077     strcpy (OldName, m_DatabaseFileName.String ());
4078     sprintf (OldName + strlen (OldName), g_BackupSuffix, i);
4079     Entry.SetTo (OldName);
4080     if (!Entry.Exists ())
4081       break;
4082   }
4083 
4084   /* Move the files down by one to fill in the hole in the name series. */
4085 
4086   for (i--; i >= 0; i--)
4087   {
4088     strcpy (OldName, m_DatabaseFileName.String ());
4089     sprintf (OldName + strlen (OldName), g_BackupSuffix, i);
4090     Entry.SetTo (OldName);
4091     strcpy (NewName, LeafName);
4092     sprintf (NewName + strlen (NewName), g_BackupSuffix, i + 1);
4093     ErrorCode = Entry.Rename (NewName, true /* clobber */);
4094   }
4095 
4096   Entry.SetTo (m_DatabaseFileName.String ());
4097   strcpy (NewName, LeafName);
4098   sprintf (NewName + strlen (NewName), g_BackupSuffix, 0);
4099   ErrorCode = Entry.Rename (NewName, true /* clobber */);
4100   if (ErrorCode != B_OK)
4101     sprintf (ErrorMessage, "While making backup, failed to rename "
4102       "\"%s\" to \"%s\"", m_DatabaseFileName.String (), NewName);
4103 
4104   return ErrorCode;
4105 }
4106 
4107 
4108 void ABSApp::MakeDatabaseEmpty ()
4109 {
4110   m_WordMap.clear (); /* Sets the map to empty, deallocating any old data. */
4111   m_WordCount = 0;
4112   m_TotalGenuineMessages = 0;
4113   m_TotalSpamMessages = 0;
4114   m_OldestAge = (uint32) -1 /* makes largest number possible */;
4115 }
4116 
4117 
4118 /* Do what the scripting command says.  A reply message will be sent back with
4119 several fields: "error" containing the numerical error code (0 for success),
4120 "CommandText" with a text representation of the command, "result" with the
4121 resulting data for a get or count command.  If it isn't understood, then rather
4122 than a B_REPLY kind of message, it will be a B_MESSAGE_NOT_UNDERSTOOD message
4123 with an "error" number and an "message" string with a description. */
4124 
4125 void ABSApp::ProcessScriptingMessage (
4126   BMessage *MessagePntr,
4127   struct property_info *PropInfoPntr)
4128 {
4129   bool        ArgumentBool = false;
4130   bool        ArgumentGotBool = false;
4131   bool        ArgumentGotInt32 = false;
4132   bool        ArgumentGotString = false;
4133   int32       ArgumentInt32 = 0;
4134   const char *ArgumentString = NULL;
4135   BString     CommandText;
4136   status_t    ErrorCode;
4137   int         i;
4138   BMessage    ReplyMessage (B_MESSAGE_NOT_UNDERSTOOD);
4139   ssize_t     StringBufferSize;
4140   BMessage    TempBMessage;
4141   BPath       TempPath;
4142   char        TempString [PATH_MAX + 1024];
4143 
4144   if (g_QuitCountdown >= 0 && !g_CommandLineMode)
4145   {
4146     g_QuitCountdown = -1;
4147     cerr << "Quit countdown aborted due to a scripting command arriving.\n";
4148   }
4149 
4150   if (g_BusyCursor != NULL)
4151     SetCursor (g_BusyCursor);
4152 
4153   ErrorCode = MessagePntr->FindData (g_DataName, B_STRING_TYPE,
4154     (const void **) &ArgumentString, &StringBufferSize);
4155   if (ErrorCode == B_OK)
4156   {
4157     if (PropInfoPntr->extra_data != PN_EVALUATE_STRING &&
4158     PropInfoPntr->extra_data != PN_SPAM_STRING &&
4159     PropInfoPntr->extra_data != PN_GENUINE_STRING &&
4160     strlen (ArgumentString) >= PATH_MAX)
4161     {
4162       sprintf (TempString, "\"data\" string of a scripting message is too "
4163         "long, for SET %s action", PropInfoPntr->name);
4164       ErrorCode = B_NAME_TOO_LONG;
4165       goto ErrorExit;
4166     }
4167     ArgumentGotString = true;
4168   }
4169   else if (MessagePntr->FindBool (g_DataName, &ArgumentBool) == B_OK)
4170     ArgumentGotBool = true;
4171   else if (MessagePntr->FindInt32 (g_DataName, &ArgumentInt32) == B_OK)
4172     ArgumentGotInt32 = true;
4173 
4174   /* Prepare a Human readable description of the scripting command. */
4175 
4176   switch (PropInfoPntr->commands[0])
4177   {
4178     case B_SET_PROPERTY:
4179       CommandText.SetTo ("Set ");
4180       break;
4181 
4182     case B_GET_PROPERTY:
4183       CommandText.SetTo ("Get ");
4184       break;
4185 
4186     case B_COUNT_PROPERTIES:
4187       CommandText.SetTo ("Count ");
4188       break;
4189 
4190     case B_CREATE_PROPERTY:
4191       CommandText.SetTo ("Create ");
4192       break;
4193 
4194     case B_DELETE_PROPERTY:
4195       CommandText.SetTo ("Delete ");
4196       break;
4197 
4198     case B_EXECUTE_PROPERTY:
4199       CommandText.SetTo ("Execute ");
4200       break;
4201 
4202     default:
4203       sprintf (TempString, "Bug: scripting command for \"%s\" has an unknown "
4204         "action code %d", PropInfoPntr->name,
4205         (int) PropInfoPntr->commands[0]);
4206       ErrorCode = -1;
4207       goto ErrorExit;
4208   }
4209   CommandText.Append (PropInfoPntr->name);
4210 
4211   /* Add on the argument value to our readable command, if there is one. */
4212 
4213   if (ArgumentGotString)
4214   {
4215     CommandText.Append (" \"");
4216     CommandText.Append (ArgumentString);
4217     CommandText.Append ("\"");
4218   }
4219   if (ArgumentGotBool)
4220     CommandText.Append (ArgumentBool ? " true" : " false");
4221   if (ArgumentGotInt32)
4222   {
4223     sprintf (TempString, " %ld", ArgumentInt32);
4224     CommandText.Append (TempString);
4225   }
4226 
4227   /* From now on the scripting command has been recognized and is in the
4228   correct format, so it always returns a B_REPLY message.  A readable version
4229   of the command is also added to make debugging easier. */
4230 
4231   ReplyMessage.what = B_REPLY;
4232   ReplyMessage.AddString ("CommandText", CommandText);
4233 
4234   /* Now actually do the command.  First prepare a default error message. */
4235 
4236   sprintf (TempString, "Operation code %d (get, set, count, etc) "
4237     "unsupported for property %s",
4238     (int) PropInfoPntr->commands[0], PropInfoPntr->name);
4239   ErrorCode = B_BAD_INDEX;
4240 
4241   switch (PropInfoPntr->extra_data)
4242   {
4243     case PN_DATABASE_FILE:
4244       switch (PropInfoPntr->commands[0])
4245       {
4246         case B_GET_PROPERTY: /* Get the database file name. */
4247           ReplyMessage.AddString (g_ResultName, m_DatabaseFileName);
4248           break;
4249 
4250         case B_SET_PROPERTY: /* Set the database file name to a new one. */
4251           if (!ArgumentGotString)
4252           {
4253             ErrorCode = B_BAD_TYPE;
4254             sprintf (TempString, "You need to specify a string for the "
4255               "SET %s command", PropInfoPntr->name);
4256             goto ErrorExit;
4257           }
4258           ErrorCode = TempPath.SetTo (ArgumentString, NULL /* leaf */,
4259             true /* normalize - verifies parent directories exist */);
4260           if (ErrorCode != B_OK)
4261           {
4262             sprintf (TempString, "New database path name of \"%s\" is invalid "
4263               "(parent directories must exist)", ArgumentString);
4264             goto ErrorExit;
4265           }
4266           if ((ErrorCode = SaveDatabaseIfNeeded (TempString)) != B_OK)
4267             goto ErrorExit;
4268           MakeDatabaseEmpty (); /* So that the new one gets loaded if used. */
4269 
4270           if (strlen (TempPath.Leaf ()) > NAME_MAX-strlen(g_BackupSuffix)-1)
4271           {
4272             /* Truncate the name so that there is enough space for the backup
4273             extension.  Approximately. */
4274             strcpy (TempString, TempPath.Leaf ());
4275             TempString [NAME_MAX - strlen (g_BackupSuffix) - 1] = 0;
4276             TempPath.GetParent (&TempPath);
4277             TempPath.Append (TempString);
4278           }
4279           m_DatabaseFileName.SetTo (TempPath.Path ());
4280           m_SettingsHaveChanged = true;
4281           break;
4282 
4283         case B_CREATE_PROPERTY: /* Make a new database file plus more. */
4284           if ((ErrorCode = CreateDatabaseFile (TempString)) != B_OK)
4285             goto ErrorExit;
4286           break;
4287 
4288         case B_DELETE_PROPERTY: /* Delete the file and its backups too. */
4289           if ((ErrorCode = DeleteDatabaseFile (TempString)) != B_OK)
4290             goto ErrorExit;
4291           break;
4292 
4293         case B_COUNT_PROPERTIES:
4294           if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
4295             goto ErrorExit;
4296           ReplyMessage.AddInt32 (g_ResultName, m_WordCount);
4297           break;
4298 
4299         default: /* Unknown operation code, error message already set. */
4300           goto ErrorExit;
4301       }
4302       break;
4303 
4304     case PN_SPAM:
4305     case PN_SPAM_STRING:
4306     case PN_GENUINE:
4307     case PN_GENUINE_STRING:
4308     case PN_UNCERTAIN:
4309       switch (PropInfoPntr->commands[0])
4310       {
4311         case B_COUNT_PROPERTIES: /* Get the number of spam/genuine messages. */
4312           if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
4313             goto ErrorExit;
4314           if (PropInfoPntr->extra_data == PN_SPAM ||
4315           PropInfoPntr->extra_data == PN_SPAM_STRING)
4316             ReplyMessage.AddInt32 (g_ResultName, m_TotalSpamMessages);
4317           else
4318             ReplyMessage.AddInt32 (g_ResultName, m_TotalGenuineMessages);
4319           break;
4320 
4321         case B_SET_PROPERTY: /* Add spam/genuine/uncertain to database. */
4322           if (!ArgumentGotString)
4323           {
4324             ErrorCode = B_BAD_TYPE;
4325             sprintf (TempString, "You need to specify a string (%s) "
4326               "for the SET %s command",
4327               (PropInfoPntr->extra_data == PN_GENUINE_STRING ||
4328               PropInfoPntr->extra_data == PN_SPAM_STRING)
4329               ? "text of the message to be added"
4330               : "pathname of the file containing the text to be added",
4331               PropInfoPntr->name);
4332             goto ErrorExit;
4333           }
4334           if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
4335             goto ErrorExit;
4336           if (PropInfoPntr->extra_data == PN_GENUINE ||
4337           PropInfoPntr->extra_data == PN_SPAM ||
4338           PropInfoPntr->extra_data == PN_UNCERTAIN)
4339             ErrorCode = AddFileToDatabase (
4340               (PropInfoPntr->extra_data == PN_SPAM) ? CL_SPAM :
4341               ((PropInfoPntr->extra_data == PN_GENUINE) ? CL_GENUINE :
4342               CL_UNCERTAIN),
4343               ArgumentString, TempString /* ErrorMessage */);
4344           else
4345             ErrorCode = AddStringToDatabase (
4346               (PropInfoPntr->extra_data == PN_SPAM_STRING) ?
4347               CL_SPAM : CL_GENUINE,
4348               ArgumentString, TempString /* ErrorMessage */);
4349           if (ErrorCode != B_OK)
4350             goto ErrorExit;
4351           break;
4352 
4353         default: /* Unknown operation code, error message already set. */
4354           goto ErrorExit;
4355       }
4356       break;
4357 
4358     case PN_IGNORE_PREVIOUS_CLASSIFICATION:
4359       switch (PropInfoPntr->commands[0])
4360       {
4361         case B_GET_PROPERTY:
4362           ReplyMessage.AddBool (g_ResultName, m_IgnorePreviousClassification);
4363           break;
4364 
4365         case B_SET_PROPERTY:
4366           if (!ArgumentGotBool)
4367           {
4368             ErrorCode = B_BAD_TYPE;
4369             sprintf (TempString, "You need to specify a boolean (true/yes, "
4370               "false/no) for the SET %s command", PropInfoPntr->name);
4371             goto ErrorExit;
4372           }
4373           m_IgnorePreviousClassification = ArgumentBool;
4374           m_SettingsHaveChanged = true;
4375           break;
4376 
4377         default: /* Unknown operation code, error message already set. */
4378           goto ErrorExit;
4379       }
4380       break;
4381 
4382     case PN_SERVER_MODE:
4383       switch (PropInfoPntr->commands[0])
4384       {
4385         case B_GET_PROPERTY:
4386           ReplyMessage.AddBool (g_ResultName, g_ServerMode);
4387           break;
4388 
4389         case B_SET_PROPERTY:
4390           if (!ArgumentGotBool)
4391           {
4392             ErrorCode = B_BAD_TYPE;
4393             sprintf (TempString, "You need to specify a boolean (true/yes, "
4394               "false/no) for the SET %s command", PropInfoPntr->name);
4395             goto ErrorExit;
4396           }
4397           g_ServerMode = ArgumentBool;
4398           m_SettingsHaveChanged = true;
4399           break;
4400 
4401         default: /* Unknown operation code, error message already set. */
4402           goto ErrorExit;
4403       }
4404       break;
4405 
4406     case PN_FLUSH:
4407       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
4408       (ErrorCode = SaveDatabaseIfNeeded (TempString)) == B_OK)
4409         break;
4410       goto ErrorExit;
4411 
4412     case PN_PURGE_AGE:
4413       switch (PropInfoPntr->commands[0])
4414       {
4415         case B_GET_PROPERTY:
4416           ReplyMessage.AddInt32 (g_ResultName, m_PurgeAge);
4417           break;
4418 
4419         case B_SET_PROPERTY:
4420           if (!ArgumentGotInt32)
4421           {
4422             ErrorCode = B_BAD_TYPE;
4423             sprintf (TempString, "You need to specify a 32 bit integer "
4424               "for the SET %s command", PropInfoPntr->name);
4425             goto ErrorExit;
4426           }
4427           m_PurgeAge = ArgumentInt32;
4428           m_SettingsHaveChanged = true;
4429           break;
4430 
4431         default: /* Unknown operation code, error message already set. */
4432           goto ErrorExit;
4433       }
4434       break;
4435 
4436     case PN_PURGE_POPULARITY:
4437       switch (PropInfoPntr->commands[0])
4438       {
4439         case B_GET_PROPERTY:
4440           ReplyMessage.AddInt32 (g_ResultName, m_PurgePopularity);
4441           break;
4442 
4443         case B_SET_PROPERTY:
4444           if (!ArgumentGotInt32)
4445           {
4446             ErrorCode = B_BAD_TYPE;
4447             sprintf (TempString, "You need to specify a 32 bit integer "
4448               "for the SET %s command", PropInfoPntr->name);
4449             goto ErrorExit;
4450           }
4451           m_PurgePopularity = ArgumentInt32;
4452           m_SettingsHaveChanged = true;
4453           break;
4454 
4455         default: /* Unknown operation code, error message already set. */
4456           goto ErrorExit;
4457       }
4458       break;
4459 
4460     case PN_PURGE:
4461       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
4462       (ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK &&
4463       (ErrorCode = PurgeOldWords (TempString)) == B_OK)
4464         break;
4465       goto ErrorExit;
4466 
4467     case PN_OLDEST:
4468       if (PropInfoPntr->commands[0] == B_GET_PROPERTY &&
4469       (ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK)
4470       {
4471         ReplyMessage.AddInt32 (g_ResultName, m_OldestAge);
4472         break;
4473       }
4474       goto ErrorExit;
4475 
4476     case PN_EVALUATE:
4477     case PN_EVALUATE_STRING:
4478       if (PropInfoPntr->commands[0] == B_SET_PROPERTY)
4479       {
4480         if (!ArgumentGotString)
4481         {
4482           ErrorCode = B_BAD_TYPE;
4483           sprintf (TempString, "You need to specify a string for the "
4484             "SET %s command", PropInfoPntr->name);
4485           goto ErrorExit;
4486         }
4487         if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK)
4488         {
4489           if (PropInfoPntr->extra_data == PN_EVALUATE)
4490           {
4491             if ((ErrorCode = EvaluateFile (ArgumentString, &ReplyMessage,
4492             TempString)) == B_OK)
4493               break;
4494           }
4495           else /* PN_EVALUATE_STRING */
4496           {
4497             if ((ErrorCode = EvaluateString (ArgumentString, StringBufferSize,
4498             &ReplyMessage, TempString)) == B_OK)
4499               break;
4500           }
4501         }
4502       }
4503       goto ErrorExit;
4504 
4505     case PN_RESET_TO_DEFAULTS:
4506       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY)
4507       {
4508         DefaultSettings ();
4509         break;
4510       }
4511       goto ErrorExit;
4512 
4513     case PN_INSTALL_THINGS:
4514       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
4515       (ErrorCode = InstallThings (TempString)) == B_OK)
4516         break;
4517       goto ErrorExit;
4518 
4519     case PN_SCORING_MODE:
4520       switch (PropInfoPntr->commands[0])
4521       {
4522         case B_GET_PROPERTY:
4523           ReplyMessage.AddString (g_ResultName,
4524             g_ScoringModeNames[m_ScoringMode]);
4525           break;
4526 
4527         case B_SET_PROPERTY:
4528           i = SM_MAX;
4529           if (ArgumentGotString)
4530             for (i = 0; i < SM_MAX; i++)
4531             {
4532               if (strcasecmp (ArgumentString, g_ScoringModeNames [i]) == 0)
4533               {
4534                 m_ScoringMode = (ScoringModes) i;
4535                 m_SettingsHaveChanged = true;
4536                 break;
4537               }
4538             }
4539           if (i >= SM_MAX) /* Didn't find a valid scoring mode word. */
4540           {
4541             ErrorCode = B_BAD_TYPE;
4542             sprintf (TempString, "You used the unrecognized \"%s\" as "
4543               "a scoring mode for the SET %s command.  Should be one of: ",
4544               ArgumentGotString ? ArgumentString : "not specified",
4545               PropInfoPntr->name);
4546             for (i = 0; i < SM_MAX; i++)
4547             {
4548               strcat (TempString, g_ScoringModeNames [i]);
4549               if (i < SM_MAX - 1)
4550                 strcat (TempString, ", ");
4551             }
4552             goto ErrorExit;
4553           }
4554           break;
4555 
4556         default: /* Unknown operation code, error message already set. */
4557           goto ErrorExit;
4558       }
4559       break;
4560 
4561     case PN_TOKENIZE_MODE:
4562       switch (PropInfoPntr->commands[0])
4563       {
4564         case B_GET_PROPERTY:
4565           ReplyMessage.AddString (g_ResultName,
4566             g_TokenizeModeNames[m_TokenizeMode]);
4567           break;
4568 
4569         case B_SET_PROPERTY:
4570           i = TM_MAX;
4571           if (ArgumentGotString)
4572             for (i = 0; i < TM_MAX; i++)
4573             {
4574               if (strcasecmp (ArgumentString, g_TokenizeModeNames [i]) == 0)
4575               {
4576                 m_TokenizeMode = (TokenizeModes) i;
4577                 m_SettingsHaveChanged = true;
4578                 break;
4579               }
4580             }
4581           if (i >= TM_MAX) /* Didn't find a valid tokenize mode word. */
4582           {
4583             ErrorCode = B_BAD_TYPE;
4584             sprintf (TempString, "You used the unrecognized \"%s\" as "
4585               "a tokenize mode for the SET %s command.  Should be one of: ",
4586               ArgumentGotString ? ArgumentString : "not specified",
4587               PropInfoPntr->name);
4588             for (i = 0; i < TM_MAX; i++)
4589             {
4590               strcat (TempString, g_TokenizeModeNames [i]);
4591               if (i < TM_MAX - 1)
4592                 strcat (TempString, ", ");
4593             }
4594             goto ErrorExit;
4595           }
4596           break;
4597 
4598         default: /* Unknown operation code, error message already set. */
4599           goto ErrorExit;
4600       }
4601       break;
4602 
4603     default:
4604       sprintf (TempString, "Bug!  Unrecognized property identification "
4605         "number %d (should be between 0 and %d).  Fix the entry in "
4606         "the g_ScriptingPropertyList array!",
4607         (int) PropInfoPntr->extra_data, PN_MAX - 1);
4608       goto ErrorExit;
4609   }
4610 
4611   /* Success. */
4612 
4613   ReplyMessage.AddInt32 ("error", B_OK);
4614   ErrorCode = MessagePntr->SendReply (&ReplyMessage,
4615     this /* Reply's reply handler */, 500000 /* send timeout */);
4616   if (ErrorCode != B_OK)
4617     cerr << "ProcessScriptingMessage failed to send a reply message, code " <<
4618     ErrorCode << " (" << strerror (ErrorCode) << ")" << " for " <<
4619     CommandText.String () << endl;
4620   SetCursor (B_CURSOR_SYSTEM_DEFAULT);
4621   return;
4622 
4623 ErrorExit: /* Error message in TempString, return code in ErrorCode. */
4624   ReplyMessage.AddInt32 ("error", ErrorCode);
4625   ReplyMessage.AddString ("message", TempString);
4626   DisplayErrorMessage (TempString, ErrorCode);
4627   ErrorCode = MessagePntr->SendReply (&ReplyMessage,
4628     this /* Reply's reply handler */, 500000 /* send timeout */);
4629   if (ErrorCode != B_OK)
4630     cerr << "ProcessScriptingMessage failed to send an error message, code " <<
4631     ErrorCode << " (" << strerror (ErrorCode) << ")" << " for " <<
4632     CommandText.String () << endl;
4633   SetCursor (B_CURSOR_SYSTEM_DEFAULT);
4634 }
4635 
4636 
4637 /* Since quitting stops the program before the results of a script command are
4638 received, we use a time delay to do the quit and make sure there are no pending
4639 commands being processed by the auxiliary looper which is sending us commands.
4640 Also, we have a countdown which can be interrupted by an incoming scripting
4641 message in case one client tells us to quit while another one is still using us
4642 (happens when you have two or more e-mail accounts).  But if the system is
4643 shutting down, quit immediately! */
4644 
4645 void ABSApp::Pulse ()
4646 {
4647   if (g_QuitCountdown == 0)
4648   {
4649     if (g_CommanderLooperPntr == NULL ||
4650     !g_CommanderLooperPntr->IsBusy ())
4651       PostMessage (B_QUIT_REQUESTED);
4652   }
4653   else if (g_QuitCountdown > 0)
4654   {
4655     cerr << "SpamDBM quitting in " << g_QuitCountdown << ".\n";
4656     g_QuitCountdown--;
4657   }
4658 }
4659 
4660 
4661 /* A quit request message has come in.  If the quit countdown has reached zero,
4662 allow the request, otherwise reject it (and start the countdown if it hasn't
4663 been started). */
4664 
4665 bool ABSApp::QuitRequested ()
4666 {
4667   BMessage  *QuitMessage;
4668   team_info  RemoteInfo;
4669   BMessenger RemoteMessenger;
4670   team_id    RemoteTeam;
4671 
4672   /* See if the quit is from the system shutdown command (which goes through
4673   the registrar server), if so, quit immediately. */
4674 
4675   QuitMessage = CurrentMessage ();
4676   if (QuitMessage != NULL && QuitMessage->IsSourceRemote ())
4677   {
4678     RemoteMessenger = QuitMessage->ReturnAddress ();
4679     RemoteTeam = RemoteMessenger.Team ();
4680     if (get_team_info (RemoteTeam, &RemoteInfo) == B_OK &&
4681     strstr (RemoteInfo.args, "registrar") != NULL)
4682       g_QuitCountdown = 0;
4683   }
4684 
4685   if (g_QuitCountdown == 0)
4686     return BApplication::QuitRequested ();
4687 
4688   if (g_QuitCountdown < 0)
4689 //    g_QuitCountdown = 10; /* Start the countdown. */
4690     g_QuitCountdown = 5; /* Quit more quickly */
4691 
4692   return false;
4693 }
4694 
4695 
4696 /* Go through the current database and delete words which are too old (time is
4697 equivalent to the number of messages added to the database) and too unpopular
4698 (words not used by many messages).  Hopefully this will get rid of words which
4699 are just hunks of binary or other garbage.  The database has been loaded
4700 elsewhere. */
4701 
4702 status_t ABSApp::PurgeOldWords (char *ErrorMessage)
4703 {
4704   uint32                  CurrentTime;
4705   StatisticsMap::iterator CurrentIter;
4706   StatisticsMap::iterator EndIter;
4707   StatisticsMap::iterator NextIter;
4708   char                    TempString [80];
4709 
4710   strcpy (ErrorMessage, "Purge can't fail"); /* So argument gets used. */
4711   CurrentTime = m_TotalGenuineMessages + m_TotalSpamMessages - 1;
4712   m_OldestAge = (uint32) -1 /* makes largest number possible */;
4713 
4714   EndIter = m_WordMap.end ();
4715   NextIter = m_WordMap.begin ();
4716   while (NextIter != EndIter)
4717   {
4718     CurrentIter = NextIter++;
4719 
4720     if (CurrentTime - CurrentIter->second.age >= m_PurgeAge &&
4721     CurrentIter->second.genuineCount + CurrentIter->second.spamCount <=
4722     m_PurgePopularity)
4723     {
4724       /* Delete this word, it is unpopular and old.  Sob. */
4725 
4726       m_WordMap.erase (CurrentIter);
4727       if (m_WordCount > 0)
4728         m_WordCount--;
4729 
4730       m_DatabaseHasChanged = true;
4731     }
4732     else /* This word is still in the database.  Update oldest age. */
4733     {
4734       if (CurrentIter->second.age < m_OldestAge)
4735         m_OldestAge = CurrentIter->second.age;
4736     }
4737   }
4738 
4739   /* Just a little bug check here.  Just in case. */
4740 
4741   if (m_WordCount != m_WordMap.size ())
4742   {
4743     sprintf (TempString, "Our word count of %lu doesn't match the "
4744       "size of the database, %lu", m_WordCount, m_WordMap.size ());
4745     DisplayErrorMessage (TempString, -1, "Bug!");
4746     m_WordCount = m_WordMap.size ();
4747   }
4748 
4749   return B_OK;
4750 }
4751 
4752 
4753 void ABSApp::ReadyToRun ()
4754 {
4755   DatabaseWindow *DatabaseWindowPntr;
4756   float           JunkFloat;
4757   BButton        *TempButtonPntr;
4758   BCheckBox      *TempCheckBoxPntr;
4759   font_height     TempFontHeight;
4760   BMenuBar       *TempMenuBarPntr;
4761   BMenuItem      *TempMenuItemPntr;
4762   BPopUpMenu     *TempPopUpMenuPntr;
4763   BRadioButton   *TempRadioButtonPntr;
4764   BRect           TempRect;
4765   const char     *TempString = "Testing My Things";
4766   BStringView    *TempStringViewPntr;
4767   BTextControl   *TempTextPntr;
4768   BWindow        *TempWindowPntr;
4769 
4770   /* This batch of code gets some measurements which will be used for laying
4771   out controls and other GUI elements.  Set the spacing between buttons and
4772   other controls to the width of the letter "M" in the user's desired font. */
4773 
4774  g_MarginBetweenControls = (int) be_plain_font->StringWidth ("M");
4775 
4776   /* Also find out how much space a line of text uses. */
4777 
4778   be_plain_font->GetHeight (&TempFontHeight);
4779   g_LineOfTextHeight = ceilf (
4780     TempFontHeight.ascent + TempFontHeight.descent + TempFontHeight.leading);
4781 
4782   /* Start finding out the height of various user interface gadgets, which can
4783   vary based on the current font size.  Make a temporary gadget, which is
4784   attached to our window, then resize it to its prefered size so that it
4785   accomodates the font size and other frills it needs. */
4786 
4787   TempWindowPntr = new BWindow (BRect (10, 20, 200, 200), "Temporary Window",
4788     B_DOCUMENT_WINDOW, B_NO_WORKSPACE_ACTIVATION | B_ASYNCHRONOUS_CONTROLS);
4789   if (TempWindowPntr == NULL)
4790   {
4791     DisplayErrorMessage ("Unable to create temporary window for finding "
4792       "sizes of controls.");
4793     g_QuitCountdown = 0;
4794     return;
4795   }
4796 
4797   TempRect = TempWindowPntr->Bounds ();
4798 
4799   /* Find the height of a single line of text in a BStringView. */
4800 
4801   TempStringViewPntr = new BStringView (TempRect, TempString, TempString);
4802   if (TempStringViewPntr != NULL)
4803   {
4804     TempWindowPntr->Lock ();
4805     TempWindowPntr->AddChild (TempStringViewPntr);
4806     TempStringViewPntr->GetPreferredSize (&JunkFloat, &g_StringViewHeight);
4807     TempWindowPntr->RemoveChild (TempStringViewPntr);
4808     TempWindowPntr->Unlock ();
4809     delete TempStringViewPntr;
4810   }
4811 
4812   /* Find the height of a button, which seems to be larger than a text
4813   control and can make life difficult.  Make a temporary button, which
4814   is attached to our window so that it resizes to accomodate the font size. */
4815 
4816   TempButtonPntr = new BButton (TempRect, TempString, TempString, NULL);
4817   if (TempButtonPntr != NULL)
4818   {
4819     TempWindowPntr->Lock ();
4820     TempWindowPntr->AddChild (TempButtonPntr);
4821     TempButtonPntr->GetPreferredSize (&JunkFloat, &g_ButtonHeight);
4822     TempWindowPntr->RemoveChild (TempButtonPntr);
4823     TempWindowPntr->Unlock ();
4824     delete TempButtonPntr;
4825   }
4826 
4827   /* Find the height of a text box. */
4828 
4829   TempTextPntr = new BTextControl (TempRect, TempString, NULL /* label */,
4830     TempString, NULL);
4831   if (TempTextPntr != NULL)
4832   {
4833     TempWindowPntr->Lock ();
4834     TempWindowPntr->AddChild (TempTextPntr);
4835     TempTextPntr->GetPreferredSize (&JunkFloat, &g_TextBoxHeight);
4836     TempWindowPntr->RemoveChild (TempTextPntr);
4837     TempWindowPntr->Unlock ();
4838     delete TempTextPntr;
4839   }
4840 
4841   /* Find the height of a checkbox control. */
4842 
4843   TempCheckBoxPntr = new BCheckBox (TempRect, TempString, TempString, NULL);
4844   if (TempCheckBoxPntr != NULL)
4845   {
4846     TempWindowPntr->Lock ();
4847     TempWindowPntr->AddChild (TempCheckBoxPntr);
4848     TempCheckBoxPntr->GetPreferredSize (&JunkFloat, &g_CheckBoxHeight);
4849     TempWindowPntr->RemoveChild (TempCheckBoxPntr);
4850     TempWindowPntr->Unlock ();
4851     delete TempCheckBoxPntr;
4852   }
4853 
4854   /* Find the height of a radio button control. */
4855 
4856   TempRadioButtonPntr =
4857     new BRadioButton (TempRect, TempString, TempString, NULL);
4858   if (TempRadioButtonPntr != NULL)
4859   {
4860     TempWindowPntr->Lock ();
4861     TempWindowPntr->AddChild (TempRadioButtonPntr);
4862     TempRadioButtonPntr->GetPreferredSize (&JunkFloat, &g_RadioButtonHeight);
4863     TempWindowPntr->RemoveChild (TempRadioButtonPntr);
4864     TempWindowPntr->Unlock ();
4865     delete TempRadioButtonPntr;
4866   }
4867 
4868   /* Find the height of a pop-up menu. */
4869 
4870   TempMenuBarPntr = new BMenuBar (TempRect, TempString,
4871     B_FOLLOW_LEFT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
4872     true /* resize to fit items */);
4873   TempPopUpMenuPntr = new BPopUpMenu (TempString);
4874   TempMenuItemPntr = new BMenuItem (TempString, new BMessage (12345), 'g');
4875 
4876   if (TempMenuBarPntr != NULL && TempPopUpMenuPntr != NULL &&
4877   TempMenuItemPntr != NULL)
4878   {
4879     TempPopUpMenuPntr->AddItem (TempMenuItemPntr);
4880     TempMenuBarPntr->AddItem (TempPopUpMenuPntr);
4881 
4882     TempWindowPntr->Lock ();
4883     TempWindowPntr->AddChild (TempMenuBarPntr);
4884     TempMenuBarPntr->GetPreferredSize (&JunkFloat, &g_PopUpMenuHeight);
4885     TempWindowPntr->RemoveChild (TempMenuBarPntr);
4886     TempWindowPntr->Unlock ();
4887     delete TempMenuBarPntr; // It will delete contents too.
4888   }
4889 
4890   TempWindowPntr->Lock ();
4891   TempWindowPntr->Quit ();
4892 
4893   SetPulseRate (500000);
4894 
4895   if (g_CommandLineMode)
4896     g_QuitCountdown = 0; /* Quit as soon as queued up commands done. */
4897   else /* GUI mode, make a window. */
4898   {
4899     DatabaseWindowPntr = new DatabaseWindow ();
4900     if (DatabaseWindowPntr == NULL)
4901     {
4902       DisplayErrorMessage ("Unable to create window.");
4903       g_QuitCountdown = 0;
4904     }
4905     else {
4906       DatabaseWindowPntr->Show (); /* Starts the window's message loop. */
4907     }
4908   }
4909 
4910   g_AppReadyToRunCompleted = true;
4911 }
4912 
4913 
4914 /* Given a mail component (body text, attachment, whatever), look for words in
4915 it.  If the tokenize mode specifies that it isn't one of the ones we are
4916 looking for, just skip it.  For container type components, recursively examine
4917 their contents, up to the maximum depth specified. */
4918 
4919 status_t ABSApp::RecursivelyTokenizeMailComponent (
4920   BMailComponent *ComponentPntr,
4921   const char *OptionalFileName,
4922   set<string> &WordSet,
4923   char *ErrorMessage,
4924   int RecursionLevel,
4925   int MaxRecursionLevel)
4926 {
4927   char                        AttachmentName [B_FILE_NAME_LENGTH];
4928   BMailAttachment            *AttachmentPntr;
4929   BMimeType                   ComponentMIMEType;
4930   BMailContainer             *ContainerPntr;
4931   BMallocIO                   ContentsIO;
4932   const char                 *ContentsBufferPntr;
4933   size_t                      ContentsBufferSize;
4934   status_t                    ErrorCode;
4935   bool                        ExamineComponent;
4936   const char                 *HeaderKeyPntr;
4937   const char                 *HeaderValuePntr;
4938   int                         i;
4939   int                         j;
4940   const char                 *NameExtension;
4941   int                         NumComponents;
4942   BMimeType                   TextAnyMIMEType ("text");
4943   BMimeType                   TextPlainMIMEType ("text/plain");
4944 
4945   if (ComponentPntr == NULL)
4946     return B_OK;
4947 
4948   /* Add things in the sub-headers that might be useful.  Things like the file
4949   name of attachments, the encoding type, etc. */
4950 
4951   if (m_TokenizeMode == TM_PLAIN_TEXT_HEADER ||
4952   m_TokenizeMode == TM_ANY_TEXT_HEADER ||
4953   m_TokenizeMode == TM_ALL_PARTS_HEADER ||
4954   m_TokenizeMode == TM_JUST_HEADER)
4955   {
4956     for (i = 0; i < 1000; i++)
4957     {
4958       HeaderKeyPntr = ComponentPntr->HeaderAt (i);
4959       if (HeaderKeyPntr == NULL)
4960         break;
4961       AddWordsToSet (HeaderKeyPntr, strlen (HeaderKeyPntr),
4962         'H' /* Prefix for Headers, uppercase unlike normal words. */, WordSet);
4963       for (j = 0; j < 1000; j++)
4964       {
4965         HeaderValuePntr = ComponentPntr->HeaderField (HeaderKeyPntr, j);
4966         if (HeaderValuePntr == NULL)
4967           break;
4968         AddWordsToSet (HeaderValuePntr, strlen (HeaderValuePntr),
4969           'H', WordSet);
4970       }
4971     }
4972   }
4973 
4974   /* Check the MIME type of the thing.  It's used to decide if the contents are
4975   worth examining for words. */
4976 
4977   ErrorCode = ComponentPntr->MIMEType (&ComponentMIMEType);
4978   if (ErrorCode != B_OK)
4979   {
4980     sprintf (ErrorMessage, "ABSApp::RecursivelyTokenizeMailComponent: "
4981       "Unable to get MIME type at level %d in \"%s\"",
4982       RecursionLevel, OptionalFileName);
4983     return ErrorCode;
4984   }
4985   if (ComponentMIMEType.Type() == NULL)
4986   {
4987     /* Have to make up a MIME type for things which don't have them, such as
4988     the main body text, otherwise it would get ignored. */
4989 
4990     if (NULL != dynamic_cast<BTextMailComponent *>(ComponentPntr))
4991       ComponentMIMEType.SetType ("text/plain");
4992   }
4993   if (!TextAnyMIMEType.Contains (&ComponentMIMEType) &&
4994   NULL != (AttachmentPntr = dynamic_cast<BMailAttachment *>(ComponentPntr)))
4995   {
4996     /* Sometimes spam doesn't give a text MIME type for text when they do an
4997     attachment (which is often base64 encoded).  Use the file name extension to
4998     see if it really is text. */
4999     NameExtension = NULL;
5000     if (AttachmentPntr->FileName (AttachmentName) >= 0)
5001       NameExtension = strrchr (AttachmentName, '.');
5002     if (NameExtension != NULL)
5003     {
5004       if (strcasecmp (NameExtension, ".txt") == 0)
5005         ComponentMIMEType.SetType ("text/plain");
5006       else if (strcasecmp (NameExtension, ".htm") == 0 ||
5007       strcasecmp (NameExtension, ".html") == 0)
5008         ComponentMIMEType.SetType ("text/html");
5009     }
5010   }
5011 
5012   switch (m_TokenizeMode)
5013   {
5014     case TM_PLAIN_TEXT:
5015     case TM_PLAIN_TEXT_HEADER:
5016       ExamineComponent = TextPlainMIMEType.Contains (&ComponentMIMEType);
5017       break;
5018 
5019     case TM_ANY_TEXT:
5020     case TM_ANY_TEXT_HEADER:
5021       ExamineComponent = TextAnyMIMEType.Contains (&ComponentMIMEType);
5022       break;
5023 
5024     case TM_ALL_PARTS:
5025     case TM_ALL_PARTS_HEADER:
5026       ExamineComponent = true;
5027       break;
5028 
5029     default:
5030       ExamineComponent = false;
5031       break;
5032   }
5033 
5034   if (ExamineComponent)
5035   {
5036     /* Get the contents of the component.  This will be UTF-8 text (converted
5037     from whatever encoding was used) for text attachments.  For other ones,
5038     it's just the raw data, or perhaps decoded from base64 encoding. */
5039 
5040     ContentsIO.SetBlockSize (16 * 1024);
5041     ErrorCode = ComponentPntr->GetDecodedData (&ContentsIO);
5042     if (ErrorCode == B_OK) /* Can fail for container components: no data. */
5043     {
5044       /* Look for words in the decoded data. */
5045 
5046       ContentsBufferPntr = (const char *) ContentsIO.Buffer ();
5047       ContentsBufferSize = ContentsIO.BufferLength ();
5048       if (ContentsBufferPntr != NULL /* can be empty */)
5049         AddWordsToSet (ContentsBufferPntr, ContentsBufferSize,
5050           0 /* no prefix character, this is body text */, WordSet);
5051     }
5052   }
5053 
5054   /* Examine any sub-components in the message. */
5055 
5056   if (RecursionLevel + 1 <= MaxRecursionLevel &&
5057   NULL != (ContainerPntr = dynamic_cast<BMailContainer *>(ComponentPntr)))
5058   {
5059     NumComponents = ContainerPntr->CountComponents ();
5060 
5061     for (i = 0; i < NumComponents; i++)
5062     {
5063       ComponentPntr = ContainerPntr->GetComponent (i);
5064 
5065       ErrorCode = RecursivelyTokenizeMailComponent (ComponentPntr,
5066         OptionalFileName, WordSet, ErrorMessage, RecursionLevel + 1,
5067         MaxRecursionLevel);
5068       if (ErrorCode != B_OK)
5069         break;
5070     }
5071   }
5072 
5073   return ErrorCode;
5074 }
5075 
5076 
5077 /* The user has tried to open a file or several files with this application,
5078 via Tracker's open-with menu item.  If it is a database type file, then change
5079 the database file name to it.  Otherwise, ask the user whether they want to
5080 classify it as spam or non-spam.  There will be at most around 100 files, BeOS
5081 R5.0.3's Tracker crashes if it tries to pass on more than that many using Open
5082 With... etc.  The command is sent to an intermediary thread where it is
5083 asynchronously converted into a scripting message(s) that are sent back to this
5084 BApplication.  The intermediary is needed since we can't recursively execute
5085 scripting messages while processing a message (this RefsReceived one). */
5086 
5087 void ABSApp::RefsReceived (BMessage *MessagePntr)
5088 {
5089   if (g_CommanderLooperPntr != NULL)
5090     g_CommanderLooperPntr->CommandReferences (MessagePntr);
5091 }
5092 
5093 
5094 /* A scripting command is looking for something to execute it.  See if it is
5095 targetted at our database. */
5096 
5097 BHandler * ABSApp::ResolveSpecifier (
5098   BMessage *MessagePntr,
5099   int32 Index,
5100   BMessage *SpecifierMsgPntr,
5101   int32 SpecificationKind,
5102   const char *PropertyPntr)
5103 {
5104   int i;
5105 
5106   /* See if it is one of our commands. */
5107 
5108   if (SpecificationKind == B_DIRECT_SPECIFIER)
5109   {
5110     for (i = PN_MAX - 1; i >= 0; i--)
5111     {
5112       if (strcasecmp (PropertyPntr, g_PropertyNames [i]) == 0)
5113         return this; /* Found it!  Return the Handler (which is us). */
5114     }
5115   }
5116 
5117   /* Handle an unrecognized scripting command, let the parent figure it out. */
5118 
5119   return BApplication::ResolveSpecifier (
5120     MessagePntr, Index, SpecifierMsgPntr, SpecificationKind, PropertyPntr);
5121 }
5122 
5123 
5124 /* Save the database if it hasn't been saved yet.  Otherwise do nothing. */
5125 
5126 status_t ABSApp::SaveDatabaseIfNeeded (char *ErrorMessage)
5127 {
5128   if (m_DatabaseHasChanged)
5129     return LoadSaveDatabase (false /* DoLoad */, ErrorMessage);
5130 
5131   return B_OK;
5132 }
5133 
5134 
5135 /* Presumably the file is an e-mail message (or at least the header portion of
5136 one).  Break it into parts: header, body and MIME components.  Then add the
5137 words in the portions that match the current tokenization settings to the set
5138 of words. */
5139 
5140 status_t ABSApp::TokenizeParts (
5141   BPositionIO *PositionIOPntr,
5142   const char *OptionalFileName,
5143   set<string> &WordSet,
5144   char *ErrorMessage)
5145 {
5146   status_t        ErrorCode = B_OK;
5147   BEmailMessage   WholeEMail;
5148 
5149   sprintf (ErrorMessage, "ABSApp::TokenizeParts: While getting e-mail "
5150     "headers, had problems with \"%s\"", OptionalFileName);
5151 
5152   ErrorCode = WholeEMail.SetToRFC822 (
5153     PositionIOPntr /* it does its own seeking to the start */,
5154     -1 /* length */, true /* parse_now */);
5155   if (ErrorCode < 0) goto ErrorExit;
5156 
5157   ErrorCode = RecursivelyTokenizeMailComponent (&WholeEMail,
5158     OptionalFileName, WordSet, ErrorMessage, 0 /* Initial recursion level */,
5159     (m_TokenizeMode == TM_JUST_HEADER) ? 0 : 500 /* Max recursion level */);
5160 
5161 ErrorExit:
5162   return ErrorCode;
5163 }
5164 
5165 
5166 /* Add all the words in the whole file or memory buffer to the supplied set.
5167 The file doesn't have to be an e-mail message since it isn't parsed for e-mail
5168 headers or MIME headers or anything.  It blindly adds everything that looks
5169 like a word, though it does convert quoted printable codes to the characters
5170 they represent.  See also AddWordsToSet which does something more advanced. */
5171 
5172 status_t ABSApp::TokenizeWhole (
5173   BPositionIO *PositionIOPntr,
5174   const char *OptionalFileName,
5175   set<string> &WordSet,
5176   char *ErrorMessage)
5177 {
5178   string                AccumulatedWord;
5179   uint8                 Buffer [16 * 1024];
5180   uint8                *BufferCurrentPntr = Buffer + 0;
5181   uint8                *BufferEndPntr = Buffer + 0;
5182   const char           *IOErrorString =
5183                           "TokenizeWhole: Error %ld while reading \"%s\"";
5184   size_t                Length;
5185   int                   Letter = ' ';
5186   char                  HexString [4];
5187   int                   NextLetter = ' ';
5188   int                   NextNextLetter = ' ';
5189 
5190   /* Use a buffer since reading single characters from a BFile is so slow.
5191   BufferCurrentPntr is the position of the next character to be read.  When it
5192   reaches BufferEndPntr, it is time to fill the buffer again. */
5193 
5194 #define ReadChar(CharVar) \
5195   { \
5196     if (BufferCurrentPntr < BufferEndPntr) \
5197       CharVar = *BufferCurrentPntr++; \
5198     else /* Try to fill the buffer. */ \
5199     { \
5200       ssize_t AmountRead; \
5201       AmountRead = PositionIOPntr->Read (Buffer, sizeof (Buffer)); \
5202       if (AmountRead < 0) \
5203       { \
5204         sprintf (ErrorMessage, IOErrorString, AmountRead, OptionalFileName); \
5205         return AmountRead; \
5206       } \
5207       else if (AmountRead == 0) \
5208         CharVar = EOF; \
5209       else \
5210       { \
5211         BufferEndPntr = Buffer + AmountRead; \
5212         BufferCurrentPntr = Buffer + 0; \
5213         CharVar = *BufferCurrentPntr++; \
5214       } \
5215     } \
5216   }
5217 
5218   /* Read all the words in the file and add them to our local set of words.  A
5219   set is used since we don't care how many times a word occurs. */
5220 
5221   while (true)
5222   {
5223     /* We read two letters ahead so that we can decode quoted printable
5224     characters (an equals sign followed by two hex digits or a new line).  Note
5225     that Letter can become EOF (-1) when end of file is reached. */
5226 
5227     Letter = NextLetter;
5228     NextLetter = NextNextLetter;
5229     ReadChar (NextNextLetter);
5230 
5231     /* Decode quoted printable codes first, so that the rest of the code just
5232     sees an ordinary character.  Or even nothing, if it is the hidden line
5233     break combination.  This may falsely corrupt stuff following an equals
5234     sign, but usually won't. */
5235 
5236     if (Letter == '=')
5237     {
5238       if ((NextLetter == '\r' && NextNextLetter == '\n') ||
5239       (NextLetter == '\n' && NextNextLetter == '\r'))
5240       {
5241         /* Make the "=\r\n" pair disappear.  It's not even white space. */
5242         ReadChar (NextLetter);
5243         ReadChar (NextNextLetter);
5244         continue;
5245       }
5246       if (NextLetter == '\n' || NextLetter == '\r')
5247       {
5248         /* Make the "=\n" pair disappear.  It's not even white space. */
5249         NextLetter = NextNextLetter;
5250         ReadChar (NextNextLetter);
5251         continue;
5252       }
5253       if (NextNextLetter != EOF &&
5254       isxdigit (NextLetter) && isxdigit (NextNextLetter))
5255       {
5256         /* Convert the hex code to a letter. */
5257         HexString[0] = NextLetter;
5258         HexString[1] = NextNextLetter;
5259         HexString[2] = 0;
5260         Letter = strtoul (HexString, NULL, 16 /* number system base */);
5261         ReadChar (NextLetter);
5262         ReadChar (NextNextLetter);
5263       }
5264     }
5265 
5266     /* Convert to lower case to improve word matches.  Of course this loses a
5267     bit of information, such as MONEY vs Money, an indicator of spam.  Well,
5268     apparently that isn't all that useful a distinction, so do it. */
5269 
5270     if (Letter >= 'A' && Letter < 'Z')
5271       Letter = Letter + ('a' - 'A');
5272 
5273     /* See if it is a letter we treat as white space - all control characters
5274     and all punctuation except for: apostrophe (so "it's" and possessive
5275     versions of words get stored), dash (for hyphenated words), dollar sign
5276     (for cash amounts), period (for IP addresses, we later remove trailing
5277     (periods).  Note that codes above 127 are UTF-8 characters, which we
5278     consider non-space. */
5279 
5280     if (Letter < 0 /* EOF */ || (Letter < 128 && g_SpaceCharacters[Letter]))
5281     {
5282       /* That space finished off a word.  Remove trailing periods... */
5283 
5284       while ((Length = AccumulatedWord.size()) > 0 &&
5285       AccumulatedWord [Length-1] == '.')
5286         AccumulatedWord.resize (Length - 1);
5287 
5288       /* If there's anything left in the word, add it to the set.  Also ignore
5289       words which are too big (it's probably some binary encoded data).  But
5290       leave room for supercalifragilisticexpialidoceous.  According to one web
5291       site, pneumonoultramicroscopicsilicovolcanoconiosis is the longest word
5292       currently in English.  Note that some uuencoded data was seen with a 60
5293       character line length. */
5294 
5295       if (Length > 0 && Length <= g_MaxWordLength)
5296         WordSet.insert (AccumulatedWord);
5297 
5298       /* Empty out the string to get ready for the next word. */
5299 
5300       AccumulatedWord.resize (0);
5301     }
5302     else /* Not a space-like character, add it to the word. */
5303       AccumulatedWord.append (1 /* one copy of the char */, (char) Letter);
5304 
5305     /* Stop at end of file or error.  Don't care which.  Exit here so that last
5306     word got processed. */
5307 
5308     if (Letter == EOF)
5309       break;
5310   }
5311 
5312   return B_OK;
5313 }
5314 
5315 
5316 
/******************************************************************************
 * Implementation of the ClassificationChoicesWindow class, constructor,
 * destructor and the rest of the member functions in mostly alphabetical
 * order.
 */
5322 
5323 ClassificationChoicesWindow::ClassificationChoicesWindow (
5324   BRect FrameRect,
5325   const char *FileName,
5326   int NumberOfFiles)
5327 : BWindow (FrameRect, "Classification Choices", B_TITLED_WINDOW,
5328     B_NOT_ZOOMABLE | B_NOT_RESIZABLE | B_ASYNCHRONOUS_CONTROLS),
5329   m_BulkModeSelectedPntr (NULL),
5330   m_ChoosenClassificationPntr (NULL)
5331 {
5332   ClassificationChoicesView *SubViewPntr;
5333 
5334   SubViewPntr = new ClassificationChoicesView (Bounds(),
5335     FileName, NumberOfFiles);
5336   AddChild (SubViewPntr);
5337   SubViewPntr->ResizeToPreferred ();
5338   ResizeTo (SubViewPntr->Frame().Width(), SubViewPntr->Frame().Height());
5339 }
5340 
5341 
5342 void ClassificationChoicesWindow::MessageReceived (BMessage *MessagePntr)
5343 {
5344   BControl *ControlPntr;
5345 
5346   if (MessagePntr->what >= MSG_CLASS_BUTTONS &&
5347   MessagePntr->what < MSG_CLASS_BUTTONS + CL_MAX)
5348   {
5349     if (m_ChoosenClassificationPntr != NULL)
5350       *m_ChoosenClassificationPntr =
5351         (ClassificationTypes) (MessagePntr->what - MSG_CLASS_BUTTONS);
5352     PostMessage (B_QUIT_REQUESTED); // Close and destroy the window.
5353     return;
5354   }
5355 
5356   if (MessagePntr->what == MSG_BULK_CHECKBOX)
5357   {
5358     if (m_BulkModeSelectedPntr != NULL &&
5359     MessagePntr->FindPointer ("source", (void **) &ControlPntr) == B_OK)
5360       *m_BulkModeSelectedPntr = (ControlPntr->Value() == B_CONTROL_ON);
5361     return;
5362   }
5363 
5364   if (MessagePntr->what == MSG_CANCEL_BUTTON)
5365   {
5366     PostMessage (B_QUIT_REQUESTED); // Close and destroy the window.
5367     return;
5368   }
5369 
5370   BWindow::MessageReceived (MessagePntr);
5371 }
5372 
5373 
5374 void ClassificationChoicesWindow::Go (
5375   bool *BulkModeSelectedPntr,
5376   ClassificationTypes *ChoosenClassificationPntr)
5377 {
5378   status_t  ErrorCode = 0;
5379   BView    *MainViewPntr;
5380   thread_id WindowThreadID;
5381 
5382   m_BulkModeSelectedPntr = BulkModeSelectedPntr;
5383   m_ChoosenClassificationPntr = ChoosenClassificationPntr;
5384   if (m_ChoosenClassificationPntr != NULL)
5385     *m_ChoosenClassificationPntr = CL_MAX;
5386 
5387   Show (); // Starts the window thread running.
5388 
5389   /* Move the window to the center of the screen it is now being displayed on
5390   (have to wait for it to be showing). */
5391 
5392   Lock ();
5393   MainViewPntr = FindView ("ClassificationChoicesView");
5394   if (MainViewPntr != NULL)
5395   {
5396     BRect   TempRect;
5397     BScreen TempScreen (this);
5398     float   X;
5399     float   Y;
5400 
5401     TempRect = TempScreen.Frame ();
5402     X = TempRect.Width() / 2;
5403     Y = TempRect.Height() / 2;
5404     TempRect = MainViewPntr->Frame();
5405     X -= TempRect.Width() / 2;
5406     Y -= TempRect.Height() / 2;
5407     MoveTo (ceilf (X), ceilf (Y));
5408   }
5409   Unlock ();
5410 
5411   /* Wait for the window to go away. */
5412 
5413   WindowThreadID = Thread ();
5414   if (WindowThreadID >= 0)
5415     // Delay until the window thread has died, presumably window deleted now.
5416     wait_for_thread (WindowThreadID, &ErrorCode);
5417 }
5418 
5419 
5420 
5421 /******************************************************************************
5422  * Implementation of the ClassificationChoicesView class, constructor,
5423  * destructor and the rest of the member functions in mostly alphabetical
5424  * order.
5425  */
5426 
5427 ClassificationChoicesView::ClassificationChoicesView (
5428   BRect FrameRect,
5429   const char *FileName,
5430   int NumberOfFiles)
5431 : BView (FrameRect, "ClassificationChoicesView",
5432     B_FOLLOW_TOP | B_FOLLOW_LEFT, B_WILL_DRAW | B_NAVIGABLE_JUMP),
5433   m_FileName (FileName),
5434   m_NumberOfFiles (NumberOfFiles),
5435   m_PreferredBottomY (ceilf (g_ButtonHeight * 10))
5436 {
5437 }
5438 
5439 
5440 void ClassificationChoicesView::AttachedToWindow ()
5441 {
5442   BButton            *ButtonPntr;
5443   BCheckBox          *CheckBoxPntr;
5444   ClassificationTypes Classification;
5445   float               Margin;
5446   float               RowHeight;
5447   float               RowTop;
5448   BTextView          *TextViewPntr;
5449   BRect               TempRect;
5450   char                TempString [2048];
5451   BRect               TextRect;
5452   float               X;
5453 
5454   SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
5455 
5456   RowHeight = g_ButtonHeight;
5457   if (g_CheckBoxHeight > RowHeight)
5458     RowHeight = g_CheckBoxHeight;
5459   RowHeight = ceilf (RowHeight * 1.1);
5460 
5461   TempRect = Bounds ();
5462   RowTop = TempRect.top;
5463 
5464   /* Show the file name text. */
5465 
5466   Margin = ceilf ((RowHeight - g_StringViewHeight) / 2);
5467   TempRect = Bounds ();
5468   TempRect.top = RowTop + Margin;
5469   TextRect = TempRect;
5470   TextRect.OffsetTo (0, 0);
5471   TextRect.InsetBy (g_MarginBetweenControls, 2);
5472   sprintf (TempString, "How do you want to classify the file named \"%s\"?",
5473     m_FileName);
5474   TextViewPntr = new BTextView (TempRect, "FileText", TextRect,
5475     B_FOLLOW_TOP | B_FOLLOW_LEFT, B_WILL_DRAW | B_FULL_UPDATE_ON_RESIZE);
5476   AddChild (TextViewPntr);
5477   TextViewPntr->SetText (TempString);
5478   TextViewPntr->MakeEditable (false);
5479   TextViewPntr->SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
5480   TextViewPntr->ResizeTo (TempRect.Width (),
5481     3 + TextViewPntr->TextHeight (0, sizeof (TempString)));
5482   RowTop = TextViewPntr->Frame().bottom + Margin;
5483 
5484   /* Make the classification buttons. */
5485 
5486   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
5487   TempRect = Bounds ();
5488   TempRect.top = RowTop + Margin;
5489   X = Bounds().left + g_MarginBetweenControls;
5490   for (Classification = (ClassificationTypes) 0; Classification < CL_MAX;
5491   Classification = (ClassificationTypes) ((int) Classification + 1))
5492   {
5493     TempRect = Bounds ();
5494     TempRect.top = RowTop + Margin;
5495     TempRect.left = X;
5496     sprintf (TempString, "%s Button",
5497       g_ClassificationTypeNames [Classification]);
5498     ButtonPntr = new BButton (TempRect, TempString,
5499       g_ClassificationTypeNames [Classification], new BMessage (
5500       ClassificationChoicesWindow::MSG_CLASS_BUTTONS + Classification));
5501     AddChild (ButtonPntr);
5502     ButtonPntr->ResizeToPreferred ();
5503     X = ButtonPntr->Frame().right + 3 * g_MarginBetweenControls;
5504   }
5505   RowTop += ceilf (RowHeight * 1.2);
5506 
5507   /* Make the Cancel button. */
5508 
5509   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
5510   TempRect = Bounds ();
5511   TempRect.top = RowTop + Margin;
5512   TempRect.left += g_MarginBetweenControls;
5513   ButtonPntr = new BButton (TempRect, "Cancel Button",
5514     "Cancel", new BMessage (ClassificationChoicesWindow::MSG_CANCEL_BUTTON));
5515   AddChild (ButtonPntr);
5516   ButtonPntr->ResizeToPreferred ();
5517   X = ButtonPntr->Frame().right + g_MarginBetweenControls;
5518 
5519   /* Make the checkbox for bulk operations. */
5520 
5521   if (m_NumberOfFiles > 1)
5522   {
5523     Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
5524     TempRect = Bounds ();
5525     TempRect.top = RowTop + Margin;
5526     TempRect.left = X;
5527     sprintf (TempString, "Mark all %d remaining messages the same way.",
5528       m_NumberOfFiles - 1);
5529     CheckBoxPntr = new BCheckBox (TempRect, "BulkBox", TempString,
5530       new BMessage (ClassificationChoicesWindow::MSG_BULK_CHECKBOX));
5531     AddChild (CheckBoxPntr);
5532     CheckBoxPntr->ResizeToPreferred ();
5533   }
5534   RowTop += RowHeight;
5535 
5536   m_PreferredBottomY = RowTop;
5537 }
5538 
5539 
5540 void ClassificationChoicesView::GetPreferredSize (float *width, float *height)
5541 {
5542   if (width != NULL)
5543     *width = Bounds().Width();
5544   if (height != NULL)
5545     *height = m_PreferredBottomY;
5546 }
5547 
5548 
5549 
5550 /******************************************************************************
5551  * Implementation of the CommanderLooper class, constructor, destructor and the
5552  * rest of the member functions in mostly alphabetical order.
5553  */
5554 
5555 CommanderLooper::CommanderLooper ()
5556 : BLooper ("CommanderLooper", B_NORMAL_PRIORITY),
5557   m_IsBusy (false)
5558 {
5559 }
5560 
5561 
5562 CommanderLooper::~CommanderLooper ()
5563 {
5564   g_CommanderLooperPntr = NULL;
5565   delete g_CommanderMessenger;
5566   g_CommanderMessenger = NULL;
5567 }
5568 
5569 
5570 /* Process some command line arguments.  Basically just send a message to this
5571 looper itself to do the work later.  That way the caller can continue doing
5572 whatever they're doing, particularly if it's the BApplication. */
5573 
5574 void CommanderLooper::CommandArguments (int argc, char **argv)
5575 {
5576   int      i;
5577   BMessage InternalMessage;
5578 
5579   InternalMessage.what = MSG_COMMAND_ARGUMENTS;
5580   for (i = 0; i < argc; i++)
5581     InternalMessage.AddString ("arg", argv[i]);
5582 
5583   PostMessage (&InternalMessage);
5584 }
5585 
5586 
5587 /* Copy the refs out of the given message and stuff them into an internal
5588 message to ourself (so that the original message can be returned to the caller,
5589 and if it is Tracker, it can close the file handles it has open).  Optionally
5590 allow preset classification rather than asking the user (set BulkMode to TRUE
5591 and specify the class with BulkClassification). */
5592 
5593 void CommanderLooper::CommandReferences (
5594   BMessage *MessagePntr,
5595   bool BulkMode,
5596   ClassificationTypes BulkClassification)
5597 {
5598   entry_ref EntryRef;
5599   int       i;
5600   BMessage  InternalMessage;
5601 
5602   InternalMessage.what = MSG_COMMAND_FILE_REFS;
5603   for (i = 0; MessagePntr->FindRef ("refs", i, &EntryRef) == B_OK; i++)
5604     InternalMessage.AddRef ("refs", &EntryRef);
5605   InternalMessage.AddBool ("BulkMode", BulkMode);
5606   InternalMessage.AddInt32 ("BulkClassification", BulkClassification);
5607 
5608   PostMessage (&InternalMessage);
5609 }
5610 
5611 
5612 /* This function is called by other threads to see if the CommanderLooper is
5613 busy working on something. */
5614 
5615 bool CommanderLooper::IsBusy ()
5616 {
5617   if (m_IsBusy)
5618     return true;
5619 
5620   if (IsLocked () || !MessageQueue()->IsEmpty ())
5621     return true;
5622 
5623   return false;
5624 }
5625 
5626 
5627 void CommanderLooper::MessageReceived (BMessage *MessagePntr)
5628 {
5629   m_IsBusy = true;
5630 
5631   if (MessagePntr->what == MSG_COMMAND_ARGUMENTS)
5632     ProcessArgs (MessagePntr);
5633   else if (MessagePntr->what == MSG_COMMAND_FILE_REFS)
5634     ProcessRefs (MessagePntr);
5635   else
5636     BLooper::MessageReceived (MessagePntr);
5637 
5638   m_IsBusy = false;
5639 }
5640 
5641 
5642 /* Process the command line by converting it into a series of scripting
5643 messages (possibly thousands) and sent them to the BApplication synchronously
5644 (so we can print the result). */
5645 
5646 void CommanderLooper::ProcessArgs (BMessage *MessagePntr)
5647 {
5648   int32                 argc = 0;
5649   const char          **argv = NULL;
5650   int                   ArgumentIndex;
5651   uint32                CommandCode;
5652   const char           *CommandWord;
5653   status_t              ErrorCode;
5654   const char           *ErrorTitle = "ProcessArgs";
5655   char                 *EndPntr;
5656   int32                 i;
5657   BMessage              ReplyMessage;
5658   BMessage              ScriptMessage;
5659   struct property_info *PropInfoPntr;
5660   const char           *PropertyName;
5661   bool                  TempBool;
5662   float                 TempFloat;
5663   int32                 TempInt32;
5664   const char           *TempStringPntr;
5665   type_code             TypeCode;
5666   const char           *ValuePntr;
5667 
5668   /* Get the argument count and pointers to arguments out of the message and
5669   into our argc and argv. */
5670 
5671   ErrorCode = MessagePntr->GetInfo ("arg", &TypeCode, &argc);
5672   if (ErrorCode != B_OK || TypeCode != B_STRING_TYPE)
5673   {
5674     DisplayErrorMessage ("Unable to find argument strings in message",
5675       ErrorCode, ErrorTitle);
5676     goto ErrorExit;
5677   }
5678 
5679   if (argc < 2)
5680   {
5681     cerr << PrintUsage;
5682     DisplayErrorMessage ("You need to specify a command word, like GET, SET "
5683       "and so on followed by a property, like DatabaseFile, and maybe "
5684       "followed by a value of some sort", -1, ErrorTitle);
5685     goto ErrorExit;
5686   }
5687 
5688   argv = (const char **) malloc (sizeof (char *) * argc);
5689   if (argv == NULL)
5690   {
5691     DisplayErrorMessage ("Out of memory when allocating argv array",
5692       ENOMEM, ErrorTitle);
5693     goto ErrorExit;
5694   }
5695 
5696   for (i = 0; i < argc; i++)
5697   {
5698     if ((ErrorCode = MessagePntr->FindString ("arg", i, &argv[i])) != B_OK)
5699     {
5700       DisplayErrorMessage ("Unable to find argument in the BMessage",
5701         ErrorCode, ErrorTitle);
5702       goto ErrorExit;
5703     }
5704   }
5705 
5706   CommandWord = argv[1];
5707 
5708   /* Special case for the Quit command since it isn't a scripting command. */
5709 
5710   if (strcasecmp (CommandWord, "quit") == 0)
5711   {
5712     g_QuitCountdown = 10;
5713     goto ErrorExit;
5714   }
5715 
5716   /* Find the corresponding scripting command. */
5717 
5718   if (strcasecmp (CommandWord, "set") == 0)
5719     CommandCode = B_SET_PROPERTY;
5720   else if (strcasecmp (CommandWord, "get") == 0)
5721     CommandCode = B_GET_PROPERTY;
5722   else if (strcasecmp (CommandWord, "count") == 0)
5723     CommandCode = B_COUNT_PROPERTIES;
5724   else if (strcasecmp (CommandWord, "create") == 0)
5725     CommandCode = B_CREATE_PROPERTY;
5726   else if (strcasecmp (CommandWord, "delete") == 0)
5727     CommandCode = B_DELETE_PROPERTY;
5728   else
5729     CommandCode = B_EXECUTE_PROPERTY;
5730 
5731   if (CommandCode == B_EXECUTE_PROPERTY)
5732   {
5733     PropertyName = CommandWord;
5734     ArgumentIndex = 2; /* Arguments to the command start at this index. */
5735   }
5736   else
5737   {
5738     if (CommandCode == B_SET_PROPERTY)
5739     {
5740       /* SET commands require at least one argument value. */
5741       if (argc < 4)
5742       {
5743         cerr << PrintUsage;
5744         DisplayErrorMessage ("SET commands require at least one "
5745           "argument value after the property name", -1, ErrorTitle);
5746         goto ErrorExit;
5747       }
5748     }
5749     else
5750       if (argc < 3)
5751       {
5752         cerr << PrintUsage;
5753         DisplayErrorMessage ("You need to specify a property to act on",
5754           -1, ErrorTitle);
5755         goto ErrorExit;
5756       }
5757     PropertyName = argv[2];
5758     ArgumentIndex = 3;
5759   }
5760 
5761   /* See if it is one of our commands. */
5762 
5763   for (PropInfoPntr = g_ScriptingPropertyList + 0; true; PropInfoPntr++)
5764   {
5765     if (PropInfoPntr->name == 0)
5766     {
5767       cerr << PrintUsage;
5768       DisplayErrorMessage ("The property specified isn't known or "
5769         "doesn't support the requested action (usually means it is an "
5770         "unknown command)", -1, ErrorTitle);
5771       goto ErrorExit; /* Unrecognized command. */
5772     }
5773 
5774     if (PropInfoPntr->commands[0] == CommandCode &&
5775     strcasecmp (PropertyName, PropInfoPntr->name) == 0)
5776       break;
5777   }
5778 
5779   /* Make the equivalent command message.  For commands with multiple
5780   arguments, repeat the message for each single argument and just change the
5781   data portion for each extra argument.  Send the command and wait for a reply,
5782   which we'll print out. */
5783 
5784   ScriptMessage.MakeEmpty ();
5785   ScriptMessage.what = CommandCode;
5786   ScriptMessage.AddSpecifier (PropertyName);
5787   while (true)
5788   {
5789     if (ArgumentIndex < argc) /* If there are arguments to be added. */
5790     {
5791       ValuePntr = argv[ArgumentIndex];
5792 
5793       /* Convert the value into the likely kind of data. */
5794 
5795       if (strcasecmp (ValuePntr, "yes") == 0 ||
5796       strcasecmp (ValuePntr, "true") == 0)
5797         ScriptMessage.AddBool (g_DataName, true);
5798       else if (strcasecmp (ValuePntr, "no") == 0 ||
5799       strcasecmp (ValuePntr, "false") == 0)
5800         ScriptMessage.AddBool (g_DataName, false);
5801       else
5802       {
5803         /* See if it is a number. */
5804         i = strtol (ValuePntr, &EndPntr, 0);
5805         if (*EndPntr == 0)
5806           ScriptMessage.AddInt32 (g_DataName, i);
5807         else /* Nope, it's just a string. */
5808           ScriptMessage.AddString (g_DataName, ValuePntr);
5809       }
5810     }
5811 
5812     ErrorCode = be_app_messenger.SendMessage (&ScriptMessage, &ReplyMessage);
5813     if (ErrorCode != B_OK)
5814     {
5815       DisplayErrorMessage ("Unable to send scripting command",
5816         ErrorCode, ErrorTitle);
5817       goto ErrorExit;
5818     }
5819 
5820     /* Print the reply to the scripting command.  Even in server mode.  To
5821     standard output. */
5822 
5823     if (ReplyMessage.FindString ("CommandText", &TempStringPntr) == B_OK)
5824     {
5825       TempInt32 = -1;
5826       if (ReplyMessage.FindInt32 ("error", &TempInt32) == B_OK &&
5827       TempInt32 == B_OK)
5828       {
5829         /* It's a successful reply to one of our scripting messages.  Print out
5830         the returned values code for command line users to see. */
5831 
5832         cout << "Result of command to " << TempStringPntr << " is:\t";
5833         if (ReplyMessage.FindString (g_ResultName, &TempStringPntr) == B_OK)
5834           cout << "\"" << TempStringPntr << "\"";
5835         else if (ReplyMessage.FindInt32 (g_ResultName, &TempInt32) == B_OK)
5836           cout << TempInt32;
5837         else if (ReplyMessage.FindFloat (g_ResultName, &TempFloat) == B_OK)
5838           cout << TempFloat;
5839         else if (ReplyMessage.FindBool (g_ResultName, &TempBool) == B_OK)
5840           cout << (TempBool ? "true" : "false");
5841         else
5842           cout << "just plain success";
5843         if (ReplyMessage.FindInt32 ("count", &TempInt32) == B_OK)
5844           cout << "\t(count " << TempInt32 << ")";
5845         for (i = 0; (i < 50) &&
5846         ReplyMessage.FindString ("words", i, &TempStringPntr) == B_OK &&
5847         ReplyMessage.FindFloat ("ratios", i, &TempFloat) == B_OK;
5848         i++)
5849         {
5850           if (i == 0)
5851             cout << "\twith top words:\t";
5852           else
5853             cout << "\t";
5854           cout << TempStringPntr << "/" << TempFloat;
5855         }
5856         cout << endl;
5857       }
5858       else /* An error reply, print out the error, even in server mode. */
5859       {
5860         cout << "Failure of command " << TempStringPntr << ", error ";
5861         cout << TempInt32 << " (" << strerror (TempInt32) << ")";
5862         if (ReplyMessage.FindString ("message", &TempStringPntr) == B_OK)
5863           cout << ", message: " << TempStringPntr;
5864         cout << "." << endl;
5865       }
5866     }
5867 
5868     /* Advance to the next argument and its scripting message. */
5869 
5870     ScriptMessage.RemoveName (g_DataName);
5871     if (++ArgumentIndex >= argc)
5872       break;
5873   }
5874 
5875 ErrorExit:
5876   free (argv);
5877 }
5878 
5879 
5880 /* Given a bunch of references to files, open the files.  If it's a database
5881 file, switch to using it as a database.  Otherwise, treat them as text files
5882 and add them to the database.  Prompt the user for the spam or genuine or
5883 uncertain (declassification) choice, with the option to bulk mark many files at
5884 once. */
5885 
5886 void CommanderLooper::ProcessRefs (BMessage *MessagePntr)
5887 {
5888   bool                         BulkMode = false;
5889   ClassificationTypes          BulkClassification = CL_GENUINE;
5890   ClassificationChoicesWindow *ChoiceWindowPntr;
5891   BEntry                       Entry;
5892   entry_ref                    EntryRef;
5893   status_t                     ErrorCode;
5894   const char                  *ErrorTitle = "CommanderLooper::ProcessRefs";
5895   int32                        NumberOfRefs = 0;
5896   BPath                        Path;
5897   int                          RefIndex;
5898   BMessage                     ReplyMessage;
5899   BMessage                     ScriptingMessage;
5900   bool                         TempBool;
5901   BFile                        TempFile;
5902   int32                        TempInt32;
5903   char                         TempString [PATH_MAX + 1024];
5904   type_code                    TypeCode;
5905 
5906   // Wait for ReadyToRun to finish initializing the globals with the sizes of
5907   // the controls, since they are needed when we show the custom alert box for
5908   // choosing the message type.
5909 
5910   TempInt32 = 0;
5911   while (!g_AppReadyToRunCompleted && TempInt32++ < 10)
5912     snooze (200000);
5913 
5914   ErrorCode = MessagePntr->GetInfo ("refs", &TypeCode, &NumberOfRefs);
5915   if (ErrorCode != B_OK || TypeCode != B_REF_TYPE || NumberOfRefs <= 0)
5916   {
5917     DisplayErrorMessage ("Unable to get refs from the message",
5918       ErrorCode, ErrorTitle);
5919     return;
5920   }
5921 
5922   if (MessagePntr->FindBool ("BulkMode", &TempBool) == B_OK)
5923     BulkMode = TempBool;
5924   if (MessagePntr->FindInt32 ("BulkClassification", &TempInt32) == B_OK &&
5925   TempInt32 >= 0 && TempInt32 < CL_MAX)
5926     BulkClassification = (ClassificationTypes) TempInt32;
5927 
5928   for (RefIndex = 0;
5929   MessagePntr->FindRef ("refs", RefIndex, &EntryRef) == B_OK;
5930   RefIndex++)
5931   {
5932     ScriptingMessage.MakeEmpty ();
5933     ScriptingMessage.what = 0; /* Haven't figured out what to do yet. */
5934 
5935     /* See if the entry is a valid file or directory or other thing. */
5936 
5937     ErrorCode = Entry.SetTo (&EntryRef, true /* traverse symbolic links */);
5938     if (ErrorCode != B_OK ||
5939     ((ErrorCode = /* assignment */ B_ENTRY_NOT_FOUND) != 0 /* this pacifies
5940     mwcc -nwhitehorn */ && !Entry.Exists ()) ||
5941     ((ErrorCode = Entry.GetPath (&Path)) != B_OK))
5942     {
5943       DisplayErrorMessage ("Bad entry reference encountered, will skip it",
5944         ErrorCode, ErrorTitle);
5945       BulkMode = false;
5946       continue; /* Bad file reference, try the next one. */
5947     }
5948 
5949     /* If it's a file, check if it is a spam database file.  Go by the magic
5950     text at the start of the file, in case someone has edited the file with a
5951     spreadsheet or other tool and lost the MIME type. */
5952 
5953     if (Entry.IsFile ())
5954     {
5955       ErrorCode = TempFile.SetTo (&Entry, B_READ_ONLY);
5956       if (ErrorCode != B_OK)
5957       {
5958         sprintf (TempString, "Unable to open file \"%s\" for reading, will "
5959           "skip it", Path.Path ());
5960         DisplayErrorMessage (TempString, ErrorCode, ErrorTitle);
5961         BulkMode = false;
5962         continue;
5963       }
5964       if (TempFile.Read (TempString, strlen (g_DatabaseRecognitionString)) ==
5965       (int) strlen (g_DatabaseRecognitionString) && strncmp (TempString,
5966       g_DatabaseRecognitionString, strlen (g_DatabaseRecognitionString)) == 0)
5967       {
5968         ScriptingMessage.what = B_SET_PROPERTY;
5969         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
5970         ScriptingMessage.AddString (g_DataName, Path.Path ());
5971       }
5972       TempFile.Unset ();
5973     }
5974 
5975     /* Not a database file.  Could be a directory or a file.  Submit it as
5976     something to be marked spam or genuine. */
5977 
5978     if (ScriptingMessage.what == 0)
5979     {
5980       if (!Entry.IsFile ())
5981       {
5982         sprintf (TempString, "\"%s\" is not a file, can't do anything with it",
5983           Path.Path ());
5984         DisplayErrorMessage (TempString, -1, ErrorTitle);
5985         BulkMode = false;
5986         continue;
5987       }
5988 
5989       if (!BulkMode) /* Have to ask the user. */
5990       {
5991         ChoiceWindowPntr = new ClassificationChoicesWindow (
5992           BRect (40, 40, 40 + 50 * g_MarginBetweenControls,
5993           40 + g_ButtonHeight * 5), Path.Path (), NumberOfRefs - RefIndex);
5994         ChoiceWindowPntr->Go (&BulkMode, &BulkClassification);
5995         if (BulkClassification == CL_MAX)
5996           break; /* Cancel was picked. */
5997       }
5998 
5999       /* Format the command for classifying the file. */
6000 
6001       ScriptingMessage.what = B_SET_PROPERTY;
6002 
6003       if (BulkClassification == CL_GENUINE)
6004         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_GENUINE]);
6005       else if (BulkClassification == CL_SPAM)
6006         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_SPAM]);
6007       else if (BulkClassification == CL_UNCERTAIN)
6008         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_UNCERTAIN]);
6009       else /* Broken code */
6010         break;
6011       ScriptingMessage.AddString (g_DataName, Path.Path ());
6012     }
6013 
6014     /* Tell the BApplication to do the work, and wait for it to finish.  The
6015     BApplication will display any error messages for us. */
6016 
6017     ErrorCode =
6018       be_app_messenger.SendMessage (&ScriptingMessage, &ReplyMessage);
6019     if (ErrorCode != B_OK)
6020     {
6021       DisplayErrorMessage ("Unable to send scripting command",
6022         ErrorCode, ErrorTitle);
6023       return;
6024     }
6025 
6026     /* If there was an error, allow the user to stop by switching off bulk
6027     mode.  The message will already have been displayed in an alert box, if
6028     server mode is off. */
6029 
6030     if (ReplyMessage.FindInt32 ("error", &TempInt32) != B_OK ||
6031     TempInt32 != B_OK)
6032       BulkMode = false;
6033   }
6034 }
6035 
6036 
6037 
6038 /******************************************************************************
6039  * Implementation of the ControlsView class, constructor, destructor and the
6040  * rest of the member functions in mostly alphabetical order.
6041  */
6042 
6043 ControlsView::ControlsView (BRect NewBounds)
6044 : BView (NewBounds, "ControlsView", B_FOLLOW_TOP | B_FOLLOW_LEFT_RIGHT,
6045     B_WILL_DRAW | B_PULSE_NEEDED | B_NAVIGABLE_JUMP | B_FRAME_EVENTS),
6046   m_AboutButtonPntr (NULL),
6047   m_AddExampleButtonPntr (NULL),
6048   m_BrowseButtonPntr (NULL),
6049   m_BrowseFilePanelPntr (NULL),
6050   m_CreateDatabaseButtonPntr (NULL),
6051   m_DatabaseFileNameTextboxPntr (NULL),
6052   m_DatabaseLoadDone (false),
6053   m_EstimateSpamButtonPntr (NULL),
6054   m_EstimateSpamFilePanelPntr (NULL),
6055   m_GenuineCountTextboxPntr (NULL),
6056   m_IgnorePreviousClassCheckboxPntr (NULL),
6057   m_InstallThingsButtonPntr (NULL),
6058   m_PurgeAgeTextboxPntr (NULL),
6059   m_PurgeButtonPntr (NULL),
6060   m_PurgePopularityTextboxPntr (NULL),
6061   m_ResetToDefaultsButtonPntr (NULL),
6062   m_ScoringModeMenuBarPntr (NULL),
6063   m_ScoringModePopUpMenuPntr (NULL),
6064   m_ServerModeCheckboxPntr (NULL),
6065   m_SpamCountTextboxPntr (NULL),
6066   m_TimeOfLastPoll (0),
6067   m_TokenizeModeMenuBarPntr (NULL),
6068   m_TokenizeModePopUpMenuPntr (NULL),
6069   m_WordCountTextboxPntr (NULL)
6070 {
6071 }
6072 
6073 
6074 ControlsView::~ControlsView ()
6075 {
6076   if (m_BrowseFilePanelPntr != NULL)
6077   {
6078     delete m_BrowseFilePanelPntr;
6079     m_BrowseFilePanelPntr = NULL;
6080   }
6081 
6082   if (m_EstimateSpamFilePanelPntr != NULL)
6083   {
6084     delete m_EstimateSpamFilePanelPntr;
6085     m_EstimateSpamFilePanelPntr = NULL;
6086   }
6087 }
6088 
6089 
6090 void ControlsView::AttachedToWindow ()
6091 {
6092   float         BigPurgeButtonTop;
6093   BMessage      CommandMessage;
6094   const char   *EightDigitsString = " 12345678 ";
6095   float         Height;
6096   float         Margin;
6097   float         RowHeight;
6098   float         RowTop;
6099   ScoringModes  ScoringMode;
6100   char         *StringPntr;
6101   BMenuItem    *TempMenuItemPntr;
6102   BRect         TempRect;
6103   char          TempString [PATH_MAX];
6104   TokenizeModes TokenizeMode;
6105   float         Width;
6106   float         X;
6107 
6108   SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
6109 
6110   TempRect = Bounds ();
6111   X = TempRect.right;
6112   RowTop = TempRect.top;
6113   RowHeight = g_ButtonHeight;
6114   if (g_TextBoxHeight > RowHeight)
6115     RowHeight = g_TextBoxHeight;
6116   RowHeight = ceilf (RowHeight * 1.1);
6117 
6118   /* Make the Create button at the far right of the first row of controls,
6119   which are all database file related. */
6120 
6121   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6122   TempRect = Bounds ();
6123   TempRect.top = RowTop + Margin;
6124   TempRect.bottom = TempRect.top + g_ButtonHeight;
6125 
6126   CommandMessage.MakeEmpty ();
6127   CommandMessage.what = B_CREATE_PROPERTY;
6128   CommandMessage.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
6129   m_CreateDatabaseButtonPntr = new BButton (TempRect, "Create Button",
6130     "Create", new BMessage (CommandMessage), B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6131   if (m_CreateDatabaseButtonPntr == NULL) goto ErrorExit;
6132   AddChild (m_CreateDatabaseButtonPntr);
6133   m_CreateDatabaseButtonPntr->SetTarget (be_app);
6134   m_CreateDatabaseButtonPntr->ResizeToPreferred ();
6135   m_CreateDatabaseButtonPntr->GetPreferredSize (&Width, &Height);
6136   m_CreateDatabaseButtonPntr->MoveTo (X - Width, TempRect.top);
6137   X -= Width + g_MarginBetweenControls;
6138 
6139   /* Make the Browse button, middle of the first row. */
6140 
6141   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6142   TempRect = Bounds ();
6143   TempRect.top = RowTop + Margin;
6144   TempRect.bottom = TempRect.top + g_ButtonHeight;
6145 
6146   m_BrowseButtonPntr = new BButton (TempRect, "Browse Button",
6147     "Browse…", new BMessage (MSG_BROWSE_BUTTON), B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6148   if (m_BrowseButtonPntr == NULL) goto ErrorExit;
6149   AddChild (m_BrowseButtonPntr);
6150   m_BrowseButtonPntr->SetTarget (this);
6151   m_BrowseButtonPntr->ResizeToPreferred ();
6152   m_BrowseButtonPntr->GetPreferredSize (&Width, &Height);
6153   m_BrowseButtonPntr->MoveTo (X - Width, TempRect.top);
6154   X -= Width + g_MarginBetweenControls;
6155 
6156   /* Fill the rest of the space on the first row with the file name box. */
6157 
6158   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6159   TempRect = Bounds ();
6160   TempRect.top = RowTop + Margin;
6161   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6162   TempRect.right = X;
6163 
6164   StringPntr = "Word Database:";
6165   strcpy (m_DatabaseFileNameCachedValue, "Unknown...");
6166   m_DatabaseFileNameTextboxPntr = new BTextControl (TempRect,
6167     "File Name",
6168     StringPntr /* label */,
6169     m_DatabaseFileNameCachedValue /* text */,
6170     new BMessage (MSG_DATABASE_NAME),
6171     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP,
6172     B_WILL_DRAW | B_NAVIGABLE | B_NAVIGABLE_JUMP);
6173   AddChild (m_DatabaseFileNameTextboxPntr);
6174   m_DatabaseFileNameTextboxPntr->SetTarget (this);
6175   m_DatabaseFileNameTextboxPntr->SetDivider (
6176     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6177 
6178   /* Second row contains the purge age, and a long line explaining it.  There
6179   is space to the right where the top half of the big purge button will go. */
6180 
6181   RowTop += RowHeight /* previous row's RowHeight */;
6182   BigPurgeButtonTop = RowTop;
6183   TempRect = Bounds ();
6184   X = TempRect.left;
6185   RowHeight = g_TextBoxHeight;
6186   RowHeight = ceilf (RowHeight * 1.1);
6187 
6188   StringPntr = "Number of occurrences needed to store a word:";
6189   m_PurgeAgeCachedValue = 12345678;
6190 
6191   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6192   TempRect.top = RowTop + Margin;
6193   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6194   TempRect.left = X;
6195   TempRect.right = TempRect.left +
6196     be_plain_font->StringWidth (StringPntr) +
6197     be_plain_font->StringWidth (EightDigitsString) +
6198     3 * g_MarginBetweenControls;
6199 
6200   sprintf (TempString, "%d", (int) m_PurgeAgeCachedValue);
6201   m_PurgeAgeTextboxPntr = new BTextControl (TempRect,
6202     "Purge Age",
6203     StringPntr /* label */,
6204     TempString /* text */,
6205     new BMessage (MSG_PURGE_AGE),
6206     B_FOLLOW_LEFT | B_FOLLOW_TOP,
6207     B_WILL_DRAW | B_NAVIGABLE);
6208   AddChild (m_PurgeAgeTextboxPntr);
6209   m_PurgeAgeTextboxPntr->SetTarget (this);
6210   m_PurgeAgeTextboxPntr->SetDivider (
6211     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6212 
6213   /* Third row contains the purge popularity and bottom half of the purge
6214   button. */
6215 
6216   RowTop += RowHeight /* previous row's RowHeight */;
6217   TempRect = Bounds ();
6218   X = TempRect.left;
6219   RowHeight = g_TextBoxHeight;
6220   RowHeight = ceilf (RowHeight * 1.1);
6221 
6222   StringPntr = "Number of messages to store words from:";
6223   m_PurgePopularityCachedValue = 87654321;
6224   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6225   TempRect.top = RowTop + Margin;
6226   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6227   TempRect.left = X;
6228   TempRect.right = TempRect.left +
6229     be_plain_font->StringWidth (StringPntr) +
6230     be_plain_font->StringWidth (EightDigitsString) +
6231     3 * g_MarginBetweenControls;
6232   X = TempRect.right + g_MarginBetweenControls;
6233 
6234   sprintf (TempString, "%d", (int) m_PurgePopularityCachedValue);
6235   m_PurgePopularityTextboxPntr = new BTextControl (TempRect,
6236     "Purge Popularity",
6237     StringPntr /* label */,
6238     TempString /* text */,
6239     new BMessage (MSG_PURGE_POPULARITY),
6240     B_FOLLOW_LEFT | B_FOLLOW_TOP,
6241     B_WILL_DRAW | B_NAVIGABLE);
6242   AddChild (m_PurgePopularityTextboxPntr);
6243   m_PurgePopularityTextboxPntr->SetTarget (this);
6244   m_PurgePopularityTextboxPntr->SetDivider (
6245     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6246 
6247   /* Make the purge button, which will take up space in the 2nd and 3rd rows,
6248   on the right side.  Twice as tall as a regular button too. */
6249 
6250   StringPntr = "Remove Old Words";
6251   Margin = ceilf ((((RowTop + RowHeight) - BigPurgeButtonTop) -
6252     2 * g_TextBoxHeight) / 2);
6253   TempRect.top = BigPurgeButtonTop + Margin;
6254   TempRect.bottom = TempRect.top + 2 * g_TextBoxHeight;
6255   TempRect.left = X;
6256   TempRect.right = X + ceilf (2 * be_plain_font->StringWidth (StringPntr));
6257 
6258   CommandMessage.MakeEmpty ();
6259   CommandMessage.what = B_EXECUTE_PROPERTY;
6260   CommandMessage.AddSpecifier (g_PropertyNames[PN_PURGE]);
6261   m_PurgeButtonPntr = new BButton (TempRect, "Purge Button",
6262     StringPntr, new BMessage (CommandMessage), B_FOLLOW_LEFT | B_FOLLOW_TOP);
6263   if (m_PurgeButtonPntr == NULL) goto ErrorExit;
6264   m_PurgeButtonPntr->ResizeToPreferred();
6265   AddChild (m_PurgeButtonPntr);
6266   m_PurgeButtonPntr->SetTarget (be_app);
6267 
6268   /* The fourth row contains the ignore previous classification checkbox. */
6269 
6270   RowTop += RowHeight /* previous row's RowHeight */;
6271   TempRect = Bounds ();
6272   X = TempRect.left;
6273   RowHeight = g_CheckBoxHeight;
6274   RowHeight = ceilf (RowHeight * 1.1);
6275 
6276   StringPntr = "Allow Retraining on a Message";
6277   m_IgnorePreviousClassCachedValue = false;
6278 
6279   Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
6280   TempRect.top = RowTop + Margin;
6281   TempRect.bottom = TempRect.top + g_CheckBoxHeight;
6282   TempRect.left = X;
6283   m_IgnorePreviousClassCheckboxPntr = new BCheckBox (TempRect,
6284     "Ignore Check",
6285     StringPntr,
6286     new BMessage (MSG_IGNORE_CLASSIFICATION),
6287     B_FOLLOW_TOP | B_FOLLOW_LEFT);
6288   if (m_IgnorePreviousClassCheckboxPntr == NULL) goto ErrorExit;
6289   AddChild (m_IgnorePreviousClassCheckboxPntr);
6290   m_IgnorePreviousClassCheckboxPntr->SetTarget (this);
6291   m_IgnorePreviousClassCheckboxPntr->ResizeToPreferred ();
6292   m_IgnorePreviousClassCheckboxPntr->GetPreferredSize (&Width, &Height);
6293   X += Width + g_MarginBetweenControls;
6294 
6295   /* The fifth row contains the server mode checkbox. */
6296 
6297   RowTop += RowHeight /* previous row's RowHeight */;
6298   TempRect = Bounds ();
6299   RowHeight = g_CheckBoxHeight;
6300   RowHeight = ceilf (RowHeight * 1.1);
6301 
6302   StringPntr = "Print errors to Terminal";
6303   m_ServerModeCachedValue = false;
6304 
6305   Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
6306   TempRect.top = RowTop + Margin;
6307   TempRect.bottom = TempRect.top + g_CheckBoxHeight;
6308   m_ServerModeCheckboxPntr = new BCheckBox (TempRect,
6309     "ServerMode Check",
6310     StringPntr,
6311     new BMessage (MSG_SERVER_MODE),
6312     B_FOLLOW_TOP | B_FOLLOW_LEFT);
6313   if (m_ServerModeCheckboxPntr == NULL) goto ErrorExit;
6314   AddChild (m_ServerModeCheckboxPntr);
6315   m_ServerModeCheckboxPntr->SetTarget (this);
6316   m_ServerModeCheckboxPntr->ResizeToPreferred ();
6317   m_ServerModeCheckboxPntr->GetPreferredSize (&Width, &Height);
6318 
6319   /* This row just contains a huge pop-up menu which shows the tokenize mode
6320   and an explanation of what each mode does. */
6321 
6322   RowTop += RowHeight /* previous row's RowHeight */;
6323   TempRect = Bounds ();
6324   RowHeight = g_PopUpMenuHeight;
6325   RowHeight = ceilf (RowHeight * 1.1);
6326 
6327   Margin = ceilf ((RowHeight - g_PopUpMenuHeight) / 2);
6328   TempRect.top = RowTop + Margin;
6329   TempRect.bottom = TempRect.top + g_PopUpMenuHeight;
6330 
6331   m_TokenizeModeCachedValue = TM_MAX; /* Illegal value will force redraw. */
6332   m_TokenizeModeMenuBarPntr = new BMenuBar (TempRect, "TokenizeModeMenuBar",
6333     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
6334     false /* resize to fit items */);
6335   if (m_TokenizeModeMenuBarPntr == NULL) goto ErrorExit;
6336   m_TokenizeModePopUpMenuPntr = new BPopUpMenu ("TokenizeModePopUpMenu");
6337   if (m_TokenizeModePopUpMenuPntr == NULL) goto ErrorExit;
6338 
6339   for (TokenizeMode = (TokenizeModes) 0;
6340   TokenizeMode < TM_MAX;
6341   TokenizeMode = (TokenizeModes) ((int) TokenizeMode + 1))
6342   {
6343     /* Each different tokenize mode gets its own menu item.  Selecting the item
6344     will send a canned command to the application to switch to the appropriate
6345     tokenize mode.  An optional explanation of each mode is added to the mode
6346     name string. */
6347 
6348     CommandMessage.MakeEmpty ();
6349     CommandMessage.what = B_SET_PROPERTY;
6350     CommandMessage.AddSpecifier (g_PropertyNames[PN_TOKENIZE_MODE]);
6351     CommandMessage.AddString (g_DataName, g_TokenizeModeNames[TokenizeMode]);
6352     strcpy (TempString, g_TokenizeModeNames[TokenizeMode]);
6353     switch (TokenizeMode)
6354     {
6355       case TM_WHOLE:
6356         strcat (TempString, " - Scan everything");
6357         break;
6358 
6359       case TM_PLAIN_TEXT:
6360         strcat (TempString, " - Scan e-mail body text except rich text");
6361         break;
6362 
6363       case TM_PLAIN_TEXT_HEADER:
6364         strcat (TempString, " - Scan entire e-mail text except rich text");
6365         break;
6366 
6367       case TM_ANY_TEXT:
6368         strcat (TempString, " - Scan e-mail body text and text attachments");
6369         break;
6370 
6371       case TM_ANY_TEXT_HEADER:
6372         strcat (TempString, " - Scan entire e-mail text and text attachments (Recommended)");
6373         break;
6374 
6375       case TM_ALL_PARTS:
6376         strcat (TempString, " - Scan e-mail body and all attachments");
6377         break;
6378 
6379       case TM_ALL_PARTS_HEADER:
6380         strcat (TempString, " - Scan all parts of the e-mail");
6381         break;
6382 
6383       case TM_JUST_HEADER:
6384         strcat (TempString, " - Scan just the header (mail routing information)");
6385         break;
6386 
6387       default:
6388         break;
6389     }
6390     TempMenuItemPntr =
6391       new BMenuItem (TempString, new BMessage (CommandMessage));
6392     if (TempMenuItemPntr == NULL) goto ErrorExit;
6393     TempMenuItemPntr->SetTarget (be_app);
6394     m_TokenizeModePopUpMenuPntr->AddItem (TempMenuItemPntr);
6395   }
6396   m_TokenizeModeMenuBarPntr->AddItem (m_TokenizeModePopUpMenuPntr);
6397   AddChild (m_TokenizeModeMenuBarPntr);
6398 
6399   /* This row just contains a huge pop-up menu which shows the scoring mode
6400   and an explanation of what each mode does. */
6401 
6402   RowTop += RowHeight /* previous row's RowHeight */;
6403   TempRect = Bounds ();
6404   RowHeight = g_PopUpMenuHeight;
6405   RowHeight = ceilf (RowHeight * 1.1);
6406 
6407   Margin = ceilf ((RowHeight - g_PopUpMenuHeight) / 2);
6408   TempRect.top = RowTop + Margin;
6409   TempRect.bottom = TempRect.top + g_PopUpMenuHeight;
6410 
6411   m_ScoringModeCachedValue = SM_MAX; /* Illegal value will force redraw. */
6412   m_ScoringModeMenuBarPntr = new BMenuBar (TempRect, "ScoringModeMenuBar",
6413     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
6414     false /* resize to fit items */);
6415   if (m_ScoringModeMenuBarPntr == NULL) goto ErrorExit;
6416   m_ScoringModePopUpMenuPntr = new BPopUpMenu ("ScoringModePopUpMenu");
6417   if (m_ScoringModePopUpMenuPntr == NULL) goto ErrorExit;
6418 
6419   for (ScoringMode = (ScoringModes) 0;
6420   ScoringMode < SM_MAX;
6421   ScoringMode = (ScoringModes) ((int) ScoringMode + 1))
6422   {
6423     /* Each different scoring mode gets its own menu item.  Selecting the item
6424     will send a canned command to the application to switch to the appropriate
6425     scoring mode.  An optional explanation of each mode is added to the mode
6426     name string. */
6427 
6428     CommandMessage.MakeEmpty ();
6429     CommandMessage.what = B_SET_PROPERTY;
6430     CommandMessage.AddSpecifier (g_PropertyNames[PN_SCORING_MODE]);
6431     CommandMessage.AddString (g_DataName, g_ScoringModeNames[ScoringMode]);
6432 /*
6433     strcpy (TempString, g_ScoringModeNames[ScoringMode]);
6434     switch (ScoringMode)
6435     {
6436       case SM_ROBINSON:
6437         strcat (TempString, " - Learning Method 1: Naive Bayesian");
6438         break;
6439 
6440       case SM_CHISQUARED:
6441         strcat (TempString, " - Learning Method 2: Chi-Squared");
6442         break;
6443 
6444       default:
6445         break;
6446     }
6447 */
6448     switch (ScoringMode)
6449     {
6450       case SM_ROBINSON:
6451         strcpy (TempString, "Learning Method 1: Naive Bayesian");
6452         break;
6453 
6454       case SM_CHISQUARED:
6455         strcpy (TempString, "Learning Method 2: Chi-Squared");
6456         break;
6457 
6458       default:
6459         break;
6460     }
6461     TempMenuItemPntr =
6462       new BMenuItem (TempString, new BMessage (CommandMessage));
6463     if (TempMenuItemPntr == NULL) goto ErrorExit;
6464     TempMenuItemPntr->SetTarget (be_app);
6465     m_ScoringModePopUpMenuPntr->AddItem (TempMenuItemPntr);
6466   }
6467   m_ScoringModeMenuBarPntr->AddItem (m_ScoringModePopUpMenuPntr);
6468   AddChild (m_ScoringModeMenuBarPntr);
6469 
6470   /* The next row has the install MIME types button and the reset to defaults
6471   button, one on the left and the other on the right. */
6472 
6473   RowTop += RowHeight /* previous row's RowHeight */;
6474   TempRect = Bounds ();
6475   RowHeight = g_ButtonHeight;
6476   RowHeight = ceilf (RowHeight * 1.1);
6477 
6478   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6479   TempRect.top = RowTop + Margin;
6480   TempRect.bottom = TempRect.top + g_ButtonHeight;
6481 
6482   CommandMessage.MakeEmpty ();
6483   CommandMessage.what = B_EXECUTE_PROPERTY;
6484   CommandMessage.AddSpecifier (g_PropertyNames[PN_INSTALL_THINGS]);
6485   m_InstallThingsButtonPntr = new BButton (TempRect, "Install Button",
6486     "Install Spam Types",
6487     new BMessage (CommandMessage),
6488     B_FOLLOW_LEFT | B_FOLLOW_TOP);
6489   if (m_InstallThingsButtonPntr == NULL) goto ErrorExit;
6490   AddChild (m_InstallThingsButtonPntr);
6491   m_InstallThingsButtonPntr->SetTarget (be_app);
6492   m_InstallThingsButtonPntr->ResizeToPreferred ();
6493 
6494   /* The Reset to Defaults button.  On the right side of the row. */
6495 
6496   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6497   TempRect = Bounds ();
6498   TempRect.top = RowTop + Margin;
6499   TempRect.bottom = TempRect.top + g_ButtonHeight;
6500 
6501   CommandMessage.MakeEmpty ();
6502   CommandMessage.what = B_EXECUTE_PROPERTY;
6503   CommandMessage.AddSpecifier (g_PropertyNames[PN_RESET_TO_DEFAULTS]);
6504   m_ResetToDefaultsButtonPntr = new BButton (TempRect, "Reset Button",
6505     "Default Settings", new BMessage (CommandMessage),
6506     B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6507   if (m_ResetToDefaultsButtonPntr == NULL) goto ErrorExit;
6508   AddChild (m_ResetToDefaultsButtonPntr);
6509   m_ResetToDefaultsButtonPntr->SetTarget (be_app);
6510   m_ResetToDefaultsButtonPntr->ResizeToPreferred ();
6511   m_ResetToDefaultsButtonPntr->GetPreferredSize (&Width, &Height);
6512   m_ResetToDefaultsButtonPntr->MoveTo (TempRect.right - Width, TempRect.top);
6513 
6514   /* The next row contains the Estimate, Add Examples and About buttons. */
6515 
6516   RowTop += RowHeight /* previous row's RowHeight */;
6517   TempRect = Bounds ();
6518   X = TempRect.left;
6519   RowHeight = g_ButtonHeight;
6520   RowHeight = ceilf (RowHeight * 1.1);
6521 
6522   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6523   TempRect.top = RowTop + Margin;
6524   TempRect.bottom = TempRect.top + g_ButtonHeight;
6525   TempRect.left = X;
6526 
6527   m_EstimateSpamButtonPntr = new BButton (TempRect, "Estimate Button",
6528     "Scan a Message",
6529     new BMessage (MSG_ESTIMATE_BUTTON),
6530     B_FOLLOW_LEFT | B_FOLLOW_TOP);
6531   if (m_EstimateSpamButtonPntr == NULL) goto ErrorExit;
6532   AddChild (m_EstimateSpamButtonPntr);
6533   m_EstimateSpamButtonPntr->SetTarget (this);
6534   m_EstimateSpamButtonPntr->ResizeToPreferred ();
6535   X = m_EstimateSpamButtonPntr->Frame().right + g_MarginBetweenControls;
6536 
6537   /* The Add Example button in the middle.  Does the same as the browse button,
6538   but don't tell anyone that! */
6539 
6540   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6541   TempRect.top = RowTop + Margin;
6542   TempRect.bottom = TempRect.top + g_ButtonHeight;
6543   TempRect.left = X;
6544 
6545   m_AddExampleButtonPntr = new BButton (TempRect, "Example Button",
6546     "Train Spam Filter on a Message",
6547     new BMessage (MSG_BROWSE_BUTTON),
6548     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP,
6549     B_WILL_DRAW | B_NAVIGABLE | B_FULL_UPDATE_ON_RESIZE);
6550   if (m_AddExampleButtonPntr == NULL) goto ErrorExit;
6551   AddChild (m_AddExampleButtonPntr);
6552   m_AddExampleButtonPntr->SetTarget (this);
6553   m_AddExampleButtonPntr->ResizeToPreferred ();
6554   X = m_AddExampleButtonPntr->Frame().right + g_MarginBetweenControls;
6555 
6556   /* Add the About button on the right. */
6557 
6558   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6559   TempRect = Bounds ();
6560   TempRect.top = RowTop + Margin;
6561   TempRect.bottom = TempRect.top + g_ButtonHeight;
6562   TempRect.left = X;
6563 
6564   m_AboutButtonPntr = new BButton (TempRect, "About Button",
6565     "About…",
6566     new BMessage (B_ABOUT_REQUESTED),
6567     B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6568   if (m_AboutButtonPntr == NULL) goto ErrorExit;
6569   AddChild (m_AboutButtonPntr);
6570   m_AboutButtonPntr->SetTarget (be_app);
6571 
6572   /* This row displays various counters.  Starting with the genuine messages
6573   count on the left. */
6574 
6575   RowTop += RowHeight /* previous row's RowHeight */;
6576   TempRect = Bounds ();
6577   RowHeight = g_TextBoxHeight;
6578   RowHeight = ceilf (RowHeight * 1.1);
6579 
6580   StringPntr = "Genuine Messages:";
6581   m_GenuineCountCachedValue = 87654321;
6582   sprintf (TempString, "%d", (int) m_GenuineCountCachedValue);
6583 
6584   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6585   TempRect = Bounds ();
6586   TempRect.top = RowTop + Margin;
6587   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6588   TempRect.right = TempRect.left +
6589     be_plain_font->StringWidth (StringPntr) +
6590     be_plain_font->StringWidth (TempString) +
6591     3 * g_MarginBetweenControls;
6592 
6593   m_GenuineCountTextboxPntr = new BTextControl (TempRect,
6594     "Genuine Count",
6595     StringPntr /* label */,
6596     TempString /* text */,
6597     NULL /* no message */,
6598     B_FOLLOW_LEFT | B_FOLLOW_TOP,
6599     B_WILL_DRAW /* not B_NAVIGABLE */);
6600   AddChild (m_GenuineCountTextboxPntr);
6601   m_GenuineCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6602   m_GenuineCountTextboxPntr->SetDivider (
6603     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6604   m_GenuineCountTextboxPntr->SetEnabled (false); /* For display only. */
6605 
6606   /* The word count in the center. */
6607 
6608   StringPntr = "Word Count:";
6609   m_WordCountCachedValue = 87654321;
6610   sprintf (TempString, "%d", (int) m_WordCountCachedValue);
6611 
6612   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6613   TempRect = Bounds ();
6614   TempRect.top = RowTop + Margin;
6615   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6616   Width = be_plain_font->StringWidth (StringPntr) +
6617     be_plain_font->StringWidth (TempString) +
6618     3 * g_MarginBetweenControls;
6619   TempRect.left = ceilf ((TempRect.right - TempRect.left) / 2 - Width / 2);
6620   TempRect.right = TempRect.left + Width;
6621 
6622   m_WordCountTextboxPntr = new BTextControl (TempRect,
6623     "Word Count",
6624     StringPntr /* label */,
6625     TempString /* text */,
6626     NULL /* no message */,
6627     B_FOLLOW_H_CENTER | B_FOLLOW_TOP,
6628     B_WILL_DRAW /* not B_NAVIGABLE */);
6629   AddChild (m_WordCountTextboxPntr);
6630   m_WordCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6631   m_WordCountTextboxPntr->SetDivider (
6632     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6633   m_WordCountTextboxPntr->SetEnabled (false); /* For display only. */
6634 
6635   /* The spam count on the far right. */
6636 
6637   StringPntr = "Spam Messages:";
6638   m_SpamCountCachedValue = 87654321;
6639   sprintf (TempString, "%d", (int) m_SpamCountCachedValue);
6640 
6641   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6642   TempRect = Bounds ();
6643   TempRect.top = RowTop + Margin;
6644   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6645   TempRect.left = TempRect.right -
6646     be_plain_font->StringWidth (StringPntr) -
6647     be_plain_font->StringWidth (TempString) -
6648     3 * g_MarginBetweenControls;
6649 
6650   m_SpamCountTextboxPntr = new BTextControl (TempRect,
6651     "Spam Count",
6652     StringPntr /* label */,
6653     TempString /* text */,
6654     NULL /* no message */,
6655     B_FOLLOW_RIGHT | B_FOLLOW_TOP,
6656     B_WILL_DRAW /* not B_NAVIGABLE */);
6657   AddChild (m_SpamCountTextboxPntr);
6658   m_SpamCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6659   m_SpamCountTextboxPntr->SetDivider (
6660     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6661   m_SpamCountTextboxPntr->SetEnabled (false); /* For display only. */
6662 
6663   /* Change the size of our view so it only takes up the space needed by the
6664   buttons. */
6665 
6666   RowTop += RowHeight /* previous row's RowHeight */;
6667   ResizeTo (Bounds().Width(), RowTop - Bounds().top + 1);
6668 
6669   return; /* Successful. */
6670 
6671 ErrorExit:
6672   DisplayErrorMessage ("Unable to initialise the controls view.");
6673 }
6674 
6675 
6676 void ControlsView::BrowseForDatabaseFile ()
6677 {
6678   if (m_BrowseFilePanelPntr == NULL)
6679   {
6680     BEntry      DirectoryEntry;
6681     entry_ref   DirectoryEntryRef;
6682     BMessage    GetDatabasePathCommand;
6683     BMessage    GetDatabasePathResult;
6684     const char *StringPntr = NULL;
6685 
6686     /* Create a new file panel.  First set up the entry ref stuff so that the
6687     file panel can open to show the initial directory (the one where the
6688     database file currently is).  Note that we have to create it after the
6689     window and view are up and running, otherwise the BMessenger won't point to
6690     a valid looper/handler.  First find out the current database file name to
6691     use as a starting point. */
6692 
6693     GetDatabasePathCommand.what = B_GET_PROPERTY;
6694     GetDatabasePathCommand.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
6695     be_app_messenger.SendMessage (&GetDatabasePathCommand,
6696       &GetDatabasePathResult, 5000000 /* delivery timeout */,
6697       5000000 /* reply timeout */);
6698     if (GetDatabasePathResult.FindString (g_ResultName, &StringPntr) != B_OK ||
6699     DirectoryEntry.SetTo (StringPntr) != B_OK ||
6700     DirectoryEntry.GetParent (&DirectoryEntry) != B_OK)
6701       DirectoryEntry.SetTo ("."); /* Default directory if we can't find it. */
6702     if (DirectoryEntry.GetRef (&DirectoryEntryRef) != B_OK)
6703     {
6704       DisplayErrorMessage (
6705         "Unable to set up the file requestor starting directory.  Sorry.");
6706       return;
6707     }
6708 
6709     m_BrowseFilePanelPntr = new BFilePanel (
6710       B_OPEN_PANEL /* mode */,
6711       &be_app_messenger /* target for event messages */,
6712       &DirectoryEntryRef /* starting directory */,
6713       B_FILE_NODE,
6714       true /* true for multiple selections */,
6715       NULL /* canned message */,
6716       NULL /* ref filter */,
6717       false /* true for modal */,
6718       true /* true to hide when done */);
6719   }
6720 
6721   if (m_BrowseFilePanelPntr != NULL)
6722     m_BrowseFilePanelPntr->Show (); /* Answer returned later in RefsReceived. */
6723 }
6724 
6725 
6726 void ControlsView::BrowseForFileToEstimate ()
6727 {
6728   if (m_EstimateSpamFilePanelPntr == NULL)
6729   {
6730     BEntry      DirectoryEntry;
6731     entry_ref   DirectoryEntryRef;
6732     status_t    ErrorCode;
6733     BMessenger  MessengerToSelf (this);
6734     BPath       PathToMailDirectory;
6735 
6736     /* Create a new file panel.  First set up the entry ref stuff so that the
6737     file panel can open to show the initial directory (the user's mail
6738     directory).  Note that we have to create the panel after the window and
6739     view are up and running, otherwise the BMessenger won't point to a valid
6740     looper/handler. */
6741 
6742     ErrorCode = find_directory (B_USER_DIRECTORY, &PathToMailDirectory);
6743     if (ErrorCode == B_OK)
6744     {
6745       PathToMailDirectory.Append ("mail");
6746       ErrorCode = DirectoryEntry.SetTo (PathToMailDirectory.Path(),
6747         true /* traverse symbolic links*/);
6748       if (ErrorCode != B_OK || !DirectoryEntry.Exists ())
6749       {
6750         /* If no mail directory, try home directory. */
6751         find_directory (B_USER_DIRECTORY, &PathToMailDirectory);
6752         ErrorCode = DirectoryEntry.SetTo (PathToMailDirectory.Path(), true);
6753       }
6754     }
6755     if (ErrorCode != B_OK)
6756       PathToMailDirectory.SetTo (".");
6757 
6758     DirectoryEntry.SetTo (PathToMailDirectory.Path(), true);
6759     if (DirectoryEntry.GetRef (&DirectoryEntryRef) != B_OK)
6760     {
6761       DisplayErrorMessage (
6762         "Unable to set up the file requestor starting directory.  Sorry.");
6763       return;
6764     }
6765 
6766     m_EstimateSpamFilePanelPntr = new BFilePanel (
6767       B_OPEN_PANEL /* mode */,
6768       &MessengerToSelf /* target for event messages */,
6769       &DirectoryEntryRef /* starting directory */,
6770       B_FILE_NODE,
6771       true /* true for multiple selections */,
6772       new BMessage (MSG_ESTIMATE_FILE_REFS) /* canned message */,
6773       NULL /* ref filter */,
6774       false /* true for modal */,
6775       true /* true to hide when done */);
6776   }
6777 
6778   if (m_EstimateSpamFilePanelPntr != NULL)
6779     m_EstimateSpamFilePanelPntr->Show (); /* Answer sent via a message. */
6780 }
6781 
6782 
6783 /* The display has been resized.  Have to manually adjust the popup menu bar to
6784 show the new size (the sub-items need to be resized too).  Then make it redraw.
6785 Well, actually just resetting the mark on the current item will resize it
6786 properly. */
6787 
6788 void ControlsView::FrameResized (float, float)
6789 {
6790   m_ScoringModeCachedValue = SM_MAX; /* Force it to reset the mark. */
6791   m_TokenizeModeCachedValue = TM_MAX; /* Force it to reset the mark. */
6792 }
6793 
6794 
6795 void ControlsView::MessageReceived (BMessage *MessagePntr)
6796 {
6797   BMessage CommandMessage;
6798   bool     TempBool;
6799   uint32   TempUint32;
6800 
6801   switch (MessagePntr->what)
6802   {
6803     case MSG_BROWSE_BUTTON:
6804       BrowseForDatabaseFile ();
6805       break;
6806 
6807     case MSG_DATABASE_NAME:
6808       if (strcmp (m_DatabaseFileNameCachedValue,
6809       m_DatabaseFileNameTextboxPntr->Text ()) != 0)
6810         SubmitCommandString (PN_DATABASE_FILE, B_SET_PROPERTY,
6811         m_DatabaseFileNameTextboxPntr->Text ());
6812       break;
6813 
6814     case MSG_ESTIMATE_BUTTON:
6815       BrowseForFileToEstimate ();
6816       break;
6817 
6818     case MSG_ESTIMATE_FILE_REFS:
6819       EstimateRefFilesAndDisplay (MessagePntr);
6820       break;
6821 
6822     case MSG_IGNORE_CLASSIFICATION:
6823       TempBool = (m_IgnorePreviousClassCheckboxPntr->Value() == B_CONTROL_ON);
6824       if (m_IgnorePreviousClassCachedValue != TempBool)
6825         SubmitCommandBool (PN_IGNORE_PREVIOUS_CLASSIFICATION,
6826         B_SET_PROPERTY, TempBool);
6827       break;
6828 
6829     case MSG_PURGE_AGE:
6830       TempUint32 = strtoul (m_PurgeAgeTextboxPntr->Text (), NULL, 10);
6831       if (m_PurgeAgeCachedValue != TempUint32)
6832         SubmitCommandInt32 (PN_PURGE_AGE, B_SET_PROPERTY, TempUint32);
6833       break;
6834 
6835     case MSG_PURGE_POPULARITY:
6836       TempUint32 = strtoul (m_PurgePopularityTextboxPntr->Text (), NULL, 10);
6837       if (m_PurgePopularityCachedValue != TempUint32)
6838         SubmitCommandInt32 (PN_PURGE_POPULARITY, B_SET_PROPERTY, TempUint32);
6839       break;
6840 
6841     case MSG_SERVER_MODE:
6842       TempBool = (m_ServerModeCheckboxPntr->Value() == B_CONTROL_ON);
6843       if (m_ServerModeCachedValue != TempBool)
6844         SubmitCommandBool (PN_SERVER_MODE, B_SET_PROPERTY, TempBool);
6845       break;
6846 
6847     default:
6848       BView::MessageReceived (MessagePntr);
6849   }
6850 }
6851 
6852 
6853 /* Check the server for changes in the state of the database, and if there are
6854 any changes, update the displayed values.  Since this is a read only
6855 examination of the server, we go directly to the application rather than
6856 sending it messages.  Also, when sending messages, we can't find out what it is
6857 doing while it is busy with a batch of spam additions (all the spam add
6858 commands will be in the queue ahead of our requests for info).  Instead, we
6859 lock the BApplication (so it isn't changing things while we're looking) and
6860 retrieve our values. */
6861 
6862 void ControlsView::PollServerForChanges ()
6863 {
6864   ABSApp     *MyAppPntr;
6865   BMenuItem  *TempMenuItemPntr;
6866   char        TempString [PATH_MAX];
6867   BWindow    *WindowPntr;
6868 
6869   /* We need a pointer to our window, for changing the title etc. */
6870 
6871   WindowPntr = Window ();
6872   if (WindowPntr == NULL)
6873     return; /* No window, no point in updating the display! */
6874 
6875   /* Check the server mode flag.  If the mode is off, then the window has to be
6876   minimized.  Similarly, if it gets turned on, maximize the window.  Note that
6877   the user can maximize the window manually, even while still in server mode.
6878   */
6879 
6880   if (g_ServerMode != m_ServerModeCachedValue &&
6881   m_ServerModeCheckboxPntr != NULL)
6882   {
6883     m_ServerModeCachedValue = g_ServerMode;
6884     m_ServerModeCheckboxPntr->SetValue (
6885       m_ServerModeCachedValue ? B_CONTROL_ON : B_CONTROL_OFF);
6886     WindowPntr->Minimize (m_ServerModeCachedValue);
6887   }
6888 
6889   if (WindowPntr->IsMinimized ())
6890     return; /* Window isn't visible, don't waste time updating it. */
6891 
6892   /* So that people don't stare at a blank screen, request a database load if
6893   nothing is there.  But only do it once, so the user doesn't get a lot of
6894   invalid database messages if one doesn't exist yet.  In server mode, we never
6895   get this far so it is only loaded when the user wants to see something. */
6896 
6897   if (!m_DatabaseLoadDone)
6898   {
6899     m_DatabaseLoadDone = true;
6900     /* Counting the number of words will load the database. */
6901     SubmitCommandString (PN_DATABASE_FILE, B_COUNT_PROPERTIES, "");
6902   }
6903 
6904   /* Check various read only values, which can be read from the BApplication
6905   without having to lock it.  This is useful for displaying the number of words
6906   as it is changing.  First up is the purge age setting. */
6907 
6908   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
6909   if (MyAppPntr == NULL)
6910     return; /* Doesn't exist or is the wrong class.  Not likely! */
6911 
6912   if (MyAppPntr->m_PurgeAge != m_PurgeAgeCachedValue &&
6913   m_PurgeAgeTextboxPntr != NULL)
6914   {
6915     m_PurgeAgeCachedValue = MyAppPntr->m_PurgeAge;
6916     sprintf (TempString, "%lu", m_PurgeAgeCachedValue);
6917     m_PurgeAgeTextboxPntr->SetText (TempString);
6918   }
6919 
6920   /* Check the purge popularity. */
6921 
6922   if (MyAppPntr->m_PurgePopularity != m_PurgePopularityCachedValue &&
6923   m_PurgePopularityTextboxPntr != NULL)
6924   {
6925     m_PurgePopularityCachedValue = MyAppPntr->m_PurgePopularity;
6926     sprintf (TempString, "%lu", m_PurgePopularityCachedValue);
6927     m_PurgePopularityTextboxPntr->SetText (TempString);
6928   }
6929 
6930   /* Check the Ignore Previous Classification flag. */
6931 
6932   if (MyAppPntr->m_IgnorePreviousClassification !=
6933   m_IgnorePreviousClassCachedValue &&
6934   m_IgnorePreviousClassCheckboxPntr != NULL)
6935   {
6936     m_IgnorePreviousClassCachedValue =
6937       MyAppPntr->m_IgnorePreviousClassification;
6938     m_IgnorePreviousClassCheckboxPntr->SetValue (
6939       m_IgnorePreviousClassCachedValue ? B_CONTROL_ON : B_CONTROL_OFF);
6940   }
6941 
6942   /* Update the genuine count. */
6943 
6944   if (MyAppPntr->m_TotalGenuineMessages != m_GenuineCountCachedValue &&
6945   m_GenuineCountTextboxPntr != NULL)
6946   {
6947     m_GenuineCountCachedValue = MyAppPntr->m_TotalGenuineMessages;
6948     sprintf (TempString, "%lu", m_GenuineCountCachedValue);
6949     m_GenuineCountTextboxPntr->SetText (TempString);
6950   }
6951 
6952   /* Update the spam count. */
6953 
6954   if (MyAppPntr->m_TotalSpamMessages != m_SpamCountCachedValue &&
6955   m_SpamCountTextboxPntr != NULL)
6956   {
6957     m_SpamCountCachedValue = MyAppPntr->m_TotalSpamMessages;
6958     sprintf (TempString, "%lu", m_SpamCountCachedValue);
6959     m_SpamCountTextboxPntr->SetText (TempString);
6960   }
6961 
6962   /* Update the word count. */
6963 
6964   if (MyAppPntr->m_WordCount != m_WordCountCachedValue &&
6965   m_WordCountTextboxPntr != NULL)
6966   {
6967     m_WordCountCachedValue = MyAppPntr->m_WordCount;
6968     sprintf (TempString, "%lu", m_WordCountCachedValue);
6969     m_WordCountTextboxPntr->SetText (TempString);
6970   }
6971 
6972   /* Update the tokenize mode pop-up menu. */
6973 
6974   if (MyAppPntr->m_TokenizeMode != m_TokenizeModeCachedValue &&
6975   m_TokenizeModePopUpMenuPntr != NULL)
6976   {
6977     m_TokenizeModeCachedValue = MyAppPntr->m_TokenizeMode;
6978     TempMenuItemPntr =
6979       m_TokenizeModePopUpMenuPntr->ItemAt ((int) m_TokenizeModeCachedValue);
6980     if (TempMenuItemPntr != NULL)
6981       TempMenuItemPntr->SetMarked (true);
6982   }
6983 
6984   /* Update the scoring mode pop-up menu. */
6985 
6986   if (MyAppPntr->m_ScoringMode != m_ScoringModeCachedValue &&
6987   m_ScoringModePopUpMenuPntr != NULL)
6988   {
6989     m_ScoringModeCachedValue = MyAppPntr->m_ScoringMode;
6990     TempMenuItemPntr =
6991       m_ScoringModePopUpMenuPntr->ItemAt ((int) m_ScoringModeCachedValue);
6992     if (TempMenuItemPntr != NULL)
6993       TempMenuItemPntr->SetMarked (true);
6994   }
6995 
6996   /* Lock the application.  This will stop it from processing any further
6997   messages until we are done.  Or if it is busy, the lock will fail. */
6998 
6999   if (MyAppPntr->LockWithTimeout (100000) != B_OK)
7000     return; /* It's probably busy doing something. */
7001 
7002   /* See if the database file name has changed. */
7003 
7004   if (strcmp (MyAppPntr->m_DatabaseFileName.String (),
7005   m_DatabaseFileNameCachedValue) != 0 &&
7006   m_DatabaseFileNameTextboxPntr != NULL)
7007   {
7008     strcpy (m_DatabaseFileNameCachedValue,
7009       MyAppPntr->m_DatabaseFileName.String ());
7010     m_DatabaseFileNameTextboxPntr->SetText (m_DatabaseFileNameCachedValue);
7011     WindowPntr->SetTitle (m_DatabaseFileNameCachedValue);
7012   }
7013 
7014   /* Done.  Let the BApplication continue processing messages. */
7015 
7016   MyAppPntr->Unlock ();
7017 }
7018 
7019 
7020 void ControlsView::Pulse ()
7021 {
7022   if (system_time () > m_TimeOfLastPoll + 200000)
7023   {
7024     PollServerForChanges ();
7025     m_TimeOfLastPoll = system_time ();
7026   }
7027 }
7028 
7029 
7030 
7031 /******************************************************************************
7032  * Implementation of the DatabaseWindow class, constructor, destructor and the
7033  * rest of the member functions in mostly alphabetical order.
7034  */
7035 
7036 DatabaseWindow::DatabaseWindow ()
7037 : BWindow (BRect (30, 30, 620, 400),
7038     "Haiku Spam Filter Server",
7039     B_DOCUMENT_WINDOW, B_ASYNCHRONOUS_CONTROLS)
7040 {
7041   BRect TempRect;
7042 
7043   /* Add the controls view. */
7044 
7045   m_ControlsViewPntr = new ControlsView (Bounds ());
7046   if (m_ControlsViewPntr == NULL)
7047     goto ErrorExit;
7048   AddChild (m_ControlsViewPntr);
7049 
7050   /* Add the word view in the remaining space under the controls view. */
7051 
7052 
7053   TempRect = Bounds ();
7054   TempRect.top = m_ControlsViewPntr->Frame().bottom + 1;
7055   m_WordsViewPntr = new WordsView (TempRect);
7056   if (m_WordsViewPntr == NULL)
7057     goto ErrorExit;
7058   AddChild (m_WordsViewPntr);
7059 
7060  /* Minimize the window if we are starting up in server mode.  This is done
7061 	before the window is open so it doesn't flash onto the screen, and possibly
7062 	steal a keystroke or two.  The ControlsView will further update the minimize
7063 	mode when it detects changes in the server mode. */
7064   Minimize (g_ServerMode);
7065 
7066   return;
7067 
7068 ErrorExit:
7069   DisplayErrorMessage ("Unable to initialise the window contents.");
7070 }
7071 
7072 
7073 void DatabaseWindow::MessageReceived (BMessage *MessagePntr)
7074 {
7075   if (MessagePntr->what == B_MOUSE_WHEEL_CHANGED)
7076   {
7077     /* Pass the mouse wheel stuff down to the words view, since that's the only
7078     one which does scrolling so we don't need to worry about whether it has
7079     focus or not. */
7080 
7081     if (m_WordsViewPntr != NULL)
7082       m_WordsViewPntr->MessageReceived (MessagePntr);
7083   }
7084   else
7085     BWindow::MessageReceived (MessagePntr);
7086 }
7087 
7088 
7089 bool DatabaseWindow::QuitRequested ()
7090 {
7091   be_app->PostMessage (B_QUIT_REQUESTED);
7092   return true;
7093 }
7094 
7095 
7096 
7097 /******************************************************************************
7098  * Implementation of the word display view.
7099  */
7100 
7101 WordsView::WordsView (BRect NewBounds)
7102 : BView (NewBounds, "WordsView", B_FOLLOW_ALL_SIDES,
7103     B_WILL_DRAW | B_FULL_UPDATE_ON_RESIZE | B_NAVIGABLE | B_PULSE_NEEDED),
7104   m_ArrowLineDownPntr (NULL),
7105   m_ArrowLineUpPntr (NULL),
7106   m_ArrowPageDownPntr (NULL),
7107   m_ArrowPageUpPntr (NULL),
7108   m_LastTimeAKeyWasPressed (0)
7109 {
7110   font_height TempFontHeight;
7111 
7112   GetFont (&m_TextFont); /* Modify the default font to be our own. */
7113   m_TextFont.SetSize (ceilf (m_TextFont.Size() * 1.1));
7114   m_TextFont.GetHeight (&TempFontHeight);
7115   SetFont (&m_TextFont);
7116 
7117   m_LineHeight = ceilf (TempFontHeight.ascent +
7118     TempFontHeight.descent + TempFontHeight.leading);
7119   m_AscentHeight = ceilf (TempFontHeight.ascent);
7120   m_TextHeight = ceilf (TempFontHeight.ascent +
7121     TempFontHeight.descent);
7122 
7123   m_FocusedColour.red = 255;
7124   m_FocusedColour.green = 255;
7125   m_FocusedColour.blue = 255;
7126   m_FocusedColour.alpha = 255;
7127 
7128   m_UnfocusedColour.red = 245;
7129   m_UnfocusedColour.green = 245;
7130   m_UnfocusedColour.blue = 255;
7131   m_UnfocusedColour.alpha = 255;
7132 
7133   m_BackgroundColour = m_UnfocusedColour;
7134   SetViewColor (m_BackgroundColour);
7135   SetLowColor (m_BackgroundColour);
7136   SetHighColor (0, 0, 0);
7137 
7138   strcpy (m_FirstDisplayedWord, "a");
7139 }
7140 
7141 
7142 void WordsView::AttachedToWindow ()
7143 {
7144   BPolygon        DownLinePolygon (g_DownLinePoints,
7145                     sizeof (g_DownLinePoints) /
7146                     sizeof (g_DownLinePoints[0]));
7147 
7148   BPolygon        DownPagePolygon (g_DownPagePoints,
7149                     sizeof (g_DownPagePoints) /
7150                     sizeof (g_DownPagePoints[0]));
7151 
7152   BPolygon        UpLinePolygon (g_UpLinePoints,
7153                     sizeof (g_UpLinePoints) /
7154                     sizeof (g_UpLinePoints[0]));
7155 
7156   BPolygon        UpPagePolygon (g_UpPagePoints,
7157                     sizeof (g_UpPagePoints) /
7158                     sizeof (g_UpPagePoints[0]));
7159 
7160   BPicture        TempOffPicture;
7161   BPicture        TempOnPicture;
7162   BRect           TempRect;
7163 
7164   /* Make the buttons and associated polygon images for the forward and
7165   backwards a word or a page of words buttons.  They're the width of the scroll
7166   bar area on the right, but twice as tall as usual, since there is no scroll
7167   bar and that will make it easier to use them.  First the up a line button. */
7168 
7169   SetHighColor (0, 0, 0);
7170   BeginPicture (&TempOffPicture);
7171   FillPolygon (&UpLinePolygon);
7172   SetHighColor (180, 180, 180);
7173   StrokePolygon (&UpLinePolygon);
7174   EndPicture ();
7175 
7176   SetHighColor (128, 128, 128);
7177   BeginPicture (&TempOnPicture);
7178   FillPolygon (&UpLinePolygon);
7179   EndPicture ();
7180 
7181   TempRect = Bounds ();
7182   TempRect.bottom = TempRect.top + 2 * B_H_SCROLL_BAR_HEIGHT;
7183   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7184   m_ArrowLineUpPntr = new BPictureButton (TempRect, "Up Line",
7185     &TempOffPicture, &TempOnPicture,
7186     new BMessage (MSG_LINE_UP), B_ONE_STATE_BUTTON,
7187     B_FOLLOW_RIGHT | B_FOLLOW_TOP, B_WILL_DRAW | B_NAVIGABLE);
7188   if (m_ArrowLineUpPntr == NULL) goto ErrorExit;
7189   AddChild (m_ArrowLineUpPntr);
7190   m_ArrowLineUpPntr->SetTarget (this);
7191 
7192   /* Up a page button. */
7193 
7194   SetHighColor (0, 0, 0);
7195   BeginPicture (&TempOffPicture);
7196   FillPolygon (&UpPagePolygon);
7197   SetHighColor (180, 180, 180);
7198   StrokePolygon (&UpPagePolygon);
7199   EndPicture ();
7200 
7201   SetHighColor (128, 128, 128);
7202   BeginPicture (&TempOnPicture);
7203   FillPolygon (&UpPagePolygon);
7204   EndPicture ();
7205 
7206   TempRect = Bounds ();
7207   TempRect.top += 2 * B_H_SCROLL_BAR_HEIGHT + 1;
7208   TempRect.bottom = TempRect.top + 2 * B_H_SCROLL_BAR_HEIGHT;
7209   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7210   m_ArrowPageUpPntr = new BPictureButton (TempRect, "Up Page",
7211     &TempOffPicture, &TempOnPicture,
7212     new BMessage (MSG_PAGE_UP), B_ONE_STATE_BUTTON,
7213     B_FOLLOW_RIGHT | B_FOLLOW_TOP, B_WILL_DRAW | B_NAVIGABLE);
7214   if (m_ArrowPageUpPntr == NULL) goto ErrorExit;
7215   AddChild (m_ArrowPageUpPntr);
7216   m_ArrowPageUpPntr->SetTarget (this);
7217 
7218   /* Down a page button. */
7219 
7220   SetHighColor (0, 0, 0);
7221   BeginPicture (&TempOffPicture);
7222   FillPolygon (&DownPagePolygon);
7223   SetHighColor (180, 180, 180);
7224   StrokePolygon (&DownPagePolygon);
7225   EndPicture ();
7226 
7227   SetHighColor (128, 128, 128);
7228   BeginPicture (&TempOnPicture);
7229   FillPolygon (&DownPagePolygon);
7230   EndPicture ();
7231 
7232   TempRect = Bounds ();
7233   TempRect.bottom -= 3 * B_H_SCROLL_BAR_HEIGHT + 1;
7234   TempRect.top = TempRect.bottom - 2 * B_H_SCROLL_BAR_HEIGHT;
7235   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7236   m_ArrowPageDownPntr = new BPictureButton (TempRect, "Down Page",
7237     &TempOffPicture, &TempOnPicture,
7238     new BMessage (MSG_PAGE_DOWN), B_ONE_STATE_BUTTON,
7239     B_FOLLOW_RIGHT | B_FOLLOW_BOTTOM, B_WILL_DRAW | B_NAVIGABLE);
7240   if (m_ArrowPageDownPntr == NULL) goto ErrorExit;
7241   AddChild (m_ArrowPageDownPntr);
7242   m_ArrowPageDownPntr->SetTarget (this);
7243 
7244   /* Down a line button. */
7245 
7246   SetHighColor (0, 0, 0);
7247   BeginPicture (&TempOffPicture);
7248   FillPolygon (&DownLinePolygon);
7249   SetHighColor (180, 180, 180);
7250   StrokePolygon (&DownLinePolygon);
7251   EndPicture ();
7252 
7253   SetHighColor (128, 128, 128);
7254   BeginPicture (&TempOnPicture);
7255   FillPolygon (&DownLinePolygon);
7256   EndPicture ();
7257 
7258   TempRect = Bounds ();
7259   TempRect.bottom -= B_H_SCROLL_BAR_HEIGHT;
7260   TempRect.top = TempRect.bottom - 2 * B_H_SCROLL_BAR_HEIGHT;
7261   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7262   m_ArrowLineDownPntr = new BPictureButton (TempRect, "Down Line",
7263     &TempOffPicture, &TempOnPicture,
7264     new BMessage (MSG_LINE_DOWN), B_ONE_STATE_BUTTON,
7265     B_FOLLOW_RIGHT | B_FOLLOW_BOTTOM, B_WILL_DRAW | B_NAVIGABLE);
7266   if (m_ArrowLineDownPntr == NULL) goto ErrorExit;
7267   AddChild (m_ArrowLineDownPntr);
7268   m_ArrowLineDownPntr->SetTarget (this);
7269 
7270   return;
7271 
7272 ErrorExit:
7273   DisplayErrorMessage ("Problems while making view displaying the words.");
7274 }
7275 
7276 
7277 /* Draw the words starting with the one at or after m_FirstDisplayedWord.  This
7278 requires looking at the database in the BApplication, which may or may not be
7279 available (if it isn't, don't draw, a redraw will usually be requested by the
7280 Pulse member function when it keeps on noticing that the stuff on the display
7281 doesn't match the database). */
7282 
7283 void WordsView::Draw (BRect UpdateRect)
7284 {
7285   float                   AgeDifference;
7286   float                   AgeProportion;
7287   float                   CenterX;
7288   float                   ColumnLeftCenterX;
7289   float                   ColumnMiddleCenterX;
7290   float                   ColumnRightCenterX;
7291   float                   CompensatedRatio;
7292   StatisticsMap::iterator DataIter;
7293   StatisticsMap::iterator EndIter;
7294   rgb_color               FillColour;
7295   float                   GenuineProportion;
7296   uint32                  GenuineSpamSum;
7297   float                   HeightPixels;
7298   float                   HeightProportion;
7299   float                   LeftBounds;
7300   ABSApp                 *MyAppPntr;
7301   uint32                  NewestAge;
7302   uint32                  OldestAge;
7303   float                   OneFifthTotalGenuine;
7304   float                   OneFifthTotalSpam;
7305   double                  RawProbabilityRatio;
7306   float                   RightBounds;
7307   float                   SpamProportion;
7308   StatisticsPointer       StatisticsPntr;
7309   BRect                   TempRect;
7310   char                    TempString [PATH_MAX];
7311   float                   TotalGenuineMessages = 1.0; /* Avoid divide by 0. */
7312   float                   TotalSpamMessages = 1.0;
7313   float                   Width;
7314   float                   Y;
7315 
7316   /* Lock the application.  This will stop it from processing any further
7317   messages until we are done.  Or if it is busy, the lock will fail. */
7318 
7319   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7320   if (MyAppPntr == NULL || MyAppPntr->LockWithTimeout (100000) != B_OK)
7321     return; /* It's probably busy doing something. */
7322 
7323   /* Set up various loop invariant variables. */
7324 
7325   if (MyAppPntr->m_TotalGenuineMessages > 0)
7326     TotalGenuineMessages = MyAppPntr->m_TotalGenuineMessages;
7327   OneFifthTotalGenuine = TotalGenuineMessages / 5;
7328 
7329   if (MyAppPntr->m_TotalSpamMessages > 0)
7330     TotalSpamMessages = MyAppPntr->m_TotalSpamMessages;
7331   OneFifthTotalSpam = TotalSpamMessages / 5;
7332 
7333   EndIter = MyAppPntr->m_WordMap.end ();
7334 
7335   OldestAge = MyAppPntr->m_OldestAge;
7336   NewestAge = /* actually newest age plus one */
7337     MyAppPntr->m_TotalGenuineMessages + MyAppPntr->m_TotalSpamMessages;
7338 
7339   if (NewestAge == 0)
7340     goto NormalExit; /* No words to display, or something is badly wrong. */
7341 
7342   NewestAge--; /* The newest message has age NewestAge. */
7343   AgeDifference = NewestAge - OldestAge; /* Can be zero if just one message. */
7344 
7345   LeftBounds = Bounds().left;
7346   RightBounds = Bounds().right - B_V_SCROLL_BAR_WIDTH;
7347   Width = RightBounds - LeftBounds;
7348   FillColour.alpha = 255;
7349 
7350   CenterX = ceilf (LeftBounds + Width * 0.5);
7351   ColumnLeftCenterX = ceilf (LeftBounds + Width * 0.05);
7352   ColumnMiddleCenterX = CenterX;
7353   ColumnRightCenterX = ceilf (LeftBounds + Width * 0.95);
7354 
7355   for (DataIter = MyAppPntr->m_WordMap.lower_bound (m_FirstDisplayedWord),
7356   Y = Bounds().top;
7357   DataIter != EndIter && Y < UpdateRect.bottom;
7358   DataIter++, Y += m_LineHeight)
7359   {
7360     if (Y + m_LineHeight < UpdateRect.top)
7361       continue; /* Not in the visible area yet, don't actually draw. */
7362 
7363     /* Draw the colour bar behind the word.  It reflects the spamness or
7364     genuineness of that particular word, plus the importance of the word and
7365     the age of the word.
7366 
7367     First calculate the compensated spam ratio (described elsewhere).  It is
7368     close to 0.0 for genuine words and close to 1.0 for pure spam.  It is drawn
7369     as a blue bar to the left of center if it is less than 0.5, and a red bar
7370     on the right of center if it is greater than 0.5.  At exactly 0.5 nothing
7371     is drawn; the word is worthless as an indicator.
7372 
7373     The height of the bar corresponds to the number of messages the word was
7374     found in.  Make the height proportional to the total of spam and genuine
7375     messages for the word divided by the sum of the most extreme spam and
7376     genuine counts in the database.
7377 
7378     The staturation of the colour corresponds to the age of the word, with old
7379     words being almost white rather than solid blue or red. */
7380 
7381     StatisticsPntr = &DataIter->second;
7382 
7383     SpamProportion = StatisticsPntr->spamCount / TotalSpamMessages;
7384     GenuineProportion = StatisticsPntr->genuineCount / TotalGenuineMessages;
7385     if (SpamProportion + GenuineProportion > 0.0f)
7386       RawProbabilityRatio =
7387       SpamProportion / (SpamProportion + GenuineProportion);
7388     else
7389       RawProbabilityRatio = g_RobinsonX;
7390 
7391     /* The compensated ratio leans towards 0.5 (RobinsonX) more for fewer
7392     data points, with a weight of 0.45 (RobinsonS). */
7393 
7394     GenuineSpamSum =
7395       StatisticsPntr->spamCount + StatisticsPntr->genuineCount;
7396     CompensatedRatio =
7397       (g_RobinsonS * g_RobinsonX + GenuineSpamSum * RawProbabilityRatio) /
7398       (g_RobinsonS + GenuineSpamSum);
7399 
7400     /* Used to use the height based on the most frequent word, but some words,
7401     like "From", show up in all messages which made most other words just
7402     appear as a thin line.  I did a histogram plot of the sizes in my test
7403     database, and figured that you get better coverage of 90% of the messages
7404     if you use 1/5 of the total number as the count which gives you 100%
7405     height.  The other 10% get a full height bar, but most people wouldn't care
7406     that they're super frequently used. */
7407 
7408     HeightProportion = 0.5f * (StatisticsPntr->genuineCount /
7409       OneFifthTotalGenuine + StatisticsPntr->spamCount / OneFifthTotalSpam);
7410 
7411     if (HeightProportion > 1.0f)
7412       HeightProportion = 1.0f;
7413     HeightPixels = ceilf (HeightProportion * m_TextHeight);
7414 
7415     if (AgeDifference <= 0.0f)
7416       AgeProportion = 1.0; /* New is 1.0, old is 0.0 */
7417     else
7418       AgeProportion = (StatisticsPntr->age - OldestAge) / AgeDifference;
7419 
7420     TempRect.top = ceilf (Y + m_TextHeight / 2 - HeightPixels / 2);
7421     TempRect.bottom = TempRect.top + HeightPixels;
7422 
7423     if (CompensatedRatio < 0.5f)
7424     {
7425       TempRect.left = ceilf (
7426         CenterX - 1.6f * (0.5f - CompensatedRatio) * (CenterX - LeftBounds));
7427       TempRect.right = CenterX;
7428       FillColour.red = 230 - (int) (AgeProportion * 230.0f);
7429       FillColour.green = FillColour.red;
7430       FillColour.blue = 255;
7431     }
7432     else /* Ratio >= 0.5, red spam block. */
7433     {
7434       TempRect.left = CenterX;
7435       TempRect.right = ceilf (
7436         CenterX + 1.6f * (CompensatedRatio - 0.5f) * (RightBounds - CenterX));
7437       FillColour.blue = 230 - (int) (AgeProportion * 230.0f);
7438       FillColour.green = FillColour.blue;
7439       FillColour.red = 255;
7440     }
7441     SetHighColor (FillColour);
7442     SetDrawingMode (B_OP_COPY);
7443     FillRect (TempRect);
7444 
7445     /* Print the text centered in columns of various widths.  The number of
7446     genuine messages in the left 10% of the width, the word in the middle 80%,
7447     and the number of spam messages using the word in the right 10%. */
7448 
7449     SetHighColor (0, 0, 0);
7450     SetDrawingMode (B_OP_OVER); /* So that antialiased text mixes better. */
7451 
7452     sprintf (TempString, "%lu", StatisticsPntr->genuineCount);
7453     Width = m_TextFont.StringWidth (TempString);
7454     MovePenTo (ceilf (ColumnLeftCenterX - Width / 2), Y + m_AscentHeight);
7455     DrawString (TempString);
7456 
7457     strcpy (TempString, DataIter->first.c_str ());
7458     Width = m_TextFont.StringWidth (TempString);
7459     MovePenTo (ceilf (ColumnMiddleCenterX - Width / 2), Y + m_AscentHeight);
7460     DrawString (TempString);
7461 
7462     sprintf (TempString, "%lu", StatisticsPntr->spamCount);
7463     Width = m_TextFont.StringWidth (TempString);
7464     MovePenTo (ceilf (ColumnRightCenterX - Width / 2), Y + m_AscentHeight);
7465     DrawString (TempString);
7466   }
7467 
7468   /* Draw the first word (the one which the user types in to select the first
7469   displayed word) on the right, in the scroll bar margin, rotated 90 degrees to
7470   fit between the page up and page down buttons. */
7471 
7472   Width = m_TextFont.StringWidth (m_FirstDisplayedWord);
7473   if (Width > 0)
7474   {
7475     TempRect = Bounds ();
7476     TempRect.top += 4 * B_H_SCROLL_BAR_HEIGHT + 1;
7477     TempRect.bottom -= 5 * B_H_SCROLL_BAR_HEIGHT + 1;
7478 
7479     MovePenTo (TempRect.right - m_TextHeight + m_AscentHeight - 1,
7480       ceilf ((TempRect.bottom + TempRect.top) / 2 + Width / 2));
7481     m_TextFont.SetRotation (90);
7482     SetFont (&m_TextFont, B_FONT_ROTATION);
7483     DrawString (m_FirstDisplayedWord);
7484     m_TextFont.SetRotation (0);
7485     SetFont (&m_TextFont, B_FONT_ROTATION);
7486   }
7487 
7488 NormalExit:
7489 
7490   /* Successfully finished drawing.  Update the cached values to match what we
7491   have drawn. */
7492   m_CachedTotalGenuineMessages = MyAppPntr->m_TotalGenuineMessages;
7493   m_CachedTotalSpamMessages = MyAppPntr->m_TotalSpamMessages;
7494   m_CachedWordCount = MyAppPntr->m_WordCount;
7495 
7496   /* Done.  Let the BApplication continue processing messages. */
7497   MyAppPntr->Unlock ();
7498 }
7499 
7500 
7501 /* When the user presses keys, they select the first word to be displayed in
7502 the view (it's the word at or lexicographically after the word typed in).  The
7503 keys are appended to the starting word, until the user stops typing for a
7504 while, then the next key will be the first letter of a new starting word. */
7505 
7506 void WordsView::KeyDown (const char *BufferPntr, int32 NumBytes)
7507 {
7508   int32          CharLength;
7509   bigtime_t      CurrentTime;
7510   char           TempString [40];
7511 
7512   CurrentTime = system_time ();
7513 
7514   if (NumBytes < (int32) sizeof (TempString))
7515   {
7516     memcpy (TempString, BufferPntr, NumBytes);
7517     TempString [NumBytes] = 0;
7518     CharLength = strlen (TempString); /* So NUL bytes don't get through. */
7519 
7520     /* Check for arrow keys, which move the view up and down. */
7521 
7522     if (CharLength == 1 &&
7523     (TempString[0] == B_UP_ARROW ||
7524     TempString[0] == B_DOWN_ARROW ||
7525     TempString[0] == B_PAGE_UP ||
7526     TempString[0] == B_PAGE_DOWN))
7527     {
7528       MoveTextUpOrDown ((TempString[0] == B_UP_ARROW) ? MSG_LINE_UP :
7529         ((TempString[0] == B_DOWN_ARROW) ? MSG_LINE_DOWN :
7530         ((TempString[0] == B_PAGE_UP) ? MSG_PAGE_UP : MSG_PAGE_DOWN)));
7531     }
7532     else if (CharLength > 1 ||
7533     (CharLength == 1 && 32 <= (uint8) TempString[0]))
7534     {
7535       /* Have a non-control character, or some sort of multibyte char.  Add it
7536       to the word and mark things for redisplay starting at the resulting word.
7537       */
7538 
7539       if (CurrentTime - m_LastTimeAKeyWasPressed >= 1000000 /* microseconds */)
7540         strcpy (m_FirstDisplayedWord, TempString); /* Starting a new word. */
7541       else if (strlen (m_FirstDisplayedWord) + CharLength <= g_MaxWordLength)
7542         strcat (m_FirstDisplayedWord, TempString); /* Append to existing. */
7543 
7544       Invalidate ();
7545     }
7546   }
7547 
7548   m_LastTimeAKeyWasPressed = CurrentTime;
7549   BView::KeyDown (BufferPntr, NumBytes);
7550 }
7551 
7552 
7553 /* Change the background colour to show that we have the focus.  When we have
7554 it, keystrokes will select the word to be displayed at the top of the list. */
7555 
7556 void WordsView::MakeFocus (bool Focused)
7557 {
7558   if (Focused)
7559     m_BackgroundColour = m_FocusedColour;
7560   else
7561     m_BackgroundColour = m_UnfocusedColour;
7562   SetViewColor (m_BackgroundColour);
7563   SetLowColor (m_BackgroundColour);
7564 
7565   /* Also need to set the background colour for the scroll buttons, since they
7566   can't be made transparent. */
7567 
7568   if (m_ArrowLineDownPntr != NULL)
7569   {
7570     m_ArrowLineDownPntr->SetViewColor (m_BackgroundColour);
7571     m_ArrowLineDownPntr->Invalidate ();
7572   }
7573 
7574   if (m_ArrowLineUpPntr != NULL)
7575   {
7576     m_ArrowLineUpPntr->SetViewColor (m_BackgroundColour);
7577     m_ArrowLineUpPntr->Invalidate ();
7578   }
7579 
7580   if (m_ArrowPageDownPntr != NULL)
7581   {
7582     m_ArrowPageDownPntr->SetViewColor (m_BackgroundColour);
7583     m_ArrowPageDownPntr->Invalidate ();
7584   }
7585 
7586   if (m_ArrowPageUpPntr != NULL)
7587   {
7588     m_ArrowPageUpPntr->SetViewColor (m_BackgroundColour);
7589     m_ArrowPageUpPntr->Invalidate ();
7590   }
7591 
7592   Invalidate ();
7593 
7594   BView::MakeFocus (Focused);
7595 }
7596 
7597 
7598 void WordsView::MessageReceived (BMessage *MessagePntr)
7599 {
7600   int32     CountFound;
7601   float     DeltaY; /* Usually -1.0, 0.0 or +1.0. */
7602   type_code TypeFound;
7603 
7604   switch (MessagePntr->what)
7605   {
7606     case B_MOUSE_WHEEL_CHANGED:
7607       if (MessagePntr->FindFloat ("be:wheel_delta_y", &DeltaY) != 0) break;
7608       if (DeltaY < 0)
7609         MoveTextUpOrDown (MSG_LINE_UP);
7610       else if (DeltaY > 0)
7611         MoveTextUpOrDown (MSG_LINE_DOWN);
7612       break;
7613 
7614     case MSG_LINE_DOWN:
7615     case MSG_LINE_UP:
7616     case MSG_PAGE_DOWN:
7617     case MSG_PAGE_UP:
7618       MoveTextUpOrDown (MessagePntr->what);
7619       break;
7620 
7621     case B_SIMPLE_DATA: /* Something has been dropped in our view. */
7622       if (MessagePntr->GetInfo ("refs", &TypeFound, &CountFound) == B_OK &&
7623       CountFound > 0 && TypeFound == B_REF_TYPE)
7624       {
7625         RefsDroppedHere (MessagePntr);
7626         break;
7627       }
7628       /* Else fall through to the default case, in case it is something else
7629       dropped that the system knows about. */
7630 
7631     default:
7632       BView::MessageReceived (MessagePntr);
7633   }
7634 }
7635 
7636 
7637 /* If the user clicks on our view, take over the focus. */
7638 
7639 void WordsView::MouseDown (BPoint)
7640 {
7641   if (!IsFocus ())
7642     MakeFocus (true);
7643 }
7644 
7645 
7646 void WordsView::MoveTextUpOrDown (uint32 MovementType)
7647 {
7648   StatisticsMap::iterator  DataIter;
7649   int                      i;
7650   ABSApp                  *MyAppPntr;
7651   int                      PageSize;
7652 
7653   /* Lock the application.  This will stop it from processing any further
7654   messages until we are done (we need to look at the word list directly).  Or
7655   if it is busy, the lock will fail. */
7656 
7657   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7658   if (MyAppPntr == NULL || MyAppPntr->LockWithTimeout (2000000) != B_OK)
7659     return; /* It's probably busy doing something. */
7660 
7661   PageSize = (int) (Bounds().Height() / m_LineHeight - 1);
7662   if (PageSize < 1)
7663     PageSize = 1;
7664 
7665   DataIter = MyAppPntr->m_WordMap.lower_bound (m_FirstDisplayedWord);
7666 
7667   switch (MovementType)
7668   {
7669     case MSG_LINE_UP:
7670       if (DataIter != MyAppPntr->m_WordMap.begin ())
7671         DataIter--;
7672       break;
7673 
7674     case MSG_LINE_DOWN:
7675       if (DataIter != MyAppPntr->m_WordMap.end ())
7676         DataIter++;
7677       break;
7678 
7679     case MSG_PAGE_UP:
7680       for (i = 0; i < PageSize; i++)
7681       {
7682         if (DataIter == MyAppPntr->m_WordMap.begin ())
7683           break;
7684         DataIter--;
7685       }
7686       break;
7687 
7688     case MSG_PAGE_DOWN:
7689       for (i = 0; i < PageSize; i++)
7690       {
7691         if (DataIter == MyAppPntr->m_WordMap.end ())
7692           break;
7693         DataIter++;
7694       }
7695       break;
7696   }
7697 
7698   if (DataIter != MyAppPntr->m_WordMap.end ())
7699     strcpy (m_FirstDisplayedWord, DataIter->first.c_str ());
7700 
7701   Invalidate ();
7702 
7703   MyAppPntr->Unlock ();
7704 }
7705 
7706 
7707 /* This function periodically polls the BApplication to see if anything has
7708 changed.  If the word list is different or the display has changed in some
7709 other way, it will then try to refresh the display, repeating the attempt until
7710 it gets successfully drawn. */
7711 
7712 void WordsView::Pulse ()
7713 {
7714   ABSApp *MyAppPntr;
7715 
7716   /* Probe the BApplication to see if it has changed. */
7717 
7718   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7719   if (MyAppPntr == NULL)
7720     return; /* Something is wrong, give up. */
7721 
7722   if (MyAppPntr->m_TotalGenuineMessages != m_CachedTotalGenuineMessages ||
7723   MyAppPntr->m_TotalSpamMessages != m_CachedTotalSpamMessages ||
7724   MyAppPntr->m_WordCount != m_CachedWordCount)
7725     Invalidate ();
7726 }
7727 
7728 
7729 /* The user has dragged and dropped some file references on the words view.  If
7730 it is in the left third, add the file(s) as examples of genuine messages, right
7731 third for spam messages and if it is in the middle third then evaluate the
7732 file(s) for spaminess. */
7733 
7734 void WordsView::RefsDroppedHere (BMessage *MessagePntr)
7735 {
7736   float  Left;
7737   bool   SpamExample = true; /* TRUE if example is of spam, FALSE genuine. */
7738   float  Third;
7739   BPoint WhereDropped;
7740 
7741   /* Find out which third of the view it was dropped into. */
7742 
7743   if (MessagePntr->FindPoint ("_drop_point_", &WhereDropped) != B_OK)
7744     return;  /* Need to know where it was dropped. */
7745   ConvertFromScreen (&WhereDropped);
7746   Third = Bounds().Width() / 3;
7747   Left = Bounds().left;
7748   if (WhereDropped.x < Left + Third)
7749     SpamExample = false;
7750   else if (WhereDropped.x < Left + 2 * Third)
7751   {
7752     /* In the middle third, evaluate all files for spaminess. */
7753     EstimateRefFilesAndDisplay (MessagePntr);
7754     return;
7755   }
7756 
7757   if (g_CommanderLooperPntr != NULL)
7758     g_CommanderLooperPntr->CommandReferences (
7759     MessagePntr, true /* BulkMode */, SpamExample ? CL_SPAM : CL_GENUINE);
7760 }
7761 
7762 
7763 
7764 /******************************************************************************
7765  * Finally, the main program which drives it all.
7766  */
7767 
7768 int main (int argc, char**)
7769 {
7770   g_CommandLineMode = (argc > 1);
7771   if (!g_CommandLineMode)
7772     cout << PrintUsage; /* In case no arguments specified. */
7773 
7774   g_CommanderLooperPntr = new CommanderLooper;
7775   if (g_CommanderLooperPntr != NULL)
7776   {
7777     g_CommanderMessenger = new BMessenger (NULL, g_CommanderLooperPntr);
7778     g_CommanderLooperPntr->Run ();
7779   }
7780 
7781   ABSApp MyApp;
7782 
7783   if (MyApp.InitCheck () == 0)
7784   {
7785     MyApp.LoadSaveSettings (true /* DoLoad */);
7786     MyApp.Run ();
7787   }
7788 
7789   if (g_CommanderLooperPntr != NULL)
7790   {
7791     g_CommanderLooperPntr->PostMessage (B_QUIT_REQUESTED);
7792     snooze (100000); /* Let the CommanderLooper thread run so it quits. */
7793   }
7794 
7795   cerr << "SpamDBM shutting down..." << endl;
7796   return 0; /* And implicitly destroys MyApp, which writes out the database. */
7797 }
7798