xref: /haiku/src/bin/mail_utils/spamdbm.cpp (revision 5ac9b506412b11afb993bb52d161efe7666958a5)
1 /******************************************************************************
2  * $Id: spamdbm.cpp 30630 2009-05-05 01:31:01Z bga $
3  *
4  * This is a BeOS program for classifying e-mail messages as spam (unwanted
5  * junk mail) or as genuine mail using a Bayesian statistical approach.  There
6  * is also a Mail Daemon Replacement add-on to filter mail using the
7  * classification statistics collected earlier.
8  *
9  * See also http://www.paulgraham.com/spam.html for a good writeup and
10  * http://www.tuxedo.org/~esr/bogofilter/ for another implementation.
11  * And more recently, Gary Robinson's write up of his improved algorithm
12  * at http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
13  * which gives a better spread in spam ratios and slightly fewer
14  * misclassifications.
15  *
16  * Note that this uses the AGMS vacation coding style, not the OpenTracker one.
17  * That means no tabs, indents are two spaces, m_ is the prefix for member
18  * variables, g_ is the prefix for global names, C style comments, constants
19  * are in all capital letters and most other things are mixed case, it's word
20  * wrapped to fit in 79 characters per line to make proofreading on paper
21  * easier, and functions are listed in reverse dependency order so that forward
22  * declarations (function prototypes with no code) aren't needed.
23  *
24  * The Original Design:
25  * There is a spam database (just a file listing words and number of times they
26  * were used in spam and non-spam messages) that a BeMailDaemon input filter
27  * will use when scanning email.  It will mark the mail with the spam
28  * probability (an attribute, optionally a mail header field) and optionally do
29  * something if the probability exceeds a user defined level (delete message,
30  * change subject, file in a different folder).  Or should that be a different
31  * filter?  Outside the mail system, the probability can be used in queries to
32  * find spam.
33  *
34  * A second user application will be used to update the database.  Besides
35  * showing you the current list of words, you can drag and drop files to mark
36  * them as spam or non-spam (a balanced binary tree is used internally to make
37  * word storage fast).  It will add a second attribute to the files to show how
38  * they have been classified by the user (and won't update the database if you
39  * accidentally try to classify a file again).  Besides drag and drop, there
40  * will be a command line interface and a message passing interface.  BeMail
41  * (or other programs) will then communicate via messages to tell it when the
42  * user marks a message as spam or not (via having separate delete spam /
43  * delete genuine mail buttons and a menu item or two).
44  *
45  * Plus lots of details, like the rename swap method to update the database
46  * file (so programs with the old file open aren't affected).  A nice tab text
47  * format so you can open the database in a spreadsheet.  Startup and shutdown
48  * control of the updater from BeMail.  Automatic creation of the indices
49  * needed by the filter.  MIME types for the database file.  Icons for the app.
50  * System settings to enable tracker to display the new attributes when viewing
51  * e-mail (and maybe news articles if someone ever gets around to an NNTP as
52  * files reader).  Documentation.  Recursive directory traversal for the
53  * command line or directory drag and drop.  Options for the updater to warn or
54  * ignore non-email files.  Etc.
55  *
56  * The Actual Implementation:
57  * The spam database updates and the test for spam have been combined into one
58  * program which runs as a server.  That way there won't be as long a delay
59  * when the e-mail system wants to check for spam, because the database is
60  * already loaded by the server and in memory.  The MDR mail filter add-on
61  * simply sends scripting commands to the server (and starts it up if it isn't
62  * already running).  The filter takes care of marking the messages when it
63  * gets the rating back from the server, and then the rest of the mail system
64  * rule chain can delete the message or otherwise manipulate it.
65  *
66  * Revision History (now manually updated due to SVN's philosophy)
67  * $Log: spamdbm.cpp,v $
68  * ------------------------------------------------------------------------
69  * r15195 | agmsmith | 2005-11-27 21:07:55 -0500 (Sun, 27 Nov 2005) | 4 lines
70  * Just a few minutes after checking in, I mentioned it to Japanese expert Koki
71  * and he suggested also including the Japanese comma.  So before I forget to
72  * do it...
73  *
74  * ------------------------------------------------------------------------
75  * r15194 | agmsmith | 2005-11-27 20:37:13 -0500 (Sun, 27 Nov 2005) | 5 lines
76  * Truncate overly long URLs to the maximum word length.  Convert Japanese
77  * periods to spaces so that more "words" are found.  Fix UTF-8 comparison
78  * problems with tolower() incorrectly converting characters with the high bit
79  * set.
80  *
81  * r15098 | agmsmith | 2005-11-23 23:17:00 -0500 (Wed, 23 Nov 2005) | 5 lines
82  * Added better tokenization so that HTML is parsed and things like tags
83  * between letters of a word no longer hide that word.  After testing, the
84  * result seems to be a tighter spread of ratings when done in full text plus
85  * header mode.
86  *
87  * Revision 1.10  2005/11/24 02:08:39  agmsmith
88  * Fixed up prefix codes, Z for things that are inside other things.
89  *
90  * Revision 1.9  2005/11/21 03:28:03  agmsmith
91  * Added a function for extracting URLs.
92  *
93  * Revision 1.8  2005/11/09 03:36:18  agmsmith
94  * Removed noframes detection (doesn't show up in e-mails).  Now use
95  * just H for headers and Z for HTML tag junk.
96  *
97  * Revision 1.7  2005/10/24 00:00:08  agmsmith
98  * Adding HTML tag removal, which also affected the search function so it
99  * could search for single part things like &nbsp;.
100  *
101  * Revision 1.6  2005/10/17 01:55:08  agmsmith
102  * Remove HTML comments and a few other similar things.
103  *
104  * Revision 1.5  2005/10/16 18:35:36  agmsmith
105  * Under construction - looking into HTML not being in UTF-8.
106  *
107  * Revision 1.4  2005/10/11 01:51:21  agmsmith
108  * Starting on the tokenising passes.  Still need to test Asian truncation.
109  *
110  * Revision 1.3  2005/10/06 11:54:07  agmsmith
111  * Not much.
112  *
113  * Revision 1.2  2005/09/12 01:49:37  agmsmith
114  * Enable case folding for the whole file tokenizer.
115  *
116  * r13961 | agmsmith | 2005-08-13 22:25:28 -0400 (Sat, 13 Aug 2005) | 2 lines
117  * Source code changes so that mboxtobemail now compiles and is in the build
118  * system.
119  *
120  * r13959 | agmsmith | 2005-08-13 22:05:27 -0400 (Sat, 13 Aug 2005) | 2 lines
121  * Rename the directory before doing anything else, otherwise svn dies badly.
122  *
123  * r13952 | agmsmith | 2005-08-13 15:31:42 -0400 (Sat, 13 Aug 2005) | 3 lines
124  * Added the resources and file type associations, changed the application
125  * signature and otherwise made the spam detection system work properly again.
126  *
127  * r13951 | agmsmith | 2005-08-13 11:40:01 -0400 (Sat, 13 Aug 2005) | 2 lines
128  * Had to do the file rename as a separate operation due to SVN limitations.
129  *
130  * r13950 | agmsmith | 2005-08-13 11:38:44 -0400 (Sat, 13 Aug 2005) | 3 lines
131  * Oops, "spamdb" is already used for a Unix package.  And spamdatabase is
132  * already reserved by a domain name squatter.  Use "spamdbm" instead.
133  *
134  * r13949 | agmsmith | 2005-08-13 11:17:52 -0400 (Sat, 13 Aug 2005) | 3 lines
135  * Renamed spamfilter to be the more meaningful spamdb (spam database) and
136  * moved it into its own source directory in preparation for adding resources.
137  *
138  * r13628 | agmsmith | 2005-07-10 20:11:29 -0400 (Sun, 10 Jul 2005) | 3 lines
139  * Updated keyword expansion to use SVN keywords.  Also seeing if svn is
140  * working well enough for me to update files from BeOS R5.
141  *
142  * r11909 | axeld | 2005-03-18 19:09:19 -0500 (Fri, 18 Mar 2005) | 2 lines
143  * Moved bin/ directory out of apps/.
144  *
145  * r11769 | bonefish | 2005-03-17 03:30:54 -0500 (Thu, 17 Mar 2005) | 1 line
146  * Move trunk into respective module.
147  *
148  * r10362 | nwhitehorn | 2004-12-06 20:14:05 -0500 (Mon, 06 Dec 2004) | 2 lines
149  * Fixed the spam filter so it works correctly now.
150  *
151  * r9934 | nwhitehorn | 2004-11-11 21:55:05 -0500 (Thu, 11 Nov 2004) | 2 lines
152  * Added AGMS's excellent spam detection software.  Still some weirdness with
153  * the configuration interface from E-mail prefs.
154  *
155  * Revision 1.2  2004/12/07 01:14:05  nwhitehorn
156  * Fixed the spam filter so it works correctly now.
157  *
158  * Revision 1.87  2004/09/20 15:57:26  nwhitehorn
159  * Mostly updated the tree to Be/Haiku style identifier naming conventions.  I
160  * have a few more things to work out, mostly in mail_util.h, and then I'm
161  * proceeding to jamify the build system.  Then we go into Haiku CVS.
162  *
163  * Revision 1.86  2003/07/26 16:47:46  agmsmith
164  * Bug - wasn't allowing double classification if the user had turned on
165  * the option to ignore the previous classification.
166  *
167  * Revision 1.85  2003/07/08 14:52:57  agmsmith
168  * Fix bug with classification choices dialog box coming up with weird
169  * sizes due to RefsReceived message coming in before ReadyToRun had
170  * finished setting up the default sizes of the controls.
171  *
172  * Revision 1.84  2003/07/04 19:59:29  agmsmith
173  * Now with a GUI option to let you declassify messages (set them back
174  * to uncertain, rather than spam or genuine).  Required a BAlert
175  * replacement since BAlerts can't do four buttons.
176  *
177  * Revision 1.83  2003/07/03 20:40:36  agmsmith
178  * Added Uncertain option for declassifying messages.
179  *
180  * Revision 1.82  2003/06/16 14:57:13  agmsmith
181  * Detect spam which uses mislabeled text attachments, going by the file name
182  * extension.
183  *
184  * Revision 1.81  2003/04/08 20:27:04  agmsmith
185  * AGMSBayesianSpamServer now shuts down immediately and returns true if
186  * it is asked to quit by the registrar.
187  *
188  * Revision 1.80  2003/04/07 19:20:27  agmsmith
189  * Ooops, int64 doesn't exist, use long long instead.
190  *
191  * Revision 1.79  2003/04/07 19:05:22  agmsmith
192  * Now with Allen Brunson's atoll for PPC (you need the %Ld, but that
193  * becomes %lld on other systems).
194  *
195  * Revision 1.78  2003/04/04 22:43:53  agmsmith
196  * Fixed up atoll PPC processor hack so it would actually work, was just
197  * returning zero which meant that it wouldn't load in the database file
198  * (read the size as zero).
199  *
200  * Revision 1.77  2003/01/22 03:19:48  agmsmith
201  * Don't convert words to lower case, the case is important for spam.
202  * Particularly sentences which start with exciting words, which you
203  * normally won't use at the start of a sentence (and thus capitalize).
204  *
205  * Revision 1.76  2002/12/18 02:29:22  agmsmith
206  * Add space for the Uncertain display in Tracker.
207  *
208  * Revision 1.75  2002/12/18 01:54:37  agmsmith
209  * Added uncertain sound effect.
210  *
211  * Revision 1.74  2002/12/13 23:53:12  agmsmith
212  * Minimize the window before opening it so that it doesn't flash on the
213  * screen in server mode.  Also load the database when the window is
214  * displayed so that the user can see the words.
215  *
216  * Revision 1.73  2002/12/13 20:55:57  agmsmith
217  * Documentation.
218  *
219  * Revision 1.72  2002/12/13 20:26:11  agmsmith
220  * Fixed bug with adding messages in strings to database (was limited to
221  * messages at most 1K long).  Also changed default server mode to true
222  * since that's what people use most.
223  *
224  * Revision 1.71  2002/12/11 22:37:30  agmsmith
225  * Added commands to train on spam and genuine e-mail messages passed
226  * in string arguments rather than via external files.
227  *
228  * Revision 1.70  2002/12/10 22:12:41  agmsmith
229  * Adding a message to the database now uses a BPositionIO rather than a
230  * file and file name (for future string rather than file additions).  Also
231  * now re-evaluate a file after reclassifying it so that the user can see
232  * the new ratio.  Also remove the [Spam 99.9%] subject prefix when doing
233  * a re-evaluation or classification (the number would be wrong).
234  *
235  * Revision 1.69  2002/12/10 01:46:04  agmsmith
236  * Added the Chi-Squared scoring method.
237  *
238  * Revision 1.68  2002/11/29 22:08:25  agmsmith
239  * Change default purge age to 2000 so that hitting the purge button
240  * doesn't erase stuff from the new sample database.
241  *
242  * Revision 1.67  2002/11/25 20:39:39  agmsmith
243  * Don't need to massage the MIME type since the mail library now does
244  * the lower case conversion and converts TEXT to text/plain too.
245  *
246  * Revision 1.66  2002/11/20 22:57:12  nwhitehorn
247  * PPC Compatibility Fixes
248  *
249  * Revision 1.65  2002/11/10 18:43:55  agmsmith
250  * Added a time delay to some quitting operations so that scripting commands
251  * from a second client (like a second e-mail account) will make the program
252  * abort the quit operation.
253  *
254  * Revision 1.64  2002/11/05 18:05:16  agmsmith
255  * Looked at Nathan's PPC changes (thanks!), modified style a bit.
256  *
257  * Revision 1.63  2002/11/04 03:30:22  nwhitehorn
258  * Now works (or compiles at least) on PowerPC.  I'll get around to testing it
259  * later.
260  *
261  * Revision 1.62  2002/11/04 01:03:33  agmsmith
262  * Fixed warnings so it compiles under the bemaildaemon system.
263  *
264  * Revision 1.61  2002/11/03 23:00:37  agmsmith
265  * Added to the bemaildaemon project on SourceForge.  Hmmmm, seems to switch to
266  * a new version if I commit and specify a message, but doesn't accept the
267  * message and puts up the text editor.  Must be a bug where cvs eats the first
268  * option after "commit".
269  *
270  * Revision 1.60.1.1  2002/10/22 14:29:27  agmsmith
271  * Needed to recompile with the original Libmail.so from Beta/1 since
272  * the current library uses a different constructor, and thus wouldn't
273  * run when used with the old library.
274  *
275  * Revision 1.60  2002/10/21 16:41:27  agmsmith
276  * Return a special error code when no words are found in a message,
277  * so that messages without text/plain parts can be recognized as
278  * spam by the mail filter.
279  *
280  * Revision 1.59  2002/10/20 21:29:47  agmsmith
281  * Watch out for MIME types of "text", treat as text/plain.
282  *
283  * Revision 1.58  2002/10/20 18:29:07  agmsmith
284  * *** empty log message ***
285  *
286  * Revision 1.57  2002/10/20 18:25:02  agmsmith
287  * Fix case sensitivity in MIME type tests, and fix text/any test.
288  *
289  * Revision 1.56  2002/10/19 17:00:10  agmsmith
290  * Added the pop-up menu for the tokenize modes.
291  *
292  * Revision 1.55  2002/10/19 14:54:06  agmsmith
293  * Fudge MIME type of body text components so that they get
294  * treated as text.
295  *
296  * Revision 1.54  2002/10/19 00:56:37  agmsmith
297  * The parsing of e-mail messages seems to be working now, just need
298  * to add some user interface stuff for the tokenizing mode.
299  *
300  * Revision 1.53  2002/10/18 23:37:56  agmsmith
301  * More mail kit usage, can now decode headers, but more to do.
302  *
303  * Revision 1.52  2002/10/16 23:52:33  agmsmith
304  * Getting ready to add more tokenizing modes, exploring Mail Kit to break
305  * apart messages into components (and decode BASE64 and other encodings).
306  *
307  * Revision 1.51  2002/10/11 20:05:31  agmsmith
308  * Added installation of sound effect names, which the filter will use.
309  *
310  * Revision 1.50  2002/10/02 16:50:02  agmsmith
311  * Forgot to add credits to the algorithm inventors.
312  *
313  * Revision 1.49  2002/10/01 00:39:29  agmsmith
314  * Added drag and drop to evaluate files or to add them to the list.
315  *
316  * Revision 1.48  2002/09/30 19:44:17  agmsmith
317  * Switched to Gary Robinson's method, removed max spam/genuine word.
318  *
319  * Revision 1.47  2002/09/23 17:08:55  agmsmith
320  * Add an attribute with the spam ratio to files which have been evaluated.
321  *
322  * Revision 1.46  2002/09/23 02:50:32  agmsmith
323  * Fiddling with display width of e-mail attributes.
324  *
325  * Revision 1.45  2002/09/23 01:13:56  agmsmith
326  * Oops, bug in string evaluation scripting.
327  *
328  * Revision 1.44  2002/09/22 21:00:55  agmsmith
329  * Added EvaluateString so that the BeMail add-on can pass the info without
330  * having to create a temporary file.
331  *
332  * Revision 1.43  2002/09/20 19:56:02  agmsmith
333  * Added about box and button for estimating the spam ratio of a file.
334  *
335  * Revision 1.42  2002/09/20 01:22:26  agmsmith
336  * More testing, decide that an extreme ratio bias point of 0.5 is good.
337  *
338  * Revision 1.41  2002/09/19 21:17:12  agmsmith
339  * Changed a few names and proofread the program.
340  *
341  * Revision 1.40  2002/09/19 14:27:17  agmsmith
342  * Rearranged execution of commands, moving them to a separate looper
343  * rather than the BApplication, so that thousands of files could be
344  * processed without worrying about the message queue filling up.
345  *
346  * Revision 1.39  2002/09/18 18:47:16  agmsmith
347  * Stop flickering when the view is partially obscured, update cached
348  * values in all situations except when app is busy.
349  *
350  * Revision 1.38  2002/09/18 18:08:11  agmsmith
351  * Add a function for evaluating the spam ratio of a message.
352  *
353  * Revision 1.37  2002/09/16 01:30:16  agmsmith
354  * Added Get Oldest command.
355  *
356  * Revision 1.36  2002/09/16 00:47:52  agmsmith
357  * Change the display to counter-weigh the spam ratio by the number of
358  * messages.
359  *
360  * Revision 1.35  2002/09/15 20:49:35  agmsmith
361  * Scrolling improved, buttons, keys and mouse wheel added.
362  *
363  * Revision 1.34  2002/09/15 03:46:10  agmsmith
364  * Up and down buttons under construction.
365  *
366  * Revision 1.33  2002/09/15 02:09:21  agmsmith
367  * Took out scroll bar.
368  *
369  * Revision 1.32  2002/09/15 02:05:30  agmsmith
370  * Trying to add a scroll bar, but it isn't very useful.
371  *
372  * Revision 1.31  2002/09/14 23:06:28  agmsmith
373  * Now has live updates of the list of words.
374  *
375  * Revision 1.30  2002/09/14 19:53:11  agmsmith
376  * Now with a better display of the words.
377  *
378  * Revision 1.29  2002/09/13 21:33:54  agmsmith
379  * Now draws the words in the word display view, but still primitive.
380  *
381  * Revision 1.28  2002/09/13 19:28:02  agmsmith
382  * Added display of most genuine and most spamiest, fixed up cursor.
383  *
384  * Revision 1.27  2002/09/13 03:08:42  agmsmith
385  * Show current word and message counts, and a busy cursor.
386  *
387  * Revision 1.26  2002/09/13 00:00:08  agmsmith
388  * Fixed up some deadlock problems, now using asynchronous message replies.
389  *
390  * Revision 1.25  2002/09/12 17:56:58  agmsmith
391  * Keep track of words which are spamiest and genuinest.
392  *
393  * Revision 1.24  2002/09/12 01:57:10  agmsmith
394  * Added server mode.
395  *
396  * Revision 1.23  2002/09/11 23:30:45  agmsmith
397  * Added Purge button and ignore classification checkbox.
398  *
399  * Revision 1.22  2002/09/11 21:23:13  agmsmith
400  * Added bulk update choice, purge button, moved to a BView container
401  * for all the controls (so background colour could be set, and Pulse
402  * works normally for it too).
403  *
404  * Revision 1.21  2002/09/10 22:52:49  agmsmith
405  * You can now change the database name in the GUI.
406  *
407  * Revision 1.20  2002/09/09 14:20:42  agmsmith
408  * Now can have multiple backups, and implemented refs received.
409  *
410  * Revision 1.19  2002/09/07 19:14:56  agmsmith
411  * Added standard GUI measurement code.
412  *
413  * Revision 1.18  2002/09/06 21:03:03  agmsmith
414  * Rearranging code to avoid forward references when adding a window class.
415  *
416  * Revision 1.17  2002/09/06 02:54:00  agmsmith
417  * Added the ability to purge old words from the database.
418  *
419  * Revision 1.16  2002/09/05 00:46:03  agmsmith
420  * Now adds spam to the database!
421  *
422  * Revision 1.15  2002/09/04 20:32:15  agmsmith
423  * Read ahead a couple of letters to decode quoted-printable better.
424  *
425  * Revision 1.14  2002/09/04 03:10:03  agmsmith
426  * Can now tokenize (break into words) a text file.
427  *
428  * Revision 1.13  2002/09/03 21:50:54  agmsmith
429  * Count database command, set up MIME type for the database file.
430  *
431  * Revision 1.12  2002/09/03 19:55:54  agmsmith
432  * Added loading and saving the database.
433  *
434  * Revision 1.11  2002/09/02 03:35:33  agmsmith
435  * Create indices and set up attribute associations with the e-mail MIME type.
436  *
437  * Revision 1.10  2002/09/01 15:52:49  agmsmith
438  * Can now delete the database.
439  *
440  * Revision 1.9  2002/08/31 21:55:32  agmsmith
441  * Yet more scripting.
442  *
443  * Revision 1.8  2002/08/31 21:41:37  agmsmith
444  * Under construction, with example code to decode a B_REPLY.
445  *
446  * Revision 1.7  2002/08/30 19:29:06  agmsmith
447  * Combined loading and saving settings into one function.
448  *
449  * Revision 1.6  2002/08/30 02:01:10  agmsmith
450  * Working on loading and saving settings.
451  *
452  * Revision 1.5  2002/08/29 23:17:42  agmsmith
453  * More scripting.
454  *
455  * Revision 1.4  2002/08/28 00:40:52  agmsmith
456  * Scripting now seems to work, at least the messages flow properly.
457  *
458  * Revision 1.3  2002/08/25 21:51:44  agmsmith
459  * Getting the about text formatting right.
460  *
461  * Revision 1.2  2002/08/25 21:28:20  agmsmith
462  * Trying out the BeOS scripting system as a way of implementing the program.
463  *
464  * Revision 1.1  2002/08/24 02:27:51  agmsmith
465  * Initial revision
466  */
467 
468 /* Standard C Library. */
469 
470 #include <errno.h>
471 #include <stdio.h>
472 #include <stdlib.h>
473 #include <strings.h>
474 
475 /* Standard C++ library. */
476 
477 #include <iostream>
478 
479 /* STL (Standard Template Library) headers. */
480 
481 #include <map>
482 #include <queue>
483 #include <set>
484 #include <string>
485 #include <vector>
486 
487 using namespace std;
488 
489 /* BeOS (Be Operating System) headers. */
490 
491 #include <Alert.h>
492 #include <Application.h>
493 #include <Beep.h>
494 #include <Button.h>
495 #include <CheckBox.h>
496 #include <Cursor.h>
497 #include <Directory.h>
498 #include <Entry.h>
499 #include <File.h>
500 #include <FilePanel.h>
501 #include <FindDirectory.h>
502 #include <fs_index.h>
503 #include <fs_info.h>
504 #include <MenuBar.h>
505 #include <MenuItem.h>
506 #include <Message.h>
507 #include <MessageQueue.h>
508 #include <MessageRunner.h>
509 #include <Mime.h>
510 #include <NodeInfo.h>
511 #include <Path.h>
512 #include <Picture.h>
513 #include <PictureButton.h>
514 #include <Point.h>
515 #include <Polygon.h>
516 #include <PopUpMenu.h>
517 #include <PropertyInfo.h>
518 #include <RadioButton.h>
519 #include <Resources.h>
520 #include <Screen.h>
521 #include <ScrollBar.h>
522 #include <String.h>
523 #include <StringView.h>
524 #include <TextControl.h>
525 #include <View.h>
526 
527 /* Included from the Mail Daemon Replacement project (MDR) include/public
528 directory, available from http://sourceforge.net/projects/bemaildaemon/ */
529 
530 #include <MailMessage.h>
531 #include <MailAttachment.h>
532 
533 
534 /******************************************************************************
535  * Global variables, and not-so-variable things too.  Grouped by functionality.
536  */
537 
/* Standard GUI measurements, presumably used when laying out the controls.
None of them is given a value here, so as statics they start out as zero; the
comment on g_AppReadyToRunCompleted further down says that things like button
heights aren't determined until ReadyToRun has finished. */
538 static float g_MarginBetweenControls; /* Space of a letter "M" between them. */
539 static float g_LineOfTextHeight;      /* Height of text in the current font. */
540 static float g_StringViewHeight;      /* Height of a string view text box. */
541 static float g_ButtonHeight;          /* How many pixels tall buttons are. */
542 static float g_CheckBoxHeight;        /* Same for check boxes. */
543 static float g_RadioButtonHeight;     /* Also for radio buttons. */
544 static float g_PopUpMenuHeight;       /* Again for pop-up menus. */
545 static float g_TextBoxHeight;         /* Ditto for editable text controls. */
546 
/* Fixed strings which identify the program, its database file and the file
attributes, sounds and message fields it works with. */
547 static const char *g_ABSAppSignature =
548   "application/x-vnd.agmsmith.spamdbm";
  /* The application signature, a MIME type style string. */
549 
550 static const char *g_ABSDatabaseFileMIMEType =
551   "text/x-vnd.agmsmith.spam_probability_database";
  /* MIME type for the word database file. */
552 
553 static const char *g_DefaultDatabaseFileName =
554   "SpamDBM Database";
  /* Leaf name of the database file when the user hasn't picked another one;
  the DatabaseFile scripting help text gives the default location as
  B_USER_SETTINGS_DIRECTORY / Mail / SpamDBM Database. */
555 
556 static const char *g_DatabaseRecognitionString =
557   "Spam Database File";
  /* Presumably a marker stored in the database file so that it can be
  recognised when loading - TODO confirm against the load/save code. */
558 
559 static const char *g_AttributeNameClassification = "MAIL:classification";
  /* File attribute holding the user's classification of a message; the Spam
  scripting help text says it gets a value of "Spam" or "Genuine". */
560 static const char *g_AttributeNameSpamRatio = "MAIL:ratio_spam";
  /* File attribute for the spam ratio of an evaluated message. */
561 static const char *g_BeepGenuine = "SpamFilter-Genuine";
562 static const char *g_BeepSpam = "SpamFilter-Spam";
563 static const char *g_BeepUncertain = "SpamFilter-Uncertain";
  /* Sound effect names, one per kind of classification result. */
564 static const char *g_ClassifiedSpam = "Spam";
565 static const char *g_ClassifiedGenuine = "Genuine";
  /* The classification values, as text. */
566 static const char *g_DataName = "data";
567 static const char *g_ResultName = "result";
  /* Presumably the field names used in scripting messages and their replies -
  TODO confirm against the message handling code. */
568 
/* Settings file details, database backup limits and the tuning constants for
the scoring.  g_SettingsDirectoryName matches the "Mail" subdirectory which
the DatabaseFile scripting help text mentions.  g_SettingsWhatCode is a four
character code, presumably the "what" code of the settings message (TODO
confirm).  g_BackupSuffix is a printf style format taking the backup number,
presumably appended to the database file name.  The two Robinson constants
are named after Gary Robinson's algorithm, see the link at the top of this
file. */
569 static const char *g_SettingsDirectoryName = "Mail";
570 static const char *g_SettingsFileName = "SpamDBM Settings";
571 static const uint32 g_SettingsWhatCode = 'SDBM';
572 static const char *g_BackupSuffix = ".backup %d";
573 static const int g_MaxBackups = 10; /* Numbered from 0 to g_MaxBackups - 1. */
574 static const size_t g_MaxWordLength = 50; /* Words longer than this aren't. */
575 static const int g_MaxInterestingWords = 150; /* Top N words are examined. */
576 static const double g_RobinsonS = 0.45; /* Default weight for no data. */
577 static const double g_RobinsonX = 0.5; /* Halfway point for no data. */
578 
/* Program-wide run state: how the program was started, the delayed quit
counter, the startup-finished flag, and the helper objects shared by the whole
program.  The two mode flags have no initialiser, so as statics they start out
false. */
579 static bool g_CommandLineMode;
580   /* TRUE if the program was started from the command line (and thus should
581   exit after processing the command), FALSE if it is running with a graphical
582   user interface. */
583 
584 static bool g_ServerMode;
585   /* When TRUE the program runs in server mode - error messages don't result in
586   pop-up dialog boxes, but you can still see them in stderr.  Also the window
587   is minimized, if it exists. */
588 
589 static int g_QuitCountdown = -1;
590   /* Set to the number of pulse timing events (about one every half second) to
591   count down before the program quits.  Negative means stop counting.  Zero
592   means quit at the next pulse event.  This is used to keep the program alive
593   for a short while after someone requests that it quit, in case more scripting
594   commands come in, which will stop the countdown.  Needed to handle the case
595   where there are multiple e-mail accounts all requesting spam identification,
596   and one finishes first and tells the server to quit.  It also checks to see
597   that there is no more work to do before trying to quit. */
598 
599 static volatile bool g_AppReadyToRunCompleted = false;
600   /* The BApplication starts processing messages before ReadyToRun finishes,
601   which can lead to initialisation problems (button heights not determined).
602   So wait for this to turn TRUE in code that might run early, like
603   RefsReceived. */
  /* NOTE(review): volatile only stops the compiler from caching the value, it
  isn't a synchronisation primitive in standard C++.  If this flag is read and
  written by different threads, confirm that simple polling is all that the
  waiting code relies on. */
604 
605 static class CommanderLooper *g_CommanderLooperPntr = NULL;
606 static BMessenger *g_CommanderMessenger = NULL;
607   /* Some globals for use with the looper which processes external commands
608   (arguments received, file references received), needed for avoiding deadlocks
609   which would happen if the BApplication sent a scripting message to itself. */
  /* Writing "class CommanderLooper" in the declaration also forward declares
  the class, so the pointer can be declared here before the class itself has
  been defined. */
610 
611 static BCursor *g_BusyCursor = NULL;
612   /* The busy cursor, will be loaded from the resource file during application
613   startup. */
614 
/* Identifiers for the scripting properties this program understands.  The
values run consecutively from zero, so they can be used to index
g_PropertyNames (which is declared with PN_MAX elements), and they are also
what gets stored in the extra_data field of the g_ScriptingPropertyList
entries.  PN_MAX isn't a property, it is just the number of properties, so new
ones need to be added before it. */
615 typedef enum PropertyNumbersEnum
616 {
617   PN_DATABASE_FILE = 0,
618   PN_SPAM,
619   PN_SPAM_STRING,
620   PN_GENUINE,
621   PN_GENUINE_STRING,
622   PN_UNCERTAIN,
623   PN_IGNORE_PREVIOUS_CLASSIFICATION,
624   PN_SERVER_MODE,
625   PN_FLUSH,
626   PN_PURGE_AGE,
627   PN_PURGE_POPULARITY,
628   PN_PURGE,
629   PN_OLDEST,
630   PN_EVALUATE,
631   PN_EVALUATE_STRING,
632   PN_RESET_TO_DEFAULTS,
633   PN_INSTALL_THINGS,
634   PN_TOKENIZE_MODE,
635   PN_SCORING_MODE,
636   PN_MAX /* Count of the properties above, not a property itself. */
637 } PropertyNumbers;
638 
/* The scripting name of each property, indexed by the PropertyNumbers value.
The strings have to be listed in exactly the same order as the enum entries,
one for each value below PN_MAX; if there are fewer initialisers than PN_MAX
then the leftover elements will silently be NULL pointers.  The entries in
g_ScriptingPropertyList pick their names out of this array. */
639 static const char * g_PropertyNames [PN_MAX] =
640 {
641   "DatabaseFile",
642   "Spam",
643   "SpamString",
644   "Genuine",
645   "GenuineString",
646   "Uncertain",
647   "IgnorePreviousClassification",
648   "ServerMode",
649   "Flush",
650   "PurgeAge",
651   "PurgePopularity",
652   "Purge",
653   "Oldest",
654   "Evaluate",
655   "EvaluateString",
656   "ResetToDefaults",
657   "InstallThings",
658   "TokenizeMode",
659   "ScoringMode"
660 };
661 
662 /* This array lists the scripting commands we can handle, in a format that the
663 scripting system can understand too. */
664 
665 static struct property_info g_ScriptingPropertyList [] =
666 {
667   /* *name; commands[10]; specifiers[10]; *usage; extra_data; ... */
668   {g_PropertyNames[PN_DATABASE_FILE], {B_GET_PROPERTY, 0},
669     {B_DIRECT_SPECIFIER, 0}, "Get the pathname of the current database file.  "
670     "The default name is something like B_USER_SETTINGS_DIRECTORY / "
671     "Mail / SpamDBM Database", PN_DATABASE_FILE,
672     {}, {}, {}},
673   {g_PropertyNames[PN_DATABASE_FILE], {B_SET_PROPERTY, 0},
674     {B_DIRECT_SPECIFIER, 0}, "Change the pathname of the database file to "
675     "use.  It will automatically be converted to an absolute path name, "
676     "so make sure the parent directories exist before setting it.  If it "
677     "doesn't exist, you'll have to use the create command next.",
678     PN_DATABASE_FILE, {}, {}, {}},
679   {g_PropertyNames[PN_DATABASE_FILE], {B_CREATE_PROPERTY, 0},
680     {B_DIRECT_SPECIFIER, 0}, "Creates a new empty database, will replace "
681     "the existing database file too.", PN_DATABASE_FILE, {}, {}, {}},
682   {g_PropertyNames[PN_DATABASE_FILE], {B_DELETE_PROPERTY, 0},
683     {B_DIRECT_SPECIFIER, 0}, "Deletes the database file and all backup copies "
684     "of that file too.  Really only of use for uninstallers.",
685     PN_DATABASE_FILE, {}, {}, {}},
686   {g_PropertyNames[PN_DATABASE_FILE], {B_COUNT_PROPERTIES, 0},
687     {B_DIRECT_SPECIFIER, 0}, "Returns the number of words in the database.",
688     PN_DATABASE_FILE, {}, {}, {}},
689   {g_PropertyNames[PN_SPAM], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
690     "Adds the spam in the given file (specify full pathname to be safe) to "
691     "the database.  The words in the files will be added to the list of words "
692     "in the database that identify spam messages.  The files processed will "
693     "also have the attribute MAIL:classification added with a value of "
694     "\"Spam\" or \"Genuine\" as specified.  They also have their spam ratio "
695     "attribute updated, as if you had also used the Evaluate command on "
696     "them.  If they already have the MAIL:classification "
697     "attribute and it matches the new classification then they won't get "
698     "processed (and if it is different, they will get removed from the "
699     "statistics for the old class and added to the statistics for the new "
700     "one).  You can turn off that behaviour with the "
701     "IgnorePreviousClassification property.  The command line version lets "
702     "you specify more than one pathname.", PN_SPAM, {}, {}, {}},
703   {g_PropertyNames[PN_SPAM], {B_COUNT_PROPERTIES, 0}, {B_DIRECT_SPECIFIER, 0},
704     "Returns the number of spam messages in the database.", PN_SPAM,
705     {}, {}, {}},
706   {g_PropertyNames[PN_SPAM_STRING], {B_SET_PROPERTY, 0},
707     {B_DIRECT_SPECIFIER, 0}, "Adds the spam in the given string (assumed to "
708     "be the text of a whole e-mail message, not just a file name) to the "
709     "database.", PN_SPAM_STRING, {}, {}, {}},
710   {g_PropertyNames[PN_GENUINE], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
711     "Similar to adding spam except that the message file is added to the "
712     "genuine statistics.", PN_GENUINE, {}, {}, {}},
713   {g_PropertyNames[PN_GENUINE], {B_COUNT_PROPERTIES, 0},
714     {B_DIRECT_SPECIFIER, 0}, "Returns the number of genuine messages in the "
715     "database.", PN_GENUINE, {}, {}, {}},
716   {g_PropertyNames[PN_GENUINE_STRING], {B_SET_PROPERTY, 0},
717     {B_DIRECT_SPECIFIER, 0}, "Adds the genuine message in the given string "
718     "(assumed to be the text of a whole e-mail message, not just a file name) "
719     "to the database.", PN_GENUINE_STRING, {}, {}, {}},
720   {g_PropertyNames[PN_UNCERTAIN], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
721     "Similar to adding spam except that the message file is removed from the "
722     "database, undoing the previous classification.  Obviously, it needs to "
723     "have been classified previously (using the file attributes) so it can "
724     "tell if it is removing spam or genuine words.", PN_UNCERTAIN, {}, {}, {}},
725   {g_PropertyNames[PN_IGNORE_PREVIOUS_CLASSIFICATION], {B_SET_PROPERTY, 0},
726     {B_DIRECT_SPECIFIER, 0}, "If set to true then the previous classification "
727     "(which was saved as an attribute of the e-mail message file) will be "
728     "ignored, so that you can add the message to the database again.  If set "
729     "to false (the normal case), the attribute will be examined, and if the "
730     "message has already been classified as what you claim it is, nothing "
731     "will be done.  If it was misclassified, then the message will be removed "
732     "from the statistics for the old class and added to the stats for the "
733     "new classification you have requested.",
734     PN_IGNORE_PREVIOUS_CLASSIFICATION, {}, {}, {}},
735   {g_PropertyNames[PN_IGNORE_PREVIOUS_CLASSIFICATION], {B_GET_PROPERTY, 0},
736     {B_DIRECT_SPECIFIER, 0}, "Find out the current setting of the flag for "
737     "ignoring the previously recorded classification.",
738     PN_IGNORE_PREVIOUS_CLASSIFICATION, {}, {}, {}},
739   {g_PropertyNames[PN_SERVER_MODE], {B_SET_PROPERTY, 0},
740     {B_DIRECT_SPECIFIER, 0}, "If set to true then error messages get printed "
741     "to the standard error stream rather than showing up in an alert box.  "
742     "It also starts up with the window minimized.", PN_SERVER_MODE,
743     {}, {}, {}},
744   {g_PropertyNames[PN_SERVER_MODE], {B_GET_PROPERTY, 0},
745     {B_DIRECT_SPECIFIER, 0}, "Find out the setting of the server mode flag.",
746     PN_SERVER_MODE, {}, {}, {}},
747   {g_PropertyNames[PN_FLUSH], {B_EXECUTE_PROPERTY, 0},
748     {B_DIRECT_SPECIFIER, 0}, "Writes out the database file to disk, if it has "
749     "been updated in memory but hasn't been saved to disk.  It will "
750     "automatically get written when the program exits, so this command is "
751     "mostly useful for server mode.", PN_FLUSH, {}, {}, {}},
752   {g_PropertyNames[PN_PURGE_AGE], {B_SET_PROPERTY, 0},
753     {B_DIRECT_SPECIFIER, 0}, "Sets the old age limit.  Words which haven't "
754       "been updated since this many message additions to the database may be "
755       "deleted when you do a purge.  A good value is 1000, meaning that if a "
756       "word hasn't appeared in the last 1000 spam/genuine messages, it will "
757       "be forgotten.  Zero will purge all words, 1 will purge words not in "
758       "the last message added to the database, 2 will purge words not in the "
759       "last two messages added, and so on.  This is mostly useful for "
760       "removing those one time words which are often hunks of binary garbage, "
761       "not real words.  This acts in combination with the popularity limit; "
762       "both conditions have to be valid before the word gets deleted.",
763       PN_PURGE_AGE, {}, {}, {}},
764   {g_PropertyNames[PN_PURGE_AGE], {B_GET_PROPERTY, 0},
765     {B_DIRECT_SPECIFIER, 0}, "Gets the old age limit.", PN_PURGE_AGE,
766     {}, {}, {}},
767   {g_PropertyNames[PN_PURGE_POPULARITY], {B_SET_PROPERTY, 0},
768     {B_DIRECT_SPECIFIER, 0}, "Sets the popularity limit.  Words which aren't "
769     "this popular may be deleted when you do a purge.  A good value is 5, "
770     "which means that the word is safe from purging if it has been seen in 6 "
771     "or more e-mail messages.  If it's only in 5 or less, then it may get "
772     "purged.  The extreme is zero, where only words that haven't been seen "
773     "in any message are deleted (usually means no words).  This acts in "
774     "combination with the old age limit; both conditions have to be valid "
775     "before the word gets deleted.", PN_PURGE_POPULARITY, {}, {}, {}},
776   {g_PropertyNames[PN_PURGE_POPULARITY], {B_GET_PROPERTY, 0},
777     {B_DIRECT_SPECIFIER, 0}, "Gets the purge popularity limit.",
778     PN_PURGE_POPULARITY, {}, {}, {}},
779   {g_PropertyNames[PN_PURGE], {B_EXECUTE_PROPERTY, 0},
780     {B_DIRECT_SPECIFIER, 0}, "Purges the old obsolete words from the "
781     "database, if they are old enough according to the age limit and also "
782     "unpopular enough according to the popularity limit.", PN_PURGE,
783     {}, {}, {}},
784   {g_PropertyNames[PN_OLDEST], {B_GET_PROPERTY, 0},
785     {B_DIRECT_SPECIFIER, 0}, "Gets the age of the oldest message in the "
786     "database.  It's relative to the beginning of time, so you need to do "
787     "(total messages - age - 1) to see how many messages ago it was added.",
788     PN_OLDEST, {}, {}, {}},
789   {g_PropertyNames[PN_EVALUATE], {B_SET_PROPERTY, 0},
790     {B_DIRECT_SPECIFIER, 0}, "Evaluates a given file (by path name) to see "
791     "if it is spam or not.  Returns the ratio of spam probability vs genuine "
792     "probability, 0.0 meaning completely genuine, 1.0 for completely spam.  "
793     "Normally you should safely be able to consider it as spam if it is over "
794     "0.56 for the Robinson scoring method.  For the ChiSquared method, the "
795     "numbers are near 0 for genuine, near 1 for spam, and anywhere in the "
796     "middle means it can't decide.  The program attaches a MAIL:ratio_spam "
797     "attribute with the ratio as its "
798     "float32 value to the file.  Also returns the top few interesting words "
799     "in \"words\" and the associated per-word probability ratios in "
800     "\"ratios\".", PN_EVALUATE, {}, {}, {}},
801   {g_PropertyNames[PN_EVALUATE_STRING], {B_SET_PROPERTY, 0},
802     {B_DIRECT_SPECIFIER, 0}, "Like Evaluate, but rather than a file name, "
803     "the string argument contains the entire text of the message to be "
804     "evaluated.", PN_EVALUATE_STRING, {}, {}, {}},
805   {g_PropertyNames[PN_RESET_TO_DEFAULTS], {B_EXECUTE_PROPERTY, 0},
806     {B_DIRECT_SPECIFIER, 0}, "Resets all the configuration options to the "
807     "default values, including the database name.", PN_RESET_TO_DEFAULTS,
808     {}, {}, {}},
809   {g_PropertyNames[PN_INSTALL_THINGS], {B_EXECUTE_PROPERTY, 0},
810     {B_DIRECT_SPECIFIER, 0}, "Creates indices for the MAIL:classification and "
811     "MAIL:ratio_spam attributes on all volumes which support BeOS queries, "
812     "identifies them to the system as e-mail related attributes (modifies "
813     "the text/x-email MIME type), and sets up the new MIME type "
814     "(text/x-vnd.agmsmith.spam_probability_database) for the database file.  "
815     "Also registers names for the sound effects used by the separate filter "
816     "program (use the installsound BeOS program or the Sounds preferences "
817     "program to associate sound files with the names).", PN_INSTALL_THINGS,
818     {}, {}, {}},
819   {g_PropertyNames[PN_TOKENIZE_MODE], {B_SET_PROPERTY, 0},
820     {B_DIRECT_SPECIFIER, 0}, "Sets the method used for breaking up the "
821     "message into words.  Use \"Whole\" for the whole file (also use it for "
822     "non-email files).  The file isn't broken into parts; the whole thing is "
823     "converted into words, headers and attachments are just more raw data.  "
824     "Well, not quite raw data since it converts quoted-printable codes "
825     "(equals sign followed by hex digits or end of line) to the equivalent "
826     "single characters.  \"PlainText\" breaks the file into MIME components "
827     "and only looks at the ones which are of MIME type text/plain.  "
828     "\"AnyText\" will look for words in all text/* things, including "
829     "text/html attachments.  \"AllParts\" will decode all message components "
830     "and look for words in them, including binary attachments.  "
831     "\"JustHeader\" will only look for words in the message header.  "
832     "\"AllPartsAndHeader\", \"PlainTextAndHeader\" and \"AnyTextAndHeader\" "
833     "will also include the words from the message headers.", PN_TOKENIZE_MODE,
834     {}, {}, {}},
835   {g_PropertyNames[PN_TOKENIZE_MODE], {B_GET_PROPERTY, 0},
836     {B_DIRECT_SPECIFIER, 0}, "Gets the method used for breaking up the "
837     "message into words.", PN_TOKENIZE_MODE, {}, {}, {}},
838   {g_PropertyNames[PN_SCORING_MODE], {B_SET_PROPERTY, 0},
839     {B_DIRECT_SPECIFIER, 0}, "Sets the method used for combining the "
840     "probabilities of individual words into an overall score.  "
841     "\"Robinson\" mode will use Gary Robinson's nth root of the product "
842     "method.  It gives a nice range of values between 0 and 1 so you can "
843     "see shades of spaminess.  The cutoff point between spam and genuine "
844     "varies depending on your database of words (0.56 was one point in "
845     "some experiments).  \"ChiSquared\" mode will use chi-squared "
846     "statistics to evaluate the difference in probabilities that the lists "
847     "of word ratios are random.  The result is very close to 0 for genuine "
848     "and very close to 1 for spam, and near the middle if it is uncertain.",
849     PN_SCORING_MODE, {}, {}, {}},
850   {g_PropertyNames[PN_SCORING_MODE], {B_GET_PROPERTY, 0},
851     {B_DIRECT_SPECIFIER, 0}, "Gets the method used for combining the "
852     "individual word ratios into an overall score.", PN_SCORING_MODE,
853     {}, {}, {}},
854 
855   { 0 }
856 };
857 
858 
/* Scoring modes: the available ways of merging the per-word ratios into one
overall score for a message.  SM_MAX is not a mode; it is the number of modes
and sizes the name table below.  See the PN_SCORING_MODE help text for a
description of each method. */

enum ScoringModeEnum
{
  SM_ROBINSON = 0,
  SM_CHISQUARED = 1,
  SM_MAX = 2
};

typedef enum ScoringModeEnum ScoringModes;

/* Text name of each scoring mode, indexed by the ScoringModes value, so the
order of the strings has to stay the same as the order of the enum. */

static const char * g_ScoringModeNames [SM_MAX] =
{
  "Robinson",   /* SM_ROBINSON */
  "ChiSquared"  /* SM_CHISQUARED */
};
873 
874 
/* Tokenizing modes: the available ways of breaking a message up into words.
TM_MAX is not a mode; it is the number of modes and sizes the name table
below.  See the PN_TOKENIZE_MODE help text for what each mode looks at. */

enum TokenizeModeEnum
{
  TM_WHOLE = 0,
  TM_PLAIN_TEXT = 1,
  TM_PLAIN_TEXT_HEADER = 2,
  TM_ANY_TEXT = 3,
  TM_ANY_TEXT_HEADER = 4,
  TM_ALL_PARTS = 5,
  TM_ALL_PARTS_HEADER = 6,
  TM_JUST_HEADER = 7,
  TM_MAX = 8
};

typedef enum TokenizeModeEnum TokenizeModes;

/* Text name of each tokenizing mode, indexed by the TokenizeModes value, so
the order of the strings has to stay the same as the order of the enum.
NOTE(review): the PN_TOKENIZE_MODE help text in the scripting property list
spells the modes as "Whole", "PlainText", "AnyText", "AllParts" and so on,
which doesn't match the strings here.  Confirm which spelling the code that
sets the property really accepts before changing either one. */

static const char * g_TokenizeModeNames [TM_MAX] =
{
  "All",                    /* TM_WHOLE */
  "Plain text",             /* TM_PLAIN_TEXT */
  "Plain text and header",  /* TM_PLAIN_TEXT_HEADER */
  "Any text",               /* TM_ANY_TEXT */
  "Any text and header",    /* TM_ANY_TEXT_HEADER */
  "All parts",              /* TM_ALL_PARTS */
  "All parts and header",   /* TM_ALL_PARTS_HEADER */
  "Just header"             /* TM_JUST_HEADER */
};
901 
902 
/* The classifications that a message can be given.  CL_MAX is not a
classification; it is the number of them, used for sizing tables indexed by
classification. */

enum ClassificationTypesEnum
{
  CL_GENUINE = 0,
  CL_SPAM = 1,
  CL_UNCERTAIN = 2,
  CL_MAX = 3
};

typedef enum ClassificationTypesEnum ClassificationTypes;
912 
913 static const char * g_ClassificationTypeNames [CL_MAX] =
914 {
915   g_ClassifiedGenuine,
916   g_ClassifiedSpam,
917   "Uncertain"
918 };
919 
920 
921 /* Some polygon graphics for the scroll arrows. */
922 
923 static BPoint g_UpLinePoints [] =
924 {
925   BPoint (8, 2 * (1)),
926   BPoint (14, 2 * (6)),
927   BPoint (10, 2 * (6)),
928   BPoint (10, 2 * (13)),
929   BPoint (6, 2 * (13)),
930   BPoint (6, 2 * (6)),
931   BPoint (2, 2 * (6))
932 };
933 
934 static BPoint g_DownLinePoints [] =
935 {
936   BPoint (8, 2 * (14-1)),
937   BPoint (14, 2 * (14-6)),
938   BPoint (10, 2 * (14-6)),
939   BPoint (10, 2 * (14-13)),
940   BPoint (6, 2 * (14-13)),
941   BPoint (6, 2 * (14-6)),
942   BPoint (2, 2 * (14-6))
943 };
944 
945 static BPoint g_UpPagePoints [] =
946 {
947   BPoint (8, 2 * (1)),
948   BPoint (13, 2 * (6)),
949   BPoint (10, 2 * (6)),
950   BPoint (14, 2 * (10)),
951   BPoint (10, 2 * (10)),
952   BPoint (10, 2 * (13)),
953   BPoint (6, 2 * (13)),
954   BPoint (6, 2 * (10)),
955   BPoint (2, 2 * (10)),
956   BPoint (6, 2 * (6)),
957   BPoint (3, 2 * (6))
958 };
959 
960 static BPoint g_DownPagePoints [] =
961 {
962   BPoint (8, 2 * (14-1)),
963   BPoint (13, 2 * (14-6)),
964   BPoint (10, 2 * (14-6)),
965   BPoint (14, 2 * (14-10)),
966   BPoint (10, 2 * (14-10)),
967   BPoint (10, 2 * (14-13)),
968   BPoint (6, 2 * (14-13)),
969   BPoint (6, 2 * (14-10)),
970   BPoint (2, 2 * (14-10)),
971   BPoint (6, 2 * (14-6)),
972   BPoint (3, 2 * (14-6))
973 };
974 
975 
/* Lookup table of space-like characters, indexed by character code.  If
g_SpaceCharacters[X] is true then character code X counts as a space.  Only
codes 0 to 127 have an entry; codes 128 and above are never treated as spaces
because they are parts of UTF-8 characters, so callers have to range check
before indexing.  The table is filled in by the ABSApp constructor. */

static bool g_SpaceCharacters [128];
982 
983 
984 
985 /******************************************************************************
986  * Each word in the spam database gets one of these structures.  The database
987  * has a string (the word) as the key and this structure as the value
988  * (statistics for that word).
989  */
990 
991 typedef struct StatisticsStruct
992 {
993   uint32 age;
994     /* Sequence number for the time when this word was last updated in the
995     database, so that we can remove old words (haven't been seen in recent
996     spam).  It's zero for the first file ever added (spam or genuine) to the
997     database, 1 for all words added or updated by the second file, etc.  If a
998     later file updates an existing word, it gets the age of the later file. */
999 
1000   uint32 genuineCount;
1001     /* Number of genuine messages that have this word. */
1002 
1003   uint32 spamCount;
1004     /* A count of the number of spam e-mail messages which contain the word. */
1005 
1006 } StatisticsRecord, *StatisticsPointer;
1007 
1008 typedef map<string, StatisticsRecord> StatisticsMap;
1009   /* Define this type which will be used for our main data storage facility, so
1010   we can more conveniently specify things that are derived from it, like
1011   iterators. */
1012 
1013 
1014 
1015 /******************************************************************************
1016  * An alert box asking how the user wants to mark messages.  There are buttons
1017  * for each classification category, and a checkbox to mark all remaining N
1018  * messages the same way.  And a cancel button.  To use it, first create the
1019  * ClassificationChoicesWindow, specifying the input arguments.  Then call the
1020  * Go method which will show the window, stuff the user's answer into your
1021  * output arguments (class set to CL_MAX if the user cancels), and destroy the
1022  * window.  Implemented because BAlert only allows 3 buttons, max!
1023  */
1024 
1025 class ClassificationChoicesWindow : public BWindow
1026 {
1027 public:
1028   /* Constructor and destructor. */
1029   ClassificationChoicesWindow (BRect FrameRect,
1030     const char *FileName, int NumberOfFiles);
1031 
1032   /* BeOS virtual functions. */
1033   virtual void MessageReceived (BMessage *MessagePntr);
1034 
1035   /* Our methods. */
1036   void Go (bool *BulkModeSelectedPntr,
1037     ClassificationTypes *ChoosenClassificationPntr);
1038 
1039   /* Various message codes for various buttons etc. */
1040   static const uint32 MSG_CLASS_BUTTONS = 'ClB0';
1041   static const uint32 MSG_CANCEL_BUTTON = 'Cncl';
1042   static const uint32 MSG_BULK_CHECKBOX = 'BlkK';
1043 
1044 private:
1045   /* Member variables. */
1046   bool *m_BulkModeSelectedPntr;
1047   ClassificationTypes *m_ChoosenClassificationPntr;
1048 };
1049 
1050 class ClassificationChoicesView : public BView
1051 {
1052 public:
1053   /* Constructor and destructor. */
1054   ClassificationChoicesView (BRect FrameRect,
1055     const char *FileName, int NumberOfFiles);
1056 
1057   /* BeOS virtual functions. */
1058   virtual void AttachedToWindow ();
1059   virtual void GetPreferredSize (float *width, float *height);
1060 
1061 private:
1062   /* Member variables. */
1063   const char *m_FileName;
1064   int         m_NumberOfFiles;
1065   float       m_PreferredBottomY;
1066 };
1067 
1068 
1069 
1070 /******************************************************************************
1071  * Due to deadlock problems with the BApplication posting scripting messages to
1072  * itself, we need to add a second Looper.  Its job is to just to convert
1073  * command line arguments and arguments from the Tracker (refs received) into a
1074  * series of scripting commands sent to the main BApplication.  It also prints
1075  * out the replies received (to stdout for command line replies).  An instance
1076  * of this class will be created and run by the main() function, and shut down
1077  * by it too.
1078  */
1079 
1080 class CommanderLooper : public BLooper
1081 {
1082 public:
1083   CommanderLooper ();
1084   ~CommanderLooper ();
1085   virtual void MessageReceived (BMessage *MessagePntr);
1086 
1087   void CommandArguments (int argc, char **argv);
1088   void CommandReferences (BMessage *MessagePntr,
1089     bool BulkMode = false,
1090     ClassificationTypes BulkClassification = CL_GENUINE);
1091   bool IsBusy ();
1092 
1093 private:
1094   void ProcessArgs (BMessage *MessagePntr);
1095   void ProcessRefs (BMessage *MessagePntr);
1096 
1097   static const uint32 MSG_COMMAND_ARGUMENTS = 'CArg';
1098   static const uint32 MSG_COMMAND_FILE_REFS = 'CRef';
1099 
1100   bool m_IsBusy;
1101 };
1102 
1103 
1104 
1105 /******************************************************************************
1106  * This view contains the various buttons and other controls for setting
1107  * configuration options and displaying the state of the database (but not the
1108  * actual list of words).  It will appear in the top half of the
1109  * DatabaseWindow.
1110  */
1111 
1112 class ControlsView : public BView
1113 {
1114 public:
1115   /* Constructor and destructor. */
1116   ControlsView (BRect NewBounds);
1117   ~ControlsView ();
1118 
1119   /* BeOS virtual functions. */
1120   virtual void AttachedToWindow ();
1121   virtual void FrameResized (float Width, float Height);
1122   virtual void MessageReceived (BMessage *MessagePntr);
1123   virtual void Pulse ();
1124 
1125 private:
1126   /* Various message codes for various buttons etc. */
1127   static const uint32 MSG_BROWSE_BUTTON = 'Brws';
1128   static const uint32 MSG_DATABASE_NAME = 'DbNm';
1129   static const uint32 MSG_ESTIMATE_BUTTON = 'Estm';
1130   static const uint32 MSG_ESTIMATE_FILE_REFS = 'ERef';
1131   static const uint32 MSG_IGNORE_CLASSIFICATION = 'IPCl';
1132   static const uint32 MSG_PURGE_AGE = 'PuAg';
1133   static const uint32 MSG_PURGE_BUTTON = 'Purg';
1134   static const uint32 MSG_PURGE_POPULARITY = 'PuPo';
1135   static const uint32 MSG_SERVER_MODE = 'SrvM';
1136 
1137   /* Our member functions. */
1138   void BrowseForDatabaseFile ();
1139   void BrowseForFileToEstimate ();
1140   void PollServerForChanges ();
1141 
1142   /* Member variables. */
1143   BButton        *m_AboutButtonPntr;
1144   BButton        *m_AddExampleButtonPntr;
1145   BButton        *m_BrowseButtonPntr;
1146   BFilePanel     *m_BrowseFilePanelPntr;
1147   BButton        *m_CreateDatabaseButtonPntr;
1148   char            m_DatabaseFileNameCachedValue [PATH_MAX];
1149   BTextControl   *m_DatabaseFileNameTextboxPntr;
1150   bool            m_DatabaseLoadDone;
1151   BButton        *m_EstimateSpamButtonPntr;
1152   BFilePanel     *m_EstimateSpamFilePanelPntr;
1153   uint32          m_GenuineCountCachedValue;
1154   BTextControl   *m_GenuineCountTextboxPntr;
1155   bool            m_IgnorePreviousClassCachedValue;
1156   BCheckBox      *m_IgnorePreviousClassCheckboxPntr;
1157   BButton        *m_InstallThingsButtonPntr;
1158   uint32          m_PurgeAgeCachedValue;
1159   BTextControl   *m_PurgeAgeTextboxPntr;
1160   BButton        *m_PurgeButtonPntr;
1161   uint32          m_PurgePopularityCachedValue;
1162   BTextControl   *m_PurgePopularityTextboxPntr;
1163   BButton        *m_ResetToDefaultsButtonPntr;
1164   ScoringModes    m_ScoringModeCachedValue;
1165   BMenuBar       *m_ScoringModeMenuBarPntr;
1166   BPopUpMenu     *m_ScoringModePopUpMenuPntr;
1167   bool            m_ServerModeCachedValue;
1168   BCheckBox      *m_ServerModeCheckboxPntr;
1169   uint32          m_SpamCountCachedValue;
1170   BTextControl   *m_SpamCountTextboxPntr;
1171   bigtime_t       m_TimeOfLastPoll;
1172   TokenizeModes   m_TokenizeModeCachedValue;
1173   BMenuBar       *m_TokenizeModeMenuBarPntr;
1174   BPopUpMenu     *m_TokenizeModePopUpMenuPntr;
1175   uint32          m_WordCountCachedValue;
1176   BTextControl   *m_WordCountTextboxPntr;
1177 };
1178 
1179 
1180 /* Various message codes for various buttons etc. */
1181 static const uint32 MSG_LINE_DOWN = 'LnDn';
1182 static const uint32 MSG_LINE_UP = 'LnUp';
1183 static const uint32 MSG_PAGE_DOWN = 'PgDn';
1184 static const uint32 MSG_PAGE_UP = 'PgUp';
1185 
1186 /******************************************************************************
1187  * This view contains the list of words.  It displays as many as can fit in the
1188  * view rectangle, starting at a specified word (so it can simulate scrolling).
1189  * Usually it will appear in the bottom half of the DatabaseWindow.
1190  */
1191 
1192 class WordsView : public BView
1193 {
1194 public:
1195   /* Constructor and destructor. */
1196   WordsView (BRect NewBounds);
1197 
1198   /* BeOS virtual functions. */
1199   virtual void AttachedToWindow ();
1200   virtual void Draw (BRect UpdateRect);
1201   virtual void KeyDown (const char *BufferPntr, int32 NumBytes);
1202   virtual void MakeFocus (bool Focused);
1203   virtual void MessageReceived (BMessage *MessagePntr);
1204   virtual void MouseDown (BPoint point);
1205   virtual void Pulse ();
1206 
1207 private:
1208   /* Our member functions. */
1209   void MoveTextUpOrDown (uint32 MovementType);
1210   void RefsDroppedHere (BMessage *MessagePntr);
1211 
1212   /* Member variables. */
1213   BPictureButton *m_ArrowLineDownPntr;
1214   BPictureButton *m_ArrowLineUpPntr;
1215   BPictureButton *m_ArrowPageDownPntr;
1216   BPictureButton *m_ArrowPageUpPntr;
1217     /* Various buttons for controlling scrolling, since we can't use a scroll
1218     bar.  To make them less obvious, their background view colour needs to be
1219     changed whenever the main view's colour changes. */
1220 
1221   float m_AscentHeight;
1222     /* The ascent height for the font used to draw words.  Height from the top
1223     of the highest letter to the base line (which is near the middle bottom of
1224     the letters, the line where you would align your writing of the text by
1225     hand, all letters have part above, some also have descenders below this
1226     line). */
1227 
1228   rgb_color m_BackgroundColour;
1229     /* The current background colour.  Changes when the focus changes. */
1230 
1231   uint32 m_CachedTotalGenuineMessages;
1232   uint32 m_CachedTotalSpamMessages;
1233   uint32 m_CachedWordCount;
1234     /* These are cached copies of the similar values in the BApplication.  They
1235     reflect what's currently displayed.  If they are different than the values
1236     from the BApplication then the polling loop will try to redraw the display.
1237     They get set to the values actually used during drawing when drawing is
1238     successful. */
1239 
1240   char m_FirstDisplayedWord [g_MaxWordLength + 1];
1241     /* The scrolling display starts at this word.  Since we can't use index
1242     numbers (word[12345] for example), we use the word itself.  The scroll
1243     buttons set this to the next or previous word in the database.  Typing by
1244     the user when the view has the focus will also change this starting word.
1245     */
1246 
1247   rgb_color m_FocusedColour;
1248     /* The colour to use for focused mode (typing by the user is received by
1249     our view). */
1250 
1251   bigtime_t m_LastTimeAKeyWasPressed;
1252     /* Records the time when a key was last pressed.  Used for determining when
1253     the user has stopped typing a batch of letters. */
1254 
1255   float m_LineHeight;
1256     /* Height of a line of text in the font used for the word display.
1257     Includes the height of the letters plus a bit of extra space for between
1258     the lines (called leading). */
1259 
1260   BFont m_TextFont;
1261     /* The font used to draw the text in the window. */
1262 
1263   float m_TextHeight;
1264     /* Maximum total height of the letters in the text, includes the part above
1265     the baseline and the part below.  Doesn't include the sliver of space
1266     between lines. */
1267 
1268   rgb_color m_UnfocusedColour;
1269     /* The colour to use for unfocused mode, when user typing isn't active. */
1270 };
1271 
1272 
1273 
1274 /******************************************************************************
1275  * The BWindow class for this program.  It displays the database in real time,
1276  * and has various buttons and gadgets in the top half for changing settings
1277  * (live changes, no OK button, and they reflect changes done by other programs
1278  * using the server too).  The bottom half is a scrolling view listing all the
1279  * words in the database.  A simple graphic blotch behind each word shows
1280  * whether the word is strongly or weakly related to spam or genuine messages.
1281  * Most operations go through the scripting message system, but it also peeks
1282  * at the BApplication data for examining simple things and when redrawing the
1283  * list of words.
1284  */
1285 
1286 class DatabaseWindow : public BWindow
1287 {
1288 public:
1289   /* Constructor and destructor. */
1290   DatabaseWindow ();
1291 
1292   /* BeOS virtual functions. */
1293   virtual void MessageReceived (BMessage *MessagePntr);
1294   virtual bool QuitRequested ();
1295 
1296 private:
1297   /* Member variables. */
1298   ControlsView *m_ControlsViewPntr;
1299   WordsView    *m_WordsViewPntr;
1300 };
1301 
1302 
1303 
1304 /******************************************************************************
1305  * ABSApp is the BApplication class for this program.  This handles messages
1306  * from the outside world (requests to load a database, or to add files to the
1307  * collection).  It responds to command line arguments (if you start up the
1308  * program a second time, the system will just send the arguments to the
1309  * existing running program).  It responds to scripting messages.  And it
1310  * responds to messages from the window.  Its thread does the main work of
1311  * updating the database and reading / writing files.
1312  */
1313 
1314 class ABSApp : public BApplication
1315 {
1316 public:
1317   /* Constructor and destructor. */
1318   ABSApp ();
1319   ~ABSApp ();
1320 
1321   /* BeOS virtual functions. */
1322   virtual void AboutRequested ();
1323   virtual void ArgvReceived (int32 argc, char **argv);
1324   virtual status_t GetSupportedSuites (BMessage *MessagePntr);
1325   virtual void MessageReceived (BMessage *MessagePntr);
1326   virtual void Pulse ();
1327   virtual bool QuitRequested ();
1328   virtual void ReadyToRun ();
1329   virtual void RefsReceived (BMessage *MessagePntr);
1330   virtual BHandler *ResolveSpecifier (BMessage *MessagePntr, int32 Index,
1331     BMessage *SpecifierMsgPntr, int32 SpecificationKind, const char *Property);
1332 
1333 private:
1334   /* Our member functions. */
1335   status_t AddFileToDatabase (ClassificationTypes IsSpamOrWhat,
1336     const char *FileName, char *ErrorMessage);
1337   status_t AddPositionIOToDatabase (ClassificationTypes IsSpamOrWhat,
1338     BPositionIO *MessageIOPntr, const char *OptionalFileName,
1339     char *ErrorMessage);
1340   status_t AddStringToDatabase (ClassificationTypes IsSpamOrWhat,
1341     const char *String, char *ErrorMessage);
1342   void AddWordsToSet (const char *InputString, size_t NumberOfBytes,
1343     char PrefixCharacter, set<string> &WordSet);
1344   status_t CreateDatabaseFile (char *ErrorMessage);
1345   void DefaultSettings ();
1346   status_t DeleteDatabaseFile (char *ErrorMessage);
1347   status_t EvaluateFile (const char *PathName, BMessage *ReplyMessagePntr,
1348     char *ErrorMessage);
1349   status_t EvaluatePositionIO (BPositionIO *PositionIOPntr,
1350     const char *OptionalFileName, BMessage *ReplyMessagePntr,
1351     char *ErrorMessage);
1352   status_t EvaluateString (const char *BufferPntr, ssize_t BufferSize,
1353     BMessage *ReplyMessagePntr, char *ErrorMessage);
1354   status_t GetWordsFromPositionIO (BPositionIO *PositionIOPntr,
1355     const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
1356   status_t InstallThings (char *ErrorMessage);
1357   status_t LoadDatabaseIfNeeded (char *ErrorMessage);
1358   status_t LoadSaveDatabase (bool DoLoad, char *ErrorMessage);
1359 public:
1360   status_t LoadSaveSettings (bool DoLoad);
1361 private:
1362   status_t MakeBackup (char *ErrorMessage);
1363   void MakeDatabaseEmpty ();
1364   void ProcessScriptingMessage (BMessage *MessagePntr,
1365     struct property_info *PropInfoPntr);
1366   status_t PurgeOldWords (char *ErrorMessage);
1367   status_t RecursivelyTokenizeMailComponent (
1368     BMailComponent *ComponentPntr, const char *OptionalFileName,
1369     set<string> &WordSet, char *ErrorMessage,
1370     int RecursionLevel, int MaxRecursionLevel);
1371   status_t SaveDatabaseIfNeeded (char *ErrorMessage);
1372   status_t TokenizeParts (BPositionIO *PositionIOPntr,
1373     const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
1374   status_t TokenizeWhole (BPositionIO *PositionIOPntr,
1375     const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
1376 
1377 public:
1378   /* Member variables.  Many are read by the window thread to see if it needs
1379   updating, and to draw the words.  However, the other threads will lock the
1380   BApplication or using scripting commands if they want to make changes. */
1381 
1382   bool m_DatabaseHasChanged;
1383     /* Set to TRUE when the in-memory database (stored in m_WordMap) has
1384     changed and is different from the on-disk database file.  When the
1385     application exits, the database will be written out if it has changed. */
1386 
1387   BString m_DatabaseFileName;
1388     /* The absolute path name to use for the database file on disk. */
1389 
1390   bool m_IgnorePreviousClassification;
1391     /* If TRUE then the previous classification of a message (stored in an
1392     attribute on the message file) will be ignored, and the message will be
1393     added to the requested spam/genuine list.  If this is FALSE then the spam
1394     won't be added to the list if it has already been classified as specified,
1395     but if it was mis-classified, it will be removed from the old list and
1396     added to the new list. */
1397 
1398   uint32 m_OldestAge;
1399     /* The age of the oldest word.  This will be the smallest age number in the
1400     database.  Mostly useful for scaling graphics representing age in the word
1401     display.  If the oldest word is no longer the oldest, this variable won't
1402     get immediately updated since it would take a lot of effort to find the
1403     next older age.  Since it's only used for display, we'll let it be slightly
1404     incorrect.  The next database load or purge will fix it. */
1405 
1406   uint32 m_PurgeAge;
1407     /* When purging old words, they have to be at least this old to be eligible
1408     for deletion.  Age is measured as the number of e-mails added to the
1409     database since the word was last updated in the database.  Zero means all
1410     words are old. */
1411 
1412   uint32 m_PurgePopularity;
1413     /* When purging old words, they have to be less than or equal to this
1414     popularity limit to be eligible for deletion.  Popularity is measured as
1415     the number of messages (spam and genuine) which have the word.  Zero means
1416     no words. */
1417 
1418   ScoringModes m_ScoringMode;
1419     /* Controls how to combine the word probabilities into an overall score.
1420     See the PN_SCORING_MODE comments for details. */
1421 
1422   BPath m_SettingsDirectoryPath;
1423     /* The constructor initialises this to the settings directory path.  It
1424     never changes after that. */
1425 
1426   bool m_SettingsHaveChanged;
1427     /* Set to TRUE when the settings are changed (different than the ones which
1428     were loaded).  When the application exits, the settings will be written out
1429     if they have changed. */
1430 
1431   double m_SmallestUseableDouble;
1432     /* When multiplying fractional numbers together, avoid using numbers
1433     smaller than this because the double exponent range is close to being
1434     exhausted.  The IEEE STANDARD 754 floating-point arithmetic (used on the
1435     Intel i8087 and later math processors) has 64 bit numbers with 53 bits of
1436     mantissa, giving it an underflow starting at 0.5**1022 = 2.2e-308 where it
1437     rounds off to the nearest multiple of 0.5**1074 = 4.9e-324. */
1438 
1439   TokenizeModes m_TokenizeMode;
1440     /* Controls how to convert the raw message text into words.  See the
1441     PN_TOKENIZE_MODE comments for details. */
1442 
1443   uint32 m_TotalGenuineMessages;
1444     /* Number of genuine messages which are in the database. */
1445 
1446   uint32 m_TotalSpamMessages;
1447     /* Number of spam messages which are in the database. */
1448 
1449   uint32 m_WordCount;
1450     /* The number of words currently in the database.  Stored separately as a
1451     member variable to avoid having to call m_WordMap.size() all the time,
1452     which other threads can't do while the database is being updated (but they
1453     can look at the word count variable). */
1454 
1455   StatisticsMap m_WordMap;
1456     /* The in-memory data structure holding the set of words and their
1457     associated statistics.  When the database isn't in use, it is an empty
1458     collection.  You should lock the BApplication if you are using the word
1459     collection (reading or writing) from another thread. */
1460 };
1461 
1462 
1463 
1464 /******************************************************************************
1465  * Global utility function to display an error message and return.  The message
1466  * part describes the error, and if ErrorNumber is non-zero, gets the string
1467  * ", error code $X (standard description)." appended to it.  If the message
1468  * is NULL then it gets defaulted to "Something went wrong".  The title part
1469  * doesn't get displayed (no title bar in the dialog box, but you can see it in
1470  * the debugger as the window thread name), and defaults to "Error Message" if
1471  * you didn't specify one.  If running in command line mode, the error gets
1472  * printed to stderr rather than showing up in a dialog box.
1473  */
1474 
1475 static void
1476 DisplayErrorMessage (
1477   const char *MessageString = NULL,
1478   int ErrorNumber = 0,
1479   const char *TitleString = NULL)
1480 {
1481   BAlert *AlertPntr;
1482   char ErrorBuffer [PATH_MAX + 1500];
1483 
1484   if (TitleString == NULL)
1485     TitleString = "SpamDBM Error Message";
1486 
1487   if (MessageString == NULL)
1488   {
1489     if (ErrorNumber == 0)
1490       MessageString = "No error, no message, why bother?";
1491     else
1492       MessageString = "Something went wrong";
1493   }
1494 
1495   if (ErrorNumber != 0)
1496   {
1497     sprintf (ErrorBuffer, "%s, error code $%X/%d (%s) has occured.",
1498       MessageString, ErrorNumber, ErrorNumber, strerror (ErrorNumber));
1499     MessageString = ErrorBuffer;
1500   }
1501 
1502   if (g_CommandLineMode || g_ServerMode)
1503     cerr << TitleString << ": " << MessageString << endl;
1504   else
1505   {
1506     AlertPntr = new BAlert (TitleString, MessageString,
1507       "Acknowledge", NULL, NULL, B_WIDTH_AS_USUAL, B_STOP_ALERT);
1508     if (AlertPntr != NULL) {
1509       AlertPntr->SetFlags(AlertPntr->Flags() | B_CLOSE_ON_ESCAPE);
1510       AlertPntr->Go ();
1511     }
1512   }
1513 }
1514 
1515 
1516 
/******************************************************************************
 * Word wrap a long line of text into shorter lines of at most 79 columns and
 * print the result on the given output stream, one endl terminated line at a
 * time.  White space at the start of each output line is skipped.  Lines are
 * broken at the last white space that fits; a run of 79 or more characters
 * with no white space in it is simply chopped into 79 character pieces.  A
 * piece which is exactly 79 columns wide and ends on a word boundary (end of
 * the text or followed by white space) is printed whole rather than being
 * wrapped early.  Nothing is printed for empty or all white space text.
 */

static void
WrapTextToStream (ostream& OutputStream, const char *TextPntr)
{
  const int LineLength = 79;
  char     *StringPntr;
  char      TempString [LineLength+1];

  TempString[LineLength] = 0; /* Only needs to be done once. */

  while (*TextPntr != 0)
  {
    /* Skip leading spaces.  The cast to unsigned char is needed because
    passing a negative char value (high bit set) to isspace is undefined. */

    while (isspace ((unsigned char) *TextPntr))
      TextPntr++;
    if (*TextPntr == 0)
      break; /* It was all spaces, don't print any more. */

    strncpy (TempString, TextPntr, LineLength);

    /* Advance StringPntr to the end of the temp string, partly to see how long
    it is (rather than doing strlen). */

    StringPntr = TempString;
    while (*StringPntr != 0)
      StringPntr++;

    if (StringPntr - TempString < LineLength)
    {
      /* This line fits completely, it's the last of the text. */
      OutputStream << TempString << endl;
      TextPntr += StringPntr - TempString;
      continue;
    }

    /* The temp string is full, so TextPntr[LineLength] is still inside the
    text (at worst it is the NUL at the end).  If that character is the end of
    the text or white space then the chunk already ends on a word boundary and
    all 79 columns can be used.  Otherwise the chunk ends in the middle of a
    word, so back up StringPntr to the last space in the temp string. */

    if (TextPntr[LineLength] != 0 &&
    !isspace ((unsigned char) TextPntr[LineLength]))
    {
      while (StringPntr > TempString)
      {
        if (isspace ((unsigned char) *StringPntr))
          break; /* Found the trailing space. */
        else /* Go backwards, looking for the trailing space. */
          StringPntr--;
      }
    }

    /* Remove more trailing spaces at the end of the line, in case there were
    several spaces in a row. */

    while (StringPntr > TempString &&
    isspace ((unsigned char) StringPntr[-1]))
      StringPntr--;

    /* Print the line of text and advance the text pointer too. */

    if (StringPntr == TempString)
    {
      /* This line has no spaces, don't wrap it, just split off a chunk. */
      OutputStream << TempString << endl;
      TextPntr += strlen (TempString);
      continue;
    }

    *StringPntr = 0; /* Cut off after the last word that fits. */
    OutputStream << TempString << endl;
    TextPntr += StringPntr - TempString;
  }
}
1586 
1587 
1588 
1589 /******************************************************************************
1590  * Print the usage info to the stream.  Includes a list of all commands.
1591  */
1592 ostream& PrintUsage (ostream& OutputStream);
1593 
1594 ostream& PrintUsage (ostream& OutputStream)
1595 {
1596   struct property_info *PropInfoPntr;
1597 
1598   OutputStream << "\nSpamDBM - A Spam Database Manager\n";
1599   OutputStream << "Copyright © 2002 by Alexander G. M. Smith.  ";
1600   OutputStream << "Released to the public domain.\n\n";
1601   WrapTextToStream (OutputStream, "Compiled on " __DATE__ " at " __TIME__
1602 ".  $Id: spamdbm.cpp 30630 2009-05-05 01:31:01Z bga $  $HeadURL: http://svn.haiku-os.org/haiku/haiku/trunk/src/bin/mail_utils/spamdbm.cpp $");
1603   OutputStream << "\n"
1604 "This is a program for classifying e-mail messages as spam (junk mail which\n"
1605 "you don't want to read) and regular genuine messages.  It can learn what's\n"
1606 "spam and what's genuine.  You just give it a bunch of spam messages and a\n"
1607 "bunch of non-spam ones.  It uses them to make a list of the words from the\n"
1608 "messages with the probability that each word is from a spam message or from\n"
1609 "a genuine message.  Later on, it can use those probabilities to classify\n"
1610 "new messages as spam or not spam.  If the classifier stops working well\n"
1611 "(because the spammers have changed their writing style and vocabulary, or\n"
1612 "your regular correspondants are writing like spammers), you can use this\n"
1613 "program to update the list of words to identify the new messages\n"
1614 "correctly.\n"
1615 "\n"
1616 "The original idea was from Paul Graham's algorithm, which has an excellent\n"
1617 "writeup at: http://www.paulgraham.com/spam.html\n"
1618 "\n"
1619 "Gary Robinson came up with the improved algorithm, which you can read about at:\n"
1620 "http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html\n"
1621 "\n"
1622 "Then he, Tim Peters and the SpamBayes mailing list developed the Chi-Squared\n"
1623 "test, see http://mail.python.org/pipermail/spambayes/2002-October/001036.html\n"
1624 "for one of the earlier messages leading from the central limit theorem to\n"
1625 "the current chi-squared scoring method.\n"
1626 "\n"
1627 "Thanks go to Isaac Yonemoto for providing a better icon, which we can\n"
1628 "unfortunately no longer use, since the Hormel company wants people to\n"
1629 "avoid associating their meat product with junk e-mail.\n"
1630 "\n"
1631 "Tokenising code updated in 2005 to use some of the tricks that SpamBayes\n"
1632 "uses to extract words from messages.  In particular, HTML is now handled.\n"
1633 "\n"
1634 "Usage: Specify the operation as the first argument followed by more\n"
1635 "information as appropriate.  The program's configuration will affect the\n"
1636 "actual operation (things like the name of the database file to use, or\n"
1637 "whether it should allow non-email messages to be added).  In command line\n"
1638 "mode it will do the operation and exit.  In GUI/server mode a command line\n"
1639 "invocation will just send the command to the running server.  You can also\n"
1640 "use BeOS scripting (see the \"Hey\" command which you can get from\n"
1641 "http://www.bebits.com/app/2042 ) to control the Spam server.  And finally,\n"
1642 "there's also a GUI interface which shows up if you start it without any\n"
1643 "command line arguments.\n"
1644 "\n"
1645 "Commands:\n"
1646 "\n"
1647 "Quit\n"
1648 "Stop the program.  Useful if it's running as a server.\n"
1649 "\n";
1650 
1651   /* Go through all our scripting commands and add a description of each one to
1652   the usage text. */
1653 
1654   for (PropInfoPntr = g_ScriptingPropertyList + 0;
1655   PropInfoPntr->name != 0;
1656   PropInfoPntr++)
1657   {
1658     switch (PropInfoPntr->commands[0])
1659     {
1660       case B_GET_PROPERTY:
1661         OutputStream << "Get " << PropInfoPntr->name << endl;
1662         break;
1663 
1664       case B_SET_PROPERTY:
1665         OutputStream << "Set " << PropInfoPntr->name << " NewValue" << endl;
1666         break;
1667 
1668       case B_COUNT_PROPERTIES:
1669         OutputStream << "Count " << PropInfoPntr->name << endl;
1670         break;
1671 
1672       case B_CREATE_PROPERTY:
1673         OutputStream << "Create " << PropInfoPntr->name << endl;
1674         break;
1675 
1676       case B_DELETE_PROPERTY:
1677         OutputStream << "Delete " << PropInfoPntr->name << endl;
1678         break;
1679 
1680       case B_EXECUTE_PROPERTY:
1681         OutputStream << PropInfoPntr->name << endl;
1682         break;
1683 
1684       default:
1685         OutputStream << "Buggy Command: " << PropInfoPntr->name << endl;
1686         break;
1687     }
1688     WrapTextToStream (OutputStream, (char *)PropInfoPntr->usage);
1689     OutputStream << endl;
1690   }
1691 
1692   return OutputStream;
1693 }
1694 
1695 
1696 
1697 /******************************************************************************
1698  * A utility function to send a command to the application, will return after a
1699  * short delay if the application is busy (doesn't wait for it to be executed).
1700  * The reply from the application is also thrown away.  It used to be an
1701  * overloaded function, but the system couldn't distinguish between bool and
1702  * int, so now it has slightly different names depending on the arguments.
1703  */
1704 
1705 static void
1706 SubmitCommand (BMessage& CommandMessage)
1707 {
1708   status_t ErrorCode;
1709 
1710   ErrorCode = be_app_messenger.SendMessage (&CommandMessage,
1711     be_app_messenger /* reply messenger, throw away the reply */,
1712     1000000 /* delivery timeout */);
1713 
1714   if (ErrorCode != B_OK)
1715     cerr << "SubmitCommand failed to send a command, code " <<
1716     ErrorCode << " (" << strerror (ErrorCode) << ")." << endl;
1717 }
1718 
1719 
1720 static void
1721 SubmitCommandString (
1722   PropertyNumbers Property,
1723   uint32 CommandCode,
1724   const char *StringArgument = NULL)
1725 {
1726   BMessage CommandMessage (CommandCode);
1727 
1728   if (Property < 0 || Property >= PN_MAX)
1729   {
1730     DisplayErrorMessage ("SubmitCommandString bug.");
1731     return;
1732   }
1733   CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1734   if (StringArgument != NULL)
1735     CommandMessage.AddString (g_DataName, StringArgument);
1736   SubmitCommand (CommandMessage);
1737 }
1738 
1739 
1740 static void
1741 SubmitCommandInt32 (
1742   PropertyNumbers Property,
1743   uint32 CommandCode,
1744   int32 Int32Argument)
1745 {
1746   BMessage CommandMessage (CommandCode);
1747 
1748   if (Property < 0 || Property >= PN_MAX)
1749   {
1750     DisplayErrorMessage ("SubmitCommandInt32 bug.");
1751     return;
1752   }
1753   CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1754   CommandMessage.AddInt32 (g_DataName, Int32Argument);
1755   SubmitCommand (CommandMessage);
1756 }
1757 
1758 
1759 static void
1760 SubmitCommandBool (
1761   PropertyNumbers Property,
1762   uint32 CommandCode,
1763   bool BoolArgument)
1764 {
1765   BMessage CommandMessage (CommandCode);
1766 
1767   if (Property < 0 || Property >= PN_MAX)
1768   {
1769     DisplayErrorMessage ("SubmitCommandBool bug.");
1770     return;
1771   }
1772   CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1773   CommandMessage.AddBool (g_DataName, BoolArgument);
1774   SubmitCommand (CommandMessage);
1775 }
1776 
1777 
1778 
1779 /******************************************************************************
1780  * A utility function which will estimate the spaminess of file(s), not
1781  * callable from the application thread since it sends a scripting command to
1782  * the application and waits for results.  For each file there will be an entry
1783  * reference in the message.  For each of those, run it through the spam
1784  * estimator and display a box with the results.  This function is used both by
1785  * the file requestor and by dragging and dropping into the middle of the words
1786  * view.
1787  */
1788 
1789 static void
1790 EstimateRefFilesAndDisplay (BMessage *MessagePntr)
1791 {
1792   BAlert     *AlertPntr;
1793   BEntry      Entry;
1794   entry_ref   EntryRef;
1795   status_t    ErrorCode;
1796   int         i, j;
1797   BPath       Path;
1798   BMessage    ReplyMessage;
1799   BMessage    ScriptingMessage;
1800   const char *StringPntr;
1801   float       TempFloat;
1802   int32       TempInt32;
1803   char        TempString [PATH_MAX + 1024 +
1804                 g_MaxInterestingWords * (g_MaxWordLength + 16)];
1805 
1806   for (i = 0; MessagePntr->FindRef ("refs", i, &EntryRef) == B_OK; i++)
1807   {
1808     /* See if the entry is a valid file or directory or other thing. */
1809 
1810     ErrorCode = Entry.SetTo (&EntryRef, true /* traverse symbolic links */);
1811     if (ErrorCode != B_OK || !Entry.Exists () || Entry.GetPath (&Path) != B_OK)
1812       continue;
1813 
1814     /* Evaluate the spaminess of the file. */
1815 
1816     ScriptingMessage.MakeEmpty ();
1817     ScriptingMessage.what = B_SET_PROPERTY;
1818     ScriptingMessage.AddSpecifier (g_PropertyNames[PN_EVALUATE]);
1819     ScriptingMessage.AddString (g_DataName, Path.Path ());
1820 
1821     if (be_app_messenger.SendMessage (&ScriptingMessage,&ReplyMessage) != B_OK)
1822       break; /* App has died or something is wrong. */
1823 
1824     if (ReplyMessage.FindInt32 ("error", &TempInt32) != B_OK ||
1825     TempInt32 != B_OK)
1826       break; /* Error messages will be displayed elsewhere. */
1827 
1828     ReplyMessage.FindFloat (g_ResultName, &TempFloat);
1829     sprintf (TempString, "%f spam ratio for \"%s\".\nThe top words are:",
1830       (double) TempFloat, Path.Path ());
1831 
1832     for (j = 0; j < 20 /* Don't print too many! */; j++)
1833     {
1834       if (ReplyMessage.FindString ("words", j, &StringPntr) != B_OK ||
1835       ReplyMessage.FindFloat ("ratios", j, &TempFloat) != B_OK)
1836         break;
1837 
1838       sprintf (TempString + strlen (TempString), "\n%s / %f",
1839         StringPntr, TempFloat);
1840     }
1841     if (j >= 20 && j < g_MaxInterestingWords)
1842       sprintf (TempString + strlen (TempString), "\nAnd up to %d more words.",
1843         g_MaxInterestingWords - j);
1844 
1845     AlertPntr = new BAlert ("Estimate", TempString, "OK");
1846     if (AlertPntr != NULL) {
1847       AlertPntr->SetFlags(AlertPntr->Flags() | B_CLOSE_ON_ESCAPE);
1848       AlertPntr->Go ();
1849     }
1850   }
1851 }
1852 
1853 
1854 
/******************************************************************************
 * A utility function from the http://sourceforge.net/projects/spambayes
 * SpamBayes project.  Returns prob(chisq >= x2, with v degrees of freedom),
 * the probability that a chi-squared value (a kind of normalized error
 * measurement) with v degrees of freedom would be larger than the given number
 * x2.  A result near zero means x2 is unusually large (chi-squared rarely gets
 * that big by chance); a larger result means values like x2 turn up fairly
 * often from random effects alone.  The calculation is the series
 * exp(-m) * (1 + m + m^2/2! + ... + m^(v/2-1)/(v/2-1)!) where m is x2/2, which
 * only works for even v.  Returns -1.0 (an out of range value) as a hint when
 * v is odd, and never returns more than 1.0.
 */

static double ChiSquaredProbability (double x2, int v)
{
  double HalfX2;
  int    k;
  int    NumberOfTerms;
  double Term;
  double Total;

  if ((v & 1) != 0)
    return -1.0; /* Odd degrees of freedom aren't supported. */

  HalfX2 = x2 / 2.0;
  NumberOfTerms = v / 2;

  /* The first term of the series.  If x2 is very large, it will underflow to
  zero and so will the whole sum. */

  Term = exp (-HalfX2);
  Total = Term;

  /* Each later term is the previous one times m/k. */

  k = 1;
  while (k < NumberOfTerms)
  {
    Term *= HalfX2 / k;
    Total += Term;
    k++;
  }

  /* With small x2 and large v, accumulated roundoff error, plus error in the
  platform exp(), can push the total a few ULP above 1.0, which is no good for
  a probability.  So clamp it. */

  return (Total > 1.0) ? 1.0 : Total;
}
1898 
1899 
1900 
1901 /******************************************************************************
1902  * A utility function to remove the "[Spam 99.9%] " from in front of the
1903  * MAIL:subject attribute of a file.
1904  */
1905 
1906 static status_t RemoveSpamPrefixFromSubjectAttribute (BNode *BNodePntr)
1907 {
1908   status_t    ErrorCode;
1909   const char *MailSubjectName = "MAIL:subject";
1910   char       *StringPntr;
1911   char        SubjectString [2000];
1912 
1913   ErrorCode = BNodePntr->ReadAttr (MailSubjectName,
1914     B_STRING_TYPE, 0 /* offset */, SubjectString,
1915     sizeof (SubjectString) - 1);
1916   if (ErrorCode <= 0)
1917     return 0; /* The attribute isn't there so we don't care. */
1918   if (ErrorCode >= (int) sizeof (SubjectString) - 1)
1919     return 0; /* Can't handle subjects which are too long. */
1920 
1921   SubjectString [ErrorCode] = 0;
1922   ErrorCode = 0; /* So do-nothing exit returns zero. */
1923   if (strncmp (SubjectString, "[Spam ", 6) == 0)
1924   {
1925     for (StringPntr = SubjectString;
1926     *StringPntr != 0 && *StringPntr != ']'; StringPntr++)
1927       ; /* No body in this for loop. */
1928     if (StringPntr[0] == ']' && StringPntr[1] == ' ')
1929     {
1930       ErrorCode = BNodePntr->RemoveAttr (MailSubjectName);
1931       ErrorCode = BNodePntr->WriteAttr (MailSubjectName,
1932         B_STRING_TYPE, 0 /* offset */,
1933         StringPntr + 2, strlen (StringPntr + 2) + 1);
1934       if (ErrorCode > 0)
1935         ErrorCode = 0;
1936     }
1937   }
1938 
1939   return ErrorCode;
1940 }
1941 
1942 
1943 
1944 /******************************************************************************
1945  * The tokenizing functions.  To make tokenization of the text easier to
1946  * understand, it is broken up into several passes.  Each pass goes over the
1947  * text (can include NUL bytes) and extracts all the words it can recognise
1948  * (can be none).  The extracted words are added to the WordSet, with the
1949  * PrefixCharacter prepended (zero if none) so we can distinguish between words
1950  * found in headers and in the text body.  It also modifies the input text
1951  * buffer in-place to change the text that the next pass will see (blanking out
1952  * words that it wants to delete, but not inserting much new text since the
1953  * buffer can't be enlarged).  They all return the number of bytes remaining in
1954  * InputString after it has been modified to be input for the next pass.
1955  * Returns zero if it has exhausted the possibility of getting more words, or
1956  * if something goes wrong.
1957  */
1958 
/* Convert the ASCII capital letters A to Z in the first NumberOfBytes bytes of
the buffer to lower case, in place.  Every other byte (including NUL bytes and
bytes with the high bit set) is left as it is.  Returns NumberOfBytes since the
amount of text doesn't change. */

static size_t TokenizerPassLowerCase (
  char *BufferPntr,
  size_t NumberOfBytes)
{
  size_t Index;
  char   Letter;

  for (Index = 0; Index < NumberOfBytes; Index++)
  {
    /* Do our own lower case conversion; tolower () has problems with UTF-8
    characters that have the high bit set. */

    Letter = BufferPntr [Index];
    if (Letter >= 'A' && Letter <= 'Z')
      BufferPntr [Index] = Letter + ('a' - 'A');
  }
  return NumberOfBytes;
}
1978 
1979 
1980 /* A utility function for some commonly repeated code.  If this was Modula-2,
1981 we could use a nested procedure.  But it's not.  Adds the given word to the set
1982 of words, checking for maximum word length and prepending the prefix to the
1983 word, which gets modified by this function to reflect the word actually added
1984 to the set. */
1985 
1986 static void
1987 AddWordAndPrefixToSet (
1988   string &Word,
1989   const char *PrefixString,
1990   set<string> &WordSet)
1991 {
1992   if (Word.empty ())
1993     return;
1994 
1995   if (Word.size () > g_MaxWordLength)
1996     Word.resize (g_MaxWordLength);
1997   Word.insert (0, PrefixString);
1998   WordSet.insert (Word);
1999 }
2000 
2001 
2002 /* Hunt through the text for various URLs and extract the components as
2003 separate words.  Doesn't affect the text in the buffer.  Looks for
2004 protocol://user:password@computer:port/path?query=key#anchor strings.  Also
2005 www.blah strings are detected and broken down.  Doesn't do HREF="" strings
2006 where the string has a relative path (no host computer name).  Assumes the
2007 input buffer is already in lower case. */
2008 
2009 static size_t TokenizerPassExtractURLs (
2010   char *BufferPntr,
2011   size_t NumberOfBytes,
2012   char PrefixCharacter,
2013   set<string> &WordSet)
2014 {
2015   char   *AtSignStringPntr;
2016   char   *HostStringPntr;
2017   char   *InputStringEndPntr;
2018   char   *InputStringPntr;
2019   char   *OptionsStringPntr;
2020   char   *PathStringPntr;
2021   char    PrefixString [2];
2022   char   *ProtocolStringPntr;
2023   string  Word;
2024 
2025   InputStringPntr = BufferPntr;
2026   InputStringEndPntr = BufferPntr + NumberOfBytes;
2027   PrefixString [0] = PrefixCharacter;
2028   PrefixString [1] = 0;
2029 
2030   while (InputStringPntr < InputStringEndPntr - 4)
2031   {
2032     HostStringPntr = NULL;
2033     if (memcmp (InputStringPntr, "www.", 4) == 0)
2034       HostStringPntr = InputStringPntr;
2035     else if (memcmp (InputStringPntr, "://", 3) == 0)
2036     {
2037       /* Find the protocol name, and add it as a word such as "ftp:" "http:" */
2038       ProtocolStringPntr = InputStringPntr;
2039       while (ProtocolStringPntr > BufferPntr &&
2040       isalpha (ProtocolStringPntr[-1]))
2041         ProtocolStringPntr--;
2042       Word.assign (ProtocolStringPntr,
2043         (InputStringPntr - ProtocolStringPntr) + 1 /* for the colon */);
2044       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2045       HostStringPntr = InputStringPntr + 3; /* Skip past the "://" */
2046     }
2047     if (HostStringPntr == NULL)
2048     {
2049       InputStringPntr++;
2050       continue;
2051     }
2052 
2053     /* Got a host name string starting at HostStringPntr.  It's everything
2054     until the next slash or space, like "user:password@computer:port". */
2055 
2056     InputStringPntr = HostStringPntr;
2057     AtSignStringPntr = NULL;
2058     while (InputStringPntr < InputStringEndPntr &&
2059     (*InputStringPntr != '/' && !isspace (*InputStringPntr)))
2060     {
2061       if (*InputStringPntr == '@')
2062         AtSignStringPntr = InputStringPntr;
2063       InputStringPntr++;
2064     }
2065     if (AtSignStringPntr != NULL)
2066     {
2067       /* Add a word with the user and password, unseparated. */
2068       Word.assign (HostStringPntr,
2069         AtSignStringPntr - HostStringPntr + 1 /* for the @ sign */);
2070       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2071       HostStringPntr = AtSignStringPntr + 1;
2072     }
2073 
2074     /* Add a word with the computer and port, unseparated. */
2075 
2076     Word.assign (HostStringPntr, InputStringPntr - HostStringPntr);
2077     AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2078 
2079     /* Now get the path name, not including the extra junk after ?  and #
2080     separators (they're stored as separate options).  Stops at white space or a
2081     double quote mark. */
2082 
2083     PathStringPntr = InputStringPntr;
2084     OptionsStringPntr = NULL;
2085     while (InputStringPntr < InputStringEndPntr &&
2086     (*InputStringPntr != '"' && !isspace (*InputStringPntr)))
2087     {
2088       if (OptionsStringPntr == NULL &&
2089       (*InputStringPntr == '?' || *InputStringPntr == '#'))
2090         OptionsStringPntr = InputStringPntr;
2091       InputStringPntr++;
2092     }
2093 
2094     if (OptionsStringPntr == NULL)
2095     {
2096       /* No options, all path. */
2097       Word.assign (PathStringPntr, InputStringPntr - PathStringPntr);
2098       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2099     }
2100     else
2101     {
2102       /* Insert the path before the options. */
2103       Word.assign (PathStringPntr, OptionsStringPntr - PathStringPntr);
2104       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2105 
2106       /* Insert all the options as a word. */
2107       Word.assign (OptionsStringPntr, InputStringPntr - OptionsStringPntr);
2108       AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2109     }
2110   }
2111   return NumberOfBytes;
2112 }
2113 
2114 
2115 /* Replace long Asian words (likely to actually be sentences) with the first
2116 character in the word. */
2117 
2118 static size_t TokenizerPassTruncateLongAsianWords (
2119   char *BufferPntr,
2120   size_t NumberOfBytes)
2121 {
2122   char *EndOfStringPntr;
2123   char *InputStringPntr;
2124   int   Letter;
2125   char *OutputStringPntr;
2126   char *StartOfInputLongUnicodeWord;
2127   char *StartOfOutputLongUnicodeWord;
2128 
2129   InputStringPntr = BufferPntr;
2130   EndOfStringPntr = InputStringPntr + NumberOfBytes;
2131   OutputStringPntr = InputStringPntr;
2132   StartOfInputLongUnicodeWord = NULL; /* Non-NULL flags it as started. */
2133   StartOfOutputLongUnicodeWord = NULL;
2134 
2135   /* Copy the text from the input to the output (same buffer), but when we find
2136   a sequence of UTF-8 characters that is too long then truncate it down to one
2137   character and reset the output pointer to be after that character, thus
2138   deleting the word.  Replacing the deleted characters after it with spaces
2139   won't work since we need to preserve the lack of space to handle those sneaky
2140   HTML artificial word breakers.  So that Thelongword<blah>ing becomes
2141   "T<blah>ing" rather than "T <blah>ing", so the next step joins them up into
2142   "Ting" rather than "T" and "ing".  The first code in a UTF-8 character is
2143   11xxxxxx and subsequent ones are 10xxxxxx. */
2144 
2145   while (InputStringPntr < EndOfStringPntr)
2146   {
2147     Letter = (unsigned char) *InputStringPntr;
2148     if (Letter < 128) // Got a regular ASCII letter?
2149     {
2150       if (StartOfInputLongUnicodeWord != NULL)
2151       {
2152         if (InputStringPntr - StartOfInputLongUnicodeWord >
2153         (int) g_MaxWordLength * 2)
2154         {
2155           /* Need to truncate the long word (100 bytes or about 50 characters)
2156           back down to the first UTF-8 character, so find out where the first
2157           character ends (skip past the 10xxxxxx bytes), and rewind the output
2158           pointer to be just after that (ignoring the rest of the long word in
2159           effect). */
2160 
2161           OutputStringPntr = StartOfOutputLongUnicodeWord + 1;
2162           while (OutputStringPntr < InputStringPntr)
2163           {
2164             Letter = (unsigned char) *OutputStringPntr;
2165             if (Letter < 128 || Letter >= 192)
2166               break;
2167             ++OutputStringPntr; // Still a UTF-8 middle of the character code.
2168           }
2169         }
2170         StartOfInputLongUnicodeWord = NULL;
2171       }
2172     }
2173     else if (Letter >= 192 && StartOfInputLongUnicodeWord == NULL)
2174     {
2175       /* Got the start of a UTF-8 character.  Remember the spot so we can see
2176       if this is a too long UTF-8 word, which is often a whole sentence in
2177       asian languages, since they sort of use a single character per word. */
2178 
2179       StartOfInputLongUnicodeWord = InputStringPntr;
2180       StartOfOutputLongUnicodeWord = OutputStringPntr;
2181     }
2182     *OutputStringPntr++ = *InputStringPntr++;
2183   }
2184   return OutputStringPntr - BufferPntr;
2185 }
2186 
2187 
2188 /* Find all the words in the string and add them to our local set of words.
2189 The characters considered white space are defined by g_SpaceCharacters.  This
2190 function is also used as a subroutine by other tokenizer functions when they
2191 have a bunch of presumably plain text they want broken into words and added. */
2192 
2193 static size_t TokenizerPassGetPlainWords (
2194   char *BufferPntr,
2195   size_t NumberOfBytes,
2196   char PrefixCharacter,
2197   set<string> &WordSet)
2198 {
2199   string  AccumulatedWord;
2200   char   *EndOfStringPntr;
2201   size_t  Length;
2202   int     Letter;
2203 
2204   if (NumberOfBytes <= 0)
2205     return 0; /* Nothing to process. */
2206 
2207   if (PrefixCharacter != 0)
2208     AccumulatedWord = PrefixCharacter;
2209   EndOfStringPntr = BufferPntr + NumberOfBytes;
2210   while (true)
2211   {
2212     if (BufferPntr >= EndOfStringPntr)
2213       Letter = EOF; // Usually a negative number.
2214     else
2215       Letter = (unsigned char) *BufferPntr++;
2216 
2217     /* See if it is a letter we treat as white space.  Some word separators
2218     like dashes and periods aren't considered as space.  Note that codes above
2219     127 are UTF-8 characters, which we consider non-space. */
2220 
2221     if (Letter < 0 /* EOF is -1 */ ||
2222     (Letter < 128 && g_SpaceCharacters[Letter]))
2223     {
2224       /* That space finished off a word.  Remove trailing periods... */
2225 
2226       while ((Length = AccumulatedWord.size()) > 0 &&
2227       AccumulatedWord [Length-1] == '.')
2228         AccumulatedWord.resize (Length - 1);
2229 
2230       /* If there's anything left in the word, add it to the set.  Also ignore
2231       words which are too big (it's probably some binary encoded data).  But
2232       leave room for supercalifragilisticexpialidoceous.  According to one web
2233       site, pneumonoultramicroscopicsilicovolcanoconiosis is the longest word
2234       currently in English.  Note that some uuencoded data was seen with a 60
2235       character line length. */
2236 
2237       if (PrefixCharacter != 0)
2238         Length--; // Don't count prefix when judging size or emptiness.
2239       if (Length > 0 && Length <= g_MaxWordLength)
2240         WordSet.insert (AccumulatedWord);
2241 
2242       /* Empty out the string to get ready for the next word.  Not quite empty,
2243       start it off with the prefix character if any. */
2244 
2245       if (PrefixCharacter != 0)
2246         AccumulatedWord = PrefixCharacter;
2247       else
2248         AccumulatedWord.resize (0);
2249     }
2250     else /* Not a space-like character, add it to the word. */
2251       AccumulatedWord.append (1 /* one copy of the char */, (char) Letter);
2252 
2253     if (Letter < 0)
2254       break; /* End of data.  Exit here so that last word got processed. */
2255   }
2256   return NumberOfBytes;
2257 }
2258 
2259 
2260 /* Delete Things from the text.  The Thing is marked by a start string and an
2261 end string, such as "<!--" and "--> for HTML comment things.  All the text
2262 between the markers will be added to the word list before it gets deleted from
2263 the buffer.  The markers must be prepared in lower case and the buffer is
2264 assumed to have already been converted to lower case.  You can specify an empty
2265 string for the end marker if you're just matching a string constant like
2266 "&nbsp;", which you would put in the starting marker.  This is a utility
2267 function used by other tokenizer functions. */
2268 
2269 static size_t TokenizerUtilRemoveStartEndThing (
2270   char *BufferPntr,
2271   size_t NumberOfBytes,
2272   char PrefixCharacter,
2273   set<string> &WordSet,
2274   const char *ThingStartCode,
2275   const char *ThingEndCode,
2276   bool ReplaceWithSpace)
2277 {
2278   char *EndOfStringPntr;
2279   bool  FoundAndDeletedThing;
2280   char *InputStringPntr;
2281   char *OutputStringPntr;
2282   int   ThingEndLength;
2283   char *ThingEndPntr;
2284   int   ThingStartLength;
2285 
2286   InputStringPntr = BufferPntr;
2287   EndOfStringPntr = InputStringPntr + NumberOfBytes;
2288   OutputStringPntr = InputStringPntr;
2289   ThingStartLength = strlen (ThingStartCode);
2290   ThingEndLength = strlen (ThingEndCode);
2291 
2292   if (ThingStartLength <= 0)
2293     return NumberOfBytes; /* Need some things to look for first! */
2294 
2295   while (InputStringPntr < EndOfStringPntr)
2296   {
2297     /* Search for the starting marker. */
2298 
2299     FoundAndDeletedThing = false;
2300     if (EndOfStringPntr - InputStringPntr >=
2301     ThingStartLength + ThingEndLength /* space remains for start + end */ &&
2302     *InputStringPntr == *ThingStartCode &&
2303     memcmp (InputStringPntr, ThingStartCode, ThingStartLength) == 0)
2304     {
2305       /* Found the start marker.  Look for the terminating string.  If it is an
2306       empty string, then we've found it right now! */
2307 
2308       ThingEndPntr = InputStringPntr + ThingStartLength;
2309       while (EndOfStringPntr - ThingEndPntr >= ThingEndLength)
2310       {
2311         if (ThingEndLength == 0 ||
2312         (*ThingEndPntr == *ThingEndCode &&
2313         memcmp (ThingEndPntr, ThingEndCode, ThingEndLength) == 0))
2314         {
2315           /* Got the end of the Thing.  First dump the text inbetween the start
2316           and end markers into the words list. */
2317 
2318           TokenizerPassGetPlainWords (InputStringPntr + ThingStartLength,
2319             ThingEndPntr - (InputStringPntr + ThingStartLength),
2320             PrefixCharacter, WordSet);
2321 
2322           /* Delete by not updating the output pointer while moving the input
2323           pointer to just after the ending tag. */
2324 
2325           InputStringPntr = ThingEndPntr + ThingEndLength;
2326           if (ReplaceWithSpace)
2327             *OutputStringPntr++ = ' ';
2328           FoundAndDeletedThing = true;
2329           break;
2330         }
2331         ThingEndPntr++;
2332       } /* End while ThingEndPntr */
2333     }
2334     if (!FoundAndDeletedThing)
2335       *OutputStringPntr++ = *InputStringPntr++;
2336   } /* End while InputStringPntr */
2337 
2338   return OutputStringPntr - BufferPntr;
2339 }
2340 
2341 
2342 static size_t TokenizerPassRemoveHTMLComments (
2343   char *BufferPntr,
2344   size_t NumberOfBytes,
2345   char PrefixCharacter,
2346   set<string> &WordSet)
2347 {
2348   return TokenizerUtilRemoveStartEndThing (BufferPntr, NumberOfBytes,
2349     PrefixCharacter, WordSet, "<!--", "-->", false);
2350 }
2351 
2352 
2353 static size_t TokenizerPassRemoveHTMLStyle (
2354   char *BufferPntr,
2355   size_t NumberOfBytes,
2356   char PrefixCharacter,
2357   set<string> &WordSet)
2358 {
2359   return TokenizerUtilRemoveStartEndThing (BufferPntr, NumberOfBytes,
2360     PrefixCharacter, WordSet,
2361     "<style", "/style>", false /* replace with space if true */);
2362 }
2363 
2364 
2365 /* Convert Japanese periods (a round hollow dot symbol) to spaces so that the
2366 start of the next sentence is recognised at least as the start of a very long
2367 word.  The Japanese comma also does the same job. */
2368 
2369 static size_t TokenizerPassJapanesePeriodsToSpaces (
2370   char *BufferPntr,
2371   size_t NumberOfBytes,
2372   char PrefixCharacter,
2373   set<string> &WordSet)
2374 {
2375   size_t BytesRemaining = NumberOfBytes;
2376 
2377   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2378     BytesRemaining, PrefixCharacter, WordSet, "。" /* period */, "", true);
2379   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2380     BytesRemaining, PrefixCharacter, WordSet, "、" /* comma */, "", true);
2381   return BytesRemaining;
2382 }
2383 
2384 
2385 /* Delete HTML tags from the text.  The contents of the tag are added as words
2386 before being deleted.  <P>, <BR> and &nbsp; are replaced by spaces at this
2387 stage while other HTML things get replaced by nothing. */
2388 
2389 static size_t TokenizerPassRemoveHTMLTags (
2390   char *BufferPntr,
2391   size_t NumberOfBytes,
2392   char PrefixCharacter,
2393   set<string> &WordSet)
2394 {
2395   size_t BytesRemaining = NumberOfBytes;
2396 
2397   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2398     BytesRemaining, PrefixCharacter, WordSet, "&nbsp;", "", true);
2399   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2400     BytesRemaining, PrefixCharacter, WordSet, "<p", ">", true);
2401   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2402     BytesRemaining, PrefixCharacter, WordSet, "<br", ">", true);
2403   BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2404     BytesRemaining, PrefixCharacter, WordSet, "<", ">", false);
2405   return BytesRemaining;
2406 }
2407 
2408 
2409 
2410 /******************************************************************************
2411  * Implementation of the ABSApp class, constructor, destructor and the rest of
2412  * the member functions in mostly alphabetical order.
2413  */
2414 
2415 ABSApp::ABSApp ()
2416 : BApplication (g_ABSAppSignature),
2417   m_DatabaseHasChanged (false),
2418   m_SettingsHaveChanged (false)
2419 {
2420   status_t    ErrorCode;
2421   int         HalvingCount;
2422   int         i;
2423   const void *ResourceData;
2424   size_t      ResourceSize;
2425   BResources *ResourcesPntr;
2426 
2427   MakeDatabaseEmpty ();
2428 
2429   /* Set up the pathname which identifies our settings directory.  Note that
2430   the actual settings are loaded later on (or set to defaults) by the main()
2431   function, before this BApplication starts running.  So we don't bother
2432   initialising the other setting related variables here. */
2433 
2434   ErrorCode =
2435     find_directory (B_USER_SETTINGS_DIRECTORY, &m_SettingsDirectoryPath);
2436   if (ErrorCode == B_OK)
2437     ErrorCode = m_SettingsDirectoryPath.Append (g_SettingsDirectoryName);
2438   if (ErrorCode != B_OK)
2439     m_SettingsDirectoryPath.SetTo (".");
2440 
2441   /* Set up the table which identifies which characters are spaces and which
2442   are not.  Spaces are all control characters and all punctuation except for:
2443   apostrophe (so "it's" and possessive versions of words get stored), dash (for
2444   hyphenated words), dollar sign (for cash amounts), period (for IP addresses,
2445   we later remove trailing periods). */
2446 
2447   memset (g_SpaceCharacters, 1, sizeof (g_SpaceCharacters));
2448   g_SpaceCharacters['\''] = false;
2449   g_SpaceCharacters['-'] = false;
2450   g_SpaceCharacters['$'] = false;
2451   g_SpaceCharacters['.'] = false;
2452   for (i = '0'; i <= '9'; i++)
2453     g_SpaceCharacters[i] = false;
2454   for (i = 'A'; i <= 'Z'; i++)
2455     g_SpaceCharacters[i] = false;
2456   for (i = 'a'; i <= 'z'; i++)
2457     g_SpaceCharacters[i] = false;
2458 
2459   /* Initialise the busy cursor from data in the application's resources. */
2460 
2461   if ((ResourcesPntr = AppResources ()) != NULL && (ResourceData =
2462   ResourcesPntr->LoadResource ('CURS', "Busy Cursor", &ResourceSize)) != NULL
2463   && ResourceSize >= 68 /* Size of a raw 2x16x16x8+4 cursor is 68 bytes */)
2464     g_BusyCursor = new BCursor (ResourceData);
2465 
2466   /* Find out the smallest usable double by seeing how small we can make it. */
2467 
2468   m_SmallestUseableDouble = 1.0;
2469   HalvingCount = 0;
2470   while (HalvingCount < 10000 && m_SmallestUseableDouble > 0.0)
2471   {
2472     HalvingCount++;
2473     m_SmallestUseableDouble /= 2;
2474   }
2475 
2476   /* Recreate the number.  But don't make quite as small, we want to allow some
2477   precision bits and a bit of extra margin for intermediate results in future
2478   calculations. */
2479 
2480   HalvingCount -= 50 + sizeof (double) * 8;
2481 
2482   m_SmallestUseableDouble = 1.0;
2483   while (HalvingCount > 0)
2484   {
2485     HalvingCount--;
2486     m_SmallestUseableDouble /= 2;
2487   }
2488 }
2489 
2490 
2491 ABSApp::~ABSApp ()
2492 {
2493   status_t ErrorCode;
2494   char     ErrorMessage [PATH_MAX + 1024];
2495 
2496   if (m_SettingsHaveChanged)
2497     LoadSaveSettings (false /* DoLoad */);
2498   if ((ErrorCode = SaveDatabaseIfNeeded (ErrorMessage)) != B_OK)
2499     DisplayErrorMessage (ErrorMessage, ErrorCode, "Exiting Error");
2500   delete g_BusyCursor;
2501   g_BusyCursor = NULL;
2502 }
2503 
2504 
2505 /* Display a box showing information about this program. */
2506 
2507 void
2508 ABSApp::AboutRequested ()
2509 {
2510   BAlert *AboutAlertPntr;
2511 
2512   AboutAlertPntr = new BAlert ("About",
2513 "SpamDBM - Spam Database Manager\n\n"
2514 
2515 "This is a BeOS program for classifying e-mail messages as spam (unwanted \
2516 junk mail) or as genuine mail using a Bayesian statistical approach.  There \
2517 is also a Mail Daemon Replacement add-on to filter mail using the \
2518 classification statistics collected earlier.\n\n"
2519 
2520 "Written by Alexander G. M. Smith, fall 2002.\n\n"
2521 
2522 "The original idea was from Paul Graham's algorithm, which has an excellent \
2523 writeup at: http://www.paulgraham.com/spam.html\n\n"
2524 
2525 "Gary Robinson came up with the improved algorithm, which you can read about \
2526 at: http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html\n\n"
2527 
2528 "Mr. Robinson, Tim Peters and the SpamBayes mailing list people then \
2529 developed the even better chi-squared scoring method.\n\n"
2530 
2531 "Icon courtesy of Isaac Yonemoto, though it is no longer used since Hormel \
2532 doesn't want their meat product associated with junk e-mail.\n\n"
2533 
2534 "Tokenising code updated in 2005 to use some of the tricks that SpamBayes \
2535 uses to extract words from messages.  In particular, HTML is now handled.\n\n"
2536 
2537 "Released to the public domain, with no warranty.\n"
2538 "$Revision: 30630 $\n"
2539 "Compiled on " __DATE__ " at " __TIME__ ".", "Done");
2540   if (AboutAlertPntr != NULL)
2541   {
2542     AboutAlertPntr->SetFlags(AboutAlertPntr->Flags() | B_CLOSE_ON_ESCAPE);
2543     AboutAlertPntr->Go ();
2544   }
2545 }
2546 
2547 
2548 /* Add the text in the given file to the database as an example of a spam or
2549 genuine message, or removes it from the database if you claim it is
2550 CL_UNCERTAIN.  Also resets the spam ratio attribute to show the effect of the
2551 database change. */
2552 
2553 status_t ABSApp::AddFileToDatabase (
2554   ClassificationTypes IsSpamOrWhat,
2555   const char *FileName,
2556   char *ErrorMessage)
2557 {
2558   status_t ErrorCode;
2559   BFile    MessageFile;
2560   BMessage TempBMessage;
2561 
2562   ErrorCode = MessageFile.SetTo (FileName, B_READ_ONLY);
2563   if (ErrorCode != B_OK)
2564   {
2565     sprintf (ErrorMessage, "Unable to open file \"%s\" for reading", FileName);
2566     return ErrorCode;
2567   }
2568 
2569   ErrorCode = AddPositionIOToDatabase (IsSpamOrWhat,
2570     &MessageFile, FileName, ErrorMessage);
2571   MessageFile.Unset ();
2572   if (ErrorCode != B_OK)
2573     return ErrorCode;
2574 
2575   /* Re-evaluate the file so that the user sees the new ratio attribute. */
2576   return EvaluateFile (FileName, &TempBMessage, ErrorMessage);
2577 }
2578 
2579 
2580 /* Add the given text to the database.  The unique words found in MessageIOPntr
2581 will be added to the database (incrementing the count for the number of
2582 messages using each word, either the spam or genuine count depending on
2583 IsSpamOrWhat).  It will remove the message (decrement the word counts) if you
2584 specify CL_UNCERTAIN as the new classification.  And if it switches from spam
2585 to genuine or vice versa, it will do both - decrement the counts for the old
2586 class and increment the counts for the new one.  An attribute will be added to
2587 MessageIOPntr (if it is a file) to record that it has been marked as Spam or
2588 Genuine (so that it doesn't get added to the database a second time).  If it is
2589 being removed from the database, the classification attribute gets removed too.
2590 If things go wrong, a non-zero error code will be returned and an explanation
2591 written to ErrorMessage (assumed to be at least PATH_MAX + 1024 bytes long).
2592 OptionalFileName is just used in the error message to identify the file to the
2593 user. */
2594 
2595 status_t ABSApp::AddPositionIOToDatabase (
2596   ClassificationTypes IsSpamOrWhat,
2597   BPositionIO *MessageIOPntr,
2598   const char *OptionalFileName,
2599   char *ErrorMessage)
2600 {
2601   BNode                             *BNodePntr;
2602   char                               ClassificationString [NAME_MAX];
2603   StatisticsMap::iterator            DataIter;
2604   status_t                           ErrorCode = 0;
2605   pair<StatisticsMap::iterator,bool> InsertResult;
2606   uint32                             NewAge;
2607   StatisticsRecord                   NewStatistics;
2608   ClassificationTypes                PreviousClassification;
2609   StatisticsPointer                  StatisticsPntr;
2610   set<string>::iterator              WordEndIter;
2611   set<string>::iterator              WordIter;
2612   set<string>                        WordSet;
2613 
2614   NewAge = m_TotalGenuineMessages + m_TotalSpamMessages;
2615   if (NewAge >= 0xFFFFFFF0UL)
2616   {
2617     sprintf (ErrorMessage, "The database is full!  There are %lu messages in "
2618       "it and we can't add any more without overflowing the maximum integer "
2619       "representation in 32 bits", NewAge);
2620     return B_NO_MEMORY;
2621   }
2622 
2623   /* Check that this file hasn't already been added to the database. */
2624 
2625   PreviousClassification = CL_UNCERTAIN;
2626   BNodePntr = dynamic_cast<BNode *> (MessageIOPntr);
2627   if (BNodePntr != NULL) /* If this thing might have attributes. */
2628   {
2629     ErrorCode = BNodePntr->ReadAttr (g_AttributeNameClassification,
2630       B_STRING_TYPE, 0 /* offset */, ClassificationString,
2631       sizeof (ClassificationString) - 1);
2632     if (ErrorCode <= 0) /* Positive values for the number of bytes read */
2633       strcpy (ClassificationString, "none");
2634     else /* Just in case it needs a NUL at the end. */
2635       ClassificationString [ErrorCode] = 0;
2636 
2637     if (strcasecmp (ClassificationString, g_ClassifiedSpam) == 0)
2638       PreviousClassification = CL_SPAM;
2639     else if (strcasecmp (ClassificationString, g_ClassifiedGenuine) == 0)
2640       PreviousClassification = CL_GENUINE;
2641   }
2642 
2643   if (!m_IgnorePreviousClassification &&
2644   PreviousClassification != CL_UNCERTAIN)
2645   {
2646     if (IsSpamOrWhat == PreviousClassification)
2647     {
2648       sprintf (ErrorMessage, "Ignoring file \"%s\" since it seems to have "
2649         "already been classified as %s.", OptionalFileName,
2650         g_ClassificationTypeNames [IsSpamOrWhat]);
2651     }
2652     else
2653     {
2654       sprintf (ErrorMessage, "Changing existing classification of file \"%s\" "
2655         "from %s to %s.", OptionalFileName,
2656         g_ClassificationTypeNames [PreviousClassification],
2657         g_ClassificationTypeNames [IsSpamOrWhat]);
2658     }
2659     DisplayErrorMessage (ErrorMessage, 0, "Note");
2660   }
2661 
2662   if (!m_IgnorePreviousClassification &&
2663   IsSpamOrWhat == PreviousClassification)
2664     /* Nothing to do if it is already classified correctly and the user doesn't
2665     want double classification. */
2666     return B_OK;
2667 
2668   /* Get the list of unique words in the file. */
2669 
2670   ErrorCode = GetWordsFromPositionIO (MessageIOPntr, OptionalFileName,
2671     WordSet, ErrorMessage);
2672   if (ErrorCode != B_OK)
2673     return ErrorCode;
2674 
2675   /* Update the count of the number of messages processed, with corrections if
2676   reclassifying a message. */
2677 
2678   m_DatabaseHasChanged = true;
2679 
2680   if (!m_IgnorePreviousClassification &&
2681   PreviousClassification == CL_SPAM && m_TotalSpamMessages > 0)
2682     m_TotalSpamMessages--;
2683 
2684   if (IsSpamOrWhat == CL_SPAM)
2685     m_TotalSpamMessages++;
2686 
2687   if (!m_IgnorePreviousClassification &&
2688   PreviousClassification == CL_GENUINE && m_TotalGenuineMessages > 0)
2689       m_TotalGenuineMessages--;
2690 
2691   if (IsSpamOrWhat == CL_GENUINE)
2692     m_TotalGenuineMessages++;
2693 
2694   /* Mark the file's attributes with the new classification.  Don't care if it
2695   fails. */
2696 
2697   if (BNodePntr != NULL) /* If this thing might have attributes. */
2698   {
2699     ErrorCode = BNodePntr->RemoveAttr (g_AttributeNameClassification);
2700     if (IsSpamOrWhat != CL_UNCERTAIN)
2701     {
2702       strcpy (ClassificationString, g_ClassificationTypeNames [IsSpamOrWhat]);
2703       ErrorCode = BNodePntr->WriteAttr (g_AttributeNameClassification,
2704         B_STRING_TYPE, 0 /* offset */,
2705         ClassificationString, strlen (ClassificationString) + 1);
2706     }
2707   }
2708 
2709   /* Add the words to the database by incrementing or decrementing the counts
2710   for each word as appropriate. */
2711 
2712   WordEndIter = WordSet.end ();
2713   for (WordIter = WordSet.begin (); WordIter != WordEndIter; WordIter++)
2714   {
2715     if ((DataIter = m_WordMap.find (*WordIter)) == m_WordMap.end ())
2716     {
2717       /* No record in the database for the word. */
2718 
2719       if (IsSpamOrWhat == CL_UNCERTAIN)
2720         continue; /* Not adding words, don't have to subtract from nothing. */
2721 
2722       /* Create a new one record in the database for the new word. */
2723 
2724       memset (&NewStatistics, 0, sizeof (NewStatistics));
2725       InsertResult = m_WordMap.insert (
2726         StatisticsMap::value_type (*WordIter, NewStatistics));
2727       if (!InsertResult.second)
2728       {
2729         sprintf (ErrorMessage, "Failed to insert new database entry for "
2730           "word \"%s\", while processing file \"%s\"",
2731           WordIter->c_str (), OptionalFileName);
2732         return B_NO_MEMORY;
2733       }
2734       DataIter = InsertResult.first;
2735       m_WordCount++;
2736     }
2737 
2738     /* Got the database record for the word, update the statistics. */
2739 
2740     StatisticsPntr = &DataIter->second;
2741 
2742     StatisticsPntr->age = NewAge;
2743 
2744     /* Can't update m_OldestAge here, since it would take a lot of effort to
2745     find the next older age.  Since it's only used for display, we'll let it be
2746     slightly incorrect.  The next database load or purge will fix it. */
2747 
2748     if (IsSpamOrWhat == CL_SPAM)
2749       StatisticsPntr->spamCount++;
2750 
2751     if (IsSpamOrWhat == CL_GENUINE)
2752       StatisticsPntr->genuineCount++;
2753 
2754     if (!m_IgnorePreviousClassification &&
2755     PreviousClassification == CL_SPAM && StatisticsPntr->spamCount > 0)
2756       StatisticsPntr->spamCount--;
2757 
2758     if (!m_IgnorePreviousClassification &&
2759     PreviousClassification == CL_GENUINE && StatisticsPntr->genuineCount > 0)
2760       StatisticsPntr->genuineCount--;
2761   }
2762 
2763   return B_OK;
2764 }
2765 
2766 
2767 /* Add the text in the string to the database as an example of a spam or
2768 genuine message. */
2769 
2770 status_t ABSApp::AddStringToDatabase (
2771   ClassificationTypes IsSpamOrWhat,
2772   const char *String,
2773   char *ErrorMessage)
2774 {
2775   BMemoryIO MemoryIO (String, strlen (String));
2776 
2777   return AddPositionIOToDatabase (IsSpamOrWhat, &MemoryIO,
2778    "Memory Buffer" /* OptionalFileName */, ErrorMessage);
2779 }
2780 
2781 
2782 /* Given a bunch of text, find the words within it (doing special tricks to
2783 extract words from HTML), and add them to the set.  Allow NULs in the text.  If
2784 the PrefixCharacter isn't zero then it is prepended to all words found (so you
2785 can distinguish words as being from a header or from the body text).  See also
2786 TokenizeWhole which does something similar. */
2787 
2788 void
2789 ABSApp::AddWordsToSet (
2790   const char *InputString,
2791   size_t NumberOfBytes,
2792   char PrefixCharacter,
2793   set<string> &WordSet)
2794 {
2795   char   *BufferPntr;
2796   size_t  CurrentSize;
2797   int     PassNumber;
2798 
2799   /* Copy the input buffer.  The code will be modifying it in-place as HTML
2800   fragments and other junk are deleted. */
2801 
2802   BufferPntr = new char [NumberOfBytes];
2803   if (BufferPntr == NULL)
2804     return;
2805   memcpy (BufferPntr, InputString, NumberOfBytes);
2806 
2807   /* Do the tokenization.  Each pass does something to the text in the buffer,
2808   and may add words to the word set. */
2809 
2810   CurrentSize = NumberOfBytes;
2811   for (PassNumber = 1; PassNumber <= 8 && CurrentSize > 0 ; PassNumber++)
2812   {
2813     switch (PassNumber)
2814     {
2815       case 1: /* Lowercase first, rest of them assume lower case inputs. */
2816         CurrentSize = TokenizerPassLowerCase (BufferPntr, CurrentSize);
2817         break;
2818       case 2: CurrentSize = TokenizerPassJapanesePeriodsToSpaces (
2819         BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
2820       case 3: CurrentSize = TokenizerPassTruncateLongAsianWords (
2821         BufferPntr, CurrentSize); break;
2822       case 4: CurrentSize = TokenizerPassRemoveHTMLComments (
2823         BufferPntr, CurrentSize, 'Z', WordSet); break;
2824       case 5: CurrentSize = TokenizerPassRemoveHTMLStyle (
2825         BufferPntr, CurrentSize, 'Z', WordSet); break;
2826       case 6: CurrentSize = TokenizerPassExtractURLs (
2827         BufferPntr, CurrentSize, 'Z', WordSet); break;
2828       case 7: CurrentSize = TokenizerPassRemoveHTMLTags (
2829         BufferPntr, CurrentSize, 'Z', WordSet); break;
2830       case 8: CurrentSize = TokenizerPassGetPlainWords (
2831         BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
2832       default: break;
2833     }
2834   }
2835 
2836   delete [] BufferPntr;
2837 }
2838 
2839 
2840 /* The user has provided a command line.  This could actually be from a
2841 separate attempt to invoke the program (this application's resource/attributes
2842 have the launch flags set to "single launch", so the shell doesn't start the
2843 program but instead sends the arguments to the already running instance).  In
2844 either case, the command is sent to an intermediary thread where it is
2845 asynchronously converted into a scripting message(s) that are sent back to this
2846 BApplication.  The intermediary is needed since we can't recursively execute
2847 scripting messages while processing a message (this ArgsReceived one). */
2848 
2849 void
2850 ABSApp::ArgvReceived (int32 argc, char **argv)
2851 {
2852   if (g_CommanderLooperPntr != NULL)
2853     g_CommanderLooperPntr->CommandArguments (argc, argv);
2854 }
2855 
2856 
2857 /* Create a new empty database.  Note that we have to write out the new file
2858 immediately, otherwise other operations will see the empty database and then
2859 try to load the file, and complain that it doesn't exist.  Now they will see
2860 the empty database and redundantly load the empty file. */
2861 
2862 status_t ABSApp::CreateDatabaseFile (char *ErrorMessage)
2863 {
2864   MakeDatabaseEmpty ();
2865   m_DatabaseHasChanged = true;
2866   return SaveDatabaseIfNeeded (ErrorMessage); /* Make it now. */
2867 }
2868 
2869 
2870 /* Set the settings to the defaults.  Needed in case there isn't a settings
2871 file or it is obsolete. */
2872 
2873 void
2874 ABSApp::DefaultSettings ()
2875 {
2876   status_t ErrorCode;
2877   BPath    DatabasePath (m_SettingsDirectoryPath);
2878   char     TempString [PATH_MAX];
2879 
2880   /* The default database file is in the settings directory. */
2881 
2882   ErrorCode = DatabasePath.Append (g_DefaultDatabaseFileName);
2883   if (ErrorCode != B_OK)
2884     strcpy (TempString, g_DefaultDatabaseFileName); /* Unlikely to happen. */
2885   else
2886     strcpy (TempString, DatabasePath.Path ());
2887   m_DatabaseFileName.SetTo (TempString);
2888 
2889   // Users need to be allowed to undo their mistakes...
2890   m_IgnorePreviousClassification = true;
2891   g_ServerMode = true;
2892   m_PurgeAge = 2000;
2893   m_PurgePopularity = 2;
2894   m_ScoringMode = SM_CHISQUARED;
2895   m_TokenizeMode = TM_ANY_TEXT_HEADER;
2896 
2897   m_SettingsHaveChanged = true;
2898 }
2899 
2900 
2901 /* Deletes the database file, and the backup file, and clears the database but
2902 marks it as not changed so that it doesn't get written out when the program
2903 exits. */
2904 
2905 status_t ABSApp::DeleteDatabaseFile (char *ErrorMessage)
2906 {
2907   BEntry   FileEntry;
2908   status_t ErrorCode;
2909   int      i;
2910   char     TempString [PATH_MAX+20];
2911 
2912   /* Clear the in-memory database. */
2913 
2914   MakeDatabaseEmpty ();
2915   m_DatabaseHasChanged = false;
2916 
2917   /* Delete the backup files first.  Don't care if it fails. */
2918 
2919   for (i = 0; i < g_MaxBackups; i++)
2920   {
2921     strcpy (TempString, m_DatabaseFileName.String ());
2922     sprintf (TempString + strlen (TempString), g_BackupSuffix, i);
2923     ErrorCode = FileEntry.SetTo (TempString);
2924     if (ErrorCode == B_OK)
2925       FileEntry.Remove ();
2926   }
2927 
2928   /* Delete the main database file. */
2929 
2930   strcpy (TempString, m_DatabaseFileName.String ());
2931   ErrorCode = FileEntry.SetTo (TempString);
2932   if (ErrorCode != B_OK)
2933   {
2934     sprintf (ErrorMessage, "While deleting, failed to make BEntry for "
2935       "\"%s\" (does the directory exist?)", TempString);
2936     return ErrorCode;
2937   }
2938 
2939   ErrorCode = FileEntry.Remove ();
2940   if (ErrorCode != B_OK)
2941     sprintf (ErrorMessage, "While deleting, failed to remove file "
2942       "\"%s\"", TempString);
2943 
2944   return ErrorCode;
2945 }
2946 
2947 
2948 /* Evaluate the given file as being a spam message, and tag it with the
2949 resulting spam probability ratio.  If it also has an e-mail subject attribute,
2950 remove the [Spam 99.9%] prefix since the number usually changes. */
2951 
2952 status_t ABSApp::EvaluateFile (
2953   const char *PathName,
2954   BMessage *ReplyMessagePntr,
2955   char *ErrorMessage)
2956 {
2957   status_t ErrorCode;
2958   float    TempFloat;
2959   BFile    TextFile;
2960 
2961   /* Open the specified file. */
2962 
2963   ErrorCode = TextFile.SetTo (PathName, B_READ_ONLY);
2964   if (ErrorCode != B_OK)
2965   {
2966     sprintf (ErrorMessage, "Problems opening file \"%s\" for evaluating",
2967       PathName);
2968     return ErrorCode;
2969   }
2970 
2971   ErrorCode =
2972     EvaluatePositionIO (&TextFile, PathName, ReplyMessagePntr, ErrorMessage);
2973 
2974   if (ErrorCode == B_OK &&
2975   ReplyMessagePntr->FindFloat (g_ResultName, &TempFloat) == B_OK)
2976   {
2977     TextFile.WriteAttr (g_AttributeNameSpamRatio, B_FLOAT_TYPE,
2978       0 /* offset */, &TempFloat, sizeof (TempFloat));
2979     /* Don't know the spam cutoff ratio, that's in the e-mail filter, so just
2980     blindly remove the prefix, which would have the wrong percentage. */
2981     RemoveSpamPrefixFromSubjectAttribute (&TextFile);
2982   }
2983 
2984   return ErrorCode;
2985 }
2986 
2987 
2988 /* Evaluate a given file or memory buffer (a BPositionIO handles both cases)
2989 for spaminess.  The output is added to the ReplyMessagePntr message, with the
2990 probability ratio stored in "result" (0.0 means genuine and 1.0 means spam).
2991 It also adds the most significant words (used in the ratio calculation) to the
2992 array "words" and the associated per-word probability ratios in "ratios".  If
2993 it fails, an error code is returned and an error message written to the
2994 ErrorMessage string (which is at least PATH_MAX + 1024 bytes long).
2995 OptionalFileName is only used in the error message.
2996 
2997 The math used for combining the individual word probabilities in my method is
2998 based on Gary Robinson's method (formerly it was a variation of Paul Graham's
2999 method) or the Chi-Squared method.  Its input is the database of words that
3000 has a count of the number of spam and number of genuine messages each word
3001 appears in (doesn't matter if it appears more than once in a message, it still
3002 counts as 1).
3003 
3004 The spam word count is divided by the total number of spam e-mail messages
3005 in the database to get the probability of spam and probability of genuineness
3006 is similarly computed for a particular word.  The spam probability is divided
3007 by the sum of the spam and genuine probabilities to get the Raw Spam Ratio for
3008 the word.  It's nearer to 0.0 for genuine and nearer to 1.0 for spam, and can
3009 be exactly zero or one too.
3010 
3011 To avoid multiplying later results by zero, and to compensate for a lack of
3012 data points, the Raw Spam Ratio is adjusted towards the 0.5 halfway point.  The
3013 0.5 is combined with the raw spam ratio, with a weight of 0.45 (determined to
3014 be a good value by the "spambayes" mailing list tests) messages applied to the
3015 half way point and a weight of the number of spam + genuine messages applied to
3016 the raw spam ratio.  This gives you the compensated spam ratio for the word.
3017 
3018 The top N (150 was good in the spambayes tests) extreme words are selected by
3019 the distance of each word's compensated spam ratio from 0.5.  Then the ratios
3020 of the words are combined.
3021 
3022 The Gary Robinson combining (scoring) method gets one value from the Nth root
3023 of the product of all the word ratios.  The other is the Nth root of the
3024 product of (1 - ratio) for all the words.  The final result is the first value
3025 divided by the sum of the two values.  The Nth root helps spread the resulting
3026 range of values more evenly between 0.0 and 1.0, otherwise the values all clump
3027 together at 0 or 1.  Also you can think of the Nth root as a kind of average
3028 for products; it's like a generic word probability which when multiplied by
3029 itself N times gives you the same result as the N separate actual word
3030 probabilities multiplied together.
3031 
3032 The Chi-Squared combining (scoring) method assumes that the spam word
3033 probabilities are uniformly distributed and computes an error measurement
3034 (called chi squared - see http://bmj.com/collections/statsbk/8.shtml for a good
3035 tutorial) and then sees how likely that error value would be observed in
3036 practice.  If it's rare to observe, then the words are likely not just randomly
3037 occurring and it's spammy.  The same is done for genuine words.  The two
3038 resulting unlikelynesses are compared to see which is more unlikely, if neither
3039 is, then the method says it can't decide.  The SpamBayes notes (see the
3040 classifier.py file in CVS in http://sourceforge.net/projects/spambayes) say:
3041 
3042 "Across vectors of length n, containing random uniformly-distributed
3043 probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution with 2*n
3044 degrees of freedom.  This has been proven (in some appropriate sense) to be the
3045 most sensitive possible test for rejecting the hypothesis that a vector of
3046 probabilities is uniformly distributed.  Gary Robinson's original scheme was
3047 monotonic *with* this test, but skipped the details.  Turns out that getting
3048 closer to the theoretical roots gives a much sharper classification, with a
3049 very small (in # of msgs), but also very broad (in range of scores), "middle
3050 ground", where most of the mistakes live.  In particular, this scheme seems
3051 immune to all forms of "cancellation disease": if there are many strong ham
3052 *and* spam clues, this reliably scores close to 0.5.  Most other schemes are
3053 extremely certain then -- and often wrong."
3054 
3055 I did a test with 448 example genuine messages including personal mail (some
3056 with HTML attachments) and mailing lists, and 267 spam messages for 27471 words
3057 total.  Test messages were more recent messages in the same groups.  Out of 100
3058 test genuine messages, with Gary Robinson (0.56 cutoff limit), 1 (1%) was
3059 falsely identified as spam and 8 of 73 (11%) spam messages were incorrectly
3060 classified as genuine.  With my variation of Paul Graham's scheme (0.90 cutoff)
3061 I got 6 of 100 (6%) genuine messages incorrectly marked as spam and 2 of 73
3062 (3%) spam messages were incorrectly classified as genuine.  Pretty close, but
3063 Robinson's values are more evenly spread out so you can tell just how spammy it
3064 is by looking at the number. */
3065 
/* A word paired with its compensated spam ratio.  The struct also serves as
the ordering predicate for containers of itself: items are ranked by how far
their ratio is from the neutral 0.5 midpoint. */

struct WordAndRatioStruct
{
  double        probabilityRatio; /* The compensated ratio, not the raw one. */
  const string *wordPntr;         /* Points at the word's text. */

  /* Less-than test used for sorting.  Returns true when the first item's
  ratio is strictly closer to 0.5 than the second item's ratio. */

  bool operator() (
    const WordAndRatioStruct &FirstItem,
    const WordAndRatioStruct &SecondItem) const
  {
    if (fabs (SecondItem.probabilityRatio - 0.5) >
    fabs (FirstItem.probabilityRatio - 0.5))
      return true;

    return false;
  }
};
3080 
3081 status_t ABSApp::EvaluatePositionIO (
3082   BPositionIO *PositionIOPntr,
3083   const char *OptionalFileName,
3084   BMessage *ReplyMessagePntr,
3085   char *ErrorMessage)
3086 {
3087   StatisticsMap::iterator            DataEndIter;
3088   StatisticsMap::iterator            DataIter;
3089   status_t                           ErrorCode;
3090   double                             GenuineProbability;
3091   uint32                             GenuineSpamSum;
3092   int                                i;
3093   priority_queue<
3094     WordAndRatioStruct /* Data type stored in the queue */,
3095     vector<WordAndRatioStruct> /* Underlying container */,
3096     WordAndRatioStruct /* Function for comparing elements */>
3097                                      PriorityQueue;
3098   double                             ProductGenuine;
3099   double                             ProductLogGenuine;
3100   double                             ProductLogSpam;
3101   double                             ProductSpam;
3102   double                             RawProbabilityRatio;
3103   float                              ResultRatio;
3104   double                             SpamProbability;
3105   StatisticsPointer                  StatisticsPntr;
3106   double                             TempDouble;
3107   double                             TotalGenuine;
3108   double                             TotalSpam;
3109   WordAndRatioStruct                 WordAndRatio;
3110   set<string>::iterator              WordEndIter;
3111   set<string>::iterator              WordIter;
3112   const WordAndRatioStruct          *WordRatioPntr;
3113   set<string>                        WordSet;
3114 
3115   /* Get the list of unique words in the file / memory buffer. */
3116 
3117   ErrorCode = GetWordsFromPositionIO (PositionIOPntr, OptionalFileName,
3118     WordSet, ErrorMessage);
3119   if (ErrorCode != B_OK)
3120     return ErrorCode;
3121 
3122   /* Prepare a few variables.  Mostly these are stored double values of some of
3123   the numbers involved (to avoid the overhead of multiple conversions from
3124   integer to double), with extra precautions to avoid divide by zero. */
3125 
3126   if (m_TotalGenuineMessages <= 0)
3127     TotalGenuine = 1.0;
3128   else
3129     TotalGenuine = m_TotalGenuineMessages;
3130 
3131   if (m_TotalSpamMessages <= 0)
3132     TotalSpam = 1.0;
3133   else
3134     TotalSpam = m_TotalSpamMessages;
3135 
3136   /* Look up the words in the database and calculate their compensated spam
3137   ratio.  The results are stored in a priority queue so that we can later find
3138   the top g_MaxInterestingWords for doing the actual determination. */
3139 
3140   WordEndIter = WordSet.end ();
3141   DataEndIter = m_WordMap.end ();
3142   for (WordIter = WordSet.begin (); WordIter != WordEndIter; WordIter++)
3143   {
3144     WordAndRatio.wordPntr = &(*WordIter);
3145 
3146     if ((DataIter = m_WordMap.find (*WordIter)) != DataEndIter)
3147     {
3148       StatisticsPntr = &DataIter->second;
3149 
3150       /* Calculate the probability the word is spam and the probability it is
3151       genuine.  Then the raw probability ratio. */
3152 
3153       SpamProbability = StatisticsPntr->spamCount / TotalSpam;
3154       GenuineProbability = StatisticsPntr->genuineCount / TotalGenuine;
3155 
3156       if (SpamProbability + GenuineProbability > 0)
3157         RawProbabilityRatio =
3158         SpamProbability / (SpamProbability + GenuineProbability);
3159       else /* Word with zero statistics, perhaps due to reclassification. */
3160         RawProbabilityRatio = 0.5;
3161 
3162       /* The compensated ratio leans towards 0.5 (g_RobinsonX) more for fewer
3163       data points, with a weight of 0.45 (g_RobinsonS). */
3164 
3165       GenuineSpamSum =
3166         StatisticsPntr->spamCount + StatisticsPntr->genuineCount;
3167 
3168       WordAndRatio.probabilityRatio =
3169         (g_RobinsonS * g_RobinsonX + GenuineSpamSum * RawProbabilityRatio) /
3170         (g_RobinsonS + GenuineSpamSum);
3171     }
3172     else /* Unknown word. With N=0, compensated ratio equation is RobinsonX. */
3173       WordAndRatio.probabilityRatio = g_RobinsonX;
3174 
3175      PriorityQueue.push (WordAndRatio);
3176   }
3177 
3178   /* Compute the combined probability (multiply them together) of the top few
3179   words.  To avoid numeric underflow (doubles can only get as small as 1E-300),
3180   logarithms are also used.  But avoid the logarithms (sum of logs of numbers
3181   is the same as the product of numbers) as much as possible due to reduced
3182   accuracy and slowness. */
3183 
3184   ProductGenuine = 1.0;
3185   ProductLogGenuine = 0.0;
3186   ProductSpam = 1.0;
3187   ProductLogSpam = 0.0;
3188   for (i = 0;
3189   i < g_MaxInterestingWords && !PriorityQueue.empty();
3190   i++, PriorityQueue.pop())
3191   {
3192     WordRatioPntr = &PriorityQueue.top();
3193     ProductSpam *= WordRatioPntr->probabilityRatio;
3194     ProductGenuine *= 1.0 - WordRatioPntr->probabilityRatio;
3195 
3196     /* Check for the numbers getting dangerously small, close to underflowing.
3197     If they are, move the value into the logarithm storage part. */
3198 
3199     if (ProductSpam < m_SmallestUseableDouble)
3200     {
3201       ProductLogSpam += log (ProductSpam);
3202       ProductSpam = 1.0;
3203     }
3204 
3205     if (ProductGenuine < m_SmallestUseableDouble)
3206     {
3207       ProductLogGenuine += log (ProductGenuine);
3208       ProductGenuine = 1.0;
3209     }
3210 
3211     ReplyMessagePntr->AddString ("words", WordRatioPntr->wordPntr->c_str ());
3212     ReplyMessagePntr->AddFloat ("ratios", WordRatioPntr->probabilityRatio);
3213   }
3214 
3215   /* Get the resulting log of the complete products. */
3216 
3217   if (i > 0)
3218   {
3219     ProductLogSpam += log (ProductSpam);
3220     ProductLogGenuine += log (ProductGenuine);
3221   }
3222 
3223   if (m_ScoringMode == SM_ROBINSON)
3224   {
3225     /* Apply Gary Robinson's scoring method where we take the Nth root of the
3226     products.  This is easiest in logarithm form. */
3227 
3228     if (i > 0)
3229     {
3230       ProductSpam = exp (ProductLogSpam / i);
3231       ProductGenuine = exp (ProductLogGenuine / i);
3232       ResultRatio = ProductSpam / (ProductGenuine + ProductSpam);
3233     }
3234     else /* Somehow got no words! */
3235       ResultRatio = g_RobinsonX;
3236   }
3237   else if (m_ScoringMode == SM_CHISQUARED)
3238   {
3239     /* From the SpamBayes notes: "We compute two chi-squared statistics, one
3240     for ham and one for spam.  The sum-of-the-logs business is more sensitive
3241     to probs near 0 than to probs near 1, so the spam measure uses 1-p (so that
3242     high-spamprob words have greatest effect), and the ham measure uses p
3243     directly (so that lo-spamprob words have greatest effect)."  That means we
3244     just reversed the meaning of the previously calculated spam and genuine
3245     products!  Oh well. */
3246 
3247     TempDouble = ProductLogSpam;
3248     ProductLogSpam = ProductLogGenuine;
3249     ProductLogGenuine = TempDouble;
3250 
3251     if (i > 0)
3252     {
3253       ProductSpam =
3254         1.0 - ChiSquaredProbability (-2.0 * ProductLogSpam, 2 * i);
3255       ProductGenuine =
3256         1.0 - ChiSquaredProbability (-2.0 * ProductLogGenuine, 2 * i);
3257 
3258       /* The SpamBayes notes say: "How to combine these into a single spam
3259       score?  We originally used (S-H)/(S+H) scaled into [0., 1.], which equals
3260       S/(S+H).  A systematic problem is that we could end up being near-certain
3261       a thing was (for example) spam, even if S was small, provided that H was
3262       much smaller.  Rob Hooft stared at these problems and invented the
3263       measure we use now, the simpler S-H, scaled into [0., 1.]." */
3264 
3265       ResultRatio = (ProductSpam - ProductGenuine + 1.0) / 2.0;
3266     }
3267     else /* No words to analyse. */
3268       ResultRatio = 0.5;
3269   }
3270   else /* Unknown scoring mode. */
3271   {
3272     strcpy (ErrorMessage, "Unknown scoring mode specified in settings");
3273     return B_BAD_VALUE;
3274   }
3275 
3276   ReplyMessagePntr->AddFloat (g_ResultName, ResultRatio);
3277   return B_OK;
3278 }
3279 
3280 
3281 /* Just evaluate the given string as being spam text. */
3282 
3283 status_t ABSApp::EvaluateString (
3284   const char *BufferPntr,
3285   ssize_t BufferSize,
3286   BMessage *ReplyMessagePntr,
3287   char *ErrorMessage)
3288 {
3289   BMemoryIO MemoryIO (BufferPntr, BufferSize);
3290 
3291   return EvaluatePositionIO (&MemoryIO, "Memory Buffer",
3292     ReplyMessagePntr, ErrorMessage);
3293 }
3294 
3295 
3296 /* Tell other programs about the scripting commands we support.  Try this
3297 command: "hey application/x-vnd.agmsmith.spamdbm getsuites" to
3298 see it in action (this program has to be already running for it to work). */
3299 
3300 status_t ABSApp::GetSupportedSuites (BMessage *MessagePntr)
3301 {
3302   BPropertyInfo TempPropInfo (g_ScriptingPropertyList);
3303 
3304   MessagePntr->AddString ("suites", "suite/x-vnd.agmsmith.spamdbm");
3305   MessagePntr->AddFlat ("messages", &TempPropInfo);
3306   return BApplication::GetSupportedSuites (MessagePntr);
3307 }
3308 
3309 
3310 /* Add all the words in the given file or memory buffer to the supplied set.
3311 The file name is only there for error messages, it assumes you have already
3312 opened the PositionIO to the right file.  If things go wrong, a non-zero error
3313 code will be returned and an explanation written to ErrorMessage (assumed to be
3314 at least PATH_MAX + 1024 bytes long). */
3315 
3316 status_t ABSApp::GetWordsFromPositionIO (
3317   BPositionIO *PositionIOPntr,
3318   const char *OptionalFileName,
3319   set<string> &WordSet,
3320   char *ErrorMessage)
3321 {
3322   status_t ErrorCode;
3323 
3324   if (m_TokenizeMode == TM_WHOLE)
3325     ErrorCode = TokenizeWhole (PositionIOPntr, OptionalFileName,
3326       WordSet, ErrorMessage);
3327   else
3328     ErrorCode = TokenizeParts (PositionIOPntr, OptionalFileName,
3329       WordSet, ErrorMessage);
3330 
3331   if (ErrorCode == B_OK && WordSet.empty ())
3332   {
3333     /* ENOMSG usually means no message found in queue, but I'm using it to show
3334     no words, a good indicator of spam which is pure HTML. */
3335 
3336     sprintf (ErrorMessage, "No words were found in \"%s\"", OptionalFileName);
3337     ErrorCode = ENOMSG;
3338   }
3339 
3340   return ErrorCode;
3341 }
3342 
3343 
3344 /* Set up indices for attributes MAIL:classification (string) and
3345 MAIL:ratio_spam (float) on all mounted disk volumes that support queries.  Also
3346 tell the system to make those attributes visible to the user (so they can see
3347 them in Tracker) and associate them with e-mail messages.  Also set up the
3348 database file MIME type (provide a description and associate it with this
3349 program so that it picks up the right icon).  And register the names for our
3350 sound effects. */
3351 
3352 status_t ABSApp::InstallThings (char *ErrorMessage)
3353 {
3354   int32       Cookie;
3355   dev_t       DeviceID;
3356   status_t    ErrorCode = B_OK;
3357   fs_info     FSInfo;
3358   int32       i;
3359   int32       iClassification;
3360   int32       iProbability;
3361   int32       j;
3362   index_info  IndexInfo;
3363   BMimeType   MimeType;
3364   BMessage    Parameters;
3365   const char *StringPntr;
3366   bool        TempBool;
3367   int32       TempInt32;
3368 
3369   /* Iterate through all mounted devices and try to make the indices on each
3370   one.  Don't bother if the index exists or the device doesn't support indices
3371   (actually queries). */
3372 
3373   Cookie = 0;
3374   while ((DeviceID = next_dev (&Cookie)) >= 0)
3375   {
3376     if (!fs_stat_dev (DeviceID, &FSInfo) && (FSInfo.flags & B_FS_HAS_QUERY))
3377     {
3378       if (fs_stat_index (DeviceID, g_AttributeNameClassification, &IndexInfo)
3379       && errno == B_ENTRY_NOT_FOUND)
3380       {
3381         if (fs_create_index (DeviceID, g_AttributeNameClassification,
3382         B_STRING_TYPE, 0 /* flags */))
3383         {
3384           ErrorCode = errno;
3385           sprintf (ErrorMessage, "Unable to make string index %s on "
3386             "volume #%d, volume name \"%s\", file system type \"%s\", "
3387             "on device \"%s\"", g_AttributeNameClassification,
3388             (int) DeviceID, FSInfo.volume_name, FSInfo.fsh_name,
3389             FSInfo.device_name);
3390         }
3391       }
3392 
3393       if (fs_stat_index (DeviceID, g_AttributeNameSpamRatio,
3394       &IndexInfo) && errno == B_ENTRY_NOT_FOUND)
3395       {
3396         if (fs_create_index (DeviceID, g_AttributeNameSpamRatio,
3397         B_FLOAT_TYPE, 0 /* flags */))
3398         {
3399           ErrorCode = errno;
3400           sprintf (ErrorMessage, "Unable to make float index %s on "
3401             "volume #%d, volume name \"%s\", file system type \"%s\", "
3402             "on device \"%s\"", g_AttributeNameSpamRatio,
3403             (int) DeviceID, FSInfo.volume_name, FSInfo.fsh_name,
3404             FSInfo.device_name);
3405         }
3406       }
3407     }
3408   }
3409   if (ErrorCode != B_OK)
3410     return ErrorCode;
3411 
3412   /* Set up the MIME types for the classification attributes, associate them
3413   with e-mail and make them visible to the user (but not editable).  First need
3414   to get the existing MIME settings, then add ours to them (otherwise the
3415   existing ones get wiped out). */
3416 
3417   ErrorCode = MimeType.SetTo ("text/x-email");
3418   if (ErrorCode != B_OK || !MimeType.IsInstalled ())
3419   {
3420     sprintf (ErrorMessage, "No e-mail MIME type (%s) in the system, can't "
3421       "update it to add our special attributes, and without e-mail this "
3422       "program is useless!", MimeType.Type ());
3423     if (ErrorCode == B_OK)
3424       ErrorCode = -1;
3425     return ErrorCode;
3426   }
3427 
3428   ErrorCode = MimeType.GetAttrInfo (&Parameters);
3429   if (ErrorCode != B_OK)
3430   {
3431     sprintf (ErrorMessage, "Unable to retrieve list of attributes "
3432       "associated with e-mail messages in the MIME database");
3433     return ErrorCode;
3434   }
3435 
3436   for (i = 0, iClassification = -1, iProbability = -1;
3437   i < 1000 && (iClassification < 0 || iProbability < 0);
3438   i++)
3439   {
3440     ErrorCode = Parameters.FindString ("attr:name", i, &StringPntr);
3441     if (ErrorCode != B_OK)
3442       break; /* Reached the end of the attributes. */
3443     if (strcmp (StringPntr, g_AttributeNameClassification) == 0)
3444       iClassification = i;
3445     else if (strcmp (StringPntr, g_AttributeNameSpamRatio) == 0)
3446       iProbability = i;
3447   }
3448 
3449   /* Add extra default settings for those programs which previously didn't
3450   update the MIME database with all the attributes that exist (so our new
3451   additions don't show up at the wrong index). */
3452 
3453   i--; /* Set i to index of last valid attribute. */
3454 
3455   for (j = 0; j <= i; j++)
3456   {
3457     if (Parameters.FindString ("attr:public_name", j, &StringPntr) ==
3458     B_BAD_INDEX)
3459     {
3460       if (Parameters.FindString ("attr:name", j, &StringPntr) != B_OK)
3461         StringPntr = "None!";
3462       Parameters.AddString ("attr:public_name", StringPntr);
3463     }
3464   }
3465 
3466   while (Parameters.FindInt32 ("attr:type", i, &TempInt32) == B_BAD_INDEX)
3467     Parameters.AddInt32 ("attr:type", B_STRING_TYPE);
3468 
3469   while (Parameters.FindBool ("attr:viewable", i, &TempBool) == B_BAD_INDEX)
3470     Parameters.AddBool ("attr:viewable", true);
3471 
3472   while (Parameters.FindBool ("attr:editable", i, &TempBool) == B_BAD_INDEX)
3473     Parameters.AddBool ("attr:editable", false);
3474 
3475   while (Parameters.FindInt32 ("attr:width", i, &TempInt32) == B_BAD_INDEX)
3476     Parameters.AddInt32 ("attr:width", 60);
3477 
3478   while (Parameters.FindInt32 ("attr:alignment", i, &TempInt32) == B_BAD_INDEX)
3479     Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3480 
3481   while (Parameters.FindBool ("attr:extra", i, &TempBool) == B_BAD_INDEX)
3482     Parameters.AddBool ("attr:extra", false);
3483 
3484   /* Add our new attributes to e-mail related things, if not already there. */
3485 
3486   if (iClassification < 0)
3487   {
3488     Parameters.AddString ("attr:name", g_AttributeNameClassification);
3489     Parameters.AddString ("attr:public_name", "Classification Group");
3490     Parameters.AddInt32 ("attr:type", B_STRING_TYPE);
3491     Parameters.AddBool ("attr:viewable", true);
3492     Parameters.AddBool ("attr:editable", false);
3493     Parameters.AddInt32 ("attr:width", 45);
3494     Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3495     Parameters.AddBool ("attr:extra", false);
3496   }
3497 
3498   if (iProbability < 0)
3499   {
3500     Parameters.AddString ("attr:name", g_AttributeNameSpamRatio);
3501     Parameters.AddString ("attr:public_name", "Spam/Genuine Estimate");
3502     Parameters.AddInt32 ("attr:type", B_FLOAT_TYPE);
3503     Parameters.AddBool ("attr:viewable", true);
3504     Parameters.AddBool ("attr:editable", false);
3505     Parameters.AddInt32 ("attr:width", 50);
3506     Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3507     Parameters.AddBool ("attr:extra", false);
3508   }
3509 
3510   if (iClassification < 0 || iProbability < 0)
3511   {
3512     ErrorCode = MimeType.SetAttrInfo (&Parameters);
3513     if (ErrorCode != B_OK)
3514     {
3515       sprintf (ErrorMessage, "Unable to associate the classification "
3516         "attributes with e-mail messages in the MIME database");
3517       return ErrorCode;
3518     }
3519   }
3520 
3521   /* Set up the MIME type for the database file. */
3522 
3523   sprintf (ErrorMessage, "Problems with setting up MIME type (%s) for "
3524     "the database files", g_ABSDatabaseFileMIMEType); /* A generic message. */
3525 
3526   ErrorCode = MimeType.SetTo (g_ABSDatabaseFileMIMEType);
3527   if (ErrorCode != B_OK)
3528     return ErrorCode;
3529 
3530   MimeType.Delete ();
3531   ErrorCode = MimeType.Install ();
3532   if (ErrorCode != B_OK)
3533   {
3534     sprintf (ErrorMessage, "Failed to install MIME type (%s) in the system",
3535       MimeType.Type ());
3536     return ErrorCode;
3537   }
3538 
3539   MimeType.SetShortDescription ("Spam Database");
3540   MimeType.SetLongDescription ("Bayesian Statistical Database for "
3541     "Classifying Junk E-Mail");
3542   sprintf (ErrorMessage, "1.0 ('%s')", g_DatabaseRecognitionString);
3543   MimeType.SetSnifferRule (ErrorMessage);
3544   MimeType.SetPreferredApp (g_ABSAppSignature);
3545 
3546   /* Set up the names of the sound effects.  Later on the user can associate
3547   sound files with the names by using the Sounds preferences panel or the
3548   installsound command.  The MDR add-on filter will trigger these sounds. */
3549 
3550   add_system_beep_event (g_BeepGenuine);
3551   add_system_beep_event (g_BeepSpam);
3552   add_system_beep_event (g_BeepUncertain);
3553 
3554   return B_OK;
3555 }
3556 
3557 
3558 /* Load the database if it hasn't been loaded yet.  Otherwise do nothing. */
3559 
3560 status_t ABSApp::LoadDatabaseIfNeeded (char *ErrorMessage)
3561 {
3562   if (m_WordMap.empty ())
3563     return LoadSaveDatabase (true /* DoLoad */, ErrorMessage);
3564 
3565   return B_OK;
3566 }
3567 
3568 
3569 /* Either load the database of spam words (DoLoad is TRUE) from the file
3570 specified in the settings, or write (DoLoad is FALSE) the database to it.  If
3571 it doesn't exist (and its parent directories do exist) then it will be created
3572 when saving.  If it doesn't exist when loading, the in-memory database will be
3573 set to an empty one and an error will be returned with an explanation put into
3574 ErrorMessage (should be big enough for a path name and a couple of lines of
3575 text).
3576 
3577 The database file format is a UTF-8 text file (well, there could be some
3578 latin-1 characters and other junk in there - it just copies the bytes from the
3579 e-mail messages directly), with tab characters to separate fields (so that you
3580 can also load it into a spreadsheet).  The first line identifies the overall
3581 file type.  The second lists pairs of classifications plus the number of
3582 messages in each class.  Currently it is just Genuine and Spam, but for future
3583 compatibility, that could be followed by more classification pairs.  The
3584 remaining lines each contain a word, the date it was last updated (actually
3585 it's the number of messages in the database when the word was added, smaller
3586 numbers mean it was updated longer ago), the genuine count and the spam count.
3587 */
3588 
3589 status_t ABSApp::LoadSaveDatabase (bool DoLoad, char *ErrorMessage)
3590 {
3591   time_t                             CurrentTime;
3592   FILE                              *DatabaseFile = NULL;
3593   BNode                              DatabaseNode;
3594   BNodeInfo                          DatabaseNodeInfo;
3595   StatisticsMap::iterator            DataIter;
3596   StatisticsMap::iterator            EndIter;
3597   status_t                           ErrorCode;
3598   int                                i;
3599   pair<StatisticsMap::iterator,bool> InsertResult;
3600   char                               LineString [10240];
3601   StatisticsRecord                   Statistics;
3602   const char                        *StringPntr;
3603   char                              *TabPntr;
3604   const char                        *WordPntr;
3605 
3606   if (DoLoad)
3607   {
3608     MakeDatabaseEmpty ();
3609     m_DatabaseHasChanged = false; /* In case of early error exit. */
3610   }
3611   else /* Saving the database, backup the old version on disk. */
3612   {
3613     ErrorCode = MakeBackup (ErrorMessage);
3614     if (ErrorCode != B_OK) /* Usually because the directory isn't there. */
3615       return ErrorCode;
3616   }
3617 
3618   DatabaseFile = fopen (m_DatabaseFileName.String (), DoLoad ? "rb" : "wb");
3619   if (DatabaseFile == NULL)
3620   {
3621     ErrorCode = errno;
3622     sprintf (ErrorMessage, "Can't open database file \"%s\" for %s",
3623       m_DatabaseFileName.String (), DoLoad ? "reading" : "writing");
3624     goto ErrorExit;
3625   }
3626 
3627   /* Process the first line, which identifies the file. */
3628 
3629   if (DoLoad)
3630   {
3631     sprintf (ErrorMessage, "Can't read first line of database file \"%s\", "
3632       "expected it to start with \"%s\"",
3633       m_DatabaseFileName.String (), g_DatabaseRecognitionString);
3634     ErrorCode = -1;
3635 
3636     if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3637       goto ErrorExit;
3638     if (strncmp (LineString, g_DatabaseRecognitionString,
3639     strlen (g_DatabaseRecognitionString)) != 0)
3640       goto ErrorExit;
3641   }
3642   else /* Saving */
3643   {
3644     CurrentTime = time (NULL);
3645     if (fprintf (DatabaseFile, "%s V1 (word, age, genuine count, spam count)\t"
3646     "Written by SpamDBM $Revision: 30630 $\t"
3647     "Compiled on " __DATE__ " at " __TIME__ "\tThis file saved on %s",
3648     g_DatabaseRecognitionString, ctime (&CurrentTime)) <= 0)
3649     {
3650       ErrorCode = errno;
3651       sprintf (ErrorMessage, "Problems when writing to database file \"%s\"",
3652         m_DatabaseFileName.String ());
3653       goto ErrorExit;
3654     }
3655   }
3656 
3657   /* The second line lists the different classifications.  We just check to see
3658   that the first two are Genuine and Spam.  If there are others, they'll be
3659   ignored and lost when the database is saved. */
3660 
3661   if (DoLoad)
3662   {
3663     sprintf (ErrorMessage, "Can't read second line of database file \"%s\", "
3664       "expected it to list classifications %s and %s along with their totals",
3665       m_DatabaseFileName.String (), g_ClassifiedGenuine, g_ClassifiedSpam);
3666     ErrorCode = B_BAD_VALUE;
3667 
3668     if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3669       goto ErrorExit;
3670     i = strlen (LineString);
3671     if (i > 0 && LineString[i-1] == '\n')
3672       LineString[i-1] = 0; /* Remove trailing line feed character. */
3673 
3674     /* Look for the title word at the start of the line. */
3675 
3676     TabPntr = LineString;
3677     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3678       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3679 
3680     if (strncmp (StringPntr, "Classifications", 15) != 0)
3681       goto ErrorExit;
3682 
3683     /* Look for the Genuine class and count. */
3684 
3685     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3686       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3687 
3688     if (strcmp (StringPntr, g_ClassifiedGenuine) != 0)
3689       goto ErrorExit;
3690 
3691     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3692       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3693 
3694     m_TotalGenuineMessages = atoll (StringPntr);
3695 
3696     /* Look for the Spam class and count. */
3697 
3698     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3699       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3700 
3701     if (strcmp (StringPntr, g_ClassifiedSpam) != 0)
3702       goto ErrorExit;
3703 
3704     for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3705       ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3706 
3707     m_TotalSpamMessages = atoll (StringPntr);
3708   }
3709   else /* Saving */
3710   {
3711     fprintf (DatabaseFile,
3712       "Classifications and total messages:\t%s\t%lu\t%s\t%lu\n",
3713       g_ClassifiedGenuine, m_TotalGenuineMessages,
3714       g_ClassifiedSpam, m_TotalSpamMessages);
3715   }
3716 
3717   /* The remainder of the file is the list of words and statistics.  Each line
3718   has a word, a tab, the time when the word was last changed in the database
3719   (sequence number of message addition, starts at 0 and goes up by one for each
3720   message added to the database), a tab then the number of messages in the
3721   first class (genuine) that had that word, then a tab, then the number of
3722   messages in the second class (spam) with that word, and so on. */
3723 
3724   if (DoLoad)
3725   {
3726     while (!feof (DatabaseFile))
3727     {
3728       if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3729       {
3730         ErrorCode = errno;
3731         if (feof (DatabaseFile))
3732           break;
3733         if (ErrorCode == B_OK)
3734           ErrorCode = -1;
3735         sprintf (ErrorMessage, "Error while reading words and statistics "
3736           "from database file \"%s\"", m_DatabaseFileName.String ());
3737         goto ErrorExit;
3738       }
3739 
3740       i = strlen (LineString);
3741       if (i > 0 && LineString[i-1] == '\n')
3742         LineString[i-1] = 0; /* Remove trailing line feed character. */
3743 
3744       /* Get the word at the start of the line, save in WordPntr. */
3745 
3746       TabPntr = LineString;
3747       for (WordPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3748         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3749 
3750       /* Get the date stamp.  Actually a sequence number, not a date. */
3751 
3752       for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3753         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3754 
3755       Statistics.age = atoll (StringPntr);
3756 
3757       /* Get the Genuine count. */
3758 
3759       for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3760         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3761 
3762       Statistics.genuineCount = atoll (StringPntr);
3763 
3764       /* Get the Spam count. */
3765 
3766       for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3767         ; if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3768 
3769       Statistics.spamCount = atoll (StringPntr);
3770 
3771       /* Ignore empty words, totally unused words and ones which are too long
3772       (avoids lots of length checking everywhere). */
3773 
3774       if (WordPntr[0] == 0 || strlen (WordPntr) > g_MaxWordLength ||
3775       (Statistics.genuineCount <= 0 && Statistics.spamCount <= 0))
3776         continue; /* Ignore this line of text, start on next one. */
3777 
3778       /* Add the combination to the database. */
3779 
3780       InsertResult = m_WordMap.insert (
3781         StatisticsMap::value_type (WordPntr, Statistics));
3782       if (InsertResult.second == false)
3783       {
3784         ErrorCode = B_BAD_VALUE;
3785         sprintf (ErrorMessage, "Error while inserting word \"%s\" from "
3786           "database \"%s\", perhaps it is a duplicate",
3787           WordPntr, m_DatabaseFileName.String ());
3788         goto ErrorExit;
3789       }
3790       m_WordCount++;
3791 
3792       /* And the hunt for the oldest word. */
3793 
3794       if (Statistics.age < m_OldestAge)
3795         m_OldestAge = Statistics.age;
3796     }
3797   }
3798   else /* Saving, dump all words and statistics to the file. */
3799   {
3800     EndIter = m_WordMap.end ();
3801     for (DataIter = m_WordMap.begin (); DataIter != EndIter; DataIter++)
3802     {
3803       if (fprintf (DatabaseFile, "%s\t%lu\t%lu\t%lu\n",
3804       DataIter->first.c_str (), DataIter->second.age,
3805       DataIter->second.genuineCount, DataIter->second.spamCount) <= 0)
3806       {
3807         ErrorCode = errno;
3808         sprintf (ErrorMessage, "Error while writing word \"%s\" to "
3809           "database \"%s\"",
3810           DataIter->first.c_str(), m_DatabaseFileName.String ());
3811         goto ErrorExit;
3812       }
3813     }
3814   }
3815 
3816   /* Set the file type so that the new file gets associated with this program,
3817   and picks up the right icon. */
3818 
3819   if (!DoLoad)
3820   {
3821     sprintf (ErrorMessage, "Unable to set attributes (file type) of database "
3822       "file \"%s\"", m_DatabaseFileName.String ());
3823     ErrorCode = DatabaseNode.SetTo (m_DatabaseFileName.String ());
3824     if (ErrorCode != B_OK)
3825       goto ErrorExit;
3826     DatabaseNodeInfo.SetTo (&DatabaseNode);
3827     ErrorCode = DatabaseNodeInfo.SetType (g_ABSDatabaseFileMIMEType);
3828     if (ErrorCode != B_OK)
3829       goto ErrorExit;
3830   }
3831 
3832   /* Success! */
3833   m_DatabaseHasChanged = false;
3834   ErrorCode = B_OK;
3835 
3836 ErrorExit:
3837   if (DatabaseFile != NULL)
3838     fclose (DatabaseFile);
3839   return ErrorCode;
3840 }
3841 
3842 
3843 /* Either load the settings (DoLoad is TRUE) from the configuration file or
3844 write them (DoLoad is FALSE) to it.  The configuration file is a flattened
3845 BMessage containing the various program settings.  If it doesn't exist (and its
3846 parent directories don't exist) then it will be created when saving.  If it
3847 doesn't exist when loading, the settings will be set to default values. */
3848 
3849 status_t ABSApp::LoadSaveSettings (bool DoLoad)
3850 {
3851   status_t    ErrorCode;
3852   const char *NamePntr;
3853   BMessage    Settings;
3854   BDirectory  SettingsDirectory;
3855   BFile       SettingsFile;
3856   const char *StringPntr;
3857   bool        TempBool;
3858   int32       TempInt32;
3859   char        TempString [PATH_MAX + 100];
3860 
3861   /* Preset things to default values if loading, in case of an error or it's an
3862   older version of the settings file which doesn't have every field defined. */
3863 
3864   if (DoLoad)
3865     DefaultSettings ();
3866 
3867   /* Look for our settings directory.  When saving we can try to create it. */
3868 
3869   ErrorCode = SettingsDirectory.SetTo (m_SettingsDirectoryPath.Path ());
3870   if (ErrorCode != B_OK)
3871   {
3872     if (DoLoad || ErrorCode != B_ENTRY_NOT_FOUND)
3873     {
3874       sprintf (TempString, "Can't find settings directory \"%s\"",
3875         m_SettingsDirectoryPath.Path ());
3876       goto ErrorExit;
3877     }
3878     ErrorCode = create_directory (m_SettingsDirectoryPath.Path (), 0755);
3879     if (ErrorCode == B_OK)
3880       ErrorCode = SettingsDirectory.SetTo (m_SettingsDirectoryPath.Path ());
3881     if (ErrorCode != B_OK)
3882     {
3883       sprintf (TempString, "Can't create settings directory \"%s\"",
3884         m_SettingsDirectoryPath.Path ());
3885       goto ErrorExit;
3886     }
3887   }
3888 
3889   ErrorCode = SettingsFile.SetTo (&SettingsDirectory, g_SettingsFileName,
3890     DoLoad ? B_READ_ONLY : B_READ_WRITE | B_CREATE_FILE | B_ERASE_FILE);
3891   if (ErrorCode != B_OK)
3892   {
3893     sprintf (TempString, "Can't open settings file \"%s\" in directory \"%s\" "
3894       "for %s", g_SettingsFileName, m_SettingsDirectoryPath.Path(),
3895       DoLoad ? "reading" : "writing");
3896     goto ErrorExit;
3897   }
3898 
3899   if (DoLoad)
3900   {
3901     ErrorCode = Settings.Unflatten (&SettingsFile);
3902     if (ErrorCode != 0 || Settings.what != g_SettingsWhatCode)
3903     {
3904       sprintf (TempString, "Corrupt data detected while reading settings "
3905         "file \"%s\" in directory \"%s\", will revert to defaults",
3906         g_SettingsFileName, m_SettingsDirectoryPath.Path());
3907       goto ErrorExit;
3908     }
3909   }
3910 
3911   /* Transfer the settings between the BMessage and our various global
3912   variables.  For loading, if the setting isn't present, leave it at the
3913   default value.  Note that loading and saving are intermingled here to make
3914   code maintenance easier (less chance of forgetting to update it if load and
3915   save were separate functions). */
3916 
3917   ErrorCode = B_OK; /* So that saving settings can record an error. */
3918 
3919   NamePntr = "DatabaseFileName";
3920   if (DoLoad)
3921   {
3922     if (Settings.FindString (NamePntr, &StringPntr) == B_OK)
3923       m_DatabaseFileName.SetTo (StringPntr);
3924   }
3925   else if (ErrorCode == B_OK)
3926     ErrorCode = Settings.AddString (NamePntr, m_DatabaseFileName);
3927 
3928   NamePntr = "ServerMode";
3929   if (DoLoad)
3930   {
3931     if (Settings.FindBool (NamePntr, &TempBool) == B_OK)
3932       g_ServerMode = TempBool;
3933   }
3934   else if (ErrorCode == B_OK)
3935     ErrorCode = Settings.AddBool (NamePntr, g_ServerMode);
3936 
3937   NamePntr = "IgnorePreviousClassification";
3938   if (DoLoad)
3939   {
3940     if (Settings.FindBool (NamePntr, &TempBool) == B_OK)
3941       m_IgnorePreviousClassification = TempBool;
3942   }
3943   else if (ErrorCode == B_OK)
3944     ErrorCode = Settings.AddBool (NamePntr, m_IgnorePreviousClassification);
3945 
3946   NamePntr = "PurgeAge";
3947   if (DoLoad)
3948   {
3949     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3950       m_PurgeAge = TempInt32;
3951   }
3952   else if (ErrorCode == B_OK)
3953     ErrorCode = Settings.AddInt32 (NamePntr, m_PurgeAge);
3954 
3955   NamePntr = "PurgePopularity";
3956   if (DoLoad)
3957   {
3958     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3959       m_PurgePopularity = TempInt32;
3960   }
3961   else if (ErrorCode == B_OK)
3962     ErrorCode = Settings.AddInt32 (NamePntr, m_PurgePopularity);
3963 
3964   NamePntr = "ScoringMode";
3965   if (DoLoad)
3966   {
3967     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3968       m_ScoringMode = (ScoringModes) TempInt32;
3969     if (m_ScoringMode < 0 || m_ScoringMode >= SM_MAX)
3970       m_ScoringMode = (ScoringModes) 0;
3971   }
3972   else if (ErrorCode == B_OK)
3973     ErrorCode = Settings.AddInt32 (NamePntr, m_ScoringMode);
3974 
3975   NamePntr = "TokenizeMode";
3976   if (DoLoad)
3977   {
3978     if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3979       m_TokenizeMode = (TokenizeModes) TempInt32;
3980     if (m_TokenizeMode < 0 || m_TokenizeMode >= TM_MAX)
3981       m_TokenizeMode = (TokenizeModes) 0;
3982   }
3983   else if (ErrorCode == B_OK)
3984     ErrorCode = Settings.AddInt32 (NamePntr, m_TokenizeMode);
3985 
3986   if (ErrorCode != B_OK)
3987   {
3988     strcpy (TempString, "Unable to stuff the program settings into a "
3989       "temporary BMessage, settings not saved");
3990     goto ErrorExit;
3991   }
3992 
3993   /* Save the settings BMessage to the settings file. */
3994 
3995   if (!DoLoad)
3996   {
3997     Settings.what = g_SettingsWhatCode;
3998     ErrorCode = Settings.Flatten (&SettingsFile);
3999     if (ErrorCode != 0)
4000     {
4001       sprintf (TempString, "Problems while writing settings file \"%s\" in "
4002         "directory \"%s\"", g_SettingsFileName,
4003         m_SettingsDirectoryPath.Path ());
4004       goto ErrorExit;
4005     }
4006   }
4007 
4008   m_SettingsHaveChanged = false;
4009   return B_OK;
4010 
4011 ErrorExit: /* Error message in TempString, code in ErrorCode. */
4012   DisplayErrorMessage (TempString, ErrorCode, DoLoad ?
4013     "Loading Settings Error" : "Saving Settings Error");
4014   return ErrorCode;
4015 }
4016 
4017 
4018 void
4019 ABSApp::MessageReceived (BMessage *MessagePntr)
4020 {
4021   const char           *PropertyName;
4022   struct property_info *PropInfoPntr;
4023   int32                 SpecifierIndex;
4024   int32                 SpecifierKind;
4025   BMessage              SpecifierMessage;
4026 
4027   /* See if it is a scripting message that applies to the database or one of
4028   the other operations this program supports.  Pass on other scripting messages
4029   to the inherited parent MessageReceived function (they're usually scripting
4030   messages for the BApplication). */
4031 
4032   switch (MessagePntr->what)
4033   {
4034     case B_GET_PROPERTY:
4035     case B_SET_PROPERTY:
4036     case B_COUNT_PROPERTIES:
4037     case B_CREATE_PROPERTY:
4038     case B_DELETE_PROPERTY:
4039     case B_EXECUTE_PROPERTY:
4040       if (MessagePntr->GetCurrentSpecifier (&SpecifierIndex, &SpecifierMessage,
4041       &SpecifierKind, &PropertyName) == B_OK &&
4042       SpecifierKind == B_DIRECT_SPECIFIER)
4043       {
4044         for (PropInfoPntr = g_ScriptingPropertyList + 0; true; PropInfoPntr++)
4045         {
4046           if (PropInfoPntr->name == 0)
4047             break; /* Ran out of commands. */
4048 
4049           if (PropInfoPntr->commands[0] == MessagePntr->what &&
4050           strcasecmp (PropInfoPntr->name, PropertyName) == 0)
4051           {
4052             ProcessScriptingMessage (MessagePntr, PropInfoPntr);
4053             return;
4054           }
4055         }
4056       }
4057       break;
4058   }
4059 
4060   /* Pass the unprocessed message to the inherited function, maybe it knows
4061   what to do.  This includes replies to messages we sent ourselves. */
4062 
4063   BApplication::MessageReceived (MessagePntr);
4064 }
4065 
4066 
4067 /* Rename the existing database file to a backup file name, potentially
4068 replacing an older backup.  If something goes wrong, returns an error code and
4069 puts an explanation in ErrorMessage. */
4070 
4071 status_t ABSApp::MakeBackup (char *ErrorMessage)
4072 {
4073   BEntry   Entry;
4074   status_t ErrorCode;
4075   int      i;
4076   char     LeafName [NAME_MAX];
4077   char     NewName [PATH_MAX+20];
4078   char     OldName [PATH_MAX+20];
4079 
4080   ErrorCode = Entry.SetTo (m_DatabaseFileName.String ());
4081   if (ErrorCode != B_OK)
4082   {
4083     sprintf (ErrorMessage, "While making backup, failed to make a BEntry for "
4084       "\"%s\" (maybe the directory doesn't exist?)",
4085       m_DatabaseFileName.String ());
4086     return ErrorCode;
4087   }
4088   if (!Entry.Exists ())
4089     return B_OK; /* No existing file to worry about overwriting. */
4090   Entry.GetName (LeafName);
4091 
4092   /* Find the first hole (no file) where we will stop the renaming chain. */
4093 
4094   for (i = 0; i < g_MaxBackups - 1; i++)
4095   {
4096     strcpy (OldName, m_DatabaseFileName.String ());
4097     sprintf (OldName + strlen (OldName), g_BackupSuffix, i);
4098     Entry.SetTo (OldName);
4099     if (!Entry.Exists ())
4100       break;
4101   }
4102 
4103   /* Move the files down by one to fill in the hole in the name series. */
4104 
4105   for (i--; i >= 0; i--)
4106   {
4107     strcpy (OldName, m_DatabaseFileName.String ());
4108     sprintf (OldName + strlen (OldName), g_BackupSuffix, i);
4109     Entry.SetTo (OldName);
4110     strcpy (NewName, LeafName);
4111     sprintf (NewName + strlen (NewName), g_BackupSuffix, i + 1);
4112     ErrorCode = Entry.Rename (NewName, true /* clobber */);
4113   }
4114 
4115   Entry.SetTo (m_DatabaseFileName.String ());
4116   strcpy (NewName, LeafName);
4117   sprintf (NewName + strlen (NewName), g_BackupSuffix, 0);
4118   ErrorCode = Entry.Rename (NewName, true /* clobber */);
4119   if (ErrorCode != B_OK)
4120     sprintf (ErrorMessage, "While making backup, failed to rename "
4121       "\"%s\" to \"%s\"", m_DatabaseFileName.String (), NewName);
4122 
4123   return ErrorCode;
4124 }
4125 
4126 
4127 void
4128 ABSApp::MakeDatabaseEmpty ()
4129 {
4130   m_WordMap.clear (); /* Sets the map to empty, deallocating any old data. */
4131   m_WordCount = 0;
4132   m_TotalGenuineMessages = 0;
4133   m_TotalSpamMessages = 0;
4134   m_OldestAge = (uint32) -1 /* makes largest number possible */;
4135 }
4136 
4137 
4138 /* Do what the scripting command says.  A reply message will be sent back with
4139 several fields: "error" containing the numerical error code (0 for success),
4140 "CommandText" with a text representation of the command, "result" with the
4141 resulting data for a get or count command.  If it isn't understood, then rather
4142 than a B_REPLY kind of message, it will be a B_MESSAGE_NOT_UNDERSTOOD message
with an "error" number and a "message" string with a description. */
4144 
4145 void
4146 ABSApp::ProcessScriptingMessage (
4147   BMessage *MessagePntr,
4148   struct property_info *PropInfoPntr)
4149 {
4150   bool        ArgumentBool = false;
4151   bool        ArgumentGotBool = false;
4152   bool        ArgumentGotInt32 = false;
4153   bool        ArgumentGotString = false;
4154   int32       ArgumentInt32 = 0;
4155   const char *ArgumentString = NULL;
4156   BString     CommandText;
4157   status_t    ErrorCode;
4158   int         i;
4159   BMessage    ReplyMessage (B_MESSAGE_NOT_UNDERSTOOD);
4160   ssize_t     StringBufferSize;
4161   BMessage    TempBMessage;
4162   BPath       TempPath;
4163   char        TempString [PATH_MAX + 1024];
4164 
4165   if (g_QuitCountdown >= 0 && !g_CommandLineMode)
4166   {
4167     g_QuitCountdown = -1;
4168     cerr << "Quit countdown aborted due to a scripting command arriving.\n";
4169   }
4170 
4171   if (g_BusyCursor != NULL)
4172     SetCursor (g_BusyCursor);
4173 
4174   ErrorCode = MessagePntr->FindData (g_DataName, B_STRING_TYPE,
4175     (const void **) &ArgumentString, &StringBufferSize);
4176   if (ErrorCode == B_OK)
4177   {
4178     if (PropInfoPntr->extra_data != PN_EVALUATE_STRING &&
4179     PropInfoPntr->extra_data != PN_SPAM_STRING &&
4180     PropInfoPntr->extra_data != PN_GENUINE_STRING &&
4181     strlen (ArgumentString) >= PATH_MAX)
4182     {
4183       sprintf (TempString, "\"data\" string of a scripting message is too "
4184         "long, for SET %s action", PropInfoPntr->name);
4185       ErrorCode = B_NAME_TOO_LONG;
4186       goto ErrorExit;
4187     }
4188     ArgumentGotString = true;
4189   }
4190   else if (MessagePntr->FindBool (g_DataName, &ArgumentBool) == B_OK)
4191     ArgumentGotBool = true;
4192   else if (MessagePntr->FindInt32 (g_DataName, &ArgumentInt32) == B_OK)
4193     ArgumentGotInt32 = true;
4194 
4195   /* Prepare a Human readable description of the scripting command. */
4196 
4197   switch (PropInfoPntr->commands[0])
4198   {
4199     case B_SET_PROPERTY:
4200       CommandText.SetTo ("Set ");
4201       break;
4202 
4203     case B_GET_PROPERTY:
4204       CommandText.SetTo ("Get ");
4205       break;
4206 
4207     case B_COUNT_PROPERTIES:
4208       CommandText.SetTo ("Count ");
4209       break;
4210 
4211     case B_CREATE_PROPERTY:
4212       CommandText.SetTo ("Create ");
4213       break;
4214 
4215     case B_DELETE_PROPERTY:
4216       CommandText.SetTo ("Delete ");
4217       break;
4218 
4219     case B_EXECUTE_PROPERTY:
4220       CommandText.SetTo ("Execute ");
4221       break;
4222 
4223     default:
4224       sprintf (TempString, "Bug: scripting command for \"%s\" has an unknown "
4225         "action code %d", PropInfoPntr->name,
4226         (int) PropInfoPntr->commands[0]);
4227       ErrorCode = -1;
4228       goto ErrorExit;
4229   }
4230   CommandText.Append (PropInfoPntr->name);
4231 
4232   /* Add on the argument value to our readable command, if there is one. */
4233 
4234   if (ArgumentGotString)
4235   {
4236     CommandText.Append (" \"");
4237     CommandText.Append (ArgumentString);
4238     CommandText.Append ("\"");
4239   }
4240   if (ArgumentGotBool)
4241     CommandText.Append (ArgumentBool ? " true" : " false");
4242   if (ArgumentGotInt32)
4243   {
4244     sprintf (TempString, " %ld", ArgumentInt32);
4245     CommandText.Append (TempString);
4246   }
4247 
4248   /* From now on the scripting command has been recognized and is in the
4249   correct format, so it always returns a B_REPLY message.  A readable version
4250   of the command is also added to make debugging easier. */
4251 
4252   ReplyMessage.what = B_REPLY;
4253   ReplyMessage.AddString ("CommandText", CommandText);
4254 
4255   /* Now actually do the command.  First prepare a default error message. */
4256 
4257   sprintf (TempString, "Operation code %d (get, set, count, etc) "
4258     "unsupported for property %s",
4259     (int) PropInfoPntr->commands[0], PropInfoPntr->name);
4260   ErrorCode = B_BAD_INDEX;
4261 
4262   switch (PropInfoPntr->extra_data)
4263   {
4264     case PN_DATABASE_FILE:
4265       switch (PropInfoPntr->commands[0])
4266       {
4267         case B_GET_PROPERTY: /* Get the database file name. */
4268           ReplyMessage.AddString (g_ResultName, m_DatabaseFileName);
4269           break;
4270 
4271         case B_SET_PROPERTY: /* Set the database file name to a new one. */
4272           if (!ArgumentGotString)
4273           {
4274             ErrorCode = B_BAD_TYPE;
4275             sprintf (TempString, "You need to specify a string for the "
4276               "SET %s command", PropInfoPntr->name);
4277             goto ErrorExit;
4278           }
4279           ErrorCode = TempPath.SetTo (ArgumentString, NULL /* leaf */,
4280             true /* normalize - verifies parent directories exist */);
4281           if (ErrorCode != B_OK)
4282           {
4283             sprintf (TempString, "New database path name of \"%s\" is invalid "
4284               "(parent directories must exist)", ArgumentString);
4285             goto ErrorExit;
4286           }
4287           if ((ErrorCode = SaveDatabaseIfNeeded (TempString)) != B_OK)
4288             goto ErrorExit;
4289           MakeDatabaseEmpty (); /* So that the new one gets loaded if used. */
4290 
4291           if (strlen (TempPath.Leaf ()) > NAME_MAX-strlen(g_BackupSuffix)-1)
4292           {
4293             /* Truncate the name so that there is enough space for the backup
4294             extension.  Approximately. */
4295             strcpy (TempString, TempPath.Leaf ());
4296             TempString [NAME_MAX - strlen (g_BackupSuffix) - 1] = 0;
4297             TempPath.GetParent (&TempPath);
4298             TempPath.Append (TempString);
4299           }
4300           m_DatabaseFileName.SetTo (TempPath.Path ());
4301           m_SettingsHaveChanged = true;
4302           break;
4303 
4304         case B_CREATE_PROPERTY: /* Make a new database file plus more. */
4305           if ((ErrorCode = CreateDatabaseFile (TempString)) != B_OK)
4306             goto ErrorExit;
4307           break;
4308 
4309         case B_DELETE_PROPERTY: /* Delete the file and its backups too. */
4310           if ((ErrorCode = DeleteDatabaseFile (TempString)) != B_OK)
4311             goto ErrorExit;
4312           break;
4313 
4314         case B_COUNT_PROPERTIES:
4315           if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
4316             goto ErrorExit;
4317           ReplyMessage.AddInt32 (g_ResultName, m_WordCount);
4318           break;
4319 
4320         default: /* Unknown operation code, error message already set. */
4321           goto ErrorExit;
4322       }
4323       break;
4324 
4325     case PN_SPAM:
4326     case PN_SPAM_STRING:
4327     case PN_GENUINE:
4328     case PN_GENUINE_STRING:
4329     case PN_UNCERTAIN:
4330       switch (PropInfoPntr->commands[0])
4331       {
4332         case B_COUNT_PROPERTIES: /* Get the number of spam/genuine messages. */
4333           if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
4334             goto ErrorExit;
4335           if (PropInfoPntr->extra_data == PN_SPAM ||
4336           PropInfoPntr->extra_data == PN_SPAM_STRING)
4337             ReplyMessage.AddInt32 (g_ResultName, m_TotalSpamMessages);
4338           else
4339             ReplyMessage.AddInt32 (g_ResultName, m_TotalGenuineMessages);
4340           break;
4341 
4342         case B_SET_PROPERTY: /* Add spam/genuine/uncertain to database. */
4343           if (!ArgumentGotString)
4344           {
4345             ErrorCode = B_BAD_TYPE;
4346             sprintf (TempString, "You need to specify a string (%s) "
4347               "for the SET %s command",
4348               (PropInfoPntr->extra_data == PN_GENUINE_STRING ||
4349               PropInfoPntr->extra_data == PN_SPAM_STRING)
4350               ? "text of the message to be added"
4351               : "pathname of the file containing the text to be added",
4352               PropInfoPntr->name);
4353             goto ErrorExit;
4354           }
4355           if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
4356             goto ErrorExit;
4357           if (PropInfoPntr->extra_data == PN_GENUINE ||
4358           PropInfoPntr->extra_data == PN_SPAM ||
4359           PropInfoPntr->extra_data == PN_UNCERTAIN)
4360             ErrorCode = AddFileToDatabase (
4361               (PropInfoPntr->extra_data == PN_SPAM) ? CL_SPAM :
4362               ((PropInfoPntr->extra_data == PN_GENUINE) ? CL_GENUINE :
4363               CL_UNCERTAIN),
4364               ArgumentString, TempString /* ErrorMessage */);
4365           else
4366             ErrorCode = AddStringToDatabase (
4367               (PropInfoPntr->extra_data == PN_SPAM_STRING) ?
4368               CL_SPAM : CL_GENUINE,
4369               ArgumentString, TempString /* ErrorMessage */);
4370           if (ErrorCode != B_OK)
4371             goto ErrorExit;
4372           break;
4373 
4374         default: /* Unknown operation code, error message already set. */
4375           goto ErrorExit;
4376       }
4377       break;
4378 
4379     case PN_IGNORE_PREVIOUS_CLASSIFICATION:
4380       switch (PropInfoPntr->commands[0])
4381       {
4382         case B_GET_PROPERTY:
4383           ReplyMessage.AddBool (g_ResultName, m_IgnorePreviousClassification);
4384           break;
4385 
4386         case B_SET_PROPERTY:
4387           if (!ArgumentGotBool)
4388           {
4389             ErrorCode = B_BAD_TYPE;
4390             sprintf (TempString, "You need to specify a boolean (true/yes, "
4391               "false/no) for the SET %s command", PropInfoPntr->name);
4392             goto ErrorExit;
4393           }
4394           m_IgnorePreviousClassification = ArgumentBool;
4395           m_SettingsHaveChanged = true;
4396           break;
4397 
4398         default: /* Unknown operation code, error message already set. */
4399           goto ErrorExit;
4400       }
4401       break;
4402 
4403     case PN_SERVER_MODE:
4404       switch (PropInfoPntr->commands[0])
4405       {
4406         case B_GET_PROPERTY:
4407           ReplyMessage.AddBool (g_ResultName, g_ServerMode);
4408           break;
4409 
4410         case B_SET_PROPERTY:
4411           if (!ArgumentGotBool)
4412           {
4413             ErrorCode = B_BAD_TYPE;
4414             sprintf (TempString, "You need to specify a boolean (true/yes, "
4415               "false/no) for the SET %s command", PropInfoPntr->name);
4416             goto ErrorExit;
4417           }
4418           g_ServerMode = ArgumentBool;
4419           m_SettingsHaveChanged = true;
4420           break;
4421 
4422         default: /* Unknown operation code, error message already set. */
4423           goto ErrorExit;
4424       }
4425       break;
4426 
4427     case PN_FLUSH:
4428       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
4429       (ErrorCode = SaveDatabaseIfNeeded (TempString)) == B_OK)
4430         break;
4431       goto ErrorExit;
4432 
4433     case PN_PURGE_AGE:
4434       switch (PropInfoPntr->commands[0])
4435       {
4436         case B_GET_PROPERTY:
4437           ReplyMessage.AddInt32 (g_ResultName, m_PurgeAge);
4438           break;
4439 
4440         case B_SET_PROPERTY:
4441           if (!ArgumentGotInt32)
4442           {
4443             ErrorCode = B_BAD_TYPE;
4444             sprintf (TempString, "You need to specify a 32 bit integer "
4445               "for the SET %s command", PropInfoPntr->name);
4446             goto ErrorExit;
4447           }
4448           m_PurgeAge = ArgumentInt32;
4449           m_SettingsHaveChanged = true;
4450           break;
4451 
4452         default: /* Unknown operation code, error message already set. */
4453           goto ErrorExit;
4454       }
4455       break;
4456 
4457     case PN_PURGE_POPULARITY:
4458       switch (PropInfoPntr->commands[0])
4459       {
4460         case B_GET_PROPERTY:
4461           ReplyMessage.AddInt32 (g_ResultName, m_PurgePopularity);
4462           break;
4463 
4464         case B_SET_PROPERTY:
4465           if (!ArgumentGotInt32)
4466           {
4467             ErrorCode = B_BAD_TYPE;
4468             sprintf (TempString, "You need to specify a 32 bit integer "
4469               "for the SET %s command", PropInfoPntr->name);
4470             goto ErrorExit;
4471           }
4472           m_PurgePopularity = ArgumentInt32;
4473           m_SettingsHaveChanged = true;
4474           break;
4475 
4476         default: /* Unknown operation code, error message already set. */
4477           goto ErrorExit;
4478       }
4479       break;
4480 
4481     case PN_PURGE:
4482       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
4483       (ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK &&
4484       (ErrorCode = PurgeOldWords (TempString)) == B_OK)
4485         break;
4486       goto ErrorExit;
4487 
4488     case PN_OLDEST:
4489       if (PropInfoPntr->commands[0] == B_GET_PROPERTY &&
4490       (ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK)
4491       {
4492         ReplyMessage.AddInt32 (g_ResultName, m_OldestAge);
4493         break;
4494       }
4495       goto ErrorExit;
4496 
4497     case PN_EVALUATE:
4498     case PN_EVALUATE_STRING:
4499       if (PropInfoPntr->commands[0] == B_SET_PROPERTY)
4500       {
4501         if (!ArgumentGotString)
4502         {
4503           ErrorCode = B_BAD_TYPE;
4504           sprintf (TempString, "You need to specify a string for the "
4505             "SET %s command", PropInfoPntr->name);
4506           goto ErrorExit;
4507         }
4508         if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK)
4509         {
4510           if (PropInfoPntr->extra_data == PN_EVALUATE)
4511           {
4512             if ((ErrorCode = EvaluateFile (ArgumentString, &ReplyMessage,
4513             TempString)) == B_OK)
4514               break;
4515           }
4516           else /* PN_EVALUATE_STRING */
4517           {
4518             if ((ErrorCode = EvaluateString (ArgumentString, StringBufferSize,
4519             &ReplyMessage, TempString)) == B_OK)
4520               break;
4521           }
4522         }
4523       }
4524       goto ErrorExit;
4525 
4526     case PN_RESET_TO_DEFAULTS:
4527       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY)
4528       {
4529         DefaultSettings ();
4530         break;
4531       }
4532       goto ErrorExit;
4533 
4534     case PN_INSTALL_THINGS:
4535       if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
4536       (ErrorCode = InstallThings (TempString)) == B_OK)
4537         break;
4538       goto ErrorExit;
4539 
4540     case PN_SCORING_MODE:
4541       switch (PropInfoPntr->commands[0])
4542       {
4543         case B_GET_PROPERTY:
4544           ReplyMessage.AddString (g_ResultName,
4545             g_ScoringModeNames[m_ScoringMode]);
4546           break;
4547 
4548         case B_SET_PROPERTY:
4549           i = SM_MAX;
4550           if (ArgumentGotString)
4551             for (i = 0; i < SM_MAX; i++)
4552             {
4553               if (strcasecmp (ArgumentString, g_ScoringModeNames [i]) == 0)
4554               {
4555                 m_ScoringMode = (ScoringModes) i;
4556                 m_SettingsHaveChanged = true;
4557                 break;
4558               }
4559             }
4560           if (i >= SM_MAX) /* Didn't find a valid scoring mode word. */
4561           {
4562             ErrorCode = B_BAD_TYPE;
4563             sprintf (TempString, "You used the unrecognized \"%s\" as "
4564               "a scoring mode for the SET %s command.  Should be one of: ",
4565               ArgumentGotString ? ArgumentString : "not specified",
4566               PropInfoPntr->name);
4567             for (i = 0; i < SM_MAX; i++)
4568             {
4569               strcat (TempString, g_ScoringModeNames [i]);
4570               if (i < SM_MAX - 1)
4571                 strcat (TempString, ", ");
4572             }
4573             goto ErrorExit;
4574           }
4575           break;
4576 
4577         default: /* Unknown operation code, error message already set. */
4578           goto ErrorExit;
4579       }
4580       break;
4581 
4582     case PN_TOKENIZE_MODE:
4583       switch (PropInfoPntr->commands[0])
4584       {
4585         case B_GET_PROPERTY:
4586           ReplyMessage.AddString (g_ResultName,
4587             g_TokenizeModeNames[m_TokenizeMode]);
4588           break;
4589 
4590         case B_SET_PROPERTY:
4591           i = TM_MAX;
4592           if (ArgumentGotString)
4593             for (i = 0; i < TM_MAX; i++)
4594             {
4595               if (strcasecmp (ArgumentString, g_TokenizeModeNames [i]) == 0)
4596               {
4597                 m_TokenizeMode = (TokenizeModes) i;
4598                 m_SettingsHaveChanged = true;
4599                 break;
4600               }
4601             }
4602           if (i >= TM_MAX) /* Didn't find a valid tokenize mode word. */
4603           {
4604             ErrorCode = B_BAD_TYPE;
4605             sprintf (TempString, "You used the unrecognized \"%s\" as "
4606               "a tokenize mode for the SET %s command.  Should be one of: ",
4607               ArgumentGotString ? ArgumentString : "not specified",
4608               PropInfoPntr->name);
4609             for (i = 0; i < TM_MAX; i++)
4610             {
4611               strcat (TempString, g_TokenizeModeNames [i]);
4612               if (i < TM_MAX - 1)
4613                 strcat (TempString, ", ");
4614             }
4615             goto ErrorExit;
4616           }
4617           break;
4618 
4619         default: /* Unknown operation code, error message already set. */
4620           goto ErrorExit;
4621       }
4622       break;
4623 
4624     default:
4625       sprintf (TempString, "Bug!  Unrecognized property identification "
4626         "number %d (should be between 0 and %d).  Fix the entry in "
4627         "the g_ScriptingPropertyList array!",
4628         (int) PropInfoPntr->extra_data, PN_MAX - 1);
4629       goto ErrorExit;
4630   }
4631 
4632   /* Success. */
4633 
4634   ReplyMessage.AddInt32 ("error", B_OK);
4635   ErrorCode = MessagePntr->SendReply (&ReplyMessage,
4636     this /* Reply's reply handler */, 500000 /* send timeout */);
4637   if (ErrorCode != B_OK)
4638     cerr << "ProcessScriptingMessage failed to send a reply message, code " <<
4639     ErrorCode << " (" << strerror (ErrorCode) << ")" << " for " <<
4640     CommandText.String () << endl;
4641   SetCursor (B_CURSOR_SYSTEM_DEFAULT);
4642   return;
4643 
4644 ErrorExit: /* Error message in TempString, return code in ErrorCode. */
4645   ReplyMessage.AddInt32 ("error", ErrorCode);
4646   ReplyMessage.AddString ("message", TempString);
4647   DisplayErrorMessage (TempString, ErrorCode);
4648   ErrorCode = MessagePntr->SendReply (&ReplyMessage,
4649     this /* Reply's reply handler */, 500000 /* send timeout */);
4650   if (ErrorCode != B_OK)
4651     cerr << "ProcessScriptingMessage failed to send an error message, code " <<
4652     ErrorCode << " (" << strerror (ErrorCode) << ")" << " for " <<
4653     CommandText.String () << endl;
4654   SetCursor (B_CURSOR_SYSTEM_DEFAULT);
4655 }
4656 
4657 
4658 /* Since quitting stops the program before the results of a script command are
4659 received, we use a time delay to do the quit and make sure there are no pending
4660 commands being processed by the auxiliary looper which is sending us commands.
4661 Also, we have a countdown which can be interrupted by an incoming scripting
4662 message in case one client tells us to quit while another one is still using us
4663 (happens when you have two or more e-mail accounts).  But if the system is
4664 shutting down, quit immediately! */
4665 
4666 void
4667 ABSApp::Pulse ()
4668 {
4669   if (g_QuitCountdown == 0)
4670   {
4671     if (g_CommanderLooperPntr == NULL ||
4672     !g_CommanderLooperPntr->IsBusy ())
4673       PostMessage (B_QUIT_REQUESTED);
4674   }
4675   else if (g_QuitCountdown > 0)
4676   {
4677     cerr << "SpamDBM quitting in " << g_QuitCountdown << ".\n";
4678     g_QuitCountdown--;
4679   }
4680 }
4681 
4682 
4683 /* A quit request message has come in.  If the quit countdown has reached zero,
4684 allow the request, otherwise reject it (and start the countdown if it hasn't
4685 been started). */
4686 
4687 bool
4688 ABSApp::QuitRequested ()
4689 {
4690   BMessage  *QuitMessage;
4691   team_info  RemoteInfo;
4692   BMessenger RemoteMessenger;
4693   team_id    RemoteTeam;
4694 
4695   /* See if the quit is from the system shutdown command (which goes through
4696   the registrar server), if so, quit immediately. */
4697 
4698   QuitMessage = CurrentMessage ();
4699   if (QuitMessage != NULL && QuitMessage->IsSourceRemote ())
4700   {
4701     RemoteMessenger = QuitMessage->ReturnAddress ();
4702     RemoteTeam = RemoteMessenger.Team ();
4703     if (get_team_info (RemoteTeam, &RemoteInfo) == B_OK &&
4704     strstr (RemoteInfo.args, "registrar") != NULL)
4705       g_QuitCountdown = 0;
4706   }
4707 
4708   if (g_QuitCountdown == 0)
4709     return BApplication::QuitRequested ();
4710 
4711   if (g_QuitCountdown < 0)
4712 //    g_QuitCountdown = 10; /* Start the countdown. */
4713     g_QuitCountdown = 5; /* Quit more quickly */
4714 
4715   return false;
4716 }
4717 
4718 
4719 /* Go through the current database and delete words which are too old (time is
4720 equivalent to the number of messages added to the database) and too unpopular
4721 (words not used by many messages).  Hopefully this will get rid of words which
4722 are just hunks of binary or other garbage.  The database has been loaded
4723 elsewhere. */
4724 
4725 status_t
4726 ABSApp::PurgeOldWords (char *ErrorMessage)
4727 {
4728   uint32                  CurrentTime;
4729   StatisticsMap::iterator CurrentIter;
4730   StatisticsMap::iterator EndIter;
4731   StatisticsMap::iterator NextIter;
4732   char                    TempString [80];
4733 
4734   strcpy (ErrorMessage, "Purge can't fail"); /* So argument gets used. */
4735   CurrentTime = m_TotalGenuineMessages + m_TotalSpamMessages - 1;
4736   m_OldestAge = (uint32) -1 /* makes largest number possible */;
4737 
4738   EndIter = m_WordMap.end ();
4739   NextIter = m_WordMap.begin ();
4740   while (NextIter != EndIter) {
4741     CurrentIter = NextIter++;
4742 
4743     if (CurrentTime - CurrentIter->second.age >= m_PurgeAge &&
4744     CurrentIter->second.genuineCount + CurrentIter->second.spamCount <=
4745     m_PurgePopularity) {
4746       /* Delete this word, it is unpopular and old.  Sob. */
4747 
4748       m_WordMap.erase (CurrentIter);
4749       if (m_WordCount > 0)
4750         m_WordCount--;
4751 
4752       m_DatabaseHasChanged = true;
4753     }
4754     else /* This word is still in the database.  Update oldest age. */
4755     {
4756       if (CurrentIter->second.age < m_OldestAge)
4757         m_OldestAge = CurrentIter->second.age;
4758     }
4759   }
4760 
4761   /* Just a little bug check here.  Just in case. */
4762 
4763   if (m_WordCount != m_WordMap.size ()) {
4764     sprintf (TempString, "Our word count of %lu doesn't match the "
4765       "size of the database, %lu", m_WordCount, m_WordMap.size());
4766     DisplayErrorMessage (TempString, -1, "Bug!");
4767     m_WordCount = m_WordMap.size ();
4768   }
4769 
4770   return B_OK;
4771 }
4772 
4773 
4774 void
4775 ABSApp::ReadyToRun ()
4776 {
4777   DatabaseWindow *DatabaseWindowPntr;
4778   float           JunkFloat;
4779   BButton        *TempButtonPntr;
4780   BCheckBox      *TempCheckBoxPntr;
4781   font_height     TempFontHeight;
4782   BMenuBar       *TempMenuBarPntr;
4783   BMenuItem      *TempMenuItemPntr;
4784   BPopUpMenu     *TempPopUpMenuPntr;
4785   BRadioButton   *TempRadioButtonPntr;
4786   BRect           TempRect;
4787   const char     *TempString = "Testing My Things";
4788   BStringView    *TempStringViewPntr;
4789   BTextControl   *TempTextPntr;
4790   BWindow        *TempWindowPntr;
4791 
4792   /* This batch of code gets some measurements which will be used for laying
4793   out controls and other GUI elements.  Set the spacing between buttons and
4794   other controls to the width of the letter "M" in the user's desired font. */
4795 
4796  g_MarginBetweenControls = (int) be_plain_font->StringWidth ("M");
4797 
4798   /* Also find out how much space a line of text uses. */
4799 
4800   be_plain_font->GetHeight (&TempFontHeight);
4801   g_LineOfTextHeight = ceilf (
4802     TempFontHeight.ascent + TempFontHeight.descent + TempFontHeight.leading);
4803 
4804   /* Start finding out the height of various user interface gadgets, which can
4805   vary based on the current font size.  Make a temporary gadget, which is
4806   attached to our window, then resize it to its prefered size so that it
4807   accomodates the font size and other frills it needs. */
4808 
4809   TempWindowPntr = new (std::nothrow) BWindow (BRect (10, 20, 200, 200),
4810 	"Temporary Window", B_DOCUMENT_WINDOW,
4811 	B_NO_WORKSPACE_ACTIVATION | B_ASYNCHRONOUS_CONTROLS);
4812   if (TempWindowPntr == NULL) {
4813     DisplayErrorMessage ("Unable to create temporary window for finding "
4814       "sizes of controls.");
4815     g_QuitCountdown = 0;
4816     return;
4817   }
4818 
4819   TempRect = TempWindowPntr->Bounds ();
4820 
4821   /* Find the height of a single line of text in a BStringView. */
4822 
4823   TempStringViewPntr = new (std::nothrow) BStringView (TempRect, TempString, TempString);
4824   if (TempStringViewPntr != NULL) {
4825     TempWindowPntr->Lock();
4826     TempWindowPntr->AddChild (TempStringViewPntr);
4827     TempStringViewPntr->GetPreferredSize (&JunkFloat, &g_StringViewHeight);
4828     TempWindowPntr->RemoveChild (TempStringViewPntr);
4829     TempWindowPntr->Unlock();
4830     delete TempStringViewPntr;
4831   }
4832 
4833   /* Find the height of a button, which seems to be larger than a text
4834   control and can make life difficult.  Make a temporary button, which
4835   is attached to our window so that it resizes to accomodate the font size. */
4836 
4837   TempButtonPntr = new (std::nothrow) BButton (TempRect, TempString, TempString, NULL);
4838   if (TempButtonPntr != NULL) {
4839     TempWindowPntr->Lock();
4840     TempWindowPntr->AddChild (TempButtonPntr);
4841     TempButtonPntr->GetPreferredSize (&JunkFloat, &g_ButtonHeight);
4842     TempWindowPntr->RemoveChild (TempButtonPntr);
4843     TempWindowPntr->Unlock();
4844     delete TempButtonPntr;
4845   }
4846 
4847   /* Find the height of a text box. */
4848 
4849   TempTextPntr = new (std::nothrow) BTextControl (TempRect, TempString, NULL /* label */,
4850     TempString, NULL);
4851   if (TempTextPntr != NULL) {
4852     TempWindowPntr->Lock ();
4853     TempWindowPntr->AddChild (TempTextPntr);
4854     TempTextPntr->GetPreferredSize (&JunkFloat, &g_TextBoxHeight);
4855     TempWindowPntr->RemoveChild (TempTextPntr);
4856     TempWindowPntr->Unlock ();
4857     delete TempTextPntr;
4858   }
4859 
4860   /* Find the height of a checkbox control. */
4861 
4862   TempCheckBoxPntr = new (std::nothrow) BCheckBox (TempRect, TempString, TempString, NULL);
4863   if (TempCheckBoxPntr != NULL) {
4864     TempWindowPntr->Lock ();
4865     TempWindowPntr->AddChild (TempCheckBoxPntr);
4866     TempCheckBoxPntr->GetPreferredSize (&JunkFloat, &g_CheckBoxHeight);
4867     TempWindowPntr->RemoveChild (TempCheckBoxPntr);
4868     TempWindowPntr->Unlock ();
4869     delete TempCheckBoxPntr;
4870   }
4871 
4872   /* Find the height of a radio button control. */
4873 
4874   TempRadioButtonPntr =
4875     new (std::nothrow) BRadioButton (TempRect, TempString, TempString, NULL);
4876   if (TempRadioButtonPntr != NULL) {
4877     TempWindowPntr->Lock ();
4878     TempWindowPntr->AddChild (TempRadioButtonPntr);
4879     TempRadioButtonPntr->GetPreferredSize (&JunkFloat, &g_RadioButtonHeight);
4880     TempWindowPntr->RemoveChild (TempRadioButtonPntr);
4881     TempWindowPntr->Unlock ();
4882     delete TempRadioButtonPntr;
4883   }
4884 
4885   /* Find the height of a pop-up menu. */
4886 
4887   TempMenuBarPntr = new (std::nothrow) BMenuBar (TempRect, TempString,
4888     B_FOLLOW_LEFT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
4889     true /* resize to fit items */);
4890   TempPopUpMenuPntr = new (std::nothrow) BPopUpMenu (TempString);
4891   TempMenuItemPntr = new (std::nothrow) BMenuItem (TempString, new BMessage (12345), 'g');
4892 
4893   if (TempMenuBarPntr != NULL && TempPopUpMenuPntr != NULL &&
4894   TempMenuItemPntr != NULL)
4895   {
4896     TempPopUpMenuPntr->AddItem (TempMenuItemPntr);
4897     TempMenuBarPntr->AddItem (TempPopUpMenuPntr);
4898 
4899     TempWindowPntr->Lock ();
4900     TempWindowPntr->AddChild (TempMenuBarPntr);
4901     TempMenuBarPntr->GetPreferredSize (&JunkFloat, &g_PopUpMenuHeight);
4902     TempWindowPntr->RemoveChild (TempMenuBarPntr);
4903     TempWindowPntr->Unlock ();
4904     delete TempMenuBarPntr; // It will delete contents too.
4905   }
4906 
4907   TempWindowPntr->Lock ();
4908   TempWindowPntr->Quit ();
4909 
4910   SetPulseRate (500000);
4911 
4912   if (g_CommandLineMode)
4913     g_QuitCountdown = 0; /* Quit as soon as queued up commands done. */
4914   else /* GUI mode, make a window. */
4915   {
4916     DatabaseWindowPntr = new (std::nothrow) DatabaseWindow ();
4917     if (DatabaseWindowPntr == NULL) {
4918       DisplayErrorMessage ("Unable to create window.");
4919       g_QuitCountdown = 0;
4920     } else {
4921       DatabaseWindowPntr->Show (); /* Starts the window's message loop. */
4922     }
4923   }
4924 
4925   g_AppReadyToRunCompleted = true;
4926 }
4927 
4928 
4929 /* Given a mail component (body text, attachment, whatever), look for words in
4930 it.  If the tokenize mode specifies that it isn't one of the ones we are
4931 looking for, just skip it.  For container type components, recursively examine
4932 their contents, up to the maximum depth specified. */
4933 
4934 status_t
4935 ABSApp::RecursivelyTokenizeMailComponent (
4936   BMailComponent *ComponentPntr,
4937   const char *OptionalFileName,
4938   set<string> &WordSet,
4939   char *ErrorMessage,
4940   int RecursionLevel,
4941   int MaxRecursionLevel)
4942 {
4943   char                        AttachmentName [B_FILE_NAME_LENGTH];
4944   BMailAttachment            *AttachmentPntr;
4945   BMimeType                   ComponentMIMEType;
4946   BMailContainer             *ContainerPntr;
4947   BMallocIO                   ContentsIO;
4948   const char                 *ContentsBufferPntr;
4949   size_t                      ContentsBufferSize;
4950   status_t                    ErrorCode;
4951   bool                        ExamineComponent;
4952   const char                 *HeaderKeyPntr;
4953   const char                 *HeaderValuePntr;
4954   int                         i;
4955   int                         j;
4956   const char                 *NameExtension;
4957   int                         NumComponents;
4958   BMimeType                   TextAnyMIMEType ("text");
4959   BMimeType                   TextPlainMIMEType ("text/plain");
4960 
4961   if (ComponentPntr == NULL)
4962     return B_OK;
4963 
4964   /* Add things in the sub-headers that might be useful.  Things like the file
4965   name of attachments, the encoding type, etc. */
4966 
4967   if (m_TokenizeMode == TM_PLAIN_TEXT_HEADER ||
4968   m_TokenizeMode == TM_ANY_TEXT_HEADER ||
4969   m_TokenizeMode == TM_ALL_PARTS_HEADER ||
4970   m_TokenizeMode == TM_JUST_HEADER)
4971   {
4972     for (i = 0; i < 1000; i++)
4973     {
4974       HeaderKeyPntr = ComponentPntr->HeaderAt (i);
4975       if (HeaderKeyPntr == NULL)
4976         break;
4977       AddWordsToSet (HeaderKeyPntr, strlen (HeaderKeyPntr),
4978         'H' /* Prefix for Headers, uppercase unlike normal words. */, WordSet);
4979       for (j = 0; j < 1000; j++)
4980       {
4981         HeaderValuePntr = ComponentPntr->HeaderField (HeaderKeyPntr, j);
4982         if (HeaderValuePntr == NULL)
4983           break;
4984         AddWordsToSet (HeaderValuePntr, strlen (HeaderValuePntr),
4985           'H', WordSet);
4986       }
4987     }
4988   }
4989 
4990   /* Check the MIME type of the thing.  It's used to decide if the contents are
4991   worth examining for words. */
4992 
4993   ErrorCode = ComponentPntr->MIMEType (&ComponentMIMEType);
4994   if (ErrorCode != B_OK)
4995   {
4996     sprintf (ErrorMessage, "ABSApp::RecursivelyTokenizeMailComponent: "
4997       "Unable to get MIME type at level %d in \"%s\"",
4998       RecursionLevel, OptionalFileName);
4999     return ErrorCode;
5000   }
5001   if (ComponentMIMEType.Type() == NULL)
5002   {
5003     /* Have to make up a MIME type for things which don't have them, such as
5004     the main body text, otherwise it would get ignored. */
5005 
5006     if (NULL != dynamic_cast<BTextMailComponent *>(ComponentPntr))
5007       ComponentMIMEType.SetType ("text/plain");
5008   }
5009   if (!TextAnyMIMEType.Contains (&ComponentMIMEType) &&
5010   NULL != (AttachmentPntr = dynamic_cast<BMailAttachment *>(ComponentPntr)))
5011   {
5012     /* Sometimes spam doesn't give a text MIME type for text when they do an
5013     attachment (which is often base64 encoded).  Use the file name extension to
5014     see if it really is text. */
5015     NameExtension = NULL;
5016     if (AttachmentPntr->FileName (AttachmentName) >= 0)
5017       NameExtension = strrchr (AttachmentName, '.');
5018     if (NameExtension != NULL)
5019     {
5020       if (strcasecmp (NameExtension, ".txt") == 0)
5021         ComponentMIMEType.SetType ("text/plain");
5022       else if (strcasecmp (NameExtension, ".htm") == 0 ||
5023       strcasecmp (NameExtension, ".html") == 0)
5024         ComponentMIMEType.SetType ("text/html");
5025     }
5026   }
5027 
5028   switch (m_TokenizeMode)
5029   {
5030     case TM_PLAIN_TEXT:
5031     case TM_PLAIN_TEXT_HEADER:
5032       ExamineComponent = TextPlainMIMEType.Contains (&ComponentMIMEType);
5033       break;
5034 
5035     case TM_ANY_TEXT:
5036     case TM_ANY_TEXT_HEADER:
5037       ExamineComponent = TextAnyMIMEType.Contains (&ComponentMIMEType);
5038       break;
5039 
5040     case TM_ALL_PARTS:
5041     case TM_ALL_PARTS_HEADER:
5042       ExamineComponent = true;
5043       break;
5044 
5045     default:
5046       ExamineComponent = false;
5047       break;
5048   }
5049 
5050   if (ExamineComponent)
5051   {
5052     /* Get the contents of the component.  This will be UTF-8 text (converted
5053     from whatever encoding was used) for text attachments.  For other ones,
5054     it's just the raw data, or perhaps decoded from base64 encoding. */
5055 
5056     ContentsIO.SetBlockSize (16 * 1024);
5057     ErrorCode = ComponentPntr->GetDecodedData (&ContentsIO);
5058     if (ErrorCode == B_OK) /* Can fail for container components: no data. */
5059     {
5060       /* Look for words in the decoded data. */
5061 
5062       ContentsBufferPntr = (const char *) ContentsIO.Buffer ();
5063       ContentsBufferSize = ContentsIO.BufferLength ();
5064       if (ContentsBufferPntr != NULL /* can be empty */)
5065         AddWordsToSet (ContentsBufferPntr, ContentsBufferSize,
5066           0 /* no prefix character, this is body text */, WordSet);
5067     }
5068   }
5069 
5070   /* Examine any sub-components in the message. */
5071 
5072   if (RecursionLevel + 1 <= MaxRecursionLevel &&
5073   NULL != (ContainerPntr = dynamic_cast<BMailContainer *>(ComponentPntr)))
5074   {
5075     NumComponents = ContainerPntr->CountComponents ();
5076 
5077     for (i = 0; i < NumComponents; i++)
5078     {
5079       ComponentPntr = ContainerPntr->GetComponent (i);
5080 
5081       ErrorCode = RecursivelyTokenizeMailComponent (ComponentPntr,
5082         OptionalFileName, WordSet, ErrorMessage, RecursionLevel + 1,
5083         MaxRecursionLevel);
5084       if (ErrorCode != B_OK)
5085         break;
5086     }
5087   }
5088 
5089   return ErrorCode;
5090 }
5091 
5092 
5093 /* The user has tried to open a file or several files with this application,
5094 via Tracker's open-with menu item.  If it is a database type file, then change
5095 the database file name to it.  Otherwise, ask the user whether they want to
5096 classify it as spam or non-spam.  There will be at most around 100 files, BeOS
5097 R5.0.3's Tracker crashes if it tries to pass on more than that many using Open
5098 With... etc.  The command is sent to an intermediary thread where it is
5099 asynchronously converted into a scripting message(s) that are sent back to this
5100 BApplication.  The intermediary is needed since we can't recursively execute
5101 scripting messages while processing a message (this RefsReceived one). */
5102 
5103 void
5104 ABSApp::RefsReceived (BMessage *MessagePntr)
5105 {
5106   if (g_CommanderLooperPntr != NULL)
5107     g_CommanderLooperPntr->CommandReferences (MessagePntr);
5108 }
5109 
5110 
5111 /* A scripting command is looking for something to execute it.  See if it is
5112 targetted at our database. */
5113 
5114 BHandler * ABSApp::ResolveSpecifier (
5115   BMessage *MessagePntr,
5116   int32 Index,
5117   BMessage *SpecifierMsgPntr,
5118   int32 SpecificationKind,
5119   const char *PropertyPntr)
5120 {
5121   int i;
5122 
5123   /* See if it is one of our commands. */
5124 
5125   if (SpecificationKind == B_DIRECT_SPECIFIER)
5126   {
5127     for (i = PN_MAX - 1; i >= 0; i--)
5128     {
5129       if (strcasecmp (PropertyPntr, g_PropertyNames [i]) == 0)
5130         return this; /* Found it!  Return the Handler (which is us). */
5131     }
5132   }
5133 
5134   /* Handle an unrecognized scripting command, let the parent figure it out. */
5135 
5136   return BApplication::ResolveSpecifier (
5137     MessagePntr, Index, SpecifierMsgPntr, SpecificationKind, PropertyPntr);
5138 }
5139 
5140 
5141 /* Save the database if it hasn't been saved yet.  Otherwise do nothing. */
5142 
5143 status_t ABSApp::SaveDatabaseIfNeeded (char *ErrorMessage)
5144 {
5145   if (m_DatabaseHasChanged)
5146     return LoadSaveDatabase (false /* DoLoad */, ErrorMessage);
5147 
5148   return B_OK;
5149 }
5150 
5151 
5152 /* Presumably the file is an e-mail message (or at least the header portion of
5153 one).  Break it into parts: header, body and MIME components.  Then add the
5154 words in the portions that match the current tokenization settings to the set
5155 of words. */
5156 
5157 status_t ABSApp::TokenizeParts (
5158   BPositionIO *PositionIOPntr,
5159   const char *OptionalFileName,
5160   set<string> &WordSet,
5161   char *ErrorMessage)
5162 {
5163   status_t        ErrorCode = B_OK;
5164   BEmailMessage   WholeEMail;
5165 
5166   sprintf (ErrorMessage, "ABSApp::TokenizeParts: While getting e-mail "
5167     "headers, had problems with \"%s\"", OptionalFileName);
5168 
5169   ErrorCode = WholeEMail.SetToRFC822 (
5170     PositionIOPntr /* it does its own seeking to the start */,
5171     -1 /* length */, true /* parse_now */);
5172   if (ErrorCode < 0) goto ErrorExit;
5173 
5174   ErrorCode = RecursivelyTokenizeMailComponent (&WholeEMail,
5175     OptionalFileName, WordSet, ErrorMessage, 0 /* Initial recursion level */,
5176     (m_TokenizeMode == TM_JUST_HEADER) ? 0 : 500 /* Max recursion level */);
5177 
5178 ErrorExit:
5179   return ErrorCode;
5180 }
5181 
5182 
5183 /* Add all the words in the whole file or memory buffer to the supplied set.
5184 The file doesn't have to be an e-mail message since it isn't parsed for e-mail
5185 headers or MIME headers or anything.  It blindly adds everything that looks
5186 like a word, though it does convert quoted printable codes to the characters
5187 they represent.  See also AddWordsToSet which does something more advanced. */
5188 
5189 status_t ABSApp::TokenizeWhole (
5190   BPositionIO *PositionIOPntr,
5191   const char *OptionalFileName,
5192   set<string> &WordSet,
5193   char *ErrorMessage)
5194 {
5195   string                AccumulatedWord;
5196   uint8                 Buffer [16 * 1024];
5197   uint8                *BufferCurrentPntr = Buffer + 0;
5198   uint8                *BufferEndPntr = Buffer + 0;
5199   const char           *IOErrorString =
5200                           "TokenizeWhole: Error %ld while reading \"%s\"";
5201   size_t                Length;
5202   int                   Letter = ' ';
5203   char                  HexString [4];
5204   int                   NextLetter = ' ';
5205   int                   NextNextLetter = ' ';
5206 
5207   /* Use a buffer since reading single characters from a BFile is so slow.
5208   BufferCurrentPntr is the position of the next character to be read.  When it
5209   reaches BufferEndPntr, it is time to fill the buffer again. */
5210 
5211 #define ReadChar(CharVar) \
5212   { \
5213     if (BufferCurrentPntr < BufferEndPntr) \
5214       CharVar = *BufferCurrentPntr++; \
5215     else /* Try to fill the buffer. */ \
5216     { \
5217       ssize_t AmountRead; \
5218       AmountRead = PositionIOPntr->Read (Buffer, sizeof (Buffer)); \
5219       if (AmountRead < 0) \
5220       { \
5221         sprintf (ErrorMessage, IOErrorString, AmountRead, OptionalFileName); \
5222         return AmountRead; \
5223       } \
5224       else if (AmountRead == 0) \
5225         CharVar = EOF; \
5226       else \
5227       { \
5228         BufferEndPntr = Buffer + AmountRead; \
5229         BufferCurrentPntr = Buffer + 0; \
5230         CharVar = *BufferCurrentPntr++; \
5231       } \
5232     } \
5233   }
5234 
5235   /* Read all the words in the file and add them to our local set of words.  A
5236   set is used since we don't care how many times a word occurs. */
5237 
5238   while (true)
5239   {
5240     /* We read two letters ahead so that we can decode quoted printable
5241     characters (an equals sign followed by two hex digits or a new line).  Note
5242     that Letter can become EOF (-1) when end of file is reached. */
5243 
5244     Letter = NextLetter;
5245     NextLetter = NextNextLetter;
5246     ReadChar (NextNextLetter);
5247 
5248     /* Decode quoted printable codes first, so that the rest of the code just
5249     sees an ordinary character.  Or even nothing, if it is the hidden line
5250     break combination.  This may falsely corrupt stuff following an equals
5251     sign, but usually won't. */
5252 
5253     if (Letter == '=')
5254     {
5255       if ((NextLetter == '\r' && NextNextLetter == '\n') ||
5256       (NextLetter == '\n' && NextNextLetter == '\r'))
5257       {
5258         /* Make the "=\r\n" pair disappear.  It's not even white space. */
5259         ReadChar (NextLetter);
5260         ReadChar (NextNextLetter);
5261         continue;
5262       }
5263       if (NextLetter == '\n' || NextLetter == '\r')
5264       {
5265         /* Make the "=\n" pair disappear.  It's not even white space. */
5266         NextLetter = NextNextLetter;
5267         ReadChar (NextNextLetter);
5268         continue;
5269       }
5270       if (NextNextLetter != EOF &&
5271       isxdigit (NextLetter) && isxdigit (NextNextLetter))
5272       {
5273         /* Convert the hex code to a letter. */
5274         HexString[0] = NextLetter;
5275         HexString[1] = NextNextLetter;
5276         HexString[2] = 0;
5277         Letter = strtoul (HexString, NULL, 16 /* number system base */);
5278         ReadChar (NextLetter);
5279         ReadChar (NextNextLetter);
5280       }
5281     }
5282 
5283     /* Convert to lower case to improve word matches.  Of course this loses a
5284     bit of information, such as MONEY vs Money, an indicator of spam.  Well,
5285     apparently that isn't all that useful a distinction, so do it. */
5286 
5287     if (Letter >= 'A' && Letter < 'Z')
5288       Letter = Letter + ('a' - 'A');
5289 
5290     /* See if it is a letter we treat as white space - all control characters
5291     and all punctuation except for: apostrophe (so "it's" and possessive
5292     versions of words get stored), dash (for hyphenated words), dollar sign
5293     (for cash amounts), period (for IP addresses, we later remove trailing
5294     (periods).  Note that codes above 127 are UTF-8 characters, which we
5295     consider non-space. */
5296 
5297     if (Letter < 0 /* EOF */ || (Letter < 128 && g_SpaceCharacters[Letter]))
5298     {
5299       /* That space finished off a word.  Remove trailing periods... */
5300 
5301       while ((Length = AccumulatedWord.size()) > 0 &&
5302       AccumulatedWord [Length-1] == '.')
5303         AccumulatedWord.resize (Length - 1);
5304 
5305       /* If there's anything left in the word, add it to the set.  Also ignore
5306       words which are too big (it's probably some binary encoded data).  But
5307       leave room for supercalifragilisticexpialidoceous.  According to one web
5308       site, pneumonoultramicroscopicsilicovolcanoconiosis is the longest word
5309       currently in English.  Note that some uuencoded data was seen with a 60
5310       character line length. */
5311 
5312       if (Length > 0 && Length <= g_MaxWordLength)
5313         WordSet.insert (AccumulatedWord);
5314 
5315       /* Empty out the string to get ready for the next word. */
5316 
5317       AccumulatedWord.resize (0);
5318     }
5319     else /* Not a space-like character, add it to the word. */
5320       AccumulatedWord.append (1 /* one copy of the char */, (char) Letter);
5321 
5322     /* Stop at end of file or error.  Don't care which.  Exit here so that last
5323     word got processed. */
5324 
5325     if (Letter == EOF)
5326       break;
5327   }
5328 
5329   return B_OK;
5330 }
5331 
5332 
5333 
/******************************************************************************
 * Implementation of the ClassificationChoicesWindow class, constructor,
 * destructor and the rest of the member functions in mostly alphabetical
 * order.
 */
5339 
5340 ClassificationChoicesWindow::ClassificationChoicesWindow (
5341   BRect FrameRect,
5342   const char *FileName,
5343   int NumberOfFiles)
5344 : BWindow (FrameRect, "Classification Choices", B_TITLED_WINDOW,
5345     B_NOT_ZOOMABLE | B_NOT_RESIZABLE | B_ASYNCHRONOUS_CONTROLS),
5346   m_BulkModeSelectedPntr (NULL),
5347   m_ChoosenClassificationPntr (NULL)
5348 {
5349   ClassificationChoicesView *SubViewPntr;
5350 
5351   SubViewPntr = new ClassificationChoicesView (Bounds(),
5352     FileName, NumberOfFiles);
5353   AddChild (SubViewPntr);
5354   SubViewPntr->ResizeToPreferred ();
5355   ResizeTo (SubViewPntr->Frame().Width(), SubViewPntr->Frame().Height());
5356 }
5357 
5358 
5359 void
5360 ClassificationChoicesWindow::MessageReceived (BMessage *MessagePntr)
5361 {
5362   BControl *ControlPntr;
5363 
5364   if (MessagePntr->what >= MSG_CLASS_BUTTONS &&
5365   MessagePntr->what < MSG_CLASS_BUTTONS + CL_MAX)
5366   {
5367     if (m_ChoosenClassificationPntr != NULL)
5368       *m_ChoosenClassificationPntr =
5369         (ClassificationTypes) (MessagePntr->what - MSG_CLASS_BUTTONS);
5370     PostMessage (B_QUIT_REQUESTED); // Close and destroy the window.
5371     return;
5372   }
5373 
5374   if (MessagePntr->what == MSG_BULK_CHECKBOX)
5375   {
5376     if (m_BulkModeSelectedPntr != NULL &&
5377     MessagePntr->FindPointer ("source", (void **) &ControlPntr) == B_OK)
5378       *m_BulkModeSelectedPntr = (ControlPntr->Value() == B_CONTROL_ON);
5379     return;
5380   }
5381 
5382   if (MessagePntr->what == MSG_CANCEL_BUTTON)
5383   {
5384     PostMessage (B_QUIT_REQUESTED); // Close and destroy the window.
5385     return;
5386   }
5387 
5388   BWindow::MessageReceived (MessagePntr);
5389 }
5390 
5391 
5392 void
5393 ClassificationChoicesWindow::Go (
5394   bool *BulkModeSelectedPntr,
5395   ClassificationTypes *ChoosenClassificationPntr)
5396 {
5397   status_t  ErrorCode = 0;
5398   BView    *MainViewPntr;
5399   thread_id WindowThreadID;
5400 
5401   m_BulkModeSelectedPntr = BulkModeSelectedPntr;
5402   m_ChoosenClassificationPntr = ChoosenClassificationPntr;
5403   if (m_ChoosenClassificationPntr != NULL)
5404     *m_ChoosenClassificationPntr = CL_MAX;
5405 
5406   Show (); // Starts the window thread running.
5407 
5408   /* Move the window to the center of the screen it is now being displayed on
5409   (have to wait for it to be showing). */
5410 
5411   Lock ();
5412   MainViewPntr = FindView ("ClassificationChoicesView");
5413   if (MainViewPntr != NULL)
5414   {
5415     BRect   TempRect;
5416     BScreen TempScreen (this);
5417     float   X;
5418     float   Y;
5419 
5420     TempRect = TempScreen.Frame ();
5421     X = TempRect.Width() / 2;
5422     Y = TempRect.Height() / 2;
5423     TempRect = MainViewPntr->Frame();
5424     X -= TempRect.Width() / 2;
5425     Y -= TempRect.Height() / 2;
5426     MoveTo (ceilf (X), ceilf (Y));
5427   }
5428   Unlock ();
5429 
5430   /* Wait for the window to go away. */
5431 
5432   WindowThreadID = Thread ();
5433   if (WindowThreadID >= 0)
5434     // Delay until the window thread has died, presumably window deleted now.
5435     wait_for_thread (WindowThreadID, &ErrorCode);
5436 }
5437 
5438 
5439 
5440 /******************************************************************************
5441  * Implementation of the ClassificationChoicesView class, constructor,
5442  * destructor and the rest of the member functions in mostly alphabetical
5443  * order.
5444  */
5445 
5446 ClassificationChoicesView::ClassificationChoicesView (
5447   BRect FrameRect,
5448   const char *FileName,
5449   int NumberOfFiles)
5450 : BView (FrameRect, "ClassificationChoicesView",
5451     B_FOLLOW_TOP | B_FOLLOW_LEFT, B_WILL_DRAW | B_NAVIGABLE_JUMP),
5452   m_FileName (FileName),
5453   m_NumberOfFiles (NumberOfFiles),
5454   m_PreferredBottomY (ceilf (g_ButtonHeight * 10))
5455 {
5456 }
5457 
5458 
5459 void
5460 ClassificationChoicesView::AttachedToWindow ()
5461 {
5462   BButton            *ButtonPntr;
5463   BCheckBox          *CheckBoxPntr;
5464   ClassificationTypes Classification;
5465   float               Margin;
5466   float               RowHeight;
5467   float               RowTop;
5468   BTextView          *TextViewPntr;
5469   BRect               TempRect;
5470   char                TempString [2048];
5471   BRect               TextRect;
5472   float               X;
5473 
5474   SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
5475 
5476   RowHeight = g_ButtonHeight;
5477   if (g_CheckBoxHeight > RowHeight)
5478     RowHeight = g_CheckBoxHeight;
5479   RowHeight = ceilf (RowHeight * 1.1);
5480 
5481   TempRect = Bounds ();
5482   RowTop = TempRect.top;
5483 
5484   /* Show the file name text. */
5485 
5486   Margin = ceilf ((RowHeight - g_StringViewHeight) / 2);
5487   TempRect = Bounds ();
5488   TempRect.top = RowTop + Margin;
5489   TextRect = TempRect;
5490   TextRect.OffsetTo (0, 0);
5491   TextRect.InsetBy (g_MarginBetweenControls, 2);
5492   sprintf (TempString, "How do you want to classify the file named \"%s\"?",
5493     m_FileName);
5494   TextViewPntr = new BTextView (TempRect, "FileText", TextRect,
5495     B_FOLLOW_TOP | B_FOLLOW_LEFT, B_WILL_DRAW | B_FULL_UPDATE_ON_RESIZE);
5496   AddChild (TextViewPntr);
5497   TextViewPntr->SetText (TempString);
5498   TextViewPntr->MakeEditable (false);
5499   TextViewPntr->SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
5500   TextViewPntr->ResizeTo (TempRect.Width (),
5501     3 + TextViewPntr->TextHeight (0, sizeof (TempString)));
5502   RowTop = TextViewPntr->Frame().bottom + Margin;
5503 
5504   /* Make the classification buttons. */
5505 
5506   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
5507   TempRect = Bounds ();
5508   TempRect.top = RowTop + Margin;
5509   X = Bounds().left + g_MarginBetweenControls;
5510   for (Classification = (ClassificationTypes) 0; Classification < CL_MAX;
5511   Classification = (ClassificationTypes) ((int) Classification + 1))
5512   {
5513     TempRect = Bounds ();
5514     TempRect.top = RowTop + Margin;
5515     TempRect.left = X;
5516     sprintf (TempString, "%s Button",
5517       g_ClassificationTypeNames [Classification]);
5518     ButtonPntr = new BButton (TempRect, TempString,
5519       g_ClassificationTypeNames [Classification], new BMessage (
5520       ClassificationChoicesWindow::MSG_CLASS_BUTTONS + Classification));
5521     AddChild (ButtonPntr);
5522     ButtonPntr->ResizeToPreferred ();
5523     X = ButtonPntr->Frame().right + 3 * g_MarginBetweenControls;
5524   }
5525   RowTop += ceilf (RowHeight * 1.2);
5526 
5527   /* Make the Cancel button. */
5528 
5529   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
5530   TempRect = Bounds ();
5531   TempRect.top = RowTop + Margin;
5532   TempRect.left += g_MarginBetweenControls;
5533   ButtonPntr = new BButton (TempRect, "Cancel Button",
5534     "Cancel", new BMessage (ClassificationChoicesWindow::MSG_CANCEL_BUTTON));
5535   AddChild (ButtonPntr);
5536   ButtonPntr->ResizeToPreferred ();
5537   X = ButtonPntr->Frame().right + g_MarginBetweenControls;
5538 
5539   /* Make the checkbox for bulk operations. */
5540 
5541   if (m_NumberOfFiles > 1)
5542   {
5543     Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
5544     TempRect = Bounds ();
5545     TempRect.top = RowTop + Margin;
5546     TempRect.left = X;
5547     sprintf (TempString, "Mark all %d remaining messages the same way.",
5548       m_NumberOfFiles - 1);
5549     CheckBoxPntr = new BCheckBox (TempRect, "BulkBox", TempString,
5550       new BMessage (ClassificationChoicesWindow::MSG_BULK_CHECKBOX));
5551     AddChild (CheckBoxPntr);
5552     CheckBoxPntr->ResizeToPreferred ();
5553   }
5554   RowTop += RowHeight;
5555 
5556   m_PreferredBottomY = RowTop;
5557 }
5558 
5559 
5560 void
5561 ClassificationChoicesView::GetPreferredSize (float *width, float *height)
5562 {
5563   if (width != NULL)
5564     *width = Bounds().Width();
5565   if (height != NULL)
5566     *height = m_PreferredBottomY;
5567 }
5568 
5569 
5570 
5571 /******************************************************************************
5572  * Implementation of the CommanderLooper class, constructor, destructor and the
5573  * rest of the member functions in mostly alphabetical order.
5574  */
5575 
5576 CommanderLooper::CommanderLooper ()
5577 : BLooper ("CommanderLooper", B_NORMAL_PRIORITY),
5578   m_IsBusy (false)
5579 {
5580 }
5581 
5582 
5583 CommanderLooper::~CommanderLooper ()
5584 {
5585   g_CommanderLooperPntr = NULL;
5586   delete g_CommanderMessenger;
5587   g_CommanderMessenger = NULL;
5588 }
5589 
5590 
5591 /* Process some command line arguments.  Basically just send a message to this
5592 looper itself to do the work later.  That way the caller can continue doing
5593 whatever they're doing, particularly if it's the BApplication. */
5594 
5595 void
5596 CommanderLooper::CommandArguments (int argc, char **argv)
5597 {
5598   int      i;
5599   BMessage InternalMessage;
5600 
5601   InternalMessage.what = MSG_COMMAND_ARGUMENTS;
5602   for (i = 0; i < argc; i++)
5603     InternalMessage.AddString ("arg", argv[i]);
5604 
5605   PostMessage (&InternalMessage);
5606 }
5607 
5608 
5609 /* Copy the refs out of the given message and stuff them into an internal
5610 message to ourself (so that the original message can be returned to the caller,
5611 and if it is Tracker, it can close the file handles it has open).  Optionally
5612 allow preset classification rather than asking the user (set BulkMode to TRUE
5613 and specify the class with BulkClassification). */
5614 
5615 void
5616 CommanderLooper::CommandReferences (
5617   BMessage *MessagePntr,
5618   bool BulkMode,
5619   ClassificationTypes BulkClassification)
5620 {
5621   entry_ref EntryRef;
5622   int       i;
5623   BMessage  InternalMessage;
5624 
5625   InternalMessage.what = MSG_COMMAND_FILE_REFS;
5626   for (i = 0; MessagePntr->FindRef ("refs", i, &EntryRef) == B_OK; i++)
5627     InternalMessage.AddRef ("refs", &EntryRef);
5628   InternalMessage.AddBool ("BulkMode", BulkMode);
5629   InternalMessage.AddInt32 ("BulkClassification", BulkClassification);
5630 
5631   PostMessage (&InternalMessage);
5632 }
5633 
5634 
5635 /* This function is called by other threads to see if the CommanderLooper is
5636 busy working on something. */
5637 
5638 bool
5639 CommanderLooper::IsBusy ()
5640 {
5641   if (m_IsBusy)
5642     return true;
5643 
5644   if (IsLocked () || !MessageQueue()->IsEmpty ())
5645     return true;
5646 
5647   return false;
5648 }
5649 
5650 
5651 void
5652 
5653 CommanderLooper::MessageReceived (BMessage *MessagePntr)
5654 {
5655   m_IsBusy = true;
5656 
5657   if (MessagePntr->what == MSG_COMMAND_ARGUMENTS)
5658     ProcessArgs (MessagePntr);
5659   else if (MessagePntr->what == MSG_COMMAND_FILE_REFS)
5660     ProcessRefs (MessagePntr);
5661   else
5662     BLooper::MessageReceived (MessagePntr);
5663 
5664   m_IsBusy = false;
5665 }
5666 
5667 
/* Process the command line by converting it into a series of scripting
messages (possibly thousands) and send them to the BApplication synchronously
(so we can print the result). */
5671 
5672 void
5673 CommanderLooper::ProcessArgs (BMessage *MessagePntr)
5674 {
5675   int32                 argc = 0;
5676   const char          **argv = NULL;
5677   int                   ArgumentIndex;
5678   uint32                CommandCode;
5679   const char           *CommandWord;
5680   status_t              ErrorCode;
5681   const char           *ErrorTitle = "ProcessArgs";
5682   char                 *EndPntr;
5683   int32                 i;
5684   BMessage              ReplyMessage;
5685   BMessage              ScriptMessage;
5686   struct property_info *PropInfoPntr;
5687   const char           *PropertyName;
5688   bool                  TempBool;
5689   float                 TempFloat;
5690   int32                 TempInt32;
5691   const char           *TempStringPntr;
5692   type_code             TypeCode;
5693   const char           *ValuePntr;
5694 
5695   /* Get the argument count and pointers to arguments out of the message and
5696   into our argc and argv. */
5697 
5698   ErrorCode = MessagePntr->GetInfo ("arg", &TypeCode, &argc);
5699   if (ErrorCode != B_OK || TypeCode != B_STRING_TYPE)
5700   {
5701     DisplayErrorMessage ("Unable to find argument strings in message",
5702       ErrorCode, ErrorTitle);
5703     goto ErrorExit;
5704   }
5705 
5706   if (argc < 2)
5707   {
5708     cerr << PrintUsage;
5709     DisplayErrorMessage ("You need to specify a command word, like GET, SET "
5710       "and so on followed by a property, like DatabaseFile, and maybe "
5711       "followed by a value of some sort", -1, ErrorTitle);
5712     goto ErrorExit;
5713   }
5714 
5715   argv = (const char **) malloc (sizeof (char *) * argc);
5716   if (argv == NULL)
5717   {
5718     DisplayErrorMessage ("Out of memory when allocating argv array",
5719       ENOMEM, ErrorTitle);
5720     goto ErrorExit;
5721   }
5722 
5723   for (i = 0; i < argc; i++)
5724   {
5725     if ((ErrorCode = MessagePntr->FindString ("arg", i, &argv[i])) != B_OK)
5726     {
5727       DisplayErrorMessage ("Unable to find argument in the BMessage",
5728         ErrorCode, ErrorTitle);
5729       goto ErrorExit;
5730     }
5731   }
5732 
5733   CommandWord = argv[1];
5734 
5735   /* Special case for the Quit command since it isn't a scripting command. */
5736 
5737   if (strcasecmp (CommandWord, "quit") == 0)
5738   {
5739     g_QuitCountdown = 10;
5740     goto ErrorExit;
5741   }
5742 
5743   /* Find the corresponding scripting command. */
5744 
5745   if (strcasecmp (CommandWord, "set") == 0)
5746     CommandCode = B_SET_PROPERTY;
5747   else if (strcasecmp (CommandWord, "get") == 0)
5748     CommandCode = B_GET_PROPERTY;
5749   else if (strcasecmp (CommandWord, "count") == 0)
5750     CommandCode = B_COUNT_PROPERTIES;
5751   else if (strcasecmp (CommandWord, "create") == 0)
5752     CommandCode = B_CREATE_PROPERTY;
5753   else if (strcasecmp (CommandWord, "delete") == 0)
5754     CommandCode = B_DELETE_PROPERTY;
5755   else
5756     CommandCode = B_EXECUTE_PROPERTY;
5757 
5758   if (CommandCode == B_EXECUTE_PROPERTY)
5759   {
5760     PropertyName = CommandWord;
5761     ArgumentIndex = 2; /* Arguments to the command start at this index. */
5762   }
5763   else
5764   {
5765     if (CommandCode == B_SET_PROPERTY)
5766     {
5767       /* SET commands require at least one argument value. */
5768       if (argc < 4)
5769       {
5770         cerr << PrintUsage;
5771         DisplayErrorMessage ("SET commands require at least one "
5772           "argument value after the property name", -1, ErrorTitle);
5773         goto ErrorExit;
5774       }
5775     }
5776     else
5777       if (argc < 3)
5778       {
5779         cerr << PrintUsage;
5780         DisplayErrorMessage ("You need to specify a property to act on",
5781           -1, ErrorTitle);
5782         goto ErrorExit;
5783       }
5784     PropertyName = argv[2];
5785     ArgumentIndex = 3;
5786   }
5787 
5788   /* See if it is one of our commands. */
5789 
5790   for (PropInfoPntr = g_ScriptingPropertyList + 0; true; PropInfoPntr++)
5791   {
5792     if (PropInfoPntr->name == 0)
5793     {
5794       cerr << PrintUsage;
5795       DisplayErrorMessage ("The property specified isn't known or "
5796         "doesn't support the requested action (usually means it is an "
5797         "unknown command)", -1, ErrorTitle);
5798       goto ErrorExit; /* Unrecognized command. */
5799     }
5800 
5801     if (PropInfoPntr->commands[0] == CommandCode &&
5802     strcasecmp (PropertyName, PropInfoPntr->name) == 0)
5803       break;
5804   }
5805 
5806   /* Make the equivalent command message.  For commands with multiple
5807   arguments, repeat the message for each single argument and just change the
5808   data portion for each extra argument.  Send the command and wait for a reply,
5809   which we'll print out. */
5810 
5811   ScriptMessage.MakeEmpty ();
5812   ScriptMessage.what = CommandCode;
5813   ScriptMessage.AddSpecifier (PropertyName);
5814   while (true)
5815   {
5816     if (ArgumentIndex < argc) /* If there are arguments to be added. */
5817     {
5818       ValuePntr = argv[ArgumentIndex];
5819 
5820       /* Convert the value into the likely kind of data. */
5821 
5822       if (strcasecmp (ValuePntr, "yes") == 0 ||
5823       strcasecmp (ValuePntr, "true") == 0)
5824         ScriptMessage.AddBool (g_DataName, true);
5825       else if (strcasecmp (ValuePntr, "no") == 0 ||
5826       strcasecmp (ValuePntr, "false") == 0)
5827         ScriptMessage.AddBool (g_DataName, false);
5828       else
5829       {
5830         /* See if it is a number. */
5831         i = strtol (ValuePntr, &EndPntr, 0);
5832         if (*EndPntr == 0)
5833           ScriptMessage.AddInt32 (g_DataName, i);
5834         else /* Nope, it's just a string. */
5835           ScriptMessage.AddString (g_DataName, ValuePntr);
5836       }
5837     }
5838 
5839     ErrorCode = be_app_messenger.SendMessage (&ScriptMessage, &ReplyMessage);
5840     if (ErrorCode != B_OK)
5841     {
5842       DisplayErrorMessage ("Unable to send scripting command",
5843         ErrorCode, ErrorTitle);
5844       goto ErrorExit;
5845     }
5846 
5847     /* Print the reply to the scripting command.  Even in server mode.  To
5848     standard output. */
5849 
5850     if (ReplyMessage.FindString ("CommandText", &TempStringPntr) == B_OK)
5851     {
5852       TempInt32 = -1;
5853       if (ReplyMessage.FindInt32 ("error", &TempInt32) == B_OK &&
5854       TempInt32 == B_OK)
5855       {
5856         /* It's a successful reply to one of our scripting messages.  Print out
5857         the returned values code for command line users to see. */
5858 
5859         cout << "Result of command to " << TempStringPntr << " is:\t";
5860         if (ReplyMessage.FindString (g_ResultName, &TempStringPntr) == B_OK)
5861           cout << "\"" << TempStringPntr << "\"";
5862         else if (ReplyMessage.FindInt32 (g_ResultName, &TempInt32) == B_OK)
5863           cout << TempInt32;
5864         else if (ReplyMessage.FindFloat (g_ResultName, &TempFloat) == B_OK)
5865           cout << TempFloat;
5866         else if (ReplyMessage.FindBool (g_ResultName, &TempBool) == B_OK)
5867           cout << (TempBool ? "true" : "false");
5868         else
5869           cout << "just plain success";
5870         if (ReplyMessage.FindInt32 ("count", &TempInt32) == B_OK)
5871           cout << "\t(count " << TempInt32 << ")";
5872         for (i = 0; (i < 50) &&
5873         ReplyMessage.FindString ("words", i, &TempStringPntr) == B_OK &&
5874         ReplyMessage.FindFloat ("ratios", i, &TempFloat) == B_OK;
5875         i++)
5876         {
5877           if (i == 0)
5878             cout << "\twith top words:\t";
5879           else
5880             cout << "\t";
5881           cout << TempStringPntr << "/" << TempFloat;
5882         }
5883         cout << endl;
5884       }
5885       else /* An error reply, print out the error, even in server mode. */
5886       {
5887         cout << "Failure of command " << TempStringPntr << ", error ";
5888         cout << TempInt32 << " (" << strerror (TempInt32) << ")";
5889         if (ReplyMessage.FindString ("message", &TempStringPntr) == B_OK)
5890           cout << ", message: " << TempStringPntr;
5891         cout << "." << endl;
5892       }
5893     }
5894 
5895     /* Advance to the next argument and its scripting message. */
5896 
5897     ScriptMessage.RemoveName (g_DataName);
5898     if (++ArgumentIndex >= argc)
5899       break;
5900   }
5901 
5902 ErrorExit:
5903   free (argv);
5904 }
5905 
5906 
5907 /* Given a bunch of references to files, open the files.  If it's a database
5908 file, switch to using it as a database.  Otherwise, treat them as text files
5909 and add them to the database.  Prompt the user for the spam or genuine or
5910 uncertain (declassification) choice, with the option to bulk mark many files at
5911 once. */
5912 
5913 void
5914 CommanderLooper::ProcessRefs (BMessage *MessagePntr)
5915 {
5916   bool                         BulkMode = false;
5917   ClassificationTypes          BulkClassification = CL_GENUINE;
5918   ClassificationChoicesWindow *ChoiceWindowPntr;
5919   BEntry                       Entry;
5920   entry_ref                    EntryRef;
5921   status_t                     ErrorCode;
5922   const char                  *ErrorTitle = "CommanderLooper::ProcessRefs";
5923   int32                        NumberOfRefs = 0;
5924   BPath                        Path;
5925   int                          RefIndex;
5926   BMessage                     ReplyMessage;
5927   BMessage                     ScriptingMessage;
5928   bool                         TempBool;
5929   BFile                        TempFile;
5930   int32                        TempInt32;
5931   char                         TempString [PATH_MAX + 1024];
5932   type_code                    TypeCode;
5933 
5934   // Wait for ReadyToRun to finish initializing the globals with the sizes of
5935   // the controls, since they are needed when we show the custom alert box for
5936   // choosing the message type.
5937 
5938   TempInt32 = 0;
5939   while (!g_AppReadyToRunCompleted && TempInt32++ < 10)
5940     snooze (200000);
5941 
5942   ErrorCode = MessagePntr->GetInfo ("refs", &TypeCode, &NumberOfRefs);
5943   if (ErrorCode != B_OK || TypeCode != B_REF_TYPE || NumberOfRefs <= 0)
5944   {
5945     DisplayErrorMessage ("Unable to get refs from the message",
5946       ErrorCode, ErrorTitle);
5947     return;
5948   }
5949 
5950   if (MessagePntr->FindBool ("BulkMode", &TempBool) == B_OK)
5951     BulkMode = TempBool;
5952   if (MessagePntr->FindInt32 ("BulkClassification", &TempInt32) == B_OK &&
5953   TempInt32 >= 0 && TempInt32 < CL_MAX)
5954     BulkClassification = (ClassificationTypes) TempInt32;
5955 
5956   for (RefIndex = 0;
5957   MessagePntr->FindRef ("refs", RefIndex, &EntryRef) == B_OK;
5958   RefIndex++)
5959   {
5960     ScriptingMessage.MakeEmpty ();
5961     ScriptingMessage.what = 0; /* Haven't figured out what to do yet. */
5962 
5963     /* See if the entry is a valid file or directory or other thing. */
5964 
5965     ErrorCode = Entry.SetTo (&EntryRef, true /* traverse symbolic links */);
5966     if (ErrorCode != B_OK ||
5967     ((ErrorCode = /* assignment */ B_ENTRY_NOT_FOUND) != 0 /* this pacifies
5968     mwcc -nwhitehorn */ && !Entry.Exists ()) ||
5969     ((ErrorCode = Entry.GetPath (&Path)) != B_OK))
5970     {
5971       DisplayErrorMessage ("Bad entry reference encountered, will skip it",
5972         ErrorCode, ErrorTitle);
5973       BulkMode = false;
5974       continue; /* Bad file reference, try the next one. */
5975     }
5976 
5977     /* If it's a file, check if it is a spam database file.  Go by the magic
5978     text at the start of the file, in case someone has edited the file with a
5979     spreadsheet or other tool and lost the MIME type. */
5980 
5981     if (Entry.IsFile ())
5982     {
5983       ErrorCode = TempFile.SetTo (&Entry, B_READ_ONLY);
5984       if (ErrorCode != B_OK)
5985       {
5986         sprintf (TempString, "Unable to open file \"%s\" for reading, will "
5987           "skip it", Path.Path ());
5988         DisplayErrorMessage (TempString, ErrorCode, ErrorTitle);
5989         BulkMode = false;
5990         continue;
5991       }
5992       if (TempFile.Read (TempString, strlen (g_DatabaseRecognitionString)) ==
5993       (int) strlen (g_DatabaseRecognitionString) && strncmp (TempString,
5994       g_DatabaseRecognitionString, strlen (g_DatabaseRecognitionString)) == 0)
5995       {
5996         ScriptingMessage.what = B_SET_PROPERTY;
5997         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
5998         ScriptingMessage.AddString (g_DataName, Path.Path ());
5999       }
6000       TempFile.Unset ();
6001     }
6002 
6003     /* Not a database file.  Could be a directory or a file.  Submit it as
6004     something to be marked spam or genuine. */
6005 
6006     if (ScriptingMessage.what == 0)
6007     {
6008       if (!Entry.IsFile ())
6009       {
6010         sprintf (TempString, "\"%s\" is not a file, can't do anything with it",
6011           Path.Path ());
6012         DisplayErrorMessage (TempString, -1, ErrorTitle);
6013         BulkMode = false;
6014         continue;
6015       }
6016 
6017       if (!BulkMode) /* Have to ask the user. */
6018       {
6019         ChoiceWindowPntr = new ClassificationChoicesWindow (
6020           BRect (40, 40, 40 + 50 * g_MarginBetweenControls,
6021           40 + g_ButtonHeight * 5), Path.Path (), NumberOfRefs - RefIndex);
6022         ChoiceWindowPntr->Go (&BulkMode, &BulkClassification);
6023         if (BulkClassification == CL_MAX)
6024           break; /* Cancel was picked. */
6025       }
6026 
6027       /* Format the command for classifying the file. */
6028 
6029       ScriptingMessage.what = B_SET_PROPERTY;
6030 
6031       if (BulkClassification == CL_GENUINE)
6032         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_GENUINE]);
6033       else if (BulkClassification == CL_SPAM)
6034         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_SPAM]);
6035       else if (BulkClassification == CL_UNCERTAIN)
6036         ScriptingMessage.AddSpecifier (g_PropertyNames[PN_UNCERTAIN]);
6037       else /* Broken code */
6038         break;
6039       ScriptingMessage.AddString (g_DataName, Path.Path ());
6040     }
6041 
6042     /* Tell the BApplication to do the work, and wait for it to finish.  The
6043     BApplication will display any error messages for us. */
6044 
6045     ErrorCode =
6046       be_app_messenger.SendMessage (&ScriptingMessage, &ReplyMessage);
6047     if (ErrorCode != B_OK)
6048     {
6049       DisplayErrorMessage ("Unable to send scripting command",
6050         ErrorCode, ErrorTitle);
6051       return;
6052     }
6053 
6054     /* If there was an error, allow the user to stop by switching off bulk
6055     mode.  The message will already have been displayed in an alert box, if
6056     server mode is off. */
6057 
6058     if (ReplyMessage.FindInt32 ("error", &TempInt32) != B_OK ||
6059     TempInt32 != B_OK)
6060       BulkMode = false;
6061   }
6062 }
6063 
6064 
6065 
6066 /******************************************************************************
6067  * Implementation of the ControlsView class, constructor, destructor and the
6068  * rest of the member functions in mostly alphabetical order.
6069  */
6070 
6071 ControlsView::ControlsView (BRect NewBounds)
6072 : BView (NewBounds, "ControlsView", B_FOLLOW_TOP | B_FOLLOW_LEFT_RIGHT,
6073     B_WILL_DRAW | B_PULSE_NEEDED | B_NAVIGABLE_JUMP | B_FRAME_EVENTS),
6074   m_AboutButtonPntr (NULL),
6075   m_AddExampleButtonPntr (NULL),
6076   m_BrowseButtonPntr (NULL),
6077   m_BrowseFilePanelPntr (NULL),
6078   m_CreateDatabaseButtonPntr (NULL),
6079   m_DatabaseFileNameTextboxPntr (NULL),
6080   m_DatabaseLoadDone (false),
6081   m_EstimateSpamButtonPntr (NULL),
6082   m_EstimateSpamFilePanelPntr (NULL),
6083   m_GenuineCountTextboxPntr (NULL),
6084   m_IgnorePreviousClassCheckboxPntr (NULL),
6085   m_InstallThingsButtonPntr (NULL),
6086   m_PurgeAgeTextboxPntr (NULL),
6087   m_PurgeButtonPntr (NULL),
6088   m_PurgePopularityTextboxPntr (NULL),
6089   m_ResetToDefaultsButtonPntr (NULL),
6090   m_ScoringModeMenuBarPntr (NULL),
6091   m_ScoringModePopUpMenuPntr (NULL),
6092   m_ServerModeCheckboxPntr (NULL),
6093   m_SpamCountTextboxPntr (NULL),
6094   m_TimeOfLastPoll (0),
6095   m_TokenizeModeMenuBarPntr (NULL),
6096   m_TokenizeModePopUpMenuPntr (NULL),
6097   m_WordCountTextboxPntr (NULL)
6098 {
6099 }
6100 
6101 
6102 ControlsView::~ControlsView ()
6103 {
6104   if (m_BrowseFilePanelPntr != NULL)
6105   {
6106     delete m_BrowseFilePanelPntr;
6107     m_BrowseFilePanelPntr = NULL;
6108   }
6109 
6110   if (m_EstimateSpamFilePanelPntr != NULL)
6111   {
6112     delete m_EstimateSpamFilePanelPntr;
6113     m_EstimateSpamFilePanelPntr = NULL;
6114   }
6115 }
6116 
6117 
6118 void
6119 ControlsView::AttachedToWindow ()
6120 {
6121   float         BigPurgeButtonTop;
6122   BMessage      CommandMessage;
6123   const char   *EightDigitsString = " 12345678 ";
6124   float         Height;
6125   float         Margin;
6126   float         RowHeight;
6127   float         RowTop;
6128   ScoringModes  ScoringMode;
6129   const char   *StringPntr;
6130   BMenuItem    *TempMenuItemPntr;
6131   BRect         TempRect;
6132   char          TempString [PATH_MAX];
6133   TokenizeModes TokenizeMode;
6134   float         Width;
6135   float         X;
6136 
6137   SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
6138 
6139   TempRect = Bounds ();
6140   X = TempRect.right;
6141   RowTop = TempRect.top;
6142   RowHeight = g_ButtonHeight;
6143   if (g_TextBoxHeight > RowHeight)
6144     RowHeight = g_TextBoxHeight;
6145   RowHeight = ceilf (RowHeight * 1.1);
6146 
6147   /* Make the Create button at the far right of the first row of controls,
6148   which are all database file related. */
6149 
6150   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6151   TempRect = Bounds ();
6152   TempRect.top = RowTop + Margin;
6153   TempRect.bottom = TempRect.top + g_ButtonHeight;
6154 
6155   CommandMessage.MakeEmpty ();
6156   CommandMessage.what = B_CREATE_PROPERTY;
6157   CommandMessage.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
6158   m_CreateDatabaseButtonPntr = new BButton (TempRect, "Create Button",
6159     "Create", new BMessage (CommandMessage), B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6160   if (m_CreateDatabaseButtonPntr == NULL) goto ErrorExit;
6161   AddChild (m_CreateDatabaseButtonPntr);
6162   m_CreateDatabaseButtonPntr->SetTarget (be_app);
6163   m_CreateDatabaseButtonPntr->ResizeToPreferred ();
6164   m_CreateDatabaseButtonPntr->GetPreferredSize (&Width, &Height);
6165   m_CreateDatabaseButtonPntr->MoveTo (X - Width, TempRect.top);
6166   X -= Width + g_MarginBetweenControls;
6167 
6168   /* Make the Browse button, middle of the first row. */
6169 
6170   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6171   TempRect = Bounds ();
6172   TempRect.top = RowTop + Margin;
6173   TempRect.bottom = TempRect.top + g_ButtonHeight;
6174 
6175   m_BrowseButtonPntr = new BButton (TempRect, "Browse Button",
6176     "Browse…", new BMessage (MSG_BROWSE_BUTTON), B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6177   if (m_BrowseButtonPntr == NULL) goto ErrorExit;
6178   AddChild (m_BrowseButtonPntr);
6179   m_BrowseButtonPntr->SetTarget (this);
6180   m_BrowseButtonPntr->ResizeToPreferred ();
6181   m_BrowseButtonPntr->GetPreferredSize (&Width, &Height);
6182   m_BrowseButtonPntr->MoveTo (X - Width, TempRect.top);
6183   X -= Width + g_MarginBetweenControls;
6184 
6185   /* Fill the rest of the space on the first row with the file name box. */
6186 
6187   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6188   TempRect = Bounds ();
6189   TempRect.top = RowTop + Margin;
6190   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6191   TempRect.right = X;
6192 
6193   StringPntr = "Word Database:";
6194   strcpy (m_DatabaseFileNameCachedValue, "Unknown...");
6195   m_DatabaseFileNameTextboxPntr = new BTextControl (TempRect,
6196     "File Name",
6197     StringPntr /* label */,
6198     m_DatabaseFileNameCachedValue /* text */,
6199     new BMessage (MSG_DATABASE_NAME),
6200     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP,
6201     B_WILL_DRAW | B_NAVIGABLE | B_NAVIGABLE_JUMP);
6202   AddChild (m_DatabaseFileNameTextboxPntr);
6203   m_DatabaseFileNameTextboxPntr->SetTarget (this);
6204   m_DatabaseFileNameTextboxPntr->SetDivider (
6205     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6206 
6207   /* Second row contains the purge age, and a long line explaining it.  There
6208   is space to the right where the top half of the big purge button will go. */
6209 
6210   RowTop += RowHeight /* previous row's RowHeight */;
6211   BigPurgeButtonTop = RowTop;
6212   TempRect = Bounds ();
6213   X = TempRect.left;
6214   RowHeight = g_TextBoxHeight;
6215   RowHeight = ceilf (RowHeight * 1.1);
6216 
6217   StringPntr = "Number of occurrences needed to store a word:";
6218   m_PurgeAgeCachedValue = 12345678;
6219 
6220   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6221   TempRect.top = RowTop + Margin;
6222   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6223   TempRect.left = X;
6224   TempRect.right = TempRect.left +
6225     be_plain_font->StringWidth (StringPntr) +
6226     be_plain_font->StringWidth (EightDigitsString) +
6227     3 * g_MarginBetweenControls;
6228 
6229   sprintf (TempString, "%d", (int) m_PurgeAgeCachedValue);
6230   m_PurgeAgeTextboxPntr = new BTextControl (TempRect,
6231     "Purge Age",
6232     StringPntr /* label */,
6233     TempString /* text */,
6234     new BMessage (MSG_PURGE_AGE),
6235     B_FOLLOW_LEFT | B_FOLLOW_TOP,
6236     B_WILL_DRAW | B_NAVIGABLE);
6237   AddChild (m_PurgeAgeTextboxPntr);
6238   m_PurgeAgeTextboxPntr->SetTarget (this);
6239   m_PurgeAgeTextboxPntr->SetDivider (
6240     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6241 
6242   /* Third row contains the purge popularity and bottom half of the purge
6243   button. */
6244 
6245   RowTop += RowHeight /* previous row's RowHeight */;
6246   TempRect = Bounds ();
6247   X = TempRect.left;
6248   RowHeight = g_TextBoxHeight;
6249   RowHeight = ceilf (RowHeight * 1.1);
6250 
6251   StringPntr = "Number of messages to store words from:";
6252   m_PurgePopularityCachedValue = 87654321;
6253   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6254   TempRect.top = RowTop + Margin;
6255   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6256   TempRect.left = X;
6257   TempRect.right = TempRect.left +
6258     be_plain_font->StringWidth (StringPntr) +
6259     be_plain_font->StringWidth (EightDigitsString) +
6260     3 * g_MarginBetweenControls;
6261   X = TempRect.right + g_MarginBetweenControls;
6262 
6263   sprintf (TempString, "%d", (int) m_PurgePopularityCachedValue);
6264   m_PurgePopularityTextboxPntr = new BTextControl (TempRect,
6265     "Purge Popularity",
6266     StringPntr /* label */,
6267     TempString /* text */,
6268     new BMessage (MSG_PURGE_POPULARITY),
6269     B_FOLLOW_LEFT | B_FOLLOW_TOP,
6270     B_WILL_DRAW | B_NAVIGABLE);
6271   AddChild (m_PurgePopularityTextboxPntr);
6272   m_PurgePopularityTextboxPntr->SetTarget (this);
6273   m_PurgePopularityTextboxPntr->SetDivider (
6274     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6275 
6276   /* Make the purge button, which will take up space in the 2nd and 3rd rows,
6277   on the right side.  Twice as tall as a regular button too. */
6278 
6279   StringPntr = "Remove Old Words";
6280   Margin = ceilf ((((RowTop + RowHeight) - BigPurgeButtonTop) -
6281     2 * g_TextBoxHeight) / 2);
6282   TempRect.top = BigPurgeButtonTop + Margin;
6283   TempRect.bottom = TempRect.top + 2 * g_TextBoxHeight;
6284   TempRect.left = X;
6285   TempRect.right = X + ceilf (2 * be_plain_font->StringWidth (StringPntr));
6286 
6287   CommandMessage.MakeEmpty ();
6288   CommandMessage.what = B_EXECUTE_PROPERTY;
6289   CommandMessage.AddSpecifier (g_PropertyNames[PN_PURGE]);
6290   m_PurgeButtonPntr = new BButton (TempRect, "Purge Button",
6291     StringPntr, new BMessage (CommandMessage), B_FOLLOW_LEFT | B_FOLLOW_TOP);
6292   if (m_PurgeButtonPntr == NULL) goto ErrorExit;
6293   m_PurgeButtonPntr->ResizeToPreferred();
6294   AddChild (m_PurgeButtonPntr);
6295   m_PurgeButtonPntr->SetTarget (be_app);
6296 
6297   /* The fourth row contains the ignore previous classification checkbox. */
6298 
6299   RowTop += RowHeight /* previous row's RowHeight */;
6300   TempRect = Bounds ();
6301   X = TempRect.left;
6302   RowHeight = g_CheckBoxHeight;
6303   RowHeight = ceilf (RowHeight * 1.1);
6304 
6305   StringPntr = "Allow Retraining on a Message";
6306   m_IgnorePreviousClassCachedValue = false;
6307 
6308   Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
6309   TempRect.top = RowTop + Margin;
6310   TempRect.bottom = TempRect.top + g_CheckBoxHeight;
6311   TempRect.left = X;
6312   m_IgnorePreviousClassCheckboxPntr = new BCheckBox (TempRect,
6313     "Ignore Check",
6314     StringPntr,
6315     new BMessage (MSG_IGNORE_CLASSIFICATION),
6316     B_FOLLOW_TOP | B_FOLLOW_LEFT);
6317   if (m_IgnorePreviousClassCheckboxPntr == NULL) goto ErrorExit;
6318   AddChild (m_IgnorePreviousClassCheckboxPntr);
6319   m_IgnorePreviousClassCheckboxPntr->SetTarget (this);
6320   m_IgnorePreviousClassCheckboxPntr->ResizeToPreferred ();
6321   m_IgnorePreviousClassCheckboxPntr->GetPreferredSize (&Width, &Height);
6322   X += Width + g_MarginBetweenControls;
6323 
6324   /* The fifth row contains the server mode checkbox. */
6325 
6326   RowTop += RowHeight /* previous row's RowHeight */;
6327   TempRect = Bounds ();
6328   RowHeight = g_CheckBoxHeight;
6329   RowHeight = ceilf (RowHeight * 1.1);
6330 
6331   StringPntr = "Print errors to Terminal";
6332   m_ServerModeCachedValue = false;
6333 
6334   Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
6335   TempRect.top = RowTop + Margin;
6336   TempRect.bottom = TempRect.top + g_CheckBoxHeight;
6337   m_ServerModeCheckboxPntr = new BCheckBox (TempRect,
6338     "ServerMode Check",
6339     StringPntr,
6340     new BMessage (MSG_SERVER_MODE),
6341     B_FOLLOW_TOP | B_FOLLOW_LEFT);
6342   if (m_ServerModeCheckboxPntr == NULL) goto ErrorExit;
6343   AddChild (m_ServerModeCheckboxPntr);
6344   m_ServerModeCheckboxPntr->SetTarget (this);
6345   m_ServerModeCheckboxPntr->ResizeToPreferred ();
6346   m_ServerModeCheckboxPntr->GetPreferredSize (&Width, &Height);
6347 
6348   /* This row just contains a huge pop-up menu which shows the tokenize mode
6349   and an explanation of what each mode does. */
6350 
6351   RowTop += RowHeight /* previous row's RowHeight */;
6352   TempRect = Bounds ();
6353   RowHeight = g_PopUpMenuHeight;
6354   RowHeight = ceilf (RowHeight * 1.1);
6355 
6356   Margin = ceilf ((RowHeight - g_PopUpMenuHeight) / 2);
6357   TempRect.top = RowTop + Margin;
6358   TempRect.bottom = TempRect.top + g_PopUpMenuHeight;
6359 
6360   m_TokenizeModeCachedValue = TM_MAX; /* Illegal value will force redraw. */
6361   m_TokenizeModeMenuBarPntr = new BMenuBar (TempRect, "TokenizeModeMenuBar",
6362     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
6363     false /* resize to fit items */);
6364   if (m_TokenizeModeMenuBarPntr == NULL) goto ErrorExit;
6365   m_TokenizeModePopUpMenuPntr = new BPopUpMenu ("TokenizeModePopUpMenu");
6366   if (m_TokenizeModePopUpMenuPntr == NULL) goto ErrorExit;
6367 
6368   for (TokenizeMode = (TokenizeModes) 0;
6369   TokenizeMode < TM_MAX;
6370   TokenizeMode = (TokenizeModes) ((int) TokenizeMode + 1))
6371   {
6372     /* Each different tokenize mode gets its own menu item.  Selecting the item
6373     will send a canned command to the application to switch to the appropriate
6374     tokenize mode.  An optional explanation of each mode is added to the mode
6375     name string. */
6376 
6377     CommandMessage.MakeEmpty ();
6378     CommandMessage.what = B_SET_PROPERTY;
6379     CommandMessage.AddSpecifier (g_PropertyNames[PN_TOKENIZE_MODE]);
6380     CommandMessage.AddString (g_DataName, g_TokenizeModeNames[TokenizeMode]);
6381     strcpy (TempString, g_TokenizeModeNames[TokenizeMode]);
6382     switch (TokenizeMode)
6383     {
6384       case TM_WHOLE:
6385         strcat (TempString, " - Scan everything");
6386         break;
6387 
6388       case TM_PLAIN_TEXT:
6389         strcat (TempString, " - Scan e-mail body text except rich text");
6390         break;
6391 
6392       case TM_PLAIN_TEXT_HEADER:
6393         strcat (TempString, " - Scan entire e-mail text except rich text");
6394         break;
6395 
6396       case TM_ANY_TEXT:
6397         strcat (TempString, " - Scan e-mail body text and text attachments");
6398         break;
6399 
6400       case TM_ANY_TEXT_HEADER:
6401        strcat (TempString, " - Scan entire e-mail text and text attachments (recommended)");
6402         break;
6403 
6404       case TM_ALL_PARTS:
6405         strcat (TempString, " - Scan e-mail body and all attachments");
6406         break;
6407 
6408       case TM_ALL_PARTS_HEADER:
6409         strcat (TempString, " - Scan all parts of the e-mail");
6410         break;
6411 
6412       case TM_JUST_HEADER:
6413         strcat (TempString, " - Scan just the header (mail routing information)");
6414         break;
6415 
6416       default:
6417         break;
6418     }
6419     TempMenuItemPntr =
6420       new BMenuItem (TempString, new BMessage (CommandMessage));
6421     if (TempMenuItemPntr == NULL) goto ErrorExit;
6422     TempMenuItemPntr->SetTarget (be_app);
6423     m_TokenizeModePopUpMenuPntr->AddItem (TempMenuItemPntr);
6424   }
6425   m_TokenizeModeMenuBarPntr->AddItem (m_TokenizeModePopUpMenuPntr);
6426   AddChild (m_TokenizeModeMenuBarPntr);
6427 
6428   /* This row just contains a huge pop-up menu which shows the scoring mode
6429   and an explanation of what each mode does. */
6430 
6431   RowTop += RowHeight /* previous row's RowHeight */;
6432   TempRect = Bounds ();
6433   RowHeight = g_PopUpMenuHeight;
6434   RowHeight = ceilf (RowHeight * 1.1);
6435 
6436   Margin = ceilf ((RowHeight - g_PopUpMenuHeight) / 2);
6437   TempRect.top = RowTop + Margin;
6438   TempRect.bottom = TempRect.top + g_PopUpMenuHeight;
6439 
6440   m_ScoringModeCachedValue = SM_MAX; /* Illegal value will force redraw. */
6441   m_ScoringModeMenuBarPntr = new BMenuBar (TempRect, "ScoringModeMenuBar",
6442     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
6443     false /* resize to fit items */);
6444   if (m_ScoringModeMenuBarPntr == NULL) goto ErrorExit;
6445   m_ScoringModePopUpMenuPntr = new BPopUpMenu ("ScoringModePopUpMenu");
6446   if (m_ScoringModePopUpMenuPntr == NULL) goto ErrorExit;
6447 
6448   for (ScoringMode = (ScoringModes) 0;
6449   ScoringMode < SM_MAX;
6450   ScoringMode = (ScoringModes) ((int) ScoringMode + 1))
6451   {
6452     /* Each different scoring mode gets its own menu item.  Selecting the item
6453     will send a canned command to the application to switch to the appropriate
6454     scoring mode.  An optional explanation of each mode is added to the mode
6455     name string. */
6456 
6457     CommandMessage.MakeEmpty ();
6458     CommandMessage.what = B_SET_PROPERTY;
6459     CommandMessage.AddSpecifier (g_PropertyNames[PN_SCORING_MODE]);
6460     CommandMessage.AddString (g_DataName, g_ScoringModeNames[ScoringMode]);
6461 /*
6462     strcpy (TempString, g_ScoringModeNames[ScoringMode]);
6463     switch (ScoringMode)
6464     {
6465       case SM_ROBINSON:
6466         strcat (TempString, " - Learning Method 1: Naive Bayesian");
6467         break;
6468 
6469       case SM_CHISQUARED:
6470         strcat (TempString, " - Learning Method 2: Chi-Squared");
6471         break;
6472 
6473       default:
6474         break;
6475     }
6476 */
6477     switch (ScoringMode)
6478     {
6479       case SM_ROBINSON:
6480         strcpy (TempString, "Learning method 1: Naive Bayesian");
6481         break;
6482 
6483       case SM_CHISQUARED:
6484         strcpy (TempString, "Learning method 2: Chi-Squared");
6485         break;
6486 
6487       default:
6488         break;
6489     }
6490     TempMenuItemPntr =
6491       new BMenuItem (TempString, new BMessage (CommandMessage));
6492     if (TempMenuItemPntr == NULL) goto ErrorExit;
6493     TempMenuItemPntr->SetTarget (be_app);
6494     m_ScoringModePopUpMenuPntr->AddItem (TempMenuItemPntr);
6495   }
6496   m_ScoringModeMenuBarPntr->AddItem (m_ScoringModePopUpMenuPntr);
6497   AddChild (m_ScoringModeMenuBarPntr);
6498 
6499   /* The next row has the install MIME types button and the reset to defaults
6500   button, one on the left and the other on the right. */
6501 
6502   RowTop += RowHeight /* previous row's RowHeight */;
6503   TempRect = Bounds ();
6504   RowHeight = g_ButtonHeight;
6505   RowHeight = ceilf (RowHeight * 1.1);
6506 
6507   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6508   TempRect.top = RowTop + Margin;
6509   TempRect.bottom = TempRect.top + g_ButtonHeight;
6510 
6511   CommandMessage.MakeEmpty ();
6512   CommandMessage.what = B_EXECUTE_PROPERTY;
6513   CommandMessage.AddSpecifier (g_PropertyNames[PN_INSTALL_THINGS]);
6514   m_InstallThingsButtonPntr = new BButton (TempRect, "Install Button",
6515     "Install spam types",
6516     new BMessage (CommandMessage),
6517     B_FOLLOW_LEFT | B_FOLLOW_TOP);
6518   if (m_InstallThingsButtonPntr == NULL) goto ErrorExit;
6519   AddChild (m_InstallThingsButtonPntr);
6520   m_InstallThingsButtonPntr->SetTarget (be_app);
6521   m_InstallThingsButtonPntr->ResizeToPreferred ();
6522 
6523   /* The Reset to Defaults button.  On the right side of the row. */
6524 
6525   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6526   TempRect = Bounds ();
6527   TempRect.top = RowTop + Margin;
6528   TempRect.bottom = TempRect.top + g_ButtonHeight;
6529 
6530   CommandMessage.MakeEmpty ();
6531   CommandMessage.what = B_EXECUTE_PROPERTY;
6532   CommandMessage.AddSpecifier (g_PropertyNames[PN_RESET_TO_DEFAULTS]);
6533   m_ResetToDefaultsButtonPntr = new BButton (TempRect, "Reset Button",
6534     "Default settings", new BMessage (CommandMessage),
6535     B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6536   if (m_ResetToDefaultsButtonPntr == NULL) goto ErrorExit;
6537   AddChild (m_ResetToDefaultsButtonPntr);
6538   m_ResetToDefaultsButtonPntr->SetTarget (be_app);
6539   m_ResetToDefaultsButtonPntr->ResizeToPreferred ();
6540   m_ResetToDefaultsButtonPntr->GetPreferredSize (&Width, &Height);
6541   m_ResetToDefaultsButtonPntr->MoveTo (TempRect.right - Width, TempRect.top);
6542 
6543   /* The next row contains the Estimate, Add Examples and About buttons. */
6544 
6545   RowTop += RowHeight /* previous row's RowHeight */;
6546   TempRect = Bounds ();
6547   X = TempRect.left;
6548   RowHeight = g_ButtonHeight;
6549   RowHeight = ceilf (RowHeight * 1.1);
6550 
6551   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6552   TempRect.top = RowTop + Margin;
6553   TempRect.bottom = TempRect.top + g_ButtonHeight;
6554   TempRect.left = X;
6555 
6556   m_EstimateSpamButtonPntr = new BButton (TempRect, "Estimate Button",
6557     "Scan a message",
6558     new BMessage (MSG_ESTIMATE_BUTTON),
6559     B_FOLLOW_LEFT | B_FOLLOW_TOP);
6560   if (m_EstimateSpamButtonPntr == NULL) goto ErrorExit;
6561   AddChild (m_EstimateSpamButtonPntr);
6562   m_EstimateSpamButtonPntr->SetTarget (this);
6563   m_EstimateSpamButtonPntr->ResizeToPreferred ();
6564   X = m_EstimateSpamButtonPntr->Frame().right + g_MarginBetweenControls;
6565 
6566   /* The Add Example button in the middle.  Does the same as the browse button,
6567   but don't tell anyone that! */
6568 
6569   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6570   TempRect.top = RowTop + Margin;
6571   TempRect.bottom = TempRect.top + g_ButtonHeight;
6572   TempRect.left = X;
6573 
6574   m_AddExampleButtonPntr = new BButton (TempRect, "Example Button",
6575     "Train spam filter on a message",
6576     new BMessage (MSG_BROWSE_BUTTON),
6577     B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP,
6578     B_WILL_DRAW | B_NAVIGABLE | B_FULL_UPDATE_ON_RESIZE);
6579   if (m_AddExampleButtonPntr == NULL) goto ErrorExit;
6580   AddChild (m_AddExampleButtonPntr);
6581   m_AddExampleButtonPntr->SetTarget (this);
6582   m_AddExampleButtonPntr->ResizeToPreferred ();
6583   X = m_AddExampleButtonPntr->Frame().right + g_MarginBetweenControls;
6584 
6585   /* Add the About button on the right. */
6586 
6587   Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6588   TempRect = Bounds ();
6589   TempRect.top = RowTop + Margin;
6590   TempRect.bottom = TempRect.top + g_ButtonHeight;
6591   TempRect.left = X;
6592 
6593   m_AboutButtonPntr = new BButton (TempRect, "About Button",
6594     "About…",
6595     new BMessage (B_ABOUT_REQUESTED),
6596     B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6597   if (m_AboutButtonPntr == NULL) goto ErrorExit;
6598   AddChild (m_AboutButtonPntr);
6599   m_AboutButtonPntr->SetTarget (be_app);
6600 
6601   /* This row displays various counters.  Starting with the genuine messages
6602   count on the left. */
6603 
6604   RowTop += RowHeight /* previous row's RowHeight */;
6605   TempRect = Bounds ();
6606   RowHeight = g_TextBoxHeight;
6607   RowHeight = ceilf (RowHeight * 1.1);
6608 
6609   StringPntr = "Genuine messages:";
6610   m_GenuineCountCachedValue = 87654321;
6611   sprintf (TempString, "%d", (int) m_GenuineCountCachedValue);
6612 
6613   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6614   TempRect = Bounds ();
6615   TempRect.top = RowTop + Margin;
6616   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6617   TempRect.right = TempRect.left +
6618     be_plain_font->StringWidth (StringPntr) +
6619     be_plain_font->StringWidth (TempString) +
6620     3 * g_MarginBetweenControls;
6621 
6622   m_GenuineCountTextboxPntr = new BTextControl (TempRect,
6623     "Genuine count",
6624     StringPntr /* label */,
6625     TempString /* text */,
6626     NULL /* no message */,
6627     B_FOLLOW_LEFT | B_FOLLOW_TOP,
6628     B_WILL_DRAW /* not B_NAVIGABLE */);
6629   AddChild (m_GenuineCountTextboxPntr);
6630   m_GenuineCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6631   m_GenuineCountTextboxPntr->SetDivider (
6632     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6633   m_GenuineCountTextboxPntr->SetEnabled (false); /* For display only. */
6634 
6635   /* The word count in the center. */
6636 
6637   StringPntr = "Word count:";
6638   m_WordCountCachedValue = 87654321;
6639   sprintf (TempString, "%d", (int) m_WordCountCachedValue);
6640 
6641   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6642   TempRect = Bounds ();
6643   TempRect.top = RowTop + Margin;
6644   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6645   Width = be_plain_font->StringWidth (StringPntr) +
6646     be_plain_font->StringWidth (TempString) +
6647     3 * g_MarginBetweenControls;
6648   TempRect.left = ceilf ((TempRect.right - TempRect.left) / 2 - Width / 2);
6649   TempRect.right = TempRect.left + Width;
6650 
6651   m_WordCountTextboxPntr = new BTextControl (TempRect,
6652     "Word count",
6653     StringPntr /* label */,
6654     TempString /* text */,
6655     NULL /* no message */,
6656     B_FOLLOW_H_CENTER | B_FOLLOW_TOP,
6657     B_WILL_DRAW /* not B_NAVIGABLE */);
6658   AddChild (m_WordCountTextboxPntr);
6659   m_WordCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6660   m_WordCountTextboxPntr->SetDivider (
6661     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6662   m_WordCountTextboxPntr->SetEnabled (false); /* For display only. */
6663 
6664   /* The spam count on the far right. */
6665 
6666   StringPntr = "Spam messages:";
6667   m_SpamCountCachedValue = 87654321;
6668   sprintf (TempString, "%d", (int) m_SpamCountCachedValue);
6669 
6670   Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6671   TempRect = Bounds ();
6672   TempRect.top = RowTop + Margin;
6673   TempRect.bottom = TempRect.top + g_TextBoxHeight;
6674   TempRect.left = TempRect.right -
6675     be_plain_font->StringWidth (StringPntr) -
6676     be_plain_font->StringWidth (TempString) -
6677     3 * g_MarginBetweenControls;
6678 
6679   m_SpamCountTextboxPntr = new BTextControl (TempRect,
6680     "Spam count",
6681     StringPntr /* label */,
6682     TempString /* text */,
6683     NULL /* no message */,
6684     B_FOLLOW_RIGHT | B_FOLLOW_TOP,
6685     B_WILL_DRAW /* not B_NAVIGABLE */);
6686   AddChild (m_SpamCountTextboxPntr);
6687   m_SpamCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6688   m_SpamCountTextboxPntr->SetDivider (
6689     be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6690   m_SpamCountTextboxPntr->SetEnabled (false); /* For display only. */
6691 
6692   /* Change the size of our view so it only takes up the space needed by the
6693   buttons. */
6694 
6695   RowTop += RowHeight /* previous row's RowHeight */;
6696   ResizeTo (Bounds().Width(), RowTop - Bounds().top + 1);
6697 
6698   return; /* Successful. */
6699 
6700 ErrorExit:
6701   DisplayErrorMessage ("Unable to initialise the controls view.");
6702 }
6703 
6704 
/* Show the file panel whose selections are delivered to the application: its
target is be_app_messenger, so the chosen refs go to the BApplication rather
than to this view.  The panel is created the first time this is called and is
then kept in m_BrowseFilePanelPntr for later calls.  It opens in the directory
containing the current database file, or in "." if that can't be worked out. */

6705 void
6706 ControlsView::BrowseForDatabaseFile ()
6707 {
6708   if (m_BrowseFilePanelPntr == NULL)
6709   {
6710     BEntry      DirectoryEntry;
6711     entry_ref   DirectoryEntryRef;
6712     BMessage    GetDatabasePathCommand;
6713     BMessage    GetDatabasePathResult;
6714     const char *StringPntr = NULL;
6715 
6716     /* Create a new file panel.  First set up the entry ref stuff so that the
6717     file panel can open to show the initial directory (the one where the
6718     database file currently is).  Note that we have to create it after the
6719     window and view are up and running, otherwise the BMessenger won't point to
6720     a valid looper/handler.  First find out the current database file name to
6721     use as a starting point. */
6722 
6723     GetDatabasePathCommand.what = B_GET_PROPERTY;
6724     GetDatabasePathCommand.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
6725     be_app_messenger.SendMessage (&GetDatabasePathCommand,
6726       &GetDatabasePathResult, 5000000 /* delivery timeout */,
6727       5000000 /* reply timeout */);

    /* The status returned by SendMessage isn't examined.  Any failure along
    the way (no path string in the reply, a path that can't be turned into an
    entry, no parent directory) ends up using the current directory instead.
    Note that GetParent overwrites DirectoryEntry with its own parent. */

6728     if (GetDatabasePathResult.FindString (g_ResultName, &StringPntr) != B_OK ||
6729     DirectoryEntry.SetTo (StringPntr) != B_OK ||
6730     DirectoryEntry.GetParent (&DirectoryEntry) != B_OK)
6731       DirectoryEntry.SetTo ("."); /* Default directory if we can't find it. */
6732     if (DirectoryEntry.GetRef (&DirectoryEntryRef) != B_OK)
6733     {
6734       DisplayErrorMessage (
6735         "Unable to set up the file requestor starting directory.  Sorry.");
6736       return;
6737     }
6738 
    /* No canned message is supplied, so the panel reports the selection in
    whatever form it uses by default (handled in RefsReceived according to the
    note at the bottom of this function). */

6739     m_BrowseFilePanelPntr = new BFilePanel (
6740       B_OPEN_PANEL /* mode */,
6741       &be_app_messenger /* target for event messages */,
6742       &DirectoryEntryRef /* starting directory */,
6743       B_FILE_NODE,
6744       true /* true for multiple selections */,
6745       NULL /* canned message */,
6746       NULL /* ref filter */,
6747       false /* true for modal */,
6748       true /* true to hide when done */);
6749   }
6750 
6751   if (m_BrowseFilePanelPntr != NULL)
6752     m_BrowseFilePanelPntr->Show (); /* Answer returned later in RefsReceived. */
6753 }
6754 
6755 
/* Show the file panel used for choosing messages to be scanned.  The panel is
created on the first call and kept in m_EstimateSpamFilePanelPntr; it is
targeted at this view and delivers its selection in a MSG_ESTIMATE_FILE_REFS
message.  It opens in the "mail" directory under the user's directory if that
exists, otherwise in the user's directory, otherwise in ".". */

6756 void
6757 ControlsView::BrowseForFileToEstimate ()
6758 {
6759   if (m_EstimateSpamFilePanelPntr == NULL)
6760   {
6761     BEntry      DirectoryEntry;
6762     entry_ref   DirectoryEntryRef;
6763     status_t    ErrorCode;
    /* The panel makes its own copy of the canned message, so a local object
    is enough; allocating it with new would leave nobody to delete it. */
    BMessage    EstimateRefsMessage (MSG_ESTIMATE_FILE_REFS);
6764     BMessenger  MessengerToSelf (this);
6765     BPath       PathToMailDirectory;
6766 
6767     /* Create a new file panel.  First set up the entry ref stuff so that the
6768     file panel can open to show the initial directory (the user's mail
6769     directory).  Note that we have to create the panel after the window and
6770     view are up and running, otherwise the BMessenger won't point to a valid
6771     looper/handler. */
6772 
6773     ErrorCode = find_directory (B_USER_DIRECTORY, &PathToMailDirectory);
6774     if (ErrorCode == B_OK)
6775     {
6776       PathToMailDirectory.Append ("mail");
6777       ErrorCode = DirectoryEntry.SetTo (PathToMailDirectory.Path(),
6778         true /* traverse symbolic links*/);
6779       if (ErrorCode != B_OK || !DirectoryEntry.Exists ())
6780       {
6781         /* If no mail directory, try home directory. */
6782         find_directory (B_USER_DIRECTORY, &PathToMailDirectory);
6783         ErrorCode = DirectoryEntry.SetTo (PathToMailDirectory.Path(), true);
6784       }
6785     }
6786     if (ErrorCode != B_OK)
6787       PathToMailDirectory.SetTo (".");
6788 
    /* Whichever path was settled on above, (re)build the entry from it and
    turn it into the entry_ref the panel wants. */

6789     DirectoryEntry.SetTo (PathToMailDirectory.Path(), true);
6790     if (DirectoryEntry.GetRef (&DirectoryEntryRef) != B_OK)
6791     {
6792       DisplayErrorMessage (
6793         "Unable to set up the file requestor starting directory.  Sorry.");
6794       return;
6795     }
6796 
6797     m_EstimateSpamFilePanelPntr = new BFilePanel (
6798       B_OPEN_PANEL /* mode */,
6799       &MessengerToSelf /* target for event messages */,
6800       &DirectoryEntryRef /* starting directory */,
6801       B_FILE_NODE,
6802       true /* true for multiple selections */,
6803       &EstimateRefsMessage /* canned message, copied by the panel */,
6804       NULL /* ref filter */,
6805       false /* true for modal */,
6806       true /* true to hide when done */);
6807   }
6808 
6809   if (m_EstimateSpamFilePanelPntr != NULL)
6810     m_EstimateSpamFilePanelPntr->Show (); /* Answer sent via a message. */
6811 }
6812 
6813 
6814 /* The display has been resized.  Have to manually adjust the popup menu bar to
6815 show the new size (the sub-items need to be resized too).  Then make it redraw.
6816 Well, actually just resetting the mark on the current item will resize it
6817 properly. */
6818 
/* Nothing is resized directly here.  Both cached modes are set to their MAX
values, which no real mode uses, so that PollServerForChanges sees a mismatch
with the application's modes the next time it runs and calls SetMarked on the
current menu items again.  The new width and height arguments are ignored. */

6819 void
6820 ControlsView::FrameResized (float, float)
6821 {
6822   m_ScoringModeCachedValue = SM_MAX; /* Force it to reset the mark. */
6823   m_TokenizeModeCachedValue = TM_MAX; /* Force it to reset the mark. */
6824 }
6825 
6826 
/* Handle the messages sent by this view's own controls and by the estimate
file panel.  For the text boxes and check boxes, the value currently in the
control is compared with the cached copy of the value last seen in the
application, and a set-property command is submitted only if they differ.
Anything not recognised is passed on to BView. */

6827 void
6828 ControlsView::MessageReceived (BMessage *MessagePntr)
6829 {
6831   bool     TempBool;
6832   uint32   TempUint32;
6833 
6834   switch (MessagePntr->what)
6835   {
6836     case MSG_BROWSE_BUTTON:
6837       BrowseForDatabaseFile ();
6838       break;
6839 
6840     case MSG_DATABASE_NAME:
6841       if (strcmp (m_DatabaseFileNameCachedValue,
6842       m_DatabaseFileNameTextboxPntr->Text ()) != 0)
6843         SubmitCommandString (PN_DATABASE_FILE, B_SET_PROPERTY,
6844         m_DatabaseFileNameTextboxPntr->Text ());
6845       break;
6846 
6847     case MSG_ESTIMATE_BUTTON:
6848       BrowseForFileToEstimate ();
6849       break;
6850 
6851     case MSG_ESTIMATE_FILE_REFS:
      /* Sent by the estimate file panel once the user has picked files. */
6852       EstimateRefFilesAndDisplay (MessagePntr);
6853       break;
6854 
6855     case MSG_IGNORE_CLASSIFICATION:
6856       TempBool = (m_IgnorePreviousClassCheckboxPntr->Value() == B_CONTROL_ON);
6857       if (m_IgnorePreviousClassCachedValue != TempBool)
6858         SubmitCommandBool (PN_IGNORE_PREVIOUS_CLASSIFICATION,
6859         B_SET_PROPERTY, TempBool);
6860       break;
6861 
6862     case MSG_PURGE_AGE:
      /* The text is parsed as a base 10 number; strtoul gives zero for text
      that doesn't start with a number. */
6863       TempUint32 = strtoul (m_PurgeAgeTextboxPntr->Text (), NULL, 10);
6864       if (m_PurgeAgeCachedValue != TempUint32)
6865         SubmitCommandInt32 (PN_PURGE_AGE, B_SET_PROPERTY, TempUint32);
6866       break;
6867 
6868     case MSG_PURGE_POPULARITY:
6869       TempUint32 = strtoul (m_PurgePopularityTextboxPntr->Text (), NULL, 10);
6870       if (m_PurgePopularityCachedValue != TempUint32)
6871         SubmitCommandInt32 (PN_PURGE_POPULARITY, B_SET_PROPERTY, TempUint32);
6872       break;
6873 
6874     case MSG_SERVER_MODE:
6875       TempBool = (m_ServerModeCheckboxPntr->Value() == B_CONTROL_ON);
6876       if (m_ServerModeCachedValue != TempBool)
6877         SubmitCommandBool (PN_SERVER_MODE, B_SET_PROPERTY, TempBool);
6878       break;
6879 
6880     default:
6881       BView::MessageReceived (MessagePntr);
6882   }
6883 }
6884 
6885 
6886 /* Check the server for changes in the state of the database, and if there are
6887 any changes, update the displayed values.  Since this is a read only
6888 examination of the server, we go directly to the application rather than
6889 sending it messages.  Also, when sending messages, we can't find out what it is
6890 doing while it is busy with a batch of spam additions (all the spam add
6891 commands will be in the queue ahead of our requests for info).  Instead, we
6892 lock the BApplication (so it isn't changing things while we're looking) and
6893 retrieve our values. */
6894 
6895 void
6896 ControlsView::PollServerForChanges ()
6897 {
6898   ABSApp     *MyAppPntr;
6899   BMenuItem  *TempMenuItemPntr;
6900   char        TempString [PATH_MAX];
6901   BWindow    *WindowPntr;
6902 
6903   /* We need a pointer to our window, for changing the title etc. */
6904 
6905   WindowPntr = Window ();
6906   if (WindowPntr == NULL)
6907     return; /* No window, no point in updating the display! */
6908 
6909   /* Check the server mode flag.  If the mode is off, then the window has to be
6910   minimized.  Similarly, if it gets turned on, maximize the window.  Note that
6911   the user can maximize the window manually, even while still in server mode.
6912   */
6913 
6914   if (g_ServerMode != m_ServerModeCachedValue &&
6915   m_ServerModeCheckboxPntr != NULL)
6916   {
6917     m_ServerModeCachedValue = g_ServerMode;
6918     m_ServerModeCheckboxPntr->SetValue (
6919       m_ServerModeCachedValue ? B_CONTROL_ON : B_CONTROL_OFF);
6920     WindowPntr->Minimize (m_ServerModeCachedValue);
6921   }
6922 
6923   if (WindowPntr->IsMinimized ())
6924     return; /* Window isn't visible, don't waste time updating it. */
6925 
6926   /* So that people don't stare at a blank screen, request a database load if
6927   nothing is there.  But only do it once, so the user doesn't get a lot of
6928   invalid database messages if one doesn't exist yet.  In server mode, we never
6929   get this far so it is only loaded when the user wants to see something. */
6930 
6931   if (!m_DatabaseLoadDone)
6932   {
6933     m_DatabaseLoadDone = true;
6934     /* Counting the number of words will load the database. */
6935     SubmitCommandString (PN_DATABASE_FILE, B_COUNT_PROPERTIES, "");
6936   }
6937 
6938   /* Check various read only values, which can be read from the BApplication
6939   without having to lock it.  This is useful for displaying the number of words
6940   as it is changing.  First up is the purge age setting. */
6941 
6942   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
6943   if (MyAppPntr == NULL)
6944     return; /* Doesn't exist or is the wrong class.  Not likely! */
6945 
6946   if (MyAppPntr->m_PurgeAge != m_PurgeAgeCachedValue &&
6947   m_PurgeAgeTextboxPntr != NULL)
6948   {
6949     m_PurgeAgeCachedValue = MyAppPntr->m_PurgeAge;
6950     sprintf (TempString, "%lu", m_PurgeAgeCachedValue);
6951     m_PurgeAgeTextboxPntr->SetText (TempString);
6952   }
6953 
6954   /* Check the purge popularity. */
6955 
6956   if (MyAppPntr->m_PurgePopularity != m_PurgePopularityCachedValue &&
6957   m_PurgePopularityTextboxPntr != NULL)
6958   {
6959     m_PurgePopularityCachedValue = MyAppPntr->m_PurgePopularity;
6960     sprintf (TempString, "%lu", m_PurgePopularityCachedValue);
6961     m_PurgePopularityTextboxPntr->SetText (TempString);
6962   }
6963 
6964   /* Check the Ignore Previous Classification flag. */
6965 
6966   if (MyAppPntr->m_IgnorePreviousClassification !=
6967   m_IgnorePreviousClassCachedValue &&
6968   m_IgnorePreviousClassCheckboxPntr != NULL)
6969   {
6970     m_IgnorePreviousClassCachedValue =
6971       MyAppPntr->m_IgnorePreviousClassification;
6972     m_IgnorePreviousClassCheckboxPntr->SetValue (
6973       m_IgnorePreviousClassCachedValue ? B_CONTROL_ON : B_CONTROL_OFF);
6974   }
6975 
6976   /* Update the genuine count. */
6977 
6978   if (MyAppPntr->m_TotalGenuineMessages != m_GenuineCountCachedValue &&
6979   m_GenuineCountTextboxPntr != NULL)
6980   {
6981     m_GenuineCountCachedValue = MyAppPntr->m_TotalGenuineMessages;
6982     sprintf (TempString, "%lu", m_GenuineCountCachedValue);
6983     m_GenuineCountTextboxPntr->SetText (TempString);
6984   }
6985 
6986   /* Update the spam count. */
6987 
6988   if (MyAppPntr->m_TotalSpamMessages != m_SpamCountCachedValue &&
6989   m_SpamCountTextboxPntr != NULL)
6990   {
6991     m_SpamCountCachedValue = MyAppPntr->m_TotalSpamMessages;
6992     sprintf (TempString, "%lu", m_SpamCountCachedValue);
6993     m_SpamCountTextboxPntr->SetText (TempString);
6994   }
6995 
6996   /* Update the word count. */
6997 
6998   if (MyAppPntr->m_WordCount != m_WordCountCachedValue &&
6999   m_WordCountTextboxPntr != NULL)
7000   {
7001     m_WordCountCachedValue = MyAppPntr->m_WordCount;
7002     sprintf (TempString, "%lu", m_WordCountCachedValue);
7003     m_WordCountTextboxPntr->SetText (TempString);
7004   }
7005 
7006   /* Update the tokenize mode pop-up menu. */
7007 
7008   if (MyAppPntr->m_TokenizeMode != m_TokenizeModeCachedValue &&
7009   m_TokenizeModePopUpMenuPntr != NULL)
7010   {
7011     m_TokenizeModeCachedValue = MyAppPntr->m_TokenizeMode;
7012     TempMenuItemPntr =
7013       m_TokenizeModePopUpMenuPntr->ItemAt ((int) m_TokenizeModeCachedValue);
7014     if (TempMenuItemPntr != NULL)
7015       TempMenuItemPntr->SetMarked (true);
7016   }
7017 
7018   /* Update the scoring mode pop-up menu. */
7019 
7020   if (MyAppPntr->m_ScoringMode != m_ScoringModeCachedValue &&
7021   m_ScoringModePopUpMenuPntr != NULL)
7022   {
7023     m_ScoringModeCachedValue = MyAppPntr->m_ScoringMode;
7024     TempMenuItemPntr =
7025       m_ScoringModePopUpMenuPntr->ItemAt ((int) m_ScoringModeCachedValue);
7026     if (TempMenuItemPntr != NULL)
7027       TempMenuItemPntr->SetMarked (true);
7028   }
7029 
7030   /* Lock the application.  This will stop it from processing any further
7031   messages until we are done.  Or if it is busy, the lock will fail. */
7032 
7033   if (MyAppPntr->LockWithTimeout (100000) != B_OK)
7034     return; /* It's probably busy doing something. */
7035 
7036   /* See if the database file name has changed. */
7037 
7038   if (strcmp (MyAppPntr->m_DatabaseFileName.String (),
7039   m_DatabaseFileNameCachedValue) != 0 &&
7040   m_DatabaseFileNameTextboxPntr != NULL)
7041   {
7042     strcpy (m_DatabaseFileNameCachedValue,
7043       MyAppPntr->m_DatabaseFileName.String ());
7044     m_DatabaseFileNameTextboxPntr->SetText (m_DatabaseFileNameCachedValue);
7045     WindowPntr->SetTitle (m_DatabaseFileNameCachedValue);
7046   }
7047 
7048   /* Done.  Let the BApplication continue processing messages. */
7049 
7050   MyAppPntr->Unlock ();
7051 }
7052 
7053 
/* Periodic hook which drives PollServerForChanges, but no more often than once
every 200000 system_time units (system_time counts microseconds, so that is
five times a second at most).  The time stamp is taken after the poll has
finished, so the interval is measured from the end of the previous poll. */

7054 void
7055 ControlsView::Pulse ()
7056 {
7057   if (system_time () > m_TimeOfLastPoll + 200000)
7058   {
7059     PollServerForChanges ();
7060     m_TimeOfLastPoll = system_time ();
7061   }
7062 }
7063 
7064 
7065 
7066 /******************************************************************************
7067  * Implementation of the DatabaseWindow class, constructor, destructor and the
7068  * rest of the member functions in mostly alphabetical order.
7069  */
7070 
/* Build the main window: a ControlsView is added first, then a WordsView is
placed directly underneath it (starting one unit below the bottom of the
controls view's frame) and takes the rest of the window.  The window starts
out minimized if g_ServerMode is set.  The NULL tests after each new can only
fail if operator new reports failure by returning NULL. */

7071 DatabaseWindow::DatabaseWindow ()
7072 : BWindow (BRect (30, 30, 620, 400),
7073     "Haiku spam filter server",
7074     B_DOCUMENT_WINDOW, B_ASYNCHRONOUS_CONTROLS)
7075 {
7076   BRect TempRect;
7077 
7078   /* Add the controls view. */
7079 
7080   m_ControlsViewPntr = new ControlsView (Bounds ());
7081   if (m_ControlsViewPntr == NULL)
7082     goto ErrorExit;
7083   AddChild (m_ControlsViewPntr);
7084 
7085   /* Add the word view in the remaining space under the controls view. */
7086 
7087 
7088   TempRect = Bounds ();
7089   TempRect.top = m_ControlsViewPntr->Frame().bottom + 1;
7090   m_WordsViewPntr = new WordsView (TempRect);
7091   if (m_WordsViewPntr == NULL)
7092     goto ErrorExit;
7093   AddChild (m_WordsViewPntr);
7094 
7095  /* Minimize the window if we are starting up in server mode.  This is done
7096   before the window is open so it doesn't flash onto the screen, and possibly
7097   steal a keystroke or two.  The ControlsView will further update the minimize
7098   mode when it detects changes in the server mode. */
7099   Minimize (g_ServerMode);
7100 
7101   return;
7102 
7103 ErrorExit:
7104   DisplayErrorMessage ("Unable to initialise the window contents.");
7105 }
7106 
7107 
7108 void
7109 DatabaseWindow::MessageReceived (BMessage *MessagePntr)
7110 {
7111   if (MessagePntr->what == B_MOUSE_WHEEL_CHANGED)
7112   {
7113     /* Pass the mouse wheel stuff down to the words view, since that's the only
7114     one which does scrolling so we don't need to worry about whether it has
7115     focus or not. */
7116 
7117     if (m_WordsViewPntr != NULL)
7118       m_WordsViewPntr->MessageReceived (MessagePntr);
7119   }
7120   else
7121     BWindow::MessageReceived (MessagePntr);
7122 }
7123 
7124 
7125 bool
7126 DatabaseWindow::QuitRequested ()
7127 {
7128   be_app->PostMessage (B_QUIT_REQUESTED);
7129   return true;
7130 }
7131 
7132 
7133 
7134 /******************************************************************************
7135  * Implementation of the word display view.
7136  */
7137 
7138 WordsView::WordsView (BRect NewBounds)
7139 : BView (NewBounds, "WordsView", B_FOLLOW_ALL_SIDES,
7140     B_WILL_DRAW | B_FULL_UPDATE_ON_RESIZE | B_NAVIGABLE | B_PULSE_NEEDED),
7141   m_ArrowLineDownPntr (NULL),
7142   m_ArrowLineUpPntr (NULL),
7143   m_ArrowPageDownPntr (NULL),
7144   m_ArrowPageUpPntr (NULL),
7145   m_LastTimeAKeyWasPressed (0)
7146 {
7147   font_height TempFontHeight;
7148 
7149   GetFont (&m_TextFont); /* Modify the default font to be our own. */
7150   m_TextFont.SetSize (ceilf (m_TextFont.Size() * 1.1));
7151   m_TextFont.GetHeight (&TempFontHeight);
7152   SetFont (&m_TextFont);
7153 
7154   m_LineHeight = ceilf (TempFontHeight.ascent +
7155     TempFontHeight.descent + TempFontHeight.leading);
7156   m_AscentHeight = ceilf (TempFontHeight.ascent);
7157   m_TextHeight = ceilf (TempFontHeight.ascent +
7158     TempFontHeight.descent);
7159 
7160   m_FocusedColour.red = 255;
7161   m_FocusedColour.green = 255;
7162   m_FocusedColour.blue = 255;
7163   m_FocusedColour.alpha = 255;
7164 
7165   m_UnfocusedColour.red = 245;
7166   m_UnfocusedColour.green = 245;
7167   m_UnfocusedColour.blue = 255;
7168   m_UnfocusedColour.alpha = 255;
7169 
7170   m_BackgroundColour = m_UnfocusedColour;
7171   SetViewColor (m_BackgroundColour);
7172   SetLowColor (m_BackgroundColour);
7173   SetHighColor (0, 0, 0);
7174 
7175   strcpy (m_FirstDisplayedWord, "a");
7176 }
7177 
7178 
7179 void
7180 WordsView::AttachedToWindow ()
7181 {
7182   BPolygon        DownLinePolygon (g_DownLinePoints,
7183                     sizeof (g_DownLinePoints) /
7184                     sizeof (g_DownLinePoints[0]));
7185 
7186   BPolygon        DownPagePolygon (g_DownPagePoints,
7187                     sizeof (g_DownPagePoints) /
7188                     sizeof (g_DownPagePoints[0]));
7189 
7190   BPolygon        UpLinePolygon (g_UpLinePoints,
7191                     sizeof (g_UpLinePoints) /
7192                     sizeof (g_UpLinePoints[0]));
7193 
7194   BPolygon        UpPagePolygon (g_UpPagePoints,
7195                     sizeof (g_UpPagePoints) /
7196                     sizeof (g_UpPagePoints[0]));
7197 
7198   BPicture        TempOffPicture;
7199   BPicture        TempOnPicture;
7200   BRect           TempRect;
7201 
7202   /* Make the buttons and associated polygon images for the forward and
7203   backwards a word or a page of words buttons.  They're the width of the scroll
7204   bar area on the right, but twice as tall as usual, since there is no scroll
7205   bar and that will make it easier to use them.  First the up a line button. */
7206 
7207   SetHighColor (0, 0, 0);
7208   BeginPicture (&TempOffPicture);
7209   FillPolygon (&UpLinePolygon);
7210   SetHighColor (180, 180, 180);
7211   StrokePolygon (&UpLinePolygon);
7212   EndPicture ();
7213 
7214   SetHighColor (128, 128, 128);
7215   BeginPicture (&TempOnPicture);
7216   FillPolygon (&UpLinePolygon);
7217   EndPicture ();
7218 
7219   TempRect = Bounds ();
7220   TempRect.bottom = TempRect.top + 2 * B_H_SCROLL_BAR_HEIGHT;
7221   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7222   m_ArrowLineUpPntr = new BPictureButton (TempRect, "Up Line",
7223     &TempOffPicture, &TempOnPicture,
7224     new BMessage (MSG_LINE_UP), B_ONE_STATE_BUTTON,
7225     B_FOLLOW_RIGHT | B_FOLLOW_TOP, B_WILL_DRAW | B_NAVIGABLE);
7226   if (m_ArrowLineUpPntr == NULL) goto ErrorExit;
7227   AddChild (m_ArrowLineUpPntr);
7228   m_ArrowLineUpPntr->SetTarget (this);
7229 
7230   /* Up a page button. */
7231 
7232   SetHighColor (0, 0, 0);
7233   BeginPicture (&TempOffPicture);
7234   FillPolygon (&UpPagePolygon);
7235   SetHighColor (180, 180, 180);
7236   StrokePolygon (&UpPagePolygon);
7237   EndPicture ();
7238 
7239   SetHighColor (128, 128, 128);
7240   BeginPicture (&TempOnPicture);
7241   FillPolygon (&UpPagePolygon);
7242   EndPicture ();
7243 
7244   TempRect = Bounds ();
7245   TempRect.top += 2 * B_H_SCROLL_BAR_HEIGHT + 1;
7246   TempRect.bottom = TempRect.top + 2 * B_H_SCROLL_BAR_HEIGHT;
7247   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7248   m_ArrowPageUpPntr = new BPictureButton (TempRect, "Up Page",
7249     &TempOffPicture, &TempOnPicture,
7250     new BMessage (MSG_PAGE_UP), B_ONE_STATE_BUTTON,
7251     B_FOLLOW_RIGHT | B_FOLLOW_TOP, B_WILL_DRAW | B_NAVIGABLE);
7252   if (m_ArrowPageUpPntr == NULL) goto ErrorExit;
7253   AddChild (m_ArrowPageUpPntr);
7254   m_ArrowPageUpPntr->SetTarget (this);
7255 
7256   /* Down a page button. */
7257 
7258   SetHighColor (0, 0, 0);
7259   BeginPicture (&TempOffPicture);
7260   FillPolygon (&DownPagePolygon);
7261   SetHighColor (180, 180, 180);
7262   StrokePolygon (&DownPagePolygon);
7263   EndPicture ();
7264 
7265   SetHighColor (128, 128, 128);
7266   BeginPicture (&TempOnPicture);
7267   FillPolygon (&DownPagePolygon);
7268   EndPicture ();
7269 
7270   TempRect = Bounds ();
7271   TempRect.bottom -= 3 * B_H_SCROLL_BAR_HEIGHT + 1;
7272   TempRect.top = TempRect.bottom - 2 * B_H_SCROLL_BAR_HEIGHT;
7273   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7274   m_ArrowPageDownPntr = new BPictureButton (TempRect, "Down Page",
7275     &TempOffPicture, &TempOnPicture,
7276     new BMessage (MSG_PAGE_DOWN), B_ONE_STATE_BUTTON,
7277     B_FOLLOW_RIGHT | B_FOLLOW_BOTTOM, B_WILL_DRAW | B_NAVIGABLE);
7278   if (m_ArrowPageDownPntr == NULL) goto ErrorExit;
7279   AddChild (m_ArrowPageDownPntr);
7280   m_ArrowPageDownPntr->SetTarget (this);
7281 
7282   /* Down a line button. */
7283 
7284   SetHighColor (0, 0, 0);
7285   BeginPicture (&TempOffPicture);
7286   FillPolygon (&DownLinePolygon);
7287   SetHighColor (180, 180, 180);
7288   StrokePolygon (&DownLinePolygon);
7289   EndPicture ();
7290 
7291   SetHighColor (128, 128, 128);
7292   BeginPicture (&TempOnPicture);
7293   FillPolygon (&DownLinePolygon);
7294   EndPicture ();
7295 
7296   TempRect = Bounds ();
7297   TempRect.bottom -= B_H_SCROLL_BAR_HEIGHT;
7298   TempRect.top = TempRect.bottom - 2 * B_H_SCROLL_BAR_HEIGHT;
7299   TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7300   m_ArrowLineDownPntr = new BPictureButton (TempRect, "Down Line",
7301     &TempOffPicture, &TempOnPicture,
7302     new BMessage (MSG_LINE_DOWN), B_ONE_STATE_BUTTON,
7303     B_FOLLOW_RIGHT | B_FOLLOW_BOTTOM, B_WILL_DRAW | B_NAVIGABLE);
7304   if (m_ArrowLineDownPntr == NULL) goto ErrorExit;
7305   AddChild (m_ArrowLineDownPntr);
7306   m_ArrowLineDownPntr->SetTarget (this);
7307 
7308   return;
7309 
7310 ErrorExit:
7311   DisplayErrorMessage ("Problems while making view displaying the words.");
7312 }
7313 
7314 
7315 /* Draw the words starting with the one at or after m_FirstDisplayedWord.  This
7316 requires looking at the database in the BApplication, which may or may not be
7317 available (if it isn't, don't draw, a redraw will usually be requested by the
7318 Pulse member function when it keeps on noticing that the stuff on the display
7319 doesn't match the database). */
7320 
7321 void
7322 WordsView::Draw (BRect UpdateRect)
7323 {
7324   float                   AgeDifference;
7325   float                   AgeProportion;
7326   float                   CenterX;
7327   float                   ColumnLeftCenterX;
7328   float                   ColumnMiddleCenterX;
7329   float                   ColumnRightCenterX;
7330   float                   CompensatedRatio;
7331   StatisticsMap::iterator DataIter;
7332   StatisticsMap::iterator EndIter;
7333   rgb_color               FillColour;
7334   float                   GenuineProportion;
7335   uint32                  GenuineSpamSum;
7336   float                   HeightPixels;
7337   float                   HeightProportion;
7338   float                   LeftBounds;
7339   ABSApp                 *MyAppPntr;
7340   uint32                  NewestAge;
7341   uint32                  OldestAge;
7342   float                   OneFifthTotalGenuine;
7343   float                   OneFifthTotalSpam;
7344   double                  RawProbabilityRatio;
7345   float                   RightBounds;
7346   float                   SpamProportion;
7347   StatisticsPointer       StatisticsPntr;
7348   BRect                   TempRect;
7349   char                    TempString [PATH_MAX];
7350   float                   TotalGenuineMessages = 1.0; /* Avoid divide by 0. */
7351   float                   TotalSpamMessages = 1.0;
7352   float                   Width;
7353   float                   Y;
7354 
7355   /* Lock the application.  This will stop it from processing any further
7356   messages until we are done.  Or if it is busy, the lock will fail. */
7357 
7358   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7359   if (MyAppPntr == NULL || MyAppPntr->LockWithTimeout (100000) != B_OK)
7360     return; /* It's probably busy doing something. */
7361 
7362   /* Set up various loop invariant variables. */
7363 
7364   if (MyAppPntr->m_TotalGenuineMessages > 0)
7365     TotalGenuineMessages = MyAppPntr->m_TotalGenuineMessages;
7366   OneFifthTotalGenuine = TotalGenuineMessages / 5;
7367 
7368   if (MyAppPntr->m_TotalSpamMessages > 0)
7369     TotalSpamMessages = MyAppPntr->m_TotalSpamMessages;
7370   OneFifthTotalSpam = TotalSpamMessages / 5;
7371 
7372   EndIter = MyAppPntr->m_WordMap.end ();
7373 
7374   OldestAge = MyAppPntr->m_OldestAge;
7375   NewestAge = /* actually newest age plus one */
7376     MyAppPntr->m_TotalGenuineMessages + MyAppPntr->m_TotalSpamMessages;
7377 
7378   if (NewestAge == 0)
7379     goto NormalExit; /* No words to display, or something is badly wrong. */
7380 
7381   NewestAge--; /* The newest message has age NewestAge. */
7382   AgeDifference = NewestAge - OldestAge; /* Can be zero if just one message. */
7383 
7384   LeftBounds = Bounds().left;
7385   RightBounds = Bounds().right - B_V_SCROLL_BAR_WIDTH;
7386   Width = RightBounds - LeftBounds;
7387   FillColour.alpha = 255;
7388 
7389   CenterX = ceilf (LeftBounds + Width * 0.5);
7390   ColumnLeftCenterX = ceilf (LeftBounds + Width * 0.05);
7391   ColumnMiddleCenterX = CenterX;
7392   ColumnRightCenterX = ceilf (LeftBounds + Width * 0.95);
7393 
7394   for (DataIter = MyAppPntr->m_WordMap.lower_bound (m_FirstDisplayedWord),
7395   Y = Bounds().top;
7396   DataIter != EndIter && Y < UpdateRect.bottom;
7397   DataIter++, Y += m_LineHeight)
7398   {
7399     if (Y + m_LineHeight < UpdateRect.top)
7400       continue; /* Not in the visible area yet, don't actually draw. */
7401 
7402     /* Draw the colour bar behind the word.  It reflects the spamness or
7403     genuineness of that particular word, plus the importance of the word and
7404     the age of the word.
7405 
7406     First calculate the compensated spam ratio (described elsewhere).  It is
7407     close to 0.0 for genuine words and close to 1.0 for pure spam.  It is drawn
7408     as a blue bar to the left of center if it is less than 0.5, and a red bar
7409     on the right of center if it is greater than 0.5.  At exactly 0.5 nothing
7410     is drawn; the word is worthless as an indicator.
7411 
7412     The height of the bar corresponds to the number of messages the word was
7413     found in.  Make the height proportional to the total of spam and genuine
7414     messages for the word divided by the sum of the most extreme spam and
7415     genuine counts in the database.
7416 
7417     The staturation of the colour corresponds to the age of the word, with old
7418     words being almost white rather than solid blue or red. */
7419 
7420     StatisticsPntr = &DataIter->second;
7421 
7422     SpamProportion = StatisticsPntr->spamCount / TotalSpamMessages;
7423     GenuineProportion = StatisticsPntr->genuineCount / TotalGenuineMessages;
7424     if (SpamProportion + GenuineProportion > 0.0f)
7425       RawProbabilityRatio =
7426       SpamProportion / (SpamProportion + GenuineProportion);
7427     else
7428       RawProbabilityRatio = g_RobinsonX;
7429 
7430     /* The compensated ratio leans towards 0.5 (RobinsonX) more for fewer
7431     data points, with a weight of 0.45 (RobinsonS). */
7432 
7433     GenuineSpamSum =
7434       StatisticsPntr->spamCount + StatisticsPntr->genuineCount;
7435     CompensatedRatio =
7436       (g_RobinsonS * g_RobinsonX + GenuineSpamSum * RawProbabilityRatio) /
7437       (g_RobinsonS + GenuineSpamSum);
7438 
7439     /* Used to use the height based on the most frequent word, but some words,
7440     like "From", show up in all messages which made most other words just
7441     appear as a thin line.  I did a histogram plot of the sizes in my test
7442     database, and figured that you get better coverage of 90% of the messages
7443     if you use 1/5 of the total number as the count which gives you 100%
7444     height.  The other 10% get a full height bar, but most people wouldn't care
7445     that they're super frequently used. */
7446 
7447     HeightProportion = 0.5f * (StatisticsPntr->genuineCount /
7448       OneFifthTotalGenuine + StatisticsPntr->spamCount / OneFifthTotalSpam);
7449 
7450     if (HeightProportion > 1.0f)
7451       HeightProportion = 1.0f;
7452     HeightPixels = ceilf (HeightProportion * m_TextHeight);
7453 
7454     if (AgeDifference <= 0.0f)
7455       AgeProportion = 1.0; /* New is 1.0, old is 0.0 */
7456     else
7457       AgeProportion = (StatisticsPntr->age - OldestAge) / AgeDifference;
7458 
7459     TempRect.top = ceilf (Y + m_TextHeight / 2 - HeightPixels / 2);
7460     TempRect.bottom = TempRect.top + HeightPixels;
7461 
7462     if (CompensatedRatio < 0.5f)
7463     {
7464       TempRect.left = ceilf (
7465         CenterX - 1.6f * (0.5f - CompensatedRatio) * (CenterX - LeftBounds));
7466       TempRect.right = CenterX;
7467       FillColour.red = 230 - (int) (AgeProportion * 230.0f);
7468       FillColour.green = FillColour.red;
7469       FillColour.blue = 255;
7470     }
7471     else /* Ratio >= 0.5, red spam block. */
7472     {
7473       TempRect.left = CenterX;
7474       TempRect.right = ceilf (
7475         CenterX + 1.6f * (CompensatedRatio - 0.5f) * (RightBounds - CenterX));
7476       FillColour.blue = 230 - (int) (AgeProportion * 230.0f);
7477       FillColour.green = FillColour.blue;
7478       FillColour.red = 255;
7479     }
7480     SetHighColor (FillColour);
7481     SetDrawingMode (B_OP_COPY);
7482     FillRect (TempRect);
7483 
7484     /* Print the text centered in columns of various widths.  The number of
7485     genuine messages in the left 10% of the width, the word in the middle 80%,
7486     and the number of spam messages using the word in the right 10%. */
7487 
7488     SetHighColor (0, 0, 0);
7489     SetDrawingMode (B_OP_OVER); /* So that antialiased text mixes better. */
7490 
7491     sprintf (TempString, "%lu", StatisticsPntr->genuineCount);
7492     Width = m_TextFont.StringWidth (TempString);
7493     MovePenTo (ceilf (ColumnLeftCenterX - Width / 2), Y + m_AscentHeight);
7494     DrawString (TempString);
7495 
7496     strcpy (TempString, DataIter->first.c_str ());
7497     Width = m_TextFont.StringWidth (TempString);
7498     MovePenTo (ceilf (ColumnMiddleCenterX - Width / 2), Y + m_AscentHeight);
7499     DrawString (TempString);
7500 
7501     sprintf (TempString, "%lu", StatisticsPntr->spamCount);
7502     Width = m_TextFont.StringWidth (TempString);
7503     MovePenTo (ceilf (ColumnRightCenterX - Width / 2), Y + m_AscentHeight);
7504     DrawString (TempString);
7505   }
7506 
7507   /* Draw the first word (the one which the user types in to select the first
7508   displayed word) on the right, in the scroll bar margin, rotated 90 degrees to
7509   fit between the page up and page down buttons. */
7510 
7511   Width = m_TextFont.StringWidth (m_FirstDisplayedWord);
7512   if (Width > 0)
7513   {
7514     TempRect = Bounds ();
7515     TempRect.top += 4 * B_H_SCROLL_BAR_HEIGHT + 1;
7516     TempRect.bottom -= 5 * B_H_SCROLL_BAR_HEIGHT + 1;
7517 
7518     MovePenTo (TempRect.right - m_TextHeight + m_AscentHeight - 1,
7519       ceilf ((TempRect.bottom + TempRect.top) / 2 + Width / 2));
7520     m_TextFont.SetRotation (90);
7521     SetFont (&m_TextFont, B_FONT_ROTATION);
7522     DrawString (m_FirstDisplayedWord);
7523     m_TextFont.SetRotation (0);
7524     SetFont (&m_TextFont, B_FONT_ROTATION);
7525   }
7526 
7527 NormalExit:
7528 
7529   /* Successfully finished drawing.  Update the cached values to match what we
7530   have drawn. */
7531   m_CachedTotalGenuineMessages = MyAppPntr->m_TotalGenuineMessages;
7532   m_CachedTotalSpamMessages = MyAppPntr->m_TotalSpamMessages;
7533   m_CachedWordCount = MyAppPntr->m_WordCount;
7534 
7535   /* Done.  Let the BApplication continue processing messages. */
7536   MyAppPntr->Unlock ();
7537 }
7538 
7539 
7540 /* When the user presses keys, they select the first word to be displayed in
7541 the view (it's the word at or lexicographically after the word typed in).  The
7542 keys are appended to the starting word, until the user stops typing for a
7543 while, then the next key will be the first letter of a new starting word. */
7544 
7545 void
7546 WordsView::KeyDown (const char *BufferPntr, int32 NumBytes)
7547 {
7548   int32          CharLength;
7549   bigtime_t      CurrentTime;
7550   char           TempString [40];
7551 
7552   CurrentTime = system_time ();
7553 
7554   if (NumBytes < (int32) sizeof (TempString))
7555   {
7556     memcpy (TempString, BufferPntr, NumBytes);
7557     TempString [NumBytes] = 0;
7558     CharLength = strlen (TempString); /* So NUL bytes don't get through. */
7559 
7560     /* Check for arrow keys, which move the view up and down. */
7561 
7562     if (CharLength == 1 &&
7563     (TempString[0] == B_UP_ARROW ||
7564     TempString[0] == B_DOWN_ARROW ||
7565     TempString[0] == B_PAGE_UP ||
7566     TempString[0] == B_PAGE_DOWN))
7567     {
7568       MoveTextUpOrDown ((TempString[0] == B_UP_ARROW) ? MSG_LINE_UP :
7569         ((TempString[0] == B_DOWN_ARROW) ? MSG_LINE_DOWN :
7570         ((TempString[0] == B_PAGE_UP) ? MSG_PAGE_UP : MSG_PAGE_DOWN)));
7571     }
7572     else if (CharLength > 1 ||
7573     (CharLength == 1 && 32 <= (uint8) TempString[0]))
7574     {
7575       /* Have a non-control character, or some sort of multibyte char.  Add it
7576       to the word and mark things for redisplay starting at the resulting word.
7577       */
7578 
7579       if (CurrentTime - m_LastTimeAKeyWasPressed >= 1000000 /* microseconds */)
7580         strcpy (m_FirstDisplayedWord, TempString); /* Starting a new word. */
7581       else if (strlen (m_FirstDisplayedWord) + CharLength <= g_MaxWordLength)
7582         strcat (m_FirstDisplayedWord, TempString); /* Append to existing. */
7583 
7584       Invalidate ();
7585     }
7586   }
7587 
7588   m_LastTimeAKeyWasPressed = CurrentTime;
7589   BView::KeyDown (BufferPntr, NumBytes);
7590 }
7591 
7592 
7593 /* Change the background colour to show that we have the focus.  When we have
7594 it, keystrokes will select the word to be displayed at the top of the list. */
7595 
7596 void
7597 WordsView::MakeFocus (bool Focused)
7598 {
7599   if (Focused)
7600     m_BackgroundColour = m_FocusedColour;
7601   else
7602     m_BackgroundColour = m_UnfocusedColour;
7603   SetViewColor (m_BackgroundColour);
7604   SetLowColor (m_BackgroundColour);
7605 
7606   /* Also need to set the background colour for the scroll buttons, since they
7607   can't be made transparent. */
7608 
7609   if (m_ArrowLineDownPntr != NULL)
7610   {
7611     m_ArrowLineDownPntr->SetViewColor (m_BackgroundColour);
7612     m_ArrowLineDownPntr->Invalidate ();
7613   }
7614 
7615   if (m_ArrowLineUpPntr != NULL)
7616   {
7617     m_ArrowLineUpPntr->SetViewColor (m_BackgroundColour);
7618     m_ArrowLineUpPntr->Invalidate ();
7619   }
7620 
7621   if (m_ArrowPageDownPntr != NULL)
7622   {
7623     m_ArrowPageDownPntr->SetViewColor (m_BackgroundColour);
7624     m_ArrowPageDownPntr->Invalidate ();
7625   }
7626 
7627   if (m_ArrowPageUpPntr != NULL)
7628   {
7629     m_ArrowPageUpPntr->SetViewColor (m_BackgroundColour);
7630     m_ArrowPageUpPntr->Invalidate ();
7631   }
7632 
7633   Invalidate ();
7634 
7635   BView::MakeFocus (Focused);
7636 }
7637 
7638 
7639 void
7640 WordsView::MessageReceived (BMessage *MessagePntr)
7641 {
7642   int32     CountFound;
7643   float     DeltaY; /* Usually -1.0, 0.0 or +1.0. */
7644   type_code TypeFound;
7645 
7646   switch (MessagePntr->what)
7647   {
7648     case B_MOUSE_WHEEL_CHANGED:
7649       if (MessagePntr->FindFloat ("be:wheel_delta_y", &DeltaY) != 0) break;
7650       if (DeltaY < 0)
7651         MoveTextUpOrDown (MSG_LINE_UP);
7652       else if (DeltaY > 0)
7653         MoveTextUpOrDown (MSG_LINE_DOWN);
7654       break;
7655 
7656     case MSG_LINE_DOWN:
7657     case MSG_LINE_UP:
7658     case MSG_PAGE_DOWN:
7659     case MSG_PAGE_UP:
7660       MoveTextUpOrDown (MessagePntr->what);
7661       break;
7662 
7663     case B_SIMPLE_DATA: /* Something has been dropped in our view. */
7664       if (MessagePntr->GetInfo ("refs", &TypeFound, &CountFound) == B_OK &&
7665       CountFound > 0 && TypeFound == B_REF_TYPE)
7666       {
7667         RefsDroppedHere (MessagePntr);
7668         break;
7669       }
7670       /* Else fall through to the default case, in case it is something else
7671       dropped that the system knows about. */
7672 
7673     default:
7674       BView::MessageReceived (MessagePntr);
7675   }
7676 }
7677 
7678 
7679 /* If the user clicks on our view, take over the focus. */
7680 
7681 void
7682 WordsView::MouseDown (BPoint)
7683 {
7684   if (!IsFocus ())
7685     MakeFocus (true);
7686 }
7687 
7688 
7689 void
7690 WordsView::MoveTextUpOrDown (uint32 MovementType)
7691 {
7692   StatisticsMap::iterator  DataIter;
7693   int                      i;
7694   ABSApp                  *MyAppPntr;
7695   int                      PageSize;
7696 
7697   /* Lock the application.  This will stop it from processing any further
7698   messages until we are done (we need to look at the word list directly).  Or
7699   if it is busy, the lock will fail. */
7700 
7701   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7702   if (MyAppPntr == NULL || MyAppPntr->LockWithTimeout (2000000) != B_OK)
7703     return; /* It's probably busy doing something. */
7704 
7705   PageSize = (int) (Bounds().Height() / m_LineHeight - 1);
7706   if (PageSize < 1)
7707     PageSize = 1;
7708 
7709   DataIter = MyAppPntr->m_WordMap.lower_bound (m_FirstDisplayedWord);
7710 
7711   switch (MovementType)
7712   {
7713     case MSG_LINE_UP:
7714       if (DataIter != MyAppPntr->m_WordMap.begin ())
7715         DataIter--;
7716       break;
7717 
7718     case MSG_LINE_DOWN:
7719       if (DataIter != MyAppPntr->m_WordMap.end ())
7720         DataIter++;
7721       break;
7722 
7723     case MSG_PAGE_UP:
7724       for (i = 0; i < PageSize; i++)
7725       {
7726         if (DataIter == MyAppPntr->m_WordMap.begin ())
7727           break;
7728         DataIter--;
7729       }
7730       break;
7731 
7732     case MSG_PAGE_DOWN:
7733       for (i = 0; i < PageSize; i++)
7734       {
7735         if (DataIter == MyAppPntr->m_WordMap.end ())
7736           break;
7737         DataIter++;
7738       }
7739       break;
7740   }
7741 
7742   if (DataIter != MyAppPntr->m_WordMap.end ())
7743     strcpy (m_FirstDisplayedWord, DataIter->first.c_str ());
7744 
7745   Invalidate ();
7746 
7747   MyAppPntr->Unlock ();
7748 }
7749 
7750 
7751 /* This function periodically polls the BApplication to see if anything has
7752 changed.  If the word list is different or the display has changed in some
7753 other way, it will then try to refresh the display, repeating the attempt until
7754 it gets successfully drawn. */
7755 
7756 void
7757 WordsView::Pulse ()
7758 {
7759   ABSApp *MyAppPntr;
7760 
7761   /* Probe the BApplication to see if it has changed. */
7762 
7763   MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7764   if (MyAppPntr == NULL)
7765     return; /* Something is wrong, give up. */
7766 
7767   if (MyAppPntr->m_TotalGenuineMessages != m_CachedTotalGenuineMessages ||
7768   MyAppPntr->m_TotalSpamMessages != m_CachedTotalSpamMessages ||
7769   MyAppPntr->m_WordCount != m_CachedWordCount)
7770     Invalidate ();
7771 }
7772 
7773 
7774 /* The user has dragged and dropped some file references on the words view.  If
7775 it is in the left third, add the file(s) as examples of genuine messages, right
7776 third for spam messages and if it is in the middle third then evaluate the
7777 file(s) for spaminess. */
7778 
7779 void
7780 WordsView::RefsDroppedHere (BMessage *MessagePntr)
7781 {
7782   float  Left;
7783   bool   SpamExample = true; /* TRUE if example is of spam, FALSE genuine. */
7784   float  Third;
7785   BPoint WhereDropped;
7786 
7787   /* Find out which third of the view it was dropped into. */
7788 
7789   if (MessagePntr->FindPoint ("_drop_point_", &WhereDropped) != B_OK)
7790     return;  /* Need to know where it was dropped. */
7791   ConvertFromScreen (&WhereDropped);
7792   Third = Bounds().Width() / 3;
7793   Left = Bounds().left;
7794   if (WhereDropped.x < Left + Third)
7795     SpamExample = false;
7796   else if (WhereDropped.x < Left + 2 * Third)
7797   {
7798     /* In the middle third, evaluate all files for spaminess. */
7799     EstimateRefFilesAndDisplay (MessagePntr);
7800     return;
7801   }
7802 
7803   if (g_CommanderLooperPntr != NULL)
7804     g_CommanderLooperPntr->CommandReferences (
7805     MessagePntr, true /* BulkMode */, SpamExample ? CL_SPAM : CL_GENUINE);
7806 }
7807 
7808 
7809 
7810 /******************************************************************************
7811  * Finally, the main program which drives it all.
7812  */
7813 
7814 int main (int argc, char**)
7815 {
7816   g_CommandLineMode = (argc > 1);
7817   if (!g_CommandLineMode)
7818     cout << PrintUsage; /* In case no arguments specified. */
7819 
7820   g_CommanderLooperPntr = new CommanderLooper;
7821   if (g_CommanderLooperPntr != NULL)
7822   {
7823     g_CommanderMessenger = new BMessenger (NULL, g_CommanderLooperPntr);
7824     g_CommanderLooperPntr->Run ();
7825   }
7826 
7827   ABSApp MyApp;
7828 
7829   if (MyApp.InitCheck () == 0)
7830   {
7831     MyApp.LoadSaveSettings (true /* DoLoad */);
7832     MyApp.Run ();
7833   }
7834 
7835   if (g_CommanderLooperPntr != NULL)
7836   {
7837     g_CommanderLooperPntr->PostMessage (B_QUIT_REQUESTED);
7838     snooze (100000); /* Let the CommanderLooper thread run so it quits. */
7839   }
7840 
7841   cerr << "SpamDBM shutting down..." << endl;
7842   return 0; /* And implicitly destroys MyApp, which writes out the database. */
7843 }
7844