View Javadoc

1   /*
2    * ====================================================================
3    *
4    * The Apache Software License, Version 1.1
5    *
6    * Copyright (c) 2003 Nick Lothian. All rights reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer.
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution, if
21   *    any, must include the following acknowlegement:
22   *       "This product includes software developed by the
23   *        developers of Classifier4J (http://classifier4j.sf.net/)."
24   *    Alternately, this acknowlegement may appear in the software itself,
25   *    if and wherever such third-party acknowlegements normally appear.
26   *
27   * 4. The name "Classifier4J" must not be used to endorse or promote
28   *    products derived from this software without prior written
29   *    permission. For written permission, please contact
30   *    http://sourceforge.net/users/nicklothian/.
31   *
32   * 5. Products derived from this software may not be called
33   *    "Classifier4J", nor may "Classifier4J" appear in their names
34   *    without prior written permission. For written permission, please
35   *    contact http://sourceforge.net/users/nicklothian/.
36   *
37   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48   * SUCH DAMAGE.
49   * ====================================================================
50   */
51  
52  package net.sf.classifier4J.bayesian;
53  
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import net.sf.classifier4J.AbstractCategorizedTrainableClassifier;
import net.sf.classifier4J.DefaultStopWordsProvider;
import net.sf.classifier4J.DefaultTokenizer;
import net.sf.classifier4J.ICategorisedClassifier;
import net.sf.classifier4J.IClassifier;
import net.sf.classifier4J.IStopWordProvider;
import net.sf.classifier4J.ITokenizer;
import net.sf.classifier4J.util.ToStringBuilder;
65  
66  /***
67   *
68   * <p>A implementation of {@link net.sf.classifier4J.IClassifier} based on Bayes'
69   * theorem (see http://www.wikipedia.org/wiki/Bayes_theorem).</p>
70   *
71   * <p>The basic usage pattern for this class is:
72   * <ol>
73   * 		<li>Create a instance of {@link net.sf.classifier4J.bayesian.IWordsDataSource}</li>
74   * 		<li>Create a new instance of BayesianClassifier, passing the IWordsDataSource
75   * 			to the constructor</li>
76   * 		<li>Call {@link net.sf.classifier4J.IClassifier#classify(java.lang.String) }
77   * 			or {@link net.sf.classifier4J.IClassifier#isMatch(java.lang.String) }
78   * </ol>
79   * </p>
80   *
81   * <p>For example:<br>
82   * <tt>
83   * 		IWordsDataSource wds = new SimpleWordsDataSource();<br>
84   * 		IClassifier classifier = new BayesianClassifier(wds);<br>
85   * 		System.out.println( "Matches = " + classifier.classify("This is a sentence") );
86   * </tt>
87   * </p>
88   *
89   * @author Nick Lothian
90   * @author Peter Leschev
91   *
92   */
93  public class BayesianClassifier extends AbstractCategorizedTrainableClassifier {
94  
95      IWordsDataSource wordsData;
96      ITokenizer tokenizer;
97      IStopWordProvider stopWordProvider;
98  
99      private boolean isCaseSensitive = false;
100 
101     /***
102      * Default constructor that uses the SimpleWordsDataSource & a DefaultTokenizer
103      * (set to BREAK_ON_WORD_BREAKS).
104      */
105     public BayesianClassifier() {
106         this(new SimpleWordsDataSource(), new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS));
107     }
108 
109     /***
110      * Constructor for BayesianClassifier that specifies a datasource. The
111      * DefaultTokenizer (set to BREAK_ON_WORD_BREAKS) will be used.
112      *
113      * @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource}
114      */
115     public BayesianClassifier(IWordsDataSource wd) {
116         this(wd, new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS));
117     }
118 
119     /***
120      * Constructor for BayesianClassifier that specifies a datasource & tokenizer
121      *
122      * @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource}
123      * @param tokenizer a {@link net.sf.classifier4J.ITokenizer}
124      */
125     public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer) {
126         this(wd, tokenizer, new DefaultStopWordsProvider());
127     }
128 
129     /***
130      * Constructor for BayesianClassifier that specifies a datasource, tokenizer
131      * and stop words provider
132      *
133      * @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource}
134      * @param tokenizer a {@link net.sf.classifier4J.ITokenizer}
135      * @param swp a {@link net.sf.classifier4J.IStopWordProvider}
136      */
137     public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer, IStopWordProvider swp) {
138         if (wd == null) {
139             throw new IllegalArgumentException("IWordsDataSource can't be null");
140         }
141         this.wordsData = wd;
142 
143         if (tokenizer == null) {
144             throw new IllegalArgumentException("ITokenizer can't be null");
145         }
146         this.tokenizer = tokenizer;
147 
148         if (swp == null) {
149             throw new IllegalArgumentException("IStopWordProvider can't be null");
150         }
151         this.stopWordProvider = swp;
152     }
153 
154     /***
155      * @see net.sf.classifier4J.ICategorisedClassifier#isMatch(java.lang.String, java.lang.String)
156      */
157     public boolean isMatch(String category, String input) throws WordsDataSourceException {
158         return isMatch(category, tokenizer.tokenize(input));
159     }
160 
161     /***
162      * @see net.sf.classifier4J.ICategorisedClassifier#classify(java.lang.String, java.lang.String)
163      */
164     public double classify(String category, String input) throws WordsDataSourceException {
165         if (category == null) {
166             throw new IllegalArgumentException("category cannot be null");
167         }
168         if (input == null) {
169             throw new IllegalArgumentException("input cannot be null");
170         }
171 
172         checkCategoriesSupported(category);
173 
174         return classify(category, tokenizer.tokenize(input));
175     }
176 
177     public void teachMatch(String category, String input) throws WordsDataSourceException {
178         if (category == null) {
179             throw new IllegalArgumentException("category cannot be null");
180         }
181 
182         if (input == null) {
183             throw new IllegalArgumentException("input cannot be null");
184         }
185 
186         checkCategoriesSupported(category);
187 
188         teachMatch(category, tokenizer.tokenize(input));
189     }
190 
191     public void teachNonMatch(String category, String input) throws WordsDataSourceException {
192         if (category == null) {
193             throw new IllegalArgumentException("category cannot be null");
194         }
195 
196         if (input == null) {
197             throw new IllegalArgumentException("input cannot be null");
198         }
199 
200         checkCategoriesSupported(category);
201 
202         teachNonMatch(category, tokenizer.tokenize(input));
203     }
204 
205     protected boolean isMatch(String category, String input[]) throws WordsDataSourceException {
206         if (category == null) {
207             throw new IllegalArgumentException("category cannot be null");
208         }
209 
210         if (input == null) {
211             throw new IllegalArgumentException("input cannot be null");
212         }
213 
214         checkCategoriesSupported(category);
215 
216         double matchProbability = classify(category, input);
217 
218         return (matchProbability >= cutoff);
219     }
220 
221     protected double classify(String category, String words[]) throws WordsDataSourceException {
222         WordProbability[] wps = calcWordsProbability(category, words);
223         return normaliseSignificance(calculateOverallProbability(wps));
224     }
225 
226     protected void teachMatch(String category, String words[]) throws WordsDataSourceException {
227         boolean categorise = false;
228         if (wordsData instanceof ICategorisedWordsDataSource) {
229             categorise = true;
230         }
231         for (int i = 0; i <= words.length - 1; i++) {
232             if (isClassifiableWord(words[i])) {
233                 if (categorise) {
234                     ((ICategorisedWordsDataSource) wordsData).addMatch(category, transformWord(words[i]));
235                 } else {
236                     wordsData.addMatch(transformWord(words[i]));
237                 }
238             }
239         }
240     }
241 
242     protected void teachNonMatch(String category, String words[]) throws WordsDataSourceException {
243         boolean categorise = false;
244         if (wordsData instanceof ICategorisedWordsDataSource) {
245             categorise = true;
246         }
247 
248         for (int i = 0; i <= words.length - 1; i++) {
249             if (isClassifiableWord(words[i])) {
250                 if (categorise) {
251                     ((ICategorisedWordsDataSource) wordsData).addNonMatch(category, transformWord(words[i]));
252                 } else {
253                     wordsData.addNonMatch(transformWord(words[i]));
254                 }
255 
256             }
257         }
258     }
259 
260     /***
261      * Allows transformations to be done to word.
262      * This implementation transforms the word to lowercase if the classifier
263      * is in case-insenstive mode.
264      *
265      * @param word
266      * @return the transformed word
267      * @throws IllegalArgumentException if a null is passed
268      */
269     protected String transformWord(String word) {
270         if (word != null) {
271             if (!isCaseSensitive) {
272                 return word.toLowerCase();
273             } else {
274                 return word;
275             }
276         } else {
277             throw new IllegalArgumentException("Null cannot be passed");
278         }
279     }
280 
281     /***
282      *
283      * NOTE: Override this method with care. There is a good chance it will be removed
284      * or have signature changes is later versions.
285      *
286      * <br />
287      * @todo need an option to only use the "X" most "important" words when calculating overall probability
288      * "important" is defined as being most distant from NEUTAL_PROBABILITY
289      */
290     protected double calculateOverallProbability(WordProbability[] wps) {
291         if (wps == null || wps.length == 0) {
292             return IClassifier.NEUTRAL_PROBABILITY;
293         } else {
294             // we need to calculate xy/(xy + z)
295             // where z = (1-x)(1-y)
296 
297             // firstly, calculate z and xy
298             double z = 0d;
299             double xy = 0d;
300             for (int i = 0; i < wps.length; i++) {
301                 if (z == 0) {
302                     z = (1 - wps[i].getProbability());
303                 } else {
304                     z = z * (1 - wps[i].getProbability());
305                 }
306 
307                 if (xy == 0) {
308                     xy = wps[i].getProbability();
309                 } else {
310                     xy = xy * wps[i].getProbability();
311                 }
312             }
313 
314             double numerator = xy;
315             double denominator = xy + z;
316 
317             return numerator / denominator;
318         }
319     }
320 
321     private WordProbability[] calcWordsProbability(String category, String[] words) throws WordsDataSourceException {
322         if (category == null) {
323             throw new IllegalArgumentException("category cannont be null");
324         }
325 
326         boolean categorise = false;
327         if (wordsData instanceof ICategorisedWordsDataSource) {
328             categorise = true;
329         }
330 
331         checkCategoriesSupported(category);
332 
333         if (words == null) {
334             return new WordProbability[0];
335         } else {
336             List wps = new ArrayList();
337             for (int i = 0; i < words.length; i++) {
338                 if (isClassifiableWord(words[i])) {
339                     WordProbability wp = null;
340                     if (categorise) {
341                         wp = ((ICategorisedWordsDataSource) wordsData).getWordProbability(category, transformWord(words[i]));
342                     } else {
343                         wp = wordsData.getWordProbability(transformWord(words[i]));
344                     }
345                     if (wp != null) {
346                         wps.add(wp);
347                     }
348                 }
349             }
350             return (WordProbability[]) wps.toArray(new WordProbability[wps.size()]);
351         }
352     }
353 
354     private void checkCategoriesSupported(String category) {
355         // if the category is not the default
356         if (!ICategorisedClassifier.DEFAULT_CATEGORY.equals(category)) {
357             // and the data source does not support categories
358             if (!(wordsData instanceof ICategorisedWordsDataSource)) {
359                 // throw an IllegalArgumentException
360                 throw new IllegalArgumentException("Word Data Source does not support non-default categories.");
361             }
362         }
363     }
364 
365     private boolean isClassifiableWord(String word) {
366         if (word == null || "".equals(word) || stopWordProvider.isStopWord(word)) {
367             return false;
368         } else {
369             return true;
370         }
371     }
372 
373     protected static double normaliseSignificance(double sig) {
374 
375         if (Double.compare(IClassifier.UPPER_BOUND, sig) < 0) {
376             return IClassifier.UPPER_BOUND;
377         } else if (Double.compare(IClassifier.LOWER_BOUND, sig) > 0) {
378             return IClassifier.LOWER_BOUND;
379         } else {
380             return sig;
381         }
382     }
383     /***
384      * @return true if the classifier is case sensitive, false otherwise
385      * (false by default)
386      */
387     public boolean isCaseSensitive() {
388         return isCaseSensitive;
389     }
390 
391     /***
392      * @param b True if the classifier should be case sensitive, false otherwise
393      */
394     public void setCaseSensitive(boolean b) {
395         isCaseSensitive = b;
396     }
397 
398     /***
399      * @return the {@link net.sf.classifier4J.bayesian.IWordsDataSource} used
400      * by this classifier
401      */
402     public IWordsDataSource getWordsDataSource() {
403         return wordsData;
404     }
405 
406     /***
407      * @return the {@link net.sf.classifier4J.ITokenizer} used
408      * by this classifier
409      */
410     public ITokenizer getTokenizer() {
411         return tokenizer;
412     }
413 
414     /***
415      * @return the {@link net.sf.classifier4J.IStopWordProvider} used
416      * by this classifier
417      */
418     public IStopWordProvider getStopWordProvider() {
419         return stopWordProvider;
420     }
421 
422     public String toString() {
423         return new ToStringBuilder(this).append("IWordsDataSource", wordsData).append("ITokenizer", tokenizer).append("IStopWordProvider", stopWordProvider).toString();
424     }
425 
426 }