Coverage report

  %line %branch
net.sf.classifier4J.bayesian.BayesianClassifier
70% 
85% 

 1  
 /*
 2  
  * ====================================================================
 3  
  *
 4  
  * The Apache Software License, Version 1.1
 5  
  *
 6  
  * Copyright (c) 2003 Nick Lothian. All rights reserved.
 7  
  *
 8  
  * Redistribution and use in source and binary forms, with or without
 9  
  * modification, are permitted provided that the following conditions
 10  
  * are met:
 11  
  *
 12  
  * 1. Redistributions of source code must retain the above copyright
 13  
  *    notice, this list of conditions and the following disclaimer.
 14  
  *
 15  
  * 2. Redistributions in binary form must reproduce the above copyright
 16  
  *    notice, this list of conditions and the following disclaimer in
 17  
  *    the documentation and/or other materials provided with the
 18  
  *    distribution.
 19  
  *
 20  
  * 3. The end-user documentation included with the redistribution, if
 21  
  *    any, must include the following acknowlegement:
 22  
  *       "This product includes software developed by the
 23  
  *        developers of Classifier4J (http://classifier4j.sf.net/)."
 24  
  *    Alternately, this acknowlegement may appear in the software itself,
 25  
  *    if and wherever such third-party acknowlegements normally appear.
 26  
  *
 27  
  * 4. The name "Classifier4J" must not be used to endorse or promote
 28  
  *    products derived from this software without prior written
 29  
  *    permission. For written permission, please contact
 30  
  *    http://sourceforge.net/users/nicklothian/.
 31  
  *
 32  
  * 5. Products derived from this software may not be called
 33  
  *    "Classifier4J", nor may "Classifier4J" appear in their names
 34  
  *    without prior written permission. For written permission, please
 35  
  *    contact http://sourceforge.net/users/nicklothian/.
 36  
  *
 37  
  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 38  
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 39  
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 40  
  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 41  
  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 42  
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 43  
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 44  
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 45  
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 46  
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 47  
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 48  
  * SUCH DAMAGE.
 49  
  * ====================================================================
 50  
  */
 51  
 
 52  
 package net.sf.classifier4J.bayesian;
 53  
 
 54  
 import java.util.ArrayList;
 55  
 import java.util.List;
 56  
 
 57  
 import net.sf.classifier4J.AbstractCategorizedTrainableClassifier;
 58  
 import net.sf.classifier4J.DefaultStopWordsProvider;
 59  
 import net.sf.classifier4J.DefaultTokenizer;
 60  
 import net.sf.classifier4J.ICategorisedClassifier;
 61  
 import net.sf.classifier4J.IClassifier;
 62  
 import net.sf.classifier4J.IStopWordProvider;
 63  
 import net.sf.classifier4J.ITokenizer;
 64  
 import net.sf.classifier4J.util.ToStringBuilder;
 65  
 
 66  
 /**
 67  
  *
 68  
  * <p>An implementation of {@link net.sf.classifier4J.IClassifier} based on Bayes'
 69  
  * theorem (see http://www.wikipedia.org/wiki/Bayes_theorem).</p>
 70  
  *
 71  
  * <p>The basic usage pattern for this class is:
 72  
  * <ol>
 73  
  * 		<li>Create an instance of {@link net.sf.classifier4J.bayesian.IWordsDataSource}</li>
 74  
  * 		<li>Create a new instance of BayesianClassifier, passing the IWordsDataSource
 75  
  * 			to the constructor</li>
 76  
  * 		<li>Call {@link net.sf.classifier4J.IClassifier#classify(java.lang.String) }
 77  
  * 			or {@link net.sf.classifier4J.IClassifier#isMatch(java.lang.String) }
 78  
  * </ol>
 79  
  * </p>
 80  
  *
 81  
  * <p>For example:<br>
 82  
  * <tt>
 83  
  * 		IWordsDataSource wds = new SimpleWordsDataSource();<br>
 84  
  * 		IClassifier classifier = new BayesianClassifier(wds);<br>
 85  
  * 		System.out.println( "Matches = " + classifier.classify("This is a sentence") );
 86  
  * </tt>
 87  
  * </p>
 88  
  *
 89  
  * @author Nick Lothian
 90  
  * @author Peter Leschev
 91  
  *
 92  
  */
 93  
 public class BayesianClassifier extends AbstractCategorizedTrainableClassifier {
 94  
 
 95  
     IWordsDataSource wordsData;
 96  
     ITokenizer tokenizer;
 97  
     IStopWordProvider stopWordProvider;
 98  
 
 99  16
     private boolean isCaseSensitive = false;
 100  
 
 101  
     /**
 102  
      * Default constructor that uses the SimpleWordsDataSource & a DefaultTokenizer
 103  
      * (set to BREAK_ON_WORD_BREAKS).
 104  
      */
 105  
     public BayesianClassifier() {
 106  8
         this(new SimpleWordsDataSource(), new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS));
 107  8
     }
 108  
 
 109  
     /**
 110  
      * Constructor for BayesianClassifier that specifies a datasource. The
 111  
      * DefaultTokenizer (set to BREAK_ON_WORD_BREAKS) will be used.
 112  
      *
 113  
      * @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource}
 114  
      */
 115  
     public BayesianClassifier(IWordsDataSource wd) {
 116  4
         this(wd, new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS));
 117  4
     }
 118  
 
 119  
     /**
 120  
      * Constructor for BayesianClassifier that specifies a datasource & tokenizer
 121  
      *
 122  
      * @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource}
 123  
      * @param tokenizer a {@link net.sf.classifier4J.ITokenizer}
 124  
      */
 125  
     public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer) {
 126  14
         this(wd, tokenizer, new DefaultStopWordsProvider());
 127  14
     }
 128  
 
 129  
     /**
 130  
      * Constructor for BayesianClassifier that specifies a datasource, tokenizer
 131  
      * and stop words provider
 132  
      *
 133  
      * @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource}
 134  
      * @param tokenizer a {@link net.sf.classifier4J.ITokenizer}
 135  
      * @param swp a {@link net.sf.classifier4J.IStopWordProvider}
 136  
      */
 137  16
     public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer, IStopWordProvider swp) {
 138  16
         if (wd == null) {
 139  0
             throw new IllegalArgumentException("IWordsDataSource can't be null");
 140  
         }
 141  16
         this.wordsData = wd;
 142  
 
 143  16
         if (tokenizer == null) {
 144  0
             throw new IllegalArgumentException("ITokenizer can't be null");
 145  
         }
 146  16
         this.tokenizer = tokenizer;
 147  
 
 148  16
         if (swp == null) {
 149  0
             throw new IllegalArgumentException("IStopWordProvider can't be null");
 150  
         }
 151  16
         this.stopWordProvider = swp;
 152  16
     }
 153  
 
 154  
     /**
 155  
      * @see net.sf.classifier4J.ICategorisedClassifier#isMatch(java.lang.String, java.lang.String)
 156  
      */
 157  
     public boolean isMatch(String category, String input) throws WordsDataSourceException {
 158  0
         return isMatch(category, tokenizer.tokenize(input));
 159  
     }
 160  
 
 161  
     /**
 162  
      * @see net.sf.classifier4J.ICategorisedClassifier#classify(java.lang.String, java.lang.String)
 163  
      */
 164  
     public double classify(String category, String input) throws WordsDataSourceException {
 165  0
         if (category == null) {
 166  0
             throw new IllegalArgumentException("category cannot be null");
 167  
         }
 168  0
         if (input == null) {
 169  0
             throw new IllegalArgumentException("input cannot be null");
 170  
         }
 171  
 
 172  0
         checkCategoriesSupported(category);
 173  
 
 174  0
         return classify(category, tokenizer.tokenize(input));
 175  
     }
 176  
 
 177  
     public void teachMatch(String category, String input) throws WordsDataSourceException {
 178  0
         if (category == null) {
 179  0
             throw new IllegalArgumentException("category cannot be null");
 180  
         }
 181  
 
 182  0
         if (input == null) {
 183  0
             throw new IllegalArgumentException("input cannot be null");
 184  
         }
 185  
 
 186  0
         checkCategoriesSupported(category);
 187  
 
 188  0
         teachMatch(category, tokenizer.tokenize(input));
 189  0
     }
 190  
 
 191  
     public void teachNonMatch(String category, String input) throws WordsDataSourceException {
 192  0
         if (category == null) {
 193  0
             throw new IllegalArgumentException("category cannot be null");
 194  
         }
 195  
 
 196  0
         if (input == null) {
 197  0
             throw new IllegalArgumentException("input cannot be null");
 198  
         }
 199  
 
 200  0
         checkCategoriesSupported(category);
 201  
 
 202  0
         teachNonMatch(category, tokenizer.tokenize(input));
 203  0
     }
 204  
 
 205  
     protected boolean isMatch(String category, String input[]) throws WordsDataSourceException {
 206  10
         if (category == null) {
 207  0
             throw new IllegalArgumentException("category cannot be null");
 208  
         }
 209  
 
 210  10
         if (input == null) {
 211  0
             throw new IllegalArgumentException("input cannot be null");
 212  
         }
 213  
 
 214  10
         checkCategoriesSupported(category);
 215  
 
 216  10
         double matchProbability = classify(category, input);
 217  
 
 218  10
         return (matchProbability >= cutoff);
 219  
     }
 220  
 
 221  
     protected double classify(String category, String words[]) throws WordsDataSourceException {
 222  14
         WordProbability[] wps = calcWordsProbability(category, words);
 223  14
         return normaliseSignificance(calculateOverallProbability(wps));
 224  
     }
 225  
 
 226  
     protected void teachMatch(String category, String words[]) throws WordsDataSourceException {
 227  6
         boolean categorise = false;
 228  6
         if (wordsData instanceof ICategorisedWordsDataSource) {
 229  0
             categorise = true;
 230  
         }
 231  510
         for (int i = 0; i <= words.length - 1; i++) {
 232  504
             if (isClassifiableWord(words[i])) {
 233  334
                 if (categorise) {
 234  0
                     ((ICategorisedWordsDataSource) wordsData).addMatch(category, transformWord(words[i]));
 235  
                 } else {
 236  334
                     wordsData.addMatch(transformWord(words[i]));
 237  
                 }
 238  
             }
 239  
         }
 240  6
     }
 241  
 
 242  
     protected void teachNonMatch(String category, String words[]) throws WordsDataSourceException {
 243  4
         boolean categorise = false;
 244  4
         if (wordsData instanceof ICategorisedWordsDataSource) {
 245  0
             categorise = true;
 246  
         }
 247  
 
 248  318
         for (int i = 0; i <= words.length - 1; i++) {
 249  314
             if (isClassifiableWord(words[i])) {
 250  202
                 if (categorise) {
 251  0
                     ((ICategorisedWordsDataSource) wordsData).addNonMatch(category, transformWord(words[i]));
 252  
                 } else {
 253  202
                     wordsData.addNonMatch(transformWord(words[i]));
 254  
                 }
 255  
 
 256  
             }
 257  
         }
 258  4
     }
 259  
 
 260  
     /**
 261  
      * Allows transformations to be done to word.
 262  
      * This implementation transforms the word to lowercase if the classifier
 263  
      * is in case-insensitive mode.
 264  
      *
 265  
      * @param word
 266  
      * @return the transformed word
 267  
      * @throws IllegalArgumentException if a null is passed
 268  
      */
 269  
     protected String transformWord(String word) {
 270  1092
         if (word != null) {
 271  1090
             if (!isCaseSensitive) {
 272  1086
                 return word.toLowerCase();
 273  
             } else {
 274  4
                 return word;
 275  
             }
 276  
         } else {
 277  2
             throw new IllegalArgumentException("Null cannot be passed");
 278  
         }
 279  
     }
 280  
 
 281  
     /**
 282  
      *
 283  
      * NOTE: Override this method with care. There is a good chance it will be removed
 284  
      * or have signature changes in later versions.
 285  
      *
 286  
      * <br />
 287  
      * @todo need an option to only use the "X" most "important" words when calculating overall probability
 288  
      * "important" is defined as being most distant from NEUTRAL_PROBABILITY
 289  
      */
 290  
     protected double calculateOverallProbability(WordProbability[] wps) {
 291  16
         if (wps == null || wps.length == 0) {
 292  2
             return IClassifier.NEUTRAL_PROBABILITY;
 293  
         } else {
 294  
             // we need to calculate xy/(xy + z)
 295  
             // where z = (1-x)(1-y)
 296  
 
 297  
             // firstly, calculate z and xy
 298  14
             double z = 0d;
 299  14
             double xy = 0d;
 300  562
             for (int i = 0; i < wps.length; i++) {
 301  548
                 if (z == 0) {
 302  14
                     z = (1 - wps[i].getProbability());
 303  
                 } else {
 304  534
                     z = z * (1 - wps[i].getProbability());
 305  
                 }
 306  
 
 307  548
                 if (xy == 0) {
 308  14
                     xy = wps[i].getProbability();
 309  
                 } else {
 310  534
                     xy = xy * wps[i].getProbability();
 311  
                 }
 312  
             }
 313  
 
 314  14
             double numerator = xy;
 315  14
             double denominator = xy + z;
 316  
 
 317  14
             return numerator / denominator;
 318  
         }
 319  
     }
 320  
 
 321  
     private WordProbability[] calcWordsProbability(String category, String[] words) throws WordsDataSourceException {
 322  14
         if (category == null) {
 323  0
             throw new IllegalArgumentException("category cannont be null");
 324  
         }
 325  
 
 326  14
         boolean categorise = false;
 327  14
         if (wordsData instanceof ICategorisedWordsDataSource) {
 328  0
             categorise = true;
 329  
         }
 330  
 
 331  14
         checkCategoriesSupported(category);
 332  
 
 333  14
         if (words == null) {
 334  0
             return new WordProbability[0];
 335  
         } else {
 336  14
             List wps = new ArrayList();
 337  856
             for (int i = 0; i < words.length; i++) {
 338  842
                 if (isClassifiableWord(words[i])) {
 339  548
                     WordProbability wp = null;
 340  548
                     if (categorise) {
 341  0
                         wp = ((ICategorisedWordsDataSource) wordsData).getWordProbability(category, transformWord(words[i]));
 342  
                     } else {
 343  548
                         wp = wordsData.getWordProbability(transformWord(words[i]));
 344  
                     }
 345  548
                     if (wp != null) {
 346  542
                         wps.add(wp);
 347  
                     }
 348  
                 }
 349  
             }
 350  14
             return (WordProbability[]) wps.toArray(new WordProbability[wps.size()]);
 351  
         }
 352  
     }
 353  
 
 354  
     private void checkCategoriesSupported(String category) {
 355  
         // if the category is not the default
 356  24
         if (!ICategorisedClassifier.DEFAULT_CATEGORY.equals(category)) {
 357  
             // and the data source does not support categories
 358  0
             if (!(wordsData instanceof ICategorisedWordsDataSource)) {
 359  
                 // throw an IllegalArgumentException
 360  0
                 throw new IllegalArgumentException("Word Data Source does not support non-default categories.");
 361  
             }
 362  
         }
 363  24
     }
 364  
 
 365  
     private boolean isClassifiableWord(String word) {
 366  1660
         if (word == null || "".equals(word) || stopWordProvider.isStopWord(word)) {
 367  576
             return false;
 368  
         } else {
 369  1084
             return true;
 370  
         }
 371  
     }
 372  
 
 373  
     protected static double normaliseSignificance(double sig) {
 374  
 
 375  682
         if (Double.compare(IClassifier.UPPER_BOUND, sig) < 0) {
 376  646
             return IClassifier.UPPER_BOUND;
 377  36
         } else if (Double.compare(IClassifier.LOWER_BOUND, sig) > 0) {
 378  10
             return IClassifier.LOWER_BOUND;
 379  
         } else {
 380  26
             return sig;
 381  
         }
 382  
     }
 383  
     /**
 384  
      * @return true if the classifier is case sensitive, false otherwise
 385  
      * (false by default)
 386  
      */
 387  
     public boolean isCaseSensitive() {
 388  6
         return isCaseSensitive;
 389  
     }
 390  
 
 391  
     /**
 392  
      * @param b True if the classifier should be case sensitive, false otherwise
 393  
      */
 394  
     public void setCaseSensitive(boolean b) {
 395  4
         isCaseSensitive = b;
 396  4
     }
 397  
 
 398  
     /**
 399  
      * @return the {@link net.sf.classifier4J.bayesian.IWordsDataSource} used
 400  
      * by this classifier
 401  
      */
 402  
     public IWordsDataSource getWordsDataSource() {
 403  2
         return wordsData;
 404  
     }
 405  
 
 406  
     /**
 407  
      * @return the {@link net.sf.classifier4J.ITokenizer} used
 408  
      * by this classifier
 409  
      */
 410  
     public ITokenizer getTokenizer() {
 411  2
         return tokenizer;
 412  
     }
 413  
 
 414  
     /**
 415  
      * @return the {@link net.sf.classifier4J.IStopWordProvider} used
 416  
      * by this classifier
 417  
      */
 418  
     public IStopWordProvider getStopWordProvider() {
 419  2
         return stopWordProvider;
 420  
     }
 421  
 
 422  
     public String toString() {
 423  0
         return new ToStringBuilder(this).append("IWordsDataSource", wordsData).append("ITokenizer", tokenizer).append("IStopWordProvider", stopWordProvider).toString();
 424  
     }
 425  
 
 426  
 }

This report is generated by jcoverage, Maven and Maven JCoverage Plugin.