| %line | %branch | |||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| net.sf.classifier4J.bayesian.BayesianClassifier |
|
|
| 1 | /* |
|
| 2 | * ==================================================================== |
|
| 3 | * |
|
| 4 | * The Apache Software License, Version 1.1 |
|
| 5 | * |
|
| 6 | * Copyright (c) 2003 Nick Lothian. All rights reserved. |
|
| 7 | * |
|
| 8 | * Redistribution and use in source and binary forms, with or without |
|
| 9 | * modification, are permitted provided that the following conditions |
|
| 10 | * are met: |
|
| 11 | * |
|
| 12 | * 1. Redistributions of source code must retain the above copyright |
|
| 13 | * notice, this list of conditions and the following disclaimer. |
|
| 14 | * |
|
| 15 | * 2. Redistributions in binary form must reproduce the above copyright |
|
| 16 | * notice, this list of conditions and the following disclaimer in |
|
| 17 | * the documentation and/or other materials provided with the |
|
| 18 | * distribution. |
|
| 19 | * |
|
| 20 | * 3. The end-user documentation included with the redistribution, if |
|
| 21 | * any, must include the following acknowlegement: |
|
| 22 | * "This product includes software developed by the |
|
| 23 | * developers of Classifier4J (http://classifier4j.sf.net/)." |
|
| 24 | * Alternately, this acknowlegement may appear in the software itself, |
|
| 25 | * if and wherever such third-party acknowlegements normally appear. |
|
| 26 | * |
|
| 27 | * 4. The name "Classifier4J" must not be used to endorse or promote |
|
| 28 | * products derived from this software without prior written |
|
| 29 | * permission. For written permission, please contact |
|
| 30 | * http://sourceforge.net/users/nicklothian/. |
|
| 31 | * |
|
| 32 | * 5. Products derived from this software may not be called |
|
| 33 | * "Classifier4J", nor may "Classifier4J" appear in their names |
|
| 34 | * without prior written permission. For written permission, please |
|
| 35 | * contact http://sourceforge.net/users/nicklothian/. |
|
| 36 | * |
|
| 37 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED |
|
| 38 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
|
| 39 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
|
| 40 | * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR |
|
| 41 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
| 42 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
| 43 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
|
| 44 | * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|
| 45 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
|
| 46 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
|
| 47 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
|
| 48 | * SUCH DAMAGE. |
|
| 49 | * ==================================================================== |
|
| 50 | */ |
|
| 51 | ||
| 52 | package net.sf.classifier4J.bayesian; |
|
| 53 | ||
| 54 | import java.util.ArrayList; |
|
| 55 | import java.util.List; |
|
| 56 | ||
| 57 | import net.sf.classifier4J.AbstractCategorizedTrainableClassifier; |
|
| 58 | import net.sf.classifier4J.DefaultStopWordsProvider; |
|
| 59 | import net.sf.classifier4J.DefaultTokenizer; |
|
| 60 | import net.sf.classifier4J.ICategorisedClassifier; |
|
| 61 | import net.sf.classifier4J.IClassifier; |
|
| 62 | import net.sf.classifier4J.IStopWordProvider; |
|
| 63 | import net.sf.classifier4J.ITokenizer; |
|
| 64 | import net.sf.classifier4J.util.ToStringBuilder; |
|
| 65 | ||
| 66 | /** |
|
| 67 | * |
|
| 68 | * <p>A implementation of {@link net.sf.classifier4J.IClassifier} based on Bayes' |
|
| 69 | * theorem (see http://www.wikipedia.org/wiki/Bayes_theorem).</p> |
|
| 70 | * |
|
| 71 | * <p>The basic usage pattern for this class is: |
|
| 72 | * <ol> |
|
| 73 | * <li>Create a instance of {@link net.sf.classifier4J.bayesian.IWordsDataSource}</li> |
|
| 74 | * <li>Create a new instance of BayesianClassifier, passing the IWordsDataSource |
|
| 75 | * to the constructor</li> |
|
| 76 | * <li>Call {@link net.sf.classifier4J.IClassifier#classify(java.lang.String) } |
|
| 77 | * or {@link net.sf.classifier4J.IClassifier#isMatch(java.lang.String) } |
|
| 78 | * </ol> |
|
| 79 | * </p> |
|
| 80 | * |
|
| 81 | * <p>For example:<br> |
|
| 82 | * <tt> |
|
| 83 | * IWordsDataSource wds = new SimpleWordsDataSource();<br> |
|
| 84 | * IClassifier classifier = new BayesianClassifier(wds);<br> |
|
| 85 | * System.out.println( "Matches = " + classifier.classify("This is a sentence") ); |
|
| 86 | * </tt> |
|
| 87 | * </p> |
|
| 88 | * |
|
| 89 | * @author Nick Lothian |
|
| 90 | * @author Peter Leschev |
|
| 91 | * |
|
| 92 | */ |
|
| 93 | public class BayesianClassifier extends AbstractCategorizedTrainableClassifier { |
|
| 94 | ||
| 95 | IWordsDataSource wordsData; |
|
| 96 | ITokenizer tokenizer; |
|
| 97 | IStopWordProvider stopWordProvider; |
|
| 98 | ||
| 99 | 16 | private boolean isCaseSensitive = false; |
| 100 | ||
| 101 | /** |
|
| 102 | * Default constructor that uses the SimpleWordsDataSource & a DefaultTokenizer |
|
| 103 | * (set to BREAK_ON_WORD_BREAKS). |
|
| 104 | */ |
|
| 105 | public BayesianClassifier() { |
|
| 106 | 8 | this(new SimpleWordsDataSource(), class="keyword">new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS)); |
| 107 | 8 | } |
| 108 | ||
| 109 | /** |
|
| 110 | * Constructor for BayesianClassifier that specifies a datasource. The |
|
| 111 | * DefaultTokenizer (set to BREAK_ON_WORD_BREAKS) will be used. |
|
| 112 | * |
|
| 113 | * @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource} |
|
| 114 | */ |
|
| 115 | public BayesianClassifier(IWordsDataSource wd) { |
|
| 116 | 4 | this(wd, new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS)); |
| 117 | 4 | } |
| 118 | ||
| 119 | /** |
|
| 120 | * Constructor for BayesianClassifier that specifies a datasource & tokenizer |
|
| 121 | * |
|
| 122 | * @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource} |
|
| 123 | * @param tokenizer a {@link net.sf.classifier4J.ITokenizer} |
|
| 124 | */ |
|
| 125 | public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer) { |
|
| 126 | 14 | this(wd, tokenizer, new DefaultStopWordsProvider()); |
| 127 | 14 | } |
| 128 | ||
| 129 | /** |
|
| 130 | * Constructor for BayesianClassifier that specifies a datasource, tokenizer |
|
| 131 | * and stop words provider |
|
| 132 | * |
|
| 133 | * @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource} |
|
| 134 | * @param tokenizer a {@link net.sf.classifier4J.ITokenizer} |
|
| 135 | * @param swp a {@link net.sf.classifier4J.IStopWordProvider} |
|
| 136 | */ |
|
| 137 | 16 | public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer, IStopWordProvider swp) { |
| 138 | 16 | if (wd == null) { |
| 139 | 0 | throw new IllegalArgumentException("IWordsDataSource can't be null"); |
| 140 | } |
|
| 141 | 16 | this.wordsData = wd; |
| 142 | ||
| 143 | 16 | if (tokenizer == null) { |
| 144 | 0 | throw new IllegalArgumentException("ITokenizer can't be null"); |
| 145 | } |
|
| 146 | 16 | this.tokenizer = tokenizer; |
| 147 | ||
| 148 | 16 | if (swp == null) { |
| 149 | 0 | throw new IllegalArgumentException("IStopWordProvider can't be null"); |
| 150 | } |
|
| 151 | 16 | this.stopWordProvider = swp; |
| 152 | 16 | } |
| 153 | ||
| 154 | /** |
|
| 155 | * @see net.sf.classifier4J.ICategorisedClassifier#isMatch(java.lang.String, java.lang.String) |
|
| 156 | */ |
|
| 157 | public boolean isMatch(String category, String input) throws WordsDataSourceException { |
|
| 158 | 0 | return isMatch(category, tokenizer.tokenize(input)); |
| 159 | } |
|
| 160 | ||
| 161 | /** |
|
| 162 | * @see net.sf.classifier4J.ICategorisedClassifier#classify(java.lang.String, java.lang.String) |
|
| 163 | */ |
|
| 164 | public double classify(String category, String input) throws WordsDataSourceException { |
|
| 165 | 0 | if (category == null) { |
| 166 | 0 | throw new IllegalArgumentException("category cannot be null"); |
| 167 | } |
|
| 168 | 0 | if (input == null) { |
| 169 | 0 | throw new IllegalArgumentException("input cannot be null"); |
| 170 | } |
|
| 171 | ||
| 172 | 0 | checkCategoriesSupported(category); |
| 173 | ||
| 174 | 0 | return classify(category, tokenizer.tokenize(input)); |
| 175 | } |
|
| 176 | ||
| 177 | public void teachMatch(String category, String input) throws WordsDataSourceException { |
|
| 178 | 0 | if (category == null) { |
| 179 | 0 | throw new IllegalArgumentException("category cannot be null"); |
| 180 | } |
|
| 181 | ||
| 182 | 0 | if (input == null) { |
| 183 | 0 | throw new IllegalArgumentException("input cannot be null"); |
| 184 | } |
|
| 185 | ||
| 186 | 0 | checkCategoriesSupported(category); |
| 187 | ||
| 188 | 0 | teachMatch(category, tokenizer.tokenize(input)); |
| 189 | 0 | } |
| 190 | ||
| 191 | public void teachNonMatch(String category, String input) throws WordsDataSourceException { |
|
| 192 | 0 | if (category == null) { |
| 193 | 0 | throw new IllegalArgumentException("category cannot be null"); |
| 194 | } |
|
| 195 | ||
| 196 | 0 | if (input == null) { |
| 197 | 0 | throw new IllegalArgumentException("input cannot be null"); |
| 198 | } |
|
| 199 | ||
| 200 | 0 | checkCategoriesSupported(category); |
| 201 | ||
| 202 | 0 | teachNonMatch(category, tokenizer.tokenize(input)); |
| 203 | 0 | } |
| 204 | ||
| 205 | protected boolean isMatch(String category, String input[]) throws WordsDataSourceException { |
|
| 206 | 10 | if (category == null) { |
| 207 | 0 | throw new IllegalArgumentException("category cannot be null"); |
| 208 | } |
|
| 209 | ||
| 210 | 10 | if (input == null) { |
| 211 | 0 | throw new IllegalArgumentException("input cannot be null"); |
| 212 | } |
|
| 213 | ||
| 214 | 10 | checkCategoriesSupported(category); |
| 215 | ||
| 216 | 10 | double matchProbability = classify(category, input); |
| 217 | ||
| 218 | 10 | return (matchProbability >= cutoff); |
| 219 | } |
|
| 220 | ||
| 221 | protected double classify(String category, String words[]) throws WordsDataSourceException { |
|
| 222 | 14 | WordProbability[] wps = calcWordsProbability(category, words); |
| 223 | 14 | return normaliseSignificance(calculateOverallProbability(wps)); |
| 224 | } |
|
| 225 | ||
| 226 | protected void teachMatch(String category, String words[]) throws WordsDataSourceException { |
|
| 227 | 6 | boolean categorise = false; |
| 228 | 6 | if (wordsData instanceof ICategorisedWordsDataSource) { |
| 229 | 0 | categorise = true; |
| 230 | } |
|
| 231 | 510 | for (int i = 0; i <= words.length - 1; i++) { |
| 232 | 504 | if (isClassclass="keyword">ifiableWord(words[i])) { |
| 233 | 334 | if (categorise) { |
| 234 | 0 | ((ICategorisedWordsDataSource) wordsData).addMatch(category, transformWord(words[i])); |
| 235 | } else { |
|
| 236 | 334 | wordsData.addMatch(transformWord(words[i])); |
| 237 | } |
|
| 238 | } |
|
| 239 | } |
|
| 240 | 6 | } |
| 241 | ||
| 242 | protected void teachNonMatch(String category, String words[]) throws WordsDataSourceException { |
|
| 243 | 4 | boolean categorise = false; |
| 244 | 4 | if (wordsData instanceof ICategorisedWordsDataSource) { |
| 245 | 0 | categorise = true; |
| 246 | } |
|
| 247 | ||
| 248 | 318 | for (int i = 0; i <= words.length - 1; i++) { |
| 249 | 314 | if (isClassclass="keyword">ifiableWord(words[i])) { |
| 250 | 202 | if (categorise) { |
| 251 | 0 | ((ICategorisedWordsDataSource) wordsData).addNonMatch(category, transformWord(words[i])); |
| 252 | } else { |
|
| 253 | 202 | wordsData.addNonMatch(transformWord(words[i])); |
| 254 | } |
|
| 255 | ||
| 256 | } |
|
| 257 | } |
|
| 258 | 4 | } |
| 259 | ||
| 260 | /** |
|
| 261 | * Allows transformations to be done to word. |
|
| 262 | * This implementation transforms the word to lowercase if the classifier |
|
| 263 | * is in case-insenstive mode. |
|
| 264 | * |
|
| 265 | * @param word |
|
| 266 | * @return the transformed word |
|
| 267 | * @throws IllegalArgumentException if a null is passed |
|
| 268 | */ |
|
| 269 | protected String transformWord(String word) { |
|
| 270 | 1092 | if (word != null) { |
| 271 | 1090 | if (!isCaseSensitive) { |
| 272 | 1086 | return word.toLowerCase(); |
| 273 | } else { |
|
| 274 | 4 | return word; |
| 275 | } |
|
| 276 | } else { |
|
| 277 | 2 | throw new IllegalArgumentException("Null cannot be passed"); |
| 278 | } |
|
| 279 | } |
|
| 280 | ||
| 281 | /** |
|
| 282 | * |
|
| 283 | * NOTE: Override this method with care. There is a good chance it will be removed |
|
| 284 | * or have signature changes is later versions. |
|
| 285 | * |
|
| 286 | * <br /> |
|
| 287 | * @todo need an option to only use the "X" most "important" words when calculating overall probability |
|
| 288 | * "important" is defined as being most distant from NEUTAL_PROBABILITY |
|
| 289 | */ |
|
| 290 | protected double calculateOverallProbability(WordProbability[] wps) { |
|
| 291 | 16 | if (wps == null || wps.length == 0) { |
| 292 | 2 | return IClassifier.NEUTRAL_PROBABILITY; |
| 293 | } else { |
|
| 294 | // we need to calculate xy/(xy + z) |
|
| 295 | // where z = (1-x)(1-y) |
|
| 296 | ||
| 297 | // firstly, calculate z and xy |
|
| 298 | 14 | double z = 0d; |
| 299 | 14 | double xy = 0d; |
| 300 | 562 | for (int i = 0; i < wps.length; i++) { |
| 301 | 548 | if (z == 0) { |
| 302 | 14 | z = (1 - wps[i].getProbability()); |
| 303 | } else { |
|
| 304 | 534 | z = z * (1 - wps[i].getProbability()); |
| 305 | } |
|
| 306 | ||
| 307 | 548 | if (xy == 0) { |
| 308 | 14 | xy = wps[i].getProbability(); |
| 309 | } else { |
|
| 310 | 534 | xy = xy * wps[i].getProbability(); |
| 311 | } |
|
| 312 | } |
|
| 313 | ||
| 314 | 14 | double numerator = xy; |
| 315 | 14 | double denominator = xy + z; |
| 316 | ||
| 317 | 14 | return numerator / denominator; |
| 318 | } |
|
| 319 | } |
|
| 320 | ||
| 321 | private WordProbability[] calcWordsProbability(String category, String[] words) throws WordsDataSourceException { |
|
| 322 | 14 | if (category == null) { |
| 323 | 0 | throw new IllegalArgumentException("category cannont be null"); |
| 324 | } |
|
| 325 | ||
| 326 | 14 | boolean categorise = false; |
| 327 | 14 | if (wordsData instanceof ICategorisedWordsDataSource) { |
| 328 | 0 | categorise = true; |
| 329 | } |
|
| 330 | ||
| 331 | 14 | checkCategoriesSupported(category); |
| 332 | ||
| 333 | 14 | if (words == null) { |
| 334 | 0 | return new WordProbability[0]; |
| 335 | } else { |
|
| 336 | 14 | List wps = new ArrayList(); |
| 337 | 856 | for (int i = 0; i < words.length; i++) { |
| 338 | 842 | if (isClassclass="keyword">ifiableWord(words[i])) { |
| 339 | 548 | WordProbability wp = null; |
| 340 | 548 | if (categorise) { |
| 341 | 0 | wp = ((ICategorisedWordsDataSource) wordsData).getWordProbability(category, transformWord(words[i])); |
| 342 | } else { |
|
| 343 | 548 | wp = wordsData.getWordProbability(transformWord(words[i])); |
| 344 | } |
|
| 345 | 548 | if (wp != null) { |
| 346 | 542 | wps.add(wp); |
| 347 | } |
|
| 348 | } |
|
| 349 | } |
|
| 350 | 14 | return (WordProbability[]) wps.toArray(new WordProbability[wps.size()]); |
| 351 | } |
|
| 352 | } |
|
| 353 | ||
| 354 | private void checkCategoriesSupported(String category) { |
|
| 355 | // if the category is not the default |
|
| 356 | 24 | if (!ICategorisedClassclass="keyword">ifier.DEFAULT_CATEGORY.equals(category)) { |
| 357 | // and the data source does not support categories |
|
| 358 | 0 | if (!(wordsData instanceof ICategorisedWordsDataSource)) { |
| 359 | // throw an IllegalArgumentException |
|
| 360 | 0 | throw new IllegalArgumentException("Word Data Source does not support non-default categories."); |
| 361 | } |
|
| 362 | } |
|
| 363 | 24 | } |
| 364 | ||
| 365 | private boolean isClassifiableWord(String word) { |
|
| 366 | 1660 | if (word == null || "".equals(word) || stopWordProvider.isStopWord(word)) { |
| 367 | 576 | return false; |
| 368 | } else { |
|
| 369 | 1084 | return true; |
| 370 | } |
|
| 371 | } |
|
| 372 | ||
| 373 | protected static double normaliseSignificance(class="keyword">double sig) { |
|
| 374 | ||
| 375 | 682 | if (Double.compare(IClassclass="keyword">ifier.UPPER_BOUND, sig) < 0) { |
| 376 | 646 | return IClassifier.UPPER_BOUND; |
| 377 | 36 | } else if (Double.compare(IClassclass="keyword">ifier.LOWER_BOUND, sig) > 0) { |
| 378 | 10 | return IClassifier.LOWER_BOUND; |
| 379 | } else { |
|
| 380 | 26 | return sig; |
| 381 | } |
|
| 382 | } |
|
| 383 | /** |
|
| 384 | * @return true if the classifier is case sensitive, false otherwise |
|
| 385 | * (false by default) |
|
| 386 | */ |
|
| 387 | public boolean isCaseSensitive() { |
|
| 388 | 6 | return isCaseSensitive; |
| 389 | } |
|
| 390 | ||
| 391 | /** |
|
| 392 | * @param b True if the classifier should be case sensitive, false otherwise |
|
| 393 | */ |
|
| 394 | public void setCaseSensitive(boolean b) { |
|
| 395 | 4 | isCaseSensitive = b; |
| 396 | 4 | } |
| 397 | ||
| 398 | /** |
|
| 399 | * @return the {@link net.sf.classifier4J.bayesian.IWordsDataSource} used |
|
| 400 | * by this classifier |
|
| 401 | */ |
|
| 402 | public IWordsDataSource getWordsDataSource() { |
|
| 403 | 2 | return wordsData; |
| 404 | } |
|
| 405 | ||
| 406 | /** |
|
| 407 | * @return the {@link net.sf.classifier4J.ITokenizer} used |
|
| 408 | * by this classifier |
|
| 409 | */ |
|
| 410 | public ITokenizer getTokenizer() { |
|
| 411 | 2 | return tokenizer; |
| 412 | } |
|
| 413 | ||
| 414 | /** |
|
| 415 | * @return the {@link net.sf.classifier4J.IStopWordProvider} used |
|
| 416 | * by this classifier |
|
| 417 | */ |
|
| 418 | public IStopWordProvider getStopWordProvider() { |
|
| 419 | 2 | return stopWordProvider; |
| 420 | } |
|
| 421 | ||
| 422 | public String toString() { |
|
| 423 | 0 | return new ToStringBuilder(this).append("IWordsDataSource", wordsData).append("ITokenizer", tokenizer).append("IStopWordProvider", stopWordProvider).toString(); |
| 424 | } |
|
| 425 | ||
| 426 | } |
| This report is generated by jcoverage, Maven and Maven JCoverage Plugin. |