%line | %branch | |||||||||
---|---|---|---|---|---|---|---|---|---|---|
net.sf.classifier4J.bayesian.BayesianClassifier |
|
|
1 | /* |
|
2 | * ==================================================================== |
|
3 | * |
|
4 | * The Apache Software License, Version 1.1 |
|
5 | * |
|
6 | * Copyright (c) 2003 Nick Lothian. All rights reserved. |
|
7 | * |
|
8 | * Redistribution and use in source and binary forms, with or without |
|
9 | * modification, are permitted provided that the following conditions |
|
10 | * are met: |
|
11 | * |
|
12 | * 1. Redistributions of source code must retain the above copyright |
|
13 | * notice, this list of conditions and the following disclaimer. |
|
14 | * |
|
15 | * 2. Redistributions in binary form must reproduce the above copyright |
|
16 | * notice, this list of conditions and the following disclaimer in |
|
17 | * the documentation and/or other materials provided with the |
|
18 | * distribution. |
|
19 | * |
|
20 | * 3. The end-user documentation included with the redistribution, if |
|
21 | * any, must include the following acknowlegement: |
|
22 | * "This product includes software developed by the |
|
23 | * developers of Classifier4J (http://classifier4j.sf.net/)." |
|
24 | * Alternately, this acknowlegement may appear in the software itself, |
|
25 | * if and wherever such third-party acknowlegements normally appear. |
|
26 | * |
|
27 | * 4. The name "Classifier4J" must not be used to endorse or promote |
|
28 | * products derived from this software without prior written |
|
29 | * permission. For written permission, please contact |
|
30 | * http://sourceforge.net/users/nicklothian/. |
|
31 | * |
|
32 | * 5. Products derived from this software may not be called |
|
33 | * "Classifier4J", nor may "Classifier4J" appear in their names |
|
34 | * without prior written permission. For written permission, please |
|
35 | * contact http://sourceforge.net/users/nicklothian/. |
|
36 | * |
|
37 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED |
|
38 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
|
39 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
|
40 | * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR |
|
41 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
42 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
43 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
|
44 | * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|
45 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
|
46 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
|
47 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
|
48 | * SUCH DAMAGE. |
|
49 | * ==================================================================== |
|
50 | */ |
|
51 | ||
52 | package net.sf.classifier4J.bayesian; |
|
53 | ||
54 | import java.util.ArrayList; |
|
55 | import java.util.List; |
|
56 | ||
57 | import net.sf.classifier4J.AbstractCategorizedTrainableClassifier; |
|
58 | import net.sf.classifier4J.DefaultStopWordsProvider; |
|
59 | import net.sf.classifier4J.DefaultTokenizer; |
|
60 | import net.sf.classifier4J.ICategorisedClassifier; |
|
61 | import net.sf.classifier4J.IClassifier; |
|
62 | import net.sf.classifier4J.IStopWordProvider; |
|
63 | import net.sf.classifier4J.ITokenizer; |
|
64 | import net.sf.classifier4J.util.ToStringBuilder; |
|
65 | ||
66 | /** |
|
67 | * |
|
68 | * <p>A implementation of {@link net.sf.classifier4J.IClassifier} based on Bayes' |
|
69 | * theorem (see http://www.wikipedia.org/wiki/Bayes_theorem).</p> |
|
70 | * |
|
71 | * <p>The basic usage pattern for this class is: |
|
72 | * <ol> |
|
73 | * <li>Create a instance of {@link net.sf.classifier4J.bayesian.IWordsDataSource}</li> |
|
74 | * <li>Create a new instance of BayesianClassifier, passing the IWordsDataSource |
|
75 | * to the constructor</li> |
|
76 | * <li>Call {@link net.sf.classifier4J.IClassifier#classify(java.lang.String) } |
|
77 | * or {@link net.sf.classifier4J.IClassifier#isMatch(java.lang.String) } |
|
78 | * </ol> |
|
79 | * </p> |
|
80 | * |
|
81 | * <p>For example:<br> |
|
82 | * <tt> |
|
83 | * IWordsDataSource wds = new SimpleWordsDataSource();<br> |
|
84 | * IClassifier classifier = new BayesianClassifier(wds);<br> |
|
85 | * System.out.println( "Matches = " + classifier.classify("This is a sentence") ); |
|
86 | * </tt> |
|
87 | * </p> |
|
88 | * |
|
89 | * @author Nick Lothian |
|
90 | * @author Peter Leschev |
|
91 | * |
|
92 | */ |
|
93 | public class BayesianClassifier extends AbstractCategorizedTrainableClassifier { |
|
94 | ||
95 | IWordsDataSource wordsData; |
|
96 | ITokenizer tokenizer; |
|
97 | IStopWordProvider stopWordProvider; |
|
98 | ||
99 | 16 | private boolean isCaseSensitive = false; |
100 | ||
101 | /** |
|
102 | * Default constructor that uses the SimpleWordsDataSource & a DefaultTokenizer |
|
103 | * (set to BREAK_ON_WORD_BREAKS). |
|
104 | */ |
|
105 | public BayesianClassifier() { |
|
106 | 8 | this(new SimpleWordsDataSource(), class="keyword">new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS)); |
107 | 8 | } |
108 | ||
109 | /** |
|
110 | * Constructor for BayesianClassifier that specifies a datasource. The |
|
111 | * DefaultTokenizer (set to BREAK_ON_WORD_BREAKS) will be used. |
|
112 | * |
|
113 | * @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource} |
|
114 | */ |
|
115 | public BayesianClassifier(IWordsDataSource wd) { |
|
116 | 4 | this(wd, new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS)); |
117 | 4 | } |
118 | ||
119 | /** |
|
120 | * Constructor for BayesianClassifier that specifies a datasource & tokenizer |
|
121 | * |
|
122 | * @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource} |
|
123 | * @param tokenizer a {@link net.sf.classifier4J.ITokenizer} |
|
124 | */ |
|
125 | public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer) { |
|
126 | 14 | this(wd, tokenizer, new DefaultStopWordsProvider()); |
127 | 14 | } |
128 | ||
129 | /** |
|
130 | * Constructor for BayesianClassifier that specifies a datasource, tokenizer |
|
131 | * and stop words provider |
|
132 | * |
|
133 | * @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource} |
|
134 | * @param tokenizer a {@link net.sf.classifier4J.ITokenizer} |
|
135 | * @param swp a {@link net.sf.classifier4J.IStopWordProvider} |
|
136 | */ |
|
137 | 16 | public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer, IStopWordProvider swp) { |
138 | 16 | if (wd == null) { |
139 | 0 | throw new IllegalArgumentException("IWordsDataSource can't be null"); |
140 | } |
|
141 | 16 | this.wordsData = wd; |
142 | ||
143 | 16 | if (tokenizer == null) { |
144 | 0 | throw new IllegalArgumentException("ITokenizer can't be null"); |
145 | } |
|
146 | 16 | this.tokenizer = tokenizer; |
147 | ||
148 | 16 | if (swp == null) { |
149 | 0 | throw new IllegalArgumentException("IStopWordProvider can't be null"); |
150 | } |
|
151 | 16 | this.stopWordProvider = swp; |
152 | 16 | } |
153 | ||
154 | /** |
|
155 | * @see net.sf.classifier4J.ICategorisedClassifier#isMatch(java.lang.String, java.lang.String) |
|
156 | */ |
|
157 | public boolean isMatch(String category, String input) throws WordsDataSourceException { |
|
158 | 0 | return isMatch(category, tokenizer.tokenize(input)); |
159 | } |
|
160 | ||
161 | /** |
|
162 | * @see net.sf.classifier4J.ICategorisedClassifier#classify(java.lang.String, java.lang.String) |
|
163 | */ |
|
164 | public double classify(String category, String input) throws WordsDataSourceException { |
|
165 | 0 | if (category == null) { |
166 | 0 | throw new IllegalArgumentException("category cannot be null"); |
167 | } |
|
168 | 0 | if (input == null) { |
169 | 0 | throw new IllegalArgumentException("input cannot be null"); |
170 | } |
|
171 | ||
172 | 0 | checkCategoriesSupported(category); |
173 | ||
174 | 0 | return classify(category, tokenizer.tokenize(input)); |
175 | } |
|
176 | ||
177 | public void teachMatch(String category, String input) throws WordsDataSourceException { |
|
178 | 0 | if (category == null) { |
179 | 0 | throw new IllegalArgumentException("category cannot be null"); |
180 | } |
|
181 | ||
182 | 0 | if (input == null) { |
183 | 0 | throw new IllegalArgumentException("input cannot be null"); |
184 | } |
|
185 | ||
186 | 0 | checkCategoriesSupported(category); |
187 | ||
188 | 0 | teachMatch(category, tokenizer.tokenize(input)); |
189 | 0 | } |
190 | ||
191 | public void teachNonMatch(String category, String input) throws WordsDataSourceException { |
|
192 | 0 | if (category == null) { |
193 | 0 | throw new IllegalArgumentException("category cannot be null"); |
194 | } |
|
195 | ||
196 | 0 | if (input == null) { |
197 | 0 | throw new IllegalArgumentException("input cannot be null"); |
198 | } |
|
199 | ||
200 | 0 | checkCategoriesSupported(category); |
201 | ||
202 | 0 | teachNonMatch(category, tokenizer.tokenize(input)); |
203 | 0 | } |
204 | ||
205 | protected boolean isMatch(String category, String input[]) throws WordsDataSourceException { |
|
206 | 10 | if (category == null) { |
207 | 0 | throw new IllegalArgumentException("category cannot be null"); |
208 | } |
|
209 | ||
210 | 10 | if (input == null) { |
211 | 0 | throw new IllegalArgumentException("input cannot be null"); |
212 | } |
|
213 | ||
214 | 10 | checkCategoriesSupported(category); |
215 | ||
216 | 10 | double matchProbability = classify(category, input); |
217 | ||
218 | 10 | return (matchProbability >= cutoff); |
219 | } |
|
220 | ||
221 | protected double classify(String category, String words[]) throws WordsDataSourceException { |
|
222 | 14 | WordProbability[] wps = calcWordsProbability(category, words); |
223 | 14 | return normaliseSignificance(calculateOverallProbability(wps)); |
224 | } |
|
225 | ||
226 | protected void teachMatch(String category, String words[]) throws WordsDataSourceException { |
|
227 | 6 | boolean categorise = false; |
228 | 6 | if (wordsData instanceof ICategorisedWordsDataSource) { |
229 | 0 | categorise = true; |
230 | } |
|
231 | 510 | for (int i = 0; i <= words.length - 1; i++) { |
232 | 504 | if (isClassclass="keyword">ifiableWord(words[i])) { |
233 | 334 | if (categorise) { |
234 | 0 | ((ICategorisedWordsDataSource) wordsData).addMatch(category, transformWord(words[i])); |
235 | } else { |
|
236 | 334 | wordsData.addMatch(transformWord(words[i])); |
237 | } |
|
238 | } |
|
239 | } |
|
240 | 6 | } |
241 | ||
242 | protected void teachNonMatch(String category, String words[]) throws WordsDataSourceException { |
|
243 | 4 | boolean categorise = false; |
244 | 4 | if (wordsData instanceof ICategorisedWordsDataSource) { |
245 | 0 | categorise = true; |
246 | } |
|
247 | ||
248 | 318 | for (int i = 0; i <= words.length - 1; i++) { |
249 | 314 | if (isClassclass="keyword">ifiableWord(words[i])) { |
250 | 202 | if (categorise) { |
251 | 0 | ((ICategorisedWordsDataSource) wordsData).addNonMatch(category, transformWord(words[i])); |
252 | } else { |
|
253 | 202 | wordsData.addNonMatch(transformWord(words[i])); |
254 | } |
|
255 | ||
256 | } |
|
257 | } |
|
258 | 4 | } |
259 | ||
260 | /** |
|
261 | * Allows transformations to be done to word. |
|
262 | * This implementation transforms the word to lowercase if the classifier |
|
263 | * is in case-insenstive mode. |
|
264 | * |
|
265 | * @param word |
|
266 | * @return the transformed word |
|
267 | * @throws IllegalArgumentException if a null is passed |
|
268 | */ |
|
269 | protected String transformWord(String word) { |
|
270 | 1092 | if (word != null) { |
271 | 1090 | if (!isCaseSensitive) { |
272 | 1086 | return word.toLowerCase(); |
273 | } else { |
|
274 | 4 | return word; |
275 | } |
|
276 | } else { |
|
277 | 2 | throw new IllegalArgumentException("Null cannot be passed"); |
278 | } |
|
279 | } |
|
280 | ||
281 | /** |
|
282 | * |
|
283 | * NOTE: Override this method with care. There is a good chance it will be removed |
|
284 | * or have signature changes is later versions. |
|
285 | * |
|
286 | * <br /> |
|
287 | * @todo need an option to only use the "X" most "important" words when calculating overall probability |
|
288 | * "important" is defined as being most distant from NEUTAL_PROBABILITY |
|
289 | */ |
|
290 | protected double calculateOverallProbability(WordProbability[] wps) { |
|
291 | 16 | if (wps == null || wps.length == 0) { |
292 | 2 | return IClassifier.NEUTRAL_PROBABILITY; |
293 | } else { |
|
294 | // we need to calculate xy/(xy + z) |
|
295 | // where z = (1-x)(1-y) |
|
296 | ||
297 | // firstly, calculate z and xy |
|
298 | 14 | double z = 0d; |
299 | 14 | double xy = 0d; |
300 | 562 | for (int i = 0; i < wps.length; i++) { |
301 | 548 | if (z == 0) { |
302 | 14 | z = (1 - wps[i].getProbability()); |
303 | } else { |
|
304 | 534 | z = z * (1 - wps[i].getProbability()); |
305 | } |
|
306 | ||
307 | 548 | if (xy == 0) { |
308 | 14 | xy = wps[i].getProbability(); |
309 | } else { |
|
310 | 534 | xy = xy * wps[i].getProbability(); |
311 | } |
|
312 | } |
|
313 | ||
314 | 14 | double numerator = xy; |
315 | 14 | double denominator = xy + z; |
316 | ||
317 | 14 | return numerator / denominator; |
318 | } |
|
319 | } |
|
320 | ||
321 | private WordProbability[] calcWordsProbability(String category, String[] words) throws WordsDataSourceException { |
|
322 | 14 | if (category == null) { |
323 | 0 | throw new IllegalArgumentException("category cannont be null"); |
324 | } |
|
325 | ||
326 | 14 | boolean categorise = false; |
327 | 14 | if (wordsData instanceof ICategorisedWordsDataSource) { |
328 | 0 | categorise = true; |
329 | } |
|
330 | ||
331 | 14 | checkCategoriesSupported(category); |
332 | ||
333 | 14 | if (words == null) { |
334 | 0 | return new WordProbability[0]; |
335 | } else { |
|
336 | 14 | List wps = new ArrayList(); |
337 | 856 | for (int i = 0; i < words.length; i++) { |
338 | 842 | if (isClassclass="keyword">ifiableWord(words[i])) { |
339 | 548 | WordProbability wp = null; |
340 | 548 | if (categorise) { |
341 | 0 | wp = ((ICategorisedWordsDataSource) wordsData).getWordProbability(category, transformWord(words[i])); |
342 | } else { |
|
343 | 548 | wp = wordsData.getWordProbability(transformWord(words[i])); |
344 | } |
|
345 | 548 | if (wp != null) { |
346 | 542 | wps.add(wp); |
347 | } |
|
348 | } |
|
349 | } |
|
350 | 14 | return (WordProbability[]) wps.toArray(new WordProbability[wps.size()]); |
351 | } |
|
352 | } |
|
353 | ||
354 | private void checkCategoriesSupported(String category) { |
|
355 | // if the category is not the default |
|
356 | 24 | if (!ICategorisedClassclass="keyword">ifier.DEFAULT_CATEGORY.equals(category)) { |
357 | // and the data source does not support categories |
|
358 | 0 | if (!(wordsData instanceof ICategorisedWordsDataSource)) { |
359 | // throw an IllegalArgumentException |
|
360 | 0 | throw new IllegalArgumentException("Word Data Source does not support non-default categories."); |
361 | } |
|
362 | } |
|
363 | 24 | } |
364 | ||
365 | private boolean isClassifiableWord(String word) { |
|
366 | 1660 | if (word == null || "".equals(word) || stopWordProvider.isStopWord(word)) { |
367 | 576 | return false; |
368 | } else { |
|
369 | 1084 | return true; |
370 | } |
|
371 | } |
|
372 | ||
373 | protected static double normaliseSignificance(class="keyword">double sig) { |
|
374 | ||
375 | 682 | if (Double.compare(IClassclass="keyword">ifier.UPPER_BOUND, sig) < 0) { |
376 | 646 | return IClassifier.UPPER_BOUND; |
377 | 36 | } else if (Double.compare(IClassclass="keyword">ifier.LOWER_BOUND, sig) > 0) { |
378 | 10 | return IClassifier.LOWER_BOUND; |
379 | } else { |
|
380 | 26 | return sig; |
381 | } |
|
382 | } |
|
383 | /** |
|
384 | * @return true if the classifier is case sensitive, false otherwise |
|
385 | * (false by default) |
|
386 | */ |
|
387 | public boolean isCaseSensitive() { |
|
388 | 6 | return isCaseSensitive; |
389 | } |
|
390 | ||
391 | /** |
|
392 | * @param b True if the classifier should be case sensitive, false otherwise |
|
393 | */ |
|
394 | public void setCaseSensitive(boolean b) { |
|
395 | 4 | isCaseSensitive = b; |
396 | 4 | } |
397 | ||
398 | /** |
|
399 | * @return the {@link net.sf.classifier4J.bayesian.IWordsDataSource} used |
|
400 | * by this classifier |
|
401 | */ |
|
402 | public IWordsDataSource getWordsDataSource() { |
|
403 | 2 | return wordsData; |
404 | } |
|
405 | ||
406 | /** |
|
407 | * @return the {@link net.sf.classifier4J.ITokenizer} used |
|
408 | * by this classifier |
|
409 | */ |
|
410 | public ITokenizer getTokenizer() { |
|
411 | 2 | return tokenizer; |
412 | } |
|
413 | ||
414 | /** |
|
415 | * @return the {@link net.sf.classifier4J.IStopWordProvider} used |
|
416 | * by this classifier |
|
417 | */ |
|
418 | public IStopWordProvider getStopWordProvider() { |
|
419 | 2 | return stopWordProvider; |
420 | } |
|
421 | ||
422 | public String toString() { |
|
423 | 0 | return new ToStringBuilder(this).append("IWordsDataSource", wordsData).append("ITokenizer", tokenizer).append("IStopWordProvider", stopWordProvider).toString(); |
424 | } |
|
425 | ||
426 | } |
This report is generated by jcoverage, Maven and Maven JCoverage Plugin. |