1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52 package net.sf.classifier4J.bayesian;
53
54 import java.util.ArrayList;
55 import java.util.List;
56
57 import net.sf.classifier4J.AbstractCategorizedTrainableClassifier;
58 import net.sf.classifier4J.DefaultStopWordsProvider;
59 import net.sf.classifier4J.DefaultTokenizer;
60 import net.sf.classifier4J.ICategorisedClassifier;
61 import net.sf.classifier4J.IClassifier;
62 import net.sf.classifier4J.IStopWordProvider;
63 import net.sf.classifier4J.ITokenizer;
64 import net.sf.classifier4J.util.ToStringBuilder;
65
66 /***
67 *
68 * <p>A implementation of {@link net.sf.classifier4J.IClassifier} based on Bayes'
69 * theorem (see http://www.wikipedia.org/wiki/Bayes_theorem).</p>
70 *
71 * <p>The basic usage pattern for this class is:
72 * <ol>
73 * <li>Create a instance of {@link net.sf.classifier4J.bayesian.IWordsDataSource}</li>
74 * <li>Create a new instance of BayesianClassifier, passing the IWordsDataSource
75 * to the constructor</li>
76 * <li>Call {@link net.sf.classifier4J.IClassifier#classify(java.lang.String) }
77 * or {@link net.sf.classifier4J.IClassifier#isMatch(java.lang.String) }
78 * </ol>
79 * </p>
80 *
81 * <p>For example:<br>
82 * <tt>
83 * IWordsDataSource wds = new SimpleWordsDataSource();<br>
84 * IClassifier classifier = new BayesianClassifier(wds);<br>
85 * System.out.println( "Matches = " + classifier.classify("This is a sentence") );
86 * </tt>
87 * </p>
88 *
89 * @author Nick Lothian
90 * @author Peter Leschev
91 *
92 */
93 public class BayesianClassifier extends AbstractCategorizedTrainableClassifier {
94
95 IWordsDataSource wordsData;
96 ITokenizer tokenizer;
97 IStopWordProvider stopWordProvider;
98
99 private boolean isCaseSensitive = false;
100
101 /***
102 * Default constructor that uses the SimpleWordsDataSource & a DefaultTokenizer
103 * (set to BREAK_ON_WORD_BREAKS).
104 */
105 public BayesianClassifier() {
106 this(new SimpleWordsDataSource(), new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS));
107 }
108
109 /***
110 * Constructor for BayesianClassifier that specifies a datasource. The
111 * DefaultTokenizer (set to BREAK_ON_WORD_BREAKS) will be used.
112 *
113 * @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource}
114 */
115 public BayesianClassifier(IWordsDataSource wd) {
116 this(wd, new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS));
117 }
118
119 /***
120 * Constructor for BayesianClassifier that specifies a datasource & tokenizer
121 *
122 * @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource}
123 * @param tokenizer a {@link net.sf.classifier4J.ITokenizer}
124 */
125 public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer) {
126 this(wd, tokenizer, new DefaultStopWordsProvider());
127 }
128
129 /***
130 * Constructor for BayesianClassifier that specifies a datasource, tokenizer
131 * and stop words provider
132 *
133 * @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource}
134 * @param tokenizer a {@link net.sf.classifier4J.ITokenizer}
135 * @param swp a {@link net.sf.classifier4J.IStopWordProvider}
136 */
137 public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer, IStopWordProvider swp) {
138 if (wd == null) {
139 throw new IllegalArgumentException("IWordsDataSource can't be null");
140 }
141 this.wordsData = wd;
142
143 if (tokenizer == null) {
144 throw new IllegalArgumentException("ITokenizer can't be null");
145 }
146 this.tokenizer = tokenizer;
147
148 if (swp == null) {
149 throw new IllegalArgumentException("IStopWordProvider can't be null");
150 }
151 this.stopWordProvider = swp;
152 }
153
154 /***
155 * @see net.sf.classifier4J.ICategorisedClassifier#isMatch(java.lang.String, java.lang.String)
156 */
157 public boolean isMatch(String category, String input) throws WordsDataSourceException {
158 return isMatch(category, tokenizer.tokenize(input));
159 }
160
161 /***
162 * @see net.sf.classifier4J.ICategorisedClassifier#classify(java.lang.String, java.lang.String)
163 */
164 public double classify(String category, String input) throws WordsDataSourceException {
165 if (category == null) {
166 throw new IllegalArgumentException("category cannot be null");
167 }
168 if (input == null) {
169 throw new IllegalArgumentException("input cannot be null");
170 }
171
172 checkCategoriesSupported(category);
173
174 return classify(category, tokenizer.tokenize(input));
175 }
176
177 public void teachMatch(String category, String input) throws WordsDataSourceException {
178 if (category == null) {
179 throw new IllegalArgumentException("category cannot be null");
180 }
181
182 if (input == null) {
183 throw new IllegalArgumentException("input cannot be null");
184 }
185
186 checkCategoriesSupported(category);
187
188 teachMatch(category, tokenizer.tokenize(input));
189 }
190
191 public void teachNonMatch(String category, String input) throws WordsDataSourceException {
192 if (category == null) {
193 throw new IllegalArgumentException("category cannot be null");
194 }
195
196 if (input == null) {
197 throw new IllegalArgumentException("input cannot be null");
198 }
199
200 checkCategoriesSupported(category);
201
202 teachNonMatch(category, tokenizer.tokenize(input));
203 }
204
205 protected boolean isMatch(String category, String input[]) throws WordsDataSourceException {
206 if (category == null) {
207 throw new IllegalArgumentException("category cannot be null");
208 }
209
210 if (input == null) {
211 throw new IllegalArgumentException("input cannot be null");
212 }
213
214 checkCategoriesSupported(category);
215
216 double matchProbability = classify(category, input);
217
218 return (matchProbability >= cutoff);
219 }
220
221 protected double classify(String category, String words[]) throws WordsDataSourceException {
222 WordProbability[] wps = calcWordsProbability(category, words);
223 return normaliseSignificance(calculateOverallProbability(wps));
224 }
225
226 protected void teachMatch(String category, String words[]) throws WordsDataSourceException {
227 boolean categorise = false;
228 if (wordsData instanceof ICategorisedWordsDataSource) {
229 categorise = true;
230 }
231 for (int i = 0; i <= words.length - 1; i++) {
232 if (isClassifiableWord(words[i])) {
233 if (categorise) {
234 ((ICategorisedWordsDataSource) wordsData).addMatch(category, transformWord(words[i]));
235 } else {
236 wordsData.addMatch(transformWord(words[i]));
237 }
238 }
239 }
240 }
241
242 protected void teachNonMatch(String category, String words[]) throws WordsDataSourceException {
243 boolean categorise = false;
244 if (wordsData instanceof ICategorisedWordsDataSource) {
245 categorise = true;
246 }
247
248 for (int i = 0; i <= words.length - 1; i++) {
249 if (isClassifiableWord(words[i])) {
250 if (categorise) {
251 ((ICategorisedWordsDataSource) wordsData).addNonMatch(category, transformWord(words[i]));
252 } else {
253 wordsData.addNonMatch(transformWord(words[i]));
254 }
255
256 }
257 }
258 }
259
260 /***
261 * Allows transformations to be done to word.
262 * This implementation transforms the word to lowercase if the classifier
263 * is in case-insenstive mode.
264 *
265 * @param word
266 * @return the transformed word
267 * @throws IllegalArgumentException if a null is passed
268 */
269 protected String transformWord(String word) {
270 if (word != null) {
271 if (!isCaseSensitive) {
272 return word.toLowerCase();
273 } else {
274 return word;
275 }
276 } else {
277 throw new IllegalArgumentException("Null cannot be passed");
278 }
279 }
280
281 /***
282 *
283 * NOTE: Override this method with care. There is a good chance it will be removed
284 * or have signature changes is later versions.
285 *
286 * <br />
287 * @todo need an option to only use the "X" most "important" words when calculating overall probability
288 * "important" is defined as being most distant from NEUTAL_PROBABILITY
289 */
290 protected double calculateOverallProbability(WordProbability[] wps) {
291 if (wps == null || wps.length == 0) {
292 return IClassifier.NEUTRAL_PROBABILITY;
293 } else {
294
295
296
297
298 double z = 0d;
299 double xy = 0d;
300 for (int i = 0; i < wps.length; i++) {
301 if (z == 0) {
302 z = (1 - wps[i].getProbability());
303 } else {
304 z = z * (1 - wps[i].getProbability());
305 }
306
307 if (xy == 0) {
308 xy = wps[i].getProbability();
309 } else {
310 xy = xy * wps[i].getProbability();
311 }
312 }
313
314 double numerator = xy;
315 double denominator = xy + z;
316
317 return numerator / denominator;
318 }
319 }
320
321 private WordProbability[] calcWordsProbability(String category, String[] words) throws WordsDataSourceException {
322 if (category == null) {
323 throw new IllegalArgumentException("category cannont be null");
324 }
325
326 boolean categorise = false;
327 if (wordsData instanceof ICategorisedWordsDataSource) {
328 categorise = true;
329 }
330
331 checkCategoriesSupported(category);
332
333 if (words == null) {
334 return new WordProbability[0];
335 } else {
336 List wps = new ArrayList();
337 for (int i = 0; i < words.length; i++) {
338 if (isClassifiableWord(words[i])) {
339 WordProbability wp = null;
340 if (categorise) {
341 wp = ((ICategorisedWordsDataSource) wordsData).getWordProbability(category, transformWord(words[i]));
342 } else {
343 wp = wordsData.getWordProbability(transformWord(words[i]));
344 }
345 if (wp != null) {
346 wps.add(wp);
347 }
348 }
349 }
350 return (WordProbability[]) wps.toArray(new WordProbability[wps.size()]);
351 }
352 }
353
354 private void checkCategoriesSupported(String category) {
355
356 if (!ICategorisedClassifier.DEFAULT_CATEGORY.equals(category)) {
357
358 if (!(wordsData instanceof ICategorisedWordsDataSource)) {
359
360 throw new IllegalArgumentException("Word Data Source does not support non-default categories.");
361 }
362 }
363 }
364
365 private boolean isClassifiableWord(String word) {
366 if (word == null || "".equals(word) || stopWordProvider.isStopWord(word)) {
367 return false;
368 } else {
369 return true;
370 }
371 }
372
373 protected static double normaliseSignificance(double sig) {
374
375 if (Double.compare(IClassifier.UPPER_BOUND, sig) < 0) {
376 return IClassifier.UPPER_BOUND;
377 } else if (Double.compare(IClassifier.LOWER_BOUND, sig) > 0) {
378 return IClassifier.LOWER_BOUND;
379 } else {
380 return sig;
381 }
382 }
383 /***
384 * @return true if the classifier is case sensitive, false otherwise
385 * (false by default)
386 */
387 public boolean isCaseSensitive() {
388 return isCaseSensitive;
389 }
390
391 /***
392 * @param b True if the classifier should be case sensitive, false otherwise
393 */
394 public void setCaseSensitive(boolean b) {
395 isCaseSensitive = b;
396 }
397
398 /***
399 * @return the {@link net.sf.classifier4J.bayesian.IWordsDataSource} used
400 * by this classifier
401 */
402 public IWordsDataSource getWordsDataSource() {
403 return wordsData;
404 }
405
406 /***
407 * @return the {@link net.sf.classifier4J.ITokenizer} used
408 * by this classifier
409 */
410 public ITokenizer getTokenizer() {
411 return tokenizer;
412 }
413
414 /***
415 * @return the {@link net.sf.classifier4J.IStopWordProvider} used
416 * by this classifier
417 */
418 public IStopWordProvider getStopWordProvider() {
419 return stopWordProvider;
420 }
421
422 public String toString() {
423 return new ToStringBuilder(this).append("IWordsDataSource", wordsData).append("ITokenizer", tokenizer).append("IStopWordProvider", stopWordProvider).toString();
424 }
425
426 }