unit tests coverage

Coverage report

%line %branch

net.sf.classifier4J.bayesian.BayesianClassifier
70%

85%

1
/*

2
* ====================================================================

3
*

4
* The Apache Software License, Version 1.1

5
*

6
* Copyright (c) 2003 Nick Lothian. All rights reserved.

7
*

8
* Redistribution and use in source and binary forms, with or without

9
* modification, are permitted provided that the following conditions

10
* are met:

11
*

12
* 1. Redistributions of source code must retain the above copyright

13
* notice, this list of conditions and the following disclaimer.

14
*

15
* 2. Redistributions in binary form must reproduce the above copyright

16
* notice, this list of conditions and the following disclaimer in

17
* the documentation and/or other materials provided with the

18
* distribution.

19
*

20
* 3. The end-user documentation included with the redistribution, if

21
* any, must include the following acknowlegement:

22
* "This product includes software developed by the

23
* developers of Classifier4J (http://classifier4j.sf.net/)."

24
* Alternately, this acknowlegement may appear in the software itself,

25
* if and wherever such third-party acknowlegements normally appear.

26
*

27
* 4. The name "Classifier4J" must not be used to endorse or promote

28
* products derived from this software without prior written

29
* permission. For written permission, please contact

30
* http://sourceforge.net/users/nicklothian/.

31
*

32
* 5. Products derived from this software may not be called

33
* "Classifier4J", nor may "Classifier4J" appear in their names

34
* without prior written permission. For written permission, please

35
* contact http://sourceforge.net/users/nicklothian/.

36
*

37
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED

38
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES

39
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

40
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR

41
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

42
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

43
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF

44
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

45
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,

46
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT

47
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

48
* SUCH DAMAGE.

49
* ====================================================================

50
*/

51

52
package net.sf.classifier4J.bayesian;

53

54
import java.util.ArrayList;

55
import java.util.List;

56

57
import net.sf.classifier4J.AbstractCategorizedTrainableClassifier;

58
import net.sf.classifier4J.DefaultStopWordsProvider;

59
import net.sf.classifier4J.DefaultTokenizer;

60
import net.sf.classifier4J.ICategorisedClassifier;

61
import net.sf.classifier4J.IClassifier;

62
import net.sf.classifier4J.IStopWordProvider;

63
import net.sf.classifier4J.ITokenizer;

64
import net.sf.classifier4J.util.ToStringBuilder;

65

66
/**

67
*

68
* An implementation of {@link net.sf.classifier4J.IClassifier} based on Bayes'

69
* theorem (see http://www.wikipedia.org/wiki/Bayes_theorem).

70
*

71
* The basic usage pattern for this class is:

72
* <ol>

73
* <li>Create an instance of {@link net.sf.classifier4J.bayesian.IWordsDataSource}</li>

74
* <li>Create a new instance of BayesianClassifier, passing the IWordsDataSource

75
* to the constructor</li>

76
* <li>Call {@link net.sf.classifier4J.IClassifier#classify(java.lang.String) }

77
* or {@link net.sf.classifier4J.IClassifier#isMatch(java.lang.String) }

78
* </ol>

79
* 

80
*

81
* For example: 

82
* <tt>

83
* IWordsDataSource wds = new SimpleWordsDataSource(); 

84
* IClassifier classifier = new BayesianClassifier(wds); 

85
* System.out.println( "Matches = " + classifier.classify("This is a sentence") );

86
* </tt>

87
* 

88
*

89
* @author Nick Lothian

90
* @author Peter Leschev

91
*

92
*/

93
public class BayesianClassifier extends AbstractCategorizedTrainableClassifier {

94

95
IWordsDataSource wordsData;

96
ITokenizer tokenizer;

97
IStopWordProvider stopWordProvider;

98

99 16
private boolean isCaseSensitive = false;

100

101
/**

102
* Default constructor that uses the SimpleWordsDataSource & a DefaultTokenizer

103
* (set to BREAK_ON_WORD_BREAKS).

104
*/

105
public BayesianClassifier() {

106 8
this(new SimpleWordsDataSource(), new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS));

107 8
}

108

109
/**

110
* Constructor for BayesianClassifier that specifies a datasource. The

111
* DefaultTokenizer (set to BREAK_ON_WORD_BREAKS) will be used.

112
*

113
* @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource}

114
*/

115
public BayesianClassifier(IWordsDataSource wd) {

116 4
this(wd, new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS));

117 4
}

118

119
/**

120
* Constructor for BayesianClassifier that specifies a datasource & tokenizer

121
*

122
* @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource}

123
* @param tokenizer a {@link net.sf.classifier4J.ITokenizer}

124
*/

125
public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer) {

126 14
this(wd, tokenizer, new DefaultStopWordsProvider());

127 14
}

128

129
/**

130
* Constructor for BayesianClassifier that specifies a datasource, tokenizer

131
* and stop words provider

132
*

133
* @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource}

134
* @param tokenizer a {@link net.sf.classifier4J.ITokenizer}

135
* @param swp a {@link net.sf.classifier4J.IStopWordProvider}

136
*/

137 16
public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer, IStopWordProvider swp) {

138 16
if (wd == null) {

139 0
throw new IllegalArgumentException("IWordsDataSource can't be null");

140
}

141 16
this.wordsData = wd;

142

143 16
if (tokenizer == null) {

144 0
throw new IllegalArgumentException("ITokenizer can't be null");

145
}

146 16
this.tokenizer = tokenizer;

147

148 16
if (swp == null) {

149 0
throw new IllegalArgumentException("IStopWordProvider can't be null");

150
}

151 16
this.stopWordProvider = swp;

152 16
}

153

154
/**

155
* @see net.sf.classifier4J.ICategorisedClassifier#isMatch(java.lang.String, java.lang.String)

156
*/

157
public boolean isMatch(String category, String input) throws WordsDataSourceException {

158 0
return isMatch(category, tokenizer.tokenize(input));

159
}

160

161
/**

162
* @see net.sf.classifier4J.ICategorisedClassifier#classify(java.lang.String, java.lang.String)

163
*/

164
public double classify(String category, String input) throws WordsDataSourceException {

165 0
if (category == null) {

166 0
throw new IllegalArgumentException("category cannot be null");

167
}

168 0
if (input == null) {

169 0
throw new IllegalArgumentException("input cannot be null");

170
}

171

172 0
checkCategoriesSupported(category);

173

174 0
return classify(category, tokenizer.tokenize(input));

175
}

176

177
public void teachMatch(String category, String input) throws WordsDataSourceException {

178 0
if (category == null) {

179 0
throw new IllegalArgumentException("category cannot be null");

180
}

181

182 0
if (input == null) {

183 0
throw new IllegalArgumentException("input cannot be null");

184
}

185

186 0
checkCategoriesSupported(category);

187

188 0
teachMatch(category, tokenizer.tokenize(input));

189 0
}

190

191
public void teachNonMatch(String category, String input) throws WordsDataSourceException {

192 0
if (category == null) {

193 0
throw new IllegalArgumentException("category cannot be null");

194
}

195

196 0
if (input == null) {

197 0
throw new IllegalArgumentException("input cannot be null");

198
}

199

200 0
checkCategoriesSupported(category);

201

202 0
teachNonMatch(category, tokenizer.tokenize(input));

203 0
}

204

205
protected boolean isMatch(String category, String input[]) throws WordsDataSourceException {

206 10
if (category == null) {

207 0
throw new IllegalArgumentException("category cannot be null");

208
}

209

210 10
if (input == null) {

211 0
throw new IllegalArgumentException("input cannot be null");

212
}

213

214 10
checkCategoriesSupported(category);

215

216 10
double matchProbability = classify(category, input);

217

218 10
return (matchProbability >= cutoff);

219
}

220

221
protected double classify(String category, String words[]) throws WordsDataSourceException {

222 14
WordProbability[] wps = calcWordsProbability(category, words);

223 14
return normaliseSignificance(calculateOverallProbability(wps));

224
}

225

226
protected void teachMatch(String category, String words[]) throws WordsDataSourceException {

227 6
boolean categorise = false;

228 6
if (wordsData instanceof ICategorisedWordsDataSource) {

229 0
categorise = true;

230
}

231 510
for (int i = 0; i <= words.length - 1; i++) {

232 504
if (isClassifiableWord(words[i])) {

233 334
if (categorise) {

234 0
((ICategorisedWordsDataSource) wordsData).addMatch(category, transformWord(words[i]));

235
} else {

236 334
wordsData.addMatch(transformWord(words[i]));

237
}

238
}

239
}

240 6
}

241

242
protected void teachNonMatch(String category, String words[]) throws WordsDataSourceException {

243 4
boolean categorise = false;

244 4
if (wordsData instanceof ICategorisedWordsDataSource) {

245 0
categorise = true;

246
}

247

248 318
for (int i = 0; i <= words.length - 1; i++) {

249 314
if (isClassifiableWord(words[i])) {

250 202
if (categorise) {

251 0
((ICategorisedWordsDataSource) wordsData).addNonMatch(category, transformWord(words[i]));

252
} else {

253 202
wordsData.addNonMatch(transformWord(words[i]));

254
}

255

256
}

257
}

258 4
}

259

260
/**

261
* Allows transformations to be done to word.

262
* This implementation transforms the word to lowercase if the classifier

263
* is in case-insensitive mode.

264
*

265
* @param word

266
* @return the transformed word

267
* @throws IllegalArgumentException if a null is passed

268
*/

269
protected String transformWord(String word) {

270 1092
if (word != null) {

271 1090
if (!isCaseSensitive) {

272 1086
return word.toLowerCase();

273
} else {

274 4
return word;

275
}

276
} else {

277 2
throw new IllegalArgumentException("Null cannot be passed");

278
}

279
}

280

281
/**

282
*

283
* NOTE: Override this method with care. There is a good chance it will be removed

284
* or have signature changes in later versions.

285
*

286
* 

287
* @todo need an option to only use the "X" most "important" words when calculating overall probability

288
* "important" is defined as being most distant from NEUTRAL_PROBABILITY

289
*/

290
protected double calculateOverallProbability(WordProbability[] wps) {

291 16
if (wps == null || wps.length == 0) {

292 2
return IClassifier.NEUTRAL_PROBABILITY;

293
} else {

294
// we need to calculate xy/(xy + z)

295
// where z = (1-x)(1-y)

296

297
// firstly, calculate z and xy

298 14
double z = 0d;

299 14
double xy = 0d;

300 562
for (int i = 0; i < wps.length; i++) {

301 548
if (z == 0) {

302 14
z = (1 - wps[i].getProbability());

303
} else {

304 534
z = z * (1 - wps[i].getProbability());

305
}

306

307 548
if (xy == 0) {

308 14
xy = wps[i].getProbability();

309
} else {

310 534
xy = xy * wps[i].getProbability();

311
}

312
}

313

314 14
double numerator = xy;

315 14
double denominator = xy + z;

316

317 14
return numerator / denominator;

318
}

319
}

320

321
private WordProbability[] calcWordsProbability(String category, String[] words) throws WordsDataSourceException {

322 14
if (category == null) {

323 0
throw new IllegalArgumentException("category cannont be null");

324
}

325

326 14
boolean categorise = false;

327 14
if (wordsData instanceof ICategorisedWordsDataSource) {

328 0
categorise = true;

329
}

330

331 14
checkCategoriesSupported(category);

332

333 14
if (words == null) {

334 0
return new WordProbability[0];

335
} else {

336 14
List wps = new ArrayList();

337 856
for (int i = 0; i < words.length; i++) {

338 842
if (isClassifiableWord(words[i])) {

339 548
WordProbability wp = null;

340 548
if (categorise) {

341 0
wp = ((ICategorisedWordsDataSource) wordsData).getWordProbability(category, transformWord(words[i]));

342
} else {

343 548
wp = wordsData.getWordProbability(transformWord(words[i]));

344
}

345 548
if (wp != null) {

346 542
wps.add(wp);

347
}

348
}

349
}

350 14
return (WordProbability[]) wps.toArray(new WordProbability[wps.size()]);

351
}

352
}

353

354
private void checkCategoriesSupported(String category) {

355
// if the category is not the default

356 24
if (!ICategorisedClassifier.DEFAULT_CATEGORY.equals(category)) {

357
// and the data source does not support categories

358 0
if (!(wordsData instanceof ICategorisedWordsDataSource)) {

359
// throw an IllegalArgumentException

360 0
throw new IllegalArgumentException("Word Data Source does not support non-default categories.");

361
}

362
}

363 24
}

364

365
private boolean isClassifiableWord(String word) {

366 1660
if (word == null || "".equals(word) || stopWordProvider.isStopWord(word)) {

367 576
return false;

368
} else {

369 1084
return true;

370
}

371
}

372

373
protected static double normaliseSignificance(double sig) {

374

375 682
if (Double.compare(IClassifier.UPPER_BOUND, sig) < 0) {

376 646
return IClassifier.UPPER_BOUND;

377 36
} else if (Double.compare(IClassifier.LOWER_BOUND, sig) > 0) {

378 10
return IClassifier.LOWER_BOUND;

379
} else {

380 26
return sig;

381
}

382
}

383
/**

384
* @return true if the classifier is case sensitive, false otherwise

385
* (false by default)

386
*/

387
public boolean isCaseSensitive() {

388 6
return isCaseSensitive;

389
}

390

391
/**

392
* @param b True if the classifier should be case sensitive, false otherwise

393
*/

394
public void setCaseSensitive(boolean b) {

395 4
isCaseSensitive = b;

396 4
}

397

398
/**

399
* @return the {@link net.sf.classifier4J.bayesian.IWordsDataSource} used

400
* by this classifier

401
*/

402
public IWordsDataSource getWordsDataSource() {

403 2
return wordsData;

404
}

405

406
/**

407
* @return the {@link net.sf.classifier4J.ITokenizer} used

408
* by this classifier

409
*/

410
public ITokenizer getTokenizer() {

411 2
return tokenizer;

412
}

413

414
/**

415
* @return the {@link net.sf.classifier4J.IStopWordProvider} used

416
* by this classifier

417
*/

418
public IStopWordProvider getStopWordProvider() {

419 2
return stopWordProvider;

420
}

421

422
public String toString() {

423 0
return new ToStringBuilder(this).append("IWordsDataSource", wordsData).append("ITokenizer", tokenizer).append("IStopWordProvider", stopWordProvider).toString();

424
}

425

426
}

This report is generated by jcoverage, Maven and Maven JCoverage Plugin.

1		/*
2		* ====================================================================
3		*
4		* The Apache Software License, Version 1.1
5		*
6		* Copyright (c) 2003 Nick Lothian. All rights reserved.
7		*
8		* Redistribution and use in source and binary forms, with or without
9		* modification, are permitted provided that the following conditions
10		* are met:
11		*
12		* 1. Redistributions of source code must retain the above copyright
13		* notice, this list of conditions and the following disclaimer.
14		*
15		* 2. Redistributions in binary form must reproduce the above copyright
16		* notice, this list of conditions and the following disclaimer in
17		* the documentation and/or other materials provided with the
18		* distribution.
19		*
20		* 3. The end-user documentation included with the redistribution, if
21		* any, must include the following acknowlegement:
22		* "This product includes software developed by the
23		* developers of Classifier4J (http://classifier4j.sf.net/)."
24		* Alternately, this acknowlegement may appear in the software itself,
25		* if and wherever such third-party acknowlegements normally appear.
26		*
27		* 4. The name "Classifier4J" must not be used to endorse or promote
28		* products derived from this software without prior written
29		* permission. For written permission, please contact
30		* http://sourceforge.net/users/nicklothian/.
31		*
32		* 5. Products derived from this software may not be called
33		* "Classifier4J", nor may "Classifier4J" appear in their names
34		* without prior written permission. For written permission, please
35		* contact http://sourceforge.net/users/nicklothian/.
36		*
37		* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38		* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39		* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40		* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41		* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42		* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43		* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44		* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45		* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46		* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47		* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48		* SUCH DAMAGE.
49		* ====================================================================
50		*/
51
52		package net.sf.classifier4J.bayesian;
53
54		import java.util.ArrayList;
55		import java.util.List;
56
57		import net.sf.classifier4J.AbstractCategorizedTrainableClassifier;
58		import net.sf.classifier4J.DefaultStopWordsProvider;
59		import net.sf.classifier4J.DefaultTokenizer;
60		import net.sf.classifier4J.ICategorisedClassifier;
61		import net.sf.classifier4J.IClassifier;
62		import net.sf.classifier4J.IStopWordProvider;
63		import net.sf.classifier4J.ITokenizer;
64		import net.sf.classifier4J.util.ToStringBuilder;
65
66		/**
67		*
68		* <p>An implementation of {@link net.sf.classifier4J.IClassifier} based on Bayes'
69		* theorem (see http://www.wikipedia.org/wiki/Bayes_theorem).</p>
70		*
71		* <p>The basic usage pattern for this class is:
72		* <ol>
73		* <li>Create an instance of {@link net.sf.classifier4J.bayesian.IWordsDataSource}</li>
74		* <li>Create a new instance of BayesianClassifier, passing the IWordsDataSource
75		* to the constructor</li>
76		* <li>Call {@link net.sf.classifier4J.IClassifier#classify(java.lang.String) }
77		* or {@link net.sf.classifier4J.IClassifier#isMatch(java.lang.String) }
78		* </ol>
79		* </p>
80		*
81		* <p>For example:<br>
82		* <tt>
83		* IWordsDataSource wds = new SimpleWordsDataSource();<br>
84		* IClassifier classifier = new BayesianClassifier(wds);<br>
85		* System.out.println( "Matches = " + classifier.classify("This is a sentence") );
86		* </tt>
87		* </p>
88		*
89		* @author Nick Lothian
90		* @author Peter Leschev
91		*
92		*/
93		public class BayesianClassifier extends AbstractCategorizedTrainableClassifier {
94
95		IWordsDataSource wordsData;
96		ITokenizer tokenizer;
97		IStopWordProvider stopWordProvider;
98
99	16	private boolean isCaseSensitive = false;
100
101		/**
102		* Default constructor that uses the SimpleWordsDataSource & a DefaultTokenizer
103		* (set to BREAK_ON_WORD_BREAKS).
104		*/
105		public BayesianClassifier() {
106	8	this(new SimpleWordsDataSource(), new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS));
107	8	}
108
109		/**
110		* Constructor for BayesianClassifier that specifies a datasource. The
111		* DefaultTokenizer (set to BREAK_ON_WORD_BREAKS) will be used.
112		*
113		* @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource}
114		*/
115		public BayesianClassifier(IWordsDataSource wd) {
116	4	this(wd, new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS));
117	4	}
118
119		/**
120		* Constructor for BayesianClassifier that specifies a datasource & tokenizer
121		*
122		* @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource}
123		* @param tokenizer a {@link net.sf.classifier4J.ITokenizer}
124		*/
125		public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer) {
126	14	this(wd, tokenizer, new DefaultStopWordsProvider());
127	14	}
128
129		/**
130		* Constructor for BayesianClassifier that specifies a datasource, tokenizer
131		* and stop words provider
132		*
133		* @param wd a {@link net.sf.classifier4J.bayesian.IWordsDataSource}
134		* @param tokenizer a {@link net.sf.classifier4J.ITokenizer}
135		* @param swp a {@link net.sf.classifier4J.IStopWordProvider}
136		*/
137	16	public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer, IStopWordProvider swp) {
138	16	if (wd == null) {
139	0	throw new IllegalArgumentException("IWordsDataSource can't be null");
140		}
141	16	this.wordsData = wd;
142
143	16	if (tokenizer == null) {
144	0	throw new IllegalArgumentException("ITokenizer can't be null");
145		}
146	16	this.tokenizer = tokenizer;
147
148	16	if (swp == null) {
149	0	throw new IllegalArgumentException("IStopWordProvider can't be null");
150		}
151	16	this.stopWordProvider = swp;
152	16	}
153
154		/**
155		* @see net.sf.classifier4J.ICategorisedClassifier#isMatch(java.lang.String, java.lang.String)
156		*/
157		public boolean isMatch(String category, String input) throws WordsDataSourceException {
158	0	return isMatch(category, tokenizer.tokenize(input));
159		}
160
161		/**
162		* @see net.sf.classifier4J.ICategorisedClassifier#classify(java.lang.String, java.lang.String)
163		*/
164		public double classify(String category, String input) throws WordsDataSourceException {
165	0	if (category == null) {
166	0	throw new IllegalArgumentException("category cannot be null");
167		}
168	0	if (input == null) {
169	0	throw new IllegalArgumentException("input cannot be null");
170		}
171
172	0	checkCategoriesSupported(category);
173
174	0	return classify(category, tokenizer.tokenize(input));
175		}
176
177		public void teachMatch(String category, String input) throws WordsDataSourceException {
178	0	if (category == null) {
179	0	throw new IllegalArgumentException("category cannot be null");
180		}
181
182	0	if (input == null) {
183	0	throw new IllegalArgumentException("input cannot be null");
184		}
185
186	0	checkCategoriesSupported(category);
187
188	0	teachMatch(category, tokenizer.tokenize(input));
189	0	}
190
191		public void teachNonMatch(String category, String input) throws WordsDataSourceException {
192	0	if (category == null) {
193	0	throw new IllegalArgumentException("category cannot be null");
194		}
195
196	0	if (input == null) {
197	0	throw new IllegalArgumentException("input cannot be null");
198		}
199
200	0	checkCategoriesSupported(category);
201
202	0	teachNonMatch(category, tokenizer.tokenize(input));
203	0	}
204
205		protected boolean isMatch(String category, String input[]) throws WordsDataSourceException {
206	10	if (category == null) {
207	0	throw new IllegalArgumentException("category cannot be null");
208		}
209
210	10	if (input == null) {
211	0	throw new IllegalArgumentException("input cannot be null");
212		}
213
214	10	checkCategoriesSupported(category);
215
216	10	double matchProbability = classify(category, input);
217
218	10	return (matchProbability >= cutoff);
219		}
220
221		protected double classify(String category, String words[]) throws WordsDataSourceException {
222	14	WordProbability[] wps = calcWordsProbability(category, words);
223	14	return normaliseSignificance(calculateOverallProbability(wps));
224		}
225
226		protected void teachMatch(String category, String words[]) throws WordsDataSourceException {
227	6	boolean categorise = false;
228	6	if (wordsData instanceof ICategorisedWordsDataSource) {
229	0	categorise = true;
230		}
231	510	for (int i = 0; i <= words.length - 1; i++) {
232	504	if (isClassifiableWord(words[i])) {
233	334	if (categorise) {
234	0	((ICategorisedWordsDataSource) wordsData).addMatch(category, transformWord(words[i]));
235		} else {
236	334	wordsData.addMatch(transformWord(words[i]));
237		}
238		}
239		}
240	6	}
241
242		protected void teachNonMatch(String category, String words[]) throws WordsDataSourceException {
243	4	boolean categorise = false;
244	4	if (wordsData instanceof ICategorisedWordsDataSource) {
245	0	categorise = true;
246		}
247
248	318	for (int i = 0; i <= words.length - 1; i++) {
249	314	if (isClassifiableWord(words[i])) {
250	202	if (categorise) {
251	0	((ICategorisedWordsDataSource) wordsData).addNonMatch(category, transformWord(words[i]));
252		} else {
253	202	wordsData.addNonMatch(transformWord(words[i]));
254		}
255
256		}
257		}
258	4	}
259
260		/**
261		* Allows transformations to be done to word.
262		* This implementation transforms the word to lowercase if the classifier
263		* is in case-insensitive mode.
264		*
265		* @param word
266		* @return the transformed word
267		* @throws IllegalArgumentException if a null is passed
268		*/
269		protected String transformWord(String word) {
270	1092	if (word != null) {
271	1090	if (!isCaseSensitive) {
272	1086	return word.toLowerCase();
273		} else {
274	4	return word;
275		}
276		} else {
277	2	throw new IllegalArgumentException("Null cannot be passed");
278		}
279		}
280
281		/**
282		*
283		* NOTE: Override this method with care. There is a good chance it will be removed
284		* or have signature changes in later versions.
285		*
286		* <br />
287		* @todo need an option to only use the "X" most "important" words when calculating overall probability
288		* "important" is defined as being most distant from NEUTRAL_PROBABILITY
289		*/
290		protected double calculateOverallProbability(WordProbability[] wps) {
291	16	if (wps == null || wps.length == 0) {
292	2	return IClassifier.NEUTRAL_PROBABILITY;
293		} else {
294		// we need to calculate xy/(xy + z)
295		// where z = (1-x)(1-y)
296
297		// firstly, calculate z and xy
298	14	double z = 0d;
299	14	double xy = 0d;
300	562	for (int i = 0; i < wps.length; i++) {
301	548	if (z == 0) {
302	14	z = (1 - wps[i].getProbability());
303		} else {
304	534	z = z * (1 - wps[i].getProbability());
305		}
306
307	548	if (xy == 0) {
308	14	xy = wps[i].getProbability();
309		} else {
310	534	xy = xy * wps[i].getProbability();
311		}
312		}
313
314	14	double numerator = xy;
315	14	double denominator = xy + z;
316
317	14	return numerator / denominator;
318		}
319		}
320
321		private WordProbability[] calcWordsProbability(String category, String[] words) throws WordsDataSourceException {
322	14	if (category == null) {
323	0	throw new IllegalArgumentException("category cannont be null");
324		}
325
326	14	boolean categorise = false;
327	14	if (wordsData instanceof ICategorisedWordsDataSource) {
328	0	categorise = true;
329		}
330
331	14	checkCategoriesSupported(category);
332
333	14	if (words == null) {
334	0	return new WordProbability[0];
335		} else {
336	14	List wps = new ArrayList();
337	856	for (int i = 0; i < words.length; i++) {
338	842	if (isClassifiableWord(words[i])) {
339	548	WordProbability wp = null;
340	548	if (categorise) {
341	0	wp = ((ICategorisedWordsDataSource) wordsData).getWordProbability(category, transformWord(words[i]));
342		} else {
343	548	wp = wordsData.getWordProbability(transformWord(words[i]));
344		}
345	548	if (wp != null) {
346	542	wps.add(wp);
347		}
348		}
349		}
350	14	return (WordProbability[]) wps.toArray(new WordProbability[wps.size()]);
351		}
352		}
353
354		private void checkCategoriesSupported(String category) {
355		// if the category is not the default
356	24	if (!ICategorisedClassifier.DEFAULT_CATEGORY.equals(category)) {
357		// and the data source does not support categories
358	0	if (!(wordsData instanceof ICategorisedWordsDataSource)) {
359		// throw an IllegalArgumentException
360	0	throw new IllegalArgumentException("Word Data Source does not support non-default categories.");
361		}
362		}
363	24	}
364
365		private boolean isClassifiableWord(String word) {
366	1660	if (word == null || "".equals(word) || stopWordProvider.isStopWord(word)) {
367	576	return false;
368		} else {
369	1084	return true;
370		}
371		}
372
373		protected static double normaliseSignificance(double sig) {
374
375	682	if (Double.compare(IClassifier.UPPER_BOUND, sig) < 0) {
376	646	return IClassifier.UPPER_BOUND;
377	36	} else if (Double.compare(IClassifier.LOWER_BOUND, sig) > 0) {
378	10	return IClassifier.LOWER_BOUND;
379		} else {
380	26	return sig;
381		}
382		}
383		/**
384		* @return true if the classifier is case sensitive, false otherwise
385		* (false by default)
386		*/
387		public boolean isCaseSensitive() {
388	6	return isCaseSensitive;
389		}
390
391		/**
392		* @param b True if the classifier should be case sensitive, false otherwise
393		*/
394		public void setCaseSensitive(boolean b) {
395	4	isCaseSensitive = b;
396	4	}
397
398		/**
399		* @return the {@link net.sf.classifier4J.bayesian.IWordsDataSource} used
400		* by this classifier
401		*/
402		public IWordsDataSource getWordsDataSource() {
403	2	return wordsData;
404		}
405
406		/**
407		* @return the {@link net.sf.classifier4J.ITokenizer} used
408		* by this classifier
409		*/
410		public ITokenizer getTokenizer() {
411	2	return tokenizer;
412		}
413
414		/**
415		* @return the {@link net.sf.classifier4J.IStopWordProvider} used
416		* by this classifier
417		*/
418		public IStopWordProvider getStopWordProvider() {
419	2	return stopWordProvider;
420		}
421
422		public String toString() {
423	0	return new ToStringBuilder(this).append("IWordsDataSource", wordsData).append("ITokenizer", tokenizer).append("IStopWordProvider", stopWordProvider).toString();
424		}
425
426		}