1   /*
2    * ====================================================================
3    * 
4    * The Apache Software License, Version 1.1
5    *
6    * Copyright (c) 2003 Nick Lothian. All rights reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer. 
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution, if
21   *    any, must include the following acknowlegement:  
22   *       "This product includes software developed by the 
23   *        developers of Classifier4J (http://classifier4j.sf.net/)."
24   *    Alternately, this acknowlegement may appear in the software itself,
25   *    if and wherever such third-party acknowlegements normally appear.
26   *
27   * 4. The name "Classifier4J" must not be used to endorse or promote 
28   *    products derived from this software without prior written 
29   *    permission. For written permission, please contact   
30   *    http://sourceforge.net/users/nicklothian/.
31   *
32   * 5. Products derived from this software may not be called 
33   *    "Classifier4J", nor may "Classifier4J" appear in their names 
34   *    without prior written permission. For written permission, please 
35   *    contact http://sourceforge.net/users/nicklothian/.
36   *
37   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48   * SUCH DAMAGE.
49   * ====================================================================
50   */
51  
52  
53  package net.sf.classifier4J.bayesian;
54  
55  import junit.framework.TestCase;
56  import junit.textui.TestRunner;
57  import net.sf.classifier4J.DefaultStopWordsProvider;
58  import net.sf.classifier4J.DefaultTokenizer;
59  import net.sf.classifier4J.ICategorisedClassifier;
60  import net.sf.classifier4J.IClassifier;
61  import net.sf.classifier4J.IStopWordProvider;
62  import net.sf.classifier4J.ITokenizer;
63  
64  import org.apache.commons.logging.Log;
65  import org.apache.commons.logging.LogFactory;
66  
67  /*
68   * @author Nick Lothian
69   * @author Peter Leschev
70   */
71  public class BayesianClassifierTest extends TestCase {
72  
73  	private Log log = LogFactory.getLog(this.getClass());
74  
75  	public BayesianClassifierTest(String name) {
76  		super(name);
77  	}
78  
79  	public void testClassify() throws Exception {
80  
81  		SimpleWordsDataSource wds = new SimpleWordsDataSource();
82  		BayesianClassifier classifier = new BayesianClassifier(wds);
83  
84  		String sentence[] = { "This", "is", "a", "sentence", "about", "java" };
85  
86  		assertEquals(IClassifier.NEUTRAL_PROBABILITY, classifier.classify(ICategorisedClassifier.DEFAULT_CATEGORY, sentence), 0d);
87  
88  		wds.setWordProbability(new WordProbability("This", 0.5d));
89  		wds.setWordProbability(new WordProbability("is", 0.5d));
90  		wds.setWordProbability(new WordProbability("a", 0.5d));
91  		wds.setWordProbability(new WordProbability("sentence", 0.2d));
92  		wds.setWordProbability(new WordProbability("about", 0.5d));
93  		wds.setWordProbability(new WordProbability("java", 0.99d));
94  
95  		assertEquals(0.96d, classifier.classify(ICategorisedClassifier.DEFAULT_CATEGORY, sentence), 0.009d);
96  	}
97  
98  	public void testTeaching() throws Exception {
99  		BayesianClassifier classifier = new BayesianClassifier();
100 
101 		String sentence1[] = {"The", "menu", "tag", "library", "manages", "the", 
102 							  "complex", "process", "of", "creating", "menus", "in",
103 							  "JavaScript", "The", "menu", "tag", "itself", "is", 
104 							  "an", "abstract", "class", "that", "extends", "the", 
105 							  "TagSupport", "class", "and", "overrides", "the", 
106 							  "doStartTag", "and", "doEndTag", "methods.", "The", 
107 							  "getMenu", "method,", "which", "is", "a", "template", 
108 							  "method", "and", "should", "be", "overridden", "in", 
109 							  "the", "subclasses,", "provides", "JavaScript", "to", 
110 							  "add", "menu", "items", "in", "the", "menu", 
111 							  "structure", "created", "in", "the", "doStartTag", 
112 							  "method", "Subclasses", "of", "the", "menu", "tag", 
113 							  "override", "the", "getMenu", "method,", "which", 
114 							  "uses", "menu", "builders", "to", "render", "menu", 
115 							  "data", "from", "the", "data", "source"};
116 							  						
117 		String sentence2[] = {"I", "witness", "a", "more", "subtle", 
118 							  "demonstration", "of", "real", "time", "physics", 
119 							  "simulation", "at", "the", "tiny", "Palo", "Alto", 
120 							  "office", "of", "Havok", "a", "competing", "physics", 
121 							  "engine", "shop", "On", "the", "screen", "a", 
122 							  "computer", "generated", "sailboat", "floats", "in", 
123 							  "a", "stone", "lined", "pool", "of", "water", "The", 
124 							  "company's", "genial", "Irish", "born", "cofounder", 
125 							  "Hugh", "Reynolds", "shows", "me", "how", "to", 
126 							  "push", "the", "boat", "with", "a", "mouse", "When", 
127 							  "I", "nudge", "it", "air", "fills", "the", "sail", 
128 							  "causing", "the", "ship", "to", "tilt", "leeward", 
129 							  "Ripples", "in", "the", "water", "deflect", "off", 
130 							  "the", "stones", "intersecting", "with", "one", 
131 							  "another", "I", "urge", "the", "boat", "onward", 
132 							  "and", "it", "glides", "effortlessly", "into", "the", 
133 							  "wall", "Reynolds", "tosses", "in", "a", "handful", 
134 							  "of", "virtual", "coins", "they", "spin", "through", 
135 							  "the", "air,", "splash", "into", "the", "water,", 
136 							  "and", "sink"};
137 							  
138 		String sentence3[] = {"The", "New", "Input", "Output", "NIO", "libraries", 
139 							 "introduced", "in", "Java", "2", "Platform", 
140 							 "Standard", "Edition", "J2SE", "1.4", "address", 
141 							 "this", "problem", "NIO", "uses", "a", "buffer", 
142 							 "oriented", "model", "That", "is", "NIO", "deals", 
143 							 "with", "data", "primarily", "in", "large", "blocks", 
144 							 "This", "eliminates", "the", "overhead", "caused", 
145 							 "by", "the", "stream", "model", "and", "even", "makes",
146 							 "use", "of", "OS", "level", "facilities", "where", 
147 							 "possible", "to", "maximize", "throughput"};
148 							 
149 		String sentence4[] = {"As", "governments", "scramble", "to", "contain", 
150 							 "SARS", "the", "World", "Health", "Organisation", 
151 							 "said", "it", "was", "extending", "the", "scope", "of",
152 							 "its", "April", "2", "travel", "alert", "to", 
153 							 "include", "Beijing", "and", "the", "northern", 
154 							 "Chinese", "province", "of", "Shanxi", "together", 
155 							 "with", "Toronto", "the", "epicentre", "of", "the", 
156 							 "SARS", "outbreak", "in", "Canada"};
157 							 
158 		String sentence5[] = {"That", "was", "our", "worst", "problem", "I", 
159 							 "tried", "to", "see", "it", "the", "XP", "way", "Well",
160 							 "what", "we", "can", "do", "is", "implement", 
161 							 "something", "I", "can't", "give", "any", "guarantees",
162 							 "as", "to", "how", "much", "of", "it", "will", "be", 
163 							 "implemented", "in", "a", "month", "I", "won't", 
164 							 "even", "hazard", "a", "guess", "as", "to", "how", 
165 							 "long", "it", "would", "take", "to", "implement", "as",
166 							 "a", "whole", "I", "can't", "draw", "UML", "diagrams", 
167 							 "for", "it", "or", "write", "technical", "specs", 
168 							 "that", "would", "take", "time", "from", "coding", 
169 							 "it", "which", "we", "can't", "afford", "Oh", "and", 
170 							 "I", "have", "two", "kids", "I", "can't", "do", "much",
171 							 "OverTime", "But", "I", "should", "be", "able", "to", 
172 							 "do", "something", "simple", "that", "will", "have", 
173 							 "very", "few", "bugs", "and", "show", "a", "working", 
174 							 "program", "early", "and", "often"}; 		
175     
176 
177 		classifier.teachMatch(ICategorisedClassifier.DEFAULT_CATEGORY, sentence1);
178 		classifier.teachNonMatch(ICategorisedClassifier.DEFAULT_CATEGORY, sentence2);
179 		classifier.teachMatch(ICategorisedClassifier.DEFAULT_CATEGORY, sentence3);
180 		classifier.teachNonMatch(ICategorisedClassifier.DEFAULT_CATEGORY, sentence4);
181 		classifier.teachMatch(ICategorisedClassifier.DEFAULT_CATEGORY, sentence5);
182 
183 		assertTrue(classifier.isMatch(ICategorisedClassifier.DEFAULT_CATEGORY, sentence1));
184 		assertTrue(!classifier.isMatch(ICategorisedClassifier.DEFAULT_CATEGORY, sentence2));
185 		assertTrue(classifier.isMatch(ICategorisedClassifier.DEFAULT_CATEGORY, sentence3));
186 		assertTrue(!classifier.isMatch(ICategorisedClassifier.DEFAULT_CATEGORY, sentence4));
187 		assertTrue(classifier.isMatch(ICategorisedClassifier.DEFAULT_CATEGORY, sentence5));
188 	}
189 
190 	public void testGetWordsDataSource() throws Exception {
191 		SimpleWordsDataSource wds = new SimpleWordsDataSource();
192 		BayesianClassifier classifier = new BayesianClassifier(wds);
193 		
194 		assertEquals(wds, classifier.getWordsDataSource());
195 	}
196 
197 	public void testGetTokenizer() throws Exception {
198 		SimpleWordsDataSource wds = new SimpleWordsDataSource();
199 		ITokenizer tokenizer = new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS);		
200 		BayesianClassifier classifier = new BayesianClassifier(wds, tokenizer);
201 		
202 		assertEquals(tokenizer, classifier.getTokenizer());
203 	}
204 
205 	public void testGetStopWordProvider() throws Exception {
206 		SimpleWordsDataSource wds = new SimpleWordsDataSource();
207 		ITokenizer tokenizer = new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS);
208 		IStopWordProvider stopWordProvider =  new DefaultStopWordsProvider();		
209 		BayesianClassifier classifier = new BayesianClassifier(wds, tokenizer, stopWordProvider);
210 		
211 		assertEquals(stopWordProvider, classifier.getStopWordProvider());		
212 	}
213 
214 	public void testCaseSensitive() throws Exception {
215 		BayesianClassifier classifier = new BayesianClassifier();
216 		assertFalse(classifier.isCaseSensitive()); // case insensitive by default;
217 		classifier.setCaseSensitive(true);
218 		assertTrue(classifier.isCaseSensitive());
219 	}
220 
221 	public void testTransformWord() throws Exception {
222 		BayesianClassifier classifier = new BayesianClassifier();
223 		assertFalse(classifier.isCaseSensitive());
224 		
225 		String word = null;
226 		try {		
227 			classifier.transformWord(word);
228 			fail("No exception thrown when null passed");
229 		} catch (IllegalArgumentException e) {
230 			// do nothing - this should be thrown
231 		}
232 		
233 		word = "myWord";
234 		assertEquals(word.toLowerCase(), classifier.transformWord(word));
235 		
236 		classifier.setCaseSensitive(true);
237 		assertNotSame(word.toLowerCase(), classifier.transformWord(word));
238 		assertEquals(word, classifier.transformWord(word));		
239 	}
240 
241 	public void testCalculateOverallProbability() throws Exception {
242 		double prob = 0.3d;
243 		WordProbability wp1 = new WordProbability("myWord1", prob);
244 		WordProbability wp2 = new WordProbability("myWord2", prob);
245 		WordProbability wp3 = new WordProbability("myWord3", prob);
246 		
247 		WordProbability[] wps = {wp1, wp2, wp3};
248 		double errorMargin = 0.0001d;
249 		
250 		double xy = (prob * prob * prob);
251 		double z = (1-prob)*(1-prob)*(1-prob);
252 		
253 		double result = xy/(xy + z);
254 		
255 		BayesianClassifier classifier = new BayesianClassifier();
256 		 		
257 		assertEquals(result, classifier.calculateOverallProbability(wps), errorMargin);
258 	}
259 
260 
261 	public static void main(String[] args) throws Exception {
262 		TestRunner.run(BayesianClassifierTest.class);
263 	}
264 }