View Javadoc

1   /*
2    * ====================================================================
3    * 
4    * The Apache Software License, Version 1.1
5    *
6    * Copyright (c) 2003 Nick Lothian. All rights reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer. 
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution, if
21   *    any, must include the following acknowlegement:  
22   *       "This product includes software developed by the 
23   *        developers of Classifier4J (http://classifier4j.sf.net/)."
24   *    Alternately, this acknowlegement may appear in the software itself,
25   *    if and wherever such third-party acknowlegements normally appear.
26   *
27   * 4. The name "Classifier4J" must not be used to endorse or promote 
28   *    products derived from this software without prior written 
29   *    permission. For written permission, please contact   
30   *    http://sourceforge.net/users/nicklothian/.
31   *
32   * 5. Products derived from this software may not be called 
33   *    "Classifier4J", nor may "Classifier4J" appear in their names 
34   *    without prior written permission. For written permission, please 
35   *    contact http://sourceforge.net/users/nicklothian/.
36   *
37   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48   * SUCH DAMAGE.
49   * ====================================================================
50   */
51  
52  package net.sf.classifier4J.bayesian;
53  
54  import java.io.Serializable;
55  
56  import net.sf.classifier4J.IClassifier;
57  import net.sf.classifier4J.ICategorisedClassifier;
58  import net.sf.classifier4J.util.*;
59  import net.sf.classifier4J.util.CompareToBuilder;
60  import net.sf.classifier4J.util.EqualsBuilder;
61  import net.sf.classifier4J.util.ToStringBuilder;
62  
63  import org.apache.commons.logging.Log;
64  import org.apache.commons.logging.LogFactory;
65  
66  /***
67   * Represents the probability of a particular word. The user of this object
68   * can either:
69   * <ol>
70   * 		<li>Set a specific probability for a particular word <I>or</I></li>
71   * 		<li>Define the matching and non-matching counts for the particular word. 
72   *        This class then calculates the probability for you.</li>
73   * </ol>
74   * 
75   * @author Nick Lothian
76   * @author Peter Leschev
77   */
78  public class WordProbability implements Comparable, Serializable {
79  
80      private static final int UNDEFINED = -1;
81  
82      private String word = "";
83      private String category = ICategorisedClassifier.DEFAULT_CATEGORY;
84  
85      private long matchingCount = UNDEFINED;
86      private long nonMatchingCount = UNDEFINED;
87  
88      private double probability = IClassifier.NEUTRAL_PROBABILITY;
89  
90      public WordProbability() {
91          setMatchingCount(0);
92          setNonMatchingCount(0);
93      }
94  
95      public WordProbability(String w) {
96          setWord(w);
97          setMatchingCount(0);
98          setNonMatchingCount(0);
99      }
100 
101     public WordProbability(String c, String w) {
102         setCategory(c);
103         setWord(w);
104         setMatchingCount(0);
105         setNonMatchingCount(0);
106     }
107 
108     public WordProbability(String w, double probability) {
109         setWord(w);
110         setProbability(probability);
111     }
112 
113     public WordProbability(String w, long matchingCount, long nonMatchingCount) {
114         setWord(w);
115         setMatchingCount(matchingCount);
116         setNonMatchingCount(nonMatchingCount);
117     }
118 
119     public void setWord(String w) {
120         this.word = w;
121     }
122 
123     public void setCategory(String category) {
124         this.category = category;
125     }
126 
127     public void setProbability(double probability) {
128         this.probability = probability;
129         this.matchingCount = UNDEFINED;
130         this.nonMatchingCount = UNDEFINED;
131     }
132 
133     public void setMatchingCount(long matchingCount) {
134         if (matchingCount < 0) {
135             throw new IllegalArgumentException("matchingCount must be greater than 0");
136         }
137         this.matchingCount = matchingCount;
138         calculateProbability();
139     }
140 
141     public void setNonMatchingCount(long nonMatchingCount) {
142         if (nonMatchingCount < 0) {
143             throw new IllegalArgumentException("nonMatchingCount must be greater than 0");
144         }
145         this.nonMatchingCount = nonMatchingCount;
146         calculateProbability();
147     }
148 
149     public void registerMatch() {
150         if (matchingCount == Long.MAX_VALUE) {
151             throw new UnsupportedOperationException("Long.MAX_VALUE reached, can't register more matches");
152         }
153         matchingCount++;
154         calculateProbability();
155     }
156 
157     public void registerNonMatch() {
158         if (nonMatchingCount == Long.MAX_VALUE) {
159             throw new UnsupportedOperationException("Long.MAX_VALUE reached, can't register more matches");
160         }
161         nonMatchingCount++;
162         calculateProbability();
163     }
164 
165     private void calculateProbability() {
166         // the logger can't be a field because this class might be serialized 
167         Log log = LogFactory.getLog(this.getClass());
168 
169         String method = "calculateProbability() ";
170 
171         if (log.isDebugEnabled()) {
172             log.debug(method + "START");
173 
174             log.debug(method + "matchingCount = " + matchingCount);
175             log.debug(method + "nonMatchingCount = " + nonMatchingCount);
176         }
177 
178         double result = IClassifier.NEUTRAL_PROBABILITY;
179 
180         if (matchingCount == 0) {
181             if (nonMatchingCount == 0) {
182                 result = IClassifier.NEUTRAL_PROBABILITY;
183             } else {
184                 result = IClassifier.LOWER_BOUND;
185             }
186         } else {
187             result = BayesianClassifier.normaliseSignificance((double) matchingCount / (double) (matchingCount + nonMatchingCount));
188         }
189 
190         probability = result;
191 
192         if (log.isDebugEnabled()) {
193             log.debug(method + "END Calculated [" + probability + "]");
194         }
195     }
196 
197     /***
198          * @return
199          */
200     public double getProbability() {
201         return probability;
202     }
203 
204     public long getMatchingCount() {
205 
206         if (matchingCount == UNDEFINED) {
207             throw new UnsupportedOperationException("MatchingCount has not been defined");
208         }
209 
210         return matchingCount;
211     }
212 
213     public long getNonMatchingCount() {
214 
215         if (nonMatchingCount == UNDEFINED) {
216             throw new UnsupportedOperationException("nonMatchingCount has not been defined");
217         }
218 
219         return nonMatchingCount;
220     }
221 
222     /***
223      * @return
224      */
225     public String getWord() {
226         return word;
227     }
228 
229     public String getCategory() {
230         return category;
231     }
232 
233     public boolean equals(Object o) {
234         if (!(o instanceof WordProbability)) {
235             return false;
236         }
237         WordProbability rhs = (WordProbability) o;
238         return new EqualsBuilder().append(getWord(), rhs.getWord()).append(getCategory(), rhs.getCategory()).isEquals();
239     }
240 
241     public int compareTo(java.lang.Object o) {
242         if (!(o instanceof WordProbability)) {
243             throw new ClassCastException(o.getClass() + " is not a " + this.getClass());
244         }
245         WordProbability rhs = (WordProbability) o;
246         return new CompareToBuilder().append(this.getCategory(), rhs.getCategory()).append(this.getWord(), rhs.getWord()).toComparison();
247     }
248 
249     public String toString() {
250         return new ToStringBuilder(this).append("word", word).append("category", category).append("probability", probability).append("matchingCount", matchingCount).append("nonMatchingCount", nonMatchingCount).toString();
251     }
252 
253     public int hashCode() {
254         // you pick a hard-coded, randomly chosen, non-zero, odd number
255         // ideally different for each class
256         return new HashCodeBuilder(17, 37).append(word).append(category).toHashCode();
257     }
258 }