1   
2   
3   
4   
5   
6   
7   
8   
9   
10  
11  
12  
13  
14  
15  
16  
17  
18  
19  
20  
21  
22  
23  
24  
25  
26  
27  
28  
29  
30  
31  
32  
33  
34  
35  
36  
37  
38  
39  
40  
41  
42  
43  
44  
45  
46  
47  
48  
49  
50  
51  
52  package net.sf.classifier4J.bayesian;
53  
54  import java.io.Serializable;
55  
56  import net.sf.classifier4J.IClassifier;
57  import net.sf.classifier4J.ICategorisedClassifier;
58  import net.sf.classifier4J.util.*;
59  import net.sf.classifier4J.util.CompareToBuilder;
60  import net.sf.classifier4J.util.EqualsBuilder;
61  import net.sf.classifier4J.util.ToStringBuilder;
62  
63  import org.apache.commons.logging.Log;
64  import org.apache.commons.logging.LogFactory;
65  
66  /***
67   * Represents the probability of a particular word. The user of this object
68   * can either:
69   * <ol>
70   * 		<li>Set a specific probability for a particular word <I>or</I></li>
71   * 		<li>Define the matching and non-matching counts for the particular word. 
72   *        This class then calculates the probability for you.</li>
73   * </ol>
74   * 
75   * @author Nick Lothian
76   * @author Peter Leschev
77   */
78  public class WordProbability implements Comparable, Serializable {
79  
80      private static final int UNDEFINED = -1;
81  
82      private String word = "";
83      private String category = ICategorisedClassifier.DEFAULT_CATEGORY;
84  
85      private long matchingCount = UNDEFINED;
86      private long nonMatchingCount = UNDEFINED;
87  
88      private double probability = IClassifier.NEUTRAL_PROBABILITY;
89  
90      public WordProbability() {
91          setMatchingCount(0);
92          setNonMatchingCount(0);
93      }
94  
95      public WordProbability(String w) {
96          setWord(w);
97          setMatchingCount(0);
98          setNonMatchingCount(0);
99      }
100 
101     public WordProbability(String c, String w) {
102         setCategory(c);
103         setWord(w);
104         setMatchingCount(0);
105         setNonMatchingCount(0);
106     }
107 
108     public WordProbability(String w, double probability) {
109         setWord(w);
110         setProbability(probability);
111     }
112 
113     public WordProbability(String w, long matchingCount, long nonMatchingCount) {
114         setWord(w);
115         setMatchingCount(matchingCount);
116         setNonMatchingCount(nonMatchingCount);
117     }
118 
119     public void setWord(String w) {
120         this.word = w;
121     }
122 
123     public void setCategory(String category) {
124         this.category = category;
125     }
126 
127     public void setProbability(double probability) {
128         this.probability = probability;
129         this.matchingCount = UNDEFINED;
130         this.nonMatchingCount = UNDEFINED;
131     }
132 
133     public void setMatchingCount(long matchingCount) {
134         if (matchingCount < 0) {
135             throw new IllegalArgumentException("matchingCount must be greater than 0");
136         }
137         this.matchingCount = matchingCount;
138         calculateProbability();
139     }
140 
141     public void setNonMatchingCount(long nonMatchingCount) {
142         if (nonMatchingCount < 0) {
143             throw new IllegalArgumentException("nonMatchingCount must be greater than 0");
144         }
145         this.nonMatchingCount = nonMatchingCount;
146         calculateProbability();
147     }
148 
149     public void registerMatch() {
150         if (matchingCount == Long.MAX_VALUE) {
151             throw new UnsupportedOperationException("Long.MAX_VALUE reached, can't register more matches");
152         }
153         matchingCount++;
154         calculateProbability();
155     }
156 
157     public void registerNonMatch() {
158         if (nonMatchingCount == Long.MAX_VALUE) {
159             throw new UnsupportedOperationException("Long.MAX_VALUE reached, can't register more matches");
160         }
161         nonMatchingCount++;
162         calculateProbability();
163     }
164 
165     private void calculateProbability() {
166         
167         Log log = LogFactory.getLog(this.getClass());
168 
169         String method = "calculateProbability() ";
170 
171         if (log.isDebugEnabled()) {
172             log.debug(method + "START");
173 
174             log.debug(method + "matchingCount = " + matchingCount);
175             log.debug(method + "nonMatchingCount = " + nonMatchingCount);
176         }
177 
178         double result = IClassifier.NEUTRAL_PROBABILITY;
179 
180         if (matchingCount == 0) {
181             if (nonMatchingCount == 0) {
182                 result = IClassifier.NEUTRAL_PROBABILITY;
183             } else {
184                 result = IClassifier.LOWER_BOUND;
185             }
186         } else {
187             result = BayesianClassifier.normaliseSignificance((double) matchingCount / (double) (matchingCount + nonMatchingCount));
188         }
189 
190         probability = result;
191 
192         if (log.isDebugEnabled()) {
193             log.debug(method + "END Calculated [" + probability + "]");
194         }
195     }
196 
197     /***
198          * @return
199          */
200     public double getProbability() {
201         return probability;
202     }
203 
204     public long getMatchingCount() {
205 
206         if (matchingCount == UNDEFINED) {
207             throw new UnsupportedOperationException("MatchingCount has not been defined");
208         }
209 
210         return matchingCount;
211     }
212 
213     public long getNonMatchingCount() {
214 
215         if (nonMatchingCount == UNDEFINED) {
216             throw new UnsupportedOperationException("nonMatchingCount has not been defined");
217         }
218 
219         return nonMatchingCount;
220     }
221 
222     /***
223      * @return
224      */
225     public String getWord() {
226         return word;
227     }
228 
229     public String getCategory() {
230         return category;
231     }
232 
233     public boolean equals(Object o) {
234         if (!(o instanceof WordProbability)) {
235             return false;
236         }
237         WordProbability rhs = (WordProbability) o;
238         return new EqualsBuilder().append(getWord(), rhs.getWord()).append(getCategory(), rhs.getCategory()).isEquals();
239     }
240 
241     public int compareTo(java.lang.Object o) {
242         if (!(o instanceof WordProbability)) {
243             throw new ClassCastException(o.getClass() + " is not a " + this.getClass());
244         }
245         WordProbability rhs = (WordProbability) o;
246         return new CompareToBuilder().append(this.getCategory(), rhs.getCategory()).append(this.getWord(), rhs.getWord()).toComparison();
247     }
248 
249     public String toString() {
250         return new ToStringBuilder(this).append("word", word).append("category", category).append("probability", probability).append("matchingCount", matchingCount).append("nonMatchingCount", nonMatchingCount).toString();
251     }
252 
253     public int hashCode() {
254         
255         
256         return new HashCodeBuilder(17, 37).append(word).append(category).toHashCode();
257     }
258 }