1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52 package net.sf.classifier4J.bayesian;
53
54 import java.io.Serializable;
55
56 import net.sf.classifier4J.IClassifier;
57 import net.sf.classifier4J.ICategorisedClassifier;
58 import net.sf.classifier4J.util.*;
59 import net.sf.classifier4J.util.CompareToBuilder;
60 import net.sf.classifier4J.util.EqualsBuilder;
61 import net.sf.classifier4J.util.ToStringBuilder;
62
63 import org.apache.commons.logging.Log;
64 import org.apache.commons.logging.LogFactory;
65
66 /***
67 * Represents the probability of a particular word. The user of this object
68 * can either:
69 * <ol>
70 * <li>Set a specific probability for a particular word <I>or</I></li>
71 * <li>Define the matching and non-matching counts for the particular word.
72 * This class then calculates the probability for you.</li>
73 * </ol>
74 *
75 * @author Nick Lothian
76 * @author Peter Leschev
77 */
78 public class WordProbability implements Comparable, Serializable {
79
80 private static final int UNDEFINED = -1;
81
82 private String word = "";
83 private String category = ICategorisedClassifier.DEFAULT_CATEGORY;
84
85 private long matchingCount = UNDEFINED;
86 private long nonMatchingCount = UNDEFINED;
87
88 private double probability = IClassifier.NEUTRAL_PROBABILITY;
89
90 public WordProbability() {
91 setMatchingCount(0);
92 setNonMatchingCount(0);
93 }
94
95 public WordProbability(String w) {
96 setWord(w);
97 setMatchingCount(0);
98 setNonMatchingCount(0);
99 }
100
101 public WordProbability(String c, String w) {
102 setCategory(c);
103 setWord(w);
104 setMatchingCount(0);
105 setNonMatchingCount(0);
106 }
107
108 public WordProbability(String w, double probability) {
109 setWord(w);
110 setProbability(probability);
111 }
112
113 public WordProbability(String w, long matchingCount, long nonMatchingCount) {
114 setWord(w);
115 setMatchingCount(matchingCount);
116 setNonMatchingCount(nonMatchingCount);
117 }
118
119 public void setWord(String w) {
120 this.word = w;
121 }
122
123 public void setCategory(String category) {
124 this.category = category;
125 }
126
127 public void setProbability(double probability) {
128 this.probability = probability;
129 this.matchingCount = UNDEFINED;
130 this.nonMatchingCount = UNDEFINED;
131 }
132
133 public void setMatchingCount(long matchingCount) {
134 if (matchingCount < 0) {
135 throw new IllegalArgumentException("matchingCount must be greater than 0");
136 }
137 this.matchingCount = matchingCount;
138 calculateProbability();
139 }
140
141 public void setNonMatchingCount(long nonMatchingCount) {
142 if (nonMatchingCount < 0) {
143 throw new IllegalArgumentException("nonMatchingCount must be greater than 0");
144 }
145 this.nonMatchingCount = nonMatchingCount;
146 calculateProbability();
147 }
148
149 public void registerMatch() {
150 if (matchingCount == Long.MAX_VALUE) {
151 throw new UnsupportedOperationException("Long.MAX_VALUE reached, can't register more matches");
152 }
153 matchingCount++;
154 calculateProbability();
155 }
156
157 public void registerNonMatch() {
158 if (nonMatchingCount == Long.MAX_VALUE) {
159 throw new UnsupportedOperationException("Long.MAX_VALUE reached, can't register more matches");
160 }
161 nonMatchingCount++;
162 calculateProbability();
163 }
164
165 private void calculateProbability() {
166
167 Log log = LogFactory.getLog(this.getClass());
168
169 String method = "calculateProbability() ";
170
171 if (log.isDebugEnabled()) {
172 log.debug(method + "START");
173
174 log.debug(method + "matchingCount = " + matchingCount);
175 log.debug(method + "nonMatchingCount = " + nonMatchingCount);
176 }
177
178 double result = IClassifier.NEUTRAL_PROBABILITY;
179
180 if (matchingCount == 0) {
181 if (nonMatchingCount == 0) {
182 result = IClassifier.NEUTRAL_PROBABILITY;
183 } else {
184 result = IClassifier.LOWER_BOUND;
185 }
186 } else {
187 result = BayesianClassifier.normaliseSignificance((double) matchingCount / (double) (matchingCount + nonMatchingCount));
188 }
189
190 probability = result;
191
192 if (log.isDebugEnabled()) {
193 log.debug(method + "END Calculated [" + probability + "]");
194 }
195 }
196
197 /***
198 * @return
199 */
200 public double getProbability() {
201 return probability;
202 }
203
204 public long getMatchingCount() {
205
206 if (matchingCount == UNDEFINED) {
207 throw new UnsupportedOperationException("MatchingCount has not been defined");
208 }
209
210 return matchingCount;
211 }
212
213 public long getNonMatchingCount() {
214
215 if (nonMatchingCount == UNDEFINED) {
216 throw new UnsupportedOperationException("nonMatchingCount has not been defined");
217 }
218
219 return nonMatchingCount;
220 }
221
222 /***
223 * @return
224 */
225 public String getWord() {
226 return word;
227 }
228
229 public String getCategory() {
230 return category;
231 }
232
233 public boolean equals(Object o) {
234 if (!(o instanceof WordProbability)) {
235 return false;
236 }
237 WordProbability rhs = (WordProbability) o;
238 return new EqualsBuilder().append(getWord(), rhs.getWord()).append(getCategory(), rhs.getCategory()).isEquals();
239 }
240
241 public int compareTo(java.lang.Object o) {
242 if (!(o instanceof WordProbability)) {
243 throw new ClassCastException(o.getClass() + " is not a " + this.getClass());
244 }
245 WordProbability rhs = (WordProbability) o;
246 return new CompareToBuilder().append(this.getCategory(), rhs.getCategory()).append(this.getWord(), rhs.getWord()).toComparison();
247 }
248
249 public String toString() {
250 return new ToStringBuilder(this).append("word", word).append("category", category).append("probability", probability).append("matchingCount", matchingCount).append("nonMatchingCount", nonMatchingCount).toString();
251 }
252
253 public int hashCode() {
254
255
256 return new HashCodeBuilder(17, 37).append(word).append(category).toHashCode();
257 }
258 }