1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52 package net.sf.classifier4J;
53
54 import java.io.BufferedReader;
55 import java.io.IOException;
56 import java.io.InputStream;
57 import java.io.InputStreamReader;
58
59 import java.util.ArrayList;
60 import java.util.Arrays;
61 import java.util.Collections;
62 import java.util.HashMap;
63 import java.util.Iterator;
64 import java.util.LinkedHashSet;
65 import java.util.List;
66 import java.util.Map;
67 import java.util.Set;
68 import java.util.TreeSet;
69
70 /***
71 * @author Nick Lothian
72 * @author Peter Leschev
73 */
74 public class Utilities {
75
76 public static Map getWordFrequency(String input) {
77 return getWordFrequency(input, false);
78 }
79
80 public static Map getWordFrequency(String input, boolean caseSensitive) {
81 return getWordFrequency(input, caseSensitive, new DefaultTokenizer(), new DefaultStopWordsProvider());
82 }
83
84 /***
85 * Get a Map of words and Integer representing the number of each word
86 *
87 * @param input The String to get the word frequency of
88 * @param caseSensitive true if words should be treated as separate if they have different case
89 * @param tokenizer a junit.framework.TestCase#run()
90 * @param stopWordsProvider
91 * @return
92 */
93 public static Map getWordFrequency(String input, boolean caseSensitive, ITokenizer tokenizer, IStopWordProvider stopWordsProvider) {
94 String convertedInput = input;
95 if (!caseSensitive) {
96 convertedInput = input.toLowerCase();
97 }
98
99
100 String[] words = tokenizer.tokenize(convertedInput);
101 Arrays.sort(words);
102
103 String[] uniqueWords = getUniqueWords(words);
104
105 Map result = new HashMap();
106 for (int i = 0; i < uniqueWords.length; i++) {
107 if (stopWordsProvider == null) {
108
109 result.put(uniqueWords[i], new Integer(countWords(uniqueWords[i], words)));
110 } else if (isWord(uniqueWords[i]) && !stopWordsProvider.isStopWord(uniqueWords[i])) {
111
112 result.put(uniqueWords[i], new Integer(countWords(uniqueWords[i], words)));
113 }
114 }
115
116 return result;
117 }
118
119 private static String[] findWordsWithFrequency(Map wordFrequencies, Integer frequency) {
120 if (wordFrequencies == null || frequency == null) {
121 return new String[0];
122 } else {
123 List results = new ArrayList();
124 Iterator it = wordFrequencies.keySet().iterator();
125
126 while (it.hasNext()) {
127 String word = (String) it.next();
128 if (frequency.equals(wordFrequencies.get(word))) {
129 results.add(word);
130 }
131 }
132
133 return (String[]) results.toArray(new String[results.size()]);
134
135 }
136 }
137
138 public static Set getMostFrequentWords(int count, Map wordFrequencies) {
139 Set result = new LinkedHashSet();
140
141 Integer max = (Integer) Collections.max(wordFrequencies.values());
142
143 int freq = max.intValue();
144 while (result.size() < count && freq > 0) {
145
146 String words[] = findWordsWithFrequency(wordFrequencies, new Integer(freq));
147 result.addAll(Arrays.asList(words));
148 freq--;
149 }
150
151 return result;
152 }
153
154
155 private static boolean isWord(String word) {
156 if (word != null && !word.trim().equals("")) {
157 return true;
158 } else {
159 return false;
160 }
161 }
162
163 /***
164 * Find all unique words in an array of words
165 *
166 * @param input an array of Strings
167 * @return an array of all unique strings. Order is not guarenteed
168 */
169 public static String[] getUniqueWords(String[] input) {
170 if (input == null) {
171 return new String[0];
172 } else {
173 Set result = new TreeSet();
174 for (int i = 0; i < input.length; i++) {
175 result.add(input[i]);
176 }
177 return (String[]) result.toArray(new String[result.size()]);
178 }
179 }
180
181 /***
182 * Count how many times a word appears in an array of words
183 *
184 * @param word The word to count
185 * @param words non-null array of words
186 */
187 public static int countWords(String word, String[] words) {
188
189
190
191 int itemIndex = Arrays.binarySearch(words, word);
192
193
194 if (itemIndex > 0) {
195 while (itemIndex > 0 && words[itemIndex].equals(word)) {
196 itemIndex--;
197 }
198 }
199
200
201 int count = 0;
202 while (itemIndex < words.length && itemIndex >= 0) {
203 if (words[itemIndex].equals(word)) {
204 count++;
205 }
206
207 itemIndex++;
208 if (itemIndex < words.length) {
209 if (!words[itemIndex].equals(word)) {
210 break;
211 }
212 }
213 }
214
215 return count;
216 }
217
218 /***
219 *
220 * @param input a String which may contain many sentences
221 * @return an array of Strings, each element containing a sentence
222 */
223 public static String[] getSentences(String input) {
224 if (input == null) {
225 return new String[0];
226 } else {
227
228 return input.split("(//.|!|//?)+(//s|//z)");
229 }
230
231 }
232
233 /***
234 * Given an inputStream, this method returns a String. New lines are
235 * replaced with " "
236 */
237 public static String getString(InputStream is) throws IOException {
238
239 BufferedReader reader = new BufferedReader(new InputStreamReader(is));
240 String line = "";
241 StringBuffer stringBuffer = new StringBuffer();
242 while ((line = reader.readLine()) != null) {
243 stringBuffer.append(line);
244 stringBuffer.append(" ");
245 }
246
247 reader.close();
248
249 return stringBuffer.toString().trim();
250 }
251 }