%line | %branch | |||||||||
---|---|---|---|---|---|---|---|---|---|---|
net.sf.classifier4J.Utilities |
|
|
1 | /* |
|
2 | * ==================================================================== |
|
3 | * |
|
4 | * The Apache Software License, Version 1.1 |
|
5 | * |
|
6 | * Copyright (c) 2003 Nick Lothian. All rights reserved. |
|
7 | * |
|
8 | * Redistribution and use in source and binary forms, with or without |
|
9 | * modification, are permitted provided that the following conditions |
|
10 | * are met: |
|
11 | * |
|
12 | * 1. Redistributions of source code must retain the above copyright |
|
13 | * notice, this list of conditions and the following disclaimer. |
|
14 | * |
|
15 | * 2. Redistributions in binary form must reproduce the above copyright |
|
16 | * notice, this list of conditions and the following disclaimer in |
|
17 | * the documentation and/or other materials provided with the |
|
18 | * distribution. |
|
19 | * |
|
20 | * 3. The end-user documentation included with the redistribution, if |
|
21 | * any, must include the following acknowlegement: |
|
22 | * "This product includes software developed by the |
|
23 | * developers of Classifier4J (http://classifier4j.sf.net/)." |
|
24 | * Alternately, this acknowlegement may appear in the software itself, |
|
25 | * if and wherever such third-party acknowlegements normally appear. |
|
26 | * |
|
27 | * 4. The name "Classifier4J" must not be used to endorse or promote |
|
28 | * products derived from this software without prior written |
|
29 | * permission. For written permission, please contact |
|
30 | * http://sourceforge.net/users/nicklothian/. |
|
31 | * |
|
32 | * 5. Products derived from this software may not be called |
|
33 | * "Classifier4J", nor may "Classifier4J" appear in their names |
|
34 | * without prior written permission. For written permission, please |
|
35 | * contact http://sourceforge.net/users/nicklothian/. |
|
36 | * |
|
37 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED |
|
38 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
|
39 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
|
40 | * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR |
|
41 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
42 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
43 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
|
44 | * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|
45 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
|
46 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
|
47 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
|
48 | * SUCH DAMAGE. |
|
49 | * ==================================================================== |
|
50 | */ |
|
51 | ||
52 | package net.sf.classifier4J; |
|
53 | ||
54 | import java.io.BufferedReader; |
|
55 | import java.io.IOException; |
|
56 | import java.io.InputStream; |
|
57 | import java.io.InputStreamReader; |
|
58 | ||
59 | import java.util.ArrayList; |
|
60 | import java.util.Arrays; |
|
61 | import java.util.Collections; |
|
62 | import java.util.HashMap; |
|
63 | import java.util.Iterator; |
|
64 | import java.util.LinkedHashSet; |
|
65 | import java.util.List; |
|
66 | import java.util.Map; |
|
67 | import java.util.Set; |
|
68 | import java.util.TreeSet; |
|
69 | ||
70 | /** |
|
71 | * @author Nick Lothian |
|
72 | * @author Peter Leschev |
|
73 | */ |
|
74 | 0 | public class Utilities { |
75 | ||
76 | public static Map getWordFrequency(String input) { |
|
77 | 6 | return getWordFrequency(input, false); |
78 | } |
|
79 | ||
80 | public static Map getWordFrequency(String input, boolean caseSensitive) { |
|
81 | 8 | return getWordFrequency(input, caseSensitive, new DefaultTokenizer(), new DefaultStopWordsProvider()); |
82 | } |
|
83 | ||
84 | /** |
|
85 | * Get a Map of words and Integer representing the number of each word |
|
86 | * |
|
87 | * @param input The String to get the word frequency of |
|
88 | * @param caseSensitive true if words should be treated as separate if they have different case |
|
89 | * @param tokenizer a junit.framework.TestCase#run() |
|
90 | * @param stopWordsProvider |
|
91 | * @return |
|
92 | */ |
|
93 | public static Map getWordFrequency(String input, boolean caseSensitive, ITokenizer tokenizer, IStopWordProvider stopWordsProvider) { |
|
94 | 30 | String convertedInput = input; |
95 | 30 | if (!caseSensitive) { |
96 | 28 | convertedInput = input.toLowerCase(); |
97 | } |
|
98 | ||
99 | // tokenize into an array of words |
|
100 | 30 | String[] words = tokenizer.tokenize(convertedInput); |
101 | 30 | Arrays.sort(words); |
102 | ||
103 | 30 | String[] uniqueWords = getUniqueWords(words); |
104 | ||
105 | 30 | Map result = new HashMap(); |
106 | 218 | for (int i = 0; i < uniqueWords.length; i++) { |
107 | 188 | if (stopWordsProvider == null) { |
108 | // no stop word provider, so add all words |
|
109 | 8 | result.put(uniqueWords[i], new Integer(countWords(uniqueWords[i], words))); |
110 | 180 | } else if (isWord(uniqueWords[i]) && !stopWordsProvider.isStopWord(uniqueWords[i])) { |
111 | // add only words that are not stop words |
|
112 | 106 | result.put(uniqueWords[i], new Integer(countWords(uniqueWords[i], words))); |
113 | } |
|
114 | } |
|
115 | ||
116 | 30 | return result; |
117 | } |
|
118 | ||
119 | private static String[] findWordsWithFrequency(Map wordFrequencies, Integer frequency) { |
|
120 | 36 | if (wordFrequencies == null || frequency == null) { |
121 | 0 | return new String[0]; |
122 | } else { |
|
123 | 36 | List results = new ArrayList(); |
124 | 36 | Iterator it = wordFrequencies.keySet().iterator(); |
125 | ||
126 | 402 | while (it.hasNext()) { |
127 | 330 | String word = (String) it.next(); |
128 | 330 | if (frequency.equals(wordFrequencies.get(word))) { |
129 | 92 | results.add(word); |
130 | } |
|
131 | } |
|
132 | ||
133 | 36 | return (String[]) results.toArray(new String[results.size()]); |
134 | ||
135 | } |
|
136 | } |
|
137 | ||
138 | public static Set getMostFrequentWords(int count, Map wordFrequencies) { |
|
139 | 14 | Set result = new LinkedHashSet(); |
140 | ||
141 | 14 | Integer max = (Integer) Collections.max(wordFrequencies.values()); |
142 | ||
143 | 14 | int freq = max.intValue(); |
144 | 64 | while (result.size() < count && freq > 0) { |
145 | // this is very icky |
|
146 | 36 | String words[] = findWordsWithFrequency(wordFrequencies, new Integer(freq)); |
147 | 36 | result.addAll(Arrays.asList(words)); |
148 | 36 | freq--; |
149 | } |
|
150 | ||
151 | 14 | return result; |
152 | } |
|
153 | ||
154 | ||
155 | private static boolean isWord(String word) { |
|
156 | 180 | if (word != null && !word.trim().equals("")) { |
157 | 170 | return true; |
158 | } else { |
|
159 | 10 | return false; |
160 | } |
|
161 | } |
|
162 | ||
163 | /** |
|
164 | * Find all unique words in an array of words |
|
165 | * |
|
166 | * @param input an array of Strings |
|
167 | * @return an array of all unique strings. Order is not guarenteed |
|
168 | */ |
|
169 | public static String[] getUniqueWords(String[] input) { |
|
170 | 36 | if (input == null) { |
171 | 2 | return new String[0]; |
172 | } else { |
|
173 | 34 | Set result = new TreeSet(); |
174 | 308 | for (int i = 0; i < input.length; i++) { |
175 | 274 | result.add(input[i]); |
176 | } |
|
177 | 34 | return (String[]) result.toArray(new String[result.size()]); |
178 | } |
|
179 | } |
|
180 | ||
181 | /** |
|
182 | * Count how many times a word appears in an array of words |
|
183 | * |
|
184 | * @param word The word to count |
|
185 | * @param words non-null array of words |
|
186 | */ |
|
187 | public static int countWords(String word, String[] words) { |
|
188 | // find the index of one of the items in the array. |
|
189 | // From the JDK docs on binarySearch: |
|
190 | // If the array contains multiple elements equal to the specified object, there is no guarantee which one will be found. |
|
191 | 122 | int itemIndex = Arrays.binarySearch(words, word); |
192 | ||
193 | // iterate backwards until we find the first match |
|
194 | 122 | if (itemIndex > 0) { |
195 | 312 | while (itemIndex > 0 && words[itemIndex].equals(word)) { |
196 | 108 | itemIndex--; |
197 | } |
|
198 | } |
|
199 | ||
200 | // now itemIndex is one item before the start of the words |
|
201 | 122 | int count = 0; |
202 | 416 | while (itemIndex < words.length && itemIndex >= 0) { |
203 | 262 | if (words[itemIndex].equals(word)) { |
204 | 166 | count++; |
205 | } |
|
206 | ||
207 | 262 | itemIndex++; |
208 | 262 | if (itemIndex < words.length) { |
209 | 234 | if (!words[itemIndex].equals(word)) { |
210 | 90 | break; |
211 | } |
|
212 | } |
|
213 | } |
|
214 | ||
215 | 122 | return count; |
216 | } |
|
217 | ||
218 | /** |
|
219 | * |
|
220 | * @param input a String which may contain many sentences |
|
221 | * @return an array of Strings, each element containing a sentence |
|
222 | */ |
|
223 | public static String[] getSentences(String input) { |
|
224 | 16 | if (input == null) { |
225 | 2 | return new String[0]; |
226 | } else { |
|
227 | // split on a ".", a "!", a "?" followed by a space or EOL |
|
228 | 14 | return input.split("(\\.|!|\\?)+(\\s|\\z)"); |
229 | } |
|
230 | ||
231 | } |
|
232 | ||
233 | /** |
|
234 | * Given an inputStream, this method returns a String. New lines are |
|
235 | * replaced with " " |
|
236 | */ |
|
237 | public static String getString(InputStream is) throws IOException { |
|
238 | ||
239 | 2 | BufferedReader reader = new BufferedReader(new InputStreamReader(is)); |
240 | 2 | String line = ""; |
241 | 2 | StringBuffer stringBuffer = new StringBuffer(); |
242 | 6 | while ((line = reader.readLine()) != null) { |
243 | 2 | stringBuffer.append(line); |
244 | 2 | stringBuffer.append(" "); |
245 | } |
|
246 | ||
247 | 2 | reader.close(); |
248 | ||
249 | 2 | return stringBuffer.toString().trim(); |
250 | } |
|
251 | } |
This report is generated by jcoverage, Maven and Maven JCoverage Plugin. |