| %line | %branch | |||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| net.sf.classifier4J.Utilities |
|
|
| 1 | /* |
|
| 2 | * ==================================================================== |
|
| 3 | * |
|
| 4 | * The Apache Software License, Version 1.1 |
|
| 5 | * |
|
| 6 | * Copyright (c) 2003 Nick Lothian. All rights reserved. |
|
| 7 | * |
|
| 8 | * Redistribution and use in source and binary forms, with or without |
|
| 9 | * modification, are permitted provided that the following conditions |
|
| 10 | * are met: |
|
| 11 | * |
|
| 12 | * 1. Redistributions of source code must retain the above copyright |
|
| 13 | * notice, this list of conditions and the following disclaimer. |
|
| 14 | * |
|
| 15 | * 2. Redistributions in binary form must reproduce the above copyright |
|
| 16 | * notice, this list of conditions and the following disclaimer in |
|
| 17 | * the documentation and/or other materials provided with the |
|
| 18 | * distribution. |
|
| 19 | * |
|
| 20 | * 3. The end-user documentation included with the redistribution, if |
|
| 21 | * any, must include the following acknowlegement: |
|
| 22 | * "This product includes software developed by the |
|
| 23 | * developers of Classifier4J (http://classifier4j.sf.net/)." |
|
| 24 | * Alternately, this acknowlegement may appear in the software itself, |
|
| 25 | * if and wherever such third-party acknowlegements normally appear. |
|
| 26 | * |
|
| 27 | * 4. The name "Classifier4J" must not be used to endorse or promote |
|
| 28 | * products derived from this software without prior written |
|
| 29 | * permission. For written permission, please contact |
|
| 30 | * http://sourceforge.net/users/nicklothian/. |
|
| 31 | * |
|
| 32 | * 5. Products derived from this software may not be called |
|
| 33 | * "Classifier4J", nor may "Classifier4J" appear in their names |
|
| 34 | * without prior written permission. For written permission, please |
|
| 35 | * contact http://sourceforge.net/users/nicklothian/. |
|
| 36 | * |
|
| 37 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED |
|
| 38 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
|
| 39 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
|
| 40 | * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR |
|
| 41 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
| 42 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
| 43 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
|
| 44 | * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|
| 45 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
|
| 46 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
|
| 47 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
|
| 48 | * SUCH DAMAGE. |
|
| 49 | * ==================================================================== |
|
| 50 | */ |
|
| 51 | ||
| 52 | package net.sf.classifier4J; |
|
| 53 | ||
| 54 | import java.io.BufferedReader; |
|
| 55 | import java.io.IOException; |
|
| 56 | import java.io.InputStream; |
|
| 57 | import java.io.InputStreamReader; |
|
| 58 | ||
| 59 | import java.util.ArrayList; |
|
| 60 | import java.util.Arrays; |
|
| 61 | import java.util.Collections; |
|
| 62 | import java.util.HashMap; |
|
| 63 | import java.util.Iterator; |
|
| 64 | import java.util.LinkedHashSet; |
|
| 65 | import java.util.List; |
|
| 66 | import java.util.Map; |
|
| 67 | import java.util.Set; |
|
| 68 | import java.util.TreeSet; |
|
| 69 | ||
| 70 | /** |
|
| 71 | * @author Nick Lothian |
|
| 72 | * @author Peter Leschev |
|
| 73 | */ |
|
| 74 | 0 | public class Utilities { |
| 75 | ||
| 76 | public static Map getWordFrequency(String input) { |
|
| 77 | 6 | return getWordFrequency(input, false); |
| 78 | } |
|
| 79 | ||
| 80 | public static Map getWordFrequency(String input, boolean caseSensitive) { |
|
| 81 | 8 | return getWordFrequency(input, caseSensitive, new DefaultTokenizer(), new DefaultStopWordsProvider()); |
| 82 | } |
|
| 83 | ||
| 84 | /** |
|
| 85 | * Get a Map of words and Integer representing the number of each word |
|
| 86 | * |
|
| 87 | * @param input The String to get the word frequency of |
|
| 88 | * @param caseSensitive true if words should be treated as separate if they have different case |
|
| 89 | * @param tokenizer a junit.framework.TestCase#run() |
|
| 90 | * @param stopWordsProvider |
|
| 91 | * @return |
|
| 92 | */ |
|
| 93 | public static Map getWordFrequency(String input, boolean caseSensitive, ITokenizer tokenizer, IStopWordProvider stopWordsProvider) { |
|
| 94 | 30 | String convertedInput = input; |
| 95 | 30 | if (!caseSensitive) { |
| 96 | 28 | convertedInput = input.toLowerCase(); |
| 97 | } |
|
| 98 | ||
| 99 | // tokenize into an array of words |
|
| 100 | 30 | String[] words = tokenizer.tokenize(convertedInput); |
| 101 | 30 | Arrays.sort(words); |
| 102 | ||
| 103 | 30 | String[] uniqueWords = getUniqueWords(words); |
| 104 | ||
| 105 | 30 | Map result = new HashMap(); |
| 106 | 218 | for (int i = 0; i < uniqueWords.length; i++) { |
| 107 | 188 | if (stopWordsProvider == null) { |
| 108 | // no stop word provider, so add all words |
|
| 109 | 8 | result.put(uniqueWords[i], new Integer(countWords(uniqueWords[i], words))); |
| 110 | 180 | } else if (isWord(uniqueWords[i]) && !stopWordsProvider.isStopWord(uniqueWords[i])) { |
| 111 | // add only words that are not stop words |
|
| 112 | 106 | result.put(uniqueWords[i], new Integer(countWords(uniqueWords[i], words))); |
| 113 | } |
|
| 114 | } |
|
| 115 | ||
| 116 | 30 | return result; |
| 117 | } |
|
| 118 | ||
| 119 | private static String[] findWordsWithFrequency(Map wordFrequencies, Integer frequency) { |
|
| 120 | 36 | if (wordFrequencies == null || frequency == null) { |
| 121 | 0 | return new String[0]; |
| 122 | } else { |
|
| 123 | 36 | List results = new ArrayList(); |
| 124 | 36 | Iterator it = wordFrequencies.keySet().iterator(); |
| 125 | ||
| 126 | 402 | while (it.hasNext()) { |
| 127 | 330 | String word = (String) it.next(); |
| 128 | 330 | if (frequency.equals(wordFrequencies.get(word))) { |
| 129 | 92 | results.add(word); |
| 130 | } |
|
| 131 | } |
|
| 132 | ||
| 133 | 36 | return (String[]) results.toArray(new String[results.size()]); |
| 134 | ||
| 135 | } |
|
| 136 | } |
|
| 137 | ||
| 138 | public static Set getMostFrequentWords(int count, Map wordFrequencies) { |
|
| 139 | 14 | Set result = new LinkedHashSet(); |
| 140 | ||
| 141 | 14 | Integer max = (Integer) Collections.max(wordFrequencies.values()); |
| 142 | ||
| 143 | 14 | int freq = max.intValue(); |
| 144 | 64 | while (result.size() < count && freq > 0) { |
| 145 | // this is very icky |
|
| 146 | 36 | String words[] = findWordsWithFrequency(wordFrequencies, new Integer(freq)); |
| 147 | 36 | result.addAll(Arrays.asList(words)); |
| 148 | 36 | freq--; |
| 149 | } |
|
| 150 | ||
| 151 | 14 | return result; |
| 152 | } |
|
| 153 | ||
| 154 | ||
| 155 | private static boolean isWord(String word) { |
|
| 156 | 180 | if (word != null && !word.trim().equals("")) { |
| 157 | 170 | return true; |
| 158 | } else { |
|
| 159 | 10 | return false; |
| 160 | } |
|
| 161 | } |
|
| 162 | ||
| 163 | /** |
|
| 164 | * Find all unique words in an array of words |
|
| 165 | * |
|
| 166 | * @param input an array of Strings |
|
| 167 | * @return an array of all unique strings. Order is not guarenteed |
|
| 168 | */ |
|
| 169 | public static String[] getUniqueWords(String[] input) { |
|
| 170 | 36 | if (input == null) { |
| 171 | 2 | return new String[0]; |
| 172 | } else { |
|
| 173 | 34 | Set result = new TreeSet(); |
| 174 | 308 | for (int i = 0; i < input.length; i++) { |
| 175 | 274 | result.add(input[i]); |
| 176 | } |
|
| 177 | 34 | return (String[]) result.toArray(new String[result.size()]); |
| 178 | } |
|
| 179 | } |
|
| 180 | ||
| 181 | /** |
|
| 182 | * Count how many times a word appears in an array of words |
|
| 183 | * |
|
| 184 | * @param word The word to count |
|
| 185 | * @param words non-null array of words |
|
| 186 | */ |
|
| 187 | public static int countWords(String word, String[] words) { |
|
| 188 | // find the index of one of the items in the array. |
|
| 189 | // From the JDK docs on binarySearch: |
|
| 190 | // If the array contains multiple elements equal to the specified object, there is no guarantee which one will be found. |
|
| 191 | 122 | int itemIndex = Arrays.binarySearch(words, word); |
| 192 | ||
| 193 | // iterate backwards until we find the first match |
|
| 194 | 122 | if (itemIndex > 0) { |
| 195 | 312 | while (itemIndex > 0 && words[itemIndex].equals(word)) { |
| 196 | 108 | itemIndex--; |
| 197 | } |
|
| 198 | } |
|
| 199 | ||
| 200 | // now itemIndex is one item before the start of the words |
|
| 201 | 122 | int count = 0; |
| 202 | 416 | while (itemIndex < words.length && itemIndex >= 0) { |
| 203 | 262 | if (words[itemIndex].equals(word)) { |
| 204 | 166 | count++; |
| 205 | } |
|
| 206 | ||
| 207 | 262 | itemIndex++; |
| 208 | 262 | if (itemIndex < words.length) { |
| 209 | 234 | if (!words[itemIndex].equals(word)) { |
| 210 | 90 | break; |
| 211 | } |
|
| 212 | } |
|
| 213 | } |
|
| 214 | ||
| 215 | 122 | return count; |
| 216 | } |
|
| 217 | ||
| 218 | /** |
|
| 219 | * |
|
| 220 | * @param input a String which may contain many sentences |
|
| 221 | * @return an array of Strings, each element containing a sentence |
|
| 222 | */ |
|
| 223 | public static String[] getSentences(String input) { |
|
| 224 | 16 | if (input == null) { |
| 225 | 2 | return new String[0]; |
| 226 | } else { |
|
| 227 | // split on a ".", a "!", a "?" followed by a space or EOL |
|
| 228 | 14 | return input.split("(\\.|!|\\?)+(\\s|\\z)"); |
| 229 | } |
|
| 230 | ||
| 231 | } |
|
| 232 | ||
| 233 | /** |
|
| 234 | * Given an inputStream, this method returns a String. New lines are |
|
| 235 | * replaced with " " |
|
| 236 | */ |
|
| 237 | public static String getString(InputStream is) throws IOException { |
|
| 238 | ||
| 239 | 2 | BufferedReader reader = new BufferedReader(new InputStreamReader(is)); |
| 240 | 2 | String line = ""; |
| 241 | 2 | StringBuffer stringBuffer = new StringBuffer(); |
| 242 | 6 | while ((line = reader.readLine()) != null) { |
| 243 | 2 | stringBuffer.append(line); |
| 244 | 2 | stringBuffer.append(" "); |
| 245 | } |
|
| 246 | ||
| 247 | 2 | reader.close(); |
| 248 | ||
| 249 | 2 | return stringBuffer.toString().trim(); |
| 250 | } |
|
| 251 | } |
| This report is generated by jcoverage, Maven and Maven JCoverage Plugin. |