Coverage report

  %line %branch
net.sf.classifier4J.Utilities
97% 
98% 

 1  
 /*
 2  
  * ====================================================================
 3  
  * 
 4  
  * The Apache Software License, Version 1.1
 5  
  *
 6  
  * Copyright (c) 2003 Nick Lothian. All rights reserved.
 7  
  *
 8  
  * Redistribution and use in source and binary forms, with or without
 9  
  * modification, are permitted provided that the following conditions
 10  
  * are met:
 11  
  *
 12  
  * 1. Redistributions of source code must retain the above copyright
 13  
  *    notice, this list of conditions and the following disclaimer. 
 14  
  *
 15  
  * 2. Redistributions in binary form must reproduce the above copyright
 16  
  *    notice, this list of conditions and the following disclaimer in
 17  
  *    the documentation and/or other materials provided with the
 18  
  *    distribution.
 19  
  *
 20  
  * 3. The end-user documentation included with the redistribution, if
 21  
  *    any, must include the following acknowlegement:  
 22  
  *       "This product includes software developed by the 
 23  
  *        developers of Classifier4J (http://classifier4j.sf.net/)."
 24  
  *    Alternately, this acknowlegement may appear in the software itself,
 25  
  *    if and wherever such third-party acknowlegements normally appear.
 26  
  *
 27  
  * 4. The name "Classifier4J" must not be used to endorse or promote 
 28  
  *    products derived from this software without prior written 
 29  
  *    permission. For written permission, please contact   
 30  
  *    http://sourceforge.net/users/nicklothian/.
 31  
  *
 32  
  * 5. Products derived from this software may not be called 
 33  
  *    "Classifier4J", nor may "Classifier4J" appear in their names 
 34  
  *    without prior written permission. For written permission, please 
 35  
  *    contact http://sourceforge.net/users/nicklothian/.
 36  
  *
 37  
  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 38  
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 39  
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 40  
  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 41  
  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 42  
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 43  
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 44  
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 45  
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 46  
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 47  
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 48  
  * SUCH DAMAGE.
 49  
  * ====================================================================
 50  
  */
 51  
 
 52  
 package net.sf.classifier4J;
 53  
 
 54  
 import java.io.BufferedReader;
 55  
 import java.io.IOException;
 56  
 import java.io.InputStream;
 57  
 import java.io.InputStreamReader;
 58  
 
 59  
 import java.util.ArrayList;
 60  
 import java.util.Arrays;
 61  
 import java.util.Collections;
 62  
 import java.util.HashMap;
 63  
 import java.util.Iterator;
 64  
 import java.util.LinkedHashSet;
 65  
 import java.util.List;
 66  
 import java.util.Map;
 67  
 import java.util.Set;
 68  
 import java.util.TreeSet;
 69  
 
 70  
 /**
 71  
  * @author Nick Lothian
 72  
  * @author Peter Leschev
 73  
  */
 74  0
 public class Utilities {
 75  
 
 76  
     public static Map getWordFrequency(String input) {
 77  6
         return getWordFrequency(input, false);
 78  
     }
 79  
 
 80  
     public static Map getWordFrequency(String input, boolean caseSensitive) {
 81  8
         return getWordFrequency(input, caseSensitive, new DefaultTokenizer(), new DefaultStopWordsProvider());
 82  
     }
 83  
 
 84  
     /**
 85  
      * Get a Map from each word to an Integer holding the number of times that word occurs
 86  
      * 
 87  
      * @param input The String to get the word frequency of
 88  
      * @param caseSensitive true if words should be treated as separate if they have different case
 89  
      * @param tokenizer the ITokenizer used to split the input into an array of words
 90  
      * @param stopWordsProvider used to exclude stop words (blank words are skipped too); if null, every word is counted
 91  
      * @return a Map whose keys are the counted words and whose values are Integer occurrence counts
 92  
      */
 93  
     public static Map getWordFrequency(String input, boolean caseSensitive, ITokenizer tokenizer, IStopWordProvider stopWordsProvider) {
 94  30
         String convertedInput = input;
 95  30
         if (!caseSensitive) {
 96  28
             convertedInput = input.toLowerCase();
 97  
         }
 98  
 
 99  
         // tokenize into an array of words
 100  30
         String[] words = tokenizer.tokenize(convertedInput);
 101  30
         Arrays.sort(words);
 102  
 
 103  30
         String[] uniqueWords = getUniqueWords(words);
 104  
 
 105  30
         Map result = new HashMap();
 106  218
         for (int i = 0; i < uniqueWords.length; i++) {
 107  188
             if (stopWordsProvider == null) {
 108  
                 // no stop word provider, so add all words
 109  8
                 result.put(uniqueWords[i], new Integer(countWords(uniqueWords[i], words)));
 110  180
             } else if (isWord(uniqueWords[i]) && !stopWordsProvider.isStopWord(uniqueWords[i])) {
 111  
                 // add only words that are not stop words			
 112  106
                 result.put(uniqueWords[i], new Integer(countWords(uniqueWords[i], words)));
 113  
             }
 114  
         }
 115  
 
 116  30
         return result;
 117  
     }
 118  
 
 119  
     private static String[] findWordsWithFrequency(Map wordFrequencies, Integer frequency) {
 120  36
         if (wordFrequencies == null || frequency == null) {
 121  0
             return new String[0];
 122  
         } else {
 123  36
             List results = new ArrayList();
 124  36
             Iterator it = wordFrequencies.keySet().iterator();
 125  
 
 126  402
             while (it.hasNext()) {
 127  330
                 String word = (String) it.next();
 128  330
                 if (frequency.equals(wordFrequencies.get(word))) {
 129  92
                     results.add(word);
 130  
                 }
 131  
             }
 132  
 
 133  36
             return (String[]) results.toArray(new String[results.size()]);
 134  
 
 135  
         }
 136  
     }    
 137  
     
 138  
     public static Set getMostFrequentWords(int count, Map wordFrequencies) {
 139  14
         Set result = new LinkedHashSet();
 140  
 
 141  14
         Integer max = (Integer) Collections.max(wordFrequencies.values());
 142  
 
 143  14
         int freq = max.intValue();
 144  64
         while (result.size() < count && freq > 0) {
 145  
             // this is very icky
 146  36
             String words[] = findWordsWithFrequency(wordFrequencies, new Integer(freq));
 147  36
             result.addAll(Arrays.asList(words));
 148  36
             freq--;
 149  
         }
 150  
 
 151  14
         return result;
 152  
     }
 153  
 
 154  
     
 155  
     private static boolean isWord(String word) {
 156  180
         if (word != null && !word.trim().equals("")) {
 157  170
             return true;
 158  
         } else {
 159  10
             return false;
 160  
         }
 161  
     }
 162  
 
 163  
     /**
 164  
      * Find all unique words in an array of words
 165  
      * 
 166  
      * @param input an array of Strings
 167  
      * @return an array of all unique strings. Order is not guaranteed
 168  
      */
 169  
     public static String[] getUniqueWords(String[] input) {
 170  36
         if (input == null) {
 171  2
             return new String[0];
 172  
         } else {
 173  34
             Set result = new TreeSet();
 174  308
             for (int i = 0; i < input.length; i++) {
 175  274
                 result.add(input[i]);
 176  
             }
 177  34
             return (String[]) result.toArray(new String[result.size()]);
 178  
         }
 179  
     }
 180  
 
 181  
     /**
 182  
      * Count how many times a word appears in an array of words
 183  
      * 
 184  
      * @param word The word to count
 185  
      * @param words non-null array of words 
 186  
      */
 187  
     public static int countWords(String word, String[] words) {
 188  
         // find the index of one of the items in the array.
 189  
         // From the JDK docs on binarySearch:
 190  
         // If the array contains multiple elements equal to the specified object, there is no guarantee which one will be found. 
 191  122
         int itemIndex = Arrays.binarySearch(words, word);
 192  
 
 193  
         // iterate backwards until we find the first match
 194  122
         if (itemIndex > 0) {
 195  312
             while (itemIndex > 0 && words[itemIndex].equals(word)) {
 196  108
                 itemIndex--;
 197  
             }
 198  
         }
 199  
 
 200  
         // now itemIndex is one item before the start of the words
 201  122
         int count = 0;
 202  416
         while (itemIndex < words.length && itemIndex >= 0) {
 203  262
             if (words[itemIndex].equals(word)) {
 204  166
                 count++;
 205  
             }
 206  
 
 207  262
             itemIndex++;
 208  262
             if (itemIndex < words.length) {
 209  234
                 if (!words[itemIndex].equals(word)) {
 210  90
                     break;
 211  
                 }
 212  
             }
 213  
         }
 214  
 
 215  122
         return count;
 216  
     }
 217  
 
 218  
     /**
 219  
      * 
 220  
      * @param input a String which may contain many sentences
 221  
      * @return an array of Strings, each element containing a sentence
 222  
      */
 223  
     public static String[] getSentences(String input) {
 224  16
         if (input == null) {
 225  2
             return new String[0];
 226  
         } else {
 227  
             // split on one or more ".", "!" or "?" followed by whitespace or the end of the input
 228  14
             return input.split("(\\.|!|\\?)+(\\s|\\z)");
 229  
         }
 230  
 
 231  
     }
 232  
 
 233  
     /**
 234  
      * Given an inputStream, this method returns a String. New lines are 
 235  
      * replaced with " "
 236  
      */
 237  
     public static String getString(InputStream is) throws IOException {
 238  
 
 239  2
         BufferedReader reader = new BufferedReader(new InputStreamReader(is));
 240  2
         String line = "";
 241  2
         StringBuffer stringBuffer = new StringBuffer();
 242  6
         while ((line = reader.readLine()) != null) {
 243  2
             stringBuffer.append(line);
 244  2
             stringBuffer.append(" ");
 245  
         }
 246  
 
 247  2
         reader.close();
 248  
 
 249  2
         return stringBuffer.toString().trim();
 250  
     }
 251  
 }

This report is generated by jcoverage, Maven and Maven JCoverage Plugin.