View Javadoc

1   /*
2    * ====================================================================
3    * 
4    * The Apache Software License, Version 1.1
5    *
6    * Copyright (c) 2003 Nick Lothian. All rights reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer. 
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution, if
21   *    any, must include the following acknowlegement:  
22   *       "This product includes software developed by the 
23   *        developers of Classifier4J (http://classifier4j.sf.net/)."
24   *    Alternately, this acknowlegement may appear in the software itself,
25   *    if and wherever such third-party acknowlegements normally appear.
26   *
27   * 4. The name "Classifier4J" must not be used to endorse or promote 
28   *    products derived from this software without prior written 
29   *    permission. For written permission, please contact   
30   *    http://sourceforge.net/users/nicklothian/.
31   *
32   * 5. Products derived from this software may not be called 
33   *    "Classifier4J", nor may "Classifier4J" appear in their names 
34   *    without prior written permission. For written permission, please 
35   *    contact http://sourceforge.net/users/nicklothian/.
36   *
37   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48   * SUCH DAMAGE.
49   * ====================================================================
50   */
51  
52  package net.sf.classifier4J;
53  
54  import java.io.BufferedReader;
55  import java.io.IOException;
56  import java.io.InputStream;
57  import java.io.InputStreamReader;
58  
59  import java.util.ArrayList;
60  import java.util.Arrays;
61  import java.util.Collections;
62  import java.util.HashMap;
63  import java.util.Iterator;
64  import java.util.LinkedHashSet;
65  import java.util.List;
66  import java.util.Map;
67  import java.util.Set;
68  import java.util.TreeSet;
69  
70  /***
71   * @author Nick Lothian
72   * @author Peter Leschev
73   */
74  public class Utilities {
75  
76      public static Map getWordFrequency(String input) {
77          return getWordFrequency(input, false);
78      }
79  
80      public static Map getWordFrequency(String input, boolean caseSensitive) {
81          return getWordFrequency(input, caseSensitive, new DefaultTokenizer(), new DefaultStopWordsProvider());
82      }
83  
84      /***
85       * Get a Map of words and Integer representing the number of each word
86       * 
87       * @param input The String to get the word frequency of
88       * @param caseSensitive true if words should be treated as separate if they have different case
89       * @param tokenizer a junit.framework.TestCase#run()
90       * @param stopWordsProvider
91       * @return
92       */
93      public static Map getWordFrequency(String input, boolean caseSensitive, ITokenizer tokenizer, IStopWordProvider stopWordsProvider) {
94          String convertedInput = input;
95          if (!caseSensitive) {
96              convertedInput = input.toLowerCase();
97          }
98  
99          // tokenize into an array of words
100         String[] words = tokenizer.tokenize(convertedInput);
101         Arrays.sort(words);
102 
103         String[] uniqueWords = getUniqueWords(words);
104 
105         Map result = new HashMap();
106         for (int i = 0; i < uniqueWords.length; i++) {
107             if (stopWordsProvider == null) {
108                 // no stop word provider, so add all words
109                 result.put(uniqueWords[i], new Integer(countWords(uniqueWords[i], words)));
110             } else if (isWord(uniqueWords[i]) && !stopWordsProvider.isStopWord(uniqueWords[i])) {
111                 // add only words that are not stop words			
112                 result.put(uniqueWords[i], new Integer(countWords(uniqueWords[i], words)));
113             }
114         }
115 
116         return result;
117     }
118 
119     private static String[] findWordsWithFrequency(Map wordFrequencies, Integer frequency) {
120         if (wordFrequencies == null || frequency == null) {
121             return new String[0];
122         } else {
123             List results = new ArrayList();
124             Iterator it = wordFrequencies.keySet().iterator();
125 
126             while (it.hasNext()) {
127                 String word = (String) it.next();
128                 if (frequency.equals(wordFrequencies.get(word))) {
129                     results.add(word);
130                 }
131             }
132 
133             return (String[]) results.toArray(new String[results.size()]);
134 
135         }
136     }    
137     
138     public static Set getMostFrequentWords(int count, Map wordFrequencies) {
139         Set result = new LinkedHashSet();
140 
141         Integer max = (Integer) Collections.max(wordFrequencies.values());
142 
143         int freq = max.intValue();
144         while (result.size() < count && freq > 0) {
145             // this is very icky
146             String words[] = findWordsWithFrequency(wordFrequencies, new Integer(freq));
147             result.addAll(Arrays.asList(words));
148             freq--;
149         }
150 
151         return result;
152     }
153 
154     
155     private static boolean isWord(String word) {
156         if (word != null && !word.trim().equals("")) {
157             return true;
158         } else {
159             return false;
160         }
161     }
162 
163     /***
164      * Find all unique words in an array of words
165      * 
166      * @param input an array of Strings
167      * @return an array of all unique strings. Order is not guarenteed
168      */
169     public static String[] getUniqueWords(String[] input) {
170         if (input == null) {
171             return new String[0];
172         } else {
173             Set result = new TreeSet();
174             for (int i = 0; i < input.length; i++) {
175                 result.add(input[i]);
176             }
177             return (String[]) result.toArray(new String[result.size()]);
178         }
179     }
180 
181     /***
182      * Count how many times a word appears in an array of words
183      * 
184      * @param word The word to count
185      * @param words non-null array of words 
186      */
187     public static int countWords(String word, String[] words) {
188         // find the index of one of the items in the array.
189         // From the JDK docs on binarySearch:
190         // If the array contains multiple elements equal to the specified object, there is no guarantee which one will be found. 
191         int itemIndex = Arrays.binarySearch(words, word);
192 
193         // iterate backwards until we find the first match
194         if (itemIndex > 0) {
195             while (itemIndex > 0 && words[itemIndex].equals(word)) {
196                 itemIndex--;
197             }
198         }
199 
200         // now itemIndex is one item before the start of the words
201         int count = 0;
202         while (itemIndex < words.length && itemIndex >= 0) {
203             if (words[itemIndex].equals(word)) {
204                 count++;
205             }
206 
207             itemIndex++;
208             if (itemIndex < words.length) {
209                 if (!words[itemIndex].equals(word)) {
210                     break;
211                 }
212             }
213         }
214 
215         return count;
216     }
217 
218     /***
219      * 
220      * @param input a String which may contain many sentences
221      * @return an array of Strings, each element containing a sentence
222      */
223     public static String[] getSentences(String input) {
224         if (input == null) {
225             return new String[0];
226         } else {
227             // split on a ".", a "!", a "?" followed by a space or EOL
228             return input.split("(//.|!|//?)+(//s|//z)");
229         }
230 
231     }
232 
233     /***
234      * Given an inputStream, this method returns a String. New lines are 
235      * replaced with " "
236      */
237     public static String getString(InputStream is) throws IOException {
238 
239         BufferedReader reader = new BufferedReader(new InputStreamReader(is));
240         String line = "";
241         StringBuffer stringBuffer = new StringBuffer();
242         while ((line = reader.readLine()) != null) {
243             stringBuffer.append(line);
244             stringBuffer.append(" ");
245         }
246 
247         reader.close();
248 
249         return stringBuffer.toString().trim();
250     }
251 }