unit tests coverage

Coverage report

%line %branch

net.sf.classifier4J.Utilities
97%

98%

1
/*

2
* ====================================================================

3
*

4
* The Apache Software License, Version 1.1

5
*

6
* Copyright (c) 2003 Nick Lothian. All rights reserved.

7
*

8
* Redistribution and use in source and binary forms, with or without

9
* modification, are permitted provided that the following conditions

10
* are met:

11
*

12
* 1. Redistributions of source code must retain the above copyright

13
* notice, this list of conditions and the following disclaimer.

14
*

15
* 2. Redistributions in binary form must reproduce the above copyright

16
* notice, this list of conditions and the following disclaimer in

17
* the documentation and/or other materials provided with the

18
* distribution.

19
*

20
* 3. The end-user documentation included with the redistribution, if

21
* any, must include the following acknowlegement:

22
* "This product includes software developed by the

23
* developers of Classifier4J (http://classifier4j.sf.net/)."

24
* Alternately, this acknowlegement may appear in the software itself,

25
* if and wherever such third-party acknowlegements normally appear.

26
*

27
* 4. The name "Classifier4J" must not be used to endorse or promote

28
* products derived from this software without prior written

29
* permission. For written permission, please contact

30
* http://sourceforge.net/users/nicklothian/.

31
*

32
* 5. Products derived from this software may not be called

33
* "Classifier4J", nor may "Classifier4J" appear in their names

34
* without prior written permission. For written permission, please

35
* contact http://sourceforge.net/users/nicklothian/.

36
*

37
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED

38
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES

39
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

40
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR

41
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

42
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

43
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF

44
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

45
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,

46
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT

47
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

48
* SUCH DAMAGE.

49
* ====================================================================

50
*/

51

52
package net.sf.classifier4J;

53

54
import java.io.BufferedReader;

55
import java.io.IOException;

56
import java.io.InputStream;

57
import java.io.InputStreamReader;

58

59
import java.util.ArrayList;

60
import java.util.Arrays;

61
import java.util.Collections;

62
import java.util.HashMap;

63
import java.util.Iterator;

64
import java.util.LinkedHashSet;

65
import java.util.List;

66
import java.util.Map;

67
import java.util.Set;

68
import java.util.TreeSet;

69

70
/**

71
* @author Nick Lothian

72
* @author Peter Leschev

73
*/

74 0
public class Utilities {

75

76
public static Map getWordFrequency(String input) {

77 6
return getWordFrequency(input, false);

78
}

79

80
public static Map getWordFrequency(String input, boolean caseSensitive) {

81 8
return getWordFrequency(input, caseSensitive, new DefaultTokenizer(), new DefaultStopWordsProvider());

82
}

83

84
/**

85
* Get a Map of words and Integer representing the number of each word

86
*

87
* @param input The String to get the word frequency of

88
* @param caseSensitive true if words should be treated as separate if they have different case

89
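* @param tokenizer the ITokenizer used to split the input into an array of words

90
* @param stopWordsProvider identifies stop words to exclude (blank tokens are also dropped); if null, every token is counted

91
* @return a Map from each counted word (String) to its number of occurrences (Integer)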

92
*/

93
public static Map getWordFrequency(String input, boolean caseSensitive, ITokenizer tokenizer, IStopWordProvider stopWordsProvider) {

94 30
String convertedInput = input;

95 30
if (!caseSensitive) {

96 28
convertedInput = input.toLowerCase();

97
}

98

99
// tokenize into an array of words

100 30
String[] words = tokenizer.tokenize(convertedInput);

101 30
Arrays.sort(words);

102

103 30
String[] uniqueWords = getUniqueWords(words);

104

105 30
Map result = new HashMap();

106 218
for (int i = 0; i < uniqueWords.length; i++) {

107 188
if (stopWordsProvider == null) {

108
// no stop word provider, so add all words

109 8
result.put(uniqueWords[i], new Integer(countWords(uniqueWords[i], words)));

110 180
} else if (isWord(uniqueWords[i]) && !stopWordsProvider.isStopWord(uniqueWords[i])) {

111
// add only words that are not stop words

112 106
result.put(uniqueWords[i], new Integer(countWords(uniqueWords[i], words)));

113
}

114
}

115

116 30
return result;

117
}

118

119
private static String[] findWordsWithFrequency(Map wordFrequencies, Integer frequency) {

120 36
if (wordFrequencies == null || frequency == null) {

121 0
return new String[0];

122
} else {

123 36
List results = new ArrayList();

124 36
Iterator it = wordFrequencies.keySet().iterator();

125

126 402
while (it.hasNext()) {

127 330
String word = (String) it.next();

128 330
if (frequency.equals(wordFrequencies.get(word))) {

129 92
results.add(word);

130
}

131
}

132

133 36
return (String[]) results.toArray(new String[results.size()]);

134

135
}

136
}

137

138
public static Set getMostFrequentWords(int count, Map wordFrequencies) {

139 14
Set result = new LinkedHashSet();

140

141 14
Integer max = (Integer) Collections.max(wordFrequencies.values());

142

143 14
int freq = max.intValue();

144 64
while (result.size() < count && freq > 0) {

145
// this is very icky

146 36
String words[] = findWordsWithFrequency(wordFrequencies, new Integer(freq));

147 36
result.addAll(Arrays.asList(words));

148 36
freq--;

149
}

150

151 14
return result;

152
}

153

154

155
private static boolean isWord(String word) {

156 180
if (word != null && !word.trim().equals("")) {

157 170
return true;

158
} else {

159 10
return false;

160
}

161
}

162

163
/**

164
* Find all unique words in an array of words

165
*

166
* @param input an array of Strings

167
* @return an array of all unique strings. Order is not guaranteed

168
*/

169
public static String[] getUniqueWords(String[] input) {

170 36
if (input == null) {

171 2
return new String[0];

172
} else {

173 34
Set result = new TreeSet();

174 308
for (int i = 0; i < input.length; i++) {

175 274
result.add(input[i]);

176
}

177 34
return (String[]) result.toArray(new String[result.size()]);

178
}

179
}

180

181
/**

182
* Count how many times a word appears in an array of words

183
*

184
* @param word The word to count

185
* @param words non-null array of words

186
*/

187
public static int countWords(String word, String[] words) {

188
// find the index of one of the items in the array.

189
// From the JDK docs on binarySearch:

190
// If the array contains multiple elements equal to the specified object, there is no guarantee which one will be found.

191 122
int itemIndex = Arrays.binarySearch(words, word);

192

193
// iterate backwards until we find the first match

194 122
if (itemIndex > 0) {

195 312
while (itemIndex > 0 && words[itemIndex].equals(word)) {

196 108
itemIndex--;

197
}

198
}

199

200
// now itemIndex is one item before the start of the words

201 122
int count = 0;

202 416
while (itemIndex < words.length && itemIndex >= 0) {

203 262
if (words[itemIndex].equals(word)) {

204 166
count++;

205
}

206

207 262
itemIndex++;

208 262
if (itemIndex < words.length) {

209 234
if (!words[itemIndex].equals(word)) {

210 90
break;

211
}

212
}

213
}

214

215 122
return count;

216
}

217

218
/**

219
*

220
* @param input a String which may contain many sentences

221
* @return an array of Strings, each element containing a sentence

222
*/

223
public static String[] getSentences(String input) {

224 16
if (input == null) {

225 2
return new String[0];

226
} else {

227
// split on a ".", a "!", a "?" followed by a space or EOL

228 14
return input.split("(\\.|!|\\?)+(\\s|\\z)");

229
}

230

231
}

232

233
/**

234
* Given an inputStream, this method returns a String. New lines are

235
* replaced with " "

236
*/

237
public static String getString(InputStream is) throws IOException {

238

239 2
BufferedReader reader = new BufferedReader(new InputStreamReader(is));

240 2
String line = "";

241 2
StringBuffer stringBuffer = new StringBuffer();

242 6
while ((line = reader.readLine()) != null) {

243 2
stringBuffer.append(line);

244 2
stringBuffer.append(" ");

245
}

246

247 2
reader.close();

248

249 2
return stringBuffer.toString().trim();

250
}

251
}

This report is generated by jcoverage, Maven and Maven JCoverage Plugin.

1		/*
2		* ====================================================================
3		*
4		* The Apache Software License, Version 1.1
5		*
6		* Copyright (c) 2003 Nick Lothian. All rights reserved.
7		*
8		* Redistribution and use in source and binary forms, with or without
9		* modification, are permitted provided that the following conditions
10		* are met:
11		*
12		* 1. Redistributions of source code must retain the above copyright
13		* notice, this list of conditions and the following disclaimer.
14		*
15		* 2. Redistributions in binary form must reproduce the above copyright
16		* notice, this list of conditions and the following disclaimer in
17		* the documentation and/or other materials provided with the
18		* distribution.
19		*
20		* 3. The end-user documentation included with the redistribution, if
21		* any, must include the following acknowlegement:
22		* "This product includes software developed by the
23		* developers of Classifier4J (http://classifier4j.sf.net/)."
24		* Alternately, this acknowlegement may appear in the software itself,
25		* if and wherever such third-party acknowlegements normally appear.
26		*
27		* 4. The name "Classifier4J" must not be used to endorse or promote
28		* products derived from this software without prior written
29		* permission. For written permission, please contact
30		* http://sourceforge.net/users/nicklothian/.
31		*
32		* 5. Products derived from this software may not be called
33		* "Classifier4J", nor may "Classifier4J" appear in their names
34		* without prior written permission. For written permission, please
35		* contact http://sourceforge.net/users/nicklothian/.
36		*
37		* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38		* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39		* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40		* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41		* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42		* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43		* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44		* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45		* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46		* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47		* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48		* SUCH DAMAGE.
49		* ====================================================================
50		*/
51
52		package net.sf.classifier4J;
53
54		import java.io.BufferedReader;
55		import java.io.IOException;
56		import java.io.InputStream;
57		import java.io.InputStreamReader;
58
59		import java.util.ArrayList;
60		import java.util.Arrays;
61		import java.util.Collections;
62		import java.util.HashMap;
63		import java.util.Iterator;
64		import java.util.LinkedHashSet;
65		import java.util.List;
66		import java.util.Map;
67		import java.util.Set;
68		import java.util.TreeSet;
69
70		/**
71		* @author Nick Lothian
72		* @author Peter Leschev
73		*/
74	0	public class Utilities {
75
76		public static Map getWordFrequency(String input) {
77	6	return getWordFrequency(input, false);
78		}
79
80		public static Map getWordFrequency(String input, boolean caseSensitive) {
81	8	return getWordFrequency(input, caseSensitive, new DefaultTokenizer(), new DefaultStopWordsProvider());
82		}
83
84		/**
85		* Get a Map of words and Integer representing the number of each word
86		*
87		* @param input The String to get the word frequency of
88		* @param caseSensitive true if words should be treated as separate if they have different case
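89		* @param tokenizer the ITokenizer used to split the input into an array of words
90		* @param stopWordsProvider identifies stop words to exclude (blank tokens are also dropped); if null, every token is counted
91		* @return a Map from each counted word (String) to its number of occurrences (Integer)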
92		*/
93		public static Map getWordFrequency(String input, boolean caseSensitive, ITokenizer tokenizer, IStopWordProvider stopWordsProvider) {
94	30	String convertedInput = input;
95	30	if (!caseSensitive) {
96	28	convertedInput = input.toLowerCase();
97		}
98
99		// tokenize into an array of words
100	30	String[] words = tokenizer.tokenize(convertedInput);
101	30	Arrays.sort(words);
102
103	30	String[] uniqueWords = getUniqueWords(words);
104
105	30	Map result = new HashMap();
106	218	for (int i = 0; i < uniqueWords.length; i++) {
107	188	if (stopWordsProvider == null) {
108		// no stop word provider, so add all words
109	8	result.put(uniqueWords[i], new Integer(countWords(uniqueWords[i], words)));
110	180	} else if (isWord(uniqueWords[i]) && !stopWordsProvider.isStopWord(uniqueWords[i])) {
111		// add only words that are not stop words
112	106	result.put(uniqueWords[i], new Integer(countWords(uniqueWords[i], words)));
113		}
114		}
115
116	30	return result;
117		}
118
119		private static String[] findWordsWithFrequency(Map wordFrequencies, Integer frequency) {
120	36	if (wordFrequencies == null || frequency == null) {
121	0	return new String[0];
122		} else {
123	36	List results = new ArrayList();
124	36	Iterator it = wordFrequencies.keySet().iterator();
125
126	402	while (it.hasNext()) {
127	330	String word = (String) it.next();
128	330	if (frequency.equals(wordFrequencies.get(word))) {
129	92	results.add(word);
130		}
131		}
132
133	36	return (String[]) results.toArray(new String[results.size()]);
134
135		}
136		}
137
138		public static Set getMostFrequentWords(int count, Map wordFrequencies) {
139	14	Set result = new LinkedHashSet();
140
141	14	Integer max = (Integer) Collections.max(wordFrequencies.values());
142
143	14	int freq = max.intValue();
144	64	while (result.size() < count && freq > 0) {
145		// this is very icky
146	36	String words[] = findWordsWithFrequency(wordFrequencies, new Integer(freq));
147	36	result.addAll(Arrays.asList(words));
148	36	freq--;
149		}
150
151	14	return result;
152		}
153
154
155		private static boolean isWord(String word) {
156	180	if (word != null && !word.trim().equals("")) {
157	170	return true;
158		} else {
159	10	return false;
160		}
161		}
162
163		/**
164		* Find all unique words in an array of words
165		*
166		* @param input an array of Strings
167		* @return an array of all unique strings. Order is not guaranteed
168		*/
169		public static String[] getUniqueWords(String[] input) {
170	36	if (input == null) {
171	2	return new String[0];
172		} else {
173	34	Set result = new TreeSet();
174	308	for (int i = 0; i < input.length; i++) {
175	274	result.add(input[i]);
176		}
177	34	return (String[]) result.toArray(new String[result.size()]);
178		}
179		}
180
181		/**
182		* Count how many times a word appears in an array of words
183		*
184		* @param word The word to count
185		* @param words non-null array of words
186		*/
187		public static int countWords(String word, String[] words) {
188		// find the index of one of the items in the array.
189		// From the JDK docs on binarySearch:
190		// If the array contains multiple elements equal to the specified object, there is no guarantee which one will be found.
191	122	int itemIndex = Arrays.binarySearch(words, word);
192
193		// iterate backwards until we find the first match
194	122	if (itemIndex > 0) {
195	312	while (itemIndex > 0 && words[itemIndex].equals(word)) {
196	108	itemIndex--;
197		}
198		}
199
200		// now itemIndex is one item before the start of the words
201	122	int count = 0;
202	416	while (itemIndex < words.length && itemIndex >= 0) {
203	262	if (words[itemIndex].equals(word)) {
204	166	count++;
205		}
206
207	262	itemIndex++;
208	262	if (itemIndex < words.length) {
209	234	if (!words[itemIndex].equals(word)) {
210	90	break;
211		}
212		}
213		}
214
215	122	return count;
216		}
217
218		/**
219		*
220		* @param input a String which may contain many sentences
221		* @return an array of Strings, each element containing a sentence
222		*/
223		public static String[] getSentences(String input) {
224	16	if (input == null) {
225	2	return new String[0];
226		} else {
227		// split on a ".", a "!", a "?" followed by a space or EOL
228	14	return input.split("(\\.|!|\\?)+(\\s|\\z)");
229		}
230
231		}
232
233		/**
234		* Given an inputStream, this method returns a String. New lines are
235		* replaced with " "
236		*/
237		public static String getString(InputStream is) throws IOException {
238
239	2	BufferedReader reader = new BufferedReader(new InputStreamReader(is));
240	2	String line = "";
241	2	StringBuffer stringBuffer = new StringBuffer();
242	6	while ((line = reader.readLine()) != null) {
243	2	stringBuffer.append(line);
244	2	stringBuffer.append(" ");
245		}
246
247	2	reader.close();
248
249	2	return stringBuffer.toString().trim();
250		}
251		}