unit tests coverage

Coverage report

%line %branch

net.sf.classifier4J.DefaultTokenizer
69%

86%

1
/*

2
* ====================================================================

3
*

4
* The Apache Software License, Version 1.1

5
*

6
* Copyright (c) 2003 Nick Lothian. All rights reserved.

7
*

8
* Redistribution and use in source and binary forms, with or without

9
* modification, are permitted provided that the following conditions

10
* are met:

11
*

12
* 1. Redistributions of source code must retain the above copyright

13
* notice, this list of conditions and the following disclaimer.

14
*

15
* 2. Redistributions in binary form must reproduce the above copyright

16
* notice, this list of conditions and the following disclaimer in

17
* the documentation and/or other materials provided with the

18
* distribution.

19
*

20
* 3. The end-user documentation included with the redistribution, if

21
* any, must include the following acknowlegement:

22
* "This product includes software developed by the

23
* developers of Classifier4J (http://classifier4j.sf.net/)."

24
* Alternately, this acknowlegement may appear in the software itself,

25
* if and wherever such third-party acknowlegements normally appear.

26
*

27
* 4. The name "Classifier4J" must not be used to endorse or promote

28
* products derived from this software without prior written

29
* permission. For written permission, please contact

30
* http://sourceforge.net/users/nicklothian/.

31
*

32
* 5. Products derived from this software may not be called

33
* "Classifier4J", nor may "Classifier4J" appear in their names

34
* without prior written permission. For written permission, please

35
* contact http://sourceforge.net/users/nicklothian/.

36
*

37
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED

38
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES

39
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

40
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR

41
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

42
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

43
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF

44
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

45
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,

46
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT

47
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

48
* SUCH DAMAGE.

49
* ====================================================================

50
*/

51
package net.sf.classifier4J;

52

53
import net.sf.classifier4J.util.ToStringBuilder;

54

55
/**

56
* @author Peter Leschev

57
*/

58
public class DefaultTokenizer implements ITokenizer {

59

60
/**

61
* Use the "\W" (non-word characters) regexp to split the string passed to classify

62
*/

63 12
public static int BREAK_ON_WORD_BREAKS = 1;

64

65
/**

66
* Use the "\s" (whitespace) regexp to split the string passed to classify

67
*/

68 12
public static int BREAK_ON_WHITESPACE = 2;

69

70 52
private int tokenizerConfig = -1;

71 52
private String customTokenizerRegExp = null;

72

73
/**

74
* Constructor that uses the BREAK_ON_WORD_BREAKS tokenizer config by default

75
*/

76
public DefaultTokenizer() {

77 22
this(BREAK_ON_WORD_BREAKS);

78 22
}

79

80 48
public DefaultTokenizer(int tokenizerConfig) {

81 48
setTokenizerConfig(tokenizerConfig);

82 46
}

83

84 4
public DefaultTokenizer(String regularExpression) {

85 4
setCustomTokenizerRegExp(regularExpression);

86 2
}

87

88
/**

89
* @return the custom regular expression to use for {@link #tokenize(String)}

90
*/

91
public String getCustomTokenizerRegExp() {

92 0
return customTokenizerRegExp;

93
}

94

95
/**

96
* @return The configuration setting used by {@link #tokenize(String)}.

97
*/

98
public int getTokenizerConfig() {

99 0
return tokenizerConfig;

100
}

101

102
/**

103
* <p>Allows the use of custom regular expressions to split up the input to {@link net.sf.classifier4J.IClassifier#classify(java.lang.String)}.

104
* Note that once set, this regular expression takes precedence over tokenizerConfig

105
* when {@link #tokenize(String)} is called.</p>

106
*

107
* @param string set the custom regular expression to use for {@link #tokenize(String)}. Must not be null.

108
*/

109
public void setCustomTokenizerRegExp(String string) {

110

111 4
if (string == null) {

112 2
throw new IllegalArgumentException("Regular Expression string must not be null");

113
}

114

115 2
customTokenizerRegExp = string;

116 2
}

117

118
/**

119
* @param tokConfig The configuration setting for use by {@link #tokenize(String)}.

120
* Valid values are {@link #BREAK_ON_WORD_BREAKS}

121
* and {@link #BREAK_ON_WHITESPACE}

122
*/

123
public void setTokenizerConfig(int tokConfig) {

124

125 48
if (tokConfig != BREAK_ON_WORD_BREAKS && tokConfig != BREAK_ON_WHITESPACE) {

126 2
throw new IllegalArgumentException("tokenConfiguration must be either BREAK_ON_WORD_BREAKS or BREAK_ON_WHITESPACE");

127
}

128

129 46
tokenizerConfig = tokConfig;

130 46
}

131

132
public String[] tokenize(String input) {

133

134 38
String regexp = "";

135

136 38
if (customTokenizerRegExp != null) {

137 0
regexp = customTokenizerRegExp;

138 38
} else if (tokenizerConfig == BREAK_ON_WORD_BREAKS) {

139 36
regexp = "\\W";

140 2
} else if (tokenizerConfig == BREAK_ON_WHITESPACE) {

141 2
regexp = "\\s";

142
} else {

143 0
throw new IllegalStateException("Illegal tokenizer configuration. customTokenizerRegExp = null & tokenizerConfig = " + tokenizerConfig);

144
}

145

146 38
if (input != null) {

147 38
String[] words = input.split(regexp);

148 38
return words;

149

150
} else {

151 0
return new String[0];

152
}

153
}

154

155
public String toString() {

156

157 0
ToStringBuilder toStringBuilder = new ToStringBuilder(this);

158

159 0
if (customTokenizerRegExp != null) {

160 0
toStringBuilder = toStringBuilder.append("customTokenizerRegExp", customTokenizerRegExp);

161 0
} else if (tokenizerConfig == BREAK_ON_WORD_BREAKS) {

162 0
toStringBuilder = toStringBuilder.append("tokenizerConfig", "BREAK_ON_WORD_BREAKS");

163 0
} else if (tokenizerConfig == BREAK_ON_WHITESPACE) {

164 0
toStringBuilder = toStringBuilder.append("tokenizerConfig", "BREAK_ON_WHITESPACE");

165
}

166

167 0
return toStringBuilder.toString();

168
}

169
}

This report is generated by jcoverage, Maven and Maven JCoverage Plugin.

1		/*
2		* ====================================================================
3		*
4		* The Apache Software License, Version 1.1
5		*
6		* Copyright (c) 2003 Nick Lothian. All rights reserved.
7		*
8		* Redistribution and use in source and binary forms, with or without
9		* modification, are permitted provided that the following conditions
10		* are met:
11		*
12		* 1. Redistributions of source code must retain the above copyright
13		* notice, this list of conditions and the following disclaimer.
14		*
15		* 2. Redistributions in binary form must reproduce the above copyright
16		* notice, this list of conditions and the following disclaimer in
17		* the documentation and/or other materials provided with the
18		* distribution.
19		*
20		* 3. The end-user documentation included with the redistribution, if
21		* any, must include the following acknowlegement:
22		* "This product includes software developed by the
23		* developers of Classifier4J (http://classifier4j.sf.net/)."
24		* Alternately, this acknowlegement may appear in the software itself,
25		* if and wherever such third-party acknowlegements normally appear.
26		*
27		* 4. The name "Classifier4J" must not be used to endorse or promote
28		* products derived from this software without prior written
29		* permission. For written permission, please contact
30		* http://sourceforge.net/users/nicklothian/.
31		*
32		* 5. Products derived from this software may not be called
33		* "Classifier4J", nor may "Classifier4J" appear in their names
34		* without prior written permission. For written permission, please
35		* contact http://sourceforge.net/users/nicklothian/.
36		*
37		* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38		* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39		* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40		* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41		* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42		* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43		* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44		* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45		* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46		* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47		* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48		* SUCH DAMAGE.
49		* ====================================================================
50		*/
51		package net.sf.classifier4J;
52
53		import net.sf.classifier4J.util.ToStringBuilder;
54
55		/**
56		* @author Peter Leschev
57		*/
58		public class DefaultTokenizer implements ITokenizer {
59
60		/**
61		* Use the "\W" (non-word characters) regexp to split the string passed to classify
62		*/
63	12	public static int BREAK_ON_WORD_BREAKS = 1;
64
65		/**
66		* Use the "\s" (whitespace) regexp to split the string passed to classify
67		*/
68	12	public static int BREAK_ON_WHITESPACE = 2;
69
70	52	private int tokenizerConfig = -1;
71	52	private String customTokenizerRegExp = null;
72
73		/**
74		* Constructor that uses the BREAK_ON_WORD_BREAKS tokenizer config by default
75		*/
76		public DefaultTokenizer() {
77	22	this(BREAK_ON_WORD_BREAKS);
78	22	}
79
80	48	public DefaultTokenizer(int tokenizerConfig) {
81	48	setTokenizerConfig(tokenizerConfig);
82	46	}
83
84	4	public DefaultTokenizer(String regularExpression) {
85	4	setCustomTokenizerRegExp(regularExpression);
86	2	}
87
88		/**
89		* @return the custom regular expression to use for {@link #tokenize(String)}
90		*/
91		public String getCustomTokenizerRegExp() {
92	0	return customTokenizerRegExp;
93		}
94
95		/**
96		* @return The configuration setting used by {@link #tokenize(String)}.
97		*/
98		public int getTokenizerConfig() {
99	0	return tokenizerConfig;
100		}
101
102		/**
103		* <p>Allows the use of custom regular expressions to split up the input to {@link net.sf.classifier4J.IClassifier#classify(java.lang.String)}.
104		* Note that once set, this regular expression takes precedence over tokenizerConfig
105		* when {@link #tokenize(String)} is called.</p>
106		*
107		* @param string set the custom regular expression to use for {@link #tokenize(String)}. Must not be null.
108		*/
109		public void setCustomTokenizerRegExp(String string) {
110
111	4	if (string == null) {
112	2	throw new IllegalArgumentException("Regular Expression string must not be null");
113		}
114
115	2	customTokenizerRegExp = string;
116	2	}
117
118		/**
119		* @param tokConfig The configuration setting for use by {@link #tokenize(String)}.
120		* Valid values are {@link #BREAK_ON_WORD_BREAKS}
121		* and {@link #BREAK_ON_WHITESPACE}
122		*/
123		public void setTokenizerConfig(int tokConfig) {
124
125	48	if (tokConfig != BREAK_ON_WORD_BREAKS && tokConfig != BREAK_ON_WHITESPACE) {
126	2	throw new IllegalArgumentException("tokenConfiguration must be either BREAK_ON_WORD_BREAKS or BREAK_ON_WHITESPACE");
127		}
128
129	46	tokenizerConfig = tokConfig;
130	46	}
131
132		public String[] tokenize(String input) {
133
134	38	String regexp = "";
135
136	38	if (customTokenizerRegExp != null) {
137	0	regexp = customTokenizerRegExp;
138	38	} else if (tokenizerConfig == BREAK_ON_WORD_BREAKS) {
139	36	regexp = "\\W";
140	2	} else if (tokenizerConfig == BREAK_ON_WHITESPACE) {
141	2	regexp = "\\s";
142		} else {
143	0	throw new IllegalStateException("Illegal tokenizer configuration. customTokenizerRegExp = null & tokenizerConfig = " + tokenizerConfig);
144		}
145
146	38	if (input != null) {
147	38	String[] words = input.split(regexp);
148	38	return words;
149
150		} else {
151	0	return new String[0];
152		}
153		}
154
155		public String toString() {
156
157	0	ToStringBuilder toStringBuilder = new ToStringBuilder(this);
158
159	0	if (customTokenizerRegExp != null) {
160	0	toStringBuilder = toStringBuilder.append("customTokenizerRegExp", customTokenizerRegExp);
161	0	} else if (tokenizerConfig == BREAK_ON_WORD_BREAKS) {
162	0	toStringBuilder = toStringBuilder.append("tokenizerConfig", "BREAK_ON_WORD_BREAKS");
163	0	} else if (tokenizerConfig == BREAK_ON_WHITESPACE) {
164	0	toStringBuilder = toStringBuilder.append("tokenizerConfig", "BREAK_ON_WHITESPACE");
165		}
166
167	0	return toStringBuilder.toString();
168		}
169		}