1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51 package net.sf.classifier4J;
52
53 import net.sf.classifier4J.util.ToStringBuilder;
54
55 /***
56 * @author Peter Leschev
57 */
58 public class DefaultTokenizer implements ITokenizer {
59
60 /***
61 * Use a the "\W" (non-word characters) regexp to split the string passed to classify
62 */
63 public static int BREAK_ON_WORD_BREAKS = 1;
64
65 /***
66 * Use a the "\s" (whitespace) regexp to split the string passed to classify
67 */
68 public static int BREAK_ON_WHITESPACE = 2;
69
70 private int tokenizerConfig = -1;
71 private String customTokenizerRegExp = null;
72
73 /***
74 * Constructor that using the BREAK_ON_WORD_BREAKS tokenizer config by default
75 */
76 public DefaultTokenizer() {
77 this(BREAK_ON_WORD_BREAKS);
78 }
79
80 public DefaultTokenizer(int tokenizerConfig) {
81 setTokenizerConfig(tokenizerConfig);
82 }
83
84 public DefaultTokenizer(String regularExpression) {
85 setCustomTokenizerRegExp(regularExpression);
86 }
87
88 /***
89 * @return the custom regular expression to use for {@link #tokenize(String)}
90 */
91 public String getCustomTokenizerRegExp() {
92 return customTokenizerRegExp;
93 }
94
95 /***
96 * @return The configuration setting used by {@link #tokenize(String)}.
97 */
98 public int getTokenizerConfig() {
99 return tokenizerConfig;
100 }
101
102 /***
103 * <p>Allows the use of custom regular expressions to split up the input to {@link net.sf.classifier4J.IClassifier#classify(java.lang.String)}.
104 * Note that this regular expression will only be used if tokenizerConfig is set to
105 * {@link #BREAK_ON_CUSTOM_REGEXP }</p>
106 *
107 * @param string set the custom regular expression to use for {@link #tokenize(String)}. Must not be null.
108 */
109 public void setCustomTokenizerRegExp(String string) {
110
111 if (string == null) {
112 throw new IllegalArgumentException("Regular Expression string must not be null");
113 }
114
115 customTokenizerRegExp = string;
116 }
117
118 /***
119 * @param tokConfig The configuration setting for use by {@link #tokenize(String)}.
120 * Valid values are {@link #BREAK_ON_CUSTOM_REGEXP}, {@link #BREAK_ON_WORD_BREAKS}
121 * and {@link #BREAK_ON_WHITESPACE}
122 */
123 public void setTokenizerConfig(int tokConfig) {
124
125 if (tokConfig != BREAK_ON_WORD_BREAKS && tokConfig != BREAK_ON_WHITESPACE) {
126 throw new IllegalArgumentException("tokenConfiguration must be either BREAK_ON_WORD_BREAKS or BREAK_ON_WHITESPACE");
127 }
128
129 tokenizerConfig = tokConfig;
130 }
131
132 public String[] tokenize(String input) {
133
134 String regexp = "";
135
136 if (customTokenizerRegExp != null) {
137 regexp = customTokenizerRegExp;
138 } else if (tokenizerConfig == BREAK_ON_WORD_BREAKS) {
139 regexp = "//W";
140 } else if (tokenizerConfig == BREAK_ON_WHITESPACE) {
141 regexp = "//s";
142 } else {
143 throw new IllegalStateException("Illegal tokenizer configuration. customTokenizerRegExp = null & tokenizerConfig = " + tokenizerConfig);
144 }
145
146 if (input != null) {
147 String[] words = input.split(regexp);
148 return words;
149
150 } else {
151 return new String[0];
152 }
153 }
154
155 public String toString() {
156
157 ToStringBuilder toStringBuilder = new ToStringBuilder(this);
158
159 if (customTokenizerRegExp != null) {
160 toStringBuilder = toStringBuilder.append("customTokenizerRegExp", customTokenizerRegExp);
161 } else if (tokenizerConfig == BREAK_ON_WORD_BREAKS) {
162 toStringBuilder = toStringBuilder.append("tokenizerConfig", "BREAK_ON_WORD_BREAKS");
163 } else if (tokenizerConfig == BREAK_ON_WHITESPACE) {
164 toStringBuilder = toStringBuilder.append("tokenizerConfig", "BREAK_ON_WHITESPACE");
165 }
166
167 return toStringBuilder.toString();
168 }
169 }