1   
2   
3   
4   
5   
6   
7   
8   
9   
10  
11  
12  
13  
14  
15  
16  
17  
18  
19  
20  
21  
22  
23  
24  
25  
26  
27  
28  
29  
30  
31  
32  
33  
34  
35  
36  
37  
38  
39  
40  
41  
42  
43  
44  
45  
46  
47  
48  
49  
50  
51  package net.sf.classifier4J;
52  
53  import net.sf.classifier4J.util.ToStringBuilder;
54  
55  /*** 
56   * @author Peter Leschev
57   */
58  public class DefaultTokenizer implements ITokenizer {
59  
60      /***
61       * Use a the "\W" (non-word characters) regexp to split the string passed to classify
62       */
63      public static int BREAK_ON_WORD_BREAKS = 1;
64  
65      /***
66       * Use a the "\s" (whitespace) regexp to split the string passed to classify
67       */
68      public static int BREAK_ON_WHITESPACE = 2;
69  
70      private int tokenizerConfig = -1;
71      private String customTokenizerRegExp = null;
72  
73      /***
74       * Constructor that using the BREAK_ON_WORD_BREAKS tokenizer config by default
75       */
76      public DefaultTokenizer() {
77          this(BREAK_ON_WORD_BREAKS);
78      }
79  
80      public DefaultTokenizer(int tokenizerConfig) {
81          setTokenizerConfig(tokenizerConfig);
82      }
83  
84      public DefaultTokenizer(String regularExpression) {
85          setCustomTokenizerRegExp(regularExpression);
86      }
87  
88      /***
89       * @return the custom regular expression to use for {@link #tokenize(String)}
90       */
91      public String getCustomTokenizerRegExp() {
92          return customTokenizerRegExp;
93      }
94  
95      /***
96       * @return The configuration setting used by {@link #tokenize(String)}.
97       */
98      public int getTokenizerConfig() {
99          return tokenizerConfig;
100     }
101 
102     /***
103      * <p>Allows the use of custom regular expressions to split up the input to {@link net.sf.classifier4J.IClassifier#classify(java.lang.String)}.
104      * Note that this regular expression will only be used if tokenizerConfig is set to
105      * {@link #BREAK_ON_CUSTOM_REGEXP }</p>
106      *
107      * @param string set the custom regular expression to use for {@link #tokenize(String)}. Must not be null.
108      */
109     public void setCustomTokenizerRegExp(String string) {
110 
111         if (string == null) {
112             throw new IllegalArgumentException("Regular Expression string must not be null");
113         }
114 
115         customTokenizerRegExp = string;
116     }
117 
118     /***
119      * @param tokConfig The configuration setting for use by {@link #tokenize(String)}.
120      * Valid values are {@link #BREAK_ON_CUSTOM_REGEXP}, {@link #BREAK_ON_WORD_BREAKS}
121      * and {@link #BREAK_ON_WHITESPACE}
122      */
123     public void setTokenizerConfig(int tokConfig) {
124 
125         if (tokConfig != BREAK_ON_WORD_BREAKS && tokConfig != BREAK_ON_WHITESPACE) {
126             throw new IllegalArgumentException("tokenConfiguration must be either BREAK_ON_WORD_BREAKS or BREAK_ON_WHITESPACE");
127         }
128 
129         tokenizerConfig = tokConfig;
130     }
131 
132     public String[] tokenize(String input) {
133 
134         String regexp = "";
135 
136         if (customTokenizerRegExp != null) {
137             regexp = customTokenizerRegExp;
138         } else if (tokenizerConfig == BREAK_ON_WORD_BREAKS) {
139             regexp = "//W";
140         } else if (tokenizerConfig == BREAK_ON_WHITESPACE) {
141             regexp = "//s";
142         } else {
143             throw new IllegalStateException("Illegal tokenizer configuration. customTokenizerRegExp = null & tokenizerConfig = " + tokenizerConfig);
144         }
145 
146         if (input != null) {
147             String[] words = input.split(regexp);
148             return words;
149 
150         } else {
151             return new String[0];
152         }
153     }
154 
155     public String toString() {
156 
157         ToStringBuilder toStringBuilder = new ToStringBuilder(this);
158 
159         if (customTokenizerRegExp != null) {
160             toStringBuilder = toStringBuilder.append("customTokenizerRegExp", customTokenizerRegExp);
161         } else if (tokenizerConfig == BREAK_ON_WORD_BREAKS) {
162             toStringBuilder = toStringBuilder.append("tokenizerConfig", "BREAK_ON_WORD_BREAKS");
163         } else if (tokenizerConfig == BREAK_ON_WHITESPACE) {
164             toStringBuilder = toStringBuilder.append("tokenizerConfig", "BREAK_ON_WHITESPACE");
165         }
166 
167         return toStringBuilder.toString();
168     }
169 }