DefaultTokenizer xref

View Javadoc

1   /*
2    * ====================================================================
3    * 
4    * The Apache Software License, Version 1.1
5    *
6    * Copyright (c) 2003 Nick Lothian. All rights reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer. 
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution, if
21   *    any, must include the following acknowlegement:  
22   *       "This product includes software developed by the 
23   *        developers of Classifier4J (http://classifier4j.sf.net/)."
24   *    Alternately, this acknowlegement may appear in the software itself,
25   *    if and wherever such third-party acknowlegements normally appear.
26   *
27   * 4. The name "Classifier4J" must not be used to endorse or promote 
28   *    products derived from this software without prior written 
29   *    permission. For written permission, please contact   
30   *    http://sourceforge.net/users/nicklothian/.
31   *
32   * 5. Products derived from this software may not be called 
33   *    "Classifier4J", nor may "Classifier4J" appear in their names 
34   *    without prior written permission. For written permission, please 
35   *    contact http://sourceforge.net/users/nicklothian/.
36   *
37   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48   * SUCH DAMAGE.
49   * ====================================================================
50   */
51  package net.sf.classifier4J;
52  
53  import net.sf.classifier4J.util.ToStringBuilder;
54  
55  /*** 
56   * @author Peter Leschev
57   */
58  public class DefaultTokenizer implements ITokenizer {
59  
60      /***
61       * Use a the "\W" (non-word characters) regexp to split the string passed to classify
62       */
63      public static int BREAK_ON_WORD_BREAKS = 1;
64  
65      /***
66       * Use a the "\s" (whitespace) regexp to split the string passed to classify
67       */
68      public static int BREAK_ON_WHITESPACE = 2;
69  
70      private int tokenizerConfig = -1;
71      private String customTokenizerRegExp = null;
72  
73      /***
74       * Constructor that using the BREAK_ON_WORD_BREAKS tokenizer config by default
75       */
76      public DefaultTokenizer() {
77          this(BREAK_ON_WORD_BREAKS);
78      }
79  
80      public DefaultTokenizer(int tokenizerConfig) {
81          setTokenizerConfig(tokenizerConfig);
82      }
83  
84      public DefaultTokenizer(String regularExpression) {
85          setCustomTokenizerRegExp(regularExpression);
86      }
87  
88      /***
89       * @return the custom regular expression to use for {@link #tokenize(String)}
90       */
91      public String getCustomTokenizerRegExp() {
92          return customTokenizerRegExp;
93      }
94  
95      /***
96       * @return The configuration setting used by {@link #tokenize(String)}.
97       */
98      public int getTokenizerConfig() {
99          return tokenizerConfig;
100     }
101 
102     /***
103      * <p>Allows the use of custom regular expressions to split up the input to {@link net.sf.classifier4J.IClassifier#classify(java.lang.String)}.
104      * Note that this regular expression will only be used if tokenizerConfig is set to
105      * {@link #BREAK_ON_CUSTOM_REGEXP }</p>
106      *
107      * @param string set the custom regular expression to use for {@link #tokenize(String)}. Must not be null.
108      */
109     public void setCustomTokenizerRegExp(String string) {
110 
111         if (string == null) {
112             throw new IllegalArgumentException("Regular Expression string must not be null");
113         }
114 
115         customTokenizerRegExp = string;
116     }
117 
118     /***
119      * @param tokConfig The configuration setting for use by {@link #tokenize(String)}.
120      * Valid values are {@link #BREAK_ON_CUSTOM_REGEXP}, {@link #BREAK_ON_WORD_BREAKS}
121      * and {@link #BREAK_ON_WHITESPACE}
122      */
123     public void setTokenizerConfig(int tokConfig) {
124 
125         if (tokConfig != BREAK_ON_WORD_BREAKS && tokConfig != BREAK_ON_WHITESPACE) {
126             throw new IllegalArgumentException("tokenConfiguration must be either BREAK_ON_WORD_BREAKS or BREAK_ON_WHITESPACE");
127         }
128 
129         tokenizerConfig = tokConfig;
130     }
131 
132     public String[] tokenize(String input) {
133 
134         String regexp = "";
135 
136         if (customTokenizerRegExp != null) {
137             regexp = customTokenizerRegExp;
138         } else if (tokenizerConfig == BREAK_ON_WORD_BREAKS) {
139             regexp = "//W";
140         } else if (tokenizerConfig == BREAK_ON_WHITESPACE) {
141             regexp = "//s";
142         } else {
143             throw new IllegalStateException("Illegal tokenizer configuration. customTokenizerRegExp = null & tokenizerConfig = " + tokenizerConfig);
144         }
145 
146         if (input != null) {
147             String[] words = input.split(regexp);
148             return words;
149 
150         } else {
151             return new String[0];
152         }
153     }
154 
155     public String toString() {
156 
157         ToStringBuilder toStringBuilder = new ToStringBuilder(this);
158 
159         if (customTokenizerRegExp != null) {
160             toStringBuilder = toStringBuilder.append("customTokenizerRegExp", customTokenizerRegExp);
161         } else if (tokenizerConfig == BREAK_ON_WORD_BREAKS) {
162             toStringBuilder = toStringBuilder.append("tokenizerConfig", "BREAK_ON_WORD_BREAKS");
163         } else if (tokenizerConfig == BREAK_ON_WHITESPACE) {
164             toStringBuilder = toStringBuilder.append("tokenizerConfig", "BREAK_ON_WHITESPACE");
165         }
166 
167         return toStringBuilder.toString();
168     }
169 }