| %line | %branch | |||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| net.sf.classifier4J.DefaultTokenizer |
|
|
| 1 | /* |
|
| 2 | * ==================================================================== |
|
| 3 | * |
|
| 4 | * The Apache Software License, Version 1.1 |
|
| 5 | * |
|
| 6 | * Copyright (c) 2003 Nick Lothian. All rights reserved. |
|
| 7 | * |
|
| 8 | * Redistribution and use in source and binary forms, with or without |
|
| 9 | * modification, are permitted provided that the following conditions |
|
| 10 | * are met: |
|
| 11 | * |
|
| 12 | * 1. Redistributions of source code must retain the above copyright |
|
| 13 | * notice, this list of conditions and the following disclaimer. |
|
| 14 | * |
|
| 15 | * 2. Redistributions in binary form must reproduce the above copyright |
|
| 16 | * notice, this list of conditions and the following disclaimer in |
|
| 17 | * the documentation and/or other materials provided with the |
|
| 18 | * distribution. |
|
| 19 | * |
|
| 20 | * 3. The end-user documentation included with the redistribution, if |
|
| 21 | * any, must include the following acknowlegement: |
|
| 22 | * "This product includes software developed by the |
|
| 23 | * developers of Classifier4J (http://classifier4j.sf.net/)." |
|
| 24 | * Alternately, this acknowlegement may appear in the software itself, |
|
| 25 | * if and wherever such third-party acknowlegements normally appear. |
|
| 26 | * |
|
| 27 | * 4. The name "Classifier4J" must not be used to endorse or promote |
|
| 28 | * products derived from this software without prior written |
|
| 29 | * permission. For written permission, please contact |
|
| 30 | * http://sourceforge.net/users/nicklothian/. |
|
| 31 | * |
|
| 32 | * 5. Products derived from this software may not be called |
|
| 33 | * "Classifier4J", nor may "Classifier4J" appear in their names |
|
| 34 | * without prior written permission. For written permission, please |
|
| 35 | * contact http://sourceforge.net/users/nicklothian/. |
|
| 36 | * |
|
| 37 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED |
|
| 38 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
|
| 39 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
|
| 40 | * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR |
|
| 41 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
| 42 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
| 43 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
|
| 44 | * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|
| 45 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
|
| 46 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
|
| 47 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
|
| 48 | * SUCH DAMAGE. |
|
| 49 | * ==================================================================== |
|
| 50 | */ |
|
| 51 | package net.sf.classifier4J; |
|
| 52 | ||
| 53 | import net.sf.classifier4J.util.ToStringBuilder; |
|
| 54 | ||
| 55 | /** |
|
| 56 | * @author Peter Leschev |
|
| 57 | */ |
|
| 58 | public class DefaultTokenizer implements ITokenizer { |
|
| 59 | ||
| 60 | /** |
|
| 61 | * Use the "\W" (non-word characters) regexp to split the string passed to classify |
|
| 62 | */ |
|
| 63 | 12 | public static int BREAK_ON_WORD_BREAKS = 1; |
| 64 | ||
| 65 | /** |
|
| 66 | * Use the "\s" (whitespace) regexp to split the string passed to classify |
|
| 67 | */ |
|
| 68 | 12 | public static int BREAK_ON_WHITESPACE = 2; |
| 69 | ||
| 70 | 52 | private int tokenizerConfig = -1; |
| 71 | 52 | private String customTokenizerRegExp = null; |
| 72 | ||
| 73 | /** |
|
| 74 | * Constructor that uses the BREAK_ON_WORD_BREAKS tokenizer config by default |
|
| 75 | */ |
|
| 76 | public DefaultTokenizer() { |
|
| 77 | 22 | this(BREAK_ON_WORD_BREAKS); |
| 78 | 22 | } |
| 79 | ||
| 80 | 48 | public DefaultTokenizer(int tokenizerConfig) { |
| 81 | 48 | setTokenizerConfig(tokenizerConfig); |
| 82 | 46 | } |
| 83 | ||
| 84 | 4 | public DefaultTokenizer(String regularExpression) { |
| 85 | 4 | setCustomTokenizerRegExp(regularExpression); |
| 86 | 2 | } |
| 87 | ||
| 88 | /** |
|
| 89 | * @return the custom regular expression to use for {@link #tokenize(String)} |
|
| 90 | */ |
|
| 91 | public String getCustomTokenizerRegExp() { |
|
| 92 | 0 | return customTokenizerRegExp; |
| 93 | } |
|
| 94 | ||
| 95 | /** |
|
| 96 | * @return The configuration setting used by {@link #tokenize(String)}. |
|
| 97 | */ |
|
| 98 | public int getTokenizerConfig() { |
|
| 99 | 0 | return tokenizerConfig; |
| 100 | } |
|
| 101 | ||
| 102 | /** |
|
| 103 | * <p>Allows the use of custom regular expressions to split up the input to {@link net.sf.classifier4J.IClassifier#classify(java.lang.String)}. |
|
| 104 | * Note that once set, this regular expression takes precedence over the |

| 105 | * tokenizerConfig setting when {@link #tokenize(String)} is called.</p> |
|
| 106 | * |
|
| 107 | * @param string set the custom regular expression to use for {@link #tokenize(String)}. Must not be null. |
|
| 108 | */ |
|
| 109 | public void setCustomTokenizerRegExp(String string) { |
|
| 110 | ||
| 111 | 4 | if (string == null) { |
| 112 | 2 | throw new IllegalArgumentException("Regular Expression string must not be null"); |
| 113 | } |
|
| 114 | ||
| 115 | 2 | customTokenizerRegExp = string; |
| 116 | 2 | } |
| 117 | ||
| 118 | /** |
|
| 119 | * @param tokConfig The configuration setting for use by {@link #tokenize(String)}. |
|
| 120 | * Valid values are {@link #BREAK_ON_WORD_BREAKS} |
|
| 121 | * and {@link #BREAK_ON_WHITESPACE} |
|
| 122 | */ |
|
| 123 | public void setTokenizerConfig(int tokConfig) { |
|
| 124 | ||
| 125 | 48 | if (tokConfig != BREAK_ON_WORD_BREAKS && tokConfig != BREAK_ON_WHITESPACE) { |
| 126 | 2 | throw new IllegalArgumentException("tokenConfiguration must be either BREAK_ON_WORD_BREAKS or BREAK_ON_WHITESPACE"); |
| 127 | } |
|
| 128 | ||
| 129 | 46 | tokenizerConfig = tokConfig; |
| 130 | 46 | } |
| 131 | ||
| 132 | public String[] tokenize(String input) { |
|
| 133 | ||
| 134 | 38 | String regexp = ""; |
| 135 | ||
| 136 | 38 | if (customTokenizerRegExp != null) { |
| 137 | 0 | regexp = customTokenizerRegExp; |
| 138 | 38 | } else if (tokenizerConfig == BREAK_ON_WORD_BREAKS) { |
| 139 | 36 | regexp = "\\W"; |
| 140 | 2 | } else if (tokenizerConfig == BREAK_ON_WHITESPACE) { |
| 141 | 2 | regexp = "\\s"; |
| 142 | } else { |
|
| 143 | 0 | throw new IllegalStateException("Illegal tokenizer configuration. customTokenizerRegExp = null & tokenizerConfig = " + tokenizerConfig); |
| 144 | } |
|
| 145 | ||
| 146 | 38 | if (input != null) { |
| 147 | 38 | String[] words = input.split(regexp); |
| 148 | 38 | return words; |
| 149 | ||
| 150 | } else { |
|
| 151 | 0 | return new String[0]; |
| 152 | } |
|
| 153 | } |
|
| 154 | ||
| 155 | public String toString() { |
|
| 156 | ||
| 157 | 0 | ToStringBuilder toStringBuilder = new ToStringBuilder(this); |
| 158 | ||
| 159 | 0 | if (customTokenizerRegExp != null) { |
| 160 | 0 | toStringBuilder = toStringBuilder.append("customTokenizerRegExp", customTokenizerRegExp); |
| 161 | 0 | } else if (tokenizerConfig == BREAK_ON_WORD_BREAKS) { |
| 162 | 0 | toStringBuilder = toStringBuilder.append("tokenizerConfig", "BREAK_ON_WORD_BREAKS"); |
| 163 | 0 | } else if (tokenizerConfig == BREAK_ON_WHITESPACE) { |
| 164 | 0 | toStringBuilder = toStringBuilder.append("tokenizerConfig", "BREAK_ON_WHITESPACE"); |
| 165 | } |
|
| 166 | ||
| 167 | 0 | return toStringBuilder.toString(); |
| 168 | } |
|
| 169 | } |
| This report is generated by jcoverage, Maven and Maven JCoverage Plugin. |