Coverage report

  %line %branch
net.sf.classifier4J.DefaultTokenizer
69% 
86% 

 1  
 /*
 2  
  * ====================================================================
 3  
  * 
 4  
  * The Apache Software License, Version 1.1
 5  
  *
 6  
  * Copyright (c) 2003 Nick Lothian. All rights reserved.
 7  
  *
 8  
  * Redistribution and use in source and binary forms, with or without
 9  
  * modification, are permitted provided that the following conditions
 10  
  * are met:
 11  
  *
 12  
  * 1. Redistributions of source code must retain the above copyright
 13  
  *    notice, this list of conditions and the following disclaimer. 
 14  
  *
 15  
  * 2. Redistributions in binary form must reproduce the above copyright
 16  
  *    notice, this list of conditions and the following disclaimer in
 17  
  *    the documentation and/or other materials provided with the
 18  
  *    distribution.
 19  
  *
 20  
  * 3. The end-user documentation included with the redistribution, if
 21  
  *    any, must include the following acknowlegement:  
 22  
  *       "This product includes software developed by the 
 23  
  *        developers of Classifier4J (http://classifier4j.sf.net/)."
 24  
  *    Alternately, this acknowlegement may appear in the software itself,
 25  
  *    if and wherever such third-party acknowlegements normally appear.
 26  
  *
 27  
  * 4. The name "Classifier4J" must not be used to endorse or promote 
 28  
  *    products derived from this software without prior written 
 29  
  *    permission. For written permission, please contact   
 30  
  *    http://sourceforge.net/users/nicklothian/.
 31  
  *
 32  
  * 5. Products derived from this software may not be called 
 33  
  *    "Classifier4J", nor may "Classifier4J" appear in their names 
 34  
  *    without prior written permission. For written permission, please 
 35  
  *    contact http://sourceforge.net/users/nicklothian/.
 36  
  *
 37  
  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 38  
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 39  
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 40  
  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 41  
  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 42  
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 43  
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 44  
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 45  
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 46  
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 47  
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 48  
  * SUCH DAMAGE.
 49  
  * ====================================================================
 50  
  */
 51  
 package net.sf.classifier4J;
 52  
 
 53  
 import net.sf.classifier4J.util.ToStringBuilder;
 54  
 
 55  
 /** 
 56  
  * @author Peter Leschev
 57  
  */
 58  
 public class DefaultTokenizer implements ITokenizer {
 59  
 
 60  
     /**
 61  
      * Use the "\W" (non-word characters) regexp to split the string passed to classify
 62  
      */
 63  12
     public static int BREAK_ON_WORD_BREAKS = 1;
 64  
 
 65  
     /**
 66  
      * Use the "\s" (whitespace) regexp to split the string passed to classify
 67  
      */
 68  12
     public static int BREAK_ON_WHITESPACE = 2;
 69  
 
 70  52
     private int tokenizerConfig = -1;
 71  52
     private String customTokenizerRegExp = null;
 72  
 
 73  
     /**
 74  
      * Constructor that uses the BREAK_ON_WORD_BREAKS tokenizer config by default
 75  
      */
 76  
     public DefaultTokenizer() {
 77  22
         this(BREAK_ON_WORD_BREAKS);
 78  22
     }
 79  
 
 80  48
     public DefaultTokenizer(int tokenizerConfig) {
 81  48
         setTokenizerConfig(tokenizerConfig);
 82  46
     }
 83  
 
 84  4
     public DefaultTokenizer(String regularExpression) {
 85  4
         setCustomTokenizerRegExp(regularExpression);
 86  2
     }
 87  
 
 88  
     /**
 89  
      * @return the custom regular expression to use for {@link #tokenize(String)}
 90  
      */
 91  
     public String getCustomTokenizerRegExp() {
 92  0
         return customTokenizerRegExp;
 93  
     }
 94  
 
 95  
     /**
 96  
      * @return The configuration setting used by {@link #tokenize(String)}.
 97  
      */
 98  
     public int getTokenizerConfig() {
 99  0
         return tokenizerConfig;
 100  
     }
 101  
 
 102  
     /**
 103  
      * <p>Allows the use of custom regular expressions to split up the input to {@link net.sf.classifier4J.IClassifier#classify(java.lang.String)}.
 104  
      * Note that once set, this regular expression takes precedence over tokenizerConfig:
 105  
      * {@link #tokenize(String)} uses it whenever it is non-null.</p>
 106  
      *
 107  
      * @param string set the custom regular expression to use for {@link #tokenize(String)}. Must not be null.
 108  
      */
 109  
     public void setCustomTokenizerRegExp(String string) {
 110  
 
 111  4
         if (string == null) {
 112  2
             throw new IllegalArgumentException("Regular Expression string must not be null");
 113  
         }
 114  
 
 115  2
         customTokenizerRegExp = string;
 116  2
     }
 117  
 
 118  
     /**
 119  
      * @param tokConfig The configuration setting for use by {@link #tokenize(String)}.
 120  
      * Valid values are {@link #BREAK_ON_WORD_BREAKS}
 121  
      * and {@link #BREAK_ON_WHITESPACE}
 122  
      */
 123  
     public void setTokenizerConfig(int tokConfig) {
 124  
 
 125  48
         if (tokConfig != BREAK_ON_WORD_BREAKS && tokConfig != BREAK_ON_WHITESPACE) {
 126  2
             throw new IllegalArgumentException("tokenConfiguration must be either BREAK_ON_WORD_BREAKS or BREAK_ON_WHITESPACE");
 127  
         }
 128  
 
 129  46
         tokenizerConfig = tokConfig;
 130  46
     }
 131  
 
 132  
     public String[] tokenize(String input) {
 133  
 
 134  38
         String regexp = "";
 135  
 
 136  38
         if (customTokenizerRegExp != null) {
 137  0
             regexp = customTokenizerRegExp;
 138  38
         } else if (tokenizerConfig == BREAK_ON_WORD_BREAKS) {
 139  36
             regexp = "\\W";
 140  2
         } else if (tokenizerConfig == BREAK_ON_WHITESPACE) {
 141  2
             regexp = "\\s";
 142  
         } else {
 143  0
             throw new IllegalStateException("Illegal tokenizer configuration. customTokenizerRegExp = null & tokenizerConfig = " + tokenizerConfig);
 144  
         }
 145  
 
 146  38
         if (input != null) {
 147  38
             String[] words = input.split(regexp);
 148  38
             return words;
 149  
 
 150  
         } else {
 151  0
             return new String[0];
 152  
         }
 153  
     }
 154  
 
 155  
     public String toString() {
 156  
 
 157  0
         ToStringBuilder toStringBuilder = new ToStringBuilder(this);
 158  
 
 159  0
         if (customTokenizerRegExp != null) {
 160  0
             toStringBuilder = toStringBuilder.append("customTokenizerRegExp", customTokenizerRegExp);
 161  0
         } else if (tokenizerConfig == BREAK_ON_WORD_BREAKS) {
 162  0
             toStringBuilder = toStringBuilder.append("tokenizerConfig", "BREAK_ON_WORD_BREAKS");
 163  0
         } else if (tokenizerConfig == BREAK_ON_WHITESPACE) {
 164  0
             toStringBuilder = toStringBuilder.append("tokenizerConfig", "BREAK_ON_WHITESPACE");
 165  
         }
 166  
 
 167  0
         return toStringBuilder.toString();
 168  
     }
 169  
 }

This report is generated by jcoverage, Maven and Maven JCoverage Plugin.