Coverage report

  %line %branch
net.sf.classifier4J.SimpleHTMLTokenizer
82% 
87% 

 1  
 /*
 2  
  * ====================================================================
 3  
  * 
 4  
  * The Apache Software License, Version 1.1
 5  
  *
 6  
  * Copyright (c) 2003 Nick Lothian. All rights reserved.
 7  
  *
 8  
  * Redistribution and use in source and binary forms, with or without
 9  
  * modification, are permitted provided that the following conditions
 10  
  * are met:
 11  
  *
 12  
  * 1. Redistributions of source code must retain the above copyright
 13  
  *    notice, this list of conditions and the following disclaimer. 
 14  
  *
 15  
  * 2. Redistributions in binary form must reproduce the above copyright
 16  
  *    notice, this list of conditions and the following disclaimer in
 17  
  *    the documentation and/or other materials provided with the
 18  
  *    distribution.
 19  
  *
 20  
  * 3. The end-user documentation included with the redistribution, if
 21  
  *    any, must include the following acknowlegement:  
 22  
  *       "This product includes software developed by the 
 23  
  *        developers of Classifier4J (http://classifier4j.sf.net/)."
 24  
  *    Alternately, this acknowlegement may appear in the software itself,
 25  
  *    if and wherever such third-party acknowlegements normally appear.
 26  
  *
 27  
  * 4. The name "Classifier4J" must not be used to endorse or promote 
 28  
  *    products derived from this software without prior written 
 29  
  *    permission. For written permission, please contact   
 30  
  *    http://sourceforge.net/users/nicklothian/.
 31  
  *
 32  
  * 5. Products derived from this software may not be called 
 33  
  *    "Classifier4J", nor may "Classifier4J" appear in their names 
 34  
  *    without prior written permission. For written permission, please 
 35  
  *    contact http://sourceforge.net/users/nicklothian/.
 36  
  *
 37  
  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 38  
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 39  
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 40  
  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 41  
  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 42  
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 43  
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 44  
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 45  
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 46  
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 47  
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 48  
  * SUCH DAMAGE.
 49  
  * ====================================================================
 50  
  */
 51  
 
 52  
 package net.sf.classifier4J;
 53  
 
 54  
 import java.util.Stack;
 55  
 /**
 56  
  * <p>Simple HTML Tokenizer. Its goal is to tokenize words that would be displayed
 57  
  * in a normal web browser.</p>
 58  
  * 
 59  
  * <p>It does not handle meta tags, alt or text attributes, but it does remove 
 60  
  * CSS style definitions and javascript code.</p>
 61  
  * 
 62  
  * <p>It handles entity references by replacing them with a space(!!). This can be 
 63  
  * overridden.</p> 
 64  
  * 
 65  
  * 
 66  
  * @since 18 Nov 2003
 67  
  * @author Nick Lothian
 68  
  */
 69  
 public class SimpleHTMLTokenizer extends DefaultTokenizer {
 70  
 
 71  
     /**
 72  
      * Constructor that uses the BREAK_ON_WORD_BREAKS tokenizer config by default
 73  
      */
 74  
     public SimpleHTMLTokenizer() {
 75  4
         super();
 76  4
     }
 77  
 
 78  
     public SimpleHTMLTokenizer(int tokenizerConfig) {
 79  0
         super(tokenizerConfig);
 80  0
     }
 81  
 
 82  
     public SimpleHTMLTokenizer(String regularExpression) {
 83  0
         super(regularExpression);
 84  0
     }
 85  
 
 86  
     /**
 87  
      * Replaces entity references with spaces
 88  
      * 
 89  
      * @param contentsWithUnresolvedEntityReferences the contents with the entity references
 90  
      * @return the contents with the entities replaced with spaces
 91  
      */
 92  
     protected String resolveEntities(String contentsWithUnresolvedEntityReferences) {
 93  6
         if (contentsWithUnresolvedEntityReferences == null) {
 94  0
             throw new IllegalArgumentException("Cannot pass null");
 95  
         }
 96  
 
 97  6
         return contentsWithUnresolvedEntityReferences.replaceAll("&.{2,8};", " ");
 98  
     }
 99  
 
 100  
     /**
 101  
      * @see net.sf.classifier4J.ITokenizer#tokenize(java.lang.String)
 102  
      */
 103  
     public String[] tokenize(String input) {
 104  2
         Stack stack = new Stack();
 105  2
         Stack tagStack = new Stack();
 106  
 
 107  
         // iterate over the input string and find the text that would be displayed
 108  2
         char[] chars = input.toCharArray();
 109  
 
 110  2
         StringBuffer result = new StringBuffer();
 111  
 
 112  2
         StringBuffer currentTagName = new StringBuffer();
 113  74
         for (int i = 0; i < chars.length; i++) {
 114  
 
 115  72
             switch (chars[i]) {
 116  
                 case '<' :
 117  4
                     stack.push(Boolean.TRUE);
 118  4
                     currentTagName = new StringBuffer();
 119  4
                     break;
 120  
                 case '>' :
 121  4
                     stack.pop();
 122  4
                     if (currentTagName != null) {
 123  4
                         String currentTag = currentTagName.toString();
 124  
 
 125  4
                         if (currentTag.startsWith("/")) {
 126  2
                             tagStack.pop();
 127  
                         } else {
 128  
 
 129  2
                             tagStack.push(currentTag.toLowerCase());
 130  
                         }
 131  
                     }
 132  
                     break;
 133  
                 default :
 134  64
                     if (stack.size() == 0) {
 135  54
                         String currentTag = (String) tagStack.peek();
 136  
                         // ignore everything inside <script></script> or <style></style> tags
 137  54
                         if (currentTag != null) {
 138  54
                             if (!(currentTag.startsWith("script") || currentTag.startsWith("style"))) {
 139  54
                                 result.append(chars[i]);
 140  
                             }
 141  
                         } else {
 142  0
                             result.append(chars[i]);
 143  
                         }
 144  
 
 145  
                     } else {
 146  10
                         currentTagName.append(chars[i]);
 147  
                     }
 148  
                     break;
 149  
             }
 150  
         }
 151  
 
 152  2
         return super.tokenize(resolveEntities(result.toString()).trim());
 153  
     }
 154  
 
 155  
 }

This report is generated by jcoverage, Maven and Maven JCoverage Plugin.